--- title: "ChIP versus RNA-seq" author: "J-P Verta" date: '2020-01-02' output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(ggplot2) library(viridis) library(dplyr) library(ggpubr) library(ChIPpeakAnno) library(GenomicFeatures) library(AnnotationHub) library(AnnotationDbi) library(clusterProfiler) library(stringr) ``` ## ChIPmentation peaks Load annotated ChIPmentation (CM) peaks. ```{r} CMpeaksAnnotated = read.table("filtered_outputPeaksGCnorm_annotated.txt",sep="\t",header=T) colnames(CMpeaksAnnotated)[1] = "Peak_id" # reformat Annotation column if broken if (length(which(is.na(CMpeaksAnnotated$Annotation))) > 0.1 * length(CMpeaksAnnotated$Annotation)){ CMpeaksAnnotated$Annotation = unlist(lapply(CMpeaksAnnotated$Detailed.Annotation,function(x) unlist(strsplit(as.character(x),split=" "))[1])) } ``` Promoter peaks ```{r} CMpeaksAnnotatedPromoters = CMpeaksAnnotated[which(grepl("promoter",CMpeaksAnnotated$Annotation)),] ``` Proximal peaks (excluding promoter peaks) ```{r} CMpeaksAnnotatedProximal = CMpeaksAnnotated[which(grepl("promoter",CMpeaksAnnotated$Annotation)==F),] ``` Load normalized expression data. ```{r} normCounts = read.table("featureCountsImmatureNCBIVarStabNormalized.txt",sep="\t",header=T,row.names=1) ``` Read gene models as GRanges and attach annotations. ```{r} txdb = makeTxDbFromGFF(file="GCF_000233375.1_ICSASG_v2_genomic.gtf", format="gtf") salmonTranscriptRange = transcripts(txdb) # initiate annotationhub connection ah = AnnotationHub(localHub=FALSE) # retrieve available salmon annotations query(ah, "OrgDb")[which(query(ah, "OrgDb")$species %in% "Salmo salar")] salmodb = query(ah, "OrgDb")[["AH80527"]] salmonTranscriptAnnot = GRanges(cbind(salmonTranscriptRange,AnnotationDbi::select(salmodb, salmonTranscriptRange$tx_name, c("SYMBOL", "GENENAME", "ENTREZID"), "REFSEQ"))) allid = keys(salmodb, "REFSEQ") ``` Assign gene id to expression table. ```{r} getGeneID = function(x){ unlist(strsplit(x,":"))[2] } normCounts = cbind(normCounts,AnnotationDbi::select(salmodb, unlist(lapply(rownames(normCounts),getGeneID)), c("SYMBOL", "GENENAME"), "ENTREZID")) head(normCounts) ``` Calculate the number of peaks assigned to gene versus the expression level of that gene. ```{r} # all peaks from HOMER replicatePeaks.pl (FDR<0.05 $ logFC>1) peakNumberExpressionImm = data.frame() for (gene in rownames(normCounts)){ geneID = normCounts[gene,"SYMBOL"] genePeaks = CMpeaksAnnotated[which(CMpeaksAnnotated$Nearest.PromoterID == geneID),] peakNumberExpressionImm[gene,"peaks"] = nrow(genePeaks) peakNumberExpressionImm[gene,"meanExpression"] = mean(as.numeric(normCounts[match(geneID,normCounts$SYMBOL),c("SRR8479243","SRR8479245","SRR8479246")]),na.rm=T) } peakNumberExpressionImm = cbind(peakNumberExpressionImm,AnnotationDbi::select(salmodb, unlist(lapply(rownames(peakNumberExpressionImm),getGeneID)), c("SYMBOL", "GENENAME"), "ENTREZID")) ``` Plot the results ```{r} # group all genes with more than 7 peaks together peakNumberExpressionImm$peak_group = cut(peakNumberExpressionImm$peaks,c(seq(0,7,0.5),max(peakNumberExpressionImm$peaks)),right=F) # colored plot by peak group means peakPlot = peakNumberExpressionImm[which(is.na(peakNumberExpressionImm$peak_group)==F),]%>% group_by(peak_group)%>% mutate(mean_peaks=mean(meanExpression,na.rm=T))%>% ggplot(aes(factor(peak_group),meanExpression)) + geom_boxplot(aes(fill=mean_peaks)) + scale_x_discrete(labels=c(seq(0,6),"7+")) + scale_fill_viridis(option = "D") + ylab("Normalized expression") + xlab("Number of assigned peaks") + theme_classic() + theme(legend.title=element_blank(), panel.background = element_rect(fill = "transparent"), rect = element_rect(fill = "transparent"), legend.position="none") peakPlot ``` Calculate the number of significant **promoter** peaks assigned to nearest TSS versus the expression level of that TSS. ```{r} peakNumberExpressionImmPromoters = data.frame() for (gene in rownames(normCounts)){ geneID = normCounts[gene,"SYMBOL"] genePeaks = CMpeaksAnnotatedPromoters[which(CMpeaksAnnotatedPromoters$Nearest.PromoterID == geneID),] peakNumberExpressionImmPromoters[gene,"peaks"] = nrow(genePeaks) peakNumberExpressionImmPromoters[gene,"meanExpression"] = mean(as.numeric(normCounts[match(geneID,normCounts$SYMBOL),c("SRR8479243","SRR8479245","SRR8479246")]),na.rm=T) } peakNumberExpressionImmPromoters = cbind(peakNumberExpressionImmPromoters,AnnotationDbi::select(salmodb, unlist(lapply(rownames(peakNumberExpressionImmPromoters),getGeneID)), c("SYMBOL", "GENENAME"), "ENTREZID")) ``` Plot the results. ```{r} peakNumberExpressionImmPromoters$peak_group = cut(peakNumberExpressionImmPromoters$peaks,seq(0,3),right=F) # colored plot by peak group means promPeakPlot = peakNumberExpressionImmPromoters%>% group_by(peaks)%>% mutate(mean_peaks=mean(meanExpression,na.rm=T))%>% ggplot(aes(factor(peaks),meanExpression)) + geom_boxplot(aes(fill=mean_peaks)) + scale_x_discrete(labels=c(seq(0,3))) + scale_fill_viridis() + ylab("Normalized expression") + xlab("Number of promoter peaks") + theme_classic() + theme(legend.title=element_blank(), legend.position="none") promPeakPlot ``` Calculate the number of significant **proximal** peaks (excluding promoter peaks) assigned to nearest TSS versus the expression level of that TSS. ```{r} peakNumberExpressionImmProximal = data.frame() for (gene in rownames(normCounts)){ geneID = normCounts[gene,"SYMBOL"] genePeaks = CMpeaksAnnotatedProximal[which(CMpeaksAnnotatedProximal$Nearest.PromoterID == geneID),] peakNumberExpressionImmProximal[gene,"peaks"] = nrow(genePeaks) peakNumberExpressionImmProximal[gene,"meanExpression"] = mean(as.numeric(normCounts[match(geneID,normCounts$SYMBOL),c("SRR8479243","SRR8479245","SRR8479246")]),na.rm=T) } peakNumberExpressionImmProximal = cbind(peakNumberExpressionImmProximal,AnnotationDbi::select(salmodb, unlist(lapply(rownames(peakNumberExpressionImmProximal),getGeneID)), c("SYMBOL", "GENENAME"), "ENTREZID")) ``` Plot the results. ```{r} peakNumberExpressionImmProximal$peak_group = cut(peakNumberExpressionImmProximal$peaks,c(seq(0,7),max(peakNumberExpressionImmProximal$peaks)),right=F) # colored plot by peak group means proxPeakPlot = peakNumberExpressionImmProximal[which(is.na(peakNumberExpressionImmProximal$peak_group)==F),]%>% group_by(peak_group)%>% mutate(mean_peaks=mean(meanExpression,na.rm=T))%>% ggplot(aes(factor(peak_group),meanExpression)) + geom_boxplot(aes(fill=mean_peaks)) + scale_x_discrete(labels=c(seq(0,6),"7+")) + scale_fill_viridis() + ylab("Normalized expression") + xlab("Number of proximal peaks") + theme_classic() + theme(legend.title=element_blank(), legend.position="none") proxPeakPlot ``` Combine plots to a figure. ```{r} ggarrange(peakPlot, promPeakPlot, proxPeakPlot,nrow=1, ncol=3) ``` Combine all peaks and promoter peaks to a single plot so that color scale is the same for all. ```{r} peakNumberExpressionImmProximal$Feature = "Proximal peaks" peakNumberExpressionImmPromoters$Feature = "Promoter peaks" # re-order for plotting peakNumberExpressionImmProximal$Feature = factor(peakNumberExpressionImmProximal$Feature,levels=c("Promoter peaks","Proximal peaks")) plotData = rbind(peakNumberExpressionImmProximal, peakNumberExpressionImmPromoters) # re-calculate peak groups plotData$peak_group = cut(plotData$peaks,c(seq(0,7,0.5),max(plotData$peaks)),right=F) peakExpPlot = plotData[which(is.na(plotData$peak_group)==F),]%>% dplyr::group_by(peak_group, Feature)%>% dplyr::mutate(mean_peaks=median(meanExpression,na.rm=T))%>% ggplot(aes(factor(peak_group),meanExpression)) + geom_boxplot(aes(fill=mean_peaks)) + scale_fill_viridis(option = "D", name="Median normalized expression") + facet_grid(scales = "free",rows=vars(Feature)) + scale_x_discrete(labels=c(seq(0,6),"7+")) + ylab("Normalized expression") + xlab("Number of features") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position="bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) peakExpPlot # test significance of association summary(lm(plotData$meanExpression~plotData$peaks)) # Spearman's correlation cor(plotData$meanExpression,plotData$peaks,method="spearman") cor(plotData$meanExpression[which(plotData$Feature=="Proximal peaks")],plotData$peaks[which(plotData$Feature=="Proximal peaks")],method="spearman") cor(plotData$meanExpression[which(plotData$Feature=="Promoter peaks")],plotData$peaks[which(plotData$Feature=="Promoter peaks")],method="spearman") ``` ## GO analyses Load non-normalized expression values and select for genes that are expressed. ```{r} counts = read.table("featureCountsImmatureNCBI.txt",sep="\t",header=T,row.names=1) counts = cbind(counts,AnnotationDbi::select(salmodb, unlist(lapply(rownames(counts),getGeneID)), c("SYMBOL", "GENENAME"), "ENTREZID")) # select for genes where the average counts in immatures is higher than 10 reads. expressedGenes = counts$SYMBOL[apply(counts,1,function(x){ any(mean(as.numeric(x[1:3]))>10) })] length(expressedGenes) expressedGenesTXs = AnnotationDbi::select(txdb, keys = unique(expressedGenes), columns=columns(txdb), keytype="GENEID") expressedGenesAnnot = AnnotationDbi::select(salmodb, unique(expressedGenesTXs$TXNAME), c("ENTREZID"), "REFSEQ") ``` GO analysis of enhancer associated genes (at least 1 proximal peak). ```{r} # retrieve annotations for proximal genes keys = peakNumberExpressionImm$SYMBOL[which(peakNumberExpressionImm$peaks>0)] peakTXs = AnnotationDbi::select(txdb, keys = keys, columns=columns(txdb), keytype="GENEID") peakAnnot = AnnotationDbi::select(salmodb, unique(peakTXs$TXNAME), c("ENTREZID"), "REFSEQ") # test for GO enrichment peaksEnrichedMF = enrichGO(peakAnnot$ENTREZID, salmodb, ont = "MF", #universe = allAnnot$ENTREZID, universe = expressedGenesAnnot$ENTREZID, pAdjustMethod = "fdr", pvalueCutoff = 0.1) peaksEnrichedMF peaksEnrichedMF$ID peaksEnrichedMF$Description # remove redundancy simplify(peaksEnrichedMF) simplify(peaksEnrichedMF)$ID simplify(peaksEnrichedMF)$Description peaksEnrichedBP = enrichGO(peakAnnot$ENTREZID, salmodb, ont = "BP", #universe = allAnnot$ENTREZID, universe = expressedGenesAnnot$ENTREZID, pAdjustMethod = "fdr", pvalueCutoff = 0.1) peaksEnrichedBP peaksEnrichedBP$ID peaksEnrichedBP$Description ``` Visualization of GO analysis results ```{r} plotData = data.frame(GO=simplify(peaksEnrichedMF)$Description, Pvalue=simplify(peaksEnrichedMF)$p.adjust, Count=simplify(peaksEnrichedMF)$Count) plotData$GO = factor(plotData$GO,levels=plotData$GO[order(plotData$Pvalue)]) ggplot(plotData,aes(GO,Count)) + geom_bar(aes(fill=Pvalue),stat="identity") + scale_fill_viridis(option = "D",name = "P-value") + coord_flip() + scale_x_discrete(labels = function(x) str_wrap(x, width = 25)) + xlab("") + ylab("Count") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=14), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) ``` Save all analyses. ```{r} save.image("ChIPversusTestesExpressionGCnorm.RData") ```