--- title: "Peak Comparison WGD Regions" author: "J-P Verta" date: '2021-01-05' output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(GenomicFeatures) library(ggplot2) library(plyr) library(tidyr) library(AnnotationHub) library(circlize) library(ggrepel) library(ChIPpeakAnno) library(ggpubr) library(viridis) library(RColorBrewer) library(ComplexHeatmap) ``` Load HOMER annotations for CM peaks. ```{r} mergedCMpeaks_HOMER = unique(read.table("filtered_outputPeaksGCnorm_annotated.txt",header=T,sep="\t",row.names=1)) # format Annotation column mergedCMpeaks_HOMER$Detailed.Annotation = unlist(lapply(mergedCMpeaks_HOMER$Annotation,function(x) unlist(strsplit(as.character(x),split=" "))[1])) #mergedCMpeaks_HOMER$Annotation = revalue(mergedCMpeaks_HOMER$Annotation,c("-intron"="intron", "-exon"="exon", "-promoter-TSS"="promoter-TSS", "-TTS"="TTS")) head(mergedCMpeaks_HOMER) ``` ## Peak annotations Read gene models as GRanges and attach annotations. ```{r} # gene models salmonGTF = toGRanges("GCF_000233375.1_ICSASG_v2_genomic.gtf", format="GTF") salmonGeneRange = salmonGTF[which(salmonGTF$type == "gene")] ``` Attach annotation ID's for peaks. ```{r} # define the peak data frame you want to analyze peakRange = makeGRangesFromDataFrame(mergedCMpeaks_HOMER,keep.extra.columns = T) # ranges of nearest promoter IDs # attach missing IDs by identifying overlaps between TRANSCRIPT MODELS and NEAREST PROMOTERS (not peaks them selves) # assign entrez id to PEAKS for (i in 1:length(peakRange)){ r = peakRange[i] tryCatch( expr = { if (is.na(r$Nearest.PromoterID) == F){ promID = r$Nearest.PromoterID nearestPromoterRange = salmonGeneRange[which(salmonGeneRange$gene_id == promID)] peakRange[i]$Entrez.ID = nearestPromoterRange$db_xref } }, error = function(e){ message("* Caught an error on line ", i, " of input file.") print(paste(seqnames(r),start(r),end(r),r$Annotation,r$Nearest.PromoterID)) print(e) } ) } ``` Sizes of peaks - nucleosome pattern. ```{r} library(egg) insertPlot = ggplot(as.data.frame(width(peakRange))) + geom_density(aes(width(peakRange))) + theme_classic() + xlab("") + ylab("") + xlim(400,1000) + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=14), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(colour = "black",size=14), strip.background = element_blank()) ggplot(as.data.frame(width(peakRange))) + geom_density(aes(width(peakRange))) + theme_classic() + xlab("Peak width") + ylab("Density") + annotation_custom( ggplotGrob(insertPlot), xmin = 1000, xmax = 4000, ymin = 0.001, ymax = 0.008) + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=14), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(colour = "black",size=14), strip.background = element_blank()) ``` ## Peak distribution Distribution of peaks into annotated regions. Statistical significance of these overlaps is calculated by HOMER. ```{r} freqMat = as.data.frame(table(mergedCMpeaks_HOMER$Detailed.Annotation)/sum(table(mergedCMpeaks_HOMER$Detailed.Annotation))*100) freqMat <- freqMat %>% arrange(desc(Var1)) %>% mutate(prop = Freq / sum(freqMat$Freq) *100) %>% mutate(ypos = cumsum(prop)- 0.5*prop ) ggplot(freqMat) + geom_bar(aes("",Freq,fill=Var1),stat="identity",width=1,color="white") + ylab("%") + xlab("") + coord_polar("y",start=0) + geom_text_repel(aes("", y=ypos, label=paste(round(prop),"%")), color="white", size=6) + scale_fill_brewer(palette="Set1",name = "Annotation", labels=c("Exon ***", "Intergenic", "Intron ***", "Promoter-TSS ***", "TTS")) + theme_void() ``` ## Homeoblocks Read duplicated genome regions, convert names to NCBI format and transform into GRanges. ```{r} NCBInames = read.table("SsaRefSeqNames.txt",sep="\t",header=T) NCBInames$ChromNum = unlist(lapply(NCBInames$Name,function(x){unlist(strsplit(x,"ssa"))[2]})) WGDregionsX = read.table("Lien_homeologous_blocks_X.txt", sep="\t", header=T) WGDregionsX$Seqnames = NCBInames$RefSeq[match(WGDregionsX$Seqnames, NCBInames$Name)] WGDregionsX$ChrY = NCBInames$RefSeq[match(WGDregionsX$ChrY, NCBInames$Name)] WGDregionsY = read.table("Lien_homeologous_blocks_Y.txt", sep="\t", header=T) WGDregionsY$Seqnames = NCBInames$RefSeq[match(WGDregionsY$Seqnames, NCBInames$Name)] WGDregionsY$ChrX = NCBInames$RefSeq[match(WGDregionsY$ChrX, NCBInames$Name)] WGDrangeX = makeGRangesFromDataFrame(WGDregionsX, keep.extra.columns = T) head(WGDrangeX) WGDrangeY = makeGRangesFromDataFrame(WGDregionsY, keep.extra.columns = T) head(WGDrangeY) ``` Substract WGD regions from whole genome to define non-WGD regions. ```{r} chromLength = read.table("GCF_000233375.1_ICSASG_v2_genomic.chrom.sizes.chromsOnly",sep="\t") colnames(chromLength) = c("Chr","End") chromLength$Start = 1 genomeRange = makeGRangesFromDataFrame(chromLength) # ignore non-mapped scaffolds ``` Overlap peaks with duplicated regions. ```{r} WGDrangeXpeaks = subsetByOverlaps(peakRange, WGDrangeX) WGDrangeYpeaks = subsetByOverlaps(peakRange, WGDrangeY) nonWGDpeaks = peakRange[peakRange %outside% WGDrangeX & peakRange %outside% WGDrangeY] # verify there's not overlap subsetByOverlaps(WGDrangeXpeaks, nonWGDpeaks) subsetByOverlaps(WGDrangeYpeaks, nonWGDpeaks) # verify that sum of peaks split into regions add up to the number of original peaks length(peakRange) == length(nonWGDpeaks) + length(WGDrangeXpeaks) + length(WGDrangeYpeaks) # print number of peaks in each region print(paste("Number of peaks in non-WGD regions:", length(nonWGDpeaks))) print(paste("Number of peaks in WGD-X regions:", length(WGDrangeXpeaks))) print(paste("Number of peaks in WGD-Y regions:", length(WGDrangeYpeaks))) ``` Compare peaks in X and Y WGD regions. Normalize the number of peaks per 100kb of homeoblock for each. ```{r, warning=T} numberOfPeaksPerBlock = data.frame() for (block in unique(WGDrangeX$HomeologBlockName)){ blockRangeX = WGDrangeX[which(WGDrangeX$HomeologBlockName == block)] blockRangeY = WGDrangeY[which(WGDrangeY$HomeologBlockName == block)] WGDpeaksBlockX = subsetByOverlaps(peakRange, blockRangeX) WGDpeaksBlockY = subsetByOverlaps(peakRange, blockRangeY) numberOfPeaksPerBlock[block,"chrX"] = unique(as.character(seqnames(blockRangeX))) numberOfPeaksPerBlock[block,"chrY"] = unique(as.character(seqnames(blockRangeY))) numberOfPeaksPerBlock[block,"minStartX"] = min(start(blockRangeX)) numberOfPeaksPerBlock[block,"minStartY"] = min(start(blockRangeY)) numberOfPeaksPerBlock[block,"maxEndX"] = max(end(blockRangeX)) numberOfPeaksPerBlock[block,"maxEndY"] = max(end(blockRangeY)) numberOfPeaksPerBlock[block,"perX"] = length(WGDpeaksBlockX) numberOfPeaksPerBlock[block,"perY"] = length(WGDpeaksBlockY) numberOfPeaksPerBlock[block,"per100kbX"] = length(WGDpeaksBlockX)/(sum(end(blockRangeX)-start(blockRangeX))/100000) numberOfPeaksPerBlock[block,"per100kbY"] = length(WGDpeaksBlockY)/(sum(end(blockRangeY)-start(blockRangeY))/100000) } numberOfPeaksPerBlock ``` Plot the results. ```{r} # plot correlation between number of peaks peaksPerBlockPlot = ggplot(numberOfPeaksPerBlock, aes(perX,perY)) + geom_smooth(method="lm") + geom_point(alpha=0.5) + xlab("Peaks") + ylab("Peaks") + theme(axis.text.x=element_text(size=14,color="black"), axis.title=element_text(size=14,color="black"), axis.title.x=element_blank(), axis.text.y=element_text(size=14,color="black"), legend.position = "none") + theme_classic() + xlim(0,600) + ylim(0,600) + #stat_cor(label.y = 600, size=3) + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) peaksPerBlockPlot summary(lm(numberOfPeaksPerBlock$perY ~ numberOfPeaksPerBlock$perX)) numberOfPeaksPerBlock$perXPerYresiduals = lm(numberOfPeaksPerBlock$perY ~ numberOfPeaksPerBlock$perX)$residuals summary(lm(numberOfPeaksPerBlock$per100kbY ~ numberOfPeaksPerBlock$per100kbX)) numberOfPeaksPerBlock$per100kbXper100kbYresiduals = lm(numberOfPeaksPerBlock$per100kbY ~ numberOfPeaksPerBlock$per100kbX)$residuals # residuals from 1:1 amount numberOfPeaksPerBlock$per100kbOneToOneResiduals = lm(numberOfPeaksPerBlock$per100kbY - numberOfPeaksPerBlock$per100kbX ~ 0)$residuals # gradient color for homeoblocks based on residuals from 1:1 normalized number of peaks getPaletteResid = colorRampPalette(brewer.pal(9, "Blues"))(9) homeoblockColsResid = getPaletteResid[cut(abs(numberOfPeaksPerBlock$per100kbOneToOneResiduals),9)] names(homeoblockColsResid) = rownames(numberOfPeaksPerBlock) # plot correlation between number of peaks normalized by block length (in 100 kb) (i.e. peak "density") normPeaksPerBlockPlot = ggplot(numberOfPeaksPerBlock, aes(per100kbX,per100kbY)) + geom_abline(color="grey",linetype="dashed") + geom_point(alpha=0.5,color="black",size=2) + geom_point(color=homeoblockColsResid[rownames(numberOfPeaksPerBlock)]) + xlab("Peak density (per 100 Kb)") + ylab("Peak density (per 100 Kb)") + theme(axis.text.x=element_text(size=14,color="black"), axis.title=element_text(size=14,color="black"), axis.text.y=element_text(size=14,color="black"), legend.position = "none") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) normPeaksPerBlockPlot ``` Top homeoblock by divergence in peak numbers. ```{r} #top 10 block with most divergence in residuals head(numberOfPeaksPerBlock[order(abs(numberOfPeaksPerBlock$per100kbOneToOneResiduals), decreasing = T),], n=10) topBlocks = head(numberOfPeaksPerBlock[order(abs(numberOfPeaksPerBlock$per100kbOneToOneResiduals), decreasing = T),], n=10) table(topBlocks$chrX) table(topBlocks$chrY) # chr19 : 3 out of 10 top blocks - test using hypergeometric test whiteBallsDrawn=3 whiteBallsTotal=length(which(numberOfPeaksPerBlock$chrX %in% "NC_027318.1" | numberOfPeaksPerBlock$chrY %in% "NC_027318.1")) blackBallsTotal=length(which(numberOfPeaksPerBlock$chrX %in% "NC_027318.1" == F & numberOfPeaksPerBlock$chrY %in% "NC_027318.1"== F)) drawnBallsTotal=10 phyper(whiteBallsDrawn,whiteBallsTotal,blackBallsTotal,drawnBallsTotal,lower.tail = F) # chr01 : 4 out of 10 top blocks - test using hypergeometric test whiteBallsDrawn=4 whiteBallsTotal=length(which(numberOfPeaksPerBlock$chrX %in% "NC_027300.1" | numberOfPeaksPerBlock$chrY %in% "NC_027300.1")) blackBallsTotal=length(which(numberOfPeaksPerBlock$chrX %in% "NC_027300.1" == F & numberOfPeaksPerBlock$chrY %in% "NC_027300.1"== F)) drawnBallsTotal=10 phyper(whiteBallsDrawn,whiteBallsTotal,blackBallsTotal,drawnBallsTotal,lower.tail = F) # chr09 : 3 out of 10 top blocks - test using hypergeometric test whiteBallsDrawn=3 whiteBallsTotal=length(which(numberOfPeaksPerBlock$chrX %in% "NC_027308.1" | numberOfPeaksPerBlock$chrY %in% "NC_027308.1")) blackBallsTotal=length(which(numberOfPeaksPerBlock$chrX %in% "NC_027308.1" == F & numberOfPeaksPerBlock$chrY %in% "NC_027308.1"== F)) drawnBallsTotal=10 phyper(whiteBallsDrawn,whiteBallsTotal,blackBallsTotal,drawnBallsTotal,lower.tail = F) # added length of top 10 diverged homeoblocks sum(topBlocks$maxEndX-topBlocks$minStartX,topBlocks$maxEndY-topBlocks$minStartY)/1000000 # length of all homeoblocks together sum(numberOfPeaksPerBlock$maxEndX-numberOfPeaksPerBlock$minStartX,numberOfPeaksPerBlock$maxEndY-numberOfPeaksPerBlock$minStartY)/1000000 ``` Homeoblock length compared to difference. ```{r} # mean block lengths numberOfPeaksPerBlock$meanBlockLength = apply(data.frame(abs(numberOfPeaksPerBlock$maxEndX-numberOfPeaksPerBlock$minStartX),abs(numberOfPeaksPerBlock$maxEndY-numberOfPeaksPerBlock$minStartY)),1,mean) # plot blockLenghtVsDiffPlot = ggplot(numberOfPeaksPerBlock,aes(meanBlockLength/1000000,abs(per100kbOneToOneResiduals))) + geom_point() + geom_point(alpha=0.5,color="black",size=2) + geom_point(color=homeoblockColsResid[rownames(numberOfPeaksPerBlock)]) + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) + ylab("Residuals from 1:1 peak density") + xlab("Block length (Mb)") blockLenghtVsDiffPlot ``` ## Homeoblock comparisons Compare divergence in peak loci to overall divergence of homeoblocks - are peak loci diverging slower than the blocks overall or just following sequence divergence of the blocks. Comparison of number of peaks between homeoblock groups with young versus old rediploidization age. (hypothesis: older rediploidization age leads to larger difference in peak numbers - independent evolution of elements in homeoblocks - younger rediploidization age corresponds to more similar numbers of peaks because they have remained "functionally similar"s for a longer time). ```{r} diploidAgeYoung = c( "3q-6p", "16qp-17qa", "2p-5q", "12qa-2q", "17qb-7q", "11qa-26", "4p-8q", "20qb-9qc" ) diploidAgeOld = c( "14qb-27", "5p-9qb", "11qb-4q", "19qb-29", "1qa-28", "15qa-6q", "18qa-1qa", "19qb-28", "21-25", "10qb-16qa", "19qa-29", "14qa-3p", "12qb-22", "20qa-24", "18qb-7p", "13qb-1qb", "13qb-4q", "10qa-23", "13qa-15qb", "1p-9qa", "10qa-16qa", "10qb-23", "11qb-1qb" ) WGDrangeX$rediploidizationAge = NA for (block in WGDrangeX$Block){ homeoBlock = as.character(WGDrangeX$HomeologBlockName[block]) if (strsplit(homeoBlock,"_")[[1]][1] %in% diploidAgeYoung){ WGDrangeX$rediploidizationAge[block] = "<40Mya" } if (strsplit(homeoBlock,"_")[[1]][1] %in% diploidAgeOld){ WGDrangeX$rediploidizationAge[block] = ">40Mya" } else { } } WGDrangeY$rediploidizationAge = NA for (block in WGDrangeY$Block){ homeoBlock = as.character(WGDrangeY$HomeologBlockName[block]) if (strsplit(homeoBlock,"_")[[1]][1] %in% diploidAgeYoung){ WGDrangeY$rediploidizationAge[block] = "<40Mya" } if (strsplit(homeoBlock,"_")[[1]][1] %in% diploidAgeOld){ WGDrangeY$rediploidizationAge[block] = ">40Mya" } else { } } numberOfPeaksPerBlock$block = sapply(rownames(numberOfPeaksPerBlock),function(x){strsplit(x,"_")[[1]][1]}) numberOfPeaksPerBlock$rediploidizationAge = NA for (block in 1:nrow(numberOfPeaksPerBlock)){ homeoBlock = rownames(numberOfPeaksPerBlock)[block] if (strsplit(homeoBlock,"_")[[1]][1] %in% diploidAgeYoung){ numberOfPeaksPerBlock$rediploidizationAge[block] = "<40Mya" } if (strsplit(homeoBlock,"_")[[1]][1] %in% diploidAgeOld){ numberOfPeaksPerBlock$rediploidizationAge[block] = ">40Mya" } else { } } ``` Plot and test. ```{r} # plot differences in normalized number of peaks my_comparisons <- list(c("<40Mya",">40Mya")) peakReploidPlot = ggplot(numberOfPeaksPerBlock[which(is.na(numberOfPeaksPerBlock$rediploidizationAge)==F),], aes(rediploidizationAge,abs(per100kbX-per100kbY))) + geom_boxplot() + ylab("Difference in peak density") + theme_classic() + xlab("Reploidization age") + stat_compare_means(aes(label = paste("p =", ..p.format..)), size=3, paired = F, method="wilcox",label.x=1.5,label.y=2.3) + theme(legend.title=element_text(colour = "black"), legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) peakReploidPlot # check for normality etc ggqqplot(numberOfPeaksPerBlock$per100kbX[which(numberOfPeaksPerBlock$rediploidizationAge==">40Mya")]) shapiro.test(numberOfPeaksPerBlock$per100kbX[which(numberOfPeaksPerBlock$rediploidizationAge==">40Mya")]) ggqqplot(numberOfPeaksPerBlock$per100kbX[which(numberOfPeaksPerBlock$rediploidizationAge=="<40Mya")]) shapiro.test(numberOfPeaksPerBlock$per100kbX[which(numberOfPeaksPerBlock$rediploidizationAge=="<40Mya")]) # test with a non-parametric test because data deviates from normal distribution wilcox.test(abs(numberOfPeaksPerBlock$per100kbX-numberOfPeaksPerBlock$per100kbY)~numberOfPeaksPerBlock$rediploidizationAge) ``` Combine into a figure. ```{r} ggpubr::ggarrange(peaksPerBlockPlot, normPeaksPerBlockPlot, blockLenghtVsDiffPlot, peakReploidPlot, labels=c("A","B","C","D"), nrow=1, ncol=4) ``` Load expression analysis results. ```{r} load("ChIPversusTestesExpressionGCnorm.RData") ``` Calculate peak and gene density along genome in bins. ```{r} # re-order chromosome lengths based on chromsome name chromLength = chromLength[match(NCBInames$RefSeq[1:29],chromLength$Chr),] # chromosome lengths salmoSeqlengths = chromLength$End names(salmoSeqlengths) = chromLength$Chr # 1 MB bins bins = tileGenome(salmoSeqlengths, tilewidth=1000000,cut.last.tile.in.chrom=TRUE) # finciton to count # of bins (or reads) per bin BinChIPseq = function( reads, bins ){ mcols(bins)$score = countOverlaps( bins, reads ) return( bins ) } # number of peaks per 1MB bins binnedCMpeaks = BinChIPseq(peakRange,bins) # add chromosome lengths binnedCMpeaks$chromLength = chromLength$End[match(as.character(seqnames(binnedCMpeaks)),chromLength$Chr)] # number of expressed genes per 1MB bins binnedGenes = BinChIPseq(salmonGeneRange[which(salmonGeneRange$gene %in% expressedGenes)],bins) # add chromosome lengths binnedGenes$chromLength = chromLength$End[match(as.character(seqnames(binnedGenes)),chromLength$Chr)] ``` Plot binned peaks versus binned expressed genes. ```{r} plotData = data.frame(chrom=seqnames(binnedCMpeaks),ranges(binnedCMpeaks),peaks=binnedCMpeaks$score,genes=binnedGenes$score) ggplot(plotData,aes(peaks,genes)) + geom_point(alpha=0.5,color="black",size=2) + geom_smooth(method="lm") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) + xlab("Peaks per MB") + ylab("Genes per MB") summary(lm(plotData$peaks ~ plotData$genes)) ggplot(plotData,aes(peaks,genes)) + geom_point(alpha=0.5,color="black",size=2) + facet_wrap(~chrom) + geom_smooth(method="lm") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) + xlab("Peaks per MB") + ylab("Genes per MB") ``` ## Circos plot Calculate color for blocks based on normalized number of blocks. Calculate colors for links based on residuals from 1:1 number of peaks per homeoblock. ```{r, message=FALSE} # colors for H3K27ac peakColVir = cut_interval(binnedCMpeaks$score, n=10, labels=rev(viridis_pal(option="B")(10))) binnedCMpeaks$color = peakColVir # gradient color for homeoblocks blockBed = cbind(rbind( setNames(numberOfPeaksPerBlock[,c("chrX","minStartX","maxEndX")],c("seqnames","start","end")), setNames(numberOfPeaksPerBlock[,c("chrY","minStartY","maxEndY")],c("seqnames","start","end"))), per100kb=c(numberOfPeaksPerBlock$per100kbX,numberOfPeaksPerBlock$per100kbY), HomeologBlockName = rep(rownames(numberOfPeaksPerBlock),2)) getPaletteBlocks = colorRampPalette(brewer.pal(9, "BuPu"))(9) blockBed$blockCol = getPaletteBlocks[cut(abs(blockBed$per100kb),9)] # add chromosome lengths blockBed$chromLength = chromLength$End[match(blockBed$seqnames,chromLength$Chr)] # gradient color for links getPaletteLinks = colorRampPalette(brewer.pal(9, "Blues"))(9) homeoblockColsLinks = getPaletteLinks[cut(abs(numberOfPeaksPerBlock$per100kbOneToOneResiduals),9)] names(homeoblockColsLinks) = rownames(numberOfPeaksPerBlock) ``` Plot. ```{r, message=FALSE} circos.clear() # set gap between sectors circos.par(gap.degree=2,start.degree=90,track.height = 0.3) circosPlot = function(){ # salmon chromosomes #circos.genomicInitialize(genomeRange, sector.names = NCBInames$ChromNum[1:29],labels.cex=0.5) circos.genomicInitialize(genomeRange,plotType="labels",sector.names = NCBInames$Name[match(NCBInames$RefSeq,levels(seqnames(binnedCMpeaks)))[1:29]],labels.cex=0.5) # gap between tracks set_track_gap(mm_h(0)) # peak density histogram with y-axis lines at background circos.genomicTrack(as.data.frame(binnedCMpeaks)[,c("seqnames","start","end","score","color","chromLength")], bg.border = NA, ylim=c(0,53), panel.fun = function(region, value, ...) { circos.segments(x0=0,x1=value[,3],y0=10,y1=10,lwd=0.3,col="grey90") circos.segments(x0=0,x1=value[,3],y0=20,y1=20,lwd=0.3,col="grey90") circos.segments(x0=0,x1=value[,3],y0=30,y1=30,lwd=0.3,col="grey90") circos.segments(x0=0,x1=value[,3],y0=40,y1=40,lwd=0.3,col="grey90") circos.segments(x0=0,x1=value[,3],y0=50,y1=50,lwd=0.3,col="grey90") circos.genomicLines(region, value[,1], area=T, col="white", lwd=1.2, baseline=0) circos.genomicLines(region, value[,1], type="h", col=as.character(value$color), lwd=1.2, baseline=0) # color coding hack here! }) # x-axis on all segments brk <- c(0,20,40,60,80,100,120,140,160)*10^6 circos.track(track.index = get.current.track.index(), panel.fun = function(x, y) { circos.axis(h="bottom",major.at=brk,minor.ticks=0,direction="inside",labels=round(brk/10^6,1),labels.cex=0.25,col="black",labels.col="black",lwd=0.7,labels.facing="clockwise",major.tick.length=c(1)) },bg.border=F) # y-axis on last segment circos.yaxis(at=c(10,20,30,40,50),labels.cex=0.25,lwd=0,tick.length=0,labels.col="black",col="#FFFFFF") # gap between tracks set_track_gap(mm_h(3)) # homeoblock barplot and y-axis circos.genomicTrack(blockBed,track.height = 0.05, bg.border = NA, ylim=c(0,1.5), panel.fun = function(region, value, ...) { circos.segments(x0=0,x1=value[,4],y0=1,y1=1,lwd=0.3,col="grey90") circos.segments(x0=0,x1=value[,4],y0=2,y1=2,lwd=0.3,col="grey90") circos.genomicRect(region, value[,1], ytop.column = 1, ybottom = 0, col = as.character(value$blockCol), border=NA) }) # y-axis on last segment circos.yaxis(at=c(1,2,3),labels.cex=0.25,lwd=0,tick.length=0,labels.col="black",col="#FFFFFF") # gap between tracks set_track_gap(mm_h(0)) # homeoblock links circos.genomicLink(WGDregionsX[,c("Seqnames","Start","End")], WGDregionsX[,c("ChrY","StartY","EndY")], col = homeoblockColsLinks[as.character(as.data.frame(WGDrangeY)$HomeologBlockName)], border = "black", lwd=0.3) } circosPlot() ``` ## Homeologs from protein BLAST ```{r} library(biomaRt) proteinHomeologs = read.table("Salmo_salar.ICSASG_v2.pep.all.selfBlast.txt",sep="\t",header=T) # number of genes that have retained a copy as per our reciprocal mRNA BLAST best match analysis length(which(is.na(proteinHomeologs$Homeolog2)==F))/length(which(is.na(proteinHomeologs$Homeolog1)==F)) * 100 proteinHomeologs$ensemblShortGene1 = unlist(lapply(proteinHomeologs$Homeolog1,function(x){unlist(strsplit(x, "\\."))[1]})) proteinHomeologs$ensemblShortGene2 = unlist(lapply(proteinHomeologs$Homeolog2,function(x){unlist(strsplit(x, "\\."))[1]})) # map ensembl ID to entrez ID ensembl = useMart("ensembl") datasets = listDatasets(ensembl) datasets[177,] ensembl = useDataset("ssalar_gene_ensembl",mart=ensembl) attributes = listAttributes(ensembl) filters = listFilters(ensembl) # get name match (unordered) for gene 1 gene1names = getBM( filters="ensembl_gene_id", attributes=c("ensembl_gene_id", "entrezgene_id"), values=proteinHomeologs$ensemblShortGene1, mart=ensembl) # verify that it matches all(gene1names$ensembl_gene_id %in% proteinHomeologs$ensemblShortGene1) # get name match (unordered) for gene 2 gene2names = getBM( filters="ensembl_gene_id", attributes=c("ensembl_gene_id", "entrezgene_id"), values=proteinHomeologs$ensemblShortGene2, mart=ensembl) # verify that it matches all(gene2names$ensembl_gene_id %in% proteinHomeologs$ensemblShortGene2) # attach entrez IDs to homeologs proteinHomeologs$entrezGene1 = gene1names$entrezgene_id[match( proteinHomeologs$ensemblShortGene1,gene1names$ensembl_gene_id)] proteinHomeologs$entrezGene2 = gene2names$entrezgene_id[match( proteinHomeologs$ensemblShortGene2,gene2names$ensembl_gene_id)] # count peaks per homeolog for (i in 1:nrow(proteinHomeologs)){ h1 = paste("GeneID",proteinHomeologs[i,"entrezGene1"],sep=":") h2 = paste("GeneID",proteinHomeologs[i,"entrezGene2"],sep=":") if (length(h1) > 0 & length(h2) >0){ h1peaks = peakRange[which(peakRange$Entrez.ID == as.character(h1))] h2peaks = peakRange[which(peakRange$Entrez.ID == as.character(h2))] h1promPeaks = peakRange[which(peakRange$Entrez.ID == as.character(h1) & peakRange$Detailed.Annotation=="promoter-TSS")] h2promPeaks = peakRange[which(peakRange$Entrez.ID == as.character(h2) & peakRange$Detailed.Annotation=="promoter-TSS")] h1proxPeaks = peakRange[which(peakRange$Entrez.ID == as.character(h1) & peakRange$Detailed.Annotation!="promoter-TSS")] h2proxPeaks = peakRange[which(peakRange$Entrez.ID == as.character(h2) & peakRange$Detailed.Annotation!="promoter-TSS")] proteinHomeologs[i,"Homeolog1peaks"] = length(h1peaks) proteinHomeologs[i,"Homeolog2peaks"] = length(h2peaks) proteinHomeologs[i,"Homeolog1promPeaks"] = length(h1promPeaks) proteinHomeologs[i,"Homeolog2promPeaks"] = length(h2promPeaks) proteinHomeologs[i,"Homeolog1proxPeaks"] = length(h1proxPeaks) proteinHomeologs[i,"Homeolog2proxPeaks"] = length(h2proxPeaks) } } # assign homeologs to homeoblocks proteinHomeologs$Homeoblock = NA for (i in 1:nrow(proteinHomeologs)){ h1 = paste("GeneID",proteinHomeologs[i,"entrezGene1"],sep=":") h2 = paste("GeneID",proteinHomeologs[i,"entrezGene2"],sep=":") if (is.na(h1) == F & is.na(h2) == F){ # get coordinates h1pos = salmonGeneRange[which(salmonGeneRange$db_xref == h1)] h2pos = salmonGeneRange[which(salmonGeneRange$db_xref == h2)] # overlap with WGD regions h1block = unlist(list(WGDrangeX$HomeologBlockName[subjectHits(findOverlaps(h1pos,WGDrangeX))],WGDrangeY$HomeologBlockName[subjectHits(findOverlaps(h1pos,WGDrangeY))])) h2block = unlist(list(WGDrangeX$HomeologBlockName[subjectHits(findOverlaps(h2pos,WGDrangeX))],WGDrangeY$HomeologBlockName[subjectHits(findOverlaps(h2pos,WGDrangeY))])) if (length(h1block) == 1 & length(h2block) == 1){ if (h1block == h2block){ proteinHomeologs[i,"Homeoblock"] = as.character(h1block) } else { proteinHomeologs[i,"Homeoblock"] = "conflictingBlocks" } } else { proteinHomeologs[i,"Homeoblock"] = "conflictingBlocks" } } else { proteinHomeologs[i,"Homeoblock"] = NA } } # change homeoblock to a factor and sort by size to facilitate plotting in order blockLengths = c() for (block in unique(WGDrangeX$HomeologBlockName)){ blockRangeX = WGDrangeX[which(WGDrangeX$HomeologBlockName == block)] blockLengths[block] = sum(end(blockRangeX))-sum(start(blockRangeX)) } proteinHomeologs$Homeoblock = factor(proteinHomeologs$Homeoblock,levels = names(blockLengths)[order(blockLengths,decreasing = T)]) ``` Plot the results. ```{r} # plot for genes where at least 1 peak in both homeologs ggplot(proteinHomeologs[which(proteinHomeologs$Homeolog1peaks>0 & proteinHomeologs$Homeolog2peaks>0),],aes(Homeolog1peaks,Homeolog2peaks)) + geom_point(alpha=0.2) + theme_classic() + xlim(0,15) + ylim(0,15) + geom_abline() # plot per homeoblock for genes where at least 1 peak in both homeologs ggplot(proteinHomeologs[which(proteinHomeologs$Homeolog1peaks>0 & proteinHomeologs$Homeolog2peaks>0 & proteinHomeologs$Homeoblock!="conflictingBlocks"),],aes(Homeolog1peaks,Homeolog2peaks,group=Homeoblock)) + geom_abline(color="grey",linetype="dashed") + geom_point(alpha=0.2,position="jitter") + facet_wrap(~Homeoblock) + theme_classic() + scale_x_continuous(breaks=c(1,3,5,7,9),limits=c(0,9.5)) + scale_y_continuous(breaks=c(1,3,5,7,9),limits=c(0,9.5)) + xlab("Peaks per homeolog A") + ylab("Peaks per homeolog B") ``` Alternative plot ```{r} ggplot(proteinHomeologs[which(proteinHomeologs$Homeolog1peaks>=0 & proteinHomeologs$Homeolog2peaks>=0 & proteinHomeologs$Homeoblock!="conflictingBlocks"),],aes(Homeolog1peaks,Homeolog2peaks)) + stat_density_2d(aes(fill = ..density..), geom = "raster", contour = F, contour_var="count", adjust=3) + scale_fill_distiller(palette=4, direction=-1) + geom_point(color="black",alpha=0.2,size=1,shape=21) + theme_classic() + xlab("Peaks per homeolog A") + ylab("Peaks per homeolog B") + theme(legend.title=element_text(colour = "black"), legend.position = "none", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) + scale_x_continuous(expand = c(0, 0),breaks=c(0,5,10,15)) + scale_y_continuous(expand = c(0, 0),breaks=c(0,5,10,15)) ``` Correlation of peak number taking into account the expression level of the gene. ```{r} head(peakNumberExpressionImm) # expression levels proteinHomeologs$Homeolog1meanExpression = peakNumberExpressionImm[paste("GeneID",proteinHomeologs$entrezGene1,sep=":"),"meanExpression"] proteinHomeologs$Homeolog2meanExpression = peakNumberExpressionImm[paste("GeneID",proteinHomeologs$entrezGene2,sep=":"),"meanExpression"] # difference in expression proteinHomeologs$expressionDiff = abs(proteinHomeologs$Homeolog1meanExpression-proteinHomeologs$Homeolog2meanExpression) # differences in peak numbers proteinHomeologs$absPeakDiff = abs(proteinHomeologs$Homeolog1peaks-proteinHomeologs$Homeolog2peaks) # differences in promoter peak numbers proteinHomeologs$absPromPeakDiff = abs(proteinHomeologs$Homeolog1promPeaks-proteinHomeologs$Homeolog2promPeaks) # differences in proximal peak numbers proteinHomeologs$absProxPeakDiff = abs(proteinHomeologs$Homeolog1proxPeaks-proteinHomeologs$Homeolog2proxPeaks) # group genes by differences in peak numbers proteinHomeologs$peakDiffGroup = cut(proteinHomeologs$absPeakDiff,c(seq(0,7)),right=F) # group genes by differences in promoter peak numbers proteinHomeologs$promPeakDiffGroup = cut(proteinHomeologs$absPromPeakDiff,c(seq(0,3)),right=F) # group genes by differences in proximal peak numbers proteinHomeologs$proxPeakDiffGroup = cut(proteinHomeologs$absProxPeakDiff,c(seq(0,7)),right=F) ``` Plot ```{r} plotData = proteinHomeologs[which(proteinHomeologs$entrezGene1 != "NA" & proteinHomeologs$entrezGene2 != "NA"),] plotData%>% dplyr::group_by(factor(peakDiffGroup))%>% dplyr::mutate(mean_expr_diff=median(expressionDiff,na.rm=T))%>% ggplot(aes(factor(peakDiffGroup),expressionDiff)) + geom_boxplot(aes(fill=mean_expr_diff)) + scale_fill_viridis(option = "C",name = "Median expression difference") + scale_x_discrete(labels=c(seq(0,6),"7+")) + ylab("Normalized expression difference") + xlab("Difference in number of peaks") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) # test summary(lm(plotData$expressionDiff~plotData$absPeakDiff)) plotData = proteinHomeologs[which(proteinHomeologs$entrezGene1 != "NA" & proteinHomeologs$entrezGene2 != "NA"),] plotData%>% dplyr::group_by(factor(promPeakDiffGroup))%>% dplyr::mutate(mean_expr_diff=median(expressionDiff,na.rm=T))%>% ggplot(aes(factor(promPeakDiffGroup),expressionDiff)) + geom_boxplot(aes(fill=mean_expr_diff)) + scale_fill_viridis(option = "C",name = "Median expression difference") + scale_x_discrete(labels=c(seq(0,16),"17+")) + ylab("Normalized expression difference") + xlab("Difference in number of peaks") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "none", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) # test summary(lm(plotData$expressionDiff~plotData$absPromPeakDiff)) plotData = proteinHomeologs[which(proteinHomeologs$entrezGene1 != "NA" & proteinHomeologs$entrezGene2 != "NA"),] plotData%>% dplyr::group_by(factor(proxPeakDiffGroup))%>% dplyr::mutate(mean_expr_diff=median(expressionDiff,na.rm=T))%>% ggplot(aes(factor(proxPeakDiffGroup),expressionDiff)) + geom_boxplot(aes(fill=mean_expr_diff)) + scale_fill_viridis(option = "C",name = "Median expression difference") + scale_x_discrete(labels=c(seq(0,6),"7+")) + ylab("Normalized expression difference") + xlab("Difference in number of peaks") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) # test summary(lm(plotData$expressionDiff~plotData$absProxPeakDiff)) ``` Combine expression plots to a single figure ```{r} plotDataProxPeaks = subset(proteinHomeologs[which(proteinHomeologs$Homeolog2 != "NA"),],select=-c(promPeakDiffGroup,peakDiffGroup)) plotDataProxPeaks$Feature = "Proximal peaks" colnames(plotDataProxPeaks)[20] = "PeakDiffGroup" plotDataPromPeaks = subset(proteinHomeologs[which(proteinHomeologs$Homeolog2 != "NA"),],select=-c(proxPeakDiffGroup,peakDiffGroup)) plotDataPromPeaks$Feature = "Promoter peaks" colnames(plotDataPromPeaks)[20] = "PeakDiffGroup" plotData = rbind(plotDataProxPeaks, plotDataPromPeaks) peakDiffPlot=plotData%>% dplyr::group_by(factor(PeakDiffGroup))%>% dplyr::mutate(mean_expr_diff=median(expressionDiff,na.rm=T))%>% ggplot(aes(factor(PeakDiffGroup),expressionDiff)) + geom_boxplot(aes(fill=mean_expr_diff)) + scale_fill_viridis(option = "C",name = "Median expression difference") + scale_x_discrete(labels=c(seq(0,6),"7+")) + facet_grid(scales = "free",rows=vars(Feature)) + ylab("Normalized ohnolog expression difference") + xlab("Difference in ohnolog feature numbers") + theme_classic() + theme(legend.title=element_text(colour = "black"), legend.position = "bottom", legend.text = element_text(colour = "black",size=8), axis.text.y = element_text(colour = "black",size=8), axis.text.x = element_text(colour = "black",size=8), strip.background = element_blank()) ggpubr::ggarrange(peakExpPlot, peakDiffPlot, nrow=1, ncol=2, labels=c("A","B")) ``` Save all analyses. ```{r} save.image("PeakComparisonWGDregionsGCnorm.RData") ```