library(msap) library(geepack) library(fdrtool) library(ggplot2) # Add some code to msap:::pcoa function to make objects accessible # This allows for making custom PCoA plots later trace(msap:::pcoa, edit=T) # insert the following code at the end #if (surname == "MSL") { # pcoa.msl <<- list(var1, var2, spcoo, pcol$li, groups) #} #if (surname == "NML") { # pcoa.nml <<- list(var1, var2, spcoo, pcol$li, groups) #} setwd("H:/09 Manuscripts/02 Epigenetics/Mol Ecol/Dryad") epi <- read.table("msap_transformed_scores.txt", header=TRUE) epi <- epi[order(epi$Pop),] o <- capture.output(msap.out <- msap("msap_raw_scores.csv", no.bands="u", meth=TRUE, loci.per.primer=c(18,17,17,13,17,18,15,14), error.rate.primer=c(0.17,0.14,0.13,0.17,0.12,0.15,0.17,0.21), do.amova=T, do.cluster=F, do.pairwisePhiST=T)) # browse "o" to see msap output and extract basic information o ##### Shannon analysis ##### msl.i <- apply(msap.out$transformed.MSL[,-c(1,2)],1,shannon) nml.i <- apply(msap.out$transformed.NML[,-c(1,2)],1,shannon) plot(msl.i~epi$Worms) cor.test(msl.i,epi$Worms, method="spearman") # no correlation between worms and diversity # population-level i.median <- tapply(msl.i, epi$Pop, median) i2.median <- tapply(nml.i, epi$Pop, median) tmp <- na.exclude(epi) w.median <- tapply(tmp$Worms, tmp$Pop, median) plot(i.median~w.median) cor.test(i.median, w.median, method="spearman") # no correlation between worms and diversity ##### AMOVA analyses ##### ### Custom AMOVA with two levels # msap.out$DM.NML is NULL for some reason -> make distance matrix manually nmld <- lingoes(as.dist(smc(as.matrix(msap.out$transformed.NML[-c(1,2)]), dist=T))) dmat <- list(msap.out$DM.MSL, nmld) amovapops <- msap.out$transformed.MSL$groups pgroups <- rep(1, times=length(amovapops)) # group pops into parasite groups (Figure 1 in paper) pgroups[which(amovapops %in% c(7,9,10,12,14,15,19))] <- 2 pgroups[which(amovapops %in% c(8,16,17,18,20,21))] <- 3 pgroups <- as.factor(pgroups) for(a in 1:2) { mat <- dmat[[a]] amova.out <- amova(mat~pgroups/amovapops, nperm=10000) print(amova.out) VarP <- amova.out$varcomp[,1]/sum(amova.out$varcomp[,1]) # Var % print(paste("Variance percentages:",VarP*100)) print(paste("FCT", amova.out$varcomp[1,1]/sum(amova.out$varcomp[,1]))) print(paste("FSC", amova.out$varcomp[2,1]/sum(amova.out$varcomp[2:3,1]))) print(paste("FST", 1-VarP[3])) } ### parse msap output to get pairwise phiST raw <- grep(" - ", o, value=TRUE)[-1] p1 <- as.numeric(substr(raw, 1,2)) s <- regexpr("- ", raw) p2 <- as.numeric(substr(raw, s+2 ,s+4)) ph <- as.numeric(substr(raw, regexpr(":", raw)+2, regexpr("\t", raw)-2)) pvals <- as.numeric(substr(raw, regexpr("P", raw)+2, regexpr(")", raw)-1)) breakpoint <- length(p1)/2 # first half is MSL, second half is NML length(which(p.adjust(pvals[1:breakpoint], method="fdr")<0.05)) # number of significant phiSTs length(which(p.adjust(pvals[(breakpoint+1):(breakpoint*2)], method="fdr")<0.05)) # number of significant phiSTs tmp1 <- p1[(breakpoint+1):length(p1)] tmp2 <- p2[(breakpoint+1):length(p1)] p1 <- c(p1[1:breakpoint], tmp2) p2 <- c(p2[1:breakpoint], tmp1) mtype <- c(rep("MSL", times=breakpoint), rep("NML", times=breakpoint)) phi <- data.frame(p1,p2,ph,mtype) # plot as heatmap phi[which(phi[,3]<0),3] <- 0 # don't allow negative phi diag <- data.frame(seq(1,21), seq(1,21)) names(diag) <- c("V1", "V2") ggplot(phi, aes(x=p1, y=p2)) + geom_tile(aes(fill=ph)) + geom_point(data=diag, aes(x=V1, y=V2), colour="black", size=3, shape=20) + scale_fill_gradientn(colours=c("ghostwhite", "gold", "red"), breaks=seq(0,0.09,by=0.015)) + scale_x_discrete(limits=seq(1,21)) + scale_y_discrete(limits=seq(1,21)) + labs(fill="Differentiation\n ", x="Population", y="Population") + coord_equal() + theme_classic() + theme(axis.title.x = element_text(vjust=0, size=12), axis.title.y = element_text(vjust=0.2, size=12), axis.text = element_text(size=8), legend.title = element_text(size=10), plot.margin=unit(c(0,0,2,2), "mm"), legend.text = element_text(size=10), legend.key.height=unit(10, "mm")) ### Mantel tests # correlate MSL vs. NML (msap function doesn't work, for some reason) mantel.rtest(dmat[[1]], dmat[[2]], nrepet=9999) # correlate MSL/NML with pairwise geographic distance and medians in worm counts #phi.data <- phi[phi$mtype=="MSL",] phi.data <- phi[phi$mtype=="NML",] an <- with(data.frame(phi.data), sort(unique(c(as.numeric(p1), as.numeric(p2))))) M <- array(0, c(length(an), length(an)), list(an, an)) i <- match(phi.data[,1], an) j <- match(phi.data[,2], an) M[cbind(i,j)] <- M[cbind(j,i)] <- phi.data[,3] M <- as.dist(M) w <- na.exclude(cbind(epi$Worms, epi$Pop)) wdist <- dist(as.vector(tapply(w[,1], w[,2], median))) mantel.rtest(M, wdist, nrepet = 9999) par(pty="s", pch=16) plot(wdist, M, xlab="Difference in parasite load", ylab="Differentiation") # geographic distance geodis <- read.table("geo_distances.txt", header=F) geodis <- as.dist(geodis) mantel.rtest(M, geodis, nrepet=9999) plot(geodis, M, xlab="Geographic distance", ylab="Differentiation") ##### Analysis of methylation levels ##### # extract methylation levels for each individual (msap gives population-specific levels only) states <- msap.out$patterns #$patterns is a list of locus states; need to unravel by population, then by individual m <- matrix("", ncol=length(states[[1]])) for(pop in 1:length(states)) { states[[pop]] <- states[[pop]][order(as.numeric(row.names(states[[pop]]))),] m <- rbind(m, as.matrix(states[[pop]])) } inn <- apply(m, 1, function(x){length(x[x=="i"])/length(x)}) hem <- apply(m, 1, function(x){length(x[x=="h"])/length(x)}) unm <- apply(m, 1, function(x){length(x[x=="u"])/length(x)}) m.out <- cbind(msap.out$transformed.MSL$group, as.character(msap.out$transformed.MSL$ind), inn[-1], hem[-1], unm[-1]) worms <- epi[,c("Sample", "Worms")] wrm <- data.frame(m.out[order(m.out[,2]),],worms[order(worms$Sample),2]) names(wrm) <- c("Pop", "Ind", "inn", "hem", "unm", "Worms") wrm[,3] <- as.numeric(as.character(wrm[,3])) wrm[,4] <- as.numeric(as.character(wrm[,4])) wrm[,5] <- as.numeric(as.character(wrm[,5])) kruskal.test(wrm[,"inn"]~wrm[,"Pop"]) # significant kruskal.test(wrm[,"hem"]~wrm[,"Pop"]) # significant kruskal.test(wrm[,"unm"]~wrm[,"Pop"]) # significant cor.test(wrm[,"inn"],wrm[,"Worms"], method="spearman") cor.test(wrm[,"hem"],wrm[,"Worms"], method="spearman") # significant cor.test(wrm[,"unm"],wrm[,"Worms"], method="spearman") # sex differences? wrm.m <- wrm[which(wrm[,2] %in% epi[which(epi$PCRsex=="M"),1]),] # MALES ONLY! wrm.f <- wrm[which(wrm[,2] %in% epi[which(epi$PCRsex=="F"),1]),] # FEMALES ONLY! wilcox.test(wrm.f[,"inn"], wrm.m[,"inn"]) wilcox.test(wrm.f[,"hem"], wrm.m[,"hem"]) wilcox.test(wrm.f[,"unm"], wrm.m[,"unm"]) # plots for visualisation par(mfcol=c(3,1), mar=c(0,5,0,0), oma=c(6,0,1,1), pch=21, cex.axis=1, cex.lab=1, las=1, mgp=c(4,1,0), fg="darkgrey") plot(wrm[,"inn"]~as.numeric(wrm[,"Worms"]), ylab="inner C methylated", yaxt="n", xaxt="n", type="n", ylim=c(0.1, 0.6)) axis(side=2, at=c(0.1, 0.2, 0.3, 0.4, 0.5)) points(wrm[,"inn"]~as.numeric(wrm[,"Worms"]), col="black", bg="beige") abline(lm(wrm[,"inn"]~wrm[,"Worms"]), col="black", lty=2, lwd=1) plot(wrm[,"hem"]~as.numeric(wrm[,"Worms"]), ylab="hemimethylated", yaxt="n", xaxt="n", type="n", ylim=c(0.1, 0.6)) axis(side=2, at=c(0.1, 0.2, 0.3, 0.4, 0.5)) points(wrm[,"hem"]~as.numeric(wrm[,"Worms"]), col="black", bg="goldenrod1") abline(lm(wrm[,"hem"]~wrm[,"Worms"]), col="black", lty=2, lwd=1) plot(wrm[,"unm"]~as.numeric(wrm[,"Worms"]), ylab="unmethylated", yaxt="n", type="n", ylim=c(0.1, 0.6)) axis(side=2, at=c(0.1, 0.2, 0.3, 0.4, 0.5)) points(wrm[,"unm"]~as.numeric(wrm[,"Worms"]), col="black", bg="orangered4") abline(lm(wrm[,"unm"]~wrm[,"Worms"]), col="black", lty=2, lwd=1) mtext("Parasite load", side=1, outer = TRUE, line=4, cex=0.8, col="black") # population medians and bar chart hem.median <- tapply(wrm[,"inn"], wrm[,1], median) hem.median <- hem.median[order(as.numeric(names(hem.median)))] inn.median <- tapply(wrm[,"hem"], wrm[,1], median) inn.median <- inn.median[order(as.numeric(names(inn.median)))] unm.median <- tapply(wrm[,"unm"], wrm[,1], median) unm.median <- unm.median[order(as.numeric(names(unm.median)))] par(pch=16, cex=1, cex.axis=0.5, cex.lab=1, col="black", xpd=T, las=1, mar=c(4,4,6,0)) b <- barplot(rbind(unm.median, hem.median, inn.median), xlab="Population", ylab="Frequency", col=c("red4", "darkgoldenrod1", "bisque"), border=NA, space=0.15) lines(x=b, y=hem.median+inn.median, col="black") points(x=b, y=hem.median+inn.median, col="black", cex=1.5) legend("topleft", inset=c(0,-0.4), c("fully methylated ","hemimethylated","unmethylated","total methylation"), col="black", cex=0.7, pt.bg=c("bisque", "darkgoldenrod1", "red4", NA), pch=c(22,22,22,16), lty=c(0,0,0,1), pt.cex=1.5, ncol=2) ##### Association tests between epiloci and parasite load ##### ### extract MLxxx names of non-methylated loci (NML) ### NML loci are unusable for association tests, so remove from epi dataset mids <- read.table("locus_ids.txt", header=T, as.is=T) l.msl <- names(as.matrix(msap.out$patterns)[[1]]) nmls <- mids[-which(mids[,2] %in% l.msl),c(1,3)] # MLXXX names of non-methylated loci epi <- na.exclude(epi) epi <- epi[,-which(names(epi) %in% nmls[,1])] # remove loci with fewer than 10 presence or absence observations (~0.05 minor allele frequency) obs <- apply(epi[,8:ncol(epi)], 2, sum) epi <- epi[,-which(names(epi) %in% names(obs[obs<10 | obs>(nrow(epi)-10)]))] #### some exploratory analysis # worm quantiles med <- as.vector(tapply(epi$Worms, epi$Pop, median)) qtl <- tapply(epi$Worms, epi$Pop, function(x) quantile(x)[c(2,4)]) q25 <- as.vector(unlist(qtl)[seq(1,42,2)]) q75 <- as.vector(unlist(qtl)[seq(2,42,2)]) write.table(round(q75), "clipboard", row.names=F) write.table(as.vector(unlist(qtl)[seq(2,42,2)]), "clipboard", row.names=F) # correlations among variables par(cex.lab=1.5) boxplot(Worms~PCRsex, data=epi) # no difference among sexes boxplot(Worms~Age, data=epi) # young birds have fewer parasites tapply(epi$Worms, epi$Age, median) # 506 vs 121 medians wilcox.test(Worms~Age, data=epi) # p=0.01 boxplot(Worms~Weight, data=epi) # substantial variability plot(Combarea~Age, data=epi) # young birds have substantially lower comb area plot(Combarea~PCRsex, data=epi) # females have substantially lower comb area plot(Combarea~Weight, data=epi) # strong positive correlation (F1,228, r-sq=0.364, p<0.001) summary(lm(Combarea~Weight+Age+PCRsex+Worms, data=epi)) ### Association tests test.epiloci <- function(model="GEE") { results <- matrix(0, nrow=ncol(epi)-10, ncol=4) for(locus in 11:ncol(epi)) { if(model=="GEE") fit <- geeglm(epi[,locus]~Worms+Combarea, data=epi, id=Pop, family=binomial(link="logit"), corstr="exchangeable", std.err="san.se") if(model=="SAM") fit <- glm(epi[,locus]~Worms, data=epi, family=binomial(link="logit")) results[locus-10,1] <- names(epi)[locus] slope <- coef(summary(fit))["Worms","Estimate"] pval <- coef(summary(fit))["Worms",4] if(pval==0) pval <- 1e-16 results[locus-10,2] <- round(slope, digits=10) results[locus-10,3] <- pval } qs <- fdrtool(as.numeric(results[,3]), statistic="pvalue") results[,4] <- qs$qval # significant hits and highest/lowest coefficients sigs <- results[as.numeric(results[,3])<=0.05,] tops <- rbind(head(results[order(as.numeric(results[,2])),]), tail(results[order(as.numeric(results[,2])),])) return(list(data.frame(results, stringsAsFactors=F), data.frame(sigs), data.frame(tops))) } ### Volcano plot volcano <- function(d, t) { plot(-log10(as.numeric(d[[1]][,3]))~as.numeric(d[[1]][,2]), xlab=expression(Coefficient~(beta[1])), ylab=expression(-log[10]~p), type="n", cex.axis=0.5, cex.lab=0.8, main=t, ylim=c(0,3), xaxt="n", yaxt="n") axis(1, at=c(-0.001, -0.0005, 0, 0.0005), labels=c("-0.001", "-0.0005", "0", "0.0005"), lwd=0.5) axis(2, at=c(0,0.5,1,1.5,2,2.5,3), labels=c(0,0.5,1,1.5,2,2.5,3), lwd=0.5) abline(h=1.3,col="red") abline(v=0, col="gray", lty=2) points(-log10(as.numeric(d[[1]][,3]))~as.numeric(d[[1]][,2]), pch=19, cex=0.7, xlab="Coefficient", ylab=expression(-log[10]~p), col=as.character(cut(as.numeric(d[[1]][,4]), breaks=c(0, 0.105, 1), labels=c("red", "black")))) } hits.gee <- test.epiloci("GEE") hits.sam <- test.epiloci("SAM") par(cex.main=0.8, cex.axis=0.5, cex.lab=0.8, pty="s", las=1, mfrow=c(1,2), mar=c(1,3,3,1), mgp=c(2,0.5,0), tck=-0.02, lwd=0.5) volcano(hits.gee, "GEE") volcano(hits.sam, "SAM") ##### Custom PCoA plot ##### par(mfrow=c(2,1), bty="n", mar=c(0,0,0,0), oma=c(4,0,4,0), lwd=0.6, pty="s", cex=0.6, cex.axis=0.4, cex.lab=0.6, las=1) plot(0, 0, type = "n", xlab = paste("C1 (", pcoa.msl[[1]], "%)"), ylab = paste("MSL C2 (", pcoa.msl[[2]], "%)"), xaxt="n", yaxt="n", lwd=0.5, xlim = c(-0.3, 0.3), ylim = c(-0.2, 0.3), frame = TRUE) axis(side=3, at=c(-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3), lwd=0.5) axis(side=2, at=c(-0.2, -0.1, 0, 0.1, 0.2, 0.3), lwd=0.5) #bgcolors <- rainbow(21 + 1)[-2] bgcolors <- rep(NA, times=21) colfunc <- colorRampPalette(c("red", "gold")) bgcolors[c(3,4,9,10,15,18,19)] <- colfunc(7) colfunc <- colorRampPalette(c("green4", "lightgray")) bgcolors[c(1,2,5:8,11:14,16,17,20,21)] <- colfunc(14) s.class(pcoa.msl[[4]], pcoa.msl[[5]], cpoint = 1, col = bgcolors, add.plot = TRUE) mtext(paste("MSL C1 (", pcoa.msl[[1]], "%)"), side=3, outer = TRUE, line=3, cex=0.4, col="black") plot(0, 0, type = "n", xlab = paste("C1 (", pcoa.nml[[1]], "%)"), ylab = paste("NML C2 (", pcoa.nml[[2]], "%)"), xaxt="n", yaxt="n", lwd=0.5, xlim = c(-0.3, 0.3), ylim = c(-0.2, 0.4), frame = TRUE) axis(side=1, at=c(-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3), lwd=0.5) axis(side=2, at=c(-0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.4), lwd=0.5) #bgcolors <- rainbow(21 + 1)[-2] colfunc <- colorRampPalette(c("black", "lightgray")) bgcolors <- colfunc(21) s.class(pcoa.nml[[4]], pcoa.nml[[5]], cpoint = 1, col = bgcolors, add.plot = TRUE) mtext(paste("NML C1 (", pcoa.nml[[1]], "%)"), side=1, outer = TRUE, line=3, cex=0.4, col="black")