## Functions needed for processing of TASSEL results in R ## Specific to scripts run on MSI server (centOS). May not work on mthap (MacOS). ## Created by: John Stanton-Geddes, 12 August 2011 ## Modified: 7 August 2012 require("Cairo") ############################################################################## ## DropLevels ## drop unused factor levels from all factors in a data.frame ## Author: Kevin Wright, idea from Brian Ripley, posted on StackOverflow ############################################################################# DropLevels <- function(dat){ dat[] <- lapply(dat, function(x) x[,drop=TRUE]) return(dat) } ####################################################################### ## CompressionLevel ## Function to report the compression level used for P3D/EMMAX mixed-linear model fitting by tassel ## Compression level with the lowest value of -2LnLk is used for testing markers ## NOTE - compression level is the same for all chromosomes, so only need to report once ####################################################################### CompressionLevel = function(results_list, trait, ch, div="-") { # get filename for stats output from tassel compressionpattern <- paste(trait, ".+chr", ch, div, ".+compression", sep = "") # read in file compressionfile <- results_list[grep(compressionpattern, results_list)] cf <- read.table(compressionfile, sep = "\t", skip = 1, col.names = c("Trait", "Num_groups", "Compression", "minus2LnLk", "Var_genetic", "Var_error")) s.cf <- cf[order(cf$minus2LnLk),] # report compression level compression_level = s.cf[1,] return(compression_level) } # end function ####################################################################### ## ReadTasselStats ## Function to read in tassel output files for each chromosome ## assign chromosome name to "Locus" column ## assign "Marker" to numeric ####################################################################### ReadTasselStats = function(results_list, trait, ch, div="-") { # get filename for stats output from tassel statspattern <- paste(trait, ".+chr", ch, div, ".+stats", sep = "") statsname <- paste("chr", ch, "_stats", sep = "") # read in file and save dataframe in Global environment statsout <- results_list[grep(statspattern, results_list)] dfout <- read.table(statsout, sep = "\t", header = TRUE, as.is = c(F,T,T,T,T,T,T,T,T,T,T,T)) dfout[,"Locus"] <- ch dfout$Marker <- as.numeric(dfout$Marker) assign(statsname, dfout, envir = as.environment(.GlobalEnv)) } # end function ####################################################################### ## ReadTasselEffects ## Function to read in tassel output files for each chromosome ## assign chromosome name to "Locus" column ## assign "Marker" to numeric ####################################################################### ReadTasselEffects = function(results_list, trait, ch, div="-") { # get filename for stats output from tassel statspattern <- paste(trait, ".+chr", ch, div, ".+effects", sep = "") statsname <- paste("chr", ch, "_effects", sep = "") # read in file and save dataframe in Global environment statsout <- results_list[grep(statspattern, results_list)] dfout <- read.table(statsout, sep = "\t", header = TRUE) dfout[,"Locus"] <- ch #dfout$Marker <- as.numeric(dfout$Marker) assign(statsname, dfout, envir = as.environment(.GlobalEnv)) } # end function ############################################################################### ## QQ plot - specify file type as tiff or png (default to png) ## code modified from Stephen Turner, http://GettingGeneticsDone.blogspot.com/ ############################################################################### QQ = function(pvector, filename, ft = "png", ...) { if (!is.numeric(pvector)) stop("D'oh! p value vector is not numeric.") pvector <- pvector[!is.na(pvector) & pvector<1 & pvector>0] o = -log10(sort(pvector,decreasing=F)) #e = -log10( 1:length(o)/length(o) ) e = -log10(ppoints(length(pvector))) # set plot type if(ft=="png") png(file=paste(filename,ft,sep="."), height=1000, width=1000, res=200) else #NO LONGER WORKS ON MSI SERVER tiff(file = paste(filename,ft, sep="."), units = "mm", height = 90, width = 90, res = 1200, compression = "lzw", pointsize = 8) Cairo(file=paste(filename,ft,sep="."), type="tiff", pointsize=12, dpi=1000, bg="white") plot(e, o, pch=19, cex=1, xlab=expression(Expected~~-log[10](italic(p))), ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(o)), ...) abline(0,1,col="red") dev.off() } ############################################################################### ## SNPcontext ## Function to get context, gene annotation and expression data for candidate SNPs from TASSEL analysis ############################################################################### SNPcontext <- function(candidate_SNPs, gene.context.dir=gene.context.dir, gi=gene.info, cTm=chrT.posmap) { # sort SNPs by locus and marker sort.SNPs <- candidate_SNPs[order(candidate_SNPs$Locus, candidate_SNPs$Marker), ] # get context information. for SNPs on chr1-8 and U, from gene_context files in gene.context.dir # for SNPs on chrT, use chrT.posmap to get gene ID and start/stop # make new matrix to collect SNP context information SNPs.context <- matrix(nrow=0,ncol=8) colnames(SNPs.context) <- c("Marker", "nearest_gene", "gene_context", "dist_nearest_gene", "ref_allele", "ref_amino_acid", "avg_quality", "max_quality") # by chromosome, get gene ID and context information for(i in unique(sort.SNPs$Locus)) { marker.df <- sort.SNPs[which(sort.SNPs$Locus==i), ] #print(marker.df[1,]) # un-comment to check that script is working correctly # if chrT, use chrT position map to get gene ID. if(i=="T") { for(n in 1:nrow(marker.df)) { mT <- marker.df[n,"Marker"] SNPs.context <- rbind(SNPs.context,c(mT, cTm[Start<=mT & End>=mT,]$ID, "TC", "0", ".", ".",".",".")) } # else, use gene context file for that chromosome } else for(m in 1:nrow(marker.df)) { folder <- paste(gene.context.dir, "Mt3.5_var288_chr", marker.df[m,"Locus"], "_context_20120606.txt", sep="") ma <- marker.df[m,"Marker"] gout <- system(paste("grep ", ma, " -w ", folder, sep=""), wait=T, intern=T) SNPs.context <- rbind(SNPs.context, noquote(unlist(strsplit(gout,"\t")))) } # end loop for each Marker on a chromosome } # end loop for each chromosome # add trait, rename gene ID column, add SNP rank (P-value) and effect size (markerR2) information and change to data.table for efficient sort and merge SNPs.context <- data.frame(SNPs.context) colnames(SNPs.context)[2] <- "ID" SNPs.context$ID <- as.character(SNPs.context$ID) SNPs.context$Trait <- sort.SNPs$Trait SNPs.context$p <- sort.SNPs$p SNPs.context$markerR2 <- sort.SNPs$markerR2 SNPs.context <- data.table(SNPs.context) setkey(SNPs.context, ID) #head(SNPs.context) #str(SNPs.context) # get gene annotation for candidate SNPs by searching gene.info file by 'ID' # NOTE - for genes with alternative splicing, more than one row will match # make new matrix to collect gene annotation information SNPs.genes <- matrix(nrow=0, ncol=12) # for each candidate SNP, extract gene information by matching gene ID for(g in 1:nrow(SNPs.context)) { ga <- gi[as.character(SNPs.context[g,ID])] #print(ga) SNPs.genes <- rbind(SNPs.genes, ga) } # end loop across candidate SNPs # subset genes dataframe to only unique genes, include all alternate splices for each gene SNPs.genes <- as.data.table(SNPs.genes) setkey(SNPs.genes, ID, splice) unique.genes <- SNPs.genes[!duplicated(SNPs.genes),] #dim(unique.genes) # merge with context data SNPinfo <- SNPs.context[unique.genes] #dim(SNPinfo) # Note - may be more than 200 because of alternative splicing for some genes #head(SNPinfo) # reorganize to relevant columns and return SNPinfo.out <- subset(SNPinfo, ,select=c('Trait','chr','Marker','start','stop','p','markerR2','ID','splice','gene_context','dist_nearest_gene','annotation','nodule','blade','flower','root','pod','bud','ref_allele','ref_amino_acid','avg_quality','max_quality')) return(SNPinfo.out) } # end function ############################################################################### ## Manhattan ## code modified from Stephen Turner, http://GettingGeneticsDone.blogspot.com/ ## changes: ## 1) chromosomes are separated by dashed vertical lines ## 2) output format can be specified to be png or tiff (higher resolution but larger file size) ############################################################################### Manhattan = function(dataframe, outfilename, ymax="max", cex.x.axis=1, suggestiveline=0, genomewideline=0, ft="png",...) { d=dataframe if (!("Locus" %in% names(d) & "Marker" %in% names(d) & "p" %in% names(d))) stop("Make sure your data frame contains columns Locus, Marker, and p") d=subset(na.omit(d[order(d$Locus, d$Marker), ]), (p>0 & p<=1)) # remove na's, sort, and keep only 0
0.10") #legend("topright", legend = legend.labels, col = c("red", "green", "blue", "black"), pch = 20) # Add vertical lines separating chromosomes for (i in 1:numchroms) { abline(v= vlines[i], lty = 3) } } # Add horizontal lines to denote significance levels if (suggestiveline!=0) abline(h=suggestiveline, col="blue") if (genomewideline!=0) abline(h=genomewideline, col="red", lty = 2) dev.off() } ####################################################################### ## ReadTasselBLUEs ## Function to read in tassel BLUEs output files from GLM ## assign chromosome name to "Locus" column ## assign "Marker" to numeric ####################################################################### ReadTasselBLUEs = function(results_list, trait, ch) { # get filename for BLUEs output from tassel BLUEspattern <- paste(trait, ".*chr", ch, ".*BLUEs", sep = "") BLUEsname <- paste("chr", ch, "_BLUEs", sep = "") # read in file and save dataframe in Global environment BLUEsout <- results_list[grep(BLUEspattern, results_list)] dfout <- read.table(BLUEsout, sep = "\t", header = TRUE, as.is = c(F,T,T,T,T,T,T,T,T,T,T,T)) dfout[,"Locus"] <- ch #dfout$Marker <- as.numeric(dfout$Marker) assign(BLUEsname, dfout, envir = as.environment(.GlobalEnv)) } # end function ####################################################################### ## ReadTasselFtest ## Function to read in tassel Ftest output files from GLM ## assign chromosome name to "Locus" column ## assign "Marker" to numeric ####################################################################### ReadTasselFtest = function(results_list, trait, ch) { # get filename for ftest output from tassel ftestpattern <- paste(trait, ".*chr", ch, ".*ftest", sep = "") ftestname <- paste("chr", ch, "_ftest", sep = "") # read in file and save dataframe in Global environment ftestout <- results_list[grep(ftestpattern, results_list)] dfout <- read.table(ftestout, sep = "\t", header = TRUE) dfout[,"Locus"] <- ch dfout$Marker <- as.numeric(dfout$Marker) assign(ftestname, dfout, envir = as.environment(.GlobalEnv)) } # end function