#!/usr/bin/env Rscript --vanilla # Author: J.A. Ballesteros # Licence GPL # This simple script uses fuctions from ape, reshape, ggplot2 and argapser packages, and must be appropiatedly credited. dependencies<-c("ape", "reshape", "argparser", "ggplot2") missing <-dependencies[!(dependencies %in% installed.packages()[,"Package"])] if(length(missing)) install.packages(missing, repos = "http://cran.us.r-project.org") # This function was minimally modified from ape::dist.aa to ignore gaps, stop codons and missing entries (-,*,?) pdistAA <-function (x, pairwise.deletion = FALSE, scaled = FALSE) { n <- nrow(x) d <- numeric(n * (n - 1)/2) X <- charToRaw("X") S <-charToRaw("*") G <-charToRaw("-") Q <-charToRaw("?") miss <-c(X,S,G,Q) k <- 0L if (!pairwise.deletion) { del <- apply(x, 2, function(y) any(y == X)) if (any(del)) x <- x[, !del] for (i in 1:(n - 1)) { for (j in (i + 1):n) { k <- k + 1L d[k] <- sum(x[i, ] != x[j, ]) } } if (scaled) d <- d/ncol(x) } else { for (i in 1:(n - 1)) { a <- x[i, ] for (j in (i + 1):n) { b <- x[j, ] del <- a %in% miss | b %in% miss p <- length(b <- b[!del]) tmp <- sum(a[!del] != b) k <- k + 1L d[k] <- if (scaled) tmp/p else tmp } } } attr(d, "Size") <- n attr(d, "Labels") <- rownames(x) attr(d, "Diag") <- attr(d, "Upper") <- FALSE attr(d, "call") <- match.call() class(d) <- "dist" d } library("ape") library("reshape") library("argparser") library("ggplot2") #Parse arguments p <- arg_parser("R script to calculate stistics and plot to evaluate saturation of aminoacid alignmenst using p-distances (uncorrected) as a function of patristic (model corrected) distances.") p <- add_argument(p, "--a", help = "Alignment if FASTA format") p <- add_argument(p, "--t", help = "Tree in NEWICK format") p <-add_argument(p, "--p", help = "Write PDF plot with linear regression", flag=TRUE) argv = parse_args(p) alnfile<-argv$a treefile<-argv$t M.matrix <- read.FASTA(alnfile, type="AA") T.newick <- read.tree(treefile) #Validate taxon names in Matrix and tree mismatch <- setdiff(T.newick$tip.label, names(M.matrix)) if (length(mismatch)){ print("The taxon names in the alignment and the tree do not match") cat(mismatch) stop() } # Compute p_distances # dist.aa calculates uncorrected p distance, the paiwise deletion option removes comparissons # involving missing values and the scale option provides values relative to the effective length. pdist <- pdistAA(as.matrix(M.matrix), pairwise.deletion=TRUE, scaled=TRUE) #Compute cophenetic distances tdist <- cophenetic(T.newick) #melt and reshape data m.pdist <-melt(as.matrix(pdist)) m.tdist <-melt(tdist) colnames(m.pdist)<- c("TaxA", "TaxB", "pdist") colnames(m.tdist)<- c("TaxA", "TaxB", "tdist") dm<-merge(m.pdist, m.tdist) dm<-dm[dm$TaxA != dm$TaxB,] dm<-(dm[!duplicated(t(apply(dm,1,sort))),]) #print(nrow(dm)) fname<- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(alnfile)) oname<-paste(fname, "_satdat.csv", sep="") write.csv(dm, file = oname) #Calculate regression line. Note that the regression Y intercept is forced through the origin. reg<-lm(dm$pdist ~ 0 + dm$tdist) reg.slope <- reg$coefficients reg.rsquare <- summary(reg)$r.squared reg.adjrsquare <- summary(reg)$adj.r.squared cat(paste("file:", alnfile, "slope:", reg.slope, "r.square:", reg.rsquare, "adj.rsquare", reg.adjrsquare, "\n", sep=" ")) #plot if(argv$p){ plab<-substitute(italic(y) == m %.% italic(x)*","~~italic(R)^2~"="~r2, list(m = format(unname(reg.slope), digits=4), r2 = format(unname(reg.rsquare), digits =4) )) eq<-as.expression(plab) plot<-ggplot(dm, aes(x=dm$tdist, y=dm$pdist)) + geom_point(color="deepskyblue1", alpha=0.5) + geom_abline(slope=reg$coefficients) + labs(x="patristic distance (ape.cophenetic)", y="pdist (ape.dist.aa)", title=paste("Saturation plot of ", alnfile, "compared to", treefile ), subtitle=eq) + theme_linedraw() pdf(paste(fname,"_satplot.pdf", sep="")) plot }