######################################## Puttick 2016 Biology Letters ########################################## # simDiscrete function to generate disceret characters with a variable number of states on a phylogeny. The number # of states can be specified alongside the rate matrix, and rate of evolution. The default 'rate.matrix' is an # equal-rates model, but a user-defined model can be supplied. Additionally, the 'root.state' is set to random but # Some of this code was guided by Nicolas Salamin's course on Markov processes - http://www2.unil.ch/phylo/teaching/Markov_day2.pdf simDiscrete<-function(phylo, nChrs=1,states=2, rates.matrix=NULL, rate=1, root.state="random"){ require(ape) require(Matrix) if(is.null(rates.matrix)==TRUE){ R.matrix <- matrix(1, states, states) diag(R.matrix) <- 0 }else { R.matrix <- rates.matrix } if(is(phylo) == "multiPhylo" && length(phylo)!=nChrs) stop("unequal number of trees and characters") if(is(phylo) == "multiPhylo" && sd(unlist(lapply(phylo, Ntip)))!=0) stop("different N of tips in trees - do individually") if(is(phylo) == "multiPhylo") trees<-phylo if(states < 2) stop("each character must have at least 2 states") if(states > Ntip(phylo)) stop("there can't be more states than species") characterStates <- c(0:(states-1)) #if(freq == "equal") state.freq <- c(rep(1/dim(R.matrix)[1], dim(R.matrix)[1])) #can be modified to HKY85 for multiple traits etc state.freq <- c(rep(1/dim(R.matrix)[1], dim(R.matrix)[1])) state.freq <- state.freq f <- diag(state.freq) Q.matrix <- (R.matrix %*% f)-diag(apply(R.matrix %*%f ,1,sum)) tipValues <- matrix(NA, nrow=Ntip(phylo), ncol=nChrs) nodeValues <- matrix(NA, nrow=Nnode(phylo), ncol=nChrs) for(y in 1:nChrs) { if(is(phylo) == "multiPhylo") phylo <- trees[[y]] IL <- phylo$edge[,1] ILuni <- unique(phylo$edge[,1])[-1] EL <- phylo$edge[,2] fossils <- which(tipTimes(phylo)>10e-6) fc <- phylo$tip.label[fossils] spcs <- which(EL <= Ntip(phylo)) if(length(fossils) > 0) spcs <- which(EL <= Ntip(phylo))[-fossils] edgeL <- phylo$edge.length invariant <- FALSE if(rate == "uniform") rate <- runif(1, 0.5, 2) if(rate == "gamma") rate <- rgamma(1, shape = 1, rate = 1) if(root.state == "random") root.state <- sample(characterStates, 1) while(invariant == "FALSE") { store <- matrix(NA, nrow=length(EL), ncol=2) newNode <- which(IL == IL[1]) store[newNode] <- root.state for(i in 1:2) { ancestral <- store[newNode][i]+1 prob.matrix <- expm(Q.matrix*edgeL[newNode][i]*rate) chrs <- characterStates[c(rmultinom(1, 1, prob.matrix[ancestral,])) == 1] store[newNode[i],2] <- chrs store[which(EL[newNode][i] == IL),1] <- chrs } for(k in 1:length(ILuni)) { oldNode <- which(IL == ILuni[k]) for(i in 1:2) { newNode <- which(IL == EL[oldNode][i]) ancestral <- store[oldNode][i]+1 prob.matrix <- expm(Q.matrix*edgeL[oldNode][i]*rate) chrs <- characterStates[c(rmultinom(1, 1, prob.matrix[ancestral,]))==1] store[oldNode[i],2] <- chrs store[newNode,1] <- chrs } } if(length(table(store[spcs,2])) == states) invariant <- "TRUE" } rad <- match(unique(phylo$edge[,1]), phylo$edge[,1]) nodeValues[,y] <- matrix(store[rad,1]) tipValues[,y] <- store[which(phylo$edge[,2] <= Ntip(phylo)),2] all <- c(nodeValues[,y], tipValues[,y]) } rownames(nodeValues) <- unique(phylo$edge[,1]); rownames(tipValues) <- phylo$tip.label return(list(tipValues=tipValues, nodeValues=nodeValues)) } ####Miscellaneous functions updated from APE tipTimes<-function(tree){ n <- Ntip(tree) x <- dist.nodes(tree)[n + 1, ][1:n] return(max(x)-x) } nodalTimes<-function(phy){ n <- Ntip(phy) m <- phy$Nnode ROOT <- n + 1L event <- time.event <- numeric(n + m) time.event[ROOT] <- 0 phy <- reorder(phy) for (i in 1:nrow(phy$edge)) time.event[phy$edge[i, 2]] <- time.event[phy$edge[i,1]] + phy$edge.length[i] present <- max(time.event) these <- which(phy$edge[,2] >= Ntip(phy)) time.event[these] e1 <- phy$edge[, 1] e2 <- phy$edge[, 2] EL <- phy$edge.length N <- length(e1) xx <- numeric(phy$Nnode) interns <- which(e2 > n) for (i in interns) xx[e2[i] - n] <- xx[e1[i] - n] + EL[i] depth <- present xx <- depth - xx names(xx) <- if (is.null(phy$node.label)) (n + 1):(n + phy$Nnode) else phy$node.label return(xx) } ############################ Simulate combined fossil and extant phylogenies ################################### require(TreeSim) n<-1000 lambda <- 1 mu <- 0.2 frac <-1 numbsim<-1 num<-c(500, 100, 100) #for(j in num){ # trees<-sim.bd.taxa(j, numbsim=1000,lambda,mu,frac,complete=T,stochsampling=F) # class(trees) <- "multiPhylo" #} trees<-sim.bd.taxa(100, numbsim=1000,lambda,mu,frac,complete=T,stochsampling=F) ############################ Simulate combined fossil and extant phylogenies ################################### require(ape) require(parallel) tree <- trees # this is for 20 extant-taxa trees. Can be easily modified for the larger # trees generated above by changing the '20' to '50' etc. #######Unequal rates and equal probability of transition between states twoStates <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=2, rates.matrix=NULL, rate="uniform", root.state="random")) threeStates <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], Chrs=1, states=3, rates.matrix=NULL, rate="uniform", root.state="random")) fourStates <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=4, rates.matrix=NULL, rate="uniform", root.state="random")) fiveStates <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=5, rates.matrix=NULL, rate="uniform", root.state="random")) #######Equal rates and trend-like transition between states R.matrix <- matrix(1, 2, 2) ; diag(R.matrix) <- 0 R.matrix[1,2] <- 2 twoStates_trend <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=2, rates.matrix=R.matrix, rate=1, root.state=0)) R.matrix <- matrix(NA, 3, 3) ; R.matrix[lower.tri(R.matrix)] <- c(1,1,2) R.matrix[upper.tri(R.matrix)] <- c(2,1,2); diag(R.matrix) <- 0 threeStates_trend <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=3, rates.matrix=R.matrix, rate=1, root.state=0)) R.matrix<-matrix(NA, 4, 4) ; R.matrix[lower.tri(R.matrix)] <- c(1, 1, 1, 2, 2, 3) R.matrix[upper.tri(R.matrix)] <- c(3,2,3,1, 2, 3) ; diag(R.matrix)<-0 fourStates_trend <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=4, rates.matrix=R.matrix, rate=1, root.state=0)) R.matrix <- matrix(NA, 5, 5) ; R.matrix[lower.tri(R.matrix)] <- c(1, 1, 1, 1, 2, 2, 2, 3, 3, 4) R.matrix[upper.tri(R.matrix)] <- c(4,3,4, 2,3, 4, 1,2,3,4); diag(R.matrix) <- 0 fiveStates_trend <- mclapply(1:1000, mc.cores=2, function(x) simDiscrete(tree[[x]], nChrs=1, states=5, rates.matrix=R.matrix, rate=1, root.state=0)) ############################ Get ACE results from APE on comparable nodes ###################################### ###Example code to take 25% of fossil tips and change them to the incorrect state traitSim <- twoStates toChoose <- as.numeric(unique(traitSim[[1]]$tipValues)) incorrect_traitSim <- list() incorrect <- 0.25 for(i in 1:1000) { fossil <- which(tipTimes(tree[[i]]) > 10e-8 ) if(length(fossil) != 0) { fc<- tree[[i]]$tip.label[fossil] fos <-round(length(fc) * incorrect) if(fos==0) fos<-1 gr<-sample(fc, fos) if(fos==1) gr <- fc[1] ruin <- match(gr, fc) pruD <- data.frame(traitSim[[i]]$tipValues) for(q in 1:length(pruD[fc[ruin],])) { go <- match(pruD[fc[ruin],][q], toChoose) pruD[fc[ruin],][q] <- sample(toChoose[-go], 1) } } else { pruD<-data.frame(traitSim[[i]]$tipValues) } incorrect_traitSim[[i]] <- pruD } ancestral_correct <- ace(tree[[1]], x=traitSim[[1]]$tipValues[,1], "discrete", marginal = FALSE)$lik.anc ancestral_incorrect_25% <- ace(tree[[1]], x=incorrect_traitSim[[1]][,1], "discrete", marginal = FALSE)$lik.anc fossil <- which(tipTimes(tree[[1]]) > 10e-8) traitNoFossil <- traitSim[[1]]$tipValues[-fossil, 1] ancestral_correct_extant <- ace(drop.fossil(tree[[1]]), x=traitNoFossil, "discrete", marginal = FALSE)$lik.anc # these objects ancestral_correct, ancestral_incorrect_25%, and ancestral_correct_extant contain the ancestral state estimates for all nodes