############################ ####### functions from PHANGORN prepareDataSankoffNew <- function(data){ contrast = attr(data, "contrast") contrast[contrast == 0] = 1.0e+06 contrast[contrast == 1] <- 0.0 attr(data, "contrast") <- contrast data } sankoffNew <- function (tree, data, cost = NULL, site = 'pscore') { if (!inherits(data,"phyDat")) stop("data must be of class phyDat") data <- prepareDataSankoffNew(data) weight <- attr(data, "weight") levels <- attr(data, "levels") l = length(levels) if (is.null(cost)) { cost <- matrix(1, l, l) cost <- cost - diag(l) } l <- length(data) nr <- attr(data, "nr") if(inherits(tree,"phylo")) return(fit.sankoffNew(tree, data, cost, returnData =site)) if(inherits(tree,"multiPhylo")){ if(is.null(tree$TipLabel))tree = unclass(tree) return(sapply(tree, fit.sankoffNew, data, cost, site)) } } fit.sankoffNew <- function (tree, data, cost, returnData = c("pscore", "site", "data")) { if (is.null(attr(tree, "order")) || attr(tree, "order") == "cladewise") tree <- reorder(tree, "postorder") returnData <- match.arg(returnData) node <- tree$edge[, 1] edge <- tree$edge[, 2] weight = attr(data, "weight") nr = p = attr(data, "nr") contr = attr(data, "contrast") q = length(tree$tip.label) nc = l = attr(data, "nc") m = length(edge) + 1 dat = vector(mode = "list", length = m) dat[1:q] = data[tree$tip.label] node = as.integer(node - 1) edge = as.integer(edge - 1) nTips = as.integer(length(tree$tip.label)) mNodes = as.integer(max(node) + 1) res <- .Call("sankoff3B", dat, as.numeric(cost), as.integer(nr),as.integer(nc), node, edge, mNodes, nTips, as.double(contr), as.integer(nrow(contr)), PACKAGE="phangorn") root <- getRoot(tree) erg <- .Call("C_rowMin", res[[root]], as.integer(nr), as.integer(nc), PACKAGE = "phangorn") if (returnData=='site') return(erg) pscore <- sum(weight * erg) result = pscore if (returnData=="data"){ result <- list(pscore = pscore, dat = res) } result } ########################################## ######### nodeDes function nodeDes <- function (phylo, node, giveTips=TRUE) { tips <- Ntip(phylo) nextNodes <- node stopPoint <- FALSE storeNodes <- nextNodes while (stopPoint == FALSE) { startPoints <- unlist(sapply(nextNodes, function(x) which(phylo$edge[, 1] == x))) nextNodes <- phylo$edge[startPoints, 2] storeNodes <- c(storeNodes, nextNodes) stopPoint <- all(nextNodes <= tips) } if(giveTips == T) { tipClade <- storeNodes[which(storeNodes <= Ntip(phylo))] tippo <- phylo$tip.label[tipClade] return(tippo) } else { tipClade <- storeNodes[which(storeNodes >Ntip(phylo))] return(tipClade) } } ####### generateData function generateData <- function(tree, nChrs=100, binToFill=5, probabilityVector=c(0.2,0.2,0.2,0.2,0.2,0,0,0, 0, 0.2), maxCharacters=2, nCores=1, nReps=100) { binStore <- rep(0, 10) intoBins <- seq(0, 1, by=0.1) intNodes <- tree$edge[which(tree$edge[,2] > Ntip(tree)), 2] numbTip <- Ntip(tree) fullBin <- FALSE res <- c() XX <- c() counter <- 1 matrixOut <- list() binnedOutput <- mclapply(1:nReps, mc.cores=nCores, function(k) { while(fullBin == FALSE) { XX <- c() ciOutput <-c() res <- nStates <- c() for(p in 1:nChrs) { X <- rep(0, Ntip(tree)) chrType <- sample(1:10, 1, prob=probabilityVector, replace=T) intNodes <- tree$edge[which(tree$edge[,2] > Ntip(tree)), 2] if(chrType == 10) { ancNode <- sample(intNodes, 1) phyTemp <- nodeDes(tree, ancNode) X[which(!is.na(match(tree$tip.label, phyTemp)))] <- 1 names(X) <- tree$tip.label intNodes <- intNodes[-match(ancNode, intNodes)] monoClades <- nodeDes(tree, intNodes, F) if(length(monoClades) > 0) { inclusiveClades <- sample(c(0:(maxCharacters - 2)),1) incClades <- sample(monoClades, inclusiveClades) for(yy in 1:length(incClades)) { phyTemp <- nodeDes(tree, incClades[yy]) chrClade <- which(!is.na(match(tree$tip.label, phyTemp))) if(!any(table(X[-chrClade]) == 1)) { increaseClade <- max(X) + 1 X[chrClade] <- increaseClade } } } names(X) <- tree$tip.label missingSeq <- sort(which(is.na(match(1: max(X), X)))) while(length(missingSeq) > 0) { for(u in 1:length(missingSeq)) { noTooBig <- X > missingSeq[u] X[noTooBig] <- X[noTooBig] - 1 } missingSeq <- sort(which(is.na(match(1: max(X), X)))) } x = phyDat(as.matrix(X), type = "USER", levels=unique(X)) XX <- cbind(XX, X) } else { lowerBin <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)[chrType] upperBin <- c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)[chrType] inBin <- TRUE count <- 1 while(inBin) { X <- rep(0, Ntip(tree)) sampleTaxa <- sample(c(1:numbTip), round(runif(1, 2, numbTip-2))) lengthTaxa <- length(sampleTaxa) replaceNumbers <- sample(1:(maxCharacters - 1), lengthTaxa, replace=T) missingSeq <- sort(which(is.na(match(1: max(replaceNumbers), replaceNumbers)))) if(length(missingSeq) > 0) for(u in 1:length(missingSeq)) { noTooBig <- replaceNumbers > missingSeq[u] replaceNumbers[noTooBig] <- replaceNumbers[noTooBig] - 1 } X[sampleTaxa] <- replaceNumbers names(X) <- tree$tip.label x = phyDat(as.matrix(X), type = "USER", levels=unique(X)) resNow <- sankoffNew(tree, data=x, cost = NULL) nStatesOne <- max(X) if(is.na(.bincode(nStatesOne/resNow, c(lowerBin, upperBin))) == FALSE) inBin <- FALSE count <- count + 1 if(count > 1000) { chrType <- sample(1:10, 1, prob=probabilityVector, replace=T) lowerBin <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)[chrType] upperBin <- c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)[chrType] count <- 1 } } XX <- cbind(XX, X) } nStates[p] <- max(X) res <- c(res, sankoffNew(tree, data=x, cost = NULL)) } ciOutput <- sum(nStates) / sum(res) tx <- phyDat(XX, type="USER", levels=unique(c(XX))) ciOutput2 <- CI(tree, tx) if(ciOutput > 1) stop("state numbers are messed-up - check code") inCIBin <- as.numeric(names(table(.bincode(ciOutput, intoBins)))) if(ciOutput > intoBins[binToFill ] && ciOutput < intoBins[binToFill + 1]) { colnames(XX) <- NULL matrixOut[[counter]] <- XX fullBin <- TRUE } } return(list(XX, ciOutput)) } ) return(binnedOutput) } ####### tree trees (note branchs are ignored in the data simulation) sym <- read.tree(text="(((((t1:0.2,t2:0.2):0.2,(t3:0.2,t4:0.2):0.2):0.2,((t5:0.2,t6:0.2):0.2,(t7:0.2,t8:0.2):0.2):0.2):0.2,(((t9:0.2,t10:0.2):0.2,(t11:0.2,t12:0.2):0.2):0.2,((t13:0.2,t14:0.2):0.2,(t15:0.2,t16:0.2):0.2):0.2):0.2):0.2,((((t17:0.2,t18:0.2):0.2,(t19:0.2,t20:0.2):0.2):0.2,((t21:0.2,t22:0.2):0.2,(t23:0.2,t24:0.2):0.2):0.2):0.2,(((t25:0.2,t26:0.2):0.2,(t27:0.2,t28:0.2):0.2):0.2,((t29:0.2,t30:0.2):0.2,(t31:0.2,t32:0.2):0.2):0.2):0.2):0.2);") asym <- read.tree(text="(t1:1,(t2:0.9677419355,(t3:0.935483871,(t4:0.9032258065,(t5:0.8709677419,(t6:0.8387096774,(t7:0.8064516129,(t8:0.7741935484,(t9:0.7419354839,(t10:0.7096774194,(t11:0.6774193548,(t12:0.6451612903,(t13:0.6129032258,(t14:0.5806451613,(t15:0.5483870968,(t16:0.5161290323,(t17:0.4838709677,(t18:0.4516129032,(t19:0.4193548387,(t20:0.3870967742,(t21:0.3548387097,(t22:0.3225806452,(t23:0.2903225806,(t24:0.2580645161,(t25:0.2258064516,(t26:0.1935483871,(t27:0.1612903226,(t28:0.1290322581,(t29:0.09677419355,(t30:0.06451612903,(t31:0.03225806452,t32:0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452):0.03225806452);")