###Code for Burbrink et al. 2019 ###Part I. Examining monophyly by gene and setting up data for Neural Networks library(phangorn) library(ape) #get trees list.files()->files lapply() ###names for testing monophyly, where Toxicofera has 54 groups, Anillidae has 2, and Dibamidae has 7 Toxicofera<-"Acrochordidae|Agamidae|Anguidae|Aniliidae|Anniellidae|Anomalepididae|Atractaspididae|Boidae|Bolyeriidae|Calamaridae|Chamaeleonidae|Colubridae|Colubroides|Corytophanidae|Crotaphytidae|Cylindrophiidae|Dactyloidae|Dipsadidae|Elapidae|Elapoidea|Erycidae|Gerrhopilidae|Grayiidae|Helodermatidae|Homalopsidae|Hoplocercidae|Iguanidae|Lamprophiidae|Leiocephalidae|Leiosauridae|Leptotyphlopidae|Liolaemidae|Loxocemidae|Natricidae|Opluridae|Pareatidae|Phrynosomatidae|Polychrotidae|Psammophiidae|Pseudoxenodontidae|Pseudoxyrhophiidae|Pythonidae|Shinisauridae|Sibynophiidae|Tropidophiidae|Tropiduridae|Typhlopidae|Ungaliophiidae|Uropeltidae|Varanidae|Viperidae|Xenodermatidae|Xenopeltidae|Xenosauridae" Aniliidae<-"Aniliidae|Tropidophiidae" Dibamidae<-c("Carphodactylidae|Dibamidae|Dipodactylidae|Eublepharidae|Gekkonidae|Pygopodidae|Sphaerodactylidae") Scleroglossa<-c("Acrochordidae|Amphisbaenidae|Anguidae|Aniliidae|Anniellidae|Anomalepididae|Atractaspididae|Bipedidae|Boidae|Bolyeriidae|Calamaridae|Carphodactylidae|Colubridae|Colubroides| Cordylidae|Cylindrophiidae|Dibamidae|Diplodactylidae|Dipsadidae|Elapidae|Elapoidea|Erycidae|Eublepharidae|Gekkonidae|Gerrhopilidae|Gerrhosauridae|Grayiidae|Gymnophthalmidae|Helodermatidae|Homalopsidae|Lacertidae|Lacertidaea|Lamprophiidae|Leptotyphlopidae|Loxocemidae|Natricidae|Pareatidae|Psammophiidae|Pseudoxenodontidae|Pseudoxyrhophiidae|Pygopodidae|Pythonidae|Scincidae|Shinisauridae|Sibynophiidae|Sphaerodactylidae|Sphenodontidae|Teiidae|Trogonophiidae|Tropidophiidae|Typhlopidae|Ungaliophiidae|Uropeltidae|Varanidae|Viperidae|Xantusiidae|Xenodermatidae|Xenopeltidae|Xenosauridae") Macrostomata<-"Acrochordidae|Atractaspididae|Boidae|Bolyeriidae|Calamaridae|Colubridae|Dipsadidae|Elapidae|Erycidae|Homalopsidae|Lamprophiidae|Loxocemidae|Natricidae|Pareatidae|Psammophiidae|Pseudoxenodontidae|Pythonidae|Sibynophiidae|Tropidophiidae|Ungaliophiidae|Viperidae|Xenodermatidae" Dibamidae<-"Dibamidae|Sphaerodactylidae|Gekkonidae|Eublepharidae|Pygopodidae|Diplodactylidae|Carphodactylidae" lapply(c(1:length(files)), function(x) read.tree(files[[x]]))->Alltrees ###tests if monophyly is found in gene trees given a phylogenyc(trees) and some names of taxa (names) in the form for a list like this names<-"Aniliidae|Tropidophiidae". Each successive version of langaha, langaha2, and mono, scales this up to list of trees and then to lists of lists of trees (bootstraps for multiple loci) langaha<-function(trees, names) { lapply(names, function(x) grep(names, trees$tip.label))->taxa unlist(taxa)->taxa trees$tip.label[taxa]->tips tryCatch(as.numeric(is.monophyletic(phy=trees, tips=tips, reroot = !is.rooted(trees), plot = FALSE)),error=function(e) NA)->out return(out) } ###example for a list of trees (1 tree/locus) or use langaha_1.5 lapply(c(1:length(Alltrees)), function(x) langaha(Alltrees[[x]], Macrostomata) )->try langaha_1.5<-function(trees, names) {lapply(c(1:length(trees)), function(x) langaha(trees[[x]], names) )->out unlist(out)->out2 return(out2)} langaha_2<-function(trees, names) { lapply(c(1:length(trees)), function(x) langaha(trees[[x]], names) )->out unlist(out)->out2 sum(out2)/length(trees)->final return(final) } mono<-function(trees,names) { lapply(c(1:length(trees)), function(x) langaha_2(trees[[x]],names))->out return(out) } ##similar to above, but tests RF values for each bootstrap per locus against a species tree, with successive versions scaling up to multiple loci. new_RF<-function(gene_tree, species_tree) { species_tree$tip.label->st gene_tree$tip.label->tr setdiff(st, tr)->drop drop.tip(species_tree,drop)->newST RF.dist(gene_tree, newST)->val val/(2*(length(newST$tip.label)-3))->out return(out) } #gene_trees is a list of trees, species tree is a single astral tree new_RF2<-function(gene_trees, species_tree) { lapply(c(1:length(gene_trees)), function(x) new_RF(gene_trees[[x]], species_tree) )->out unlist(out)->out2 mean(out2)->final return(final) } new_RF3<-function(gene_trees, species_tree) { lapply(c(1:length(gene_trees)), function(x) new_RF2(gene_trees[[x]],species_tree))->out return(out) } list.files()->files #lapply(c(1:length(files)), function(x) read.tree(files[[x]]))->Alltrees lapply(c(1:100), function(x) read.tree(files[[x]]))->Alltrees1 lapply(c(101:200), function(x) read.tree(files[[x]]))->Alltrees2 lapply(c(201:300), function(x) read.tree(files[[x]]))->Alltrees3 lapply(c(301:370), function(x) read.tree(files[[x]]))->Alltrees4 c(Alltrees1, Alltrees2, Alltrees3, Alltrees4)->Alltrees names<-"Aniliidae|Tropidophiidae" #mono(Alltrees,names)->outputMono. The examples here use Langaha2, so this is for a distribution of bootstrap trees for each locus mono(Alltrees[1:100],names)->outputMono1 mono(Alltrees[101:200],names)->outputMono2 mono(Alltrees[201:300],names)->outputMono3 mono(Alltrees[301:370],names)->outputMono4 tryCatch(mono(Alltrees[201:300],names), error=function(e)(NA))->outputMono3 new_RF3(Alltrees[1:100], astral)->rfout1 new_RF3(Alltrees[101:200], astral)->rfout2 new_RF3(Alltrees[201:300], astral)->rfout3 new_RF3(Alltreesl[301:379], astral)->rfout4 c(rfout1,rfout2, rfout3, rfout4)->rfTot #dput(rfTot, "RF_Dist_Mean_AllBS_ALL_Loci") ###make sure all of your taxa are present in your dataset. A result of 0 means they aren't present testNames<-function(trees, names, number_of_groups) {grep(names, trees$tip.label)->taxa trees$tip.label[taxa]->tax gsub( "_.*", "", tax)->j as.factor(j)->j length(levels(j))->out as.numeric(out==number_of_groups)->out2 res<-list() setdiff return(out2)} lapply(c(1:370), function(x) testNames(Alltrees[[x]], names=Aniliidae, number_of_groups=2))->try ###dna_stats #length and other stats Align_stats<-function (x, check.gaps = TRUE, plot = TRUE, what = 1:4) { cat("\nNumber of sequences:", n <- nrow(x), "\nNumber of sites:", s <- ncol(x), "\n") if (check.gaps) { cat("\n") y <- DNAbin2indel(x) gap.length <- sort(unique.default(y))[-1] if (!length(gap.length)) cat("No gap in alignment.\n") else { rest <- gap.length%%3 if (any(cond <- rest > 0)) { cat("Some gap lengths are not multiple of 3:", gap.length[cond]) } else cat("All gap lengths are multiple of 3.") tab <- tabulate(y, gap.length[length(gap.length)]) tab <- tab[gap.length] cat("\n\nFrequencies of gap lengths:\n") names(tab) <- gap.length print(tab) col1 <- unique(y[, 1]) if (!col1[1]) col1 <- col1[-1] if (length(col1)) cat(" => length of gaps on the left border of the alignment:", unique(col1), "\n") else cat(" => no gap on the left border of the alignment\n") i <- which(y != 0, useNames = FALSE) jcol <- i%/%nrow(y) + 1 yi <- y[i] j <- yi == s - jcol + 1 if (any(j)) cat(" => length of gaps on the right border of the alignment:", yi[j], "\n") else cat(" => no gap on the right border of the alignment\n") A <- B <- numeric() for (i in seq_len(n)) { j <- which(y[i, ] != 0) if (!length(j)) next k <- j + y[i, j] if (j[1] != 1) k <- c(1, k) else j <- j[-1] if (k[length(k)] > s) k <- k[-length(k)] else j <- c(j, s + 1) A <- c(A, j) B <- c(B, k) } AB <- unique(cbind(A, B)) not.multiple.of.3 <- (AB[, 1] - AB[, 2])%%3 != 0 left.border <- AB[, 2] == 1 right.border <- AB[, 1] == s + 1 Nnot.mult3 <- sum(not.multiple.of.3) cat("\nNumber of unique contiguous base segments defined by gaps:", nrow(AB), "\n") if (!Nnot.mult3) cat("All segment lengths multiple of 3.\n") else { Nleft <- sum(not.multiple.of.3 & left.border) Nright <- sum(not.multiple.of.3 & right.border) cat("Number of segment lengths not multiple of 3:", Nnot.mult3, "\n", " => on the left border of the alignement:", Nleft, "\n", " => on the right border :", Nright, "\n") if (Nright + Nleft < Nnot.mult3) { cat(" => positions of these segments inside the alignment: ") sel <- not.multiple.of.3 & !left.border & !right.border cat(paste(AB[sel, 2], AB[sel, 1] - 1, sep = ".."), "\n") } } } } else gap.length <- numeric() ss <- seg.sites(x) cat("\nNumber of segregating sites (including gaps):", length(ss)) BF.col <- matrix(NA_real_, length(ss), 4) for (i in seq_along(ss)) BF.col[i, ] <- base.freq(x[, ss[i]]) tmp <- apply(BF.col, 1, function(x) sum(x > 0)) cat("\nNumber of sites with at least one substitution:", sum(tmp > 1)) cat("\nNumber of sites with 1, 2, 3 or 4 observed bases:\n") tab2 <- tabulate(tmp, 4L) tab2[1] <- s - sum(tab2) names(tab2) <- 1:4 print(tab2) cat("\n") H <- numeric(s) H[ss] <- apply(BF.col, 1, function(x) { x <- x[x > 0] -sum(x * log(x)) }) G <- rep(1, s) G[ss] <- tmp if (plot) { if (length(what) == 4) { mat <- if (length(gap.length)) 1:4 else c(1, 0, 2, 3) layout(matrix(mat, 2, 2)) } else { if (length(what) != 1) { what <- what[1] warning("argument 'what' has length > 1: the first value is taken") } } if (1 %in% what) image(x) if (2 %in% what && length(gap.length)) barplot(tab, xlab = "Gap length") if (3 %in% what) plot(1:s, H, "h", xlab = "Sequence position", ylab = "Shannon index (H)") if (4 %in% what) plot(1:s, G, "h", xlab = "Sequence position", ylab = "Number of observed bases") } res<-list() res$number_of_sequences<-n res$number_sites<-s res$base_freq<-base.freq(x) res$frq_gaps<-tab res$segsites_w_gaps<-length(ss) res$number_sites_1_or_more_subst<-sum(tmp > 1) res$freq_sites_w_1_4_obs_bases<-tab2 #res$length_of_ gaps_on_the_left_border_of_the_alignment<- unique(col1) return(res) } ###get parisimony informative sites #data is in nexus format, folder= ".", make sure you have navigated to that folder or write the link ahead of time, type="absolute", "fraction" of PIS. library(ape) library(ips) site_wise<-function(x) {x <- as.matrix(x) nms <- dimnames(x)[[1]] n <- dim(x) s <- n[2] n <- n[1] keep <- .C(ape:::GlobalDeletionDNA, x, n, s, rep(1L, s))[[4]] x <- x[, as.logical(keep)] s <- dim(x)[2] return(x)} ##getting stats for genes JYD<-function(folder, type) { list.files(folder)->files lapply(c(1:length(files)), function(x) read.dna(files[[x]], format="sequential"))->dat lapply(c(1:length(dat)), function(x) as.alignment(dat[[x]]))->dat2 lapply(c(1:length(dat2)), function(x) as.DNAbin(dat2[[x]]))->dat3 lapply(c(1:length(dat3)), function(x) site_wise(dat3[[x]]))->dat4 lapply(c(1:length(dat4)), function(x) pis(dat3[[x]], type))->dat5 unlist(dat5)->dat6 density(dat6)->dens plot(dens,ylab="density",xlab="pars inf sites", main="Density of Parsimony Informative Sites Up in Here", col="blue") polygon(dens, col="indianred4") files->loci dat6->pis data.frame(files, pis)->out return(out)} ##Neural Net predictions for why genes do not find Aniliidae, Toxicofera, and Dibamidae/Gekkota library(caret) library(picante) library(pROC) install.packages('e1071', dependencies=TRUE) ##data is a table of values for your predictors, here we reduce the number of loci (rows) to meet the number of observations in the testing monophyletic part (toxico) here read.table("NN_Test_Data_Anil_Dib_Tox.txt")->table ##reduce this to test for just Aniliidae (Amerophidea), you can do the same for Dibamida, Aniliidae table[-c(1:2,4,5,7,9)]->anil which(is.na(table$Toxicofera))->rem anil[-rem,]->anil2 dib2 tox2 ##put in your data table for data, add the name of predicted variable, below as Tox2 (3 places), and the total number of variables (as in 2:25) NN_Test<-function(data) { #as.factor(data[,1])->data[,1] maxs <- apply(data, 2, max) mins <- apply(data, 2, min) scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins)) data<-scaled index <- sample(1:nrow(data),round(0.7*nrow(data))) train.cv <- data[index,] test.cv <- data[-index,] as.factor(train.cv[,1])->train.cv[,1] as.factor(test.cv[,1])->test.cv[,1] model.nnet = train(Aniliidae~., train.cv, method="nnet") varImp(object=model.nnet)->important tail(order(important$importance))->stuff prediction1 <- predict(model.nnet, test.cv[,2:31]) prediction2 <- predict(model.nnet, test.cv[,2:31], probability=T,"prob") confused<-confusionMatrix(prediction1, test.cv[,1]) results<-list() results$accuracy<-postResample(pred=prediction1, obs=test.cv[,1])[1][1] transform(test.cv,Aniliidae= sample(Aniliidae) )->trans results$random<-postResample(pred=prediction1, obs=trans[,1])[1][1] results$auc<-roc(test.cv[,1], prediction2[,2])$auc[[1]] results$importance<-row.names(important$importance)[rev(stuff)] results$confusion_AccPVal<-confused$overall[6] return(results) } replicate(500, NN_Test(anil2),F)->output ###if you run each piece of the function and take the model.nnet, you can see variable importance. plot(varImp(object=model.nnet),main="NN - Variable Importance") #example lapply(c(1:500), function(x) output[[x]]$accuracy)->accuracy lapply(c(1:500), function(x) output[[x]]$random)->random lapply(c(1:500), function(x) output[[x]]$auc)->auc lapply(c(1:500), function(x) output[[x]]$importance)->imp lapply(c(1:500), function(x) output[[x]]$confusion_AccPVa)->confusion_AccPVa mean(unlist(accuracy)) mean(unlist(random)) mean(unlist(auc)) mean(unlist(confusion_AccPVa)) sd(unlist(accuracy)) sd(unlist(random)) sd(unlist(auc)) sd(unlist(confusion_AccPVa)) ###Plotting Neuralnet plot(density(unlist(accuracy)),xlim=c(0.5,.9)) polygon(density(unlist(accuracy)), col=rgb(1,0,0,0.5)) polygon(density(unlist(random)), col=rgb(0,0,1,0.5), add=T) abline(v=mean(unlist(accuracy)),lwd=2) abline(v=mean(unlist(random)),lwd=2) table(unlist(imp))->imp2 rev(sort(imp2))->imp2 plot(imp2,lwd=20) plot.nnet(model.nnet) varImp(object=model.nnet)->j row.names(j$importance)[1:5]->impt ####neural net regression to determine which gene characteristics predict RF between genes and species tree read.table("Data_S15. NN_Table_All_Stats.txt",header=T)->table table[,-c(1,2,5,6,32)]->data test_RF_NN<-function(data) { maxs <- apply(data, 2, max) mins <- apply(data, 2, min) scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins)) data<-scaled index <- sample(1:nrow(data),round(0.7*nrow(data))) train.cv <- data[index,] test.cv <- data[-index,] #as.factor(train.cv[,1])->train.cv[,1] #as.factor(test.cv[,1])->test.cv[,1] fit <- train(RF_IQ_TREE ~ ., data = train.cv, method = "nnet", maxit = 1000, trace = T, linout = 1) varImp(object=fit)->important tail(order(important$importance))->stuff row.names(important$importance)[rev(stuff)]->imp predict <- predict(fit, newdata = test.cv) rmse <- sqrt(mean((predict - test.cv$RF_IQ_TREE)^2)) results<-list() results$importance<-imp results$RMSE<-postResample(pred=predict, obs=test.cv$RF_IQ_TREE)[1] results$r2<-postResample(pred=predict, obs=test.cv$RF_IQ_TREE)[2] results$MAE<-postResample(pred=predict, obs=test.cv$RF_IQ_TREE)[3] transform(test.cv,RF_IQ_TREE= sample(RF_IQ_TREE) )->trans results$randomRMSE<-postResample(pred=predict, obs=trans$RF_IQ_TREE)[1] results$random_r2<-postResample(pred=predict, obs=trans$RF_IQ_TREE)[2] results$randomMAE<-postResample(pred=predict, obs=trans$RF_IQ_TREE)[3] return(results) } replicate(100, test_RF_NN(data),F)->output lapply(c(1:100), function(x) output[[x]]$importance)->imp lapply(c(1:100), function(x) output[[x]]$RMSE)->RMSE lapply(c(1:100), function(x) output[[x]]$r2)->r2 lapply(c(1:100), function(x) output[[x]]$MAE)->MAE lapply(c(1:100), function(x) output[[x]]$randomRMSE)->randomRMSE lapply(c(1:100), function(x) output[[x]]$random_r2)->random_r2 lapply(c(1:100), function(x) output[[x]]$randomMAE)->randomMAE mean(unlist(RMSE)) mean(unlist(r2)) mean(unlist(MAE)) mean(unlist(randomRMSE)) mean(unlist(random_r2)) mean(unlist(randomMAE)) sd(unlist(RMSE)) sd(unlist(r2)) sd(unlist(MAE)) sd(unlist(randomRMSE)) sd(unlist(random_r2)) sd(unlist(randomMAE)) ###Plotting Neuralnet plot(density(unlist(r2)),xlim=c(0,1)) polygon(density(unlist(r2)), col=rgb(1,0,0,0.5)) polygon(density(unlist(random_r2), adjust=3), col=rgb(0,0,1,0.5), adjust=2, add=T) abline(v=mean(unlist(r2)),lwd=1) abline(v=mean(unlist(random_r2)),lwd=1) table(unlist(imp))->imp2 rev(sort(imp2))->imp2 plot(imp2,lwd=20) ####Determine how multicolinearity affects NN. First figrue out which variables show mutlicolinearity and then remove ####determine how multicolinearity affects NN. First figrue out which variables show mutlicolinearity and then remove library(faraway) read.table("Data_S15. NN_Table_All_Stats.txt",header=T)->table table[,-c(1,2,5,6,32)]->data maxs <- apply(data, 2, max) mins <- apply(data, 2, min) scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins)) #figure out which have inflated variances and keep only one, look at "rem" to see which have colinearity > 10 vif(scaled)->out which(out>10)->rem #keep PIS, remove the rest rem[-(which(names(rem)=="pis"))]->rem2 match(names(rem2),names(scaled))->rem3 scaled[-rem3]->scaled2 replicate(100, test_RF_NN(scaled2),F)->output lapply(c(1:100), function(x) output[[x]]$importance)->imp lapply(c(1:100), function(x) output[[x]]$RMSE)->RMSE lapply(c(1:100), function(x) output[[x]]$r2)->r2 lapply(c(1:100), function(x) output[[x]]$MAE)->MAE lapply(c(1:100), function(x) output[[x]]$randomRMSE)->randomRMSE lapply(c(1:100), function(x) output[[x]]$random_r2)->random_r2 lapply(c(1:100), function(x) output[[x]]$randomMAE)->randomMAE mean(unlist(RMSE)) mean(unlist(r2)) mean(unlist(MAE)) mean(unlist(randomRMSE)) mean(unlist(random_r2)) mean(unlist(randomMAE)) sd(unlist(RMSE)) sd(unlist(r2)) sd(unlist(MAE)) sd(unlist(randomRMSE)) sd(unlist(random_r2)) sd(unlist(randomMAE)) table(unlist(imp))->imp2 rev(sort(imp2))->imp2 ###Part II. Make plots for phylogenetic informativeness per locus segplot(reorder(factor(Old_Locus_Names), diff_phyInf_90)~min_phyInf_90+max_phyInf_90,data=table) ##Writing_trees mapply(function(x,y) write.tree(x, y), x=trees, y=ufboot ) #Printing squamate Trees library(OutbreakTools) library(strap) t <- read.annotated.nexus( "Data_S6_Dated_Astral_IQ_Point_Tree" ) t$root.time <- t$root.annotation$height gsub("_[^_]+$","", t$tip.label)->names gsub("^.*?_", "", names)->names2 t$tip.label<-names2 pdf(height=20, width=20) geoscalePhylo(tree=ladderize(t,right=FALSE), units=c("Period", "Epoch"), boxes="Epoch", cex.tip=0.4, cex.age=0.7, cex.ts=1, label.offset=0.8, x.lim=c(-30,240), lwd=1, width=1) dev.off() ###tree support for each gene over each node like line figure in paper with support and no suport over all genes for each node in the species tree library(phangorn) read.nexus("Dated_Tree")->tree tree$node.label<-NULL force.ultrametric(tree)->tree read.tree( "Sq_IQ_Trees.tre" )->iq iq[[1]]->gene #get names, tree is the dates species tree and gene is the locus tree node_Date_Mono<-function(tree, gene) { setdiff(tree$tip.label, gene$tip.label)->mis drop.tip(tree, mis)->junk branching.times(junk)->bt as.numeric(row.names(as.data.frame(bt)))->b2 lapply(b2, function(x) Descendants(junk, x, "tips"))->desc lapply(c(1:length(desc)), function(x) junk$tip.label[desc[[x]][[1]]])->names lapply(c(1:length(names)), function(x) as.numeric(is.monophyletic(phy=gene, tips=names[[x]], reroot = !is.rooted(gene), plot = FALSE)))->out unlist(out)->out as.matrix(bt)->bt3 round(bt3,6)->bt4 cbind(bt4,out)->table as.data.frame(table)->table return(table) } #merged_tables lapply(c(1:length(iq)),function(x) node_Date_Mono(tree,iq[[x]]))->output #lapply(c(1:length(output)),function(x) output[[x]]$out)->j #data.frame(matrix(unlist(j), nrow=length(j), byrow=T))->z me<-Reduce(function(...) merge(..., all.x=T, all.y = F, by = "V1"), output) ##sum up the zeros (no support), ones (support), and NAs and put this into merged, or just get the table below lapply(c(1:length(me[,1])),function(x) length(which(me[x,]==1)))->support unlist(support)->support lapply(c(1:length(me[,1])),function(x) length(which(me[x,]==0)))->no_support -1*unlist(no_support)->no_support_negative cbind(me$V1,support, no_support_negative)->merged data.frame(merged)->merged names(merged)[1]<-"Date" ##sum up the zeros (no support), ones (support), and NAs and put this into merged, or just get the table below read.table( "Support_by_node_table" )->merged ##make the graphs ggplot() + geom_area(data=merged, aes(x=Date, y=support), fill="darkred")+geom_area(data=merged, aes(x=Date, y=no_support_negative), fill="darkblue")->p p+ scale_x_continuous(breaks = scales::pretty_breaks(n = 70))+ geom_point() + annotate("point", y=200, x=171.6975 ,colour = "green") + annotate("point", y=251, x=93.058439 ,colour = "blue") + annotate("point", y=144, x= 186.67769 ,colour = "orange") + annotate("point", y=358, x= 126.2685 ,colour = "yellow") + annotate("point", y=319, x= 154.4258 ,colour = "lightblue") + annotate("point", y=339, x= 131.9024 ,colour = "lightgreen") + annotate("point", y=256, x= 168.7758 ,colour = "black")+ annotate("point", y=296, x= 174.77 ,colour = "brown")+ annotate("point", y=355, x= 121.4431 ,colour = "grey") + annotate("point", y=237, x= 186.1226 ,colour = "pink") ###or plot separately: plot(merged$Date,merged$support,type="h",col="darkred") plot(merged$Date,merged$no_support,type="h",col="darkblue") # add points to the graph relating date and support by node using points() with x=support and y=millions of years ###Part III. Compare support among tree types library(ape) read.tree("Data_S2_Tree_1_Squamate_Astral_IQ_BS_Tree")->AS_BS read.tree("Data_S4_Tree_3_Squamata_IQ_Astral_Astral_Support" )->AS read.tre("Data_S5_Tree_4_Squam_IQ_Concat_Part_BS_SH.treefile")->IQ matchNodes(AS_BS,IQ)->match as.data.frame(match)->match names(match)<-c("as_bs_nodes","iq_nodes") matchNodes(AS,IQ)->match2 as.data.frame(match2)->match2 names(match2)<-c("AS_nodes","iq_nodes") #which(is.na(match[,2])=="TRUE")->drop #match[-drop,]->m AS_BS$node.label->AS_BS_N AS_BS->J_AS_BS J_AS_BS$node.label<-NULL branching.times(J_AS_BS)->AS_BS_BT names(AS_BS_BT) names(AS_BS_N)<-names(AS_BS_BT) as.data.frame(AS_BS_N)->df1 cbind(row.names(df1),df1)->df1.1 names(df1.1)<-c("as_bs_nodes", "AS_BS_N" ) as.data.frame(df1.1)->df1.1 merge(match,df1.1,by="as_bs_nodes")->x IQ_BS_SH$node.label->IQ_BS_SH_N IQ_BS_SH->J_IQ_BS_SH J_IQ_BS_SH$node.label<-NULL branching.times(J_IQ_BS_SH)->IQ_BS_SH_BT names(IQ_BS_SH_N)<-names(IQ_BS_SH_BT) as.data.frame(IQ_BS_SH_N)->df2 cbind(row.names(df2),df2)->df2.1 names(df2.1)<-c("iq_nodes", "IQ_BS_SH_N" ) as.data.frame(df2.1)->df2.1 merge(x,df2.1,by="iq_nodes")->z ##correlation between Astral IQ Boots and IQ_BS Support cor.test(as.numeric(as.character((z$AS_BS_N))), as.numeric(gsub("/.*","",z$IQ_BS_SH_N)),method="spearman") ##correlation between Astral IQ Boots and IQ_SH Support cor.test(as.numeric(as.character((z$AS_BS_N))), as.numeric(gsub(".*/","",z$IQ_BS_SH_N)),method="spearman") AS$node.label->AS_N AS->J_AS J_AS$node.label<-NULL branching.times(J_AS)->AS_BT names(AS_BT) names(AS_N)<-names(AS_BT) as.data.frame(AS_N)->df3 cbind(row.names(df3),df3)->df3.1 names(df3.1)<-c("AS_nodes", "AS_N" ) as.data.frame(df3.1)->df3.1 merge(match2,df3.1,by="AS_nodes")->y merge(y,df2.1,by="iq_nodes")->zy ##correlation between Astra PP_Support and IQ_BS Support cor.test(as.numeric(as.character((zy$AS_N))), as.numeric(gsub("/.*","",zy$IQ_BS_SH_N)),method="spearman") ##correlation between Astral PP_Support and IQ_SH Support cor.test(as.numeric(as.character((zy$AS_N))), as.numeric(gsub(".*/","",zy$IQ_BS_SH_N)),method="spearman") ####Part IV. Below are simple functions used throughout this paper: ###Adds root to a tree similar to how figtree does add.root<-function(tree) { terms <- tree$edge[, 2] <= Ntip(tree) terminal.edges <- tree$edge.length[terms] names(terminal.edges)<-tree$tip.label; names(terminal.edges) <- tree$tip.label[tree$edge[terms, 2]] as.list(terminal.edges)->te te$Sphenodontidae_Sphenodon_punctatus_I5870/2->edge which(tree$tip.label=="Sphenodontidae_Sphenodon_punctatus_I5870")->num reroot(tree, num, position=edge)->out return(out)} ####Convert Names using "Convert_Family_Names" format read.tree("Data_S6_Dated_Astral_IQ_Point_Tree")->tree as.data.frame(tree$tip.label)->TL joined[match(TL$tips, joined$Old_Sequences),]->stuff cbind(TL,stuff$Final)->replace as.character(replace$`stuff$Final`)->repl tree$tip.label<-replace$`stuff$Final` convertor_names<-function(tree, conv) { conv[match(tree$tip.label, conv$tips),]->try tree$tip.label<-as.character(try[,2]) return(tree) } tab[!duplicated(tab$Family),]->new #################################################################################### ###STATS AT END ####determine how multicolinearity affects NN. First figrue out which variables show mutlicolinearity and then remove library(faraway) read.table("Data_S15. NN_Table_All_Stats.txt",header=T)->table table[,-c(1,2,5,6,32)]->data maxs <- apply(data, 2, max) mins <- apply(data, 2, min) scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins)) #figure out which have inflated variances and keep only one, look at "rem" to see which have colinearity > 10 vif(scaled)->out which(out>10)->rem #keep PIS, remove the rest rem[-(which(names(rem)=="pis"))]->rem2 match(names(rem2),names(scaled))->rem3 scaled[-rem3]->scaled2 replicate(100, test_RF_NN(scaled2),F)->output lapply(c(1:100), function(x) output[[x]]$importance)->imp lapply(c(1:100), function(x) output[[x]]$RMSE)->RMSE lapply(c(1:100), function(x) output[[x]]$r2)->r2 lapply(c(1:100), function(x) output[[x]]$MAE)->MAE lapply(c(1:100), function(x) output[[x]]$randomRMSE)->randomRMSE lapply(c(1:100), function(x) output[[x]]$random_r2)->random_r2 lapply(c(1:100), function(x) output[[x]]$randomMAE)->randomMAE mean(unlist(RMSE)) mean(unlist(r2)) mean(unlist(MAE)) mean(unlist(randomRMSE)) mean(unlist(random_r2)) mean(unlist(randomMAE)) sd(unlist(RMSE)) sd(unlist(r2)) sd(unlist(MAE)) sd(unlist(randomRMSE)) sd(unlist(random_r2)) sd(unlist(randomMAE)) showing multicolinearity: as.list(rem2) $phy_info_166_176 [1] 3 $min_phyInf_50 [1] 4 $max_phyInf_50 [1] 5 $diff_phyInf_50 [1] 6 $min_phyInf_90 [1] 7 $max_phyInf_90 [1] 8 $diff_phyInf_90 [1] 9 $max_gaps [1] 11 $mean_gaps [1] 13 $sd_gaps [1] 14 $base_a [1] 15 $base_c [1] 16 $base_g [1] 17 $base_t [1] 18 $number_sites_1_or_more_subst [1] 19 $number_seg_sites_w_gaps [1] 20 $freq_sites_w_1_obs_bases [1] 21 $freq_sites_w_2_obs_bases [1] 22 $freq_sites_w_3_obs_bases [1] 23 $freq_sites_w_4_obs_bases [1] 24 $number_sites [1] 25 $pis [1] 26 mean(unlist(RMSE)) [1] 0.06249137 > mean(unlist(r2)) [1] 0.876175 > mean(unlist(MAE)) [1] 0.04750042 > mean(unlist(randomRMSE)) [1] 0.2438684 > mean(unlist(random_r2)) [1] 0.009500705 > mean(unlist(randomMAE)) [1] 0.1731723 > > > sd(unlist(RMSE)) [1] 0.004702842 > sd(unlist(r2)) [1] 0.02547815 > sd(unlist(MAE)) [1] 0.003108459 > sd(unlist(randomRMSE)) [1] 0.02976139 > sd(unlist(random_r2)) [1] 0.01275042 > sd(unlist(randomMAE)) [1] 0.02093342 Variable importance pis Number_Tips num_gaps Conc_Factors_Sites_SD Conc_Factors_Sites_Mean 100 100 100 100 100