This script details how phylogenetic signal was estimated for discrete characters and how ancestral states were reconstructed using stochastic character mapping

import packages

library(ape)
library(geiger)
library(phytools)

import data

tree<-read.nexus("~/r_scripts/data_files/beast_MCC.tre")
my.data<-read.csv("~/r_scripts/data_files/my_data.csv", header=T, row.names = 1)

match tree and data

tree<-drop.tip(tree, name.check(tree, my.data)$tree_not_data)
my.data<-my.data[tree$tip.label,]
all(rownames(my.data)==tree$tip)
## [1] TRUE
name.check(tree, my.data)
## [1] "OK"
plot(tree, cex=0.5)

Phylogenetic Signal

ML estimate of trait evolution under Pagel’s Lambda and with no phylogenetic component (star phylogeny)

rep.modes<-my.data$rep_mode; names(rep.modes)<-rownames(my.data)
rep.lamb<-fitDiscrete(tree, rep.modes, transform="lambda")
rep.lamb0<-fitDiscrete(tree,rep.modes, transform="white")

Comapare AICc scores

rep.lamb0$opt$aicc
## [1] 178.4174
rep.lamb$opt$aicc
## [1] 97.42607
(dAICc<-rep.lamb0$opt$aicc-rep.lamb$opt$aicc)
## [1] 80.99133

Stochastic Character Mapping (SCM)

Ancestral state reconstruction was perfomed using Stochastic Character mapping. Due to the paraphyletic nature of African Bufonidae, this was done only on Clade A (see text for details).

reimport data

reimport data and prune to include just Clade A. We will also create colour vectors for reproductive mode and habitat cluster (from the UPGMA)

my.data<-read.csv("~/r_scripts/data_files/my_data.csv", header=T, row.names = 1)
tree<-read.nexus("~/r_scripts/data_files/beast_MCC.tre")
tree<-extract.clade(tree, 131) #extract Clade A
my.data<-my.data[tree$tip,]
# colour vectors:
hab_cols<-c("#a595ed","#c4c0d3","#F56476","#BE3E82"); names(hab_cols)<-c(1,2,3,4)
rep_cols<-c("deepskyblue3","orange","#B404AE","gold","chartreuse3"); names(rep_cols)<-c("generalized","ephem","terr_eggs","steep_terr","viviparity")

plot(tree, cex=0.5, label.offset = 2)
tiplabels(pch=20, col=hab_cols[my.data$upgma_clusters], adj=1)
tiplabels(pch=20, col=rep_cols[as.character(my.data$rep_mode)], adj=2)

SCM for habitat preferences

make a probability matrix of tip states

hab.mode.prob<-data.frame(model.matrix(~as.character(my.data$upgma_clusters)-1))
colnames(hab.mode.prob)<-c("1","2","3","4")
rownames(hab.mode.prob)<-tree$tip
hab.mode.prob<-hab.mode.prob[tree$tip,]
hab.mode.prob<-as.matrix(hab.mode.prob)
name.check(tree, hab.mode.prob)
## [1] "OK"
head(hab.mode.prob)
##                             1 2 3 4
## Poyntonophrynus_kavangensis 0 0 1 0
## Mertensophryne_uzunguensis  0 1 0 0
## Nectophrynoides_wendyae     0 1 0 0
## Poyntonophrynus_damaranus   0 0 1 0
## Poyntonophrynus_dombensis   0 0 1 0
## Mertensophryne_lindneri     0 0 1 0

simulate stochastic character maps

sim.hab.trees <- make.simmap(tree, hab.mode.prob, model = "ER", nsim = 20, Q="mcmc", pi="estimated") #NOTE! the number of simulations has been reduced to 20 here for illustration purposes. Run code with nsim=999.
## Running MCMC burn-in. Please wait....
## Running 2000 generations of MCMC, sampling every 100 generations.
## Please wait....
## 
## make.simmap is simulating with a sample of Q from
## the posterior distribution
## 
## Mean Q from the posterior is
## Q =
##             1           2           3           4
## 1 -0.05831809  0.01943936  0.01943936  0.01943936
## 2  0.01943936 -0.05831809  0.01943936  0.01943936
## 3  0.01943936  0.01943936 -0.05831809  0.01943936
## 4  0.01943936  0.01943936  0.01943936 -0.05831809
## and (mean) root node prior probabilities
## pi =
##    1    2    3    4 
## 0.25 0.25 0.25 0.25
hab_simmap <- describe.simmap(sim.hab.trees, plot=F)

plot summarized node states as pie charts on tree

par(mar=c(4,4,4,4))
plot(tree, show.tip.label=T, label.offset=1, cex=0.4, main="SCM of Habitat Preferences")
tiplabels(pch=20, col=hab_cols[my.data$upgma_clusters],cex=1, adj=1)
nodelabels(pie=hab_simmap$ace, piecol= hab_cols[colnames(hab_simmap$tips)], cex = 0.5)
legend("bottomleft", bty="n", pch=19, col=hab_cols, legend=names(hab_cols), title="Hab. pref.")

SCM for reproductive mode

same code as above, but for reproductive modes:

make a probability matrix of tip states

rep.mode.prob<-data.frame(model.matrix(~my.data$rep_mode-1))
colnames(rep.mode.prob)<-levels(my.data$rep_mode)
rownames(rep.mode.prob)<-tree$tip
rep.mode.prob<-rep.mode.prob[tree$tip,]
rep.mode.prob<-as.matrix(rep.mode.prob)
name.check(tree, rep.mode.prob)
## [1] "OK"
head(rep.mode.prob)
##                             ephem generalized steep_terr terr_eggs
## Poyntonophrynus_kavangensis     0           1          0         0
## Mertensophryne_uzunguensis      0           1          0         0
## Nectophrynoides_wendyae         0           0          0         0
## Poyntonophrynus_damaranus       0           1          0         0
## Poyntonophrynus_dombensis       0           1          0         0
## Mertensophryne_lindneri         0           1          0         0
##                             viviparity
## Poyntonophrynus_kavangensis          0
## Mertensophryne_uzunguensis           0
## Nectophrynoides_wendyae              1
## Poyntonophrynus_damaranus            0
## Poyntonophrynus_dombensis            0
## Mertensophryne_lindneri              0
## NOTE: for species where reproductive mode is unknown/uncertain, we can include a degree of uncertainty in the tip state coding. For example, Churamiti maridadi is only inferred to be a generalized tadpole breeder, but in reality, we do not know. We could therefore assign euqal proabilities for each state like so:
#rep.mode.prob["Churamiti_maridadi",]<-c(0.2,0.2,0.2,0.2,0.2)

simulate stochastic character maps

sim.rep.trees <- make.simmap(tree, rep.mode.prob, model = "ER", nsim = 20, Q="mcmc", pi="estimated") #NOTE! the number of simulations has been reduced to 20 here for illustration purposes. Run code with nsim=999.
## Running MCMC burn-in. Please wait....
## Running 2000 generations of MCMC, sampling every 100 generations.
## Please wait....
## 
## make.simmap is simulating with a sample of Q from
## the posterior distribution
## 
## Mean Q from the posterior is
## Q =
##                    ephem  generalized   steep_terr    terr_eggs
## ephem       -0.012680834  0.003170209  0.003170209  0.003170209
## generalized  0.003170209 -0.012680834  0.003170209  0.003170209
## steep_terr   0.003170209  0.003170209 -0.012680834  0.003170209
## terr_eggs    0.003170209  0.003170209  0.003170209 -0.012680834
## viviparity   0.003170209  0.003170209  0.003170209  0.003170209
##               viviparity
## ephem        0.003170209
## generalized  0.003170209
## steep_terr   0.003170209
## terr_eggs    0.003170209
## viviparity  -0.012680834
## and (mean) root node prior probabilities
## pi =
##       ephem generalized  steep_terr   terr_eggs  viviparity 
##         0.2         0.2         0.2         0.2         0.2
rep_simmap <- describe.simmap(sim.rep.trees, plot=F)

plot summarized node states as pie charts on tree

par(mar=c(4,4,4,4))
plot(tree, show.tip.label=T, label.offset=1, cex=0.4, main="SCM of Reproductive Mode")
tiplabels(pch=20, col=rep_cols[as.character(my.data$rep_mode)],cex=2, adj=1)
nodelabels(pie=rep_simmap$ace, piecol= rep_cols[colnames(rep_simmap$tips)], cex = 0.5)
legend("bottomleft", bty="n", pch=19, col=rep_cols, legend=names(rep_cols), title="Rep. modes")

making plots with painted branches

the following code plots the SCM summaries as above, but colours any downstream branches (towards the tips) the most probably state of that lineage.

par(mfrow=c(1,2))
hab.tree<-tree
for(i in 1:nrow(hab_simmap$ace)){
  hab.tree<-paintSubTree(hab.tree,node=as.numeric(rownames(hab_simmap$ace)[i]),state=names(which.max(hab_simmap$ace[i,])))
}
plotSimmap(hab.tree,hab_cols,lwd=3, fsize=0.5, offset=1)
tiplabels(pch=20, col=hab_cols[my.data$upgma_clusters],cex=2,adj = 2)
nodelabels(pie=hab_simmap$ace,piecol= hab_cols[colnames(hab_simmap$tips)], cex = 1)
legend("bottomleft", bty="n", pch=19, col=hab_cols, legend=names(hab_cols), title="Hab. pref.", pt.cex=2, cex=0.75)

rep.tree<-tree
for(i in 1:nrow(rep_simmap$ace)){
  rep.tree<-paintSubTree(rep.tree,node=as.numeric(rownames(rep_simmap$ace)[i]),state=names(which.max(rep_simmap$ace[i,])))
}
plotSimmap(rep.tree,rep_cols,lwd=3, fsize = 0.5, offset=1) ### note, for some reason leftwards causes problems with plotting pies on nodes... 
tiplabels(pch=20, col=rep_cols[as.character(my.data$rep_mode)],cex=2, adj=2)
nodelabels(pie=rep_simmap$ace,piecol= rep_cols[colnames(rep_simmap$tips)], cex = 1)
legend("bottomleft", bty="n", pch=20, col=rep_cols, legend=names(rep_cols), title="Rep modes", pt.cex = 2, cex=0.75)