##Functions for host use models ##function embedded within host use model to split native food web into calibration / prediction datasets K_fold <- function(Nobs,K=5){ rs <- runif(Nobs) id <- seq(Nobs)[order(rs)] k <- as.integer(Nobs*seq(1,K-1)/K) k <- matrix(c(0,rep(k,each=2),Nobs),ncol=2,byrow=TRUE) k[,1] <- k[,1]+1 l <- lapply(seq.int(K),function(x,k,d) list(train=d[!(seq(d) %in% seq(k[x,1],k[x,2]))], test=d[seq(k[x,1],k[x,2])]),k=k,d=id) return(l) } ##key function for making host use model --only uses native food web. Splits food web into specified K-folds. For each herbivore, estimates minimum ##phylogenetic distance between any 'native' (= calibration) host of that herbivore and 'non-native' (=prediction) potential host. ##parameterizes binomial GLM with the form: novel host use (Y/N) ~ number of native hosts of herbivore * phylogenetic distance to novel host. ##currently, function uses phylogeny as only outside info on plants, but could easily be expanded to include plant traits or attributes of herbivore. makehostmodel<-function(foodweb,tree,K=5){ #set.seed(7402356) kmoth<-K_fold(nrow(foodweb), K=K) nhost<-apply(foodweb, 2,sum) ###run models for each K-fold modlist<-list() modlist_range<-list() for(h in 1:K){ print(paste("K-fold", h, "of", K)) flush.console() train<-foodweb[kmoth[[h]]$train,] train<-t(train) train<-train[which(rowSums(train)>0),] test<-foodweb[kmoth[[h]]$test,] test<-t(test) test<-test[which(rownames(test)%in%rownames(train)),] mdistmat<-matrix(NA, nrow=ncol(test), ncol=nrow(train)) rownames(mdistmat)<-colnames(test) colnames(mdistmat)<-rownames(train) print("estimating phylo distance") flush.console() for(i in 1:nrow(train)){ mdat<-train[i,] m<-row.names(train)[i] hostpl<-names(mdat[-which(mdat==0)]) hostdat<-tree[which(row.names(tree)%in% hostpl),] dmat<-matrix(NA, nrow=ncol(test), ncol=length(hostpl)) row.names(dmat)<-colnames(test) if(length(hostpl)==1){ pdist<-hostdat[which(names(hostdat)%in% colnames(test))] pdist<-pdist[which(duplicated(names(pdist))==F)] pdist<-pdist[colnames(test)] dmat[,1]<-pdist } if(length(hostpl)>1){ for(j in 1:length(hostpl)){ h1<-hostdat[j,] pdist<-h1[which(names(h1)%in% colnames(test))] pdist<-pdist[which(duplicated(names(pdist))==F)] pdist<-pdist[colnames(test)] dmat[,j]<-pdist } } dmin<-apply(dmat, 1, min) mdistmat[,i]<-dmin } num_host<-rep(NA, nrow(test)*ncol(test)) num_genhost<-rep(NA, nrow(test)*ncol(test)) nhost_cor<-nhost[which(names(nhost)%in%rownames(test))] for(i in 1:length(nhost_cor)){ nho<-rep(nhost_cor[i], ncol(test)) num_host[((i-1)*ncol(test)+1):((i-1)*ncol(test)+ncol(test))]<-nho } mothnam<-rownames(test) mothmoth<-rep(NA, nrow(test)*ncol(test)) for(i in 1:length(mothnam)){ mnam<-rep(mothnam[i], ncol(test)) mothmoth[((i-1)*ncol(test)+1):((i-1)*ncol(test)+ncol(test))]<-mnam } plantplant<-rep(colnames(test), nrow(test)) logdat<-data.frame(moth=mothmoth, plant=plantplant, real_int=as.numeric(t(test)), min_phylo=as.numeric(mdistmat), nhost=num_host) logdat$min_phylo[which(logdat$min_phylo=="Inf")]<-1000 print("running models in high heels") flush.console() mod1<-glm(real_int ~ min_phylo*nhost, data=logdat, family="binomial") modlist[[h]]<-mod1 } modmat<-matrix(NA,nrow=length(modlist), ncol=length(modlist[[1]]$coeff)) for(i in 1:length(modlist)){modmat[i,]<-modlist[[i]]$coeff} AveCoeff<-apply(modmat,2,mean) names(AveCoeff)<-names(modlist[[1]]$coeff) modAve<-modlist[[1]] modAve$coefficients<-AveCoeff return(modAve) } ##Key function for predicting host use model (output of previous function 'makehostmodel') onto dataset of non-native plants, and assessing ##predictive ability using AUC & ROC curves. ##the function assumes that you have real information on novel interactions between herbivores and non-native plants, but could easily ##be truncated to make predictions without any validation (such as might be the case for situations where a plant has not yet been introduced. predictNovelHosts<-function(model,novelWeb,nativeWeb,tree){ mdistmat<-matrix(NA, nrow=nrow(novelWeb), ncol=ncol(novelWeb)) rownames(mdistmat)<-rownames(novelWeb) colnames(mdistmat)<-colnames(novelWeb) nativeWeb<-t(nativeWeb) for(i in 1:nrow(nativeWeb)){ m<-row.names(nativeWeb)[i] mdat<-nativeWeb[i,] if(length(mdat[-which(mdat==0)])==0) mdistmat[,i]<-rep(NA,nrow(mdistmat)) if(length(mdat[-which(mdat==0)])>0){ hostpl<-names(mdat[-which(mdat==0)]) hostdat<-tree[which(row.names(tree)%in% hostpl),] dmat<-matrix(NA, nrow=nrow(novelWeb), ncol=length(hostpl)) row.names(dmat)<-rownames(mdistmat) if(length(hostpl)==1){ exdist<-hostdat[which(names(hostdat)%in% rownames(dmat))] exdist<-exdist[which(duplicated(names(exdist))==F)] exdist<-exdist[rownames(dmat)] dmat[,1]<-exdist } if(length(hostpl)>1){ for(j in 1:length(hostpl)){ h1<-hostdat[j,] exdist<-h1[which(names(h1)%in% rownames(dmat))] exdist<-exdist[which(duplicated(names(exdist))==F)] exdist<-exdist[rownames(dmat)] dmat[,j]<-exdist } } dmin<-apply(dmat, 1, min) dmin<-dmin[rownames(mdistmat)] mdistmat[,i]<-dmin } } mdistmat[is.na(mdistmat)]<-mean(mdistmat, na.rm=T) nhost<-apply(nativeWeb, 1,sum) num_host<-rep(NA, nrow(novelWeb)*ncol(novelWeb)) for(i in 1:length(nhost)){ nho<-rep(nhost[i], nrow(novelWeb)) num_host[((i-1)*nrow(novelWeb)+1):((i-1)*nrow(novelWeb)+nrow(novelWeb))]<-nho } mothnam<-colnames(novelWeb) mothmoth<-rep(NA, nrow(novelWeb)*ncol(novelWeb)) for(i in 1:length(mothnam)){ mnam<-rep(mothnam[i], nrow(novelWeb)) mothmoth[((i-1)*nrow(novelWeb)+1):((i-1)*nrow(novelWeb)+nrow(novelWeb))]<-mnam } plantplant<-rep(rownames(novelWeb), ncol(novelWeb)) logdat<-data.frame(moth=mothmoth, plant=plantplant, real_int=as.numeric(novelWeb), min_phylo=as.numeric(mdistmat), nhost=num_host) #logdat$BW_range<-plantrange[logdat$plant] logdat_nn<-logdat ###predict the natives model onto the nonnative plants modpred<-prediction(predictions=predict(model, newdata=logdat), labels=logdat$real_int) modperf<-performance(modpred, measure="tpr", x.measure="fpr") #modperf_all<-modperf #plot(modperf) auc<-performance(modpred, measure="auc") acc<-performance(modpred, measure="f") auc<-auc@y.values[[1]] return(list(auc,modperf,logdat, acc, modpred)) } #### dataset-specific code ###### ##The above functions may be generally useful in constructing and predicting from host-use models, but the code below is specific to Altermatt & Pearse 2015 Ecol Appl ##some of this code may require tinkering, as path names / file names will differ. Likewise, many of the data-removal simulations take a very long time ##to run, so I cached saved versions of those simulations, which are not provided here (I'm happy to provide them to you if you're interested, but I'm trying ##to keep data as straightforward as possible and avoid submitting a whole pile of output files from the simulations whose code is below). ##load data files #file 1 = phylogeny #file 2 = list of all plants in Baden Wuerttemburg #file 3 = matrix of plant-moth interactions #file 4 = list of all moth names referenced to moth ID number ##common graphical function error.bar <- function(x, y, upper, lower=upper, length=0.1,...){ if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper)) stop("vectors must be same length") arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...) } tr<-read.tree("German Phyl trimmed to AltPea dataset.tre") #tr<-read.tree("plant_phylo_2010_11_30.tre") head(tr$tip.label) #plant<-read.table("BW_Plants_20101117_use.txt", header=TRUE, as.is=TRUE) plant<-read.csv("plant_information.csv") moth<-read.table("matrix_species_20120901.txt",header=TRUE, as.is=TRUE) moth[867,537]<-1 #plant$species<-tolower(plant$species) mothdat<-read.table("data_traits_Lepidoptera_20101217.txt", header=T, as.is=T) ##culling full plant list to phylogeny. This is getting rid of ferns plant<-plant[which(plant$species %in% tr$tip.label),] all(plant$species %in% tr$tip.label) ##should be TRUE #colnames(moth)<-tolower(colnames(moth)) #a<-tolower(colnames(moth)) %in% plant$species rownames(moth)<-moth[,1] moth<-moth[,which(colnames(moth) %in% plant$species)] ##adding plants with no moths into matrix eatplant<-plant$species[which(plant$species %in% colnames(moth))] noeatplant<-plant$species[-which(plant$species %in% eatplant)] length(noeatplant) ##should be 1838 nullmat<-matrix(0,nrow(moth),length(noeatplant)) colnames(nullmat)<-noeatplant moth<-cbind(moth,nullmat) all(colnames(moth) %in% tr$tip.label) all(plant$species %in% colnames(moth)) ###this dataset is now ready for analysis native<-plant[-which(plant$status == "ornamental_plant" | plant$status =="neophyte"),] moth<-t(moth) exotic<-plant[which(plant$status == "ornamental_plant" | plant$status =="neophyte"),] trmat<-cophenetic(tr) trmat<-trmat/max(trmat) mothnat<-moth[-which(rownames(moth)%in%exotic$species),] mothnatabs<-mothnat mothnat[which(mothnat>1)]<-1 miss<-colnames(mothnat[,which(colSums(mothnat)==0)]) mothnat<-mothnat[,-which(colnames(mothnat)%in%miss)] ##Arrange non-native moth-plant food web exint<-moth[which(rownames(moth)%in% exotic$species),] exint<-exint[which(duplicated(rownames(exint))==F),] exint[exint>0]<-1 exint<-exint[,-which(colnames(exint)%in%miss)] ##Get Full Model and Predictions #BestModel<-makehostmodel(mothnat,trmat) #BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothnat,tree=trmat) ##Calculate AUC for models with decreasing pres<-which(mothnat==1) absenc<-which(mothnat==0) repeats<-4 presrem<-seq(0,4720,20) # outlist<-as.list(c(1:(length(presrem)*repeats))) # for(i in 1:(length(presrem))){ # for(j in 1:repeats){ # pr<-presrem[i] # print(paste(pr,"removed; repeat:",j)) # rem<-sample(pres, pr) # mothred<-mothnat # mothred[rem]<-0 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # if(round(i/50)==i/50){ # filnam<-paste("moth data removal number ", i, ".Rdata", sep="") # save(outlist,file=filnam) # } # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','presrem'))]) # } # } ##Calculate AUC for models with decreasing # pres<-which(mothnat==1) # presabs<-mothnatabs[which(mothnatabs>0)] # prbs<-1/2^(presabs-1) # repeats<-5 # presrem<-seq(0,4720,20) # outlist<-as.list(c(1:(length(presrem)*repeats))) # for(i in 1:(length(presrem))){ # for(j in 1:repeats){ # pr<-presrem[i] # print(paste(pr,"removed; repeat:",j)) # rem<-sample(pres, pr, prob=prbs) # mothred<-mothnat # mothred[rem]<-0 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # if(round(i/10)==i/10){ # filnam<-paste("moth data removal prop number ", i, ".Rdata", sep="") # save(outlist,file=filnam) # } # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','presrem','prbs'))]) # } # } # save(outlist, "moth data removal prop TOTAL.Rdata") outtab<-data.frame(removed=rep(NA,236),AUC=rep(NA,236)) for(i in 2:237){ outtab$removed[i-1]<-outlist[[i]][[1]] outtab$AUC[i-1]<-outlist[[i]][[3]] } outtab2<-data.frame(removed=rep(NA,length(outlist)),AUC=rep(NA,length(outlist))) for(i in 1:length(outlist)){ outtab2$removed[i]<-outlist[[i]][[1]] outtab2$AUC[i]<-outlist[[i]][[3]] } # ###now sequentially add misinformation (false records) # absenc<-which(mothnat==0) # repeats<-5 # absrem<-seq(1,4720,20) # outlist<-as.list(c(1:(length(absrem)*repeats))) # for(i in 1:(length(absrem))){ # for(j in 1:repeats){ # pr<-absrem[i] # print(paste(pr,"removed; repeat:",j)) # rem<-sample(absenc, pr) # mothred<-mothnat # mothred[rem]<-1 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # if(round(i/5)==i/5){ # filnam<-paste("moth data low nums misinfo ", i, ".Rdata", sep="") # save(outlist,file=filnam) # } # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','absrem'))]) # } # } # save(outlist,file="moth data low nums misinfo TOTAL.Rdata") ###get data and make plots load('moth removal entire 4 reps.Rdata') entire4rep<-outlist load('moth_data_removal_entire_one rep.Rdata') entire1rep<-outlist allremove<-c(entire4rep,entire1rep[1:240]) outtab2<-data.frame(removed=rep(NA,length(allremove)),AUC=rep(NA,length(allremove)), intercept=rep(NA,length(allremove)), minphylo=rep(NA,length(allremove)), nhost=rep(NA,length(allremove)), mpxnhost=rep(NA,length(allremove))) for(i in 1:length(allremove)){ if(length(allremove[[i]])>1){ outtab2$removed[i]<-allremove[[i]][[1]] outtab2$AUC[i]<-allremove[[i]][[3]] outtab2$intercept[i]<-allremove[[i]][[4]] outtab2$minphylo[i]<-allremove[[i]][[5]] outtab2$nhost[i]<-allremove[[i]][[6]] outtab2$mpxnhost[i]<-allremove[[i]][[7]] } } plot(outtab2$removed,outtab2$AUC) lo<-loess(AUC~removed, data=outtab2) plot(outtab2$removed,lo$fitted) ##now load and plot misinformation load('later misinfo complete.Rdata') rem_later<-outlist load('moth data misinfo number 100.Rdata') rem_misin100<-outlist[1:100] allmis<-c(rem_later,rem_misin100) outtab<-data.frame(removed=rep(NA,length(allmis)),AUC=rep(NA,length(allmis)), intercept=rep(NA,length(allmis)), minphylo=rep(NA,length(allmis)), nhost=rep(NA,length(allmis)), mpxnhost=rep(NA,length(allmis))) for(i in 1:length(allmis)){ if(length(allmis[[i]])>1){ outtab$removed[i]<-allmis[[i]][[1]] outtab$AUC[i]<-allmis[[i]][[3]] outtab$intercept[i]<-allmis[[i]][[4]] outtab$minphylo[i]<-allmis[[i]][[5]] outtab$nhost[i]<-allmis[[i]][[6]] outtab$mpxnhost[i]<-allmis[[i]][[7]] } } ###make table of misinfo low numbers load('moth data low nums misinfo TOTAL.Rdata') misinfolow<-outlist outtab4<-data.frame(removed=rep(NA,length(misinfolow)),AUC=rep(NA,length(misinfolow)), intercept=rep(NA,length(misinfolow)), minphylo=rep(NA,length(misinfolow)), nhost=rep(NA,length(misinfolow)), mpxnhost=rep(NA,length(misinfolow))) for(i in 1:length(misinfolow)){ if(length(misinfolow[[i]])>1){ outtab4$removed[i]<-misinfolow[[i]][[1]] outtab4$AUC[i]<-misinfolow[[i]][[3]] outtab4$intercept[i]<-misinfolow[[i]][[4]] outtab4$minphylo[i]<-misinfolow[[i]][[5]] outtab4$nhost[i]<-misinfolow[[i]][[6]] outtab4$mpxnhost[i]<-misinfolow[[i]][[7]] } } #write.csv('misinfo_low_nums.csv', row.names=F) ###arrange proportional removal of data load('moth data removal prop TOTAL.Rdata') remprop<-outlist outtab5<-data.frame(removed=rep(NA,length(remprop)),AUC=rep(NA,length(remprop)), intercept=rep(NA,length(remprop)), minphylo=rep(NA,length(remprop)), nhost=rep(NA,length(remprop)), mpxnhost=rep(NA,length(remprop))) for(i in 1:length(remprop)){ if(length(remprop[[i]])>1){ outtab5$removed[i]<-remprop[[i]][[1]] outtab5$AUC[i]<-remprop[[i]][[3]] outtab5$intercept[i]<-remprop[[i]][[4]] outtab5$minphylo[i]<-remprop[[i]][[5]] outtab5$nhost[i]<-remprop[[i]][[6]] outtab5$mpxnhost[i]<-remprop[[i]][[7]] } } write.csv(outtab5, 'remove prop.csv', row.names=F) ##plot decay of random removal versus proportional removal prop<-read.csv('remove prop.csv') removal<-read.csv('results removal with bmodgpred.csv') removal<-removal[which(is.na(removal$AUC)==F),] removal<-removal[order(removal$removed),] remlo<-loess(removal$AUC~removal$removed, span=0.25) rempred<-predict(remlo, se=T) proplo<-loess(prop$AUC~prop$removed, span=0.25) proppred<-predict(proplo, se=T) jpeg(filename = "Fig. proportional removal.jpg", width = 6, height = 6, units = "in", pointsize = 12, quality = 75, bg = "white", res = 600, restoreConsole = TRUE) plot(removal$removed,removal$AUC, type='n', ylim=c(0.5,1), xlim=c(0,4727),ylab='AUC - out of sample predictiveness', xlab='Percent (number) of records missing from food web', las=1, xaxt='n') points(removal$removed,removal$AUC, cex=0.3, col='darkgrey') points(prop$removed,prop$AUC,cex=0.3, col='lightblue') lines(removal$removed,rempred$fit, lwd=3, lty='solid',col='white') lines(removal$removed,rempred$fit, lwd=2, lty='solid') #polygon(c(removal$removed,rev(removal$removed)),c(rempred$fit-rempred$se.fit, rev(rempred$fit+rempred$se.fit)), col='lightgreen', border=NA) lines(prop$removed,proppred$fit,lwd=3, lty='dashed',col='white') lines(prop$removed,proppred$fit,lwd=2, lty='dashed',col='blue') axis(1, at=c(0,945,1891,2836,3782,4727), label=c('0%\n(0)','20%\n(945)','40%\n(1891)', '60%\n(2836)','80%\n(3782)','100%\n(4727)'), padj=0.5) segments(x0=750,x1=1000,y0=0.625,y1=0.625, lwd=2) segments(x0=750,x1=1000,y0=0.6,y1=0.6, lwd=2, lty='dashed',col='blue') legend(900,0.65,c('Removal random','Removal proportional to 1/encounter rate'), bty='n') dev.off() jpeg(filename = "Fig. removal versus misinfo.jpg", width = 6, height = 8, units = "in", pointsize = 12, quality = 75, bg = "white", res = 600, restoreConsole = TRUE) par(mfrow=c(2,1)) ###plot data removal line graph removal<-read.csv('results removal with bmodgpred.csv') r1<-removal[order(removal$removed),] r1<-r1[-1188,] rl<-loess(r1$AUC~r1$removed, span=0.25) rp<-predict(rl, se=T) plot(r1$removed,rp$fit, type='n', ylim=c(0.5,1), xlim=c(0,4727),ylab='AUC - out of sample predictiveness', xlab='Percent (number) of records removed from food web', las=1, xaxt='n') points(r1$removed,r1$AUC, col='darkgrey') points(prop$removed,prop$AUC,cex=0.3, col='lightblue') #polygon(c(r1$removed,rev(r1$removed)),c(rp$fit-rp$se.fit, rev(rp$fit+rp$se.fit)), col='lightgreen', border=NA) lines(r1$removed,rp$fit, lwd=4,col='white') lines(r1$removed,rp$fit, lwd=2) lines(prop$removed,proppred$fit,lwd=3, lty='dashed',col='white') lines(prop$removed,proppred$fit,lwd=2, lty='dashed',col='blue') axis(1, at=c(0,945,1891,2836,3782,4727), label=c('0%\n(0)','20%\n(945)','40%\n(1891)', '60%\n(2836)','80%\n(3782)','100%\n(4727)'), padj=0.5) text(x=0,y=1.1, "A", xpd=NA) ###plot misinformation line graph misinfo<-read.csv('results_misinformation_all.csv') ot1<-misinfo[order(misinfo$removed),] outpl<-loess(AUC~removed, data=ot1, span=0.5) outp<-predict(outpl, se=T) plot(ot1$removed,outp$fit, type='n', ylim=c(0.5,1), xlim=c(0,1901727),ylab='AUC - out of sample predictiveness', xlab='Percent (number) of false records added to food web',xaxt='n', las=1) points(ot1$removed,ot1$AUC, col='darkgrey') #polygon(c(ot1$removed,rev(ot1$removed)),c(outp$fit-outp$se.fit, rev(outp$fit+outp$se.fit)), col='lightgreen', border=NA) lines(ot1$removed,outp$fit, lwd=4,col='white') lines(ot1$removed,outp$fit, lwd=2) axis(1, at=c(0,380345.4,760690.8,1141036,1521382,1901707), label=c('0%\n(0)','20%\n(380345)','40%\n(760691)', '60%\n(1.1 E6)','80%\n(1.5 E6)','100%\n(1.9 E6)'),padj=0.5, cex=0.8) #text(x=1901707,y=.365,'100%\n(1901707)', xpd=NA) text(x=0,y=1.1, "B", xpd=NA) dev.off() ###make figure of missing records that has lines showing how the records were removed removal_gmod_bpred<-read.csv('results_goodmodel_badpred.csv') r2<-removal_gmod_bpred[order(removal_gmod_bpred$removed),] removal<-read.csv('results removal with bmodgpred.csv') r1<-removal[order(removal$removed),] r1<-r1[-1188,] r3<-r1 r3<-r3[-which(is.na(r3$AUCbmodgpred)),] rl.bmbp<-loess(r1$AUC~r1$removed, span=0.25) rp.bmbp<-predict(rl.bmbp, se=T) rl.bmgp<-loess(r3$AUCbmodgpred~r3$removed, span=0.25) rp.bmgp<-predict(rl.bmgp, se=T) rl.gmbp<-loess(r2$AUC~r2$removed, span=0.25) rp.gmbp<-predict(rl.gmbp, se=T) jpeg(filename = "Fig. removal model vs prediction.jpg", width = 8, height = 6, units = "in", pointsize = 12, quality = 75, bg = "white", res = 600, restoreConsole = TRUE) plot(r1$removed,r1$AUC, type='n', ylim=c(0.5,1), xlim=c(0,4727),ylab='AUC - out of sample predictiveness', xlab='Percent (number) of records missing from food web', las=1, xaxt='n') points(r1$removed,r1$AUC, cex=0.3) points(r3$removed,r3$AUCbmodgpred, cex=0.3,col='blue') points(r2$removed,r2$AUC, cex=0.3, col='orange') lines(r1$removed,rp.bmbp$fit, lwd=3, col='white') lines(r1$removed,rp.bmbp$fit, lwd=2) lines(r3$removed,rp.bmgp$fit,lwd=3,col='white') lines(r3$removed,rp.bmgp$fit,lwd=3,col='blue', lty="dashed") #lines(r2$removed,rp.gmbp$fit,lwd=3,col='white') lines(r2$removed,rp.gmbp$fit,lwd=2,col='orange') axis(1, at=c(0,945,1891,2836,3782,4727), label=c('0%\n(0)','20%\n(945)','40%\n(1891)', '60%\n(2836)','80%\n(3782)','100%\n(4727)'), padj=0.5) legend(900,0.65,c('removal from host-use model','removal from prediction','removal from both'), lty=c("dashed","solid","solid"), bty='n', col=c("blue","orange","black"), lwd=2) dev.off() ###make figure that shows the relative effect of missing versus erroneous records err<-read.csv('misinfo_low_nums.csv') removal<-read.csv('results removal with bmodgpred.csv') both<-merge(err,removal,by='removed') errvec<-tapply(err$AUC,err$removed, mean)[1:100] errvecse<-tapply(err$AUC,err$removed, se)[1:100] remvec<-tapply(removal$AUC,removal$removed, mean)[1:100] remvecse<-tapply(removal$AUC,removal$removed, se)[1:100] remo<-as.numeric(names(remvec)[1:100]) diffs<-remvec-errvec diffs<-as.numeric(diffs) diffsse<-sqrt(remvecse^2+errvecse^2) los<-loess(diffs~remo) preds<-predict(object=los,se=T) jpeg(filename = "Fig. missing erroneous biplot.jpg", width = 6, height = 6, units = "in", pointsize = 12, quality = 75, bg = "white", res = 600, restoreConsole = TRUE) par(mar=c(5,5,4,2)) plot(remo,diffs, type='n', ylab="",xlab="", las=1) polygon(c(remo,rev(remo)),c(preds$fit-preds$se.fit, rev(preds$fit+preds$se.fit)), col='lightgreen', border=NA) points(remo,diffs,col='darkgrey') lines(remo,preds$fit, lwd=2) mtext(2,text=expression(paste(Delta,'AUC (missing - erroneous)'), sep=""), line=3.5, cex=1) mtext(1, text='Number of records removed or erroneously added', line=3, cex=1) abline(0,0,lty='dashed', lwd=2) dev.off() ###Look at additivity of missing and erroneous records at set number of missing records = 2000 ##removal only # pres<-which(mothnat==1) # presabs<-mothnatabs[which(mothnatabs>0)] # prbs<-1/2^(presabs-1) # repeats<-20 # presrem<-2000 # outlist<-as.list(c(1:(length(presrem)*repeats))) # for(i in 1:(length(presrem))){ # for(j in 1:repeats){ # pr<-presrem[i] # print(paste(pr,"removed; repeat:",j)) # rem<-sample(pres, pr) # mothred<-mothnat # mothred[rem]<-0 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','presrem','prbs'))]) # } # } # removal_only<-rep(NA,20) # for(i in 1:20) removal_only[i]<-outlist[[i]][[3]] # write.csv(removal_only, file='removal_only_2000.csv', row.names=F) # ##both # pres<-which(mothnat==1) # abse<-which(mothnat==0) # #presabs<-mothnatabs[which(mothnatabs>0)] # #prbs<-1/2^(presabs-1) # repeats<-20 # presrem<-2000 # outlist<-as.list(c(1:(length(presrem)*repeats))) # for(i in 1:(length(presrem))){ # for(j in 1:repeats){ # pr<-presrem[i] # print(paste(pr,"removed; repeat:",j)) # rem<-sample(pres, pr) # ads<-sample(abse, pr) # mothred<-mothnat # mothred[rem]<-0 # mothred[ads]<-1 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','presrem','prbs', 'abse'))]) # } # } # removal_addition<-rep(NA,20) # for(i in 1:20) removal_addition[i]<-outlist[[i]][[3]] # write.csv(removal_addition, file='removal_addition_2000.csv', row.names=F) # ##adding erroneous # pres<-which(mothnat==1) # abse<-which(mothnat==0) # #presabs<-mothnatabs[which(mothnatabs>0)] # #prbs<-1/2^(presabs-1) # repeats<-20 # presrem<-2000 # outlist<-as.list(c(1:(length(presrem)*repeats))) # for(i in 1:(length(presrem))){ # for(j in 1:repeats){ # pr<-presrem[i] # print(paste(pr,"removed; repeat:",j)) # #rem<-sample(pres, pr) # ads<-sample(abse, pr) # mothred<-mothnat # #mothred[rem]<-0 # mothred[ads]<-1 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','presrem','prbs', 'abse'))]) # } # } # addition_only<-rep(NA,20) # for(i in 1:20) addition_only[i]<-outlist[[i]][[3]] # write.csv(addition_only, file='addition_only_2000.csv', row.names=F) ads<-read.csv('addition_only_2000.csv') rems<-read.csv('removal_only_2000.csv') both<-read.csv('removal_addition_2000.csv') BestModel<-makehostmodel(mothnat,trmat) BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothnat,tree=trmat) BEST<-0.9287197 mns<-c(mean(rems[,1]), mean(ads[,1]), mean(both[,1]), mean(ads[,1])-(BEST-mean(rems[,1]))) ses<-c(se(rems[,1]), se(ads[,1]), se(both[,1]), sqrt((se(ads[,1]))^2+(se(rems[,1]))^2)) sds<-c(sd(rems[,1]), sd(ads[,1]), sd(both[,1]), sqrt((sd(ads[,1]))^2+(sd(rems[,1]))^2)) bp<-barplot(mns) error.bar(bp,mns,ses) segments(x0=3.5, x1=4.3, y0=mns[4],y1=mns[4], lwd=3) jpeg(filename = "Fig. additivity plot.jpg", width = 6, height = 6, units = "in", pointsize = 12, quality = 75, bg = "white", res = 600, restoreConsole = TRUE) bp<-boxplot(t(mns), ylim=c(0.8,1), names=c("","","","")) text(y=0.78, x=1,'missing', xpd=NA, pos=1) text(y=0.78, x=2,'erroneous', xpd=NA, pos=1) text(y=0.78, x=3,'both', xpd=NA, pos=1) text(y=0.78, x=4,'both\n(predicted additive)', xpd=NA, pos=1) mtext(side=2,text='AUC (out of sample predictiveness) with 2000 missing records', line=2.5) error.bar(c(1,2,3,4),mns,sds) segments(x0=0,x1=4.75,y1=BEST,y0=BEST, lty='dashed', lwd=3) dev.off() ##Calculate AUC for models with decreasing # pres<-which(mothnat==1) # presabs<-mothnatabs[which(mothnatabs>0)] # prbs<-1/2^(presabs-1) # repeats<-5 # presrem<-seq(0,4720,20) # outlist<-as.list(c(1:(length(presrem)*repeats))) # for(i in 1:(length(presrem))){ # for(j in 1:repeats){ # pr<-presrem[i] # print(paste(pr,"removed; repeat:",j)) # rem<-sample(pres, pr, prob=prbs) # mothred<-mothnat # mothred[rem]<-0 # BestModel<-makehostmodel(mothred,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) # outlist[[(i-1)*repeats+j]]<-c(pr,j,BestPred[[1]],BestModel[[1]]) # if(round(i/10)==i/10){ # filnam<-paste("moth data removal prop number ", i, ".Rdata", sep="") # save(outlist,file=filnam) # } # rm(list=ls()[-which(ls()%in%c('mothnat','absenc','exint','i','j', # 'K_fold','makehostmodel','mothnat','outlist','predictNovelHosts','pres','se','trmat','repeats','presrem','prbs'))]) # } # } # save(outlist, "moth data removal prop TOTAL.Rdata") ####try to mess with the phylogeny to set prediction threshold on phylogeny # phyvec<-seq(from=0,to=0.4,by=0.01) # aucvec<-phyvec # for(i in 1:length(phyvec)){ # trmat_n<-trmat # trmat_n[which(trmat_n>phyvec[i]*max(trmat_n))]<-max(trmat_n) # BestModel<-makehostmodel(mothnat,trmat_n) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothnat,tree=trmat_n) # aucvec[i]<-BestPred[[1]] # } # dfr<-read.csv('phylo_limits.csv') # dfr$phylo_inv<-1-dfr$phylo_collapse # dfr$phylo_sc<-dfr$phylo_inv*420-420 # mod<-loess(dfr$AUC~dfr$phylo_sc, span=0.25) # BestModel<-makehostmodel(mothnat,trmat) # BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothnat,tree=trmat) # logdat<-cbind(BestPred[[3]], predictions=BestPred[[5]]@predictions[[1]]) predictHostsTail<-function(mod, dat){ modpred<-prediction(predictions=predict(mod, newdata=dat), labels=dat$real_int) modperf<-performance(modpred, measure="tpr", x.measure="fpr") #modperf_all<-modperf #plot(modperf) auc<-performance(modpred, measure="auc") acc<-performance(modpred, measure="f") auc<-auc@y.values[[1]] return(auc) } mothred<-mothnat BestModel<-makehostmodel(mothred,trmat) BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) logdat<-cbind(BestPred[[3]], predictions=BestPred[[5]]@predictions[[1]]) falsedat<-logdat[which(logdat$real_int==0),] realdat<-logdat[which(logdat$real_int==1),] k=12 phyloslice<-rep(0,k+1) for(j in 2:(length(phyloslice)-1)){ phyloslice[[j]]<-sort(realdat$min_phylo)[round(nrow(realdat)*(j-1)/k)] } phyloslice[length(phyloslice)]<-1 aucvec<-rep(NA, length(phyloslice)-1) pscor<-rep(NA, length(phyloslice)-1) for(ll in 1:(length(phyloslice)-1)){ print(ll) flush.console() dd<-logdat[which(logdat$min_phylo>phyloslice[ll] & logdat$min_phylo0)] prbs<-1/2^(presabs-1) repeats<-10 presrem<-c(1575,1575*2) outlist<-as.list(c(1:(length(presrem)*repeats))) for(i in 1:(length(presrem))){ for(j in 1:repeats){ pr<-presrem[i] print(paste(pr,"removed; repeat:",j)) rem<-sample(pres, pr) mothred<-mothnat mothred[rem]<-0 BestModel<-makehostmodel(mothred,trmat) BestPred<-predictNovelHosts(model=BestModel,novelWeb=exint,nativeWeb=mothred,tree=trmat) logdat<-cbind(BestPred[[3]], predictions=BestPred[[5]]@predictions[[1]]) aucvec<-rep(NA, length(phyloslice)-1) pscor<-rep(NA, length(phyloslice)-1) for(ll in 1:(length(phyloslice)-1)){ print(ll) flush.console() dd<-logdat[which(logdat$min_phylo>phyloslice[ll] & logdat$min_phylophyloslice[i] & logdat$min_phylosc),])/nrow(falsedat) #TPR<-nrow(realdat[which(realdat$predictions>sc),])/nrow(realdat) TP<-nrow(realdat[which(realdat$predictions>sc),]) FP<-nrow(falsedat[which(falsedat$predictions>sc),]) FN<-nrow(realdat)-TP TN<-nrow(falsedat)-FP #FSC<-2*TP/(2*TP+FP+FN) ov<-c(TP,FP,FN,TN) sigdet[i,]<-ov } ##Eupethecia virginuata on Solidago canadensis/gigantea score = -2.363075 xval<-nrow(falsedat[which(falsedat$predictions>-2.363075),])/nrow(falsedat) yval<-nrow(realdat[which(realdat$predictions>-2.363075),])/nrow(realdat) jpeg(filename = "Phylo_threshold.jpg", width = 6, height = 6, units = "in", pointsize = 12, quality = 75, bg = "white", res = 600, restoreConsole = TRUE) prd<-predict(mod, se=T) plot(dfr$phylo_sc, dfr$AUC, ylab="Predictiveness of host use model (AUC)", xlab="Age (MYA) at which older phylogenetic branches were collapsed", ylim= c(0.8112263, 0.93)) lines(dfr$phylo_sc, prd$fit, lwd=2, lty='dotted') #polygon(c(dfr$phylo_sc,rev(dfr$phylo_sc)),c(prd$fit-prd$se.fit, rev(prd$fit+prd$se.fit)), col='lightgreen', border=NA) arrows(x0=-14, x1=-14, y0=0.8670202, y1=0.8112263, length=0.1, lwd=2) dev.off() nodeD<-node.depth.edgelength(tr) nodeR<-diff(range(nodeD)/100) tipdist<-max(nodeD) nodelist<-which(nodeD>420 & nodeD