############################################################################################################## ### Supporting Information ### # Title: A new tale of lost tails: correlates of tail breakage in the worm lizard Amphisbaena vermicularis Wagler, 1824 # Authors: Jhonny JM Guedes1, Henrique C Costa2,3,4, and Mário R Moura5* # 1 Programa de Pós-Graduação em Ecologia e Evolução, Departamento de Ecologia, Campus Samambaia, Universidade Federal de Goiás, Avenida Esperança, s/n, 74690-900, Goiânia, GO, Brazil. # 2 Programa de Pós-Graduação em Zoologia, Departamento de Zoologia, Universidade Federal de Minas Gerais, Avenida Antônio Carlos 6627, 31270-901, Belo Horizonte, MG, Brazil. # 3 Departamento de Biologia Animal, Universidade Federal de Viçosa, Avenida P.H. Rolfs, s/n, 36570-900, Viçosa, MG, Brazil. # 4 Current address: Departamento de Zoologia, Instituto de Ciências Biológicas, Universidade Federal de Juiz de Fora. Rua José Lourenço Kelmer, s/n, 36036-900, São Pedro, Juiz de Fora, MG, Brazil. # 5 Departamento de Ciências Biológicas, Universidade Federal da Paraíba, 58397-000, Areia, PB, Brazil. # * Corresponding author: mariormoura@gmail.com ############################################################################################################## # STEPS IN THIS SCRIPT # 1. Load and understand the dataset. # 2. Make the distribution map (figure 1) presented in the paper. # 3. Get the frequency of tail-breakage in A. vermicularis (including all individuals analyzed). # 4. Prepare the predictor variables and perform basic test statistics. # 5. Perform a few exploratory analysis and create figure 2 presented in the paper. # 6. Perform a binomial GLM with backward variable selection based on the likelihood ratio test (LRT). # 7. Perform a Sensitivity Analysis to account for unbalancing in the response variable. # 8. Create the sensitivity plot (figure 3 presented in the paper). # Install and load needed packages: needed_packages<-c("plyr", "dplyr", "ggplot2", "ggpubr", "car", "data.table", "lmtest", "devtools", "tidyr") new.packages<-needed_packages[!(needed_packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) lapply(needed_packages, require, character.only = TRUE) # Other packages from github: install_github("kassambara/easyGgplot2", force = T) library("easyGgplot2") # Clean workspace: rm(list=ls()); gc() # Set the working directory: setwd() ##### # STEP 1 - Load and understand the data ############################################################################################################## # STEP 1 - Load and understand the data ## We have 12 columns in this dataset, each of which is explained below: data <- read.csv("raw_data.csv", h = T, stringsAsFactors = T, na.strings=c(""," ","NA")) # 1. SpecimenID: Composed by the museum acronym and the specimen number in the respective scientific collection where it's housed. # 2. State: One of the Brazilian Federative units. # 3. Municipality: self-explanatory. # 4. Locality: Name attributed the specific locality (if available) where the specimen was collected. # 5. Sex: indicate whether the specimen is male or female. # 6. LifeStage: Show if the specimen is an adult or juvenile. # 7. Urotomy: Represent the response variable and indicates the presence/absence of healed broken tail. # 8. SVL: Snout-vent length of each specimen. # 9. Latitude: Latitude of the collection site of each specimen # 10. Longitude: Longitude of the collection site of each specimen. # 11. Temp: Mean annual temperature. # 12. Prec: Precipitation of the wettest quarter. ##### # STEP 2 - Create the distribution map (figure 1) presented in the paper #################################################################################################### # STEP 2 - Create the distribution map (figure 1) presented in the paper # Install and load additional R packages needed to create the map: needed_packages<-c("rgdal", "sp", "raster", "maptools", "maps", "rgeos", "plyr", "dplyr", "ggplot2", "ggnewscale") new.packages<-needed_packages[!(needed_packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) lapply(needed_packages, require, character.only = TRUE) # Duplicate data selecting only variables need for plotting the map: names(data) site_descriptors <- data[ , c(1,7,10:9)] site_descriptors <- site_descriptors[order(site_descriptors$Urotomy, decreasing = F), ] # group so that less events (Urotomy) will be plotted on top site_descriptors$Urotomy <- as.factor(as.character(site_descriptors$Urotomy)) site_descriptors$Urotomy<-factor(site_descriptors$Urotomy, levels = c('1', '0'), labels = c("Urotomy", "Non-Urotomy")) site_descriptors <- site_descriptors[complete.cases(site_descriptors[, 3:4]), ] # exclude specimens without coordinate data coords<-site_descriptors[, c(3:4)] # dataframe with only longitude and latitude site_descriptors<-SpatialPointsDataFrame(coords=coords, data=site_descriptors, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")) # convert the object data into a SpatialPointsDataframe crs(site_descriptors) # Get or set the coordinate reference system (CRS) of a Raster* object site_descriptors@data$id<-rownames(site_descriptors@data) # convert into dataframe to use in ggplot2 site_descriptors_df <- data.frame(site_descriptors) # Read the shapefile; transform them into Albers Equal Area Conic projection; prepare them for ggplot2: # Download the SpatialPolygons in .rds format at . url <- "https://biogeo.ucdavis.edu/data/gadm3.6/Rsp/gadm36_BRA_1_sp.rds" # Specify URL where file is stored destfile <- paste0(getwd(), "/gadm36_BRA_1_sp.rds") # Specify destination where file should be saved download.file(url, destfile) # Apply download.file function in R terr_cover <- readRDS("gadm36_BRA_1_sp.rds") # Load the shapefile plot(terr_cover) crs(terr_cover) # Retrieve Coordinate Reference System From Object terr_cover@data$id<-rownames(terr_cover@data) terr_cover_df<-fortify(terr_cover, region="id") terr_cover_df<-plyr::join(terr_cover_df, terr_cover@data, by="id") # joining tables by "id" # Download the shapefile of Brazilian biomes: # Load additional package for unzipping files in R library("utils") # install if needed # Create a couple temp files temp <- tempfile(); temp2 <- tempfile() # Download the zip folder containing the shape files and save to 'temp' download.file("http://geoftp.ibge.gov.br/informacoes_ambientais/estudos_ambientais/biomas/vetores/Biomas_250mil.zip", temp) # Unzip the contents in 'temp' and save unzipped content in 'temp2' unzip(zipfile = temp, exdir = temp2) # Finds the filepath of the shapefile (.shp) file in the temp2 unzip folder # the $ at the end of ".shp$" ensures you are not also finding files such as .shp.xml biomes_IBGE<-list.files(temp2, pattern = ".shp$",full.names=TRUE) #read the shapefile. Alternatively make an assignment, such as f<-sf::read_sf(your_SHP_file) biomes <- readOGR(dsn = temp2, layer = "lm_bioma_250") biomes@data$Bioma <- revalue(biomes@data$Bioma, c("Amazônia"="Amazon", "Caatinga"="Caatinga", "Cerrado"="Cerrado", "Mata Atlântica"="Atlantic Forest", "Pampa"="Pampa", "Pantanal"="Pantanal")) crs(biomes) # Retrieve Coordinate Reference System From Object: crs(biomes)<-crs(terr_cover) biomes@data$id<-rownames(biomes@data) biomes_df<-fortify(biomes, region="id") biomes_df<-plyr::join(biomes_df, biomes@data, by="id") # joining tables by "id" # Define colors for each biome: levels(as.factor(biomes$Bioma)) biomeColors <- c("grey25", "grey95", "grey81", "grey39", "grey53", "grey67") names(biomeColors)<-levels(biomes$Bioma) # Plot the map: main_map <- ggplot(data=site_descriptors_df) + geom_polygon(data=biomes_df, aes(long, lat, group=group, fill=Bioma), colour=NA) + geom_polygon(data=terr_cover_df, aes(long, lat, group=group), colour="Black", fill=NA, size=0.25) + scale_fill_manual(name="", values=biomeColors) + coord_equal() # Add a new color scale for fill: main_map <- main_map + new_scale_fill() # Replot: main_map <- main_map + geom_point(aes(x=Longitude.1, y=Latitude.1, colour=Urotomy, shape=Urotomy, fill=Urotomy, size=Urotomy, alpha=Urotomy))+ scale_alpha_manual(name = "Urotomy", values=c(1, 0.5)) + # symbol size for geom_points scale_size_manual(name = "Urotomy", values=c(2, 3)) + # symbol size for geom_points scale_shape_manual(name = "Urotomy", values=c(21, 22)) + # symbol shape for geom_points scale_colour_manual(name = "Urotomy", values = c("gray20", "gray20")) + # outline color for geom_points scale_fill_manual(name = "Urotomy", values = c("darkorange1", "royalblue3")) + # fill color for geom_points theme(axis.line=element_blank(), axis.text=element_blank(), axis.ticks=element_blank(), axis.title=element_blank(), panel.grid.minor=element_blank(), panel.grid.major=element_blank(), legend.position=c(1, .5), legend.direction="vertical", legend.text=element_text(size=8), legend.key = element_blank(), legend.background=element_blank(), legend.title=element_blank(), panel.background=element_rect(fill = "transparent", colour = NA), plot.background=element_rect(fill="transparent", colour=NA)) # Run the script scaleBar and northArrow to create the functions to allow plotting such features on map: source("scaleBar_and_northArrow.R") Fig1 <- main_map + scale_bar(lon = -40, lat = -30, distance_lon = 500, distance_lat = 50, distance_legend = 150, dist_unit = "km", orientation = TRUE) # The plot above was later edited in the program Inkscape to adjust minor aesthetic details. # Save the plot: ggsave(paste0(getwd(), "/Fig1. Distribution map.pdf"), plot=Fig1, width=20, height=12, units="cm", dpi = "print") ggsave(paste0(getwd(), "/Fig1. Distribution map.png"), plot=Fig1, width=20, height=12, units="cm", dpi = "print") ##### # STEP 3 - Get the frequency of tail-breakage in A. vermicularis (including all individuals analyzed) ############################################################################################################## # STEP 3 - Get the frequency of tail-breakage in A. vermicularis (including all individuals analyzed) # Clean workspace: rm(list=setdiff(ls(), c("data"))) # Get the overall frequency of Urotomy: data %>% count(Urotomy) %>% mutate(freq = round(n / sum(n), 3)) #Urotomy n freq # 0 277 0.801 # 1 69 0.199 # We examined 346 specimens, of which 69 (19.9%) presented tail breakage (urotomy). ##### # STEP 4 - Prepare the predictor variables and perform basic test statistics ############################################################################################################## # STEP 4 - Prepare the predictor variables and perform basic test statistics # Remove species with missing data for variables needed during modeling: ### Life stage: summary(data$LifeStage) levels(data$LifeStage) data <- droplevels(data[!is.na(data$LifeStage), ]) # remove specimens with missing data # Remove juveniles - further analysis will be conducted with adults only. data <- droplevels(data[ !(data$LifeStage %in% "Juvenile"), ]) ### Sex: summary(data$Sex) levels(data$Sex) # Perform a chi-square test to investigate if there is significant differences in the frequency of Urotomy between sexes: chisq.test(data$Urotomy, data$Sex, correct=FALSE) # Result: X-squared = 0.19382, df = 1, p-value = 0.6598 # There is no sexual differences in the frequency of Urotomy between adult males and females. # Therefore, we can pool them in further analysis. # Check the proportion of Urotomy by sex: data[!is.na(data$Sex), ] %>% count(Urotomy, Sex) # proportion of urotomised males 30/144 # 20.8% # proportion of urotomised females 25/108 # 23.1% ### Body Size (snout-vent length): data <- droplevels(data[!is.na(data$SVL), ]) # remove spp with missing data # Check if there is sexual differences in body size between adults (data used to support hypothesis 1 in main text): tapply(data$SVL, data$Sex, summary, na.rm=T) # get summary statistics # Before performing the t-test, check its assumptions (normality and homogeneity): # Normality test (Shapiro-Wilks; H0 = data follows normal distribution) do.call("rbind", with(data, tapply(SVL, Sex, function(x) unlist(shapiro.test(x)[c("statistic", "p.value")])))) # statistic.W p.value # Female 0.965073 0.002 # Male 0.962306 0.0006 # Result: data is not normally distributed, therefore perform a non-parametric test (Mann-Whitney U test). # Mann-Whitney U non-parametric test (H0 = there is no difference between treatments) wilcox.test(SVL ~ Sex, data) # W = 8054, p-value = 0.5388 # Result: There is NO sexual difference in body size (SVL) among males and females (as mentioned in hypothesis 1). ### Latitude: data <- droplevels(data[!is.na(data$Latitude), ] ) # remove spp with missing data # Check the proportion of Urotomy:non-Urotomy in adult specimens used in the analysis: data %>% count(Urotomy) %>% mutate(freq = round(n / sum(n), 3)) #Urotomy n freq # 0 232 0.779 # 1 66 0.221 # Of 298 specimens used to model urotomy probability, 66 (22.1%) had urotomysed tails. ##### # STEP 5 - Check the skewness and kurtosis of the continuous predictor variables ############################################################################################################## # STEP 5 - Check the skewness and kurtosis of the continuous predictor variables # Check the overall frequency distributions of the predictors: library(gridExtra) grid.arrange(ggplot(data, aes(SVL)) + geom_histogram(na.rm = T, colour="black", fill="grey50")+ theme_classic(), ggplot(data, aes(Latitude)) + geom_histogram(na.rm = T, colour="black", fill="grey50")+ theme_classic(), ggplot(data, aes(Temp)) + geom_histogram(na.rm = T, colour="black", fill="grey50")+ theme_classic(), ggplot(data, aes(Prec)) + geom_histogram(na.rm = T, colour="black", fill="grey50")+ theme_classic(), nrow = 2, ncol = 2) # Check for skewed distributions and kurtosis (transform data if necessary): # The values for asymmetry and kurtosis between -2 and +2 are considered acceptable # to prove normal univariate distribution (George & Mallery, 2010) . e1071::skewness(data$SVL); e1071::kurtosis(data$SVL) # 0.809277 and 1.063876 e1071::skewness(data$Latitude); e1071::kurtosis(data$Latitude) # -0.2913203 and -1.065681 e1071::skewness(data$Temp); e1071::kurtosis(data$Temp) # -1.080174 and 1.150186 e1071::skewness(data$Prec); e1071::kurtosis(data$Prec) # -0.5219596 and -0.3075255 # Conclusion: no need to transform continuous predictors. ##### # STEP 6 - Perform a binomial GLM with backward variable selection based on the likelihood ratio test (LRT) ############################################################################################################## # STEP 6 - Perform a binomial GLM with backward variable selection based on the likelihood ratio test (LRT) # Standardize the predictor variables to make them comparable (mean = 0, SD = 1): mydata <- data # duplicate the main dataset names(mydata) mydata$SVL <- scale(mydata$SVL, center=T, scale=T) mydata$Temp <- scale(mydata$Temp, center=T, scale=T) mydata$Prec <- scale(mydata$Prec, center=T, scale=T) # Check the Pearson correlation among predictors: names(mydata) corr_matrix<-cor(mydata[ , c(8,11:12)], method="pearson") #corr_matrix<-cor(mydata[ , c(10,18,26:27)], method="pearson") corr_matrix[corr_matrix == 1] <- 0 # replace correlation values of '1' (autocorrelation) by 'NA' range(corr_matrix); rm(corr_matrix) # Conclusion: continuous predictors show low correlation # Check for multicollinearity (remove those with values higher than 10): usdm::vif(mydata[ , c(8,11:12)]) # variation inflation factor # Conclusion: continuous predictors show low multicollinearity (keep them all) # Specify the full model including all predictor variables: full.model <- glm(Urotomy ~ SVL + Temp + Prec, data = mydata, family = 'binomial') predictor_names<-c("SVL", "Temp", "Prec") for(i in 1:3){ # i number of predictors # Define the model being tested: if(i==1){my_model<-full.model} # model used to start the backward selection if(i>=2){my_model<-selected.model} # the selected model is defined at the end of each iteration # Perform the LRT using the full model: LRT_output<-drop1(my_model, test="Chisq") # Get the predictor holding the lowest LRT row_selected<-which(LRT_output$LRT==min(LRT_output$LRT, na.rm=T)) # Check if the p-value of the preditor selected above is non-signficant (needed to proceed): if(LRT_output[row_selected,5]>0.05){ # 5 refers to the column holding the p-value # Get the predictor name: predictor_to_remove<-row.names(LRT_output[row_selected,]) # Discard the predictor of the model predictors_to_keep<-predictor_names[!predictor_names %in% predictor_to_remove] # Create a new model formula and rebuild the object including the names of the remaining predictors: model_formula<-as.formula(paste("Urotomy ~", paste(predictors_to_keep, collapse= "+"))) predictor_names<-predictors_to_keep # Build the new model selected.model<-glm(model_formula, data = mydata, family = 'binomial') } # Check if the p-value of the preditor selected is signficant (needed to finish the variable selection): if(LRT_output[row_selected,5]<=0.05){ selected.model<-full.model break } } # Extract model coefficients and p-values: round(summary(selected.model)[[12]], 4) # Estimate Std. Error z value Pr(>|z|) # (Intercept) -1.3659 0.1528 -8.9421 0.0000 *** # SVL 0.5277 0.1457 3.6206 0.0003 *** # Temp 0.3866 0.1720 2.2475 0.0246 * # Prec -0.3228 0.1476 -2.1866 0.0288 * drop1(selected.model, test="Chisq") # Df Deviance AIC LRT Pr(>Chi) # 295.77 303.77 # SVL 1 309.48 315.48 13.7087 0.0002135 *** # Temp 1 301.44 307.44 5.6694 0.0172632 * # Prec 1 300.57 306.57 4.8006 0.0284491 * ##### # STEP 7 - Perform a Sensitivity Analysis to account for unbalancing in the response variable ############################################################################################################## # STEP 7 - Perform a Sensitivity Analysis to account for unbalancing in the response variable # Clean workspace: rm(list=setdiff(ls(), c("data"))) # Sensitivity analysis is needed due to unbalanced levels of the response variable, therefore: # 1st) analysis with the full dataset (see step 6 above) # 2nd) analysis based on subdatasets with proportions of 70:30 (2.3:1), 60:40 (1.5:1), 50-50 (1:1) for events:non-events. # Function to get subsets in different proportions of events and non-events: get.subsets <- function(data, var, prop){ if(prop == 5.5){ df1 <- data[sample(which(var=='0'), round(1*length(which(var=='1'))), replace=F), ] df2 <- data[which(var=='1'),] df_new <- rbind(df1, df2) } else if(prop == 6.4){ df1 <- data[sample(which(var=='0'), round(1.5*length(which(var=='1'))), replace=F), ] df2 <- data[which(var=='1'),] df_new <- rbind(df1, df2) } else if(prop == 7.3){ df1 <- data[sample(which(var=='0'), round(2.3*length(which(var=='1'))), replace=F), ] df2 <- data[which(var=='1'),] df_new <- rbind(df1, df2) } } # Object to guide the for loop and the information stored across iterations: j_prop<-c("50_50", "60_40", "70_30") # Specify the full model including all predictor variables: my_outputs<-as.data.frame(matrix(nrow=1, ncol=14)) names(my_outputs)<-c("Prop", "Iter", "SVL", "Temp", "Prec", "SVLStdErr", "TempStdErr", "PrecStdErr", "SVL_LRT", "Temp_LRT", "Prec_LRT", "SVL_p", "Temp_p", "Prec_p") # Perform the sensitivity analysis (NOTE: it takes about 15 minutes to run): for(j in 1:length(j_prop)){ # j levels of proportion (events vs non-events) my_outputs_subset<-as.data.frame(matrix(nrow=10000, ncol=14)) names(my_outputs_subset)<-c("Prop", "Iter", "SVL", "Temp", "Prec", "SVLStdErr", "TempStdErr", "PrecStdErr", "SVL_LRT", "Temp_LRT", "Prec_LRT", "SVL_p", "Temp_p", "Prec_p") my_outputs_subset[,1]<-j_prop[j] # store the proportion o events/non-events under test for(k in 1:nrow(my_outputs_subset)){ # k iterations across the sensitivity analysis set.seed(k) # to make it reproducible # Get subsets with different proportions if(j==1){mydata <- get.subsets(data, data$Urotomy, 5.5); table(mydata$Urotomy)} if(j==2){mydata <- get.subsets(data, data$Urotomy, 6.4); table(mydata$Urotomy)} if(j==3){mydata <- get.subsets(data, data$Urotomy, 7.3); table(mydata$Urotomy)} my_outputs_subset[k,2]<-k # store the iteration number # Standardize the predictor variables to make them comparable: mydata$SVL<-scale(mydata$SVL, center=T, scale=T) mydata$Temp<-scale(mydata$Temp, center=T, scale=T) mydata$Prec<-scale(mydata$Prec, center=T, scale=T) # Reset the model being tested at the beginning of each iteration full.model <- glm(Urotomy ~ SVL + Temp + Prec, data = mydata, family = 'binomial') predictor_names<-c("SVL", "Temp", "Prec") # Perform the backward selection via LRT: for(i in 1:3){ # i is the number of predictors # Define the model being tested: if(i==1){my_model<-full.model} # model used to start the backward selection if(i>=2){my_model<-selected.model} # the selected model is defined at the end of each iteration # Perform the LRT using the full model: LRT_output<-drop1(my_model, test="Chisq") GLM_Estimate<-summary(my_model) GLM_Estimate<-as.data.frame(summary(my_model)[[which(names(GLM_Estimate)=="coefficients")]]) # get the right table on std. coefs # Get the predictor holding the lowest LRT row_selected<-which(LRT_output$LRT==min(LRT_output$LRT, na.rm=T)) # Check if the p-value of the predictor selected above is non-significant (needed to proceed): if(LRT_output[row_selected,5]>0.05 & i<=2){ # Get the predictor name: predictor_to_remove<-row.names(LRT_output[row_selected,]) # Discard the predictor of the model predictors_to_keep<-predictor_names[!predictor_names %in% predictor_to_remove] # Create a new model formula and rebuild the object including the names of the remaining predictors: model_formula<-as.formula(paste("Urotomy ~", paste(predictors_to_keep, collapse= "+"))) predictor_names<-predictors_to_keep # Build the new model selected.model<-glm(model_formula, data = mydata, family = 'binomial') # Store the LRT of the predictor before its removal from the model: if (length(LRT_output[which(row.names(LRT_output)=="SVL"),1])==1 & predictor_to_remove=="SVL"){ my_outputs_subset[k,3]<-GLM_Estimate[(row.names(GLM_Estimate)=="SVL"),1] # coefficient my_outputs_subset[k,6]<-GLM_Estimate[(row.names(GLM_Estimate)=="SVL"),2] # StdError my_outputs_subset[k,9]<-LRT_output[(row.names(LRT_output)=="SVL"),4] # LRT my_outputs_subset[k,12]<-LRT_output[(row.names(LRT_output)=="SVL"),5] # p-value } if (length(LRT_output[which(row.names(LRT_output)=="Temp"),1])==1 & predictor_to_remove=="Temp"){ my_outputs_subset[k,4]<-GLM_Estimate[(row.names(GLM_Estimate)=="Temp"),1] # coefficient my_outputs_subset[k,7]<-GLM_Estimate[(row.names(GLM_Estimate)=="Temp"),2] # StdError my_outputs_subset[k,10]<-LRT_output[(row.names(LRT_output)=="Temp"),4] # LRT my_outputs_subset[k,13]<-LRT_output[(row.names(LRT_output)=="Temp"),5] # p-value } if (length(LRT_output[which(row.names(LRT_output)=="Prec"),1])==1 & predictor_to_remove=="Prec"){ my_outputs_subset[k,5]<-GLM_Estimate[(row.names(GLM_Estimate)=="Prec"),1] # coefficient my_outputs_subset[k,8]<-GLM_Estimate[(row.names(GLM_Estimate)=="Prec"),2] # StdError my_outputs_subset[k,11]<-LRT_output[(row.names(LRT_output)=="Prec"),4] # LRT my_outputs_subset[k,14]<-LRT_output[(row.names(LRT_output)=="Prec"),5] # p-value } } # some predictors are important if(LRT_output[row_selected,5]<=0.05 & i==1){ selected.model<-full.model # no predictor can be discarded } # all predictors are important if(LRT_output[row_selected,5]>0.05 & i==3){ # Get the predictor name: predictor_to_remove<-row.names(LRT_output[row_selected,]) # Store the LRT of the predictor before its removal from the model: if (length(LRT_output[which(row.names(LRT_output)=="SVL"),1])==1 & predictor_to_remove=="SVL"){ my_outputs_subset[k,3]<-GLM_Estimate[(row.names(GLM_Estimate)=="SVL"),1] my_outputs_subset[k,6]<-GLM_Estimate[(row.names(GLM_Estimate)=="SVL"),2] my_outputs_subset[k,9]<-LRT_output[(row.names(LRT_output)=="SVL"),4] # LRT my_outputs_subset[k,12]<-LRT_output[(row.names(LRT_output)=="SVL"),5] # p-value } if (length(LRT_output[which(row.names(LRT_output)=="Temp"),1])==1 & predictor_to_remove=="Temp"){ my_outputs_subset[k,4]<-GLM_Estimate[(row.names(GLM_Estimate)=="Temp"),1] my_outputs_subset[k,7]<-GLM_Estimate[(row.names(GLM_Estimate)=="Temp"),2] my_outputs_subset[k,10]<-LRT_output[(row.names(LRT_output)=="Temp"),4] # LRT my_outputs_subset[k,13]<-LRT_output[(row.names(LRT_output)=="Temp"),5] # p-value } if (length(LRT_output[which(row.names(LRT_output)=="Prec"),1])==1 & predictor_to_remove=="Prec"){ my_outputs_subset[k,5]<-GLM_Estimate[(row.names(GLM_Estimate)=="Prec"),1] my_outputs_subset[k,8]<-GLM_Estimate[(row.names(GLM_Estimate)=="Prec"),2] my_outputs_subset[k,11]<-LRT_output[(row.names(LRT_output)=="Prec"),4] # LRT my_outputs_subset[k,14]<-LRT_output[(row.names(LRT_output)=="Prec"),5] # p-value } # Build the new model selected.model<-glm(Urotomy ~ 1, data = mydata, family = 'binomial') } # no predictor is important } # end of i for loop # Store the outputs of the backward selection LRT_output<-as.data.frame(drop1(selected.model, test="Chisq")) GLM_Estimate<-summary(selected.model) GLM_Estimate<-as.data.frame(summary(selected.model)[[which(names(GLM_Estimate)=="coefficients")]]) # get the right table on std. coefs # Store the coefficient if included in the final model selected: if (length(GLM_Estimate[which(row.names(GLM_Estimate)=="SVL"),1])==1){ my_outputs_subset[k,3]<-GLM_Estimate[(row.names(GLM_Estimate)=="SVL"),1] my_outputs_subset[k,6]<-GLM_Estimate[(row.names(GLM_Estimate)=="SVL"),2] my_outputs_subset[k,9]<-LRT_output[(row.names(LRT_output)=="SVL"),4] my_outputs_subset[k,12]<-LRT_output[(row.names(LRT_output)=="SVL"),5] } if (length(GLM_Estimate[which(row.names(GLM_Estimate)=="Temp"),1])==1){ my_outputs_subset[k,4]<-GLM_Estimate[(row.names(GLM_Estimate)=="Temp"),1] my_outputs_subset[k,7]<-GLM_Estimate[(row.names(GLM_Estimate)=="Temp"),2] my_outputs_subset[k,10]<-LRT_output[(row.names(LRT_output)=="Temp"),4] my_outputs_subset[k,13]<-LRT_output[(row.names(LRT_output)=="Temp"),5] } if (length(GLM_Estimate[which(row.names(GLM_Estimate)=="Prec"),1])==1){ my_outputs_subset[k,5]<-GLM_Estimate[(row.names(GLM_Estimate)=="Prec"),1] my_outputs_subset[k,8]<-GLM_Estimate[(row.names(GLM_Estimate)=="Prec"),2] my_outputs_subset[k,11]<-LRT_output[(row.names(LRT_output)=="Prec"),4] my_outputs_subset[k,14]<-LRT_output[(row.names(LRT_output)=="Prec"),5] } rm(LRT_output, GLM_Estimate) print(k) } # end of k for loop my_outputs<-rbind(my_outputs, my_outputs_subset) rm(my_outputs_subset, mydata) } # end of j for loop # Remove unnecessary row value: my_outputs<-my_outputs[-1,] # Plot histograms to inspect the distribution of model coefficients and std. error values: Multipanel_plot<-gridExtra::grid.arrange( ggplot2.histogram(data=my_outputs, xName='SVL', groupName='Prop', addMeanLine=TRUE, meanLineSize=0.5, groupColors=c('red','gray','blue'), alpha=0.5, addDensityCurve = TRUE, removePanelGrid=TRUE, removePanelBorder=TRUE, axisLine=c(0.25, "solid", "black"), showLegend=TRUE, bins=50, backgroundColor="white") + labs(x="Body Size - Std.Coef", y="Density") + theme_classic() + theme(legend.position="none"), ggplot2.histogram(data=my_outputs, xName='SVLStdErr', groupName='Prop', addMeanLine=TRUE, meanLineSize=0.5, groupColors=c('red','gray','blue'), alpha=0.5, addDensityCurve = TRUE, removePanelGrid=TRUE, removePanelBorder=TRUE, axisLine=c(0.25, "solid", "black"), showLegend=TRUE, bins=50, backgroundColor="white") + labs(x="Body Size - Std.Error", y="Density") + theme_classic() + theme(legend.position="none"), ggplot2.histogram(data=my_outputs, xName='Temp', groupName='Prop', addMeanLine=TRUE, meanLineSize=0.5, groupColors=c('red','gray','blue'), alpha=0.5, addDensityCurve = TRUE, removePanelGrid=TRUE, removePanelBorder=TRUE, axisLine=c(0.25, "solid", "black"), showLegend=TRUE, bins=50, backgroundColor="white") + labs(x="Temperature - Std.Coef", y="Density") + theme_classic() + theme(legend.position="none"), ggplot2.histogram(data=my_outputs, xName='TempStdErr', groupName='Prop', addMeanLine=TRUE, meanLineSize=0.5, groupColors=c('red','gray','blue'), alpha=0.5, addDensityCurve = TRUE, removePanelGrid=TRUE, removePanelBorder=TRUE, axisLine=c(0.25, "solid", "black"), showLegend=TRUE, bins=50, backgroundColor="white") + labs(x="Temperature - Std.Error", y="Density") + theme_classic() + theme(legend.position="none"), ggplot2.histogram(data=my_outputs, xName='Prec', groupName='Prop', addMeanLine=TRUE, meanLineSize=0.5, groupColors=c('red','gray','blue'), alpha=0.5, addDensityCurve = TRUE, removePanelGrid=TRUE, removePanelBorder=TRUE, axisLine=c(0.25, "solid", "black"), showLegend=TRUE, bins=50, backgroundColor="white") + labs(x="Precipitation - Std.Coef", y="Density") + theme_classic() + theme(legend.position="none"), ggplot2.histogram(data=my_outputs, xName='PrecStdErr', groupName='Prop', addMeanLine=TRUE, meanLineSize=0.5, groupColors=c('red','gray','blue'), alpha=0.5, addDensityCurve = TRUE, removePanelGrid=TRUE, removePanelBorder=TRUE, axisLine=c(0.25, "solid", "black"), showLegend=TRUE, bins=50, backgroundColor="white") + labs(x="Precipitation - Std.Error", y="Density") + theme_classic() + theme(legend.position="none"), nrow = 3, ncol = 2) # Export the figure: ggsave(paste0(getwd(), "/Sensitivity_Histograms.pdf"), plot=Multipanel_plot, width=5, height=5.5, units="in") ggsave(paste0(getwd(), "/Sensitivity_Histograms.png"), plot=Multipanel_plot, width=5, height=5.5, units="in") # Convert output from 'wide' to 'long' format: head(my_outputs) my_outputs_long1 <- tidyr::gather(my_outputs, var, estimate, SVL:Prec, factor_key=TRUE) my_outputs_long1 <- my_outputs_long1[ , c(1:2,12:13)] my_outputs_long2 <- tidyr::gather(my_outputs, var2, StdErr, SVLStdErr:PrecStdErr, factor_key=TRUE) my_outputs_long2 <- my_outputs_long2[ , c(1,13)] my_outputs_long3 <- tidyr::gather(my_outputs, var3, LRT, SVL_LRT:Prec_LRT, factor_key=TRUE) my_outputs_long3 <- my_outputs_long3[ , c(1,13)] my_outputs_long4 <- tidyr::gather(my_outputs, var4, p.value, SVL_p:Prec_p, factor_key=TRUE) my_outputs_long4 <- my_outputs_long4[ , c(1,13)] # Now, merge the new datasets in a single long format dataframe: my_outputs_long <- cbind(my_outputs_long1,my_outputs_long2,my_outputs_long3,my_outputs_long4) # remove unecessary (duplicated) rows and dataframes: rm(my_outputs_long1,my_outputs_long2,my_outputs_long3,my_outputs_long4) names(my_outputs_long) my_outputs_long <- my_outputs_long[ , -c(5,7,9)] # Round values to 4 decimal places: my_outputs_long[,c(4:7)] <- round(my_outputs_long[,c(4:7)], 4) # Get the average across the output: avg_output<-as.data.frame(my_outputs_long %>% group_by(Prop, var) %>% summarise( # Get overall estimates AvgEstimate=mean(estimate, na.rm=T), AvgStdErr=mean(StdErr, na.rm=T), AvgLRT=mean(LRT, na.rm=T), P.value=round(pchisq(AvgLRT, df=1, lower.tail=FALSE), 4))) %>% # Get the upper and lower CI based on the avg. coefficients and std. errors mutate(upper95=round((AvgEstimate+(1.96*AvgStdErr)), 4), lower95=round((AvgEstimate-(1.96*AvgStdErr)), 4)) # Check the results: avg_output # Prop var AvgEstimate AvgStdErr AvgLRT P.value upper95 lower95 # 50_50 SVL 0.4975235 0.1994288 7.196932 0.0073 0.8884 0.1066 ** # 50_50 Temp 0.3677853 0.1944130 4.165772 0.0412 0.7488 -0.0133 ns # 50_50 Prec -0.3020743 0.1897758 2.877331 0.0898 0.0699 -0.6740 ns # 60_40 SVL 0.4945434 0.1755163 8.868251 0.0029 0.8386 0.1505 ** # 60_40 Temp 0.3665870 0.1823675 4.588671 0.0322 0.7240 0.0091 * # 60_40 Prec -0.3091892 0.1721261 3.486957 0.0619 0.0282 -0.6466 ns # 70_30 SVL 0.5007831 0.1571453 10.944973 0.0009 0.8088 0.1928 *** # 70_30 Temp 0.3703394 0.1743036 5.066851 0.0244 0.7120 0.0287 * # 70_30 Prec -0.3153263 0.1579367 4.135705 0.0420 -0.0058 -0.6249 * ##### # STEP 8 - Create the sensitivity plot (figure 2 presented in the paper) ############################################################################################################## # STEP 8 - Create the sensitivity plot (figure 2 presented in the paper) # Clean workspace rm(list=setdiff(ls(), c("data", "avg_output", "my_outputs"))) # Reorder levels to plot and add labels: levels(avg_output$var) avg_output$var <- factor(avg_output$var, levels=c("SVL","Temp","Prec"), labels=c("Body size","Temperature","Precipitation")) # Reorder levels to plot and add labels: avg_output$Prop <- factor(avg_output$Prop, levels=c('50_50','60_40','70_30'), labels=c('50:50','60:40','70:30')) # Define colors to be used in the plot myColors <- c("red", "gray30" ,"blue") names(myColors)<-levels(avg_output$Prop) # Prepare for plotting (Tree_plot <- ggplot(avg_output, aes(x = Prop, y = AvgEstimate, ymin = lower95, ymax = upper95))+ geom_pointrange(aes(col = Prop, shape = Prop), size = 0.4)+ scale_colour_manual(name = "Prop", values = myColors)+ scale_shape_manual(values=c(0,1,2,4,5,6,7,8,9,10))+ geom_errorbar(aes(ymin=lower95, ymax=upper95, col = Prop), width=0.1)+ geom_hline(yintercept =0, linetype=2)+ labs(y="Average Std. Coefficient (CI95%)", x=NULL)+ scale_x_discrete(limits = rev(levels(avg_output$Prop)))+ facet_wrap(~var, strip.position="left", nrow=9)+ theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_blank(), axis.title = element_text(size=12, face="bold"), axis.text = element_text(size=10), axis.text.y = element_blank(), axis.line = element_line(colour="black"), plot.background=element_rect(fill = "white"), strip.background = element_blank(), strip.placement = "outside", legend.key = element_blank(), legend.title = element_blank(), legend.text=element_text(size=8), legend.justification = "center", legend.background = element_rect(fill = NA), legend.direction = "vertical", legend.position=c(.875, .15))+ coord_flip()) # Export the figure: ggsave(paste0(getwd(), "/Fig.2 - Sensitivity Plot.pdf"), plot=Tree_plot, width=3, height=4, units="in", dpi = "print") ggsave(paste0(getwd(), "/Fig.2 - Sensitivity Plot.png"), plot=Tree_plot, width=3, height=4, units="in", dpi = "print") ##### # ----------- End ----------- #