# The following set of R commands illustrates the process by which SNPs were correlated to climate and pollution levels. Examples are # # given for each type of analysis and need to be modified by the user to conduct a full analysis. As such these functions are # # distributed without warranty. # # # # Author: Andrew J. Eckert, Virginia Commonwealth University (aeckert2@vcu.edu) # # Corresponding Author: Om Rajora (Om.Rajora@unb.ca) # # # # Date: 01/25/2013 # # Notes: The genetic and climate data are given in the Dryad data submission for Bashalkhanov et al. # # # ########################################################################################################################################## ################# BEGIN ################################################################################################################## ### These libraries are helpful to have installed prior to trying to use any of these examples. library(hierfstat) ################ BEGIN HIERFSTAT EXAMPLE ################################################################################################## ### Please note that the genetic data released with the Bashalkhanov et al. manuscript need to be reformatted prior to executing these commands. The appropriate format follows FSTAT. ### Creation of factors to with which to classify sampled trees species <- rep("rs",960) population <- data_mod[,1] cohort <- data_mod[,2] pollution <- rep("no",960) ### Hierarchical structure with pollution class, populations and cohorts. Note that the black spruce populations are removed prior to the analysis (pop id = "nl"). levels_02 <- data.frame(as.factor(pollution), as.factor(population), as.factor(cohort)) nlrem <- which(population == "nl") colnames(levels_02) <- c("Pollution","Population","Cohort") global_varcomps_SSRs_poll <- varcomp.glob(levels_02[-nlrem,],SSR_data_hier[-nlrem,],diploid=T) boot_vc_SSRs_poll <- boot.vc(levels_02[-nlrem,],SSR_data_hier[-nlrem,],diploid=T,nboot=10000) global_varcomps_SNPs_poll <- varcomp.glob(levels_02[-nlrem,],SNP_data_hier[-nlrem,],diploid=T) boot_vc_SNPs_poll <- boot.vc(levels_02[-nlrem,],SNP_data_hier[-nlrem,],diploid=T,nboot=10000) ################ END HIERFSTAT EXAMPLE ################################################################################################## ################ BEGIN LOGISTIC REGRESSION EXAMPLE ###################################################################################### ### The example below gives the procedure across all sampled trees. As discussed in Bashalkhanov et al. Similar analyses were conducted by cohort. ### This creates the factor used to classify populations as polluted (=1) or unpolluted (=0) poll <- c(rep(1,180*3),rep(0,180*2)) ### These two objects contain data. You would need to create these. snp_coded: contains counts of the minor allele per genotype (0,1,2), dat6CLUMPP: contains the q-values from K= 6 in the STRUCTURE analysis (see Dryad data submission for these). snp_coded <- SNP_coded[-c(1:60),] dat6CLUMPP[-c(901:960),] -> corrs_struct ### The following two items are created to store output from the model fitting. AIC_struct <- matrix(nrow=33,ncol=3) pvals_struct <- matrix(nrow=33,ncol=3) ### A for loop was used to carry out for(j in 1:33) { bin <- matrix(nrow=900, ncol=2) for(i in 1:900) { if(is.na(snp_coded[i,j]) == "FALSE") { if(snp_coded[i,j] == 1) {bin[i,1]<-1} else { if(snp_coded[i,j] == 2) {bin[i,1] <- 2} else { bin[i,1] <- 0 } }}} for(i in 1:900) { if(is.na(bin[i,1]) == "FALSE") { if(bin[i,1] == 0) {bin[i,2]<-2} else { if(bin[i,1] == 1) {bin[i,2]<-1} else { bin[i,2] <- 0 } } }} m1<-glm(bin ~ corrs_struct[,4]+corrs_struct[,5]+corrs_struct[,6]+corrs_struct[,7]+corrs_struct[,8], family=binomial) m2<-glm(bin ~ corrs_struct[,4]+corrs_struct[,5]+corrs_struct[,6]+corrs_struct[,7]+corrs_struct[,8] + poll, family=binomial) m3 <- glm(bin ~ corrs_struct[,4]+corrs_struct[,5]+corrs_struct[,6]+corrs_struct[,7]+corrs_struct[,8] + poll + clim_pc, family=binomial) AIC_struct[j,1] <- m1$aic AIC_struct[j,2] <- m2$aic AIC_struct[j,3] <- m3$aic pvals_struct[j,1] <- anova(m1,m2,test="Chisq")[2,5] pvals_struct[j,2] <- anova(m1,m3,test="Chisq")[2,5] pvals_struct[j,3] <- anova(m2,m3,test="Chisq")[2,5] } ################ END LOGISTIC REGRESSION EXAMPLE ###################################################################################### ################ BEGIN PERMUTATION EXAMPLE ############################################################################################ ### The following is an example of the minimum p-value permutation approach used in Bashalkhanov et al. ### The following establishes the item that the output will be stored in: pdist_struct_ssr_total <- matrix(nrow=10000,ncol=2) ### A nested for loop is used to select 33 SSR alleles and perform an analysis as described above. To use this nested for loop, you will need to create an object containing SSR alleles coded as the SNP data (0,1,2) based on the genotype of each allele. The code below calls an item named SSR_pca_dat which has this information in it. for(k in 1:10000) { ### The following creates the pollution factor as above and then randomizes it with respect to sampled trees. poll <- c(rep(1,180*3),rep(0,180*2)) poll <- sample(poll,length(poll),replace=F) ### The following organizes data for analysis by removing the black spruce population (pop id = "nl"). ssr_coded <- SSR_pca_dat[-c(1:60),] dat6CLUMPP[-c(901:960),] -> corrs_struct ### The following creates temporary objects to store results in. AIC_struct_ssr <- matrix(nrow=33,ncol=3) pvals_struct_ssr <- matrix(nrow=33,ncol=3) ### The following randomly selects 33 SSR alleles ssr_coded <- ssr_coded[,sample(1:121,33)] for(j in 1:33) { bin <- matrix(nrow=nrow(ssr_coded), ncol=2) for(i in 1:nrow(ssr_coded)) { if(is.na(ssr_coded[i,j]) == "FALSE") { if(ssr_coded[i,j] == 1) {bin[i,1]<-1} else { if(ssr_coded[i,j] == 2) {bin[i,1] <- 2} else { bin[i,1] <- 0 } } } } for(i in 1:nrow(ssr_coded)) { if(is.na(bin[i,1]) == "FALSE") { if(bin[i,1] == 0) {bin[i,2]<-2} else { if(bin[i,1] == 1) {bin[i,2]<-1} else { bin[i,2] <- 0 } } } } m1<-glm(bin ~ corrs_struct[,4]+corrs_struct[,5]+corrs_struct[,6]+corrs_struct[,7]+corrs_struct[,8], family=binomial) m2<-glm(bin ~ corrs_struct[,4]+corrs_struct[,5]+corrs_struct[,6]+corrs_struct[,7]+corrs_struct[,8] + poll, family=binomial) m3<-glm(bin ~ corrs_struct[,4]+corrs_struct[,5]+corrs_struct[,6]+corrs_struct[,7]+corrs_struct[,8] + poll + clim_pc, family=binomial) AIC_struct_ssr[j,1] <- m1$aic AIC_struct_ssr[j,2] <- m2$aic AIC_struct_ssr[j,3] <- m3$aic pvals_struct_ssr[j,1] <- anova(m1,m2,test="Chisq")[2,5] pvals_struct_ssr[j,2] <- anova(m1,m3,test="Chisq")[2,5] pvals_struct_ssr[j,3] <- anova(m2,m3,test="Chisq")[2,5] } ### The following retrieves the minimum p-value from the analyses above for one round of 33 randomly selected SSR alleles. This is then repeated 10000 times. pdist_struct_ssr_total[k,1] <- min(pvals_struct_ssr[,2],na.rm=T) pdist_struct_ssr_total[k,2] <- min(pvals_struct_ssr[,3],na.rm=T) ### The following creates a print statement to monitor progress of the routine. Omit as needed. report <- seq(from=100, to=10000, by=100) if(is.element(k,report) == "TRUE") {print(k)} } ################ END PERMUTATION EXAMPLE ############################################################################################