############################################################## # R code for single-locus and multivariate regression analyses # used for "Admixture mapping in Populus" # by Doro Lindtke # 04.01.2013 ############################################################## #setwd("~/Documents/myFolder") # myFolder containing example files # 'linkage_Ita_f.edited', 'linkage_Ita_ss.edited', 'Marker.csv', and 'phenotypes_Ita.txt' #---------------------------------------------------------- # load required libraries library(AICcmodavg) library(MASS) library(MuMIn) # read and set up files #----------------------- # read marker info file marker<-read.csv("Marker.csv") # read structure site-by-site output data # raw structure site-by-site output file can be prepared simply by removing empty lines at beginning of file # read genetic data (includes the parentals, too) g.dat<-read.table("linkage_Ita_ss.edited") g.dat<-as.matrix(g.dat) # keep only columns needed for the analysis g.dat<-g.dat[,1:4] colnames(g.dat)<-c("ind","locus","p.aa","p.at") # p.aa and p.at give locus-specific ancestry probabilities from structure site-by-site output, # prob to be homozygous for species 1 (p.aa) or prob to be interspecific heterozygous (2*p.at) # bring into new format with loci in columns, individuals in rows, for both ancestry probabilities p.aa and p.at n.loci<-max(g.dat[,2]) # number of loci total.ind<-max(g.dat[,1]) # total number of individuals analysed, including parentals aa<-matrix(NA,ncol=n.loci,nrow=total.ind) at<-matrix(NA,ncol=n.loci,nrow=total.ind) colnames(aa)<-marker$Marker colnames(at)<-marker$Marker # define row start and endpoints of individual data in g.dat x1<-seq(1,dim(g.dat)[1],n.loci) # start x2<-(1:total.ind)*n.loci # end # copy and paste genetic data into new format for (i in 1:total.ind) { aa[i,]<-g.dat[x1[i]:x2[i],3] at[i,]<-g.dat[x1[i]:x2[i],4] } # keep only the admixed individuals # admixed individuals can be identified e.g. according to structure Q # (here it's done by structure Q, read from structure 'f' output; original file needs to be edited to be readable in R) admixprop<-read.table("linkage_Ita_f.edited",header=T,sep=",") admixed<-which(admixprop$q1 > 0.001 & admixprop$q1 < 0.999) n.inds<-length(admixed) # number of admixed individuals aa<-aa[admixed,] at<-at[admixed,] #------------------------------------------------------------------------------- # optional: exclude markers of bad quality; these are (Italy): O30_1, G1416, G430, O14, O206 (3,41,69,70,77) #------------------------------------------------------------------------------- excl<-c(3,41,69,70,77) marker<-marker[-excl,] aa<-aa[,-excl] at<-at[,-excl] n.loci<-n.loci-length(excl) #----------------------------------------------------------------------------- # LSA means over individuals mean.aa<-rowMeans(aa) # (mean.aa + mean.at is perfectly correlated to structure Q) mean.at<-rowMeans(at) #----------- # read the phenotype data; individual data need to be in the same order as used for the structure analysis # traits are in columns, individuals in rows; !!! includes admixed individuals only in this case!!! # note that there should not be any missing data, not in the structure ancestry estimates nor the phenotypes pheno.dat<-read.table("phenotypes_Ita.txt",header=TRUE) pheno.dat<-as.matrix(pheno.dat) n.traits<-dim(pheno.dat)[2] # number of traits ## standardization function my.stand<-function(y.dat) { y.dat.t<-(y.dat-mean(y.dat))/sd(y.dat) y.dat.t } pheno.dat<-apply(pheno.dat,2,my.stand) # standardize all columns #------------- ##################################################### # (1) detect candidate QTL by single-locus regression ##################################################### # set up file for storage of AIC values mod.sel.AIC<-array(dim=c(5,7,n.loci,n.traits),dimnames=list(c("Mn","M0","M0a","M1","M1a"), c("Modnames","K","AIC","Delta_AIC","ModelLik","AICWt","LL"), marker$Marker,colnames(pheno.dat))) # dimensions: Models, AICdiagn, loci, trait #---------------------------------------------- # detect initial canditate loci by model selection for (t in 1:n.traits) { #traits loop # "real" null-model, includes intercept only Mn<-lm(pheno.dat[,t]~1, na.action=na.fail) # null model with genome-wide control only, with dominance/recessive deviation term M0<-lm(pheno.dat[,t]~I(mean.aa[] + mean.at[]) + mean.at[], na.action=na.fail) # null model with genome-wide control only, strictly additive M0a<-lm(pheno.dat[,t]~I(mean.aa[] + mean.at[]), na.action=na.fail) for (l in 1:n.loci) { #loci loop # model including locus effect, with dominance/recessive deviation term M1<-lm(pheno.dat[,t]~ I(mean.aa[] + mean.at[])+ mean.at[] + I(aa[,l] + at[,l]) + at[,l], na.action=na.fail) #locus l # model including locus effect, strictly additive M1a<-lm(pheno.dat[,t]~ I(mean.aa[] + mean.at[]) + I(aa[,l] + at[,l]), na.action=na.fail) #locus l # model selection table, stores AIC, deltaAIC, etc. mod.sel.AIC[,,l,t]<-as.matrix(aictab(cand.set = list(Mn,M0,M0a,M1,M1a), modnames = c("Mn","M0","M0a","M1","M1a"), sort = FALSE, second.ord = FALSE)) } } #end traits loop #------------------------------------------- # save AIC's: dimensions: loci, traits # stores performance of model including a locus-effect relative to model including genome-wide control only # dominant/recessive model mod.AIC.M1<-array(dim=c(n.loci,n.traits),dimnames=list(marker$Marker,colnames(pheno.dat))) # additive model mod.AIC.M1a<-array(dim=c(n.loci,n.traits),dimnames=list(marker$Marker,colnames(pheno.dat))) for (t in 1:n.traits) { for (l in 1:n.loci) { mod.AIC.M1[l,t]<-as.numeric(mod.sel.AIC[2,3,l,t])-as.numeric(mod.sel.AIC[4,3,l,t]) #AIC(M0) - AIC(M1) mod.AIC.M1a[l,t]<-as.numeric(mod.sel.AIC[3,3,l,t])-as.numeric(mod.sel.AIC[5,3,l,t]) #AIC(M0a) - AIC(M1a) } } ############################################################################# # (2) model selection to find best set of candidates, multivariate regression ############################################################################# # first, prepare table with all possible parameters that can be included in model, and info table for these params # make list of all locus parameters my.params<-numeric(2*n.loci) for (l in 1:n.loci){ my.params[l]<-paste("I(aa[, ",l,"] + at[, ",l,"])",sep="") my.params[l + n.loci]<-paste("at[, ",l,"]",sep="") } # final lists all possible parameters that can be included in the model (add intercept and genome-wide controls) my.params<-c(my.params,"(Intercept)","I(mean.aa[] + mean.at[])","mean.at[]") # make info table for all parameters (marker names, and if the parameter is the additive or dominant component) par.info<-cbind(c(as.character(marker$Marker),as.character(marker$Marker)),rep(c("add","dom"),each=n.loci)) colnames(par.info)<-c("locus","action") rownames(par.info)<-1:(2*n.loci) info2<-rbind(c("(Intercept)",NA),c("Q","add"), c("Q","dom")) colnames(info2)<-colnames(par.info) # final info table for all parameters, including intercept and genome-wide controls par.info<-rbind(par.info,info2) #-------------------------- # prepare lists for data storage CandsExh<-list() # stores results from exhaustive search CandsStep<-list() # stores results from stepwise search ################################### #for (t in 1:n.traits) { # start traits loop; for exhaustive search, much time and memory might be needed; # might be better to run for subset of traits only for (t in c(4,5)) { # start traits loop for subset of traits #----------------------- # include markers with high support of AIC >= 4 in additive or dom/rec model only candidates<-which(mod.AIC.M1[,t]>=4 | mod.AIC.M1a[,t]>=4) if (length(candidates)>=1) { # only if something has been detected # construct full model that contains additive and dom/rec components for all candidate loci params2<-NULL for (l in candidates){ params2<-paste(params2,paste("I(aa[,",l,"] + at[,",l,"]) + at[,",l,"]",sep=""),sep=" + ") } # add beginning of formula, including genome-wide controls params2<-paste("~ I(mean.aa[] + mean.at[])+ mean.at[]",params2,sep="") # build null-model including genome-wide controls only params0<-"~ I(mean.aa[] + mean.at[])+ mean.at[]" # this gives the full model globM<-lm(as.formula(paste("pheno.dat[,t] ",params2,sep=""))) # this gives the null-model NullM<-lm(as.formula(paste("pheno.dat[,t] ","~ I(mean.aa[] + mean.at[])+ mean.at[]",sep=""))) #------------------------------------------------------------------------------ # model selection with MuMIn package (exhaustive search); this might need much time and memory #------------------------------------------------------------------------------- # perform model evaluation of all possible parameter combinations dred1<-dredge(globM,rank="BIC") # subset of best models av.dred1<-model.avg(dred1, subset=delta<4,rank="BIC",fit=TRUE) # relative importance of the predictor variables (including interactions), calculated # as a sum of the Akaike weights over all of the models in which the parameter of interest appears dred1.imp<-av.dred1$importance # add intercept (for data storage) dred1.imp<-c(NA,dred1.imp) names(dred1.imp)[1]<-"(Intercept)" dred1.par<-av.dred1$avg.model # parameter estimates and CIs dred1.par.s<-av.dred1$coef.shrinkage # full-model averaged parameter estimates (coef is zero if not included in model) # bring files in same order dred1.imp.ID<-match(names(dred1.imp),my.params) # get IDs of selected loci (now with intercept) dred1.par.ID<-match(rownames(dred1.par),my.params) # with intercept dred1.par.s.ID<-match(names(dred1.par.s),my.params) # with intercept dred1.par<-dred1.par[match(dred1.imp.ID,dred1.par.ID),] dred1.par.s<-dred1.par.s[match(dred1.imp.ID,dred1.par.s.ID)] # bind the different estimates together CandTab<-cbind(names(dred1.imp),par.info[dred1.imp.ID,],dred1.par,dred1.imp,dred1.par.s) colnames(CandTab)[1]<-"Param" rownames(CandTab)<-1:dim(CandTab)[1] #-------------------------------------------------------------------------------- # forward selection and backward elimination #--------------------------------------------------------------------------------- # forward selection; using BIC (k = log(n.inds)) M.cand2<-stepAIC(object = lm(pheno.dat[,t]~ 1),scope = as.formula(params2),direction = "forward",trace=0,k = log(n.inds)) # backward selection M.cand3<-stepAIC(object = M.cand2,direction="backward",trace=0,k = log(n.inds)) # extract names and p-vals of selected loci S.cand3<-names(coef(M.cand3)) # get IDs of selected loci sel.params2<-match(S.cand3,my.params) # bind the different estimates together Cands<-cbind(S.cand3,par.info[sel.params2,],-log10(summary(M.cand3)$coef[,4])) colnames(Cands)[c(1,4)]<-c("Param","-log10(Pr(>|t|))") rownames(Cands)<-1:dim(Cands)[1] } # close loop "only if something has been detected" if (length(candidates)==0) { # if nothing has been detected CandTab<-array(dim=c(0,10),dimnames=list(character(0),c("Param","locus","action","Estimate" , "Std. Error","Adjusted SE","Lower CI","Upper CI","dred1.imp","dred1.par.s" ))) Cands<-array(dim=c(0,4),dimnames=list(character(0),c("Param" , "locus","action","-log10(Pr(>|t|))" ))) } # bind results from different traits to list CandsExh[[t]]<-CandTab names(CandsExh)[t]<-colnames(pheno.dat)[t] CandsStep[[t]]<-Cands names(CandsStep)[t]<-colnames(pheno.dat)[t] } # end traits loop ####################################################################################### # most interesting results will be stored in: # mod.AIC.M1 # (delta AIC (null-model - locus-effect-model) for all loci and traits, dominant/recessive model, single marker regression) # mod.AIC.M1a # (delta AIC (null-model - locus-effect-model) for all loci and traits, additive model, single marker regression) # CandsExh # (results from exhaustive search, all traits, multivariate regression) (*) # CandsStep # (results from stepwise search, all traits, multivariate regression) (**) # (*) the following estimates are stored (see R documentation for model.avg for more details): # "Param" (parameter as it enters into model) # "locus" (locus name) # "action" (additive or dominant component of locus) # "Estimate" (model averaged parameters) # "Std. Error" (unconditional standard errors) # "Adjusted SE" (adjusted standard errors) # "Lower CI" # "Upper CI" # "dred1.imp" (relative importance of predictor variables) # "dred1.par.s" (full model-averaged coefficients) # (**) the following estimates are stored: # "Param" (parameter as it enters into model) # "locus" (locus name) # "action" (additive or dominant component of locus) # "-log10(Pr(>|t|))" (- log10 of (two-sided) p-value for the estimated model coefficients) ############################################################################################