# Major scripts and summaries from Brandvain et al "Speciation and introgression in M. nasutus and M. guttatus". PLoS Genetics #INCLUDES SCRIPTS FOR: # PCA [lines 30-50] # DISTRIBUTION OF NJ TREES [lines 50-100] # SUMMARIES OF DIVERSITY [lines 100-120] # HISTOGRAM OF DIVERSITY ACROSS 5KB WINDOWS [lines 120-175] # SCRIPTS FOR HIDDEN MARKOV MODEL FOR LOCAL ANCESTRY INFERENCE IN M. GUTTATUS [lines 175-370] load("~/Desktop/mim4dryad/brandainEtAlPLoSGen2014.Robj") #NOTE CHANGE TO MATCH YOUR DIRECTORY #loads an R-object with # 'nj.pca.snps' a matrix with genotypes for all samples at all SNPS used in nj and PCA analyses # 'pairwise.focal.mim' a list containing #'comps' - itself a list of all pairwise comparisons between focal samples. # For each pairwise comparison between focal samples the number of sites compared # & differences between two samples in 1kb windows is contained in a matrix #'bound' a matrix describing the location of each 1 kb region #************************************************************************************ #description of samples all.gut = c("TSG3G","YJS6G","AHQT1G","CAC6G","DUNG","MAR3G","BOG10G","REM8G","LMC24G","SLP9G","DPRG","PED5G","IM62.JGI","SWBG") all.nas = c("NHN26","KootN","CAC9N","SF5N","DPRN") focal.nas = c("NHN26", "KootN" ,"CAC9N" ,"DPRN" ) focal.gut = c("AHQT1G","CAC6G","DPRG","SLP9G") focal.north.gut = c("AHQT1G","CAC6G") focal.south.gut = c("DPRG","SLP9G") focal.allo.gut = c("SLP9G","AHQT1G") focal.sym.gut = c("DPRG","CAC6G") #************************************************************************************ #RECREATING THE PCA ANALYSES, WITH ALL NAS, AND WITH ONE NAS AT A TIME makePCA = function(dat){ all.means<-rowMeans(dat,na.rm=TRUE) all.var<-sqrt(all.means * (1-all.means)) all.normed.geno<- (dat -all.means)/all.var all.normed.geno<-apply(all.normed.geno,2,function(x){x-mean(x,na.rm=TRUE)/sd(x,na.rm=TRUE)}) all.cov<-cov(all.normed.geno,use="pairwise.complete.obs") all.eig<-eigen(all.cov) plot(all.eig$vectors[,1],all.eig$vectors[,2],type="n",axes=FALSE, main="PCA",xlab="PC_1",ylab="PC_2") text(all.eig$vectors[,1],all.eig$vectors[,2],colnames(all.normed.geno),col=ifelse(colnames(all.normed.geno)%in%all.gut,"blue","red")) box() } par(mfrow=c(3,2)) makePCA(nj.pca.snps [,!colnames(nj.pca.snps)%in%"Mdent"]) #MAKE A PCA WITH ALL NAS AN GUT SAMPLES lapply(all.nas ,function(X){ #MAKE A PCA REMOVING ONE NAS SAMPLE AT A TIME downsampled = nj.pca.snps [,!colnames(nj.pca.snps)%in%c(setdiff(all.nas,X),"Mdent")] makePCA(downsampled) }) #************************************************************************************ #MAKE A BOOTSTRAPPED COLLECTIONB OF NJ TREES AND PLOT WITH DENSITREE n.boot.reps = 100 #we resampled SNPs 1000 times to make 1000 trees, but we set this at 100 for now TREES = function(inds){ M=matrix(0,nrow=ncol(inds),ncol=ncol(inds)) colnames(M)= rownames(M)=colnames(inds) for(i in seq_along(inds[1,-1]) ){ for(j in (i+1):ncol(inds)){ M[i,j] = M[j,i] =mean(abs(inds[,i]-inds[,j]) , na.rm=TRUE) } } tree.1 = nj(M) tree.2 = root(tree.1 ,outgroup=c("Mdent"),resolve.root=TRUE) tree.3 = drop.tip(tree.2,"Mdent") chronopl(tree.3,lambda=1) } trees= lapply(1:n.boot.reps,function(Z){ inds = nj.pca.snps[sample(seq_along(nj.pca.snps[,1]),replace=TRUE),] TREES = function(inds){ M=matrix(0,nrow=ncol(inds),ncol=ncol(inds)) colnames(M)= rownames(M)=colnames(inds) for(i in seq_along(inds[1,-1]) ){ for(j in (i+1):ncol(inds)){ M[i,j] = M[j,i] =mean(abs(inds[,i]-inds[,j]) , na.rm=TRUE) } } tree.1 = nj(M) tree.2 = root(tree.1 ,outgroup=c("Mdent"),resolve.root=TRUE) tree.3 = drop.tip(tree.2,"Mdent") chronopl(tree.3,lambda=1) } TREES(inds) }) class(trees) = "multiPhylo" par(mfrow=c(1,1)) densiTree(trees,consensus = TREES(nj.pca.snps),alpha=.02) #************************************************************************************ #CACULATE BASIC SUMMARY STATS FORM FOCAL COMPS #WE CALL THIS 'all.pi' all.pi = data.frame(t(sapply(pairwise.focal.mim$comps,colSums))) all.pi = cbind(all.pi,with(all.pi,cbind(pis = diff.4/all.4, pins = ((diff.0/all.0)/(diff.4/all.4)) ))) all.pi = data.frame(cbind(do.call(rbind,strsplit(rownames(all.pi)," ")),all.pi,stringsAsFactors=FALSE) ) all.pi$type = rowSums(apply(all.pi,2,function(X){X%in%all.nas})) all.pi$type = with(all.pi,ifelse(type==0,"within.gut",ifelse(type==1,"interspecific","within.nas"))) #SUMMARIZING PIS and PIN/PIS for inter and intraspecific comps pis.summary = with(all.pi,tapply(diff.4,type,sum)) / with(all.pi,tapply(all.4,type,sum)) pin.pis.summary = (with(all.pi,tapply(diff.0,type,sum)) / with(all.pi,tapply(all.0,type,sum)))/pis.summary #************************************************************************************ #GET THE DISTRIBUTION OF PI IN OVERLAPPING 5KB WINDOWS #FUNCTION TO CACLCULATE SUMS OF OVERLAPPING WINDOWS slidingWindow = function(vect,lag){ x= lapply(c(1:lag),function(LAG){ if(LAG==1) { return(vect[-c( (length(vect) -lag+LAG+1 ) :length(vect) )] ) } if(LAG==lag){ return(vect[-c(1:(LAG-1))] ) } return(vect[-c(1:(LAG-1),(length(vect) -lag+LAG+1 ) :length(vect) ) ] ) } ) return(rowSums(do.call(cbind,x) , na.rm=TRUE)) } #finding pi in overlapping 5 kb windows chrGenomeShare = function(CHR,pis=FALSE){ this.chr = pairwise.focal.mim$bound$chr == CHR chr.pw = lapply(pairwise.focal.mim$comps,function(X){ X[this.chr,c("diff.pos","all.pos")] }) names(chr.pw) = apply(sapply(strsplit(names(chr.pw)," "),sort),2,paste,collapse=" ") chr.pi = sapply( chr.pw,function(COMBO){ tmp.pi = data.frame( diff = slidingWindow(COMBO[,"diff.pos"],5), all = slidingWindow(COMBO[,"all.pos"],5)) return(with(tmp.pi,ifelse(all>200,diff/all,NA))) }) return(chr.pi) } wind.sum = Reduce( "+" , lapply(paste("chr",1:14,sep="_"),function(CHR){ apply(chrGenomeShare(CHR),2, function(C){ table(cut( C, c(-1,seq(0,.5,.001),1)) )}) })) #potprocessing and plotting these values pi.dist.5kb = sapply( list( inter_allo =do.call(c,lapply(focal.nas,function(X){ paste( apply( cbind(X,focal.gut[c(1,4)]) , 1 , sort)[1,], apply( cbind(X,focal.gut[c(1,4)]) , 1 , sort)[2,] ) })), gut = apply( combn(focal.gut,2),2,function(X){ paste(sort(unlist(X)),collapse=" ")}), nas = apply( combn(focal.nas,2),2,function(X){ paste(sort(unlist(X)),collapse=" ")}), inter_all =do.call(c,lapply(focal.nas,function(X){ paste( apply( cbind(X,focal.gut) , 1 , sort)[1,], apply( cbind(X,focal.gut) , 1 , sort)[2,] ) })), ahq = sapply(focal.nas,function(X){ paste( sort(c(X,"AHQT1G")) ,collapse=" ") }), slp = sapply(focal.nas,function(X){ paste( sort(c(X,"SLP9G")) ,collapse=" ") }), med = sapply(focal.nas,function(X){ paste( sort(c(X,"DPRG")) ,collapse=" ") }), cac = sapply(focal.nas,function(X){ paste( sort(c(X,"CAC6G")) ,collapse=" ") }) ), function(COMP){ ( rowSums( wind.sum[,COMP] ) ) / sum( wind.sum[,COMP]) }) tmp.x.vals = as.numeric(do.call(rbind,strsplit(gsub("]","",rownames(pi.dist.5kb),fixed=TRUE),",",fixed=TRUE))[,2]) matplot(x=tmp.x.vals,pi.dist.5kb[,c("gut","nas","ahq","cac","med","slp")],type="l",ylim=c(0.001,.05),log="y",xlim=c(0,.2),lty=1,legend="topright", col=c("black","grey","maroon1","darkorchid4","darkblue","lightblue"),lwd=2) legend("topright", c("gut x gut","nas x nas","allo north gut (ahq) x nas ","sym north gut (cacg) x nas","sym south gut (dprg) x nas","allo north gut (slp) x nas"), text.col=c("black","grey","maroon1","darkorchid4","darkblue","lightblue"),bty="n") #************************************************************************************ #HMM #OUR CUSTUMIZED Hidden Markov Model for identiffying introgression of nasutus ancesrty into guttatus. #Note, the final lines (all.results = ...) are where we enter this calculation. It takes substantial time and CPU. namesList = function(these.names){ # this is a way to get a list where each elelment has a name equalto its value, so that when I lapply, I get back a [correctly] named list all.things = as.list(these.names) names(all.things)=these.names return(all.things) } vectorizedOrder = function(matrix.for.ordering,smallest.on.left=TRUE){ zeroed.mat = ifelse(is.na(matrix.for.ordering),2*smallest.on.left,matrix.for.ordering) to.order = 10*c(1:nrow(zeroed.mat)) + zeroed.mat this.order = order(c(to.order)) ordered.vector = c(matrix.for.ordering[this.order]) ordered.matrix = matrix(ordered.vector, nrow=nrow(to.order), byrow=TRUE) return(ordered.matrix) } forwardAlgorithm = function(log.em,rec,adm,threshold){ #log.em is the emission probs #rec is a silly guess of rec rate [across 1kb windows] MAY EM LATER #adm is a rough prior on admixture proportion #threshold is a value at which I get worried about underflow and then multply by L=nrow(log.em) transitions = data.frame( to.nas = c( from.nas = ( (1-rec)+rec * (adm)) , from.gut = ( rec * (adm))) , to.gut = c( from.nas = (rec * (1-adm) ) , from.gut = (1-rec+ rec * (1-adm) ) ) ) if( sum( sign(range(transitions)) ) !=2 ){ transitions[transitions>1 ] = (1-(1e-7));transitions[transitions<0 ] = (1e-7)} #fixing it when going over alpha = data.frame(matrix(nrow=(L+1),ncol=ncol(log.em))) alpha[1,]= log( 1) names(alpha) = c("nas.like","gut.like") temp=numeric(nrow(alpha));count=0 #RECURSION for(site in 1:L){ alpha$nas.like[(site+1)] = log.em[site,"p.nas"] + log( sum( exp(alpha[site,]) * transitions[,"to.nas"] ) ) alpha$gut.like[(site+1)] = log.em[site,"p.gut"] + log( sum( exp(alpha[site,]) * transitions[,"to.gut"] ) ) #This avoids underflow if(min(exp(alpha[(site+1),])) < threshold){ count=count+1 alpha[(site+1),]=alpha[(site+1),]+log(1/threshold) temp[(site+1):length(temp)]=count } } Px=log(sum(exp(alpha[nrow(alpha),])))+log(threshold)*temp[length(temp)] alpha=alpha+log(threshold)*temp return(list(alpha=alpha,Px=Px)) } backwardAlgorithm = function(log.em,rec,adm,threshold){ L=nrow(log.em) transitions = data.frame( to.nas = c( from.nas = ( (1-rec)+rec * (adm)) , from.gut = ( rec * (adm))) , to.gut = c( from.nas = (rec * (1-adm) ) , from.gut = (1-rec+ rec * (1-adm) ) ) ) if( sum( sign(range(transitions)) ) !=2 ){ transitions[transitions>1 ] = (1-(1e-7));transitions[transitions<0 ] = (1e-7)} #fixing it when going over beta = data.frame(matrix(nrow=(L+1),ncol=ncol(log.em))) # beta[nrow(beta),]= log(1) beta[nrow(beta),]=log(1) names(beta) = c("nas.like","gut.like") temp=numeric(nrow(beta));count=0 #RECURSION for(site in L:1){ beta$nas.like[site] = log( exp(log(transitions["from.nas","to.nas"])+log.em[(site),"p.nas"]+beta[site+1,"nas.like"])+ #l is nasutus #pt1 exp(log(transitions["from.nas","to.gut"])+log.em[(site),"p.gut"]+beta[site+1,"gut.like"]) #l is gutatus #pt1 ) beta$gut.like[site] = log( exp(log(transitions["from.gut","to.nas"])+log.em[(site),"p.nas"]+beta[site+1,"nas.like"])+ #l is nasutus #pt1 exp(log(transitions["from.gut","to.gut"])+log.em[(site),"p.gut"]+beta[site+1,"gut.like"]) #l is gutatus #pt1 ) #This avoids underflow if(min(exp(beta[site,])) < threshold){ count=count+1 beta[site,]=beta[site,]+log(1/threshold) temp[site:1]=count } } beta=beta+log(threshold)*temp return(beta) } minPiByWindCOMBOS = function(COMBOS){ all.comps = t(apply( do.call(rbind,strsplit(names(pairwise.focal.mim[[1]])," ")) , 1 , sort)) this.dat = pairwise.focal.mim[[1]][paste(all.comps[,1],all.comps[,2])%in%paste(COMBOS[,1],COMBOS[,2]) ] this.dat = lapply( this.dat , function(COMP){ COMP[ , c("diff.pos","all.pos")] }) bins = cbind( c(0,6,11,21,51,76,101,251) , c(5,10,20,50,75,100,250,1001)) # move across bins with # informative sites pi = lapply(1:nrow(bins),function(BIN){ do.call(cbind,lapply(this.dat,function(PAIR){ with(data.frame(PAIR),ifelse( (all.pos>bins[BIN,1] & all.pos<=bins[BIN,2]), (diff.pos/all.pos),NA ) ) })) }) names(pi) = bins[,2] lapply(pi,function(BIN){ vectorizedOrder(BIN[(rowSums(!is.na(BIN))>0),])[,1] }) } minPiTo = function(COMBOS){ all.comps = t(apply( do.call(rbind,strsplit(names(pairwise.focal.mim[[1]])," ")) , 1 , sort)) this.dat = pairwise.focal.mim[[1]] [paste(all.comps[,1],all.comps[,2])%in%paste(COMBOS[,1],COMBOS[,2]) ] this.dat = lapply( this.dat , function(COMP){ COMP[ , c("diff.pos","all.pos")] }) n.sites = sapply(this.dat,function(X) { X[,"all.pos"] }) this.pi = sapply(this.dat,function(X) { X[,"diff.pos"]/X[,"all.pos"] }) bins = cbind( c(0,6,11,21,51,76,101,251) , c(5,10,20,50,75,100,250,1001)) use.bin = apply( sapply(bins[,1],function(Z){ rowSums( n.sites >=Z ) })!=0,1,function(K){ ifelse( length(which(K))==0,0,max(which(K))) }) tmp.pi = data.frame( n = rep(NA,nrow(n.sites)) , p = rep(NA,nrow(n.sites)) ) for(i in 1:nrow(bins)){ focal.pos = use.bin==i tm = this.pi[focal.pos,] tm[ n.sites[focal.pos,] < bins[i,1] ] = NA tmp.pi[focal.pos,"p"] = vectorizedOrder(tm)[,1] tmp.pi[focal.pos,"n"] = ifelse(is.na( tmp.pi[focal.pos,"p"]),0,bins[i,2]) } return(tmp.pi) } prepMimulusAdmixHMM = function(focal){ #get hist of pi, separated by n informative sites nas_nas = minPiByWindCOMBOS(COMBOS = t(apply( combn(focal.nas,2) , 2,sort))) gut_nas = minPiByWindCOMBOS(COMBOS = t(apply( rbind(focal.nas,focal) , 2,sort)) ) cuts = c( seq(-.005, .0999 ,.005 ) , seq(.1,.499,.01),seq(.5,1.05,.05) ) pi.dist = lapply( list(mn = nas_nas, mg = gut_nas) , function(DIST){ p = cbind(NA, do.call(cbind,lapply( DIST , function(COVERAGE){(1+table( cut(COVERAGE,cuts) )) /(-1+length(cuts)+ (length(COVERAGE))) }))) colnames(p)[1]="0" return(p) }) #pidist provides probs. now we want # pi.to.nearest.nas = minPiTo(COMBOS = t(apply( rbind(focal.nas,focal) , 2,sort))) # # emissions getEmissions = function(X){ X=pi.to.nearest.nas X$p.nas=NA X$p.gut=NA X$p.bin=NA for( x in sort(setdiff( unique(X$n),"0")) ){ to.use = X$n==x X$p.nas[to.use] = as.numeric( pi.dist$mn[,as.character(x)][ as.character(cut( X[to.use,"p"] , cuts)) ] ) X$p.gut[to.use] = as.numeric( pi.dist$mg[,as.character(x)][ as.character(cut( X[to.use,"p"] , cuts)) ] ) X$p.bin[to.use] = as.character(cut( X[to.use,"p"] , cuts) ) } return(X) } getEmissions() } admixtureHMM = function(params,ch,hmm.prep){ R=exp(params[1]) A=exp(params[2]) nas = is.na( rowSums(hmm.prep) ) fwd = forwardAlgorithm( log.em = hmm.prep[!nas,],rec = R, adm = A, threshold = 1e-20) bwd = backwardAlgorithm( log.em = hmm.prep[!nas,],rec = R, adm = A, threshold = 1e-20) posterior.decode = exp(fwd[[1]]+bwd -(fwd[[2]]) ) #POSTERIOR DECODING SHOWS FAILURE tmp = data.frame( nas.like = rep(NA,(nrow(hmm.prep)+1)) , gut.like = rep(NA,(nrow(hmm.prep)+1)) ) tmp[c(TRUE,!nas),] = posterior.decode posterior.decode = tmp print(ch) save(posterior.decode, file = sprintf("tmpmimHMM%s.Robj",ch) ) # penalty for going over return(fwd$Px + sum(sapply( params,function(PAR){ifelse(PAR>0,-PAR*1000,0)})) ) } runHMM = function(params,COMP,this.dat){ print(COMP) this.dat = data.frame(apply(this.dat[,c("p.nas","p.gut")],2,function(A){ ifelse(is.na(A),0,log(A))})) temp = sapply( namesList(paste("chr",1:14,sep="_")),function(chr){ admixtureHMM(params,chr,hmm.prep = this.dat[ pairwise.focal.mim[[2]]$chr ==chr,]) } ) -sum(temp) } ################################################################################################################################################################ #WARNING THIS HMM TAKES TIME AND MEMORY. WE RECOMMEND PARRALELIZING IT [AS WE DID IN OUR RUNS]; #HOWEVER WE LEAVE IT IN THIS FORM SO THAT IT CAN BE RUN AS IS ################################################################################################################################################################ #init_params = log(c(R=1e-4,A=.05)) all.results = lapply(namesList(focal.gut),function(P){ all.hmm.prep = prepMimulusAdmixHMM( P ) init_params = log(c(R=1e-4,A=.05)) result = optim(init_params,runHMM,COMP=P,this.dat=all.hmm.prep,control=list(trace=300,maxit=300)) hmm.output = lapply(namesList(paste("chr",1:14,sep="_")),function(ch){ load(sprintf("tmpmimHMM%s.Robj",ch) ) system(sprintf("rm tmpmimHMM%s.Robj",ch)) return(posterior.decode) }) list(hmm.output=hmm.output,param.est.and.optim.updates = result) })