#Scripts adopted from : https://esnielsen.github.io/post/pool-seq-analyses-poolfstat-baypass/ # Use the OTSBayPass.snpdet file and give the columns headers. Importantly, the first column must be titled "scaffold" and the second column must be titled "pos" (these are the contig and position columns).Here I called this file 'snpdet.pruning.txt' # Read input file data<-read.table("snpdet.pruning.txt",sep="",header=T) # Initiate data for LD pruning data_LD<-NULL # Window for LD pruning, here at 1000 bp (select 1 SNP per 1000 base pairs) window<-1000 # Print every unique contig, and for every contig, randomly subsample 1 SNP per 1000 bp window scaffold_ID<-unique(data$scaffold) for(i in 1:length(scaffold_ID)){ print(scaffold_ID[i]) subset<-data[which(data$scaffold==scaffold_ID[i]),] for(j in 0:10){ print(window*j+1) subset_bin_i<-subset[which(subset$pos>=window*j & subset$pos<=window*(j+1)),] random<-sample(1:length(subset_bin_i[,1]),1) subsubset<-subset_bin_i[random,] data_LD<-rbind(data_LD,subsubset) } } # Get the unique LD pruned SNP list uniq.LD <- unique(data_LD) # Write to excel file WriteXLS(uniq.LD, "SG.LD.list.xls") ####### SUBSET BAYPASS FILE ########### # Filter the BayPass input files to just LD-pruned SNPs. library(dplyr) # Read the .genobaypass/.snpdet file type explained above, relating to all SNPs snp.data<-read.table("Ots.snpdet.geno.txt",sep="",header=F) # Read your LD SNP list. ld.data<-read.table("Ots.LD.index.txt",sep="\t",header=F) # Filter the list of all SNPs to now only list the LD pruned SNPs snp.data.f <- dplyr::inner_join(snp.data, ld.data) # Write out as new file, and then from there you can split back up into new .snpdet and .genobaypass input files write.table(snp.data.f, file="Ots.LD.snpdet.geno.txt", col.names = F, row.names= F, quote = F) source("baypass_utils.R") require(corrplot) ; require(ape) library(WriteXLS) # Read omega matrix BayPass output SG.omega=as.matrix(read.table("OtsBPout_mat_omega.out")) # Identify pool names for plotting colnames(SG.omega) <- c("KALA","METHsp","SPCR","WINT","WENAsp","SAWT","POWE","LEWI","MFJD","YAKI","BIGCfa","SECE","WENAsu","LYON","GRRO","WARM","ELOC","EFSA","LOST","MARS","BEAR","METHsu","LEMH","LOON","SFSA","WSAL","REDR","WFYF","CHAM","SALM","IMNA","LOLO","CAMA","SULP","TUCA","RAPI","CAPE","VALLE","COWLsp","COWLfa","MCKE","KLIC","CLEE","NFLE","JOHN","DESC","CLEA","PRIE") rownames(SG.omega) <- c("KALA","METHsp","SPCR","WINT","WENAsp","SAWT","POWE","LEWI","MFJD","YAKI","BIGCfa","SECE","WENAsu","LYON","GRRO","WARM","ELOC","EFSA","LOST","MARS","BEAR","METHsu","LEMH","LOON","SFSA","WSAL","REDR","WFYF","CHAM","SALM","IMNA","LOLO","CAMA","SULP","TUCA","RAPI","CAPE","VALLE","COWLsp","COWLfa","MCKE","KLIC","CLEE","NFLE","JOHN","DESC","CLEA","PRIE") # Create a correlation matrix of the omega values, which can be used to assess genomic differentiation between pools cor.mat=cov2cor(SG.omega) corrplot(cor.mat,method="color",mar=c(2,1,2,2)+0.1, main=expression("Correlation map based on"~hat(Omega))) # We can also assess population differentiation with hierarchical clustering: bta14.tree=as.phylo(hclust(as.dist(1-cor.mat**2))) plot(bta14.tree,type="p", main=expression("Hier. clust. tree based on"~hat(Omega)~"("*d[ij]*"=1-"*rho[ij]*")")) # Read the xtx BayPass output SG.snp.res=read.table("OtsBPout_summary_pi_xtx.out",h=T) # Get the Pi Beta distribution for POD generation SG.pi.beta.coef=read.table("OtsBPout_summary_beta_params.out",h=T)$Mean # Upload original data to get read counts SG.data<-geno2YN("OTSBayPass_OtsAnalysis11.genobaypass") # Simulate POD dataset to use for outlier SNP detection simu.SG <-simulate.baypass(omega.mat=SG.omega,nsnp=10000,sample.size=SG.data$NN, beta.pi=SG.pi.beta.coef,pi.maf=0.1,remove.fixed.loci = FALSE,suffix="SG.BP.sim") #Run through Baypass on the server: #nohup baypass -npop 48 -gfile G.SG.BP.sim -poolsizefile OTSBayPass_OtsAnalysis11.poolsize -d0yij 8 -outprefix SG.BP.sim -npilot 100 > BPsim.out & ######### Get the Baypass Outliers ############ source("baypass_utils.R") library(WriteXLS) # Read the new omega matrix and compare to original. Here you want similar values between the two SG.pod.omega=as.matrix(read.table("SG.BP.sim_mat_omega.out")) plot(SG.pod.omega,SG.omega) ; abline(a=0,b=1) # Get the Forstner and Moonen Distance (FMD) between simulated and original posterior estimates (here a smaller value is better) library(geigen) fmd.dist(SG.pod.omega,SG.omega) #2.54249 # Look at POD xtx values, and identify SNPs where the xtx values are above the 99% significance threshold from the POD. So in the plot, it is those loci (dots) which are above the abline SG.pod.xtx=read.table("SG.BP.sim_summary_pi_xtx.out",h=T)$M_XtX SG.pod.thresh=quantile(SG.pod.xtx,probs=0.99) plot(SG.snp.res$M_XtX) abline(h=SG.pod.thresh,lty=2) # Obtain the xtx threshold. Any SNPs with an xtx value greater than this are identified as outlier SNPs SG.pod.thresh # 99% # 58.07806 # Here I just saved the xtx values per SNP list, and then in excel/text editor filtered out the SNPs with an xtx greater than the threshold value as outliers. library(dplyr) snp.data<-read.table("OTSBayPass_OtsAnalysis11.snpdet",sep="",header=F) SG.snp.res=read.table("OtsBPout_summary_pi_xtx.out",h=T) snp.data$V5<-seq(1,1365130) x <- SG.snp.res %>% filter(SG.snp.res$M_XtX>58.07806) snp.list <- as.data.frame(x$MRK) y <- snp.data[snp.data$V5 %in% snp.list$`x$MRK` ,] write.table(x, file="SignSNPlist_BayPass", sep="\t", row.names=FALSE) write.table(y, file="SignSNPlist_BayPass", sep="\t", row.names=FALSE) q() #Produce a list of SNPs that are significant sed -i '1c\CHR\tBP\tREF\tALT' SNPnames_BayPass paste SNPnames_BayPass SignSNPlist_BayPass > SignBayPassSNPs_Ots.txt