###################################################################################################################################################
# Your "start" folder should contain the folders python_scripts/, bash_scripts/, Rscripts/, target_regions/ and data/.
# The raw data belong in the data/ folder; the files should be named:
# R01_002_CGATGT_L001_R1.fastq.gz, R01_002_CGATGT_L001_R2.fastq.gz, etc.
#
# See http://www.ebi.ac.uk/ena/data/view/PRJEB5165
#
# If you rename the files/folders, you have to adjust the scripts accordingly.
#
# Start the script in the folder that contains all the sample folders by typing: bash ../bash_scripts/preprocessing_to_bam.sh
#
# The folder python_scripts/ must contain all relevant Python scripts.
# The folder bash_scripts/ must contain all relevant bash scripts.
#
# You need: pBWA, mpirun, vcftools, SAMtools, bcftools, vcfutils.pl, bayenv2, BayeScan, LOSITAN, admixture,
# and the standard Linux command line tools (sed, awk, grep, ...).
###################################################################################################################################################

# Unpack the tar.gz files
for i in *gz ; do tar xvfz $i; done

## Preprocessing each fastq file:
# There should be one folder for each library, containing the R1 and R2 reads.
cd data
bash ../bash_scripts/preprocessing_to_bam.sh

## The following shows the individual steps for library R01_002 as an example.
# Unpack
gunzip R01_002_CGATGT_L001_R1.fastq.gz
mv R01_002_CGATGT_L001_R1.fastq R01_002_R1.fastq
gunzip R01_002_CGATGT_L001_R2.fastq.gz
mv R01_002_CGATGT_L001_R2.fastq R01_002_R2.fastq

# Preprocess: allow 0 Ns; 95% of the nucleotides of a trimmed read must have quality values greater than 20.
# The first nucleotide to keep is the 4th, the last one the 93rd.
python ../python_scripts/preprocess_fastq.py R01_002_R1.fastq R01_002_R1_trimmed.fastq R01_002_R1_trimmed.stats -n 0 -c 20 -p 0.95 -f 4 -l 93
python ../python_scripts/preprocess_fastq.py R01_002_R2.fastq R01_002_R2_trimmed.fastq R01_002_R2_trimmed.stats -n 0 -c 20 -p 0.95 -f 4 -l 93

# Remove adapters with cutadapt
cutadapt -O 10 -b AGATCGGAAGAGC --untrimmed-output=R01_002_R1_noAdaptUntrimmed.fastq -o R01_002_R1_noAdaptTrimmedOnly.fastq -e 0.05 R01_002_R1_trimmed.fastq > info_R1.txt
cutadapt -O 10 -b AGATCGGAAGAGC --untrimmed-output=R01_002_R2_noAdaptUntrimmed.fastq -o R01_002_R2_noAdaptTrimmedOnly.fastq -e 0.05 R01_002_R2_trimmed.fastq > info_R2.txt

# Find pairs
python ../python_scripts/find_PE_pairs.py R01_002_R1_noAdaptUntrimmed.fastq R01_002_R2_noAdaptUntrimmed.fastq
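# preprocessing_to_bam.sh repeats the per-library steps above for every sample. A minimal
# sketch of such a loop, assuming the R01_002_CGATGT_L001_* naming scheme described in the
# header (illustration only; the actual script may differ):
for R1 in *_L001_R1.fastq.gz ; do
    LIB=${R1%%_[ACGT]*}    # strip the barcode/lane suffix, e.g. R01_002
    echo "Preprocessing library $LIB"
    # ... gunzip, preprocess_fastq.py, cutadapt and find_PE_pairs.py exactly as shown above ...
done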
# Index the target regions for mapping with pBWA:
cd ../target_regions
bwa_0.5.9 index pseudotsuga_nimblegen_set2.fasta

# Mapping with pBWA v. 1.21009
cd ../data
mpirun --mca orte_base_help_aggregate 0 -n 8 pBWA aln -t 1 -n 5 -f R01_002_R1.sai ../target_regions/pseudotsuga_nimblegen_set2.fasta R01_002_R1_noAdaptUntrimmed.fastq
mpirun --mca orte_base_help_aggregate 0 -n 8 pBWA aln -t 1 -n 5 -f R01_002_R2.sai ../target_regions/pseudotsuga_nimblegen_set2.fasta R01_002_R2_noAdaptUntrimmed.fastq
mpirun --mca orte_base_help_aggregate 0 -n 8 pBWA sampe -f R01_002.sam -M ../target_regions/pseudotsuga_nimblegen_set2.fasta R01_002_R1.sai R01_002_R2.sai R01_002_R1_noAdaptUntrimmed.fastq R01_002_R2_noAdaptUntrimmed.fastq
rm -rf *sai

# Convert to BAM and sort, using samtools v.0.18
samtools view -bS R01_002.sam | samtools_0.18 sort - R01_002_sorted
# Keep only reads with mapping quality >= 1. You could also filter already during the conversion
# (samtools view -bSq 1 ...), but then the results differ slightly.
samtools view -bq 1 R01_002_sorted.bam -o mapq1_R01_002.bam

## SNP calling across all samples:
mkdir tmp_vcf
cd tmp_vcf/
# Call variants per reference sequence (18 regions in parallel) over the mapq1 BAMs of all samples:
samtools view -H ../Sample_R01_002/mapq1_R01_002.bam | grep "\@SQ" | sed 's/^.*SN://g' | cut -f 1 | xargs -I {} -n 1 -P 18 sh -c "samtools mpileup -f ../target_regions/pseudotsuga_nimblegen_set2.fasta -D -g -C 50 -S -A -r {} ../Sa*/mapq1_*.bam | bcftools view -bvcg - > tmp.{}.bcf"
# Filter the per-region BCFs and collect the variant lines (headers stripped), sorted by sequence name:
ls -l tmp.PMUSI* | awk '{print $NF}' | sort -k2 -t_ -n | xargs -I {} sh -c "bcftools view {} | vcfutils.pl varFilter -D 10000 | grep -v '^#' - " > tmp1.vcf
ls -l tmp.PMUSC* | awk '{print $NF}' | sort -k2 -t_ -n | xargs -I {} sh -c "bcftools view {} | vcfutils.pl varFilter -D 10000 | grep -v '^#' - " > tmp2.vcf
ls -l tmp.d* | awk '{print $NF}' | sort -k2 -t_ -n | xargs -I {} sh -c "bcftools view {} | vcfutils.pl varFilter -D 10000 | grep -v '^#' - " > tmp3.vcf
# Reuse one of the headers and concatenate:
bcftools view tmp.PMUSC_01762.bcf | grep '^#' > vcf.header
cat vcf.header tmp1.vcf tmp2.vcf tmp3.vcf > ../without_outlier.vcf
cd ..

# Get the coverages
bash ../bash_scripts/coverageBed_mapq1.sh
# Coverage per bp:
for i in Sampl*/*_covperbase.txt ; do awk '{sum+=$6}END {print sum/NR}' $i ; done
# Coverage per bp, ignoring zero-coverage positions:
for i in Sampl*/*_covperbase.txt ; do awk '{if ($3 != 0) print $6}' $i | awk '{sum+=$1}END {print sum/NR}' ; done

# Filter the vcf file
cat without_outlier.vcf | vcf-annotate -f VDB=0.015/4=0.05/1=0.01/Q=30 -H > filtered.vcf
python python_scripts/filterVCF.py filtered.vcf filtered_e1.vcf 10 0 -c 71 -n samples_sorted_by_locID.txt -e 1
# Shuffle the columns so that the trees are grouped according to the provenances
vcf-shuffle-cols -t newOrder_without_outlier.vcf filtered_e1.vcf > tmp.vcf
# Adjust the genotypes
python python_scripts/adjust_vcf_genotypes.py tmp.vcf ../filtered_e1.vcf
cd ..
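# Two quick sanity checks on the final VCF (not part of the original pipeline):
grep -vc '^#' filtered_e1.vcf                              # number of SNPs kept
grep -m 1 '^#CHROM' filtered_e1.vcf | awk '{print NF-9}'   # number of samples (a VCF has 9 fixed columns)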
### filtered_e1.vcf is now in your "start" folder (which should still contain python_scripts/, bash_scripts/, Rscripts/, target_regions/ and data/)

### Generate some additional files
# Count how many SNPs consist of which combination of the three genotypes:
grep '0/0' filtered_e1.vcf | grep -v '0/1' | grep -c -v '1/1'   # only 0/0
grep -v '0/0' filtered_e1.vcf | grep '0/1' | grep -c -v '1/1'   # only 0/1
grep -v '0/0' filtered_e1.vcf | grep -v '0/1' | grep -c '1/1'   # only 1/1
grep '0/0' filtered_e1.vcf | grep '0/1' | grep -c -v '1/1'      # 0/0 and 0/1
grep '0/0' filtered_e1.vcf | grep -v '0/1' | grep -c '1/1'      # 0/0 and 1/1
grep -v '0/0' filtered_e1.vcf | grep '0/1' | grep -c '1/1'      # 0/1 and 1/1
grep '0/0' filtered_e1.vcf | grep '0/1' | grep -c '1/1'         # all three genotypes
# Distribution of the number of heterozygous genotypes per SNP:
grep -v '^#' filtered_e1.vcf | awk '{cnt=0;for (i=1;i<=NF;i++) if ($i ~ /^0\/1/) cnt++; print cnt}' | sort -n | uniq -c

# Make some additional file formats:
python python_scripts/vcf2various.py filtered_e1.vcf filtered_e1.SNP 2SNP 10 0 -c 71 -e 1
python python_scripts/vcf2various.py filtered_e1.vcf filtered_e1 2structure 10 0 -c 71 -e 1
python python_scripts/vcf2various.py filtered_e1.vcf filtered_e1.distMat 2distMatrix 10 0 -c 71 -e 1
awk '{print $1}' filtered_e1.SNP | sed ':a;N;$!ba;s/\n/, /g' | sed 's/SNP-id, //g' > locus_names
sed 's/,//g' locus_names > locus_names_NoComma
# Add a population column derived from the two-letter provenance prefix of the sample names:
sed -e 's/LA\([^\t]*\)/LA\1\tLA/g' -e 's/RI\([^\t]*\)/RI\1\tRI/g' -e 's/CR\([^\t]*\)/CR\1\tCR/g' -e 's/TI\([^\t]*\)/TI\1\tTI/g' -e 's/AR\([^\t]*\)/AR\1\tAR/g' filtered_e1.distMat > filtered_e1_withPOP.distMat
sed -e 's/LA\([^\t]*\)/LA\1\tLA/g' -e 's/RI\([^\t]*\)/RI\1\tRI/g' -e 's/CR\([^\t]*\)/CR\1\tCR/g' -e 's/TI\([^\t]*\)/TI\1\tTI/g' -e 's/AR\([^\t]*\)/AR\1\tAR/g' filtered_e1.stru > filtered_e1_WITHPOP.stru
cat locus_names_NoComma filtered_e1_WITHPOP.stru > filtered_e1_WITHPOP_AND_LOCUS.stru
# The last file is a STRUCTURE-like file with population information and locus names.

### VCFTOOLS ####
#mkdir vcftools_results
cd vcftools_results
vcftools --vcf ../filtered_e1.vcf --TajimaD 10000 --relatedness --freq --counts --depth --site-depth --site-mean-depth --geno-depth --het --ld-window-bp 10000 --SNPdensity 10000 --site-pi --window-pi 10000 --weir-fst-pop ../AR_population.txt --weir-fst-pop ../CR_population.txt --weir-fst-pop ../LA_population.txt --weir-fst-pop ../RI_population.txt --weir-fst-pop ../TI_population.txt --out filtered_e1 --hardy
vcftools --vcf ../filtered_e1.vcf --out filtered_e1 --weir-fst-pop ../AR_population.txt --weir-fst-pop ../CR_population.txt --weir-fst-pop ../LA_population.txt --weir-fst-pop ../RI_population.txt --weir-fst-pop ../TI_population.txt --fst-window-size 10000

# Get Fst between the coastal populations (all AR individuals removed)
vcftools --vcf ../filtered_e1.vcf --out coastal --remove-indv AR_Phil_R01_002 --remove-indv AR_Phil_R03_004 --remove-indv AR_Phil_R05_005 --remove-indv AR_Phil_R07_006 --remove-indv AR_Phil_R09_007 --remove-indv AR_Schl_R01_004 --remove-indv AR_Schl_R03_005 --remove-indv AR_Schl_R05_006 --remove-indv AR_Schl_R07_007 --remove-indv AR_Schl_R09_012 --remove-indv AR_Sifi_R01_005 --remove-indv AR_Sifi_R03_006 --remove-indv AR_Sifi_R05_007 --remove-indv AR_Sifi_R11_006 --weir-fst-pop ../CR_population.txt --weir-fst-pop ../LA_population.txt --weir-fst-pop ../RI_population.txt --weir-fst-pop ../TI_population.txt
vcftools --vcf ../filtered_e1.vcf --out coastal --remove-indv AR_Phil_R01_002 --remove-indv AR_Phil_R03_004 --remove-indv AR_Phil_R05_005 --remove-indv AR_Phil_R07_006 --remove-indv AR_Phil_R09_007 --remove-indv AR_Schl_R01_004 --remove-indv AR_Schl_R03_005 --remove-indv AR_Schl_R05_006 --remove-indv AR_Schl_R07_007 --remove-indv AR_Schl_R09_012 --remove-indv AR_Sifi_R01_005 --remove-indv AR_Sifi_R03_006 --remove-indv AR_Sifi_R05_007 --remove-indv AR_Sifi_R11_006 --weir-fst-pop ../CR_population.txt --weir-fst-pop ../LA_population.txt --weir-fst-pop ../RI_population.txt --weir-fst-pop ../TI_population.txt --fst-window-size 10000

# Get Fst between the coastal and the AR provenance
vcftools --vcf ../filtered_e1.vcf --out AR_vs_coastal --weir-fst-pop ../AR_population.txt --weir-fst-pop ../coastal_population.txt
vcftools --vcf ../filtered_e1.vcf --out AR_vs_coastal --weir-fst-pop ../AR_population.txt --weir-fst-pop ../coastal_population.txt --fst-window-size 10000

# Per-provenance statistics (the five runs below differ only in the individual list; see the loop sketch after this block).
# Only AR
vcftools --vcf ../filtered_e1.vcf --indv AR_Phil_R01_002 --indv AR_Phil_R03_004 --indv AR_Phil_R05_005 --indv AR_Phil_R07_006 --indv AR_Phil_R09_007 --indv AR_Schl_R01_004 --indv AR_Schl_R03_005 --indv AR_Schl_R05_006 --indv AR_Schl_R07_007 --indv AR_Schl_R09_012 --indv AR_Sifi_R01_005 --indv AR_Sifi_R03_006 --indv AR_Sifi_R05_007 --indv AR_Sifi_R11_006 --out AR --TajimaD 10000 --site-pi --freq --counts
# Only CR
vcftools --vcf ../filtered_e1.vcf --out CR --indv CR_Phil_R02_013 --indv CR_Phil_R04_014 --indv CR_Phil_R06_015 --indv CR_Phil_R08_016 --indv CR_Phil_R11_012 --indv CR_Schl_R02_014 --indv CR_Schl_R04_015 --indv CR_Schl_R06_016 --indv CR_Schl_R08_018 --indv CR_Schl_R12_013 --indv CR_Sifi_R02_015 --indv CR_Sifi_R04_016 --indv CR_Sifi_R06_018 --indv CR_Sifi_R10_013 --indv CR_Sifi_R12_014 --TajimaD 10000 --site-pi --freq --counts
# Only LA
vcftools --vcf ../filtered_e1.vcf --out LA --indv LA_Phil_R01_006 --indv LA_Phil_R03_007 --indv LA_Phil_R05_012 --indv LA_Phil_R10_016 --indv LA_Phil_R12_019 --indv LA_Schl_R01_007 --indv LA_Schl_R03_012 --indv LA_Schl_R06_013 --indv LA_Schl_R10_015 --indv LA_Schl_R11_002 --indv LA_Sifi_R01_012 --indv LA_Sifi_R04_013 --indv LA_Sifi_R08_019 --indv LA_Sifi_R09_002 --TajimaD 10000 --site-pi --freq --counts
# Only RI
vcftools --vcf ../filtered_e1.vcf --out RI --indv RI_Phil_R02_016 --indv RI_Phil_R04_018 --indv RI_Phil_R08_013 --indv RI_Phil_R10_014 --indv RI_Phil_R12_015 --indv RI_Schl_R02_018 --indv RI_Schl_R08_014 --indv RI_Schl_R10_019 --indv RI_Schl_R12_016 --indv RI_Sifi_R02_019 --indv RI_Sifi_R06_014 --indv RI_Sifi_R08_015 --indv RI_Sifi_R10_018 --indv RI_Sifi_R12_018 --TajimaD 10000 --site-pi --freq --counts
# Only TI
vcftools --vcf ../filtered_e1.vcf --indv TI_Phil_R05_002 --indv TI_Phil_R06_019 --indv TI_Phil_R09_004 --indv TI_Phil_R11_004 --indv TI_Phil_R11_005 --indv TI_Schl_R04_019 --indv TI_Schl_R07_002 --indv TI_Schl_R07_004 --indv TI_Schl_R09_005 --indv TI_Sifi_R03_002 --indv TI_Sifi_R05_004 --indv TI_Sifi_R07_005 --indv TI_Sifi_R09_006 --indv TI_Sifi_R11_007 --out TI --TajimaD 10000 --site-pi --freq --counts
# Only coastal (all AR individuals removed)
vcftools --vcf ../filtered_e1.vcf --out coastal --remove-indv AR_Phil_R01_002 --remove-indv AR_Phil_R03_004 --remove-indv AR_Phil_R05_005 --remove-indv AR_Phil_R07_006 --remove-indv AR_Phil_R09_007 --remove-indv AR_Schl_R01_004 --remove-indv AR_Schl_R03_005 --remove-indv AR_Schl_R05_006 --remove-indv AR_Schl_R07_007 --remove-indv AR_Schl_R09_012 --remove-indv AR_Sifi_R01_005 --remove-indv AR_Sifi_R03_006 --remove-indv AR_Sifi_R05_007 --remove-indv AR_Sifi_R11_006 --TajimaD 10000 --site-pi --freq --counts --recode
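# The five per-provenance runs above differ only in the individual list. Assuming the
# ../*_population.txt files (already used for --weir-fst-pop) list exactly those
# individuals, an equivalent loop would be:
for POP in AR CR LA RI TI ; do
    vcftools --vcf ../filtered_e1.vcf --keep ../"$POP"_population.txt --out "$POP" --TajimaD 10000 --site-pi --freq --counts
done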
### Make a new vcf including only sites that are NOT homozygous in coastal.
### Used e.g. for a DAPC with only the coastal individuals.
# Positions at which all coastal individuals are homozygous reference (0/0) or homozygous alternative (1/1):
grep '0/0' coastal.recode.vcf | grep -v '0/1' | grep -v '1/1' | awk '{print $1, $2}' > coastal_homozygous00.pos
grep -v '0/0' coastal.recode.vcf | grep -v '0/1' | grep '1/1' | awk '{print $1, $2}' > coastal_homozygous11.pos
cat coastal_homozygous00.pos coastal_homozygous11.pos > coastal_homozygous.pos
vcftools --vcf ../filtered_e1.vcf --out coastal_noHom --remove-indv AR_Phil_R01_002 --remove-indv AR_Phil_R03_004 --remove-indv AR_Phil_R05_005 --remove-indv AR_Phil_R07_006 --remove-indv AR_Phil_R09_007 --remove-indv AR_Schl_R01_004 --remove-indv AR_Schl_R03_005 --remove-indv AR_Schl_R05_006 --remove-indv AR_Schl_R07_007 --remove-indv AR_Schl_R09_012 --remove-indv AR_Sifi_R01_005 --remove-indv AR_Sifi_R03_006 --remove-indv AR_Sifi_R05_007 --remove-indv AR_Sifi_R11_006 --recode --exclude-positions coastal_homozygous.pos
#mkdir ../coastal
mv coastal.recode.vcf coastal*hom* coastal_noHom.* ../coastal
cd ../coastal
python ../python_scripts/vcf2various.py coastal_noHom.recode.vcf coastal_noHom 2structure 0 0 -c 1 -e 1
python ../python_scripts/vcf2various.py coastal_noHom.recode.vcf coastal_noHom.SNP 2SNP 0 0 -c 1 -e 1
python ../python_scripts/vcf2various.py coastal_noHom.recode.vcf coastal_noHom.distMat 2distMatrix 0 0 -c 1 -e 1
awk '{print $1}' coastal_noHom.SNP | sed ':a;N;$!ba;s/\n/, /g' | sed 's/SNP-id, //g' > locus_names
sed 's/,//g' locus_names > locus_names_NoComma
sed -e 's/LA\([^\t]*\)/LA\1\tLA/g' -e 's/RI\([^\t]*\)/RI\1\tRI/g' -e 's/CR\([^\t]*\)/CR\1\tCR/g' -e 's/TI\([^\t]*\)/TI\1\tTI/g' -e 's/AR\([^\t]*\)/AR\1\tAR/g' coastal_noHom.distMat > coastal_noHom_withPOP.distMat
sed -e 's/LA\([^\t]*\)/LA\1\tLA/g' -e 's/RI\([^\t]*\)/RI\1\tRI/g' -e 's/CR\([^\t]*\)/CR\1\tCR/g' -e 's/TI\([^\t]*\)/TI\1\tTI/g' -e 's/AR\([^\t]*\)/AR\1\tAR/g' coastal_noHom.stru > coastal_noHom_WITHPOP.stru
cat locus_names_NoComma coastal_noHom_WITHPOP.stru > coastal_noHom_WITHPOP_AND_LOCUS.stru

# For the calculation of pi we need the number of bases covered in all individuals.
# We use the per-base output of coverageBed for this task (see the sketch below).
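# A sketch of what getNumberBasesWithCovInAll.sh has to determine: per PUT, the number of
# positions with zero coverage in at least one library. The column numbers (depth in
# column 6, position in column 5 of the *_covperbase.txt files) and the paths (relative
# to the folder holding the Sample_* directories) are assumptions here; the actual
# script may work differently.
cat Sampl*/*_covperbase.txt | awk '$6 == 0 {print $1 "\t" $5}' | sort -u | awk '{cnt[$1]++} END {for (p in cnt) print p "\t" cnt[p]}'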
cd ../vcftools_results
bash ../bash_scripts/getNumberBasesWithCovInAll.sh

# Create one file with several vcftools results, for the whole population and for the individual provenances
python ../python_scripts/getSummarizedResults.py -D filtered_e1.Tajima.D -F filtered_e1.windowed.weir.fst -f filtered_e1 ../target_regions/pseudotsuga_nimblegen_set2.fasta filtered_e1.tabular.txt filtered_e1.sites.pi -z PUTs_numberBases_0_inAtLeastOneLib.txt
for TMP in "AR" "CR" "LA" "RI" "TI" ; do python ../python_scripts/getSummarizedResults.py ../target_regions/pseudotsuga_nimblegen_set2.fasta "$TMP".tabular.txt "$TMP".sites.pi -z "$TMP"_PUTs_numberBases_0_inAtLeastOneLib.txt ; done

# Some analysis commands
# Tajima's D (column 4 of the tabular files): top 20 PUTs
sort -g -k4 filtered_e1.tabular.txt | tail -n 20 > filtered_e1.top20TajimaD.txt
for TMP in "AR" "CR" "LA" "RI" "TI" ; do sort -g -k4 $TMP.tabular.txt | tail -n 20 > $TMP.top20TajimaD.txt ; done
# Tajima's D: bottom ("flop") 20 PUTs
sort -rg -k4 filtered_e1.tabular.txt | awk '{if ($4 != "nan" && $4 != "pi") print $0}' | tail -n 20 > filtered_e1.flop20TajimaD.txt
sort -rg -k4 AR.tabular.txt | awk '{if ($4 != "nan" && $4 != "pi") print $0}' | tail -n 20 > AR.flop20TajimaD.txt
sort -rg -k4 CR.tabular.txt | awk '{if ($4 != "nan" && $4 != "pi") print $0}' | tail -n 20 > CR.flop20TajimaD.txt
sort -rg -k4 LA.tabular.txt | awk '{if ($4 != "nan" && $4 != "pi") print $0}' | tail -n 20 > LA.flop20TajimaD.txt
sort -rg -k4 RI.tabular.txt | awk '{if ($4 != "nan" && $4 != "pi") print $0}' | tail -n 20 > RI.flop20TajimaD.txt
sort -rg -k4 TI.tabular.txt | awk '{if ($4 != "nan" && $4 != "pi") print $0}' | tail -n 20 > TI.flop20TajimaD.txt

# Get mean and std. dev. of Tajima's D (a loop version is sketched after this block)
grep -v '^PUT' filtered_e1.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4} END {print sum/NR}'
grep -v '^PUT' filtered_e1.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4; sumsq+=$4*$4}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' AR.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4} END {print sum/NR}'
grep -v '^PUT' AR.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4; sumsq+=$4*$4}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' CR.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4} END {print sum/NR}'
grep -v '^PUT' CR.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4; sumsq+=$4*$4}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' LA.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4} END {print sum/NR}'
grep -v '^PUT' LA.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4; sumsq+=$4*$4}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' RI.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4} END {print sum/NR}'
grep -v '^PUT' RI.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4; sumsq+=$4*$4}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' TI.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4} END {print sum/NR}'
grep -v '^PUT' TI.tabular.txt | awk '{if ($4 != "nan") print $0}' | awk '{sum+=$4; sumsq+=$4*$4}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
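# The twelve commands above differ only in the input file; an equivalent loop that prints
# the mean and standard deviation of Tajima's D on one line per file:
for TMP in filtered_e1 AR CR LA RI TI ; do
    echo -n "$TMP "
    grep -v '^PUT' "$TMP".tabular.txt | awk '$4 != "nan" {sum+=$4; sumsq+=$4*$4; n++} END {print sum/n, sqrt(sumsq/n-(sum/n)**2)}'
done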
# pi: mean and std. dev. (same pattern, using column 3 of the tabular files)
grep -v '^PUT' filtered_e1.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3} END {print sum/NR}'
grep -v '^PUT' filtered_e1.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3; sumsq+=$3*$3}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' AR.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3} END {print sum/NR}'
grep -v '^PUT' AR.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3; sumsq+=$3*$3}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' CR.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3} END {print sum/NR}'
grep -v '^PUT' CR.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3; sumsq+=$3*$3}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' LA.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3} END {print sum/NR}'
grep -v '^PUT' LA.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3; sumsq+=$3*$3}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' RI.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3} END {print sum/NR}'
grep -v '^PUT' RI.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3; sumsq+=$3*$3}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
grep -v '^PUT' TI.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3} END {print sum/NR}'
grep -v '^PUT' TI.tabular.txt | awk '{if ($3 != "nan") print $0}' | awk '{sum+=$3; sumsq+=$3*$3}END {print sqrt(sumsq/NR-(sum/NR)**2)}'

# HWE
# CHROM and POS of all SNPs (both fields are needed for the comparisons below):
grep -v '^#' ../filtered_e1.vcf | awk '{print $1, $2}' > ../filtered_e1.names
vcftools --vcf ../filtered_e1.vcf --hwe 0.00001 --out e1_HWE00001 --recode
grep -v '^#' e1_HWE00001.recode.vcf | awk '{print $1 "\t" $2}' > HWE00001.e1.names

# Only AR
vcftools --vcf ../filtered_e1.vcf --indv AR_Phil_R01_002 --indv AR_Phil_R03_004 --indv AR_Phil_R05_005 --indv AR_Phil_R07_006 --indv AR_Phil_R09_007 --indv AR_Schl_R01_004 --indv AR_Schl_R03_005 --indv AR_Schl_R05_006 --indv AR_Schl_R07_007 --indv AR_Schl_R09_012 --indv AR_Sifi_R01_005 --indv AR_Sifi_R03_006 --indv AR_Sifi_R05_007 --indv AR_Sifi_R11_006 --out AR --hwe 0.001 --recode
grep -v '^#' AR.recode.vcf | awk '{print $1, $2}' > AR_HWE001.names
# SNPs that did not pass the HWE filter in AR (they occur only once in the concatenation):
cat AR_HWE001.names ../filtered_e1.names | sort | uniq -c | grep ' 1 ' | awk '{print $2 "\t" $3}' > AR_not_in_HWE_001.names
# Only CR
vcftools --vcf ../filtered_e1.vcf --out CR --indv CR_Phil_R02_013 --indv CR_Phil_R04_014 --indv CR_Phil_R06_015 --indv CR_Phil_R08_016 --indv CR_Phil_R11_012 --indv CR_Schl_R02_014 --indv CR_Schl_R04_015 --indv CR_Schl_R06_016 --indv CR_Schl_R08_018 --indv CR_Schl_R12_013 --indv CR_Sifi_R02_015 --indv CR_Sifi_R04_016 --indv CR_Sifi_R06_018 --indv CR_Sifi_R10_013 --indv CR_Sifi_R12_014 --hwe 0.001 --recode
grep -v '^#' CR.recode.vcf | awk '{print $1, $2}' > CR_HWE001.names
cat CR_HWE001.names ../filtered_e1.names | sort | uniq -c | grep ' 1 ' | awk '{print $2 "\t" $3}' > CR_not_in_HWE_001.names
# Only LA
vcftools --vcf ../filtered_e1.vcf --out LA --indv LA_Phil_R01_006 --indv LA_Phil_R03_007 --indv LA_Phil_R05_012 --indv LA_Phil_R10_016 --indv LA_Phil_R12_019 --indv LA_Schl_R01_007 --indv LA_Schl_R03_012 --indv LA_Schl_R06_013 --indv LA_Schl_R10_015 --indv LA_Schl_R11_002 --indv LA_Sifi_R01_012 --indv LA_Sifi_R04_013 --indv LA_Sifi_R08_019 --indv LA_Sifi_R09_002 --hwe 0.001 --recode
grep -v '^#' LA.recode.vcf | awk '{print $1, $2}' > LA_HWE001.names
cat LA_HWE001.names ../filtered_e1.names | sort | uniq -c | grep ' 1 ' | awk '{print $2 "\t" $3}' > LA_not_in_HWE_001.names
# Only RI
vcftools --vcf ../filtered_e1.vcf --out RI --indv RI_Phil_R02_016 --indv RI_Phil_R04_018 --indv RI_Phil_R08_013 --indv RI_Phil_R10_014 --indv RI_Phil_R12_015 --indv RI_Schl_R02_018 --indv RI_Schl_R08_014 --indv RI_Schl_R10_019 --indv RI_Schl_R12_016 --indv RI_Sifi_R02_019 --indv RI_Sifi_R06_014 --indv RI_Sifi_R08_015 --indv RI_Sifi_R10_018 --indv RI_Sifi_R12_018 --hwe 0.001 --recode
grep -v '^#' RI.recode.vcf | awk '{print $1, $2}' > RI_HWE001.names
cat RI_HWE001.names ../filtered_e1.names | sort | uniq -c | grep ' 1 ' | awk '{print $2 "\t" $3}' > RI_not_in_HWE_001.names
# Only TI
vcftools --vcf ../filtered_e1.vcf --indv TI_Phil_R05_002 --indv TI_Phil_R06_019 --indv TI_Phil_R09_004 --indv TI_Phil_R11_004 --indv TI_Phil_R11_005 --indv TI_Schl_R04_019 --indv TI_Schl_R07_002 --indv TI_Schl_R07_004 --indv TI_Schl_R09_005 --indv TI_Sifi_R03_002 --indv TI_Sifi_R05_004 --indv TI_Sifi_R07_005 --indv TI_Sifi_R09_006 --indv TI_Sifi_R11_007 --out TI --hwe 0.001 --recode
grep -v '^#' TI.recode.vcf | awk '{print $1, $2}' > TI_HWE001.names
cat TI_HWE001.names ../filtered_e1.names | sort | uniq -c | grep ' 1 ' | awk '{print $2 "\t" $3}' > TI_not_in_HWE_001.names

# Mean minor allele frequency
grep -v '^CHR' filtered_e1.frq | awk '{print $5 ":" $6}' | awk -F ":" '{if ($2>=$4) sum+=$4; else sum+=$2} END {print sum/NR}'

# FST (mean and std. dev. of the windowed Weir Fst):
# 2 pops: coastal vs. interior
grep -v '^CHROM' AR_vs_coastal.windowed.weir.fst | awk '{print $5}' | grep -v nan | awk '{sum+=$1}END {print sum/NR}'
grep -v '^CHROM' AR_vs_coastal.windowed.weir.fst | awk '{print $5}' | grep -v nan | awk '{sum+=$1; sumsq+=$1*$1}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
# 5 pops (all provenances)
grep -v '^CHROM' filtered_e1.windowed.weir.fst | awk '{print $5}' | grep -v nan | awk '{sum+=$1}END {print sum/NR}'
grep -v '^CHROM' filtered_e1.windowed.weir.fst | awk '{print $5}' | grep -v nan | awk '{sum+=$1; sumsq+=$1*$1}END {print sqrt(sumsq/NR-(sum/NR)**2)}'
# 4 pops: the coastal provenances
grep -v '^CHROM' coastal.windowed.weir.fst | awk '{print $5}' | grep -v nan | awk '{sum+=$1}END {print sum/NR}'
grep -v '^CHROM' coastal.windowed.weir.fst | awk '{print $5}' | grep -v nan | awk '{sum+=$1; sumsq+=$1*$1}END {print sqrt(sumsq/NR-(sum/NR)**2)}'

# Make pairwise Fst comparisons per PUT per provenance
bash ../bash_scripts/pairwise_Fst_perPUT.sh

# Exclusive SNPs:
# .snps lists the positions that segregate in a provenance (both allele counts non-zero),
# .notsnps the positions that are fixed there.
grep -v ALLELE AR.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 != 0 && $8 != 0) print $1, $2}' > AR.snps
grep -v ALLELE CR.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 != 0 && $8 != 0) print $1, $2}' > CR.snps
grep -v ALLELE LA.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 != 0 && $8 != 0) print $1, $2}' > LA.snps
grep -v ALLELE RI.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 != 0 && $8 != 0) print $1, $2}' > RI.snps
grep -v ALLELE TI.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 != 0 && $8 != 0) print $1, $2}' > TI.snps
grep -v ALLELE AR.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 == 0 || $8 == 0) print $1, $2}' > AR.notsnps
grep -v ALLELE CR.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 == 0 || $8 == 0) print $1, $2}' > CR.notsnps
grep -v ALLELE LA.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 == 0 || $8 == 0) print $1, $2}' > LA.notsnps
grep -v ALLELE RI.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 == 0 || $8 == 0) print $1, $2}' > RI.notsnps
grep -v ALLELE TI.frq.count | awk 'BEGIN {FS="[\t:]"} {if ($6 == 0 || $8 == 0) print $1, $2}' > TI.notsnps
# A SNP exclusive to one provenance segregates there and is fixed in the other four,
# i.e. its position occurs 5 times in the concatenation:
cat AR.snps CR.notsnps LA.notsnps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.snps LA.notsnps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.notsnps LA.snps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.notsnps LA.notsnps RI.snps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.notsnps LA.notsnps RI.notsnps TI.snps | sort | uniq -c | grep -c ' 5 '
cd ..

### Synonymous/non-synonymous SNPs and BAYENV ###
# Make the snpfile:
grep -v '^#' filtered_e1.vcf | awk '{print $1,$2,$4,$5}' > filtered_e1.snpfile
# Classify the SNPs as synonymous/non-synonymous via Pinus taeda.
# To get the P. taeda sequences:
# wget http://loblolly.ucdavis.edu/bipod/ftp/Genome_Data/genome/pinerefseq/Pita/v1.01/Pita_Annotation_v2/ptaeda.v1.01.scaffolds.trimmed.all.maker.proteins.highq_whole.fasta
# makeblastdb -title PtaedaProt -out PtaedaProt -in ptaeda.v1.01.scaffolds.trimmed.all.maker.proteins.highq_whole.fasta -dbtype prot -parse_seqids
# blastx -db PtaedaProt -query ../target_regions/pseudotsuga_nimblegen_set2.fasta -evalue 1e-3 -outfmt 5 -out set2_vs_PtaedaProt.xml -max_target_seqs 5 -num_threads 20

### BAYENV2 for the five-population model:
#mkdir bayenv_FIVEPOPS
cd bayenv_FIVEPOPS
python ../python_scripts/getSynonNonsynonSNPs.py ../target_regions/blast_results_set2/set2_vs_PtaedaProt.xml ../filtered_e1.snpfile ../target_regions/pseudotsuga_nimblegen_set2.fasta vs_Ptaeda
# Idea: take only one SNP per PUT to ensure no, or only loose, LD
# (awk '!x[$1]++' keeps the first line per PUT name in column 1):
awk '!x[$1]++' synon_vs_Ptaeda.txt > oneSynonSNPPerPUT_Ptaeda.txt
# Prepare the baySNP files
python ../python_scripts/synonNonsynon2bayenvSNP.py ../filtered_e1.snpfile allSNPs ../vcftools_results/AR.frq.count ../vcftools_results/CR.frq.count ../vcftools_results/LA.frq.count ../vcftools_results/RI.frq.count ../vcftools_results/TI.frq.count -n 5
# Get the file for the first Bayenv2 step, consisting of one SNP per PUT
python ../python_scripts/synonNonsynon2bayenvSNP.py oneSynonSNPPerPUT_Ptaeda.txt oneSynonSNPPerPUT_Ptaeda ../vcftools_results/AR.frq.count ../vcftools_results/CR.frq.count ../vcftools_results/LA.frq.count ../vcftools_results/RI.frq.count ../vcftools_results/TI.frq.count -n 5
cat oneSynonSNPPerPUT_Ptaeda/* > synonSNPOnePerPUT_Ptaeda.txt
# Get the covariance matrix (first Bayenv2 step); keep the last printed matrix (5 populations -> 5 lines)
bayenv2 -i synonSNPOnePerPUT_Ptaeda.txt -p 5 -k 100000 -r 87254 > matrix_oneSNPperPUT_Ptaeda.out
tail -n 6 matrix_oneSNPperPUT_Ptaeda.out | head -n 5 > Ptaeda_lastMatrix_FIVEPOPS.out
cp Ptaeda_lastMatrix_FIVEPOPS.out ../all_env.txt ../bash_scripts/run_bayenv_FIVEPOPS.sh allSNPs/
# Have a look at the bash script to check out a "parallel" version!
cd allSNPs
bash run_bayenv_FIVEPOPS.sh
cat tmp_*xtx > Ptaeda_bayenv_allSNPs_FIVEPOPS_parallel.xtx
cat tmp_*bf > Ptaeda_bayenv_allSNPs_FIVEPOPS_parallel.bf
cd ..
sort -g -r -k 2 allSNPs/Ptaeda_bayenv_allSNPs_FIVEPOPS_parallel.xtx | head -n 100 > top100xtx_FIVEPOPS.txt
sort -g -r -k 2 allSNPs/Ptaeda_bayenv_allSNPs_FIVEPOPS_parallel.xtx | head -n 200 > top200xtx_FIVEPOPS.txt
# How many of the LOSITAN/BayeScan outliers are among the top 100 resp. top 200 XtX SNPs?
awk '{print $2"_"$3}' ../compare_LOSITAN_BayeScan/outlier_in_LOSITAN_BayeScan_FIVEPOPs.txt | xargs -n 1 -P 1 -I {} sh -c "grep '{}' top100xtx_FIVEPOPS.txt" | wc -l
awk '{print $2"_"$3}' ../compare_LOSITAN_BayeScan/outlier_in_LOSITAN_BayeScan_FIVEPOPs.txt | xargs -n 1 -P 1 -I {} sh -c "grep '{}' top200xtx_FIVEPOPS.txt" | wc -l

### BAYENV2 under a two-population assumption:
#mkdir ../bayenv_TWOPOPS
cd ../bayenv_TWOPOPS
# Pool the four coastal provenances (columns 2-5) into one population; AR (column 1) is the other:
awk '{print $1 "\t" $2+$3+$4+$5 "\t"}' ../bayenv_FIVEPOPS/synonSNPOnePerPUT_Ptaeda.txt > synonSNPOnePerPUT_Ptaeda_counts_TWOPOPS.txt
# Get the covariance matrix (first Bayenv2 step)
bayenv2 -i synonSNPOnePerPUT_Ptaeda_counts_TWOPOPS.txt -p 2 -k 100000 -r 87254 > matrix_oneSNPperPUT_Ptaeda_TWOPOPS.out
tail -n 3 matrix_oneSNPperPUT_Ptaeda_TWOPOPS.out | head -n 2 > Ptaeda_lastMatrix_TWOPOPS.out
#mkdir all_SNPs
# Prepare the baySNP files
python ../python_scripts/synonNonsynon2bayenvSNP.py ../filtered_e1.snpfile allSNPs ../vcftools_results/AR.frq.count ../vcftools_results/coastal.frq.count -n 2
cp Ptaeda_lastMatrix_TWOPOPS.out ../all_env.txt ../bash_scripts/run_bayenv_TWOPOPS.sh allSNPs
cd allSNPs
bash run_bayenv_TWOPOPS.sh
cat tmp_*xtx > Ptaeda_bayenv_allSNPs_TWOPOPS_parallel.xtx
cat tmp_*bf > Ptaeda_bayenv_allSNPs_TWOPOPS_parallel.bf
cd ..
sort -g -r -k 2 allSNPs/Ptaeda_bayenv_allSNPs_TWOPOPS_parallel.xtx | head -n 100 > top100xtx_TWOPOPS.txt
sort -g -r -k 2 allSNPs/Ptaeda_bayenv_allSNPs_TWOPOPS_parallel.xtx | head -n 200 > top200xtx_TWOPOPS.txt
awk '{print $2"_"$3}' ../compare_LOSITAN_BayeScan/outlier_in_LOSITAN_BayeScan_TWOPOPs.txt | xargs -n 1 -P 1 -I {} sh -c "grep '{}' top100xtx_TWOPOPS.txt" | wc -l
awk '{print $2"_"$3}' ../compare_LOSITAN_BayeScan/outlier_in_LOSITAN_BayeScan_TWOPOPs.txt | xargs -n 1 -P 1 -I {} sh -c "grep '{}' top200xtx_TWOPOPS.txt" | wc -l
cd ..

### ADMIXTURE
#mkdir admixture
vcftools --vcf filtered_e1.vcf --plink --out filtered_e1
plink --file filtered_e1 --make-bed --out filtered_e1
cp filtered_e1.bed filtered_e1.bim filtered_e1.fam admixture
cd admixture
bash ../bash_scripts/run_admixture.sh

### BayeScan
#mkdir bayescan
cd bayescan
#mkdir FIVEPOPS
cd FIVEPOPS
# Three independent runs with prior odds 100:
BayeScan2.1_linux64bits filtered_e1.txt -o bayescan_out_odds100_1.txt -threads 11 -pr_odds 100
BayeScan2.1_linux64bits filtered_e1.txt -o bayescan_out_odds100_2.txt -threads 11 -pr_odds 100
BayeScan2.1_linux64bits filtered_e1.txt -o bayescan_out_odds100_3.txt -threads 11 -pr_odds 100
#BayeScan2.1_linux64bits filtered_e1.txt -o bayescan_out_odds10.txt -threads 11 -pr_odds 10
#BayeScan2.1_linux64bits filtered_e1.txt -o bayescan_out_odds5000.txt -threads 5 -pr_odds 5000
#BayeScan2.1_linux64bits filtered_e1.txt -o bayescan_out_odds1000.txt -threads 10 -pr_odds 1000
cd ..
#mkdir TWOPOPS
cd TWOPOPS
BayeScan2.1_linux64bits twoPops.txt -o bayescan_out_odds100_1.txt -threads 8 -pr_odds 100
BayeScan2.1_linux64bits twoPops.txt -o bayescan_out_odds100_2.txt -threads 8 -pr_odds 100
BayeScan2.1_linux64bits twoPops.txt -o bayescan_out_odds100_3.txt -threads 8 -pr_odds 100
cd ..
# Map the IDs in the BayeScan output back to the SNPs (BayeScan numbers the loci by input order):
awk '{print NR, $0}' ../filtered_e1.snpfile > numbered_filtered_e1.snpfile
# Get the loci that were outliers in all three runs with the R script bayescan_getOutlier.R (see Rscripts/).
# Copy that file into the TWOPOPS or FIVEPOPS folder and run it there.
cd FIVEPOPS
Rscript bayescan_getOutlier.R
# Number of distinct outlier SNPs:
grep -v 'id' outlier_FDR01_3runs.txt | awk '{print $2}' | sort | uniq | wc -l
cd ../..

### LOSITAN
# 1) LOSITAN takes the Genepop data format, but it only reads the data correctly when each locus
#    is listed on a separate row. (Genepop itself also accepts a data file in which the loci are
#    separated by commas.)
# Recode the genotypes numerically. The nucleotide substitutions also mangle the provenance
# prefixes of the sample names (AR -> 02R, TI -> 04I, CR -> 03R, LA -> L02), so restore them:
sed -e 's/G/01/g' -e 's/A/02/g' -e 's/C/03/g' -e 's/T/04/g' -e 's/\///g' filtered_e1.distMat | sed -e 's/02R/AR/g' -e 's/04I/TI/g' -e 's/03R/CR/g' -e 's/L02/LA/g' | sort > tmp2
# Add POP and commas; prepend the locus names (one per row) and a header line:
grep -v '^#' filtered_e1.vcf | awk '{print $1 "_" $2}' > locus_names_one_per_row
cat locus_names_one_per_row tmp2 > filtered_e1.popgenome
echo "Add a header line here" > tmp
cat tmp filtered_e1.popgenome > tmp2
mv tmp2 filtered_e1.popgenome
#mkdir lositan
cd lositan
# Run LOSITAN and save the results (select all with Ctrl+A, copy with Ctrl+C, paste with Ctrl+V)
# into files called lociList_100000sim_1.txt, lociList_100000sim_2.txt, ...
# Parameters: 5 populations, size 14, 100000 simulations, FDR 0.1; see the publication.
# Check the results for outliers (how many loci were flagged in how many of the runs):
grep Outlier lociList_100000sim_* | awk '{if ($4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | sort | uniq -c | awk '{print $1}' | sort | uniq -c
# Get the loci that were outliers in all 5 runs:
grep Outlier lociList_100000sim_* | awk '{if ($4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | sort | uniq -c | grep ' 5 ' | awk '{print $2}' | awk 'BEGIN {FS="_"} {print $1 "_" $2 "\t" $3}' > outlier_FIVEPOPS_in_5runs.txt
# TWOPOPS case, i.e. an assumed number of populations of 2; get the loci that were outliers in all 5 runs:
grep Outlier lociList_TWOPOPS_100000sim_* | awk '{if ($4 > 0 && $4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | sort | uniq -c | grep ' 5 ' | awk '{print $2}' | awk 'BEGIN {FS="_"} {print $1 "_" $2 "\t" $3}' > outlier_TWOPOPS_in_5runs.txt
# Number of distinct PUTs with outliers:
grep Outlier lociList_100000sim_* | awk '{if ($4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | awk 'BEGIN {FS="_"} {print $1 "_" $2 "\t" $3}' | sort | uniq -c | grep ' 5 ' | awk '{print $2}' | sort | uniq -c | wc -l
# Number of outliers per PUT:
grep Outlier lociList_100000sim_* | awk '{if ($4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | awk 'BEGIN {FS="_"} {print $1 "_" $2 "\t" $3}' | sort | uniq -c | grep ' 5 ' | awk '{print $2}' | sort | uniq -c | awk '{print $1}' | sort | uniq -c
cd ..
### Compare the LOSITAN and BayeScan results:
#mkdir compare_LOSITAN_BayeScan
cd compare_LOSITAN_BayeScan
# FIVEPOPS case: loci that were LOSITAN outliers in all 5 runs AND BayeScan outliers:
grep Outlier ../lositan/lociList_100000sim_* | awk '{if ($4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | awk 'BEGIN {FS="_"} {print $1 "_" $2 "\t" $3}' | sort | uniq -c | grep ' 5 ' | awk '{print $2 "\t" $3}' | sort | uniq | xargs -n 1 -P 1 -I {} sh -c "grep '{}' ../bayescan/FIVEPOPS/outlier_FDR01_3runs.txt" > outlier_in_LOSITAN_BayeScan_FIVEPOPs.txt
awk '{print $2}' outlier_in_LOSITAN_BayeScan_FIVEPOPs.txt | sort | uniq > FIVEPOPS_only_PUTs.txt
python ../python_scripts/getSeqs_largefiles.py ../target_regions/pseudotsuga_nimblegen_set2.fasta FIVEPOPS_PUTs.fasta -f FIVEPOPS_only_PUTs.txt
blastx -query FIVEPOPS_PUTs.fasta -db nr -evalue 1e-6 -max_target_seqs 20 -num_threads 15 -outfmt 11 -out FIVEPOPS_vs_nr_e1-6_20seqs.asn
blast_formatter -archive FIVEPOPS_vs_nr_e1-6_20seqs.asn -out FIVEPOPS_vs_nr_e1-6_20seqs.txt
# TWOPOPS case
grep Outlier ../lositan/lociList_TWOPOPS_100000sim_* | awk '{if ($4 > 0 && $4 != "-100.0") print $0}' | awk 'BEGIN {FS=":"} {print $2}' | awk '{print $1}' | awk 'BEGIN {FS="_"} {print $1 "_" $2 "\t" $3}' | sort | uniq -c | grep ' 5 ' | awk '{print $2 "\t" $3}' | sort | uniq | xargs -n 1 -P 1 -I {} sh -c "grep -P '{}' ../bayescan/TWOPOPS/outlier_FDR01_3runs.txt" > outlier_in_LOSITAN_BayeScan_TWOPOPs.txt
awk '{print $2}' outlier_in_LOSITAN_BayeScan_TWOPOPs.txt | sort | uniq > TWOPOPS_only_PUTs.txt
python ../python_scripts/getSeqs_largefiles.py ../target_regions/pseudotsuga_nimblegen_set2.fasta TWOPOPS_PUTs.fasta -f TWOPOPS_only_PUTs.txt
blastx -query TWOPOPS_PUTs.fasta -db nr -evalue 1e-6 -max_target_seqs 20 -num_threads 15 -outfmt 11 -out TWOPOPS_vs_nr_e1-6_20seqs.asn
blast_formatter -archive TWOPOPS_vs_nr_e1-6_20seqs.asn -out TWOPOPS_vs_nr_e1-6_20seqs.txt
cd ..

### Private alleles etc. for msABC
cd vcftools_results
## Private alleles:
# .snps lists the positions that segregate within a provenance (per-site pi != 0), .notsnps the monomorphic ones:
awk '{if ($3 != 0 ) print $1, $2}' AR.sites.pi | grep -v CHROM > AR.snps
awk '{if ($3 == 0 ) print $1, $2}' AR.sites.pi | grep -v CHROM > AR.notsnps
awk '{if ($3 != 0 ) print $1, $2}' CR.sites.pi | grep -v CHROM > CR.snps
awk '{if ($3 == 0 ) print $1, $2}' CR.sites.pi | grep -v CHROM > CR.notsnps
awk '{if ($3 != 0 ) print $1, $2}' LA.sites.pi | grep -v CHROM > LA.snps
awk '{if ($3 == 0 ) print $1, $2}' LA.sites.pi | grep -v CHROM > LA.notsnps
awk '{if ($3 != 0 ) print $1, $2}' RI.sites.pi | grep -v CHROM > RI.snps
awk '{if ($3 == 0 ) print $1, $2}' RI.sites.pi | grep -v CHROM > RI.notsnps
awk '{if ($3 != 0 ) print $1, $2}' TI.sites.pi | grep -v CHROM > TI.snps
awk '{if ($3 == 0 ) print $1, $2}' TI.sites.pi | grep -v CHROM > TI.notsnps
# Provenance-exclusive SNPs: segregating in one provenance and monomorphic in the other four,
# i.e. the position occurs 5 times in the concatenation (see the helper sketch after this block):
cat AR.snps LA.notsnps CR.notsnps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.snps LA.notsnps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.notsnps LA.snps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.notsnps LA.notsnps RI.snps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cat AR.notsnps CR.notsnps LA.notsnps RI.notsnps TI.snps | sort | uniq -c | grep -c ' 5 '
# Coastal-exclusive SNPs (segregating in at least one coastal provenance, monomorphic in AR):
cat CR.snps LA.snps RI.snps TI.snps | sort | uniq | cat - AR.notsnps | sort | uniq -c | grep -c ' 2 '
## Alleles shared between all 5 provenances:
cat AR.snps CR.snps LA.snps RI.snps TI.snps | sort | uniq -c | grep -c ' 5 '
## Fixed alleles (monomorphic in all 5 provenances):
cat AR.notsnps LA.notsnps CR.notsnps RI.notsnps TI.notsnps | sort | uniq -c | grep -c ' 5 '
cd ..
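# The five "exclusive" one-liners above follow a single pattern; a small helper sketch
# (hypothetical function, not part of the repository's scripts; run it inside vcftools_results/):
private_to () {
    # usage: private_to AR CR LA RI TI  -> number of SNPs segregating only in the first provenance
    local target=$1 ; shift
    local n=$(( $# + 1 ))   # a matching position occurs once per provenance file
    cat "$target".snps "${@/%/.notsnps}" | sort | uniq -c | grep -c " $n "
}
private_to AR CR LA RI TI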
### msABC
# The folder msABC contains the necessary scripts and the data used in our simulations.
# Here we describe the data generation.
cd msABC
python ../python_scripts/generate_msABC_input_4_simulation.py ../vcftools_results/filtered_e1.tabular.txt
# Collect the Tajima's D values and the summary table for the selected PUTs:
cat msABC_simulation_input.txt | grep -v '^id' | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep '{}' ../vcftools_results/AR.Tajima.D " > tmp
head -n 1 ../vcftools_results/AR.Tajima.D | cat - tmp > 4msABC_interior.Tajima.D
cat msABC_simulation_input.txt | grep -v '^id' | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep '{}' ../vcftools_results/coastal.Tajima.D " > tmp
head -n 1 ../vcftools_results/coastal.Tajima.D | cat - tmp > 4msABC_coastal.Tajima.D
cat msABC_simulation_input.txt | grep -v '^id' | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep '{}' ../vcftools_results/filtered_e1.tabular.txt " > tmp
head -n 1 ../vcftools_results/filtered_e1.tabular.txt | cat - tmp > 4msABC.tabular.txt
# Restrict the .snps/.notsnps lists to the PUTs used for msABC:
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/AR.snps" > 4msABC_AR.snps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/AR.notsnps" > 4msABC_AR.notsnps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/CR.snps" > 4msABC_CR.snps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/CR.notsnps" > 4msABC_CR.notsnps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/LA.snps" > 4msABC_LA.snps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/LA.notsnps" > 4msABC_LA.notsnps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/RI.snps" > 4msABC_RI.snps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/RI.notsnps" > 4msABC_RI.notsnps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/TI.snps" > 4msABC_TI.snps
grep -v 'PUT' 4msABC.tabular.txt | awk '{print $1}' | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/TI.notsnps" > 4msABC_TI.notsnps
# Private alleles (interior = AR):
cat 4msABC_AR.snps 4msABC_LA.notsnps 4msABC_CR.notsnps 4msABC_RI.notsnps 4msABC_TI.notsnps | sort | uniq -c | grep ' 5 ' | awk '{print $2, $3}' > privateI.txt
cat 4msABC_AR.snps 4msABC_LA.notsnps 4msABC_CR.notsnps 4msABC_RI.notsnps 4msABC_TI.notsnps | sort | uniq -c | grep ' 5 ' | awk '{print $2}' | sort | uniq -c > privateI_num.txt
# Private alleles (coastal):
cat 4msABC_CR.snps 4msABC_LA.snps 4msABC_RI.snps 4msABC_TI.snps | sort | uniq | cat - 4msABC_AR.notsnps | sort | uniq -c | grep ' 2 ' | awk '{print $2, $3}' > privateC.txt
cat 4msABC_CR.snps 4msABC_LA.snps 4msABC_RI.snps 4msABC_TI.snps | sort | uniq | cat - 4msABC_AR.notsnps | sort | uniq -c | grep ' 2 ' | awk '{print $2}' | sort | uniq -c > privateC_num.txt
# Alleles shared between coastal and interior:
cat 4msABC_CR.snps 4msABC_LA.snps 4msABC_RI.snps 4msABC_TI.snps | sort | uniq | cat - 4msABC_AR.snps | sort | uniq -c | grep ' 2 ' | awk '{print $2, $3}' > shared.txt
cat 4msABC_CR.snps 4msABC_LA.snps 4msABC_RI.snps 4msABC_TI.snps | sort | uniq | cat - 4msABC_AR.snps | sort | uniq -c | grep ' 2 ' | awk '{print $2}' | sort | uniq -c > shared_num.txt
# Fixed alleles:
cat 4msABC_AR.notsnps 4msABC_LA.notsnps 4msABC_CR.notsnps 4msABC_RI.notsnps 4msABC_TI.notsnps | sort | uniq -c | grep ' 5 ' | awk '{print $2, $3}' > antiRef.txt
cat 4msABC_AR.notsnps 4msABC_LA.notsnps 4msABC_CR.notsnps 4msABC_RI.notsnps 4msABC_TI.notsnps | sort | uniq -c | grep ' 5 ' | awk '{print $2}' | sort | uniq -c > antiRef_num.txt
awk '{print $1}' 4msABC.tabular.txt | grep -v CHROM | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/filtered_e1.Tajima.D" | awk '{sum+=$3}END{print sum}'
awk '{print $1}' 4msABC.tabular.txt | grep -v CHROM | xargs -n 1 -P 1 -I {} sh -c "grep {} ../vcftools_results/filtered_e1.weir.fst" | awk '{print $1, $2}' > SNPS_in_PUTS.txt
# Run msABC
bash ../bash_scripts/run_msABC.sh
cd ..

### Make the DB
# psql commands:
createdb doug_SeqCapExp

DROP TABLE IF EXISTS SNPWISE;
CREATE TABLE SNPWISE (
    id serial PRIMARY KEY,
    seq_name varchar,
    pos integer,
    ref char,
    var char,
    fst double precision,
    xtx_Ptaeda2pops double precision,
    xtx_Ptaeda5pops double precision,
    loading double precision,
    inhwe00001 boolean DEFAULT TRUE,
    LOSITAN_5pops_outlier boolean DEFAULT FALSE,
    BAYESCAN_5pops_outlier boolean DEFAULT FALSE,
    LOSITAN_2pops_outlier boolean DEFAULT FALSE,
    BAYESCAN_2pops_outlier boolean DEFAULT FALSE
);

DROP TABLE IF EXISTS PUTWISE;
CREATE TABLE PUTWISE (
    id serial PRIMARY KEY,
    seq_name varchar,
    bps integer,
    pi double precision,
    tajimasd double precision,
    weighted_fst double precision,
    mean_fst double precision
);

# Initial run for the DB
python python_scripts/results2SNPWISE_DB.py USERNAME -a -s filtered_e1.snpfile -d doug_SeqCapExp
# Fill the DB (the loadings passed via -l come from the DAPC)
python python_scripts/results2SNPWISE_DB.py USERNAME -d doug_SeqCapExp -f vcftools_results/filtered_e1.weir.fst -p bayenv_TWOPOPS/allSNPs/Ptaeda_bayenv_allSNPs_TWOPOPS_parallel.xtx -P bayenv_FIVEPOPS/allSNPs/Ptaeda_bayenv_allSNPs_FIVEPOPS_parallel.xtx -H vcftools_results/HWE/HWE00001.names -L lositan/outlier_TWOPOPS_in_5runs.txt -F lositan/outlier_FIVEPOPS_in_5runs.txt -b bayescan/TWOPOPS/outlier_FDR01_3runs.txt -B bayescan/FIVEPOPS/outlierNames_odds100_FDR01_3runs.txt # -l R_stuff/loadings.txt
python python_scripts/results2PUTWISE_DB.py USERNAME -d doug_SeqCapExp -i filtered_e1.tabular.txt
# Load the dump into the database:
gunzip doug_SeqCapExp.dump.gz
psql -U postgres -h localhost -d doug_SeqCapExp < doug_SeqCapExp.dump
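# Example query against the filled database (illustrative only; it uses the columns defined above):
psql -d doug_SeqCapExp -c "SELECT seq_name, pos, fst FROM SNPWISE WHERE LOSITAN_5pops_outlier AND BAYESCAN_5pops_outlier ORDER BY fst DESC LIMIT 20;"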