#SCRIPTS for "Runs of homozygosity reveal past bottlenecks and contemporary inbreeding across diverging island populations of a bird" #Claudia A. Martin, Eleanor C. Sheppard, Juan Carlos Illera, Alex Suh, Krystyna Nadachowska-Brzyska, Lewis G. Spurgin, David S. Richardson #10-09-2022 #This readme file describes the code used to produce analyses detialed in the above manuscript. These files must be created prior to running R scripts to produce outputs. # Raw sequence reads are provided separtely, as well as the three final VCF data sets as detailed in the manuscript. # Code used to map raw reads to the reference Berthelot's pipit (BP) genome and undertake variant calling are provided separately and were run on an individual basis (example EH1_gVCF.sh for one idividual in El Heirro). # gvcf files were joint called for the BP using GATK GenomicsDBImport, GenotypeGVCFs and SortVcf using array jobs (due to the large number of contigs) to produce the unmapped VCF. # The read me file for these steps and mapping of contigs to Zebra finch chromosomes is detialed in "gvcf to vcf.txt" # For the Tawny pipit gvcf format was converted to VCF GATK GenotypeGVCFs. # FILE SETS from the point of generating unmapped VCF files # BERTHELOTS PIPIT - the raw VCF file has joint called variants # Convert the raw Berthelot's pipit VCF file to bfiles for editing to add the Chromosome locations using Satusma output plink --vcf Berthelots_raw.vcf --keep-allele-order --real-ref-alleles --allow-extra-chr --recode --make-bed --out Berthelots_raw # Remap bim file locations using Satsuma output map of BP refrence genome to Zebra finch. Script "Satsuma_output_vcf.R" edited for new input and output files # Produces "__MAPPED_variants.bim" # Rename the fam, bed, map and ped files to match the new mapped bim version e.g. mv _raw_variants.bed _MAPPED_variants.bed # Convert back to VCF format plink --bfile _MAPPED_variants --keep-allele-order --real-ref-alleles --chr-set 55 --allow-extra-chr --make-bed --out _MAPPED_variants plink --bfile _MAPPED_variants --keep-allele-order --real-ref-alleles --chr-set 55 --allow-extra-chr --recode vcf --out _MAPPED_variants # Remove VCF header, Convert chromosome names to codes # Re-add quality information into the VCF using script "merge_MAPPED_and_QUALITY_VCF.R" # Edit VCF header to include codes for chomosomes and also INFO and FORMAT information cat vcf_header_QUALITY vcf_header_CHROM_CODES vcf_header_cols merging_MAPPED_and_QUALITY_VCF/noheader_Berthelots.vcf > Berthelots_MAPPED_and_QUALITY.vcf # Mapped and Quality file is called "Berthelots_MAPPED_and_QUALITY.vcf" #The following scripts relate to generation of datasets and analyses from the inital VCF file "Berthelots.vcf" # Calcuate the depth of coverage for each individual BEFORE then AFTER filtering steps vcftools --vcf ../Berthelots.vcf --depth --out depth vcftools --vcf ../Berthelots.vcf --depth --out depth # Calculate individual missingness for each individual BEFORE and AFTER filtering steps vcftools --vcf Berthelots.vcf --imiss --out missingness vcftools --vcf Berthelots.vcf --imiss --out missingness # Filter the VCF file to ensure high quality SNPs are retained. vcftools --vcf Berthelots.vcf --remove-indels --not-chr 0 --mac 1 --not-chr 31 --max-alleles 2 --max-meanDP 44 --min-meanDP 10 --minQ 30 --max-missing-count 4 --recode --recode-INFO-all --out Berthelots # Make bed files for filtering and future analysis plink --vcf ../Berthelots.vcf --chr-set 55 --double-id --make-bed --out Berthelots # Filter VCF files by individual. vcftools --vcf ../Berthelots.vcf --indv LZ_87_LZ_87 --recode --recode-INFO-all --out LZ_87_FILTERED # ALL PIPITS - the raw VCF file with all Berthelot's pipit samples and the Tawny pipit sample joint called # Repeat the process as above but with --max-mean-DP 45 instead # TAWNY PIPIT - this individual is handeled slightly differently as joint genotyping does not need to take place. # Convert the raw Tawny pipit VCF file to bfiles for editing to add the Chromosome locations using Satusma output plink --vcf Taw_462_raw_variants.vcf --keep-allele-order --real-ref-alleles --allow-extra-chr --recode --make-bed --out Taw_462_raw_variants # Remap bim file locations using same output from Satsuma as was used for Berthelot's samples. Script "Satsuma_output_vcf.R" edited for new input and output files # Produces "Taw_462__MAPPED_variants.bim" # Rename the fam, bed, map and ped files to match the new mapped bim version e.g. mv Taw_462_raw_variants.bed Taw_462_MAPPED_variants.bed # Convert back to VCF format plink --bfile Taw_462_MAPPED_variants --keep-allele-order --real-ref-alleles --chr-set 55 --allow-extra-chr --make-bed --out Taw_462_MAPPED_variants plink --bfile Taw_462_MAPPED_variants --keep-allele-order --real-ref-alleles --chr-set 55 --allow-extra-chr --recode vcf --out Taw_462_MAPPED_variants # Remove VCF header sed '/^#/d' Taw_462_MAPPED_variants.vcf > noheader_Taw_462_MAPPED_variants.vcf # Convert chromsome names to codes so that the file can later be recognised by Plink and the format matches the Berthelot's pipit VCF. Use "Genome_chromosome_codes.txt" # Scripts on the cluster # Recover the quality information from original VCF # Create mapped VCF file with quality information retained and chromosome codes in first column like in the Berthelot's pipit file. # File called "Taw_462_MAPPED_and_QUALITY_variants.vcf" # Quality trim Tawny VCF file vcftools --vcf Taw_462_MAPPED_and_QUALITY_variants.vcf --max-alleles 2 --remove-indels --minQ 30 --min-meanDP 10 --max-meanDP 55 --not-chr 0 --not-chr 31 --recode --recode-INFO-all --out Tawny # FILTERED variant depth vcftools --vcf Tawny.vcf --depth --out FILTERED_variant_depth # ---------- Analyses ------------- # ###### DIVERSITY ###### # Heterozygosity and VCFtools/Plink inbreeding by individual vcftools --vcf ../../Berthelots.vcf --het --out Inv_het plink --vcf ../../Berthelots.vcf --double-id --chr-set 55 --het --out Plink_het # Nucleotide diversity per individual vcftools --vcf ../../Berthelots.vcf --window-pi 500000 --keep TF_17 --window-pi-step 100000 --out 500k_100_TF_17 # In R quantile(LZ_87$PI, c(0.05, 0.5, 0.95)) # ROH detection 250Kb followed by 1Mb plink --vcf ../../Berthelots.vcf --double-id --chr-set 55 --homozyg --homozyg-gap 200 --homozyg-density 200 --homozyg-kb 250 --homozyg-window-snp 50 --homozyg-window-missing 5 --homozyg-window-het 2 --out ROH_het_250Kb plink --vcf ../../Berthelots.vcf --double-id --chr-set 55 --homozyg --homozyg-gap 200 --homozyg-density 200 --homozyg-kb 1000 --homozyg-window-snp 50 --homozyg-window-missing 5 --homozyg-window-het 2 --out ROH_het_1Mb plink --vcf Tawny.vcf --chr-set 55 --homozyg --allow-extra-chr --homozyg-gap 200 --homozyg-density 200 --homozyg-kb 250 --homozyg-window-snp 50 --homozyg-window-missing 5 --homozyg-window-het 2 --out Taw_ROH_250Kb ####### DIVERGENCE ###### # PCA plink --vcf ../../All_Pipits.vcf --allow-extra-chr --chr-set 55 --pca 2 --out pca_ALL_pipits plink --vcf ../Berthelots.vcf --double-id --chr-set 55 --pca 2 --out pca_Berthelot # Pairwise Fst between islands, 50 Kb windows # within-archipelgo comparisons: vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/PS.name --weir-fst-pop ../FAM_FILES/M.names --fst-window-size 50000 --out PSvM vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/LZ.names --weir-fst-pop ../FAM_FILES/TF.names --fst-window-size 50000 --out LZvTF vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/LZ.names --weir-fst-pop ../FAM_FILES/EH.names --fst-window-size 50000 --out LZvEH vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/EH.names --weir-fst-pop ../FAM_FILES/TF.names --fst-window-size 50000 --out EHvTF # between archipelagos vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/LZ.names --weir-fst-pop ../FAM_FILES/SG.names --fst-window-size 50000 --out LZvSG vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/LZ.names --weir-fst-pop ../FAM_FILES/M.names --fst-window-size 50000 --out LZvM vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/LZ.names --weir-fst-pop ../FAM_FILES/PS.names --fst-window-size 50000 --out LZvPS vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/EH.names --weir-fst-pop ../FAM_FILES/SG.names --fst-window-size 50000 --out EHvSG vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/EH.names --weir-fst-pop ../FAM_FILES/M.names --fst-window-size 50000 --out EHvM vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/EH.names --weir-fst-pop ../FAM_FILES/PS.names --fst-window-size 50000 --out EHvPS vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/TF.names --weir-fst-pop ../FAM_FILES/SG.names --fst-window-size 50000 --out TFvSG vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/TF.names --weir-fst-pop ../FAM_FILES/M.names --fst-window-size 50000 --out TFvM vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/TF.names --weir-fst-pop ../FAM_FILES/PS.names --fst-window-size 50000 --out TFvPS vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/SG.names --weir-fst-pop ../FAM_FILES/M.names --fst-window-size 50000 --out SGvM vcftools --vcf ../Berthelots.vcf --weir-fst-pop ../FAM_FILES/SG.names --weir-fst-pop ../FAM_FILES/PS.names --fst-window-size 50000 --out PSvSG # Then in R, set lower bound for Fst to zero MvPS_50k <- read.table("MvPS.windowed.weir.fst",header = T) MvPS_50k$MEAN_FST[MvPS_50k$MEAN_FST < 0] <- 0 mean(MvPS_50k$MEAN_FST) ####### Population History Modelling, PSMC ###### (example given) samtools mpileup -C50 -uf a.lines.fasta EH_161.bam | bcftools call -c | vcfutils.pl vcf2fq -d 10 -D 62 -Q 25 | gzip > diploid.fq.gz fq2psmcfa -q20 diploid.fq.gz > diploid.psmcfa psmc -N30 -t5 -r1 -p "4+30*2+4+6+10" -o diploid.psmc diploid.psmcfa ----- # Put all individuals on one plot # With the tawny pipit cat ../EH_161/PSMC/diploid.psmc ../TF_17/PSMC/PSMC/diploid.psmc ../LZ_93/PSMC/diploid.psmc ../SG_278/PSMC/diploid.psmc ../M_249/PSMC/diploid.psmc ../PS_506/PSMC/diploid.psmc ../Taw_462/PSMC/diploid.psmc > combined.psmc psmc_plot.pl -M 'EH,TF,LZ,SG,M,PS,Tawny' -g 2.2 -u 2.3e-09 combined combined.psmc # Just the Berthelot's pipit islands. Some individuals. cat ../EH_161/PSMC/diploid.psmc ../TF_17/PSMC/PSMC/diploid.psmc ../LZ_93/PSMC/diploid.psmc ../SG_278/PSMC/diploid.psmc ../M_249/PSMC/diploid.psmc ../PS_506/PSMC/diploid.psmc > combined.psmc psmc_plot.pl -M 'EH,TF,LZ,SG,M,PS' -g 2.2 -u 2.3e-09 combined combined.psmc # All individuals. Generation time used is ten times the estimate to allow showing recent pop sizes, even if the estimates are not accurate cat ../../LZ_87/PSMC/diploid.psmc ../../LZ_93/PSMC/diploid.psmc ../../TF_17/PSMC/PSMC/diploid.psmc ../../TF_6/PSMC/TF_6/diploid.psmc ../../EH_179/PSMC/EH_179/diploid.psmc ../../EH_161/PSMC/diploid.psmc ../../M_249/PSMC/diploid.psmc ../../M_305/PSMC/diploid.psmc ../../PS_506/PSMC/diploid.psmc ../../SG_278/PSMC/diploid.psmc ../../SG_300/PSMC/diploid.psmc > all_Berthelots.psmc psmc_plot.pl -M 'LZ, LZ , TF, TF, EH, EH, M, M, PS, SG, SG' -g 22.0 -u 2.3e-09 all_Berthelots all_Berthelots.psmc # note the output figure is produced with the extension .eps Just download this and click to open it and Macs will naturally convert to a PDF. These files were then edited for colour in Adobe Illustrator.