# List of commands used in the pipeline described in Therkildsen and Palumbi: "Practical low-coverage genome-wide sequencing of hundreds of individually barcoded samples for population and evolutionary genomics in non-model species" published in Molecular Ecology Resources.
# Note: The sequence of bioinformatic processing steps described below was designed to carefully evaluate the quality of sequencing data generated with a new library preparation method. For routine applications (where it may not be necessary to quantify the loss of data at each individual filtering step), several steps can be skipped, optimized, or combined for a more streamlined pipeline.

#### CLEANING THE READS ####

# Loop over each sample to complete the read cleaning procedure
for SAMPLE in `cat SampleList.txt`; do

# Remove duplicate sequences (both forward and reverse read identical) with fastuniq. We included this step to get an estimate of duplication rates prior to mapping, but if duplicates are instead marked after mapping (as done later here with Picard Tools MarkDuplicates), this step can be skipped.
# Make input file lists as described in the fastuniq manual
fastuniq -i '/FastUniq_InputFiles/'$SAMPLE'.txt' -t q -o $SAMPLE'_FastUniq_1.fastq' -p $SAMPLE'_FastUniq_2.fastq' -c 0

# Remove adapter sequence with Trimmomatic (NexteraPE.fa is a list of the adapter/index sequences we used for our libraries)
java -jar /Trimmomatic/trimmomatic-0.32.jar PE -threads 18 -phred33 $SAMPLE'_FastUniq_1.fastq' $SAMPLE'_FastUniq_2.fastq' $SAMPLE'_AdapterClipped_forward_paired.fastq' $SAMPLE'_AdapterClipped_forward_unpaired.fastq' $SAMPLE'_AdapterClipped_reverse_paired.fastq' $SAMPLE'_AdapterClipped_reverse_unpaired.fastq' ILLUMINACLIP:NexteraPE.fa:2:30:10:4:true

# Trim off low-quality sequence (this could be done in the same step as adapter removal, but we wanted to quantify the amount of data lost at each filtering step)
java -jar /Trimmomatic/trimmomatic-0.32.jar PE -threads 18 -phred33 $SAMPLE'_AdapterClipped_forward_paired.fastq' $SAMPLE'_AdapterClipped_reverse_paired.fastq' $SAMPLE'_AdapterClipped_QualFiltered_forward_paired.fastq' $SAMPLE'_AdapterClipped_QualFiltered_forward_unpaired.fastq' $SAMPLE'_AdapterClipped_QualFiltered_reverse_paired.fastq' $SAMPLE'_AdapterClipped_QualFiltered_reverse_unpaired.fastq' LEADING:10 SLIDINGWINDOW:4:15 MINLEN:50

# Merge overlapping ends of paired reads with FLASH
flash -M 85 $SAMPLE'_AdapterClipped_QualFiltered_forward_paired.fastq' $SAMPLE'_AdapterClipped_QualFiltered_reverse_paired.fastq' --output-prefix $SAMPLE --output-directory /FLASH/
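# Optional check, not part of the original pipeline: tally how many read pairs FLASH merged vs. left unmerged for this sample (each fastq record spans four lines; prepend the /FLASH/ output directory to the file names if running from elsewhere)
echo $SAMPLE merged: $((`wc -l < $SAMPLE'.extendedFrags.fastq'` / 4)) unmerged: $((`wc -l < $SAMPLE'.notCombined_1.fastq'` / 4))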
# Add reads orphaned in the quality filtering (where the other mate of the pair did not meet the filtering criteria) to the file with the merged reads, to obtain a combined file with all single-end reads. The extendedFrags.fastq files are the merged read pair output from FLASH.
cat $SAMPLE'.extendedFrags.fastq' $SAMPLE'_AdapterClipped_QualFiltered_forward_unpaired.fastq' $SAMPLE'_AdapterClipped_QualFiltered_reverse_unpaired.fastq' > $SAMPLE'_SingleEndQualFiltered.fastq'

# Remove potential contaminant reads (reads that map to a database of potential contaminant sequences)
# For the paired-end data (notCombined_1.fastq and notCombined_2.fastq are the forward and reverse reads that were not merged by FLASH)
bowtie2 -q --phred33 --sensitive -I 0 -X 1000 --fr -x "/ContaminationDB/Human/GCA_000001405.15_GRCh38_top-level" -1 $SAMPLE".notCombined_1.fastq" -2 $SAMPLE".notCombined_2.fastq" -S $SAMPLE"_Paired_bowtie2_Human.sam"

# Make a list of reads that mapped to the contaminant database and should be removed
samtools view -S -F 4 $SAMPLE"_Paired_bowtie2_Human.sam" | cut -f 1 | uniq > $SAMPLE"_Paired_bowtie2_Human_RemoveList.txt"

# In a similar way, map the reads and create RemoveLists for each additional contaminant database (we used databases of sequences from bacteria, viruses, rRNA, and Artemia spp.; see Manuscript Supplementary Note 2)

# Combine all the paired-end RemoveLists and remove the contaminant reads
cat $SAMPLE"_Paired_bowtie2_Bacteria_PartA_RemoveList.txt" $SAMPLE"_Paired_bowtie2_Bacteria_PartB_RemoveList.txt" $SAMPLE"_Paired_bowtie2_Bacteria_PartC_RemoveList.txt" $SAMPLE"_Paired_bowtie2_Human_RemoveList.txt" $SAMPLE"_Paired_bowtie2_Virus_RemoveList.txt" $SAMPLE"_Paired_bowtie2_rRNA_RemoveList.txt" $SAMPLE"_Paired_bowtie2_Artemia_RemoveList.txt" | sort | uniq > $SAMPLE"_Paired_Complete_RemoveList.txt"

# Use the script in the Dryad repository to remove contaminant sequences from the fastq files
RemoveSequencesFromFastq.py $SAMPLE'.notCombined_1.fastq' $SAMPLE'_Paired_Complete_RemoveList.txt' $SAMPLE'_Paired_NoCont_1.fastq'
RemoveSequencesFromFastq.py $SAMPLE'.notCombined_2.fastq' $SAMPLE'_Paired_Complete_RemoveList.txt' $SAMPLE'_Paired_NoCont_2.fastq'

# For single-end reads (read pairs that became merged or orphaned in the quality filtering):
bowtie2 -q --phred33 --sensitive -x "/ContaminationDB/Human/GCA_000001405.15_GRCh38_top-level" -U $SAMPLE"_SingleEndQualFiltered.fastq" -S $SAMPLE"_SingleEndQualFiltered_bowtie2_Human.sam"
samtools view -S -F 4 $SAMPLE"_SingleEndQualFiltered_bowtie2_Human.sam" | cut -f 1 | uniq > $SAMPLE"_SingleEndQualFiltered_bowtie2_Human_RemoveList.txt"

# In a similar way, map the reads and create RemoveLists for each additional contaminant database (we used databases of sequences from bacteria, viruses, rRNA, and Artemia spp.; see Manuscript Supplementary Note 2)

# Combine all the single-end RemoveLists and remove the contaminant reads
cat $SAMPLE"_SingleEndQualFiltered_bowtie2_Bacteria_PartA_RemoveList.txt" $SAMPLE"_SingleEndQualFiltered_bowtie2_Bacteria_PartB_RemoveList.txt" $SAMPLE"_SingleEndQualFiltered_bowtie2_Bacteria_PartC_RemoveList.txt" $SAMPLE"_SingleEndQualFiltered_bowtie2_Human_RemoveList.txt" $SAMPLE"_SingleEndQualFiltered_bowtie2_Virus_RemoveList.txt" $SAMPLE"_SingleEndQualFiltered_bowtie2_rRNA_RemoveList.txt" $SAMPLE"_SingleEndQualFiltered_bowtie2_Artemia_RemoveList.txt" | sort | uniq > $SAMPLE"_SingleEndQualFiltered_Complete_RemoveList.txt"
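# As an illustration only (the next step uses RemoveSequencesFromFastq.py from the Dryad repository): dropping every fastq record whose read name appears in a RemoveList amounts to a filter like the awk sketch below. The _NoCont_awk.fastq output name is just a placeholder, and the sketch assumes a non-empty RemoveList.
awk 'NR==FNR {remove[$1]; next} FNR%4==1 {name=substr($1,2); keep=!(name in remove)} keep' $SAMPLE'_SingleEndQualFiltered_Complete_RemoveList.txt' $SAMPLE'_SingleEndQualFiltered.fastq' > $SAMPLE'_SingleEndQualFiltered_NoCont_awk.fastq'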
# Use the script in the Dryad repository to remove contaminant sequences from the fastq file
RemoveSequencesFromFastq.py $SAMPLE'_SingleEndQualFiltered.fastq' $SAMPLE'_SingleEndQualFiltered_Complete_RemoveList.txt' $SAMPLE'_SingleEndQualFiltered_NoCont.fastq'

done

# Count the number of reads and bases remaining in each file after each filtering step:
# Counting the number of reads in a fastq file (repeat with each fastq file):
grep "@HWI" $SAMPLE'_SingleEndQualFiltered_NoCont.fastq' | wc -l
# Counting the number of bases in a fastq file (repeat with each fastq file):
grep -A 1 "@HWI" $SAMPLE'_SingleEndQualFiltered_NoCont.fastq' | grep "^[ACGTN]" | tr -d "\n" | wc -m

#### MAPPING THE GENOMIC READS TO THE REFERENCE TRANSCRIPTOME ####

# Loop over each sample to map them separately to the reference
for SAMPLE in `cat SampleList.txt`; do

# Extract relevant values from a table of sample ID, library ID, and platform unit (here in columns 5, 6, and 4, respectively) for each sequenced library
SAMPLE_ID=`grep -P "${SAMPLE}\t" Sample_Pop_Library_Overview.txt | cut -f 5`
LIB_ID=`grep -P "${SAMPLE}\t" Sample_Pop_Library_Overview.txt | cut -f 6`
PU=`grep -P "${SAMPLE}\t" Sample_Pop_Library_Overview.txt | cut -f 4`

# Map the paired-end reads
bowtie2 -q --phred33 --very-sensitive-local -p 16 -I 0 -X 1500 --fr --rg-id $SAMPLE --rg SM:$SAMPLE_ID --rg LB:$LIB_ID --rg PU:$PU --rg PL:ILLUMINA -x Menidia_TranscriptomeRef -1 $SAMPLE'_Paired_NoCont_1.fastq' -2 $SAMPLE'_Paired_NoCont_2.fastq' -S $SAMPLE'_Paired_bowtie2_Menidia_TranscriptomeRef.sam'

# Convert to bam file for storage
samtools view -bS -F 4 $SAMPLE'_Paired_bowtie2_Menidia_TranscriptomeRef.sam' > $SAMPLE'_Paired_bowtie2_Menidia_TranscriptomeRef.bam'
rm $SAMPLE'_Paired_bowtie2_Menidia_TranscriptomeRef.sam'

# Map the single-end reads
bowtie2 -q --phred33 --very-sensitive-local -p 16 --rg-id $SAMPLE --rg SM:$SAMPLE_ID --rg LB:$LIB_ID --rg PU:$PU --rg PL:ILLUMINA -x Menidia_TranscriptomeRef -U $SAMPLE'_SingleEndQualFiltered_NoCont.fastq' -S $SAMPLE'_SingleEnd_bowtie2_Menidia_TranscriptomeRef.sam'

# Convert to bam file for storage
samtools view -bS -F 4 $SAMPLE'_SingleEnd_bowtie2_Menidia_TranscriptomeRef.sam' > $SAMPLE'_SingleEnd_bowtie2_Menidia_TranscriptomeRef.bam'
rm $SAMPLE'_SingleEnd_bowtie2_Menidia_TranscriptomeRef.sam'

# Filter the bam files to remove poorly mapped reads (non-unique mappings and mappings with a quality score < 20)
samtools view -h -q 20 $SAMPLE'_Paired_bowtie2_Menidia_TranscriptomeRef.bam' | grep -v XS:i | samtools view -buS - | samtools sort - $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_sorted'
samtools view -h -q 20 $SAMPLE'_SingleEnd_bowtie2_Menidia_TranscriptomeRef.bam' | grep -v XS:i | samtools view -buS - | samtools sort - $SAMPLE'_SingleEnd_to_TranscriptomeRef_MinQ20_sorted'

# Filter the paired mappings by pairing status (we separate these out to be able to count the proportion of mapped pairs falling in each of these categories)
# Extract the concordantly mapped read pairs
samtools view -h -b -f 0x002 $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_sorted.bam' > $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP.bam'

# Extract reads where only one mate mapped (unpaired)
samtools view -h $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_sorted.bam' | perl -n -e 'print $_ if (/^\@/ || /YT:Z:UP/) ' | samtools view -bS - > $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_AllUP.bam'
# Unpaired reads could also be extracted by filtering on the bit flag (samtools view -bu -f 0x008), but that filtering leaves out pairs where one mate mapped with poor quality (and therefore got filtered out, so that the mapping was effectively unpaired)
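# Optional check, not in the original pipeline: the proportion of reads in each bowtie2 pairing category can also be tallied directly from the YT:Z tag in the quality-filtered paired mappings (CP = concordant pair, DP = discordant pair, UP = pair that aligned neither concordantly nor discordantly, UU = unpaired read)
samtools view $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_sorted.bam' | grep -o "YT:Z:.." | sort | uniq -c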
# Extract discordantly mapped read pairs
samtools view -h $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_sorted.bam' | perl -n -e 'print $_ if (/^\@/ || /YT:Z:DP/) ' | samtools view -bS - > $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_DiscordantlyMappedPairs.bam'
# Discordantly mapped read pairs could also be extracted by filtering on the bit flag (samtools view -b -F 0x0008 $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_sorted.bam' | samtools view -b -F 0x002 -), but that includes pairs where one mate mapped with poor quality (and therefore got filtered out, so that the mapping was effectively unpaired)

# Closer examination of the $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP.bam' files revealed a non-negligible number of concordantly mapped read pairs for which the mapped ends overlapped (i.e. they had not been merged by FLASH). To avoid double-counting these overlapping sections in SNP calling and genotype likelihood estimation, we used clipOverlap to soft-clip overlapping read ends (maintaining only the read with the highest quality score in the overlapping region).
/bamUtil/bin/bam clipOverlap --in $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP.bam' --out $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP_Clipped.bam' --unmapped --storeOrig OC --stats

# Remove the reads that became unmapped in the clipping
samtools view -hb -F 4 $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP_Clipped.bam' > $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP_Clipped_MappedOnly.bam'

# Merge the bam files to create a single bam file for each sample with all the reads used in downstream analysis: the overlap-clipped concordantly mapped reads, the unpaired mapped reads (where only one end of the pair mapped to the transcriptome reference), and the single-end/orphaned mapped reads (the merged pairs and reads for which the other mate in the pair had been lost during the quality filtering). We did not include discordantly mapped pairs in any downstream analysis.
samtools merge -f $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle.bam' $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP_Clipped_MappedOnly.bam' $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_AllUP.bam' $SAMPLE'_SingleEnd_to_TranscriptomeRef_MinQ20_sorted.bam'

done

### Counting the number of retained mapped reads for different stages of the filtering ###
# For each intermediate bam file, we counted the number of mapped reads with samtools, e.g.
samtools view $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP_Clipped_MappedOnly.bam' | wc -l
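# A compact alternative, not in the original listing: count the retained reads in each of a sample's intermediate bam files in one pass with samtools view -c
for BAM in $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP.bam' $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_CP_Clipped_MappedOnly.bam' $SAMPLE'_paired_to_TranscriptomeRef_MinQ20_AllUP.bam' $SAMPLE'_SingleEnd_to_TranscriptomeRef_MinQ20_sorted.bam' $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle.bam'; do
echo $BAM `samtools view -c $BAM`
done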
# Estimate the fragment length for each concordantly mapped read pair (this has to be done on unmerged reads) based on their mapping coordinates on the reference
for SAMPLE in `cat SampleList.txt`; do

# Extract relevant values from the table of sample ID, library ID, and platform unit for each sequenced library
SAMPLE_ID=`grep -P "${SAMPLE}\t" Sample_Pop_Library_Overview.txt | cut -f 5`
LIB_ID=`grep -P "${SAMPLE}\t" Sample_Pop_Library_Overview.txt | cut -f 6`
PU=`grep -P "${SAMPLE}\t" Sample_Pop_Library_Overview.txt | cut -f 4`

bowtie2 -q --phred33 --very-sensitive-local -p 16 -I 0 -X 5000 --fr --rg-id $SAMPLE --rg SM:$SAMPLE_ID --rg LB:$LIB_ID --rg PU:$PU --rg PL:ILLUMINA -x Menidia_TranscriptomeRef -1 $SAMPLE'_AdapterClipped_forward_paired.fastq' -2 $SAMPLE'_AdapterClipped_reverse_paired.fastq' -S $SAMPLE'_AdapterClippedReads_to_TranscriptomeRef.sam'

# Filter the mappings to remove poorly mapped reads (non-unique mappings and mappings with a quality score < 20)
samtools view -hS -q 20 $SAMPLE'_AdapterClippedReads_to_TranscriptomeRef.sam' | grep -v XS:i | samtools view -buS - | samtools sort - $SAMPLE'_AdapterClippedReads_to_TranscriptomeRef_MinQ20_sorted'

# Extract the fragment lengths
samtools view -f 0x002 $SAMPLE'_AdapterClippedReads_to_TranscriptomeRef_MinQ20_sorted.bam' | cut -f 9 > $SAMPLE'_FragmentLengths_Concordant.txt'

done

#### REMOVE DUPLICATES ####

# Loop over each sample to estimate duplication rates separately in each
for SAMPLE in `cat SampleList.txt`; do
java -Xmx60g -jar picard.jar MarkDuplicates I=$SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle.bam' O=$SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup.bam' M=$SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dupstat.txt' VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATES=true
done

#### REALIGN AROUND INDELS ####

# This is done across all samples at once
# Create a list of potential indels
java -Xmx40g -jar GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R Menidia_TranscriptomeRef.fa \
-I AllSamples_To_TranscriptomeRef_dedup_sorted.list \
-o AllSamples_forIndelRealigner.intervals \
-drf BadMate

# Run the indel realigner tool
java -Xmx40g -jar GenomeAnalysisTK.jar \
-T IndelRealigner \
-R Menidia_TranscriptomeRef.fa \
-I AllSamples_To_TranscriptomeRef_dedup_sorted.list \
-targetIntervals AllSamples_forIndelRealigner.intervals \
--consensusDeterminationModel USE_READS \
--nWayOut _realigned.bam

#### GET COVERAGE STATS ####

# Loop over each sample to obtain coverage statistics for each
for SAMPLE in `cat SampleList.txt`; do
/bedtools2/bin/coverageBed -hist -abam $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned.bam' -b Menidia_TranscriptomeRef.bed > $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned.hist'
samtools mpileup -A -B -Q 0 -f Menidia_TranscriptomeRef.fa $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned.bam' > $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned.mpileup'
cut -f 4 $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned.mpileup' > $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned_mpileup_Depths.txt'
done

# The coverage histograms and the total depth columns from the mpileup files were summarized in R.
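# Optional quick summary, not in the original listing (the full summaries were done in R): mean depth across the covered sites for a given sample
awk '{sum+=$1} END {if (NR>0) print sum/NR}' $SAMPLE'_to_TranscriptomeRef_MinQ20_PairedAndSingle_dedup_realigned_mpileup_Depths.txt'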
#### SNP CALLING ####

# Call SNPs across all samples
angsd_0.910/angsd -b ListAllSampleBamFiles.txt -anc Menidia_TranscriptomeRef.fa -out AllSamples_MinQ20 -dosaf 1 -GL 1 -doGlf 3 -doMaf 1 -doMajorMinor 1 -doPost 1 -doVcf 1 -doCounts 1 -doDepth 1 -dumpCounts 1 -P 32 -remove_bads 0 -uniqueOnly 0 -only_proper_pairs 0 -SNP_pval 1e-6 -setMinDepth 300 -setMaxDepth 3028 -minInd 300 -minQ 20 -minMaf 0.01
# We use -only_proper_pairs 0 because we want to retain reads for which only one end of a pair is mapped (since we are mapping to a transcriptome, not the whole genome), and -remove_bads 0 and -uniqueOnly 0 because we have already filtered the bam files for poorly mapped reads.

#### PCA ####

# The ngsPopGen program requires output from an older version of angsd
# First create a list of the SNPs called across all samples above by reformatting the AllSamples_MinQ20.mafs file into a regions file, AllSamples_MinQ20_SNPList.regions (see the angsd documentation)
angsd_v0.700/angsd -b BamList_GeographicSamples.txt -out GeographicSamples_v7 -P 16 -GL 1 -doGeno 32 -doPost 1 -doMajorMinor 2 -doCounts 1 -doMaf 1 -remove_bads 0 -uniqueOnly 0 -only_proper_pairs 0 -setMinDepth 80 -minMaf 0.01 -setMaxDepth 1000 -rf AllSamples_MinQ20_SNPList.regions
zcat GeographicSamples_v7.geno.gz > GeographicSamples_v7.geno
zcat GeographicSamples_v7.mafs.gz > GeographicSamples_v7.mafs
N_SITES=`tail -n +2 GeographicSamples_v7.mafs | wc -l`
ngsPopGen/ngsCovar -probfile GeographicSamples_v7.geno -outfile GeographicSamples_v7.covar -nind 200 -nsites $N_SITES -call 0 -minmaf 0.01
# The resulting covariance matrix was decomposed with the eigen() function in R

#### MITOCHONDRIAL GENOME ANALYSIS ####

# Extract a consensus sequence based on mapping to a published reference mitochondrial genome sequence for the species
# First map all samples to the reference mitochondrial genome with bowtie2 and filter the bam files using exactly the same procedure and parameter values as described above for mapping the reads to the full transcriptome reference.
# Call SNPs in the mitochondrial genome for each sample separately with FreeBayes in haploid mode
for SAMPLE in `cat SampleList.txt`; do
freebayes -f Menidia_MtGenome.fa -p 1 --min-alternate-count 3 --genotype-qualities $SAMPLE'_To_MtGenome_MinQ20_PairedAndSingle_dedup_sorted.bam' > $SAMPLE'_To_MtGenome_MinQ20.vcf'

# Remove multi-allelic sites by picking the most common alternate allele with vcfflatten
/vcflib/bin/vcfflatten $SAMPLE'_To_MtGenome_MinQ20.vcf' > $SAMPLE'_To_MtGenome_MinQ20_flattened.vcf'

# Generate a fasta consensus sequence based on the variant data in the vcf file with vcf2fasta
/vcflib/bin/vcf2fasta -f Menidia_MtGenome.fa -P 1 $SAMPLE'_To_MtGenome_MinQ20_flattened.vcf'

done
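# Optional tally, not in the original listing: number of variant records retained per sample in the flattened mitochondrial vcf files
for SAMPLE in `cat SampleList.txt`; do
echo $SAMPLE `grep -cv "^#" $SAMPLE'_To_MtGenome_MinQ20_flattened.vcf'`
done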
# De novo assemble the mitochondrial genome from each individual
for SAMPLE in `cat SampleList.txt`; do

# Create an interleaved fastq input file with the script available at MITObim/misc_scripts/interleave-fastqgz-MITOBIM.py
# Select a sequence seed (e.g. a COI sequence or whatever is available for the species)

# MITObim in de novo mode
MITObim/MITObim_1.8.pl -sample $SAMPLE -ref Menidia_COI_KC015657.1 -readpool $SAMPLE'_AdapterClipped_QualFiltered_interleaved_paired.fastq' --quick Menidia_COI_KC015657.1.fa -end 100 --clean --pair --denovo --mirapath /Programs/mira_4.0.2_linux-gnu_x86_64_static/bin/

# MITObim in mapping mode
MITObim_1.8.pl -sample $SAMPLE -ref Menidia_COI_KC015657 -readpool $SAMPLE'_AdapterClipped_QualFiltered_interleaved_paired.fastq' --quick Menidia_COI_KC015657.fa -end 200 --clean --pair --mirapath /Programs/mira_4.0.2_linux-gnu_x86_64_static/bin/

done