All commands were done on a linux based HPC cluster (modifications may be needed in many of the unix commands to use on a mac or Windows). Commands used are placed in between "#" marks with each line representing a single command. STEP 1 Clean raw reads used Trim galore (http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) commands for each set of reads # trim_galore -q 20 --phred33 --length 30 --dont_gzip --paired read1.fastq read2.fastq # STEP 2 Iterative baited assembly (IBA) Assembly uses IBA.py script to assemble each locus for each taxon. For each taxon we used these commands. We placed the raw data files, the IBA.py script and the reference folder (unzipped Lep1_ref.zip in this dryad package) in the same directory. # mkdir IBAass_taxaname cd IBAass_taxaname ../IBA.py -raw1 clean_read1.fastq -raw2 clean_read2.fastq -d ../Lep1_ref -n 3 -t 1 -p 16 -g 200 -c 10 -label BMORI -k 25 -taxa IDNUM_FAMILY_GENUS_SPECIES # STEP 3 Alignment and cut to probe region joined all the assembled loci for all taxa together in a single file # cat IBAass_*/*_final.fa > ALL_FULL_LOCI.fa # Changed fasta file to have data on a single line for each sequence using singleline.pl downloaded from http://www.bioinformatics-made-simple.com Then split into individual loci with split.py # perl singleline.pl ALL_FULL_LOCI.fa > sl_ALL_FULL_LOCI.fa ./split.py sl_ALL_FULL_LOCI.fa _ # the resulting files are named by locus L1.fa-L855.fa for each locus we used the unix command line with a loop to add the new assembled data to an alignment of the reference taxa using mafft and these resulting files will have a prefix of refaa_ # for X in L*.fa; do mafft --thread 8 --adjustdirectionaccurately --addlong $X refaa_$X > AA_$X ; done # made alignment a single line for each sequence using singleline.pl downloaded from http://www.bioinformatics-made-simple.com # for X in AA_L* ; do perl singleline.pl $X > 1l_$X; done # make list of single line fasta files # ls 1l_AA_L* > list # trim to probe 
region with Extract_probe_region.py and put into a single file # ./extract_probe_region.py list BMORI outdir cd outdir cat *.trimmed > 3by3.in # STEP 4 Single-Hit and Ortholog location genome mapping Blast all sequences 3by3.in to the Bombyx mori genome and check to see they are a single hit and remove sequences that do not fit single hit criteria # blastn -task blastn -query 3by3.in -db BDATABASE -out 3by3.out -outfmt 6 -max_target_seqs 3 -max_hsps 3 -num_threads 8 ./s_hit_checker.py 3by3.out 0.90 ./removelist.py 3by3.in 3by3_del_list0.90.txt 1by1.in # Used sequences that passed single hit criteria and found best hit with blast and filtered for orthologs homology # blastn -task blastn -query 1by1.in -db BDATABASE -out 1by1.out -outfmt 6 -max_target_seqs 1 -max_hsps 1 -num_threads 8 ./ortholog_filter.py 1by1.out ./getlist.py 1by1.in 1by1_keep_list ORTHO_PASS.fa # STEP 5 Alignment and isoform consensus Turned into a single line, used sed to format sequence names, split into loci, alignment # perl singleline.pl ORTHO_PASS.fa > sl_ORTHO_PASS.fa sed -i "s/_/|/g" sl_ORTHO_PASS.fa sed -i "s/|seq/_seq/g" sl_ORTHO_PASS.fa ./split.py sl_ORTHO_PASS.fa \| for X in L*.fa; do mafft --thread 2 $X > ${X}s; done # use FASconCAT-G (Kück and Longo 2014) (https://www.zfmk.de/en/research/research-centres-and-groups/fasconcat-g) to turn isoforms into a single sequence FASconCAT-G will only read files that end in *.fas # perl FASconCAT-G_v1.02.pl -c -c -c -o -s cat FcC_L* > NOISO_PROBE.fa sed -i "s/|/_/g" NOISO_PROBE.fa sed -i "s/_consensus//" NOISO_PROBE.fa # STEP 6 Contamination filter and duplicate removal self BLASTs using usearch (http://drive5.com/usearch/) and contamination filtering, removing contaminants, removing duplicates # usearch -ublast NOISO_PROBE.fa -db NOISO_PROBE.fa -evalue 0.01 -id 0.99 -self -query_cov 0.95 -target_cov 0.95 -blast6out SELFBLASTOUT_NOISO_PROBE.fa -strand plus -threads 8 ./contamination_filter.py SELFBLASTOUT_NOISO_PROBE.fa _ 2 family ./removelist.py 
SELFBLASTOUT_NOISO_PROBE.fa SELFBLASTOUT_NOISO_PROBE_family_del_list.txt C_SCREENED.fa ./remove_duplicates.py C_SCREENED.fa # STEP 7 Align full length sequences that passed pipeline use usearch to get sequences using a feature that allows a partial match since we have removed part of the original sequence name # usearch -fastx_getseqs ALL_FULL_LOCI.fa -labels C_SCREENED_keep.list -label_substr_match -fastaout FINAL_FULL_CLEAN.fa perl singleline.pl FINAL_FULL_CLEAN.fa > 1l_FINAL_FULL_CLEAN.fa sed -i "s/_/|/g" 1l_FINAL_FULL_CLEAN.fa sed -i "s/|seq/_seq/g" 1l_FINAL_FULL_CLEAN.fa ./split.py 1l_FINAL_FULL_CLEAN.fa \| # for each locus aligned with mafft with the command below # mafft --thread 8 --adjustdirectionaccurately --allowshift --unalignlevel 0.8 --leavegappyregion --maxiterate 0 --globalpair infile > gapA_outfile.fas # STEP 8a Trim to probe region Make consensus of full length isoforms with FASconCAT-G which will read all files that end in .fas, join all sequences into a single file and count with counting_monster.py, use sed to rename sequences, turn into a single line, extract probe region, a few lines to rename the files these are the final files for data set 1 and 2 # perl FASconCAT-G_v1.02.pl -c -c -c -o -s cat FcC_L* > ALL_single_for_CM.fas perl singleline.pl ALL_single_for_CM.fas > 1l_ALL_single_for_CM.fa counting_monster.py 1l_ALL_single_for_CM.fa _ sed -i "s/|/_/g" FcC_* sed -i -r "s/_+comp.+//" FcC_* sed -i -r "s/_consensus//" FcC_* sed -i "s/^>L[0-9]\+_/>/" FcC_* for X in FcC_*; do perl singleline.pl $X > 1l_$X; done ls 1l_FcC* > list ./Extract_probe_region.py list BMORI outdir cd outdir rm *.head rm *.tail ls > files sed -i "s/\(.\+\)\.trimmed/mv \1.trimmed \1_final.fas/" files chmod +x files ./files # STEP 8b Trim alignment including flanks used alignment_DE_trim.py and flank_dropper.py to clean alignments, make isoform consensus, count sequences with counting_monster.py, process sequence names with sed, the resulting alignments resulted in dataset 6 # for Y in 
gapA_L* ; do python3 alignment_DE_trim.py $Y T_D60_E15_$Y 60 1.5 1; done for M in T_D60_E15_gapA_L* ; do python3 flank_dropper.py $M FD22_$M BMORI_R 2 2; done sed -i "s/_/|/g" FD22_* sed -i "s/|seq/_seq/g" FD22_* perl FASconCAT-G_v1.02.pl -c -c -c -o -s cat FcC_FD22* > ALL_single_FULLSEQS_CM.fa perl singleline.pl ALL_single_FULLSEQS_CM.fa > 1l_ALL_single_FULLSEQS_CM.fa counting_monster.py 1l_ALL_single_FULLSEQS_CM.fa _ sed -i "s/|/_/g" FcC_FD22* sed -i -r "s/_+comp.+//" FcC_FD22* sed -i -r "s/_consensus//" FcC_FD22* sed -i "s/^>L[0-9]\+_/>/" FcC_FD22* # Cut out the probe region and the head region for datasets 5 & 6 # for X in FcC_FD22*; do perl singleline.pl $X > 1l_$X; done ls 1l_FcC_FD22* > list ./extract_probe_region.py list BMORI outdir cd outdir mkdir head_tail mv *.head ./head_tail mv *.tail ./head_tail cd head_tail ls *.head > fileH ls *.tail > fileT sed -i "s/\(.\+\)/mv \1 \1.fas/" fileH sed -i "s/\(.\+\)/mv \1 \1.fas/" fileT chmod +x file* ./fileH ./fileT cd .. ls > files sed -i "s/\(.\+\)\.trimmed/mv \1.trimmed \1_final.fas/" files chmod +x files ./files # We then used the table from the resulting counting_monster script to find loci that had 70% or more of the taxa present to use in the phylogentic analysis.