#Majoritary consensus generated with Geneious 8.0.5 Threshold 0% - Majority min Coverage 5 # Consensus global alignment Geneious Alignment Alignment type: Global alignment with free end gaps Cost Matrix: 65% similarity (5.0/-4.0) Gap open penalty: 12 Gap extension penalty: 3 #extraction gene regions #$1 global alignment in fasta format #$2 annotation tab with name_exon\tstartpos\tstoppos\tsens (file Cotesia_sesamiae_bracovirus_exon_pos.txt) #extract sample names grep ">" $1 | sed -e 's/>//g' > $1_all #extract for each sample the exons according to their positions in the global alignment while read sample do echo "$sample" > "$sample"_list fastaselect.pl "$1" "$sample"_list > "$sample".fasta while read i do gene=`echo "$i" | awk '{print $1}'` start=`echo "$i" | awk '{print $2}'` stop=`echo "$i" | awk '{print $3}'` samtools faidx "$sample".fasta "$sample":"$start"-"$stop" > "$sample"_"$gene".fasta done < $2 done < $1_all #gene recomposition awk '{print $1}' $2 | sed -e 's/_exon.//g' | uniq > $2_uniq for exon in `ls *_crv1.1_13.5_exon2.fasta` do sample=`echo "$exon" |sed -e 's/_crv1.1_13.5_exon2.fasta//g'` while read gene do fastaConcat --without-progress-bar -s @ "$sample"_"$gene"_* > "$sample"_"$gene"_final.fasta sed -e 's/@//g' -e 's/-/N/g' "$sample"_"$gene"_final.fasta > "$sample"_"$gene"_final.fasta_2 done < $2_uniq done #alignment of each gene for all samples for gene in `ls VOSH*` do sample=`echo "$gene" |sed -e 's/VOSH_//g'` cat *_"$sample" > ALL_"$sample"_2 done #reverse_complement of the reversed genes awk '{print $1"\t"$4}' $2 | sed -e 's/_exon.//g' | awk ' !x[$0]++' > $2_sens while read g do gene=`echo "$g" | awk '{print $1}'` sens=`echo "$g" | awk '{print $2}'` cat ALL_"$gene"_final.fasta_2_2 | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' > ALL_"$gene"_final.fasta_3 sed -i '/^$/d' ALL_"$gene"_final.fasta_3 sed 's/\ //g' ALL_"$gene"_final.fasta_3 | sed -e 's/;//g' | sed -e 's/=//g' | perl -pe 's/:\w+//' > ALL_"$gene"_final.fasta_4 cat ALL_"$gene"_final.fasta_4 | sed 's/.*/\U&/' > ALL_"$gene"_final.fasta_5 if [[ ("$sens" = "reverse" ) ]] then fastx_reverse_complement -i ALL_"$gene"_final.fasta_5 -o ALL_"$gene"_final.fasta_6 else cp ALL_"$gene"_final.fasta_5 ALL_"$gene"_final.fasta_6 fi done < $2_sens #concatenation of all genes from all samples for the phylogeny for sample in `ls *_crv1.1_13.5_final.fasta_2` do fastaConcat --without-progress-bar -s @ "$sample"_* > "$sample"_allgenes.fasta sed -e 's/@//g' -e 's/-/N/g' "$sample"_allgenes.fasta > "$sample"_allgenes.fasta_2 done cat *_allgenes.fasta_2 > all_samples_allgenes.fasta #dN/dS estimation using PAML for each gene #conversion of the alignment in fasta format to phylip format using Fasta2Phylip.pl for i in `ls *.phy` do j=`echo "$i" |sed -e 's/\.phy//g'` #input each tree in NEWICK format with a star indicating the branch to test (each population and C. typha) for b in `ls *.newick` do c=`echo "$b" |sed -e 's/TREE//g' -e 's/.newick//g'` #modify input file sed -e 's/gena/'"$j"'/g' -e 's/TREE.newick/'"$b"'/g' -e 's/level/'"$c"'/g' codeml_gene.ctl > codeml_"$j"_"$c".ctl sed -e 's/gena/'"$j"'/g' -e 's/TREE.newick/'"$b"'/g' -e 's/level/'"$c"'/g' codeml_gene_neutral.ctl > codeml_"$j"_"$c"_neutral.ctl #run codeml to estimate dNdS codeml codeml_"$j"_"$c".ctl #run codeml with dNdS fixed codeml codeml_"$j"_"$c"_neutral.ctl lnl1=`grep "lnL" "$j"_"$c"_dnds | awk '{print $5}'` lnl0=`grep "lnL" "$j"_"$c"_neutral_dnds | awk '{print $5}'` #LRT in R session R --slave --vanilla --quiet --no-save --args < "$j"_"$c"_signi done done #dxy estimation for a in `ls *_6` do sed -e 's/ /_/g' -e 's/;/_/g' -e 's/:/_/g' -e 's/=/_/g' "$a" > "$a"_3 gene=`echo "$a" |sed -e 's/_final.fasta_6//g' -e 's/ALL_//g'` grep ">" "$a"_3 | sed -e 's/>//g' > "$gene"_all while read j do echo "$j" > "$j"_"$gene"_list fastaselect.pl "$a"_3 "$j"_"$gene"_list > "$j"_"$gene".fasta grep ">" -v "$j"_"$gene".fasta | sed 's/\(.\)/\1@/g' | tr "@" "\n" > "$j"_"$gene".line done < "$gene"_all rm *_"$gene".fasta rm *_"$gene"_list rm "$gene"_all rm "$a"_3 for d in `ls *_"$gene".line` do indiv=`echo "$d" |sed -e 's/_'"$gene"'.line//g'` for g in `ls *_"$gene".line` do indo=`echo "$g" |sed -e 's/_'"$gene"'.line//g'` for k in `ls *_"$gene".line` do inda=`echo "$k" |sed -e 's/_'"$gene"'.line//g'` paste "$d" "$g" "$k" > "$indiv"_"$indo"_"$inda"_"$gene".line2 done done done done for e in `ls *.line2` do f=`echo "$e" |sed -e 's/.line2//g'` sed '$d' "$e" > "$e"_2 awk '{print "THE_reference\t"NR"\t"$0}' "$e"_2 > "$f".line3 perl calculate-dxy.pl --input "$f".line3 --output "$f".dxy --min-covered-fraction 0.0 --window-size 20000 --step-size 20000 awk '{print "'$f'\t"$0}' "$f".dxy > "$f".dxy2 done