# from Kate Ostevik, modified by Brook Moyers # bash commands used to filter genotype call output from the uneak pipeline of TASSEL 3 # values and file names are specific to this project # needs to be walked through step-by-step rather than run all at once ## Change allele counts into A,B & H and changes weird ratios to Ns awk '{ print NF; exit }' map3all_fixedFINAL.hmc.txt ## Use this number in for loop in hmc2geno script # 389 (394-5) for map3all (subtract 5 for those last columns) awk '{ print $1 }' map3all_fixedFINAL.hmc.txt > full.txt # bash script 'hmc2genoBM.sh' # requires at least 10 total calls/locus, and assigns A v B based only on the order of alleles " #!/bin/bash for i in {2..389} do cut -f $i map3all_fixedFINAL.hmc.txt | awk 'BEGIN { FS = "|" } ; {if (length($1) >= 8) print $1; else if ($1+$2<5) print "N"; else if ($1==0) print "B"; else if ($2==0) print "A"; else if ($1/$2>50) print "A"; else if ($2/$1>50) print "B"; else if ($1/$2>10 && $1/$2<=50) print "N"; else if ($2/$1>10 && $2/$1<=50) print "N"; else print "H"}' > temp paste full.txt temp > temp2 mv temp2 full.txt done exit " # to run (takes a few minutes with my amount of data) ./hmc2genoBM.sh map3all_fixedFINAL.hmc.txt # pull out only fixed differences: grep -e 'rs\s' -e 'TP[0-9][0-9]*\sA\sB' full.txt > ABgeno.txt #3930 SNPs grep 'TP[0-9][0-9]*\sB\sA' full.txt > BAgeno.txt #3944 SNPs # make all of the parent 1 alleles 'A' and parent 2 'B' sed 's/\tB/\tC/g' BAgeno.txt > temp sed 's/\tA/\tD/g' temp > BAgeno.txt sed 's/\tC/\tA/g' BAgeno.txt > temp sed 's/\tD/\tB/g' temp > BAgeno.txt cat ABgeno.txt BAgeno.txt > map3allparentHomoDiff.txt wc -l map3allparentHomoDiff.txt #7875 total sites # filter for coverage and also heterozygosity awk 'NR==1{print $0}' map3allparentHomoDiff.txt > header sed 's/[^N]//g' map3allparentHomoDiff.txt | awk '{ print length }' | sed '1s/^.*$/N/' > temp.N sed 's/[^H]//g' map3allparentHomoDiff.txt | awk '{ print length }' | sed '1s/^.*$/H/' > temp.H paste map3allparentHomoDiff.txt temp.N temp.H > temp awk '{print NF; exit}' temp #the second to last is the N column awk '$390<=194 {print $0}' temp > temp2 #hard coding in 50% coverage wc -l temp2 #1009 SNPs awk '{print $0, $391/(388-$390)}' temp2 > temp3 #this calculates Ho as total hets (last column) divided by the number of individuals (total col - 3) with genotypes (minus N count) awk '$392<=0.7 && $392>=0.3 {print $0}' temp3 > temp4 wc -l temp4 #715 (is 562 if btw. 40 and 60, or 860 if btw. 20 and 80) cat header temp4 > temp5 mv temp5 map3allparentHomoDiff_Ho3_7_50p.txt rm temp* rm header ## Format snp.table for rQTL: add ones, transpose awk '{print NF; exit}' map3allparentHomoDiff_Ho3_7_50p.txt cut -f1 map3allparentHomoDiff_Ho3_7_50p.txt > alleles cut -f4-389 map3allparentHomoDiff_Ho3_7_50p.txt > temp wc -l temp rm ones bash for i in {1..716} do echo "1" >> ones done exit paste alleles ones temp > temp2 python -c "import sys; print('\n'.join(' '.join(c) for c in zip(*(l.split() for l in sys.stdin.readlines() if l.strip()))))" < temp2 > temp3 tr ' ' ',' < temp3 > map3allparentHomoDiff_Ho3_7_50p.rqtl rm temp* rm ones rm alleles # use nano to clear out 'rs' and first '1' on line 2 --> 386 individuals in map3all