####PIPELINE SUMMARY####

#1) RAW DATA SORTED BY INDIVIDUAL MID - trimming the MID sequence + 1bp
###### SortTrim.R ########
# PstI cutting site: TGCAG
rm(list=ls())
library(Biostrings)
library(ShortRead)
lib<-"LOH_L2_"
subdat<-"8_"
dir<-"IlluminaRaw_DIRpath"
MIDs<-read.table("DIRpath/barcode.txt")
N.MIDs<-dim(MIDs)[1]
reads<-readFastq(dirPath=" ", pattern="FileName.fastq", withIds=T)
RS<-DNAString("TGCAG") ### define restriction enzyme's recognition site
prfct.rds<-reads[grep(RS,narrow(sread(reads),start=6,end=10),value=FALSE)] # reads-subset with perfect RS in perfect position
del.rds<-reads[grep(RS,narrow(sread(reads),start=5,end=9),value=FALSE)]    # reads-subset with perfect RS but in wrong position due to 1b deletion in MID
ins.rds<-reads[grep(RS,narrow(sread(reads),start=7,end=11),value=FALSE)]   # reads-subset with perfect RS but in wrong position due to 1b insertion in MID
RS<-DNAString("NGCAG")
ok.rds<-narrow(reads[grep(RS,narrow(sread(reads),start=6,end=10),value=FALSE)])  # reads-subset with RS in perfect position, trimmed to 100b
RS<-DNAString("TNCAG")
delN.rds<-narrow(reads[grep(RS,narrow(sread(reads),start=5,end=9),value=FALSE)]) # reads-subset with RS but in wrong position due to 1b deletion in MID
stats<-NULL
stats<-rbind(stats,"total reads",length(reads))
for (i in 1:N.MIDs){
  MID<-DNAString(as.character(MIDs[i,1]))
  aaa<-agrep(MID,narrow(sread(prfct.rds),start=1,end=5),max.distance=list(all=1,insertions=0,deletions=0,substitutions=1))
  a<-prfct.rds[aaa]
  a<-narrow(a,start=6,end=99) # end was 75; trim away MID plus last base in sequence (needed to match reading frame of sqs below); ShortRead does NOT require this however (length differences are allowed)!
  a<-clean(a) # remove all rds containing 1 or more ambiguous (i.e. non-A,T,G,C) bases; this eliminates ALL Ns - DANGER
  #a<-a[grep("NN",sread(a),invert=T)] # softer way to eliminate Ns, in case there are systematic Ns introduced by Illumina
  name<-paste(dir,lib,as.character(MID),".fastq",sep="")
  if (length(a)>0)
    #writeFastq(a,file=name,mode="w") # write object to NEW fastq file
    writeFastq(a,file=name,mode="a")  # write object to EXISTING fastq file (mode="a"); needed when handling subfiles of a library beyond the first one
  prfct.rds<-prfct.rds[-aaa]
  ddd<-agrep(MID,narrow(sread(ok.rds),start=1,end=5),max.distance=list(all=1,insertions=0,deletions=0,substitutions=1))
  d<-ok.rds[ddd]
  d<-narrow(d,start=6,end=99) # end was 75; trim away MID plus last base in sequence (needed to match reading frame of sqs below); ShortRead does NOT require this however (length differences are allowed)!
  if(length(d)>0){
    # next three lines needed to replace the N produced by Illumina at original pos 6:
    at<-matrix(c(TRUE,rep(FALSE,93)),nrow=length(d),ncol=width(d)[1],byrow=T)
    letter<-DNAStringSet(as.character(rep("T",length(d))))
    d<-ShortReadQ(sread=(replaceLetterAt(sread(d),at,letter)),quality=quality(d),id=id(d))
  }
  d<-clean(d) # remove all rds containing 1 or more ambiguous (i.e. non-A,T,G,C) bases; this eliminates the Ns
  if(length(d)>0) {writeFastq(d,file=name,mode="a")}
  ok.rds<-ok.rds[-ddd] # remove these reads from ok.rds to increase speed
  b<-del.rds[grep(MID[2:5],narrow(sread(del.rds),start=1,end=4))] # filter from del.rds those that start with a perfect match for MID position 2-5
  b<-narrow(b,start=5,end=98) # end was 74; trim away MID (which is 4b due to deletion) plus last two bases; product matches reading frame of a
  b<-clean(b) # remove all rds containing 1 or more ambiguous (i.e. non-A,T,G,C) bases; this eliminates the Ns
  #b<-b[grep("NN",sread(b),invert=T)] # softer way to eliminate Ns, in case there are systematic Ns introduced by Illumina
  if (length(b)>0) writeFastq(b,file=name,mode="a") # append to the fastq file created above
  c<-ins.rds[grep(MID,narrow(sread(ins.rds),start=2,end=6))] # filter from ins.rds those that have any type of single-base insertion before the MID
  c<-narrow(c,start=7,end=100) # end was 76; trim away MID (which is 6b due to insertion); product matches reading frame of a
  c<-clean(c) # remove all rds containing 1 or more ambiguous (i.e. non-A,T,G,C) bases; this eliminates the Ns
  #c<-c[grep("NN",sread(c),invert=T)] # softer way to eliminate Ns, in case there are systematic Ns introduced by Illumina
  if (length(c)>0) writeFastq(c,file=name,mode="a") # append to the fastq file created above
  e<-delN.rds[grep(MID[2:5],narrow(sread(delN.rds),start=1,end=4))] # filter from delN.rds those that match MID (with first base missing) at position 1-4; very few
  e<-narrow(e,start=5,end=98) # end was 74; trim away MID (which is 4b due to deletion) plus last two bases; product matches reading frame of a
  if(length(e)>0) { # some reads are rare, so this check avoids errors if, in some of the raw-data files, a MID is never found with a deletion and an N on pos 6
    # next three lines needed to replace the N produced by Illumina at original pos 6:
    at<-matrix(c(FALSE,TRUE,rep(FALSE,92)),nrow=length(e),ncol=width(e)[1],byrow=T)
    letter<-DNAStringSet(as.character(rep("G",length(e))))
    e<-ShortReadQ(sread=(replaceLetterAt(sread(e),at,letter)),quality=quality(e),id=id(e))
  }
  e<-clean(e) # remove all rds containing 1 or more ambiguous (i.e. non-A,T,G,C) bases; this eliminates the Ns (1% or fewer)
  #e<-e[grep("NN",sread(e),invert=T)]
  if(length(e)>0) {writeFastq(e,file=name,mode="a")}
  rds.tot<-sum(length(a),length(d),length(b),length(c),length(e))
  stats<-rbind(stats,name,rds.tot)
}
statsname<-paste(dir,lib,subdat,"stats.txt",sep="")
write.table(stats,file=statsname,quote=F)


##2) ALIGNMENT OF SHORT READS TO DAPHNIA 2.4 GENOME - NovoAlign
novoalign -d DIRpath/dmagna_v2_4.ndx -F ILMFQ -t246 -g40 -x15 -oSAM -oFullNW --3Prime -rN -f Infile_Path/MID.fastq
# the score threshold of 246 corresponds to ~8.2 high-quality mismatches (at ~30 penalty points each), i.e. on average one mismatch per 11.34 bases in a 93bp read


###3) CONVERTING TO BAM FORMAT - SAMtools
samtools view -b -S File_path.sam > File_path.bam


####4) CALLING CONSENSUS SEQUENCES - HOMOZYGOTES, HETEROZYGOTE SNPs, UNCLEAR CALLS => obtaining "_consensus.txt" files
library(Rsamtools)
rm(list=ls())
IND<-"Sample_name" ### specify MID
# genotyping settings:
foldcov<-3 ### a RAD locus is not genotyped if its total seq coverage is more than foldcov times the overall mean coverage (=length(posi)/length(u.scaffpos)); eliminates repeats
prop.1.2<-0.7 # a RAD locus is not genotyped if the proportion of the two dominant haplotypes together is <= prop.1.2, relative to the total coverage of the locus; eliminates repeats
# down in the script: sum.1.2<-as.integer((length(posi)/length(u.scaffpos))/foldcov) # additional threshold will be added for better distinction of low coverage and possible hemizygotes
# sum.half<-sum.1.2/2
het<-1/3 # a diploid locus is called heterozygous if the ratio count.htp.2nd/count.htp.dominant is greater than het
min<-3   # a haploid locus is called when coverage is at least min (but lower than sum.1.2)
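## Worked example of the coverage thresholds (illustration only; 'mean.cov' is a hypothetical value, the script derives the real mean from the data further down):
# mean.cov<-30
# mean.cov*foldcov               # c.thr    = 90 --> loci covered above this are treated as putative repeats and skipped
# as.integer(mean.cov/foldcov)   # sum.1.2  = 10 --> minimum total coverage for a diploid genotype call
# as.integer(mean.cov/foldcov/2) # sum.half = 5  --> minimum coverage for a hemizygote/haploid candidate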
infile<-paste("DIR_path",IND,".bam",sep="") ###
outfile<-paste("DIR_path",IND,"_consensus.txt",sep="") ###
param<-ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE), what=c("rname","pos","seq","strand"), tag="Z3", reverseComplement=TRUE)
f<-scanBam(infile, param=param)[[1]] ### WATCH OUT - SHORTER CUTTER OVERHANG!!
scaff<-c(as.character(f$rname[grep("-",f$strand,invert=T)]), as.character(f$rname[grep("-",f$strand,invert=F)]))
posi<-c(f$pos[grep("-",f$strand,invert=T)], f$tag[[1]][grep("-",f$strand,invert=F)])
#seqs<-c(narrow(f$seq,start=6,end=94)[grep("-",f$strand,invert=T)],narrow(f$seq,start=6,end=94)[grep("-",f$strand,invert=F)]) ### is start/end ok?
seqs<-as.character(c(narrow(f$seq,start=7,end=94)[grep("-",f$strand,invert=T)], narrow(f$seq,start=7,end=94)[grep("-",f$strand,invert=F)])) ### is start/end ok?
scaffpos<-paste(scaff, posi, sep=' ')
u.scaffpos<-sort(unique(scaffpos))
length(posi)                    # total number of alignments
length(u.scaffpos)              # this gives the number of unique RAD loci
length(posi)/length(u.scaffpos) # mean coverage per RAD locus
dat<-data.frame(cbind(scaffpos, seqs))
c.thr<-as.integer(length(posi)/length(u.scaffpos)*foldcov) # relative coverage threshold based on average coverage
c.thr
sum.1.2<-as.integer((length(posi)/length(u.scaffpos))/foldcov)
sum.1.2  ## coverage minimum for a good diploid
sum.half<-as.integer(sum.1.2/2)
sum.half ## homozygotes with coverage sum.1.2 >= stack.le >= sum.half will be considered
# clean the environment and release memory:
rm(f, scaff, posi, seqs, scaffpos) # no longer used
gc() # garbage collection, returns memory to the operating system
# outfile data structure: scaffpos / coverage (sum of 2 variants) / X-Y (needed for SNP calling) / diploid_homoz(N) OR diploid_heteroz(Y) OR haploid_low(L) OR hemizygote(H) / seq
cons<-NULL
for (i in 1:length(u.scaffpos)){
  stack<-DNAStringSet(as.character(subset(dat, scaffpos==u.scaffpos[i])[,2]))
  stack.le<-length(stack)
  if(stack.le<=c.thr){
    #monomorphic:
    if (length(unique(stack))==1){
      #well covered:
      if(stack.le>=sum.1.2){
        gtp.X<-paste(u.scaffpos[i], stack.le, "X", "N", as.character(stack[1]), sep=" ")
        gtp.Y<-paste(u.scaffpos[i], stack.le, "Y", "N", as.character(stack[1]), sep=" ")
        cons<-rbind(cons, gtp.X, gtp.Y)
      }
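      ## note: a homozygote is written as two identical rows (X and Y), so every diploid call contributes exactly
      ## two haplotype rows to the consensus file; the SNP scan in step 5 relies on this fixed two-rows-per-locus structure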
      #half of the minimum coverage --> consider haploid:
      if((stack.le<sum.1.2)&&(stack.le>=sum.half)){ ## there is only one variant counted so no need for min threshold here...that is covered in polymorphic loci
        gtp.X<-paste(u.scaffpos[i], stack.le, "X", "H", as.character(stack[1]), sep=" ")
        cons<-rbind(cons, gtp.X)
      }
    }
    #polymorphic:
    if(length(unique(stack))>1){
      cnts<-sort(table(as.character(stack)), decreasing=T)[1:2]
      #no repeat issue:
      if((sum(cnts)/stack.le)>prop.1.2){
        #heterozygote:
        if((sum(cnts)>=sum.1.2) && ((cnts[2]/cnts[1])>het)){
          gtp.X<-paste(u.scaffpos[i], stack.le, "X", "Y", as.character(names(cnts)[1]), sep=" ")
          gtp.Y<-paste(u.scaffpos[i], stack.le, "Y", "Y", as.character(names(cnts)[2]), sep=" ")
          cons<-rbind(cons, gtp.X, gtp.Y)
        }
        #homozygote (minor variant too rare, artifact):
        #if((sum(cnts)>=sum.1.2) && ((cnts[2]/cnts[1])<=het)){ # minor variant too rare, artifact; hence locus diploid homozygote (cnt.1=30 and cnt.2=8 would pass this condition)
        #this would be too relaxed for the LOH search in the offspring, so if the artifact occurs more than 3 times (min=3) then it should be taken seriously...
        if((cnts[1]>=sum.1.2) && (cnts[2]<=min)){ ## this is a true homozygote allowing for a seq error (much more stringent than it would be if estimated from true seq error rates)
          gtp.X<-paste(u.scaffpos[i], stack.le, "X", "N", as.character(names(cnts)[1]), sep=" ")
          gtp.Y<-paste(u.scaffpos[i], stack.le, "Y", "N", as.character(names(cnts)[1]), sep=" ")
          cons<-rbind(cons, gtp.X, gtp.Y)
        }
        #coverage poor, gtp unclear, haploid call --> dominant variant satisfies the hemizygote candidate condition; 2nd variant too rare:
        if((sum(cnts)<sum.1.2) && (cnts[1]>=sum.half) && (cnts[2]<min)){
          gtp.X<-paste(u.scaffpos[i], stack.le, "X", "H", as.character(names(cnts)[1]), sep=" ")
          cons<-rbind(cons, gtp.X)
        }
        #coverage poor, gtp unclear, 2nd variant too frequent to be dismissed --> low-coverage haploid call:
        if((sum(cnts)<sum.1.2) && (cnts[1]>=sum.half) && (cnts[2]>=min)){
          gtp.X<-paste(u.scaffpos[i], stack.le, "X", "L", as.character(names(cnts)[1]), sep=" ")
          cons<-rbind(cons, gtp.X)
        }
      }
    }
  }
}
write.table(cons, outfile, row.names=F, col.names=F, quote=F) # write to individual file


##### 5) IDENTIFICATION OF INFORMATIVE LOCI (heterozygous in stem mothers) & ASSESSING THEM IN THE OFFSPRING => obtaining "_markers_offspring_gtps.txt"
# scan offspring for loss of heterozygosity at loci detected in the reference (mother, sequenced twice)
# approach: 1) scan the mothers for clean heterozygous SNPs, allowing a specified number (p.tr) of SNPs per RAD locus => RAD-markers
#           2) evaluate the offspring at those SNPs (markers_offspring_gtps)

##### module 1 - find SNPs in mother:
library(Biostrings)
rm(list=ls())
folder<-'DIR_path'
ID.list<-'LineSpecific_LOHindlist.txt' # the file specifying the mother (2x, on top) and the offspring
IDs<-read.table(paste(folder, ID.list, sep=''), h=FALSE)
p.tr<-3 # polymorphism threshold; max number of polymorphisms per RAD locus
# upload and rbind the two mother consensus data sets:
m<-NULL
for(i in 1:2){
  m<-rbind(m, read.table(paste(folder, IDs[i,1],'_consensus.txt', sep=''), h=FALSE))
}
loc<-paste(m[,1], m[,2], sep=' ')
u.loc<-unique(loc)
length(u.loc)
# scan all RAD loci for polymorphisms; record up to p.tr per RAD locus:
het.locs<-NULL
for(i in 1:length(u.loc)){
  pat<-paste('^', u.loc[i], '$', sep='')
  stack<-DNAStringSet(m[grep(pat,loc),6])
  if(length(unique(stack))>1){
    p<-0
    for(j in 1:width(stack[1])){
      nucs<-narrow(stack, start=j, width=1)
      if((table(as.character(nucs))[2]==2)&&(as.character(nucs[1])!=as.character(nucs[2]))&&(p<p.tr)){
        p<-p+1
        # record the marker: scaffold / position / SNP position within the read / maternal genotype (the two alleles pasted together)
        het.locs<-rbind(het.locs, data.frame(scaff=unlist(strsplit(u.loc[i],' '))[1],
                                             posi=unlist(strsplit(u.loc[i],' '))[2],
                                             snp.pos=j,
                                             gtp=paste(sort(unique(as.character(nucs))), collapse=''),
                                             stringsAsFactors=FALSE))
      }
    }
  }
}
# index the recorded SNPs by RAD locus:
loc<-paste(het.locs[,1], het.locs[,2], sep=' ')
u.loc<-unique(loc)

##### module 2 - evaluate the offspring at the maternal SNPs:
dat<-het.locs
for(i in 3:length(IDs[,1])){ # ==> repeat for each file from ID.list starting from 3rd (first two are mothers)
  d<-read.table(paste(folder, IDs[i,1],'_cons.txt', sep=''), h=FALSE) ## load table containing consensus seqs
  ind.gtps<-NULL
  for(j in 1:length(u.loc)){ #open unique RAD locus loop
    pos<-het.locs[grep(u.loc[j], loc),]
    stack<-d[grep(u.loc[j], paste(d[,1], d[,2], sep=' ')),6]
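    # 'stack' now holds the offspring's consensus sequence(s) at this RAD locus:
    # two rows (X and Y) = diploid call, one row = haploid/hemizygote call (L or H), zero rows = locus not genotyped in this offspring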
    if(length(stack)==2){ ##### this evaluation forces a stack to be 1) present at all and 2) diploid
      for(k in 1:length(pos[,1])){
        gtp.1<-paste(narrow(DNAStringSet(as.character(stack)), start=pos[k,3], width=1)[1], narrow(DNAStringSet(as.character(stack)), start=pos[k,3], width=1)[2], sep='') ##### as character()
        gtp.2<-paste(narrow(DNAStringSet(as.character(stack)), start=pos[k,3], width=1)[2], narrow(DNAStringSet(as.character(stack)), start=pos[k,3], width=1)[1], sep='') ##### as character()
        ifelse(((gtp.1!=pos[k,4]) && (gtp.2!=pos[k,4])), ind.gtps<-rbind(ind.gtps, gtp.1), ind.gtps<-rbind(ind.gtps, as.character('--')))
        #ifelse(((gtp.1!=pos[k,4]) && (gtp.2!=pos[k,4])), ind.gtps<-rbind(ind.gtps, gtp.1), ind.gtps<-rbind(ind.gtps, as.character(paste(as.character(unlist(pos[1])), as.character(unlist(pos[2])), sep=" "))))
      } #close the multiple SNPs per unique RAD locus
    }
    if(length(stack)==1){ ##### this writes the loci with only haploid coverage
      for(k in 1:length(pos[,1])){
        ind.gtps<-rbind(ind.gtps, as.character('LL'))
        #ind.gtps<-rbind(ind.gtps, as.character(paste(as.character(unlist(pos[1])), as.character(unlist(pos[2])), sep=" ")))
      } #close the multiple SNPs per unique RAD locus
    }
    if(length(stack)==0){ ##### this writes the loci missing altogether
      for(k in 1:length(pos[,1])){
        #ind.gtps<-rbind(ind.gtps, as.character(paste(as.character(unlist(pos[1])), as.character(unlist(pos[2])), sep=" ")))
        ind.gtps<-rbind(ind.gtps, as.character('NN'))
      } #close the multiple SNPs per unique RAD locus
    }
  } #close the unique RAD locus
  dat<-cbind(dat, ind.gtps)
  names(dat)[length(dat[1,])]<-as.character(IDs[i,1])
} #close the individual
write.table(dat, paste(folder, "offspr.gtps2.txt", sep=''), col.names=T, row.names=F, quote=F)
### END
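## Optional check (a minimal sketch, not part of the original pipeline): tabulate the genotype codes per offspring from the table written above.
## '--' = maternal heterozygous genotype retained, 'LL' = only haploid coverage, 'NN' = locus not genotyped;
## any other entry is an observed genotype that differs from the mother's, i.e. a putative LOH (or a new mutation).
#gtps<-read.table(paste(folder, "offspr.gtps2.txt", sep=''), h=TRUE, check.names=FALSE, stringsAsFactors=FALSE)
#offspr<-as.character(IDs[3:length(IDs[,1]),1]) # the offspring columns added in the loop above
#sapply(gtps[,offspr], table)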
###### 6) INSPECTING PUTATIVE LOH LOCI
# After obtaining a table with offspring genotypes for maternal heterozygous sites, loci identified as homozygous were manually extracted for additional inspection.
# All markers that were not successfully genotyped in eight out of ten daughter individuals were not taken into consideration.
# In a few instances, loci with high counts for the 2nd, 3rd and 4th variant slipped through our genotype calling conditions, thus it is crucial to check the individual coverage of each detected variant per putative RAD locus.
# module to explore the haplotype counts per RAD locus for individuals showing 'LOH'
library(Rsamtools)
rm(list=ls())
folder<-'DIR_path' # here are the BAMs and the reference txt file with individuals and RAD sites to inspect
ref<-read.table(paste(folder, '/', 'Line_specific_LOHlist.txt',sep=''), h=T, stringsAsFactors=F) # individuals and RAD sites to inspect
u.ind<-unique(ref[,1]) # these are the unique individuals
summary<-NULL
for(i in 1:length(u.ind)){
  infile<-paste(folder, '/', u.ind[i], '.bam', sep='') # give path to target bam file
  param<-ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE), what=c("rname","pos","seq","strand"), tag="Z3", reverseComplement=TRUE)
  f<-scanBam(infile, param=param)[[1]]
  chr<-c(as.character(f$rname[grep("-",f$strand,invert=T)]), as.character(f$rname[grep("-",f$strand,invert=F)]))
  posi<-c(f$pos[grep("-",f$strand,invert=T)], f$tag[[1]][grep("-",f$strand,invert=F)]) # this ensures that RAD locus positions refer to the end where the enzyme recognition site is sitting
  Rsite<-paste(chr, posi, sep='_')
  seqs<-as.character(c(narrow(f$seq,start=6,end=94)[f$strand!='-'], narrow(f$seq,start=6,end=94)[f$strand=='-']))
  ind.loc<-ref[ref[,1]==u.ind[i],2] # get the RAD site(s) for the target individual
  for(j in 1:length(ind.loc)){
    tab<-sort(table(seqs[Rsite==ind.loc[j]]), decreasing=T)
    if(length(tab)==1){summ<-paste(u.ind[i], ind.loc[j], names(tab)[1], tab[1], sep=' ')}
    if(length(tab)==2){summ<-paste(u.ind[i], ind.loc[j], names(tab)[1], tab[1], names(tab)[2], tab[2], sep=' ')}
    if(length(tab)==3){summ<-paste(u.ind[i], ind.loc[j], names(tab)[1], tab[1], names(tab)[2], tab[2], names(tab)[3], tab[3], sep=' ')}
    if(length(tab)>=4){summ<-paste(u.ind[i], ind.loc[j], names(tab)[1], tab[1], names(tab)[2], tab[2], names(tab)[3], tab[3], names(tab)[4], tab[4], sep=' ')}
    summary<-c(summary, summ)
  }
}
write.table(summary, paste(folder, '/Line_haploCntsLOH.txt', sep=''), quote=F, row.names=F, col.names=F)
# ==> All putative LOH loci that had the 2nd variant count higher than 3 were discarded.
# ==> Out of the remaining putative LOH loci, the best candidates are chosen to be re-sequenced.
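## A minimal sketch (not part of the original pipeline) of the final filtering step described above, assuming the layout of the
## haplotype-count file written by this module: col 1 = individual, col 2 = RAD site, then up to four variant/count pairs sorted
## by decreasing count, so column 6 holds the count of the 2nd variant (NA if only one variant was observed):
#cnts<-read.table(paste(folder, '/Line_haploCntsLOH.txt', sep=''), h=FALSE, fill=TRUE, col.names=paste('V',1:10,sep=''), stringsAsFactors=FALSE)
#keep<-is.na(cnts$V6) | cnts$V6<=3 # discard putative LOH loci whose 2nd variant was counted more than 3 times
#cnts[keep,]                       # remaining candidates, from which the best are chosen for re-sequencing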