####################################### ## The working dirctory needs to be changed to the folder with master dataframes ## Specify the number of MIDs used in the analysis (Run1=13; Run2=8) MIDNum=13 # Read in the dataframes Run1_90 =read.csv("Run1_90bp.csv"); dim(Run1_90) Run1_100 =read.csv("Run1_100bp.csv"); dim(Run1_100) Run2_100 =read.csv("Run2_100bp.csv"); dim(Run2_100) AllSeqInfo=Run1_100 #Assign which database to look at AllSeqInfo [1:25,] #ASI names (AllSeqInfo) #"IonNumber" "Tax" "Evalue" "Species" "MID" "Qual" "Primer" "F_Tag" "R_Tag" "Seq" "IndivID" dim(AllSeqInfo) ASI1= AllSeqInfo[AllSeqInfo$Primer!="NoPri",] # Remove those with no Primer dim(ASI1) ASI2= ASI1[ASI1$F_Tag!="F_NoTag",]# Remove those with no Tag on For dim(ASI2) ASI3= ASI2[ASI2$R_Tag!="R_NoTag",]# Remove those with no Tag on Rev dim(ASI3);sum(AllSeqInfo$Tag!="None") ASI4= ASI3 [ASI3$Species!="None",]# Remove those Species=None sum(ASI3$Species=="None") dim(ASI4) table(ASI3$Species) ################################################### ##### Various summary statistics ##### Proportion of dataset matching primer 1-((sum(AllSeqInfo$Primer=="NoPri")+sum(AllSeqInfo$F_Tag=="F_NoTag")+sum(AllSeqInfo$R_Tag=="R_NoTag"))/length(AllSeqInfo$Primer)) 1-(sum(AllSeqInfo$Tag=="None")/dim(AllSeqInfo)[1]) ##### Number of primer matching dim(ASI3)[1] length(AllSeqInfo$Primer)-((sum(AllSeqInfo$Primer=="NoPri")+sum(AllSeqInfo$F_Tag=="F_NoTag")+sum(AllSeqInfo$R_Tag=="R_NoTag")) ) dim(ASI3)[1] / dim(AllSeqInfo)[1] ##### Proportion with primer match dim(ASI4)[1] /dim(ASI3)[1] ##### Proportion assigned to taxonomy table(ASI4$Species)[2]/sum(table(ASI4$Species)) ##### Proportion asigned that are harbour seal ##### Proportion asigned that are Tet (An amplicon sequenced in the run preceeding Run1) table(AllSeqInfo$Species)[6]/ sum(table(AllSeqInfo$Species)) sum(AllSeqInfo$Tag!="None") # number with primer ##### Quality comparison with and without primer par(mfrow=c(2,1)) hist(AllSeqInfo$Qual[AllSeqInfo$Tag!="None"],col="red", main="Primer found") hist(AllSeqInfo$Qual[AllSeqInfo$Tag=="None"],col="red", main="No Primer") mean(AllSeqInfo$Qual[AllSeqInfo$Tag!="None"]) #mean quality of primer match mean(AllSeqInfo$Qual[AllSeqInfo$Tag=="None"]) #mean Q no primer ##### Look at composition of assigned sequences with no primer id NoPri=table(AllSeqInfo$Species[AllSeqInfo$Tag=="None"]) # All sequences with no primer found SumNoPri=sum(NoPri) sum(NoPri[c(1,3,4)]) NoPri/SumNoPri NoPri[c(1,3,4)]/sum( NoPri[c(1,3,4)]) # proportion unassigned prey (note: different directions give different results) NoPri[2]/sum( NoPri[c(1:4)]) # proportion unassigned harbour seal ###### Some code to look at sequences without primer or taxanomic assignment (= 76,000 sequences) #length(AllSeqInfo$Seq[AllSeqInfo$Tag=="None" ]) # Number of seq with no primer/Tag #length(AllSeqInfo$Seq[AllSeqInfo$Tag=="None" & AllSeqInfo$Species=="None"]) # Number of these unassigned #DNAstringNoPriUn=DNAStringSet(AllSeqInfo$Seq[AllSeqInfo$Tag=="None" & AllSeqInfo$Species=="None"]) #names(DNAstringNoPriUn)=AllSeqInfo$IonNumber[AllSeqInfo$Tag=="None" & AllSeqInfo$Species=="None"] #writeXStringSet(DNAstringNoPriUn, file="NoPriUn.fasta", format="fasta") ##### Comparison of quality among MIDs tapply(ASI4$Qual,ASI4$MID,mean) tapply(ASI4$Qual[ASI4$Species=="Herring"],ASI4$MID[ASI4$Species=="Herring"],mean) tapply(ASI4$Qual[ASI4$Species=="Mackerel"],ASI4$MID[ASI4$Species=="Mackerel"],mean) tapply(ASI4$Qual[ASI4$Species=="Capelin"],ASI4$MID[ASI4$Species=="Capelin"],mean) boxplot(ASI4$Qual[ASI4$Species=="Herring"& ASI4$MID=="Ion05"],ASI4$Qual[ASI4$Species=="Herring"& ASI4$MID=="Ion04"]) evalq(sum(Qual[Species=="Herring"& MID=="Ion04"]>30)/sum(Qual[Species=="Herring"& MID=="Ion04"]>10) ,ASI4) evalq(sum(Qual[Species=="Herring"& MID=="Ion05"]>30)/sum(Qual[Species=="Herring"& MID=="Ion05"]>10) ,ASI4) evalq(sum(Qual[Species=="Mackerel"& MID=="Ion04"]>30)/sum(Qual[Species=="Mackerel"& MID=="Ion04"]>10) ,ASI4) evalq(sum(Qual[Species=="Mackerel"& MID=="Ion05"]>30)/sum(Qual[Species=="Mackerel"& MID=="Ion05"]>10) ,ASI4) ##### Quality Scores for different tags for each species (Rev primers only - to look at TagC effect) tapply(ASI4$Qual[ASI4$Species=="Herring"& ASI4$Primer=="Rev"],ASI4$Tag[ASI4$Species=="Herring"& ASI4$Primer=="Rev"],mean) tapply(ASI4$Qual[ASI4$Species=="Capelin"& ASI4$Primer=="Rev"],ASI4$Tag[ASI4$Species=="Capelin"& ASI4$Primer=="Rev"],mean) tapply(ASI4$Qual[ASI4$Species=="Mackerel"& ASI4$Primer=="Rev"],ASI4$Tag[ASI4$Species=="Mackerel"& ASI4$Primer=="Rev"],mean) #################################################################################### ############## Look at the good sequences. Forward and Reverse reads separately ASIclean=ASI4; dim(ASIclean) ASIclean[1:20,] ForCleanSeq=ASIclean[ASIclean$F_Tag!="NotF",] RevCleanSeq=ASIclean[ASIclean$R_Tag!="NotR",] AllSp=sort(unique(ASIclean$Species)) hist(ASIclean$Qual, col=7, main= "All Sequnces") ### Histogram of all quality scores in good sequences par(mfrow=c(2,1)) hist(ForCleanSeq$Qual, freq = FALSE, col=7,main="Forward Reads");mean(ForCleanSeq$Qual) hist(RevCleanSeq$Qual,freq = FALSE, col=7, main= "Reverse Reads"); mean(RevCleanSeq$Qual) ##################################################### ############ Put together data from individual samples (i.e. with unique tag and MID). Separate databases ############ for forward + reverse reads; gives species counts, % composition for a range of quality scores ##################################################### sort(unique(ASIclean$IndivID)) QualDat=c(seq(10,30,2)) #What quality score cut-offs to consider SampCountListF=list();Index=0# FORWARD for each Quality cut-off counts seqs for each sample and puts into list for( i in QualDat){ Ind_Count=evalq(tapply(factor(Species,levels=AllSp),IndivID,table),ForCleanSeq[ForCleanSeq$Qual>i,]) IC=unlist(Ind_Count) dim(IC)=c(4,3*MIDNum) IC=t(IC);colnames(IC)=c("Cap","HS","Her","Mac") IC=as.data.frame(IC[,c("Cap","Her","Mac","HS")]) IC$PreyTot=apply(IC[,c("Cap","Her","Mac")],1,sum) IC$CapPer=IC$Cap/IC$PreyTot;mean(IC$CapPer) IC$HerPer=IC$Her/IC$PreyTot;mean(IC$HerPer) IC$MacPer=IC$Mac/IC$PreyTot;mean(IC$MacPer) IC$Tag=c(rep("A",MIDNum), rep("B",MIDNum),rep("C",MIDNum)) IC$QCut=rep(i,3*MIDNum);IC$sampNum=c(1:(3*MIDNum));IC$sampNum2=c(1:MIDNum) Index=Index+1;SampCountListF[[Index]]=IC } SampCountListR=list();Index=0# REVERSE for( i in QualDat){ Ind_Count=evalq(tapply(factor(Species,levels=AllSp),IndivID,table),RevCleanSeq[RevCleanSeq$Qual>i,]) IC=unlist(Ind_Count) dim(IC)=c(4,3*MIDNum) IC=t(IC);colnames(IC)=c("Cap","HS","Her","Mac") IC=as.data.frame(IC[,c("Cap","Her","Mac","HS")]) IC$PreyTot=apply(IC[,c("Cap","Her","Mac")],1,sum) IC$CapPer=IC$Cap/IC$PreyTot;mean(IC$CapPer) IC$HerPer=IC$Her/IC$PreyTot;mean(IC$HerPer) IC$MacPer=IC$Mac/IC$PreyTot;mean(IC$MacPer) IC$Tag=c(rep("A",MIDNum), rep("B",MIDNum),rep("C",MIDNum)) IC$QCut=rep(i,3*MIDNum);IC$sampNum=c(1:(3*MIDNum));IC$sampNum2=c(1:MIDNum) Index=Index+1;SampCountListR[[Index]]=IC } Fcount= do.call("rbind", SampCountListF) Rcount= do.call("rbind", SampCountListR) head(Fcount) ### First look at the data by just combining forward and reverse read counts #Counts of F and R reads for all quality scores >10 cbind(Fcount [Fcount$QCut==10,1:5],Rcount [Rcount$QCut==10,1:5]) #This adds the forward and reverse count for each of the samples FR=Fcount [Fcount$QCut==10,1:5]+ Rcount [Rcount$QCut==10,1:5]; head(FR) # Cap Her Mac HS PreyTot FRCap=FR[,1]/FR[,5]; FRHer=FR[,2]/FR[,5]; FRMac=FR[,3]/FR[,5] mean(FRCap);sd(FRCap) mean(FRHer);sd(FRHer) mean(FRMac);sd(FRMac) # FIGURE 2 Plot (based on the Run1 100 bp dataset) Fish= c("Capelin","Herring","Mackerel") boxplot(FRCap,FRHer,FRMac,col=c(4,2,3),ylim=c(0,1),names =Fish,las=3,ylab= "Proportion of prey" ) points(c(1:3),c(.485,.34,.175),col="white",pch=17,cex=1.0) points(c(1:3),c(.485,.34,.175),col=c(4,2,3),pch=2,cex=1.2,lwd=2) #Alternately can give equal weights to forward and reverse reads FRequal= (Fcount [Fcount$QCut==10,6:8]+ Rcount [Rcount$QCut==10,6:8] )/2 boxplot(FRequal[,1],FRequal[,2],FRequal[,3],col=c(4,2,3),ylim=c(0,1)) #Look at the correlation between forward and Rev reads (exclude TagA due to bias in Run1) # In this case looking at capelin in F vs R plot(Fcount [Fcount$QCut==10&Fcount$Tag!="A",6 ] ,Rcount [Rcount$QCut==10&Rcount$Tag!="A",6]) cor(Fcount [Fcount$QCut==10&Fcount$Tag!="A",6 ] ,Rcount [Rcount$QCut==10&Rcount$Tag!="A",6]) head(Fcount) #################### # Proportions in individual samples # FIGURE 3 Plot (based on the Run1 100 bp dataset) IPD=Rcount # ... consider the F or R counts? head(IPD) windows(width=7, height=3.5);par(mfrow=c(1,2),mar=c(3,3,2,1)) barplot(t(as.matrix(IPD[IPD$QCut==10,6:8])),col=c(4,2,3),axisnames=F)#las=2,cex.names=.3) abline(h=c(0.485,(.485+.34)),lty=2,lwd=3,col="white") ### windows(width=3.5, height=5) boxplot(IPD$CapPer[IPD$QCut==10],IPD$HerPer[IPD$QCut==10],IPD$MacPer[IPD$QCut==10],col=c(4,2,3),ylim=c(0,1) ) mean(IPD$CapPer[IPD$QCut==10]);sd(IPD$CapPer[IPD$QCut==10]) mean(IPD$HerPer[IPD$QCut==10]);sd(IPD$HerPer[IPD$QCut==10]) mean(IPD$MacPer[IPD$QCut==10]);sd(IPD$MacPer[IPD$QCut==10]) #################### # Sequence counts #################### # Decrease in total reads as quality filtering goes up tapply( Fcount$PreyTot,Fcount$QCut,mean ); plot(unique(Fcount$QCut),tapply(Fcount$PreyTot,Fcount$QCut,mean)) tapply( Rcount$PreyTot,Rcount$QCut,mean ); plot(unique(Rcount$QCut),tapply(Rcount$PreyTot,Rcount$QCut,mean)) # FIGURE 4 Plot (based on the Run1 100 bp dataset) windows(width=7, height=4);par(mfrow=c(1,2),mar=c(4.5,4.5,3,2)) plot(unique(Fcount$QCut),tapply(Fcount$Her,Fcount$QCut,mean),ylim=c(0,1700),type= "l",col=2,lwd=2,xlab="Quality score" ,ylab="Mean sequence count") points(unique(Fcount$QCut),tapply(Fcount$Mac,Fcount$QCut,mean),type= "l",col=3,lwd=2) points(unique(Fcount$QCut),tapply(Fcount$Cap,Fcount$QCut,mean),type= "l",col=4,lwd=2) plot(unique(Rcount$QCut),tapply(Rcount$Her,Rcount$QCut,mean),ylim=c(0,1700),type= "l",col=2,lwd=2,xlab="Quality score" ,ylab="") points(unique(Rcount$QCut),tapply(Rcount$Mac,Rcount$QCut,mean),type= "l",col=3,lwd=2) points(unique(Rcount$QCut),tapply(Rcount$Cap,Rcount$QCut,mean),type= "l",col=4,lwd=2) legend(19, 1700, Fish, col = c(4,2,3),lty=1,cex=.8,lwd=2) # FIGURE 5 Plot (based on the Run1 100 bp dataset) windows(width=3.5, height=9) ;par(mfrow=c(3,1),mar=c(4.5,4.5,0.5,1)) plot(QualDat[QualDat>15],tapply(IPD$CapPer [ IPD$Tag=="A"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="A"& IPD$QCut>15],mean ),ylim=c(0,1),type="b",cex=.9, ylab="Proportion of sequences",xlab="", pch=15,col="blue",cex.lab=1.2,cex.axis=.9) # ,xaxt="n" points(QualDat[QualDat>15],tapply(IPD$CapPer [ IPD$Tag=="B"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="B"& IPD$QCut>15],mean ),ylim=c(0,.2),type="b", pch=17,col="blue") points(QualDat[QualDat>15],tapply(IPD$CapPer [ IPD$Tag=="C"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="C"& IPD$QCut>15],mean ),ylim=c(0,.2),type="b", pch=19,col="blue") plot(QualDat[QualDat>15],tapply(IPD$HerPer [ IPD$Tag=="A"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="A"& IPD$QCut>15],mean ),ylim=c(0,1),type="b",cex=.9, ylab="Proportion of sequences",xlab="", pch=15,col=2,cex.lab=1.2,cex.axis=.9)#,xaxt="n" points(QualDat[QualDat>15],tapply(IPD$HerPer [ IPD$Tag=="B"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="B"& IPD$QCut>15],mean ),ylim=c(0,1),type="b", pch=17,col=2) points(QualDat[QualDat>15],tapply(IPD$HerPer [ IPD$Tag=="C"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="C"& IPD$QCut>15],mean ),ylim=c(0,1),type="b", pch=19,col=2) plot(QualDat[QualDat>15],tapply(IPD$MacPer [ IPD$Tag=="A"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="A"& IPD$QCut>15],mean ),ylim=c(0,1),type="b",cex=.9, ylab="Proportion of sequences",xlab="Quality Score cutoff", pch=15,col=3,cex.lab=1.2,cex.axis=.9)# ,xaxp=c(11, 25, 7) points(QualDat[QualDat>15],tapply(IPD$MacPer [ IPD$Tag=="B"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="B"& IPD$QCut>15],mean ),ylim=c(0,1),type="b", pch=17,col=3) points(QualDat[QualDat>15],tapply(IPD$MacPer [ IPD$Tag=="C"& IPD$QCut>15], IPD$QCut[ IPD$Tag=="C"& IPD$QCut>15],mean ),ylim=c(0,1),type="b", pch=19,col=3) tapply(IPD$PreyTot,IPD$QCut,mean ) windows(width=3.5, height=5) boxplot(IPD$CapPer[IPD$QCut==18],IPD$HerPer[IPD$QCut==18],IPD$MacPer[IPD$QCut==18],col=c(4,2,3),ylim=c(0,1) ) windows(width=3.5, height=5) boxplot(IPD$CapPer[IPD$QCut==30],IPD$HerPer[IPD$QCut==30],IPD$MacPer[IPD$QCut==30],col=c(4,2,3),ylim=c(0,1) ) #################################### ### FIGURE 6 Plot - quality across sequences #################################### # Need to read files QbpRun1_100 =read.csv("Run1_100bp_Qbp.csv"); dim(QbpRun1_100) QualityBP=QbpRun1_100 QualBP1= QualityBP[AllSeqInfo$Primer!="NoPri",] # Remove those with no Primer QualBP2= QualBP1[ASI1$F_Tag!="F_NoTag",]# Remove those with no Tag on For QualBP3= QualBP2[ASI2$R_Tag!="R_NoTag",]# Remove those with no Tag on For QualBP4= QualBP3[ASI3$Species!="None",]# Remove those with no Tag on Rev dim(ASI4);dim(QualBP4) QualBPclean=QualBP4 QualBPcleanFor=QualBPclean[ASIclean$F_Tag!="NotF",] QualBPcleanRev=QualBPclean[ASIclean$R_Tag!="NotR",] levels(ForCleanSeq$Species)#[1] "None""Tet2" table(ForCleanSeq$Species) ## Look at the mean sequence for the three species AveCapF=apply (QualBPcleanFor[ForCleanSeq$Species=="Capelin", ],1,mean) AveHerF=apply (QualBPcleanFor[ForCleanSeq$Species=="Herring", ],1,mean) AveMacF=apply (QualBPcleanFor[ForCleanSeq$Species=="Mackerel", ],1,mean) #AvesealF=apply (QualBPcleanFor[ForCleanSeq$Species=="Harbour_seal", ],1,mean) Fish= c("Capelin","Herring","Mackerel") windows(width=2.5, height=4) boxplot(AveCapF,AveHerF,AveMacF, ylim=c(10,40),col=c(4,2,3), names =Fish,las=3,ylab= "Mean Sequence Quality" ) # boxplot(AveCapF,AveHerF,AveMacF,AvesealF, ylim=c(10,30)) windows(width=6.5, height=4) plot(apply(QualBPcleanFor[ForCleanSeq$Species=="Capelin", ],2,mean),type="l",col=4,ylim=c(14,37),yaxt="n",ylab="", xlab="Sequence position") lines(apply(QualBPcleanFor[ForCleanSeq$Species=="Herring", ],2,mean),type="l",col=2) lines(apply(QualBPcleanFor[ForCleanSeq$Species=="Mackerel", ],2,mean),type="l",col=3) #lines(apply(QualBPcleanFor[ForCleanSeq$Species=="Harbour_seal", ],2,mean),type="l",col="grey") legend(80, 37.5, Fish, col = c(4,2,3),lty=1,cex=.8) head (RevCleanSeq) ######## AveCapR=apply (QualBPcleanRev[RevCleanSeq$Species=="Capelin" & RevCleanSeq$Tag=="TagA", ],1,mean) AveHerR=apply (QualBPcleanRev[RevCleanSeq$Species=="Herring"& RevCleanSeq$Tag=="TagA", ],1,mean) AveMacR=apply (QualBPcleanRev[RevCleanSeq$Species=="Mackerel"& RevCleanSeq$Tag=="TagA", ],1,mean) #AvesealR=apply (QualBPcleanRev[RevCleanSeq$Species=="Harbour_seal", ],1,mean) Fish= c("Capelin","Herring","Mackerel") windows(width=2.5, height=4) boxplot(AveCapR,AveHerR,AveMacR, ylim=c(10,40),col=c(4,2,3), names =Fish,las=3,ylab= "Mean Sequence Quality" ) #,notch=TRUE # boxplot(AveCapF,AveHerF,AveMacF,AvesealF, ylim=c(10,30)) windows(width=6.5, height=4) plot(apply(QualBPcleanRev[RevCleanSeq$Species=="Capelin", ],2,mean),type="l",col=4,ylim=c(10,40),yaxt="n",ylab="", xlab="Sequence position") lines(apply(QualBPcleanRev[RevCleanSeq$Species=="Herring"& RevCleanSeq$Tag=="TagC", ],2,mean),type="l",col=2) lines(apply(QualBPcleanRev[RevCleanSeq$Species=="Mackerel"& RevCleanSeq$Tag=="TagC", ],2,mean),type="l",col=3) #lines(apply(QualBPcleanRev[RevCleanSeq$Species=="Harbour_seal", ],2,mean),type="l",col="grey") legend(80, 40.7, Fish, col = c(4,2,3),lty=1,cex=.8) mean (AveCapF) mean (AveHerF) mean (AveMacF) mean (AveCapR) mean (AveHerR) mean (AveMacR) # mean quality regardless of species for each position plot(apply(QualBPcleanRev,2,mean)) apply(QualBPcleanRev,2,mean) ########################################## ### Looking at homopolymer in the reverse primer head(RevCleanSeq$Seq) length(RevCleanSeq$Seq) length(grep("GGTCGCCC",RevCleanSeq$Seq)) ### In all sequences length(grep("GGTCGCCAAC",RevCleanSeq$Seq)) # 2 Cs length(grep("GGTCGCCCAAC",RevCleanSeq$Seq)) # 3 Cs length(grep("GGTCGCCCCAAC",RevCleanSeq$Seq)) # 4 Cs in homopolymer length(grep("GGTCGCCCCCAAC",RevCleanSeq$Seq)) # 5 Cs ### In all the sequences length(agrep("CCXXGGTCGCCAAC",AllSeqInfo$Seq,max.distance=2)) # 2 Cs length(agrep("CCXXGGTCGCCCAAC",AllSeqInfo$Seq,max.distance=2)) # 3 Cs length(agrep("CCXXGGTCGCCCCAAC",AllSeqInfo$Seq,max.distance=2)) # 4 Cs length(agrep("CCXXGGTCGCCCCCAAC",AllSeqInfo$Seq,max.distance=2)) # 5 Cs