#!/usr/bin/env Rscript # USAGE: FilterBarcoding.R [File] [Sample name] [control name] [separator] [minimum # reads] # [filter select (01234)] [min subsample repeats] [min sample repeats] [max control repeats] # set arguments file_name=commandArgs(trailingOnly = TRUE)[1] sample_name=commandArgs(trailingOnly = TRUE)[2] control_name=commandArgs(trailingOnly = TRUE)[3] # separator if (is.na(commandArgs(trailingOnly = TRUE)[4])) { separator="\t" } else separator=commandArgs(trailingOnly = TRUE)[4] # minimum # reads if (is.na(commandArgs(trailingOnly = TRUE)[5])) { min_read=1 } else min_read=as.numeric(commandArgs(trailingOnly = TRUE)[5]) # minimum filter select if (is.na(commandArgs(trailingOnly = TRUE)[6])) { filter_select="0" } else filter_select=commandArgs(trailingOnly = TRUE)[6] # minimum subsample repeats if (is.na(commandArgs(trailingOnly = TRUE)[7])) { min_subsample_rep=1 } else min_subsample_rep=as.numeric(commandArgs(trailingOnly = TRUE)[7]) # minimum sample repeats if (is.na(commandArgs(trailingOnly = TRUE)[8])) { min_sample_rep=1 } else min_sample_rep=as.numeric(commandArgs(trailingOnly = TRUE)[8]) # maximum blank / negative repeats if (is.na(commandArgs(trailingOnly = TRUE)[9])) { max_blank_neg_rep=1 } else max_blank_neg_rep=as.integer(commandArgs(trailingOnly = TRUE)[9]) ########################################################################################################### # SET VALUES MANUALLY #file_name="Hubia2_100_mammal.csv" #sample_name="Hubia2" #control_name="C" #separator="\t" #min_read=10 #filter_select="12" #min_subsample_rep=1 #min_sample_rep=2 #max_blank_neg_rep=1 ########################################################################################################### # dna data, read also in two files, the first 6 lines including names # and the rest with the data dna=read.table(file_name,skip=1,sep=separator,dec=".",quote="\"") names.dna=read.table(file_name,nrows=1,sep=separator, colClasses="character",quote="\"") colnames(dna)=names.dna[1,] colnames(names.dna)=names.dna[1,] # splitting the data in three components, blank, negative, and sample dna.control.sel=which(grepl(control_name,names.dna[1,])) dna.sample.sel=which(grepl(sample_name,names.dna[1,])) dna.control=dna[,dna.control.sel] dna.sample=dna[,dna.sample.sel] ################################################################## # control data # Filter the control data with the min X [user provided] read criteria dna.control[dna.control0]=1 # number of PCR per sample dna.control.PCR=array(0,dim=c(nrow(dna.control),round(ncol(dna.control)/8))) for (i in 1:round(ncol(dna.control)/8)) dna.control.PCR[,i]=apply(dna.control.01[,(1+8*(i-1)):(8*i)],MAR=1,sum) # calculate mean only using samples with reads dna.control.sub.mean=matrix(NA,nrow=nrow(dna.control),ncol=1) for (i in 1:nrow(dna.control)) dna.control.sub.mean[i,]=rowSums(dna.control[i,])/(length(which(dna.control.PCR[i,]>0))*8) # summary file with mean seq/PCR, max seq, mean number of PCR with seq and max number of PCR per sample dna.control.sum=cbind.data.frame(family_name=dna$family_name,genus_name=dna$genus_name, mean_t=apply(dna.control,MAR=1,mean),max_t=apply(dna.control,MAR=1,max), PCR_s=apply(dna.control.01,MAR=1,mean),max_PCR=apply(dna.control.PCR,MAR=1,max), sub_mean=dna.control.sub.mean[,1],median=apply(dna.control,MAR=1,function(dna.control) {median(dna.control[dna.control>0])}),sum=apply(dna.control,MAR=1,sum)) ############################################# # lake sediment samples # Filter sediment data with the min X [user provided] read criteria dna.sample[dna.sample0]=1 # number of PCR per sample dna.sample.PCR=array(0,dim=c(nrow(dna.sample),round(ncol(dna.sample)/8))) for (i in 1:round(ncol(dna.sample)/8)) dna.sample.PCR[,i]=apply(dna.sample.01[,(1+8*(i-1)):(8*i)],MAR=1,sum) # Remove the each sampling point with less than X replicates [user provided] for (i in 1:ncol(dna.sample.PCR)){ for (j in 1:nrow(dna.sample.PCR)){ if (dna.sample.PCR[j,i]0]=1 # recalculate the number of PCR repeats dna.sample.PCR=array(0,dim=c(nrow(dna.sample),round(ncol(dna.sample)/8))) for (i in 1:round(ncol(dna.sample)/8)) dna.sample.PCR[,i]=apply(dna.sample.01[,(1+8*(i-1)):(8*i)],MAR=1,sum) # calculate mean only using samples with reads dna.sample.sub.mean=matrix(NA,nrow=nrow(dna.sample),ncol=1) for (i in 1:nrow(dna.sample)) dna.sample.sub.mean[i,]=rowSums(dna.sample[i,])/(length(which(dna.sample.PCR[i,]>0))*8) # summary file with mean seq/PCR, max seq, mean number of PCR with seq and max number of PCR per sample dna.sample.sum=cbind.data.frame(family_name=dna$family_name,genus_name=dna$genus_name, mean_t=apply(dna.sample,MAR=1,mean),max_t=apply(dna.sample,MAR=1,max), PCR_s=apply(dna.sample.01,MAR=1,mean),max_PCR=apply(dna.sample.PCR,MAR=1,max), sub_mean=dna.sample.sub.mean[,1],median=apply(dna.sample,MAR=1,function(dna.sample) {median(dna.sample[dna.sample>0])}),sum=apply(dna.sample,MAR=1,sum)) ############################################# # FILTER STEP! # This is where the filting of the DNA data takes place # THE Different filter options final_select=dna.sample.sum$sum>1 # base filter to remove taxa without presence in the samples # Filter by comparing the mean # reads between the samples and the negative controls / blanks mean_filter=(dna.sample.sum$sub_mean >= dna.control.sum$mean_t) # Filter by minimum PCR repeats of the samples min_PCR_rep=(dna.sample.sum$max_PCR>=min_sample_rep) # Filter by maximum PCR repeats of the blanks / negatives max_PCR_rep=(dna.control.sum$max_PCR<=max_blank_neg_rep) # Filter by comparing the PCR repeats compare_PCR_rep=(dna.sample.sum$max_PCR > dna.control.sum$max_PCR) # Add up the selected filters # mean filter if (grepl("1",filter_select)) { final_select=final_select&mean_filter } # minimum PCR repeats in samples if (grepl("2",filter_select)) { final_select=final_select&min_PCR_rep } # maximum PCR repeats of blank / negatives if (grepl("3",filter_select)) { final_select=final_select&max_PCR_rep } # keep sample > blank / negatives repeats if (grepl("4",filter_select)) { final_select=final_select&compare_PCR_rep } # Apply filter dna.match=dna[final_select,] # get headers dna.match.header=rbind(names.dna,dna.match) # Set output name (input file name + the select filter settings) out_name=gsub('\\.tsv', paste(paste("",min_read,filter_select,min_subsample_rep,min_sample_rep,max_blank_neg_rep,sep="_"),".csv",sep=""),file_name) # print output to a new file write.table(dna.match.header, file=out_name, quote=FALSE, sep="\t", col.names=FALSE, row.names=FALSE)