#BiocManager::install("dada2") library(dada2); packageVersion("dada2") path <- "~"#/Volumes/matty/MiSeqData/GMCFastqs/GMCFastqsTrimmedFastqs/" # CHANGE ME to the directory containing the fastq files after unzipping. list.files(path) # Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE)) fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE)) # Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1) sample.names plotQualityProfile(fnFs[1:10]) plotQualityProfile(fnRs[1:10]) # Place filtered files in filtered/ subdirectory filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz")) filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz")) # Change trunclen to length based on plotQualityProfile out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(150,150), maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE out #Stop and check that a high proportion of reads passed filtering. If not, change settings. errF <- learnErrors(filtFs, multithread=TRUE) errR <- learnErrors(filtRs, multithread=TRUE) plotErrors(errF, nominalQ=TRUE) plotErrors(errR, nominalQ=TRUE) derepFs <- derepFastq(filtFs, verbose=TRUE) derepRs <- derepFastq(filtRs, verbose=TRUE) # Name the derep-class objects by the sample names names(derepFs) <- sample.names names(derepRs) <- sample.names dadaFs <- dada(derepFs, err=errF, multithread=TRUE) dadaRs <- dada(derepRs, err=errR, multithread=TRUE) # dadaFs[[1]] mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE) # Inspect the merger data.frame from the first sample head(mergers[[1]]) seqtab <- makeSequenceTable(mergers) dim(seqtab) # Inspect distribution of sequence lengths table(nchar(getSequences(seqtab))) seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE) dim(seqtab.nochim) sum(seqtab.nochim)/sum(seqtab) table(nchar(getSequences(seqtab.nochim))) getN <- function(x) sum(getUniques(x)) #Multiple samples track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim)) # If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs) # track <- cbind(out, getN(dadaFs), getN(dadaRs), getN(mergers), rowSums(seqtab.nochim)) colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") rownames(track) <- sample.names head(track) track #Write Seq Tab to File write.table(track, "/Volumes/matty/MiSeqData/GMCFastqs/track.txt", append = FALSE, sep = '\t') write.table(seqtab.nochim, "/Volumes/matty/MiSeqData/GMCFastqs/SeqTab.txt", append = FALSE, sep = "\t") #to inspect fastqs fn <- "~"#/Volumes/matty/MiSeqData/Run190303Methods-2/GFL/GFLTrimmedFastqs2/fBKZ12_R2_001.fastq" foo <- readLines(fn) seqlens <- sapply(foo[seq(2,length(foo),4)], nchar) quallens <- sapply(foo[seq(4,length(foo),4)], nchar) table(seqlens); table(quallens) quallens != seqlens