# DADA2 Analysis CTRI Survey
# Data gathered 2017
# Analysis Mar 9 2018

library(dada2)

path <- "~/Documents/Illumina_data/2017/Hummingbird/VannetteHummingbird16S" # CHANGE ME to the directory containing the fastq files after unzipping.
list.files(path)

# Filter and trim
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))

# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)

# Examine quality profiles of reads
plotQualityProfile(fnFs[1])
plotQualityProfile(fnRs[5])

# Perform filtering and trimming
filt_path <- file.path(path, "filtered") # Place filtered files in filtered/ subdirectory
filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz"))

# Filter forward and reverse reads, using recommended parameters
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(280,190),
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=TRUE, trimLeft=10) # On Windows set multithread=FALSE
head(out)

# Learn error rates
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)

# Plot to see if error rates are reasonable
plotErrors(errF, nominalQ=TRUE)

# Dereplicate
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names

# Apply the core sample-inference (denoising) algorithm
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)

# Inspect the dada-class object for the first sample
dadaFs[[1]]

# Merge paired and denoised reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
# Inspect the merger data.frame from the first sample
head(mergers[[1]])

# Make sequence table from the merged reads
# (was makeSequenceTable(dadaFs), which would table only the unmerged forward reads)
seqtab <- makeSequenceTable(mergers)
dim(seqtab)

# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))

# Optional: remove sequences much longer or shorter than expected;
# if used, substitute seqtab2 for seqtab in the steps below
seqtab2 <- seqtab[, nchar(colnames(seqtab)) %in% seq(240, 256)]

# Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim)/sum(seqtab)

# Track the number of reads that make it through the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN),
               rowSums(seqtab), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
rownames(track) <- sample.names
head(track)

# Assign taxonomy
taxa <- assignTaxonomy(seqtab.nochim, "~/Box Sync/R files/Training/silva_nr_v128_train_set.fa.gz", multithread=TRUE)

# Inspect taxonomic assignments
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print, n=20)

################
# Downstream analyses
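
# The original script ends at the header above. As a minimal sketch of how
# these outputs could feed downstream work, the block below saves the key
# objects and combines seqtab.nochim and taxa into a phyloseq object.
# The metadata file "sample_metadata.csv" and its format are ASSUMPTIONS,
# not part of this survey's files; adapt the path and columns to the actual
# sample sheet before running.

# Save intermediate outputs so downstream analyses can restart without
# rerunning the denoising steps
saveRDS(seqtab.nochim, file.path(path, "seqtab_nochim.rds"))
saveRDS(taxa, file.path(path, "taxa.rds"))

library(phyloseq)

# Hypothetical metadata table: one row per sample, with rownames matching sample.names
samdf <- read.csv("sample_metadata.csv", row.names = 1) # ASSUMED file name and format

# Combine the ASV table, sample metadata, and taxonomy into one phyloseq object
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows = FALSE),
               sample_data(samdf),
               tax_table(taxa))
ps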