################################################################################################# # STEPWISE THRESHOLD CLUSTERING: this script takes the output table generated the the readFNA script ################################################################################################# ######################################################################### ##################### SET PARAMETERS FOR STC ############################ ######################################################################### #minimum number of reads for genotyping min_reads <- 80 #maximum number of reads before sub-sampling max_reads <- 1000 #correct length for amplicon amp_length <- 213 #starting similarity for clustering (%) # set at minimum distance in distance matrix #maximum similarity before clustering stops (%) max_sim <- 97 #how many reps per round of clustering? reps <- 100 #size criterion threshold size_thresh <- 1/22 #dominance criterion threshold dom_thresh <- 4/1 #frequency cutoff for small alleles (# of samples in which the allele must appear as true allele) small_cutoff <- 3 #create sample specific sequence tables (TRUE/FALSE)? create_sequence_tables <- TRUE ###################################################################### # Things to do before STC can start ###################################################################### #libraries needed library(ape) library(seqinr) library(plyr) #source functions source(file = "stc_functions.R") #data files needed reads <- read.csv("raw_reads.csv") #table of raw reads bc_table <- read.csv("output/sample_barcodes.csv") #table of sample IDs and barcodes alls <- read.dna("output/sequences.fasta",format = "fasta", as.character = TRUE) #list of all sequences #convert IDs from numeric to original sample names reads <- convert.IDs(reads) #change sample IDs to reflect which samples are duplicates reads <- dup.IDs(reads,bc_table) #create list of sample IDs to use for clustering (skip if running the example sample!) samples <- data.frame(sampleID = unique(c(as.character(bc_table$sampleID), as.character(new_dup_names$sampleID)))) write(as.vector(samples$sampleID),"output/samples", ncolumns = 1) ##################################################################### ################### PHASE 1: SEQUENCE PREPARATION ################### ##################################################################### ######################################################################### # Create data tables ####################################################################### #empty tables for different cluster types all_TAs <- data.frame() all_smalls <- data.frame() all_clusters <- data.frame() #empty lists of samples that are skipped due to low read numbers samples_skipped <- c() #empty lists of samples that are sub-sampled samples_big <- rbind() #empty table for allele summary alleles_summary <- data.frame() samples_summary <- data.frame() #start sample summary table duplicates_summary <- data.frame(sample = A_samples, duplicate = "A") duplicates_summary <- rbind(duplicates_summary,data.frame(sampleID = paste(duplicates$sampleID,"b",sep = ""), duplicate = "B")) ###################################################################### #Start cycling through individual samples (Phases 2 and 3) ###################################################################### #if running the example sample from Stutz and Bolnick, 2014 sample <- samples[256,1] for (sample in samples[,1]){ #so you know what sample is currently active print (sample) #create a unique filename path for this sequence seqpath <- paste("output",sample,sep = "/") ############################################### #determine whether the sample should be genotyped and sub-sampled ############################################### #extract reads from sample sample_reads <- na.omit(reads[reads$sampleID == sample,]) sample_reads$sampleID <- factor(sample_reads$sampleID) sample_reads$seqID <- factor(sample_reads$seqID) #how many reads? read_numb <- nrow(sample_reads) #check if too few reads if (read_numb < min_reads){ info <- data.frame(sampleID = sample,read_numb) samples_skipped <- rbind(samples_skipped,info) write.csv(samples_skipped,"output/samples_skipped.csv", row.names = FALSE) #add data to summary table samp_summ <- data.frame(duplicates_summary[duplicates_summary$sampleID == sample, ], read_number = read_numb, read_number_used = 0, good_alleles = NA, small_clusters = NA, amb_clusters = NA ) samples_summary <- rbind(samples_summary,samp_summ) next() } #if there are more than 1000 reads if (read_numb > 1000){ info <- cbind(sample,read_numb) samples_big <- rbind(samples_big,info) write.table(samples_big,"output/samples_big.csv") #subsample reads down to 1000 without replacement sample_reads <- sample_reads[sample(nrow(sample_reads),1000, replace = FALSE),] #re-factor the sample sequence IDs sample_reads$seqID <- factor(sample_reads$seqID) } #add data to summary table samp_summ <- data.frame(duplicates_summary[duplicates_summary$sampleID == sample, ], read_number = read_numb, read_number_used = nrow(sample_reads) ) ##################################################################### ################### PHASE 2: SEQUENCE COMBINATION ################### ##################################################################### ############################################### # create alignment of sequences and distance matrix ############################################### #create table of the number of reads assigned to each sequence seq_table <- table(sample_reads$seqID) #which sequences are present in the given sample? seqs <- na.omit(names(table(sample_reads$seqID))) #extract sequence data for sequences sample_seqs <- alls[seqs] write.dna(sample_seqs,file = paste(seqpath,".fasta",sep = ""), format = "fasta") #align sequences using clustal aln_seqs <- align.seqs(sample_seqs) #create distance matrix dist <- dist.dna(aln_seqs, model = "JC69", as.matrix = TRUE) #convert distance matrix to percent similarity dist <- 100*(1-dist) #calcuate minimum distance in matrix min_dist <- min(dist) #write distance matrix file to file write.table(dist,file = paste(seqpath,"_dist.csv",sep = ""), sep = ",") ############################################### #create list of paired sequences to combine ############################################### #re-upload alignment in character format aln_seqs <- read.dna(file = paste(seqpath,".aln",sep = ""), format = "interleaved", as.character = TRUE) #order rownames from lowest hapID to highest (for efficiency) aln_seqs <- aln_seqs[order(as.numeric(rownames(aln_seqs))),] #make list of all possible pairs of sequences temp_seqs <- data.frame(ID = seqs, numb = seq(1:length(seqs))) all_pairs <- make.pairs(temp_seqs,seq_table) #create list of pairs to combine pairs_use <- c() for(pair in c(1:nrow(all_pairs))){ #how do the two sequences differ? diffs <- pair.diffs(pair) #add pair if pair falls into type 1,2, or 3 pairs_use <- rbind(pairs_use, type.pair(diffs)) } #adjust pairs pairs_use <- adjust.pairs(pairs_use) #write pairs to file write.csv(pairs_use, file = paste(seqpath,"_pairs.csv",sep = "")) ############################################## #combine pairs ############################################## #adjust table of sequences to account for combinations old_seq_table <- seq_table #combine pairs in sample reads table sample_reads <- ddply(sample_reads,.(seq.number),combine.pairs) #adjust sample read table to account for combinations sample_reads$seqID <- factor(sample_reads$seqID) #adjust table of sequences to account for combinations seq_table <- table(sample_reads$seqID) ##################################################################### ################### PHASE 3: CLUSTERING ############################# ##################################################################### ##################################################################### #initiate data tables ##################################################################### #new table for good clusters good_clusters <- rbind() #new table for small clusters small_clusters <- data.frame() #new table for true alleles true_alleles <- data.frame() #create table of sequences to start with and assign null cluster values start_seqs <<- data.frame(cluster = NA, sequence = names(seq_table), nreads = as.vector(seq_table)) #how many unique sequences to start? seq_remain <- nrow(start_seqs) ####################################################################### # run the clustering replicates ####################################################################### #set first cutoff cutoff <- round(min_dist) while(cutoff <= 97 & nrow(start_seqs) > 0){ #display current cutoff value #print(cutoff) #set existing clusters to have no value start_seqs$cluster <- NA #new table for ambiguous clusters amb_clusters <- data.frame() #perform the clustering replicates cluster_replicates <- repeat.cluster(cutoff,start_seqs,reps) #parse replicates and assign cluster names final_clusters <- parse.results(start_seqs,cluster_replicates) #pull good clusters good_clusters <- rbind(good_clusters, ddply(final_clusters,.(cluster),pull.goods)) #pull small clusters small_clusters <- rbind(small_clusters, ddply(final_clusters,.(cluster),pull.smalls)) #pull ambigous clusters amb_clusters <- rbind(amb_clusters, ddply(final_clusters,.(cluster),pull.ambs)) #start with ambigous clusters for next round start_seqs <- amb_clusters #if no new clusters were formed, skip a round of clustering if(nrow(amb_clusters) == nrow(cluster_replicates)) { #if cutoff is less than 80 if(cutoff < 80){ cutoff <- cutoff + 5 #if cutoff is greater than 80... } else { #but less than 90... if(cutoff < 90){ cutoff <- cutoff + 3 #adn greater than 90 } else { cutoff <- cutoff + 1 } } } else { cutoff <- cutoff + 1 } } #create table of all clusters for sample sample_clusters <- rbind(good_clusters,small_clusters,amb_clusters) sample_clusters <- data.frame(sample_clusters,sampleID = sample) ############################################################################ # divide ambiguous clusters among the top two alleles ############################################################################ #which clusters are ambigous? amb_cluster_names <- names(table(amb_clusters$cluster)) #parse ambiguous clusters for(name in amb_cluster_names){ #pull cluster cl <- amb_clusters[amb_clusters$cluster == name,] #calculate summary stats for ambiguous cluster summary <- amb.summary(cl) #should the cluster be re-classified as small? small_clusters <- rbind(small_clusters,check.smalls(summary,cl)) #should the cluster be re-classified as one good cluster? good_clusters <- rbind(good_clusters, check.good(summary,cl)) } #create table of true alleles from existing good clusters true_alleles <- ddply(good_clusters,.(cluster), collapse.goods) ######################################################## #create summary tables for sequences from each sample ######################################################## if(create_sequence_tables == TRUE){ #create table sequence_summary <- create.sequence.summary(old_seq_table) #account for sequences that were combined during phase 2 sequence_summary <- ddply(sequence_summary,.(sequence), combined.seqs) #write to file sequence_summary <- sequence_summary[order(sequence_summary$cluster),] write.csv(sequence_summary,paste(seqpath,"_sequence_summary.csv",sep = "")) } ########################################################################## #clean up ########################################################################## #add sampleID to small clusters and good clusters if(nrow(small_clusters) > 0){ small_clusters <- data.frame(small_clusters,sampleID = sample) } good_clusters <- data.frame(good_clusters,sampleID = sample) #remove "cluster", but keep "allele", in true allele table true_alleles <- true_alleles[,-1] #create fasta file of true alleles for sample true_seqs <- alls[names(table(factor(true_alleles$allele)))] ########################################################################## #add sample results to global data tables ########################################################################## #append true alleles to global true allele file all_TAs <- rbind(all_TAs,true_alleles) #append small alleles to global small allele file all_smalls <- rbind(all_smalls, small_clusters) #append all clusters to global clusters file all_clusters <- rbind(all_clusters, sample_clusters) #create sample summary samp_summ <- data.frame(samp_summ, good_alleles = nrow(true_alleles), small_clusters = length(table(small_clusters$cluster)), amb_clusters = length(table(amb_clusters$cluster)) ) samples_summary <- rbind(samples_summary,samp_summ) ############################################################################ # write data to file ############################################################################ #write individual fish file for TAs write.csv(true_alleles, file = paste(seqpath,"_TAs.csv",sep = "")) #write individual fish file for clusters write.csv(sample_clusters, file = paste(seqpath,"_clusters.csv",sep = "")) #write small alleles fish file write.csv(small_clusters,file = paste(seqpath,"_smalls.csv",sep = "")) #write fasta file for true allele sequences for fish to file write.dna(true_seqs,file = paste(seqpath,"_true.fasta",sep = ""), format = "fasta") #write current sample summary write.csv(samples_summary, "output/samples_summary.csv", row.names = FALSE) #update cluster files write.csv(all_TAs,"output/all_TAs_phase3.csv", row.names = FALSE) write.csv(all_smalls,"output/all_smalls.csv", row.names = FALSE) write.csv(all_clusters,"output/all_clusters.csv", row.names = FALSE) } ##################################################################### ################ PHASE 4: SMALL CLUSTER CROSS CHECKING ############## ##################################################################### #reload true allele and small cluster tables all_TAs <- read.csv("output/all_TAs_phase3.csv") all_smalls <- read.csv("output/all_smalls.csv") samples_summary <- read.csv("output/samples_summary.csv") #add a good status to indicate alleles that did not drop out all_TAs$status <- "good" #change small clusters to small alleles names(all_smalls)[1] <- "allele" #create table of current true alleles alleles <- table(all_TAs$allele) #Which true alleles will be used for cross-checking? alleles.cc <- names(alleles[alleles >= small_cutoff]) #add small clusters as true alleles if they match common alleles d_ply(all_smalls, .(sampleID,allele), cross.check) #resort by sampleID all_TAs <- all_TAs[order(all_TAs$sampleID), ] #create preliminary list of TAs, fasta file, and alignment final.list(all_TAs,alls) ##################################################################### ########### PHASE 4: IDENTIFY AND REMOVE CHIMERIC ALLELES ########### ##################################################################### #re-upload nucleic acid alignment aln_TAs <- read.dna("output/TAs.aln", format = "interleaved", as.character = TRUE) #source chimera functions source(file = "chimeras_functions.R") ####################### #parameters to set ####################### #maximum number sites at which daughter sequence can match NEITHER putative parent no_match <- 2 #set lower bound for proportion of informative sites that must be consistent with recombinant daughter proportion <- 0.98 ############################################## #find all the possible recombinants in each sample ############################################## possible_recombinants <- ddply(all_TAs,.(sampleID), find.recombs) #write possibles to file write.csv(possible_recombinants, "output/possible_recombinants.csv", row.names = FALSE) ############################################### #determine if any of the possible recombinants are likely chimeras ############################################### #load the list of possible recombinants recomb_dat <- read.csv("output/possible_recombinants.csv") #in how many samples was each allele found in overall? allele_totals <- table(all_TAs$allele) #cycle through recombinant sequences possible_chimeras <- ddply(recomb_dat,.(seqID), find.chimeras) #sort by how the percentage match possible_chimeras <- possible_chimeras[order(possible_chimeras$proportion_match, decreasing = TRUE),] #write results to file write.csv(possible_chimeras, "output/chimera_results.csv", row.names = FALSE) ############################################### # remove likely chimeras from final results ############################################### #load list of possible chimeras possible_chimeras <- read.csv("output/chimera_results.csv") #count as chimeras only if potential parent sequences were found in all samples where the putative chimera occured chimeras <- possible_chimeras[possible_chimeras$proportion_match == 1,] #if there are no chimeras if(length(chimeras) == 0){ chimeras <- data.frame(allele = NA) } #indicate which TAs are chimeras all_TAs <- ddply(all_TAs, .(allele), indicate.chimeras) #resort by sampleID all_TAs <- all_TAs[order(all_TAs$sampleID), ] #update sequence summary tables to account for chimeras and dropped alleles if(create_sequence_tables == TRUE){ #which samples were skipped? skip <- read.csv("output/samples_skipped.csv", sep = ",") #cycle through samples and update sequency summaries ddply(samples, .(sampleID), update.sequence.summary) } ##################################################################### ########### POST STC CLEAN UP AND DATA PROCESSING ################### ##################################################################### ######################################################## # create a summary table of alleles ######################################################## alleles_summary <- ddply(all_TAs, .(allele), create.allele.summary) #sort most common to least common alleles_summary <- alleles_summary[order(alleles_summary$samples_found, decreasing = TRUE),] #write summary to file write.csv(alleles_summary, "output/alleles_summary.csv", row.names = FALSE) #indicate which TAs are not the correct length all_TAs <- ddply(all_TAs, .(allele), indicate.lengths) ###################################################################################### # update sample summary table ###################################################################################### #load exisiting table samples_summary <- read.csv("output/samples_summary.csv") #add chimera, length, and true allele columns samples_summary <- merge(samples_summary, ddply(all_TAs,.(sampleID), new.TAs), by = c("sampleID"), all.x = TRUE ) #output updated table write.csv(samples_summary, "output/samples_summary.csv", row.names = FALSE) ###################################################################################### # Create list of final true alleles along with a fasta file and alignment of just true alleles ###################################################################################### #write new list of TAs to file write.csv(all_TAs, "output/all_TAs_phase4.csv", row.names = FALSE) #upload fasta file of reads from the run alls <- read.dna("output/sequences.fasta",format = "fasta", as.character = TRUE) #create list of TAs, fasta file, and alignment final.list(all_TAs,alls) ############################################################### # Create sample wise data table to use in subsequent analyses ############################################################### #write table of alleles all_TAs_use <- all_TAs[all_TAs$correct_length == "yes" & all_TAs$chimera == "no",] data_use <- table(all_TAs_use$sampleID, all_TAs_use$allele) write.csv(data_use,"output/data_use.csv") #re-upload and turn into useable table data_use <- data.frame(read.csv("output/data_use.csv")) colnames(data_use)[1] <- "sampleID" #merge with sample summary data_use <- merge(samples_summary, data_use, by = c("sampleID"), all.x = TRUE) #write to file write.csv(data_use,"output/data_use.csv", row.names = FALSE)