######################################################################
# Contains all functions needed for running the stc.R script
######################################################################
# convert.IDs
# dup.IDs
# adjust.aln
# align.seqs
# make.pairs
# pair.diffs
# pair.info
# type.pair
# seq.one.correct
# adjust.pairs
# combine.pairs
######################################################################

######################################################################
#functions for converting IDs and dealing with duplicate ID names
######################################################################
######################################################################

#converts IDs from sequencing IDs to original IDs
convert.IDs <- function(reads){
  #split reads into identified and non-identified reads
  A <- reads[is.na(reads$genericID) == FALSE,]
  B <- reads[is.na(reads$genericID) == TRUE,]
  #add sampleIDs that correspond to genericIDs
  new <- ddply(A, .(genericID), match.IDs)
  #add on NA rows
  B$sampleID <- NA
  new <- rbind(new,B)
  #write to file
  if (nrow(new) == nrow(reads)){
    write.csv(new,"output/reads_converted.csv")
  }
  return(new)
}

#matches IDs between reads table and barcode table
match.IDs <- function(genID){
  #which sampleID matches the generic ID
  match <- bc_table[bc_table$genericID == genID$genericID[1],"sampleID"]
  #change IDs
  genID$sampleID <- match
  #output new data
  return(genID)
}

#changes the fishIDs for duplicated samples
dup.IDs <- function(reads,bc_table){
  #which samples (if any) were run in duplicate (or more)
  reptab <- table(bc_table$sampleID)
  dups <- reptab[reptab > 1]
  sings <- reptab[reptab == 1]
  #if there are duplicates
  if(length(dups) > 0){
    #list of samples run in duplicate
    duplicates <<- data.frame("sampleID" = names(dups))
    singles <<- data.frame("sampleID" = names(sings))
    A_samples <<- rbind(singles,duplicates)
    write.csv(duplicates,"output/duplicate_samples.csv", row.names = FALSE)
    #subset reads from duplicated and non-duplicated samples
    dup_reads <- reads[reads$sampleID %in% duplicates$sampleID,]
    non_reads <- reads[reads$sampleID %in% singles$sampleID,]
    dup_reads$sampleID <- as.character(dup_reads$sampleID)
    #subset barcode table for duplicated samples
    dup_bc_table <- bc_table[bc_table$sampleID %in% duplicates$sampleID,]
    dup_bc_table <- dup_bc_table[order(dup_bc_table$sampleID,dup_bc_table$fbc),]
    dup_bc_table$sampleID <- as.character(dup_bc_table$sampleID)
    #rename duplicate samples in barcode table
    new_duplicates <- NULL
    for (i in duplicates$sampleID){
      new <- dup_bc_table[dup_bc_table$sampleID == i,]
      new$sampleID[2] <- paste(new$sampleID[1],"b", sep = "")
      new_duplicates <- rbind(new_duplicates,new)
    }
    dup_bc_table <- new_duplicates
    #create barcode table with new duplicate names (i.e. those including "b")
    new_dup_names <<- data.frame(sampleID = dup_bc_table$sampleID)
    #change duplicate sample IDs to include "b" samples
    for (i in c(1:nrow(dup_reads))){
      for (j in c(1:nrow(dup_bc_table))){
        if (dup_reads$fbc[i] == dup_bc_table$fbc[j] & dup_reads$rbc[i] == dup_bc_table$rbc[j]) {
          dup_reads$sampleID[i] <- dup_bc_table$sampleID[j]
        }
      }
    }
    #combine reads from duplicated and non-duplicated samples
    new_reads <- rbind(non_reads,dup_reads)
    #write updated table of reads to file
    write.csv(new_reads,"output/reads.csv", row.names = FALSE)
    output <- new_reads
  } else {
    #write updated table of reads to file
    write.csv(reads,"output/reads.csv", row.names = FALSE)
    output <- reads
  }
  return(output)
}

######################################################################
######################################################################
#functions for aligning sequences
######################################################################
######################################################################

#perform the alignment and write to file
align.seqs <- function(sample_seqs){
  aln_seqs <- clustal(sample_seqs, exec = "clustalw2", quiet = TRUE, gapopen = 15, gapext = 0.66)
  #write alignment to file
  write.dna(aln_seqs,file = paste(seqpath,".aln",sep = ""), format = "interleaved")
  #re-upload alignment in character format
  aln_seqs <- read.dna(file = paste(seqpath,".aln",sep = ""), format = "interleaved", as.character = TRUE)
  #manually adjust alignments
  aln_seqs <- t(apply(aln_seqs,1,adjust.aln))
  #write alignment to file
  write.dna(aln_seqs,file = paste(seqpath,".aln",sep = ""), format = "interleaved")
  #re-upload alignment in binary (DNAbin) format
  aln_seqs <- read.dna(file = paste(seqpath,".aln",sep = ""), format = "interleaved", as.character = FALSE)
  return(aln_seqs)
}

#hand-adjust alignments to correct artifacts at the first base pair
#(a leading gap followed by "t" is swapped so the sequence starts with "t")
adjust.aln <- function(row_i) {
  if(row_i[1] == "-" & row_i[2] == "t"){
    row_i[1] <- "t"
    row_i[2] <- "-"
  }
  return(row_i)
}
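## Illustrative usage sketch (not part of the pipeline): one way the ID and alignment
## helpers above might be called from stc.R. Assumes plyr and ape are loaded, that
## `reads` and `bc_table` are data frames with the columns referenced above, that
## clustalw2 is on the PATH, and that `seqpath` and `sample_seqs` are set per sample;
## object names here are placeholders, not necessarily those used in stc.R.
# reads    <- convert.IDs(reads)          #map sequencing IDs to sample IDs
# reads    <- dup.IDs(reads, bc_table)    #rename replicate samples with a "b" suffix
# aln_seqs <- align.seqs(sample_seqs)     #align one sample's reads and hand-adjust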
######################################################################
######################################################################
# Functions for creating a table of pairs to combine
######################################################################
######################################################################

####################
#create a table of all unique pairwise combinations of alleles
####################
make.pairs <- function(seqs,seq_table){
  output <- ddply(temp_seqs,.(numb),mp)
  names(output) <- c("pairID","seq1","seq2")
  #remove pairs where both are singletons
  non_singletons <- names(seq_table[seq_table > 1])
  output <- output[output$seq1 %in% non_singletons | output$seq2 %in% non_singletons,]
  return(output)
}

mp <- function(s1){
  temp_seqs2 <- temp_seqs[c(s1$numb:nrow(temp_seqs)),]
  if(s1$numb == nrow(temp_seqs)){
    out <- data.frame(s1$ID, x = s1$ID)
  } else {
    x <- apply(temp_seqs2,1,mp2)
    out <- data.frame(s1$ID, x)
  }
  return(out)
}

mp2 <- function(s2){
  out <- (s2[1])
  return(out)
}

####################
#characterize differences between two given sequences
####################
pair.diffs <- function(pair) {
  #extract the mismatch base pairs for each pair of alleles
  allele_a <- as.character(all_pairs[pair,2])
  allele_b <- as.character(all_pairs[pair,3])
  aln_sub <- aln_seqs[c(allele_a,allele_b),]
  #if the two alleles are really similar, re-align to correct for rare alignment errors
  if(dist[allele_a,allele_b] > 100 & allele_a != allele_b){
    sub_alleles <- alls[c(allele_a,allele_b)]
    sub_aln <- clustal(sub_alleles, exec = "clustalw2", quiet = TRUE, gapopen = 15, gapext = 0.66)
    write.dna(sub_aln,file = "output/sub_aln", format = "interleaved")
    aln_sub <- read.dna("output/sub_aln", format = "interleaved", as.character = TRUE)
  }
  seq_a <- aln_sub[1,]
  seq_b <- aln_sub[2,]
  nreads_a <- seq_table[names(seq_table) == allele_a]
  nreads_b <- seq_table[names(seq_table) == allele_b]
  x_cols <- c()
  mismatch_a <- c()
  mismatch_b <- c()
  for (i in c(1:length(seq_a))){
    if (seq_a[i] != seq_b[i]){
      x_cols <- c(x_cols,i)
      mismatch_a <- c(mismatch_a,seq_a[i])
      mismatch_b <- c(mismatch_b,seq_b[i])
    }
  }
  #for pairs with no mismatches
  if(length(mismatch_a) == 0){
    mismatch_a <- 0
    mismatch_b <- 0
    x_cols <- 0
  }
  output <- list(
    "pairID" = pair,
    "columns" = x_cols,
    "allele_one" = allele_a,
    "allele_two" = allele_b,
    "mismatch_a" = mismatch_a,
    "mismatch_b" = mismatch_b,
    "nreads_a" = as.vector(nreads_a),
    "nreads_b" = as.vector(nreads_b)
  )
  return(output)
}

####################
# determine if a given pair is useable
####################
type.pair <- function(diffs){
  diff_numb <- length(diffs$columns)
  a <- diffs$mismatch_a
  b <- diffs$mismatch_b
  if(diff_numb > 3 | diff_numb == 0){
    return()
  }
  #type 1 pairs
  if (diff_numb == 1 & (a == "-" || b == "-")){
    return(pair.info(type = 1, diffs))
  }
  if (diff_numb == 2){
    #type 2 pairs
    if((a[1] == "-" & b[2] == "-") | (a[2] == "-" & b[1] == "-")) {
      return(pair.info(type = 2, diffs))
    }
    #type 3 pairs
    ab <- c(a,b)
    if(sum(ab == "-") == 1){
      return(pair.info(type = 3, diffs))
    }
  }
  if (diff_numb == 3){
    ab <- c(a,b)
    if(sum(ab == "-") == 1){
      return(pair.info(type = 3, diffs))
    }
  }
}

####################
# add pair info to list of pairs
####################
pair.info <- function(type, diffs){
  #pull sequence from indel_diffs_all table
  seq1 <- aln_seqs[rownames(aln_seqs) == diffs$allele_one,]
  seq2 <- aln_seqs[rownames(aln_seqs) == diffs$allele_two,]
  #remove indels from sequences
  seq1 <- seq1[seq1 != "-"]
  seq2 <- seq2[seq2 != "-"]
  #calculate lengths
  l1 <- length(seq1)
  l2 <- length(seq2)
  #combine info
  info <- data.frame(
    pairID = diffs$pairID,
    seq1 = diffs$allele_one,
    seq2 = diffs$allele_two,
    nreads1 = diffs$nreads_a,
    nreads2 = diffs$nreads_b,
    l1,
    l2,
    type = type
  )
  return(info)
}

####################
#reorder rows so that sequence one has the correct length
####################
seq.one.correct <- function(row_i){
  require(plyr)
  #create new output row
  row_out <- row_i
  if((row_out$l1 != amp_length) | ((row_out$l1 == row_out$l2) & (row_out$nreads2 > row_out$nreads1))){
    #swap values
    row_out$seq1 <- row_i$seq2
    row_out$seq2 <- row_i$seq1
    row_out$nreads1 <- row_i$nreads2
    row_out$nreads2 <- row_i$nreads1
    row_out$l1 <- row_i$l2
    row_out$l2 <- row_i$l1
  }
  return(row_out)
}

####################
#adjust list of pairs to combine
####################
adjust.pairs <- function(seqs, dist, seqpath){
  #remove pairs where neither sequence is amp_length base pairs
  pairs_use <- pairs_use[pairs_use$l1 == amp_length | pairs_use$l2 == amp_length,]
  #change pairs so the first sequence is the one that is amp_length base pairs long
  pairs_use <- ddply(pairs_use,.(pairID),seq.one.correct)
  #sort by most common to least common good allele
  pairs_use <- pairs_use[order(pairs_use$nreads1, decreasing = TRUE),]
  #keep only pairs where the first sequence is more common
  pairs_use <- pairs_use[(pairs_use$nreads1 > pairs_use$nreads2),]
  #keep type 3 and 4 pairs only if the first member has more than 3 reads
  pairs_use <- pairs_use[((pairs_use$type == 3 | pairs_use$type == 4) & pairs_use$nreads1 > 3) | ((pairs_use$type == 1) | (pairs_use$type == 2)),]
  return(pairs_use)
}
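## Illustrative sketch (assumed wiring, not necessarily how stc.R strings these
## together): classify every candidate pair, keep the combinable ones, then fold minor
## variants into their partner sequence with combine.pairs() defined below. Assumes the
## globals referenced above (all_pairs, aln_seqs, dist, alls, seq_table, amp_length,
## sample_reads) are already set for the current sample.
# all_pairs    <- make.pairs(seqs, seq_table)
# pairs_use    <- ldply(1:nrow(all_pairs), function(p) type.pair(pair.diffs(p)))
# pairs_use    <- adjust.pairs(seqs, dist, seqpath)
# sample_reads <- adply(sample_reads, 1, combine.pairs)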
###############################################
#combine pairs
###############################################
combine.pairs <- function(read){
  read_out <- read
  #check if the sequence is listed as a sequence to be combined
  if(read_out$seqID %in% pairs_use$seq2){
    #if so, combine it
    possible_pairs <- pairs_use[pairs_use$seq2 == read_out$seqID,]
    read_out$seqID <- possible_pairs$seq1[1]
  }
  return(read_out)
}

######################################################################
######################################################################
# Functions used during actual clustering
######################################################################
######################################################################

####################################
#rename clusters from generic numbers to most frequent sequence
####################################
name.clusters <- function(row_i){
  row_i$name <- row_i$sequence[1]
  return(row_i)
}

####################################
# assign sequences into clusters
####################################
cluster.seq <- function(s){
  seq_id <<- s$sequence
  #is the cluster table empty?
  if(nrow(temp_cluster_table) == 0){
    #start a new cluster with that sequence
    output <- data.frame(cluster = 1,
                         sequence = s$sequence,
                         nreads = s$nreads)
  #if the cluster table is not empty
  } else {
    #which sequences are currently the most frequent in each cluster?
    tops <- ddply(temp_cluster_table,.(cluster), identify.tops)
    #which cluster is the closest to the target sequence?
    tops <- tops[order(tops$sim,decreasing = TRUE),]
    best_match <- tops[1,]
    #if the best match is greater than or equal to the cutoff
    if (best_match$sim[1] >= cutoff){
      output <- data.frame(cluster = best_match$cluster[1],
                           sequence = seq_id,
                           nreads = s$nreads)
    } else {
      output <- data.frame(cluster = max(temp_cluster_table$cluster)+1,
                           sequence = seq_id,
                           nreads = s$nreads)
    }
  }
  #write sequence result to cluster table
  temp_cluster_table <<- rbind(temp_cluster_table,output)
  return(output)
}

####################################
# identify the top sequence in each cluster and how similar it is to a given sequence
####################################
identify.tops <- function(cl){
  #order by number of reads
  cl <- cl[order(cl$nreads,decreasing = TRUE),]
  #pull top sequence
  new_top <- data.frame(cluster = cl$cluster[1],
                        sim = dist[as.character(cl$sequence[1]),as.character(seq_id)])
  return(new_top)
}

####################################
#perform the replicates at a given cutoff
####################################
repeat.cluster <- function(cutoff, start_seqs, reps){
  #refactor start_seqs to remove cluster sequences from levels()
  start_seqs$sequence <- factor(start_seqs$sequence)
  #generate replicate table
  rep_table <- data.frame(reps = seq(1:reps))
  #run individual clustering for each replicate
  replicates <- ddply(rep_table,.(reps), cluster)
  #clean up table
  output <- t(replicates[,-1])
  #output results from all replicates
  return(output)
}

####################################
# perform a single clustering replicate
####################################
cluster <- function(rep) {
  #refactor start_seqs to remove cluster sequences from levels()
  start_seqs$sequence <- factor(start_seqs$sequence)
  #table of sequences to be clustered
  start_table <- start_seqs
  #remove any existing cluster assignments
  start_table$cluster <- NA
  #shuffle start_table
  start_table <- start_table[sample(nrow(start_table), replace = FALSE),]
  #generate empty table for clusters
  temp_cluster_table <<- data.frame()
  #assign successive sequences to clusters
  cluster_table <- ddply(start_table,.(sequence), cluster.seq)
  #sort the cluster table by cluster
  cluster_table <- cluster_table[order(cluster_table$cluster,cluster_table$nreads, decreasing = TRUE),]
  #assign clusters to the most frequent read in each cluster
  cluster_table <- ddply(cluster_table,.(cluster),name.clusters)
  #record cluster assignments for each sequence in the assignments table
  cluster_table <- cluster_table[order(as.character(cluster_table$sequence)),]
  output <- as.character(cluster_table$name)
  return(output)
}

####################################
#parse clustering replicates
####################################
parse.results <- function(start_seqs,cluster_replicates){
  #refactor start_seqs to remove cluster sequences from levels()
  start_seqs$sequence <- factor(start_seqs$sequence)
  #add rownames to replicates
  rownames(cluster_replicates) <- sort(as.character(start_seqs$sequence))
  #initiate new results table
  results <- start_seqs
  #assign each sequence its most commonly assigned cluster over the replicates
  results <- ddply(results,.(sequence),assign.mode)
  #sort results by cluster assignment
  results <- results[order(results$cluster,results$nreads,decreasing = TRUE),]
  return(results)
}

#assign clusters based on modal assignment over replicates
assign.mode <- function(row_i){
  r <- sort(table(cluster_replicates[row_i$sequence,]), decreasing = TRUE)
  row_i$cluster <- names(r)[1]
  return(row_i)
}

####################################
#functions to separate clusters into categories
####################################
pull.goods <- function(cl){
  #create output
  output <- c()
  #does the cluster pass the size criteria?
  if (sum(cl$nreads) >= size_thresh*nrow(sample_reads) ){
    #is it only one sequence?
    if(nrow(cl) == 1){
      output <- cl
    #is there more than one sequence?
    } else {
      if(cl$nreads[1]/cl$nreads[2] > dom_thresh){
        output <- cl
      }
    }
  }
  #output cluster if good
  return(output)
}

pull.smalls <- function(cl){
  #create output
  output <- c()
  #does the cluster pass the size criteria?
  if (sum(cl$nreads) < size_thresh*nrow(sample_reads) ){
    output <- cl
    if (sum(output$nreads == 1) == nrow(output)){
      output <- output[order(as.numeric(as.character(output$sequence)), decreasing = FALSE), ]
      output$cluster <- output$sequence[1]
    }
  }
  #output cluster if small
  return(output)
}

pull.ambs <- function(cl){
  #create output
  output <- c()
  #does the cluster pass the size criteria?
  if (sum(cl$nreads) >= size_thresh*nrow(sample_reads) ){
    #does it have more than one sequence?
    if(nrow(cl) > 1 & cl$nreads[1]/cl$nreads[2] <= dom_thresh){
      output <- cl
    }
  }
  #output cluster if ambiguous
  return(output)
}
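## Illustrative sketch of one clustering pass (assumed wiring; the real calls live in
## stc.R). Assumes `start_seqs` has `sequence` and `nreads` columns and that `dist`,
## `cutoff`, `size_thresh`, `dom_thresh`, and `sample_reads` are set for the current sample.
# cluster_replicates <- repeat.cluster(cutoff, start_seqs, reps = 100)   #reps value is arbitrary here
# results            <- parse.results(start_seqs, cluster_replicates)
# good_clusters      <- ddply(results, .(cluster), pull.goods)
# small_clusters     <- ddply(results, .(cluster), pull.smalls)
# amb_clusters       <- ddply(results, .(cluster), pull.ambs)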
####################################
#functions to deal with ambiguous alleles
####################################

#create summary data for the top two sequences in an ambiguous cluster
amb.summary <- function(cl){
  #what are the top two sequences?
  top1 <- as.character(cl$sequence[1])
  top2 <- as.character(cl$sequence[2])
  #how long are they?
  length1 <- summary(alls[top1])[1]
  length2 <- summary(alls[top2])[1]
  #how many reads in the top two?
  reads1 <- cl$nreads[1]
  reads2 <- cl$nreads[2]
  #what is the proportion of reads for the top two?
  prop1 <- reads1/(reads1 + reads2)
  prop2 <- reads2/(reads1 + reads2)
  #calculate potential nreads
  nreads1 <- round(prop1*sum(cl$nreads))
  nreads2 <- round(prop2*sum(cl$nreads))
  #potential nreads if the top two are combined
  nreads3 <- sum(cl$nreads)
  #output if the cluster is divided into two
  new1 <- data.frame(cluster = cl$cluster[1], sequence = as.numeric(top1), nreads = nreads1)
  new2 <- data.frame(cluster = top2, sequence = as.numeric(top2), nreads = nreads2)
  A <- cbind(top1,length1,reads1,prop1,nreads1)
  B <- cbind(top2,length2,reads2,prop2,nreads2)
  output <- list(top1 = top1, top2 = top2,
                 length1 = length1, length2 = length2,
                 reads1 = reads1, reads2 = reads2,
                 prop1 = prop1, prop2 = prop2,
                 nreads1 = nreads1, nreads2 = nreads2, nreads3 = nreads3,
                 new1 = new1, new2 = new2)
  return(output)
}

#create summary data for the top sequence and the third sequence in an ambiguous cluster
amb.summary.redo <- function(cl){
  #what are the two sequences being compared? (first and third)
  top1 <- as.character(cl$sequence[1])
  top2 <- as.character(cl$sequence[3])
  #how long are they?
  length1 <- summary(alls[top1])[1]
  length2 <- summary(alls[top2])[1]
  #how many reads in the two?
  reads1 <- cl$nreads[1]
  reads2 <- cl$nreads[3]
  #what is the proportion of reads for the two?
  prop1 <- reads1/(reads1 + reads2)
  prop2 <- reads2/(reads1 + reads2)
  #calculate potential nreads
  nreads1 <- round(prop1*sum(cl$nreads))
  nreads2 <- round(prop2*sum(cl$nreads))
  #potential nreads if the two are combined
  nreads3 <- sum(cl$nreads)
  #output if the cluster is divided into two
  new1 <- data.frame(cluster = cl$cluster[1], sequence = as.numeric(top1), nreads = nreads1)
  new2 <- data.frame(cluster = top2, sequence = as.numeric(top2), nreads = nreads2)
  A <- cbind(top1,length1,reads1,prop1,nreads1)
  B <- cbind(top2,length2,reads2,prop2,nreads2)
  output <- list(top1 = top1, top2 = top2,
                 length1 = length1, length2 = length2,
                 reads1 = reads1, reads2 = reads2,
                 prop1 = prop1, prop2 = prop2,
                 nreads1 = nreads1, nreads2 = nreads2, nreads3 = nreads3,
                 new1 = new1, new2 = new2)
  return(output)
}

#create summary data for the second and third sequences in an ambiguous cluster
amb.summary.redo2 <- function(cl){
  #what are the two sequences being compared? (second and third)
  top1 <- as.character(cl$sequence[2])
  top2 <- as.character(cl$sequence[3])
  #how long are they?
  length1 <- summary(alls[top1])[1]
  length2 <- summary(alls[top2])[1]
  #how many reads in the two?
  reads1 <- cl$nreads[2]
  reads2 <- cl$nreads[3]
  #what is the proportion of reads for the two?
  prop1 <- reads1/(reads1 + reads2)
  prop2 <- reads2/(reads1 + reads2)
  #calculate potential nreads
  nreads1 <- round(prop1*sum(cl$nreads))
  nreads2 <- round(prop2*sum(cl$nreads))
  #potential nreads if the two are combined
  nreads3 <- sum(cl$nreads)
  #output if the cluster is divided into two
  new1 <- data.frame(cluster = top1, sequence = as.numeric(top1), nreads = nreads1)
  new2 <- data.frame(cluster = top2, sequence = as.numeric(top2), nreads = nreads2)
  A <- cbind(top1,length1,reads1,prop1,nreads1)
  B <- cbind(top2,length2,reads2,prop2,nreads2)
  output <- list(top1 = top1, top2 = top2,
                 length1 = length1, length2 = length2,
                 reads1 = reads1, reads2 = reads2,
                 prop1 = prop1, prop2 = prop2,
                 nreads1 = nreads1, nreads2 = nreads2, nreads3 = nreads3,
                 new1 = new1, new2 = new2)
  return(output)
}
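## Illustrative sketch (assumed wiring): each ambiguous cluster is summarised and then
## passed to check.smalls()/check.good() defined below, which decide whether it should be
## split, kept whole, or demoted to a small cluster. Exact calls may differ in stc.R.
# new_smalls <- ddply(amb_clusters, .(cluster), function(cl) check.smalls(amb.summary(cl), cl))
# new_goods  <- ddply(amb_clusters, .(cluster), function(cl) check.good(amb.summary(cl), cl))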
#should one or both clusters be classified as small?
check.smalls <- function(summary,cl){
  #if both top alleles are not 213 bp, then classify the cluster as small
  if(summary$length1 != 213 & summary$length2 != 213){
    output <- cl
  } else {
    output <- c()
  }
  #if just the second top allele is not 213 bp
  if (summary$length2 != 213){
    #check the third sequence length
    if(summary(alls[as.character(cl$sequence[3])])[1] == 213) {
      #re-summarize using the first and third sequences
      summary <- amb.summary.redo(cl)
      #if only the top allele meets the size threshold, classify the second as a small allele
      if(summary$nreads1 >= size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
        output <- summary$new2
      }
      #if neither meets the size threshold, count both as small alleles
      if(summary$nreads1 < size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
        output <- rbind(summary$new1,summary$new2)
      }
    } else {
      if(summary$nreads3 < size_thresh*nrow(sample_reads)){
        output <- cl
      }
    }
  }
  #if both are the correct length
  if(summary$length1 == 213 & summary$length2 == 213) {
    #if only the top allele meets the size threshold, classify the second as a small allele
    if(summary$nreads1 > size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
      output <- summary$new2
    }
    #if neither meets the size threshold, count both as small alleles
    if(summary$nreads1 < size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
      output <- rbind(summary$new1,summary$new2)
    }
  }
  return(output)
}

#should one or both clusters be classified as good?
check.good <- function(summary,cl){
  #if both top alleles are not 213 bp, then do nothing
  if(summary$length1 != 213 & summary$length2 != 213){
    output <- c()
  }
  #if just the second top allele is not 213 bp
  if (summary$length2 != 213){
    #check the third sequence length
    if(summary(alls[as.character(cl$sequence[3])])[1] == 213) {
      #re-summarize using the first and third sequences
      summary <- amb.summary.redo(cl)
      #if both meet the size threshold
      if(summary$nreads1 >= size_thresh*nrow(sample_reads) & summary$nreads2 >= size_thresh*nrow(sample_reads)){
        output <- rbind(summary$new1,summary$new2)
      }
      #if only the top allele meets the size threshold, classify the whole cluster as a good allele
      if(summary$nreads1 >= size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
        output <- cl
      }
      #if neither meets the size threshold
      if(summary$nreads1 < size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
        output <- c()
      }
    } else {
      if(summary$nreads3 >= size_thresh*nrow(sample_reads)){
        output <- cl
      }
    }
  }
  #if just the first top allele is not 213 bp
  if (summary$length1 != 213 & summary$length2 == 213){
    #check the third sequence length
    if(summary(alls[as.character(cl$sequence[3])])[1] == 213) {
      #re-summarize using the second and third sequences
      summary <- amb.summary.redo2(cl)
      #if both meet the size threshold
      if(summary$nreads1 > size_thresh*nrow(sample_reads) & summary$nreads2 > size_thresh*nrow(sample_reads)){
        output <- rbind(summary$new1,summary$new2)
      }
      #if only the top allele meets the size threshold, classify the whole cluster as a good allele
      if(summary$nreads1 > size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
        cl$cluster <- cl$sequence[2]
        output <- cl
      }
      #if neither meets the size threshold
      if(summary$nreads1 < size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
        output <- c()
      }
    } else {
      if(summary$nreads3 > size_thresh*nrow(sample_reads)){
        cl$cluster <- cl$sequence[2]
        output <- cl
      }
    }
  }
  #if both are the correct length
  if(summary$length1 == 213 & summary$length2 == 213) {
    #if both meet the size threshold
    if(summary$nreads1 >= size_thresh*nrow(sample_reads) & summary$nreads2 >= size_thresh*nrow(sample_reads)){
      output <- rbind(summary$new1,summary$new2)
    }
    #if only the top allele meets the size threshold, classify the whole cluster as a good allele
    if(summary$nreads1 >= size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
      output <- cl
    }
    #if neither meets the size threshold
    if(summary$nreads1 < size_thresh*nrow(sample_reads) & summary$nreads2 < size_thresh*nrow(sample_reads)){
      output <- c()
    }
  }
  return(output)
}

######################################################################
######################################################################
# Clean up and phase 4 functions
######################################################################
######################################################################

#collapse the table of good clusters into a table of true alleles
collapse.goods <- function(cl){
  #create new data for allele
  output <- data.frame(allele = cl$sequence[1],
                       nreads = sum(cl$nreads),
                       sampleID = sample,
                       proportion = sum(cl$nreads)/(nrow(sample_reads)))
  return(output)
}

#create list of common alleles from each population
common.alleles <- function(input){
  #subset the data
  robL <- input[input$pair == "Roberts" & input$habitat == "Lake" & is.na(input$pair) == FALSE,]
  robS <- input[input$pair == "Roberts" & input$habitat == "Stream" & is.na(input$pair) == FALSE,]
  farL <- input[input$pair == "Farewell" & input$habitat == "Lake" & is.na(input$pair) == FALSE,]
  farS <- input[input$pair == "Farewell" & input$habitat == "Stream" & is.na(input$pair) == FALSE,]
  comL <- input[input$pair == "Comida" & input$habitat == "Lake" & is.na(input$pair) == FALSE,]
  comS <- input[input$pair == "Comida" & input$habitat == "Stream" & is.na(input$pair) == FALSE,]
  rob  <- input[input$pair == "Roberts" & is.na(input$pair) == FALSE,]
  #alleles that appear at least small_cutoff times in each subset
  robL.alleles <<- names(table(robL$allele))[table(robL$allele) >= small_cutoff]
  robS.alleles <<- names(table(robS$allele))[table(robS$allele) >= small_cutoff]
  farL.alleles <<- names(table(farL$allele))[table(farL$allele) >= small_cutoff]
  farS.alleles <<- names(table(farS$allele))[table(farS$allele) >= small_cutoff]
  comL.alleles <<- names(table(comL$allele))[table(comL$allele) >= small_cutoff]
  comS.alleles <<- names(table(comS$allele))[table(comS$allele) >= small_cutoff]
  rob.alleles  <<- names(table(rob$allele))[table(rob$allele) >= small_cutoff]
}
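## Illustrative sketch (assumed wiring): collapse each good cluster into a per-sample
## true-allele row, then build the per-population common-allele lists that cross.check()
## below compares small clusters against. Object names are placeholders.
# true_alleles <- ddply(good_clusters, .(cluster), collapse.goods)
# common.alleles(all_TAs_parapatry)
# d_ply(small_clusters, .(cluster), cross.check)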
# cross check small cluster against common true alleles
cross.check <- function(cl){
  #which pair and habitat is the sample in?
  sub <- all_TAs_parapatry[all_TAs_parapatry$sampleID == as.character(cl$sampleID[1]),c("pair","habitat")]
  if(sub$pair[1] == "Roberts" & is.na(sub$habitat[1]) == TRUE){
    alleles.cc <- rob.alleles
  }
  if(sub$pair[1] == "Roberts" & is.na(sub$habitat[1]) == FALSE){
    if(sub$pair[1] == "Roberts" & sub$habitat[1] == "Lake"){
      alleles.cc <- robL.alleles
    }
    if(sub$pair[1] == "Roberts" & sub$habitat[1] == "Stream"){
      alleles.cc <- robS.alleles
    }
  }
  if(sub$pair[1] == "Farewell" & sub$habitat[1] == "Lake"){
    alleles.cc <- farL.alleles
  }
  if(sub$pair[1] == "Farewell" & sub$habitat[1] == "Stream"){
    alleles.cc <- farS.alleles
  }
  if(sub$pair[1] == "Comida" & sub$habitat[1] == "Lake"){
    alleles.cc <- comL.alleles
  }
  if(sub$pair[1] == "Comida" & sub$habitat[1] == "Stream"){
    alleles.cc <- comS.alleles
  }
  #if the cluster matches a common true allele and the cluster has at least 3 reads in total
  if(cl$allele[1] %in% alleles.cc & sum(cl$nreads) > 2){
    #how many reads in the sample
    sample_reads <- samples_summary[samples_summary$sampleID == as.character(cl$sampleID[1]),"read_number_used"]
    #add the cluster to true alleles
    output <- data.frame(allele = cl$allele[1],
                         nreads = sum(cl$nreads),
                         sampleID = cl$sampleID[1],
                         proportion = sum(cl$nreads)/sample_reads,
                         status = "dropped")
    all_TAs <<- rbind(all_TAs,output)
  }
}

#indicate chimeras in TA list
indicate.chimeras <- function(al){
  if(al$allele[1] %in% chimeras$seqID){
    output <- data.frame(al, chimera = "yes")
  } else {
    output <- data.frame(al, chimera = "no")
  }
  return(output)
}

#indicate incorrect-length TAs in TA list
indicate.lengths <- function(al){
  if(alleles_summary[alleles_summary$allele == al$allele[1], "length"] == 213){
    output <- data.frame(al, correct_length = "yes")
  } else {
    output <- data.frame(al, correct_length = "no")
  }
  return(output)
}

#update samples summary to account for dropped alleles
new.TAs <- function(samp){
  output <- data.frame(sampleID = samp$sampleID[1],
                       dropped_alleles = nrow(samp[samp$status == "dropped",]),
                       all_alleles = nrow(samp),
                       chimeras = nrow(samp[samp$chimera == "yes",]),
                       incorrect_length = nrow(samp[samp$correct_length == "no",]),
                       true_alleles = nrow(samp[samp$chimera == "no" & samp$correct_length == "yes",]))
  return(output)
}

#create list of final true alleles along with a fasta file and alignment of just true alleles
final.list <- function(all_TAs,alls){
  #write names of haplotypes to file
  TAs <- names(table(all_TAs$allele))
  write(TAs,"output/TA_list", ncolumns = 1)
  #extract just the true allele sequences
  TA_seqs <- alls[TAs]
  write.dna(TA_seqs, file = "output/TAs.fasta", format = "fasta")
  #align sequences using clustal
  aln_alleles <- clustal(TA_seqs, exec = "clustalw2", quiet = TRUE, pw.gapopen = 20)
  #write alignment to file
  write.dna(aln_alleles,file = "output/TAs.aln", format = "interleaved")
  #create distance matrix
  TA_dist <- dist.dna(aln_alleles, model = "JC69", as.matrix = TRUE)
  #write distance matrix to file
  write.csv(TA_dist,"output/TA_dist.csv")
}

#calculate summary statistics for each true allele
create.allele.summary <- function(al){
  #length of allele
  length <- summary(alls[as.character(al$allele)])[1]
  #number of samples where it was found
  samples_found <- nrow(al)
  #percentage of all samples in the library where found
  perc_samples <- samples_found/nrow(samples)
  #number of times it appeared as a good cluster
  goods <- nrow(al[al$status == "good",])
  #number of times the sample was dropped
  dropped <- nrow(al[al$status == "dropped",])
  #% of samples where dropped
  perc_dropped <- dropped/(dropped + goods)
  #average cluster size (mean proportion of total reads)
  mean_cluster_size <- mean(al$proportion)
  #average cluster size of good clusters
  g <- al[al$status == "good",]
  mean_good_cluster_size <- mean(g$proportion)
  #is it a chimera?
  chimera <- al$chimera[1]
  #does it only appear once in the data set?
  if(samples_found == 1){
    singleton <- "yes"
  } else {
    singleton <- "no"
  }
  output <- data.frame(length,
                       samples_found,
                       perc_samples,
                       goods,
                       dropped,
                       perc_dropped,
                       mean_cluster_size,
                       mean_good_cluster_size,
                       chimera,
                       singleton)
  return(output)
}

#create summary info for each sequence in the sample
create.sequence.summary <- function(old_seq_table){
  temp_seq_table <- data.frame(old_seq_table)
  names(temp_seq_table) <- c("sequence","nreads")
  sequence_summary <- ddply(temp_seq_table,.(sequence), seq.tables)
  return(sequence_summary)
}

#create sequence tables for each fish
seq.tables <- function(s){
  #calculate length of sequence
  s$length <- summary(alls[as.character(s$sequence)])[1]
  #if it was in a good cluster
  if(s$sequence %in% good_clusters$sequence){
    s$cluster <- good_clusters[good_clusters$sequence == as.character(s$sequence),"cluster"]
    s$class <- "good"
  } else {
    s$cluster <- NA
    s$class <- NA
  }
  #if it was in an ambiguous cluster
  if(s$sequence %in% amb_clusters$sequence){
    #what were the top two sequences in the cluster?
    clus <- amb_clusters[amb_clusters$sequence == as.character(s$sequence),]
    clus <- amb_clusters[amb_clusters$cluster == clus$cluster,]
    s$cluster <- paste(clus$sequence[1],clus$sequence[2], sep = "/")
    s$class <- "ambiguous"
  }
  #was it in a small cluster?
  if(s$sequence %in% small_clusters$sequence){
    s$cluster <- small_clusters[small_clusters$sequence == as.character(s$sequence),"cluster"]
    s$class <- "small"
  }
  #was it a true allele?
  if(s$sequence %in% true_alleles$allele){
    s$true <- "yes"
  } else {
    s$true <- "no"
  }
  #indicate not combined for now
  s$combined <- "NA"
  return(s)
}

#update sequence summary table to account for combined sequences
combined.seqs <- function(s){
  #was it combined during phase 2?
  if(s$sequence %in% pairs_use$seq2){
    combined <- pairs_use[pairs_use$seq2 == s$sequence, "seq1"]
    type <- pairs_use[pairs_use$seq2 == s$sequence, "type"]
    classification <- sequence_summary[sequence_summary$sequence == combined[1],"class"]
    cluster <- sequence_summary[sequence_summary$sequence == combined[1],"cluster"]
    s$combined <- as.character(combined[1])
    s$type <- type[1]
    s$class <- classification[1]
    s$cluster <- cluster[1]
  }
  return(s)
}

update.sequence.summary <- function(id){
  samp <<- as.character(id$sampleID[1])
  #check if skipped
  if(id$sampleID[1] %in% skip$sampleID){
    #skipped samples have no sequence summary to update
  } else {
    #create a unique filename path for this sample
    seqpath <- paste("output",id$sampleID[1],sep = "/")
    #upload sequence summary
    summ <- read.csv(paste(seqpath,"_sequence_summary.csv",sep = ""), row.names = 1)
    #add chimera indicator
    summ <- ddply(summ,.(sequence), add.chimera.indicator)
    #update true allele status after the incorporation of dropped alleles
    summ <- ddply(summ,.(sequence), add.dropped.indicator)
    #write to file
    write.csv(summ,paste(seqpath,"_sequence_summary.csv",sep = ""), row.names = FALSE)
  }
}

#update sequence summary table after accounting for chimeras
add.chimera.indicator <- function(s){
  if(s$cluster %in% chimeras$allele){
    s$cluster.chimera <- "yes"
    s$true <- "no"
  } else {
    s$cluster.chimera <- "no"
  }
  if(s$sequence %in% chimeras$allele){
    s$chimera <- "yes"
  } else {
    s$chimera <- "no"
  }
  return(s)
}

#update sequence summary table after accounting for dropped alleles
add.dropped.indicator <- function(s){
  #what sample?
  samp_TAs <- all_TAs[all_TAs$sampleID == samp,]
  #what cluster(s)?
  clus <- as.vector(strsplit(as.character(s$cluster),"/")[[1]])
  #pull TAs
  clus <- samp_TAs[samp_TAs$allele == clus[1],]
  if(nrow(clus) > 0 & clus$status[1] == "dropped"){
    s$dropped <- "yes"
    s$true <- factor("yes")
  } else {
    s$dropped <- "no"
  }
  return(s)
}
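## Illustrative end-of-pipeline sketch (assumed wiring; the real driver code is in stc.R).
## Assumes `all_TAs`, `alls`, and `alleles_summary` (with a "length" column) already exist.
# all_TAs                 <- ddply(all_TAs, .(allele), indicate.chimeras)
# all_TAs                 <- ddply(all_TAs, .(allele), indicate.lengths)
# samples_summary_updated <- ddply(all_TAs, .(sampleID), new.TAs)
# final.list(all_TAs, alls)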