### Lessons learned from microsatellite development for non-model organisms
### using 454 pyrosequencing
### Schoebel et al.
### microsatellite script

##################################################
# This script has to be run for dinucleotides, trinucleotides and
# tetranucleotides separately.
# Here we show an example script for dinucleotides.
#
# Save the msatcommander output files (di_microsatellites.csv, di_primers.csv)
# as text files.
# Delete all rows under potentially duplicated primers in the primer file
# (di_primers.txt).
#
# Pipeline overview:
#   1. read the microsatellite and primer tables and merge them by read name;
#   2. drop every read whose name occurs more than once;
#   3. drop every row whose left primer sequence occurs more than once;
#   4. drop every row whose right primer sequence occurs more than once.
# In each step ALL rows sharing a repeated value are removed (not just the
# second and later occurrences). Each intermediate table is written to disk as
# a comma-separated file (note: the output files keep a ".txt" extension).

# Set the working directory to the directory where you saved the text files:
setwd("C:\\microsatellites\\Dinucleotides")

##################################################
# Open the text file containing the microsatellites (di_microsatellites.txt):
microsats <- read.table("di_microsatellites.txt", header = TRUE)

# Open the text file containing the primers (di_primers.txt):
primers <- read.table("di_primers.txt", header = TRUE)

##################################################
# Merge "primers" with "microsats" into a common data frame named
# "merged.data".
# The column with the read name in both files is called "name".
merged.data <- merge(x = microsats, y = primers, by = "name")

# Save the merged table:
write.table(merged.data, "merged.data.txt", sep = ",")

##################################################
# Remove duplicated reads from the merged table.

# Identify duplicated reads (column header = "name") in the merged table.
# duplicated() flags only the second and later occurrences of a value:
dupl_reads <- merged.data[duplicated(merged.data$name), ]

# Transform names of duplicated reads to a vector:
dupl_names <- as.vector(dupl_reads$name)

# Remove all rows from the merged table where the read name matches one of the
# duplicates (this also drops the first occurrence of each duplicated name):
unique_reads <- merged.data[!(merged.data[, "name"] %in% dupl_names), ]

# Print number of unique reads:
print(nrow(unique_reads))

# Save the table with unique reads:
write.table(unique_reads, "unique_reads.txt", sep = ",")

##################################################
# Remove duplicated left primers from the table containing unique reads
# (unique_reads).

# Identify duplicated left sequences (column header = "left_sequence") in the
# "unique_reads" table:
dupl_left <- unique_reads[duplicated(unique_reads$left_sequence), ]

# Transform duplicated left sequences to a vector:
dupl_left_names <- as.vector(dupl_left$left_sequence)

# Remove all rows from the "unique_reads" table where the left sequence matches
# one of the duplicated left sequences:
unique_left <- unique_reads[
  !(unique_reads[, "left_sequence"] %in% dupl_left_names),
]

# Print number of unique reads with unique left sequences:
print(nrow(unique_left))

# Save the table with unique reads with unique left sequences:
write.table(unique_left, "unique_left.txt", sep = ",")

##################################################
# Remove duplicated right primers from the table containing unique reads with
# unique left sequences (unique_left).

# Identify duplicated right sequences (column header = "right_sequence") in the
# "unique_left" table:
dupl_all <- unique_left[duplicated(unique_left$right_sequence), ]

# Transform duplicated right sequences to a vector:
dupl_all_names <- as.vector(dupl_all$right_sequence)

# Remove all rows from the "unique_left" table where the right sequence matches
# one of the duplicated right sequences:
unique_all <- unique_left[
  !(unique_left[, "right_sequence"] %in% dupl_all_names),
]

# Print number of unique reads with unique left and right sequences:
print(nrow(unique_all))

# Save the table with unique reads with unique left and right sequences:
write.table(unique_all, "unique_all.txt", sep = ",")