### Script to perform the ABBA BABA tests with a block-jacknife procedure to estimate variance ### used for MARTIN ET AL. 2013 GENOME WIDE EVIDENCE FOR SPECIATION WITH GENE FLOW IN HELICONIUS BUTTERFLIES # Input is a csv of allele frequencies. See the file set31.Zupdated.autoANDchrZ.ALLSHARED.SNP.derFreqs.csv submitted herewith # we also load a table of tests to perform, so that that we can perform multiple tests automatically See ABBA_BABA_tests.csv for example. #function to rearrange scaffolds into chromosomes based on the linkage map agp file (Hmel1-1_hox_RAD_matepair_chromosomes_Zupdated.agp) as.chromosomes <- function(table,agp) { chromosome <- vector(length = length(table[,1])) chrom_pos <- vector(length = length(table[,1])) for (x in 1:length(table[,1])){ scaf <- table$scaffold[x] if (table$scaffold[x] %in% agp$scaffold){ y <- which(agp$scaffold == scaf) chromosome[x] <- agp$chromosome[y] if (agp$ori[y] == "+"){ chrom_pos[x] <- table$position[x] + agp$start[y] } else { chrom_pos[x] <- agp$end[y] - table$position[x] } } else{ chromosome[x] <- NA chrom_pos[x] <- NA } if (x %% 100000 == 0){ print(x) } } WG_table <- cbind(chromosome,chrom_pos,table) } # read agp for chromosomal info agp <- read.delim("/home/shm45/references/Hmel1-1_hox_RAD_matepair_chromosomes_Zupdated.agp", as.is = T, sep = "\t", header = F) names(agp) = c("chromosome", "start", "end", "order", "DN", "scaffold", "one", "length", "ori") # frequencies freq_table <- read.csv("set31.Zupdated.autoANDchrZ.ALLSHARED.SNP.derFreqs.csv") #rearrange lines into chromosomes, discarding unmapped scaffolds WG_table <- as.chromosomes(freq_table,agp) # chromosome names - can be all, or only autosomes or only Z chromNames <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chrZ") #chromNames <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20") #chromNames <- c("chrZ") # filter based on chromosomes of interest WG_table <- subset(WG_table, chromosme %in% chromNames) attach(WG_table) #read table of tests to perform (exampe is submitted herewith) tests <- read.csv("ABBA_BABA_tests.csv") #create output variables D <- numeric(length = length(tests[,1])) # D-statistic D_err <- numeric(length = length(tests[,1])) # D std error D_Z <- numeric(length = length(tests[,1])) # D Z-score D_p <- numeric(length = length(tests[,1])) # D p-value f <- numeric(length = length(tests[,1])) # f-statistic f_err <- numeric(length = length(tests[,1])) # D std error ### normal test #create a list of positions that will be considered during each jackknife replicate indices <- list() n <- 1 for (chrom in chromNames) { chrom_end <- max(WG_table[which(WG_table$chromosome == chrom),"chrom_pos"]) starts <- seq(1,chrom_end,1000000) ends <- starts + 999999 for (x in 1:length(starts)) { indices[[n]] <- which(WG_table$chromosome %in% chromNames & !(WG_table$chromosome == chrom & WG_table$chrom_pos >= starts[x] & WG_table$chrom_pos <= ends[x])) print(n) n <- n + 1 } } #calculate D and f statistics for (z in 1:length(tests[,1])) { donor_name <- as.character(tests$donor[z]) recipient_name <- as.character(tests$recipient[z]) non_recipient_name <- as.character(tests$non_recipient[z]) ABBA <- ((1 - get(non_recipient_name)) * get(recipient_name) * get(donor_name)) BABA <- (get(non_recipient_name) * (1 - get(recipient_name)) * get(donor_name)) maxABBA <- ((1 - get(non_recipient_name)) * get(paste(donor_name, "_1", sep = "")) * get(paste(donor_name, "_2", sep = ""))) maxBABA <- (get(non_recipient_name) * (1 - get(paste(donor_name, "_1", sep = ""))) * get(paste(donor_name, "_2", sep = ""))) D[z] <- (sum(ABBA, na.rm = T) - sum(BABA, na.rm = T))/(sum(ABBA, na.rm = T) + sum(BABA, na.rm = T)) f[z] <- (sum(ABBA, na.rm = T) - sum(BABA, na.rm = T))/(sum(maxABBA, na.rm = T) - sum(maxBABA, na.rm = T)) # jacknife to get errors for D and f D_pseudo <- vector(length = length(indices)) f_pseudo <- vector(length = length(indices)) n <- 1 for (x in 1:length(indices)) { jacked_positions <- indices[[x]] jacked_D <- (sum(ABBA[jacked_positions], na.rm = T) - sum(BABA[jacked_positions], na.rm = T))/(sum(ABBA[jacked_positions], na.rm = T) + sum(BABA[jacked_positions], na.rm = T)) jacked_f <- (sum(ABBA[jacked_positions], na.rm = T) - sum(BABA[jacked_positions], na.rm = T))/(sum(maxABBA[jacked_positions], na.rm = T) - sum(maxBABA[jacked_positions], na.rm = T)) D_pseudo[n] <- D[z]*length(indices) - jacked_D*(length(indices)-1) f_pseudo[n] <- f[z]*length(indices) - jacked_f*(length(indices)-1) print(n) n <- n + 1 } D_err[z] <- sqrt(var(D_pseudo)/length(D_pseudo)) D_Z[z] <- D[z] / D_err[z] D_p[z] <- 2*pnorm(-abs(D_Z[z])) f_err[z] <- sqrt(var(f_pseudo)/length(f_pseudo)) #then put together the results and write output <- cbind(tests,D,D_err,D_Z,D_p,f,f_err) write.csv(output, file = "ABBA_BABA_results.csv", row.names = F, quote = F) }