## ***** Hybridization Rate Estimation: Determining Mobility of Crosses and Quantifying Hybridization with mixtools *****
## This code requires ggplot2 version 0.9.1
## This code requires mixtools version 0.4.5
library(ggplot2)
library(mixtools)
# Data file: mobility_of_crosses.csv
# The "Rf" column contains the position sampled on the gel (relative to dye migration)
#
# The Count column: As described in the paper, tips on an 8-channel pipette were used to sample the gel after the run was finished.
# Instead of going into liquid media to do serial dilutions, the tips were poked serially 12 times on an agar plate containing a bacterial (Pp) lawn.
# The locations of the serial pokes would exhibit a plaque until the viruses were diluted in successive pokes. Thus, a count of 1 represents the most dilute
# detectable concentration and 12 most concentrated. The dilution factor of this method is approximately 2.5, meaning that a count of 1 represents a 2.5-fold
# dilution and 12 represents a dilution of 2.5^-12. **The function d_log(), see below, is used to transform these counts to reflect their logarithmic scale.
#
# The Replicate column indicates
# The remaining columns are self explanatory
# Import Data from .csv file
data <- read.csv("mobility_of_crosses.csv")
# Make data columns available to R
# Creates a data frame for each cross with the same structure as the .csv file. They are named SSxFF, PT309xFF, SSxSS, FFxFF, and so on, for each control and experimental cross
all_splits <- split(data, data$Name)
attach(all_splits)
# d_log_adj: Function adjusts counts to reflect their logarithmic scale and combines three separate runs by making the counts proportional to the total count # in each run.
# The input is a data frame generated above. Example: for FFxFF is run as d_log_adj(FFxFF) and outputs a vector of the distribution of Rf values.
d_log_adj <- function (x) {
# Selecting only the first replicate run.
s1 <- as.numeric(unlist(subset(x, Replicate == 1, Count)))
Rf1 <- as.numeric(unlist(subset(x, Replicate == 1, Rf)))
# For the first run the count numbers are made proportional to the total and multiplied to account for the 2.5 fold dilution factor to recover
# an integer > 0 for the proportional count value
adj1 <- round(s1/sum(s1)*25)
# The Rf value is repeated 2^"Count" times for each strain to create a univariate distribution of the mobility of each strain along the gel. The adjusted count serves as the exponent
# for base 2 to recover the logarithmic distribution created by the ~2.5 dilution factor.
Rfadj1 <- rep(Rf1, 2^adj1)
# Same as above for Replicate 2
s2 <- as.numeric(unlist(subset(x, Replicate == 2, Count)))
Rf2 <- as.numeric(unlist(subset(x, Replicate == 2, Rf)))
adj2 <- round(s2/sum(s2)*25)
Rfadj2 <- rep(Rf2, 2^adj2)
# Same as above for Replicate 3
s3 <- as.numeric(unlist(subset(x, Replicate == 3, Count)))
Rf3 <- as.numeric(unlist(subset(x, Replicate == 3, Rf)))
adj3 <- round(s3/sum(s3)*25)
Rfadj3 <- rep(Rf3, 2^adj3)
# Now combine all three distributions into 1.
d_x <- c(Rfadj1, Rfadj2, Rfadj3)
return(d_x)
}
## ***** Figure 1C - Selection for fast and slow electrophoretic mobility - Figure 1C *****
# For the phi6-Fast1 (FF) control cross:
d_FFxFF <- d_log_adj(FFxFF)
# generate a histogram of Rf values
hist(d_FFxFF)
# calculate mean and standard deviation.
mean(d_FFxFF) #This is equal to 1.463648, see Table 1 in paper
sd(d_FFxFF) #This is equal to 0.4014751, see Table 1 in paper
# All other means and sd's in Table 1 are calculated this way
# Now create Rf vectors for phi6-Slow1 (SS) control cross and the cross of phi6-Slow1 x phi6-Fast1
d_SSxSS <- d_log_adj(SSxSS)
d_SSxFF <- d_log_adj(SSxFF)
# Recreate Figure 1C
#Simplify names and create a data frame for easy graphing in ggpolt2
FF <- d_FFxFF
SS <- d_SSxSS
SSFF <- d_SSxFF
SSFFdf <- rbind(data.frame(Name="FF",Count=FF),
data.frame(Name="SS",Count=SS),
data.frame(Name="SSFF",Count=SSFF))
# Generate Figure 1C
ggplot(SSFFdf, aes(x=Count, fill=Name)) + geom_density(alpha=0.5) +
xlab("Rf: Relative Migration") +
ylab("Density") +
opts(legend.title=theme_blank())
# Test for significant differences between the parental and hybrid strains of SS and FF using an ANOVA and a Tukey test
model <- aov(Count ~ Name, data = SSFFdf)
summary(model)
TukeyHSD(model)
## ***** Estimating Hybridization Rate using Mixtools *****
# This function determines the proportion of each component (assuming a normal distribution) in a distribution. We constrained means for parental strains and made the
# algorithm estimate a third component. The status as an intermediate hybrid can be verified by the mean (mu) assigned by the algorithm.
results <- normalmixEM(d_SSxFF, k=3, mean.constr = c(0.8068881, NA, 1.463648), epsilon=1e-2, maxit=400) # The hardcoded numbers are the means for parental strains
summary(results)
# The output used for the published paper is replicated here because the results are simulation based and may differ slightly
# summary of normalmixEM object:
# comp 1 comp 2 comp 3
# lambda 0.0667079 0.821598 0.111694 # lambda is the proportion of each component in the mixture. In this case the hybrid component (comp 2) accounts for 0.82
# mu 0.8068881 0.987183 1.463648 # of the total mixture at a mean Rf of 0.98 in between the parental strains (comp1 and comp3).
# sigma 0.3671616 0.371713 0.605058
# loglik at estimate: -129.3074
## ***** Figure 5 - Illustration of steps taken to estimate hybridization rate, using finite mixed models - Figure 5 *****
# Create a vector for all strain crosses to be used in this section
d_CA65dxFF <- d_log_adj(CA65dxFF)
d_CA65dxCA65d <- d_log_adj(CA65dxCA65d)
d_KRI289xFF <- d_log_adj(KRI289xFF)
d_KRI289xKRI289 <- d_log_adj(KRI289xKRI289)
# Generate individual histograms for each strain, with a log scale
# Histogram for CA65d control cross
x <- sort(d_CA65dxCA65d)
a <- hist(d_CA65dxCA65d, plot = FALSE)
a$counts = log10(a$counts)
plot(a, ylab='Density', xlab='Rf: Relative Mobility', main=NULL, xlim=c(0, 2.0))
# Histogram for KRI289 control cross
x <- sort(d_KRI289xKRI289)
a <- hist(d_KRI289xKRI289, plot = FALSE)
a$counts = log10(a$counts)
plot(a, ylab='Density', xlab='Rf: Relative Mobility', main=NULL, xlim=c(0, 2.0))
# Histogram for FF (phi6-Fast1) control cross
x <- sort(d_FFxFF)
a <- hist(d_FFxFF, plot = FALSE)
a$counts = log10(a$counts)
plot(a, ylab='Density', xlab='Rf: Relative Mobility', main=NULL, xlim=c(0, 2.0))
# Run mixture analysis on CA65dxFF
results <- normalmixEM(d_CA65dxFF, k=3, mean.constr = c(0.6782956, NA, 1.463648), epsilon=1e-2, maxit=400)
summary(results)
# summary of normalmixEM object:
# comp 1 comp 2 comp 3
# lambda 0.3329145 0.390613 0.2764729
# mu 0.6782956 1.111459 1.4636480
# sigma 0.0699866 0.449645 0.0610677
# loglik at estimate: -55.28077
plot(results, which=2) # this gives a plot with the mixture components similar to Fig 5D, but the histogram scale is not log adjusted
# Plot histogram of mixture components with a log scale
x <- sort(results$x)
a <- hist(results$x, plot = FALSE)
a$counts = log10(a$counts)
maxy <- max(max(a$density), 0.3989 * results$lambda/results$sigma)
plot(a, ylab='Density', xlab='Rf: Relative Mobility', ylim = c(0, maxy), xlim=c(0,2.0), main=NULL)
lines(x, results$lambda[1] * dnorm(x, mean = results$mu[1], sd = results$sigma[1]), col = "red", lwd = 2)
lines(x, results$lambda[2] * dnorm(x, mean = results$mu[2], sd = results$sigma[2]), col = "green", lwd = 2)
lines(x, results$lambda[3] * dnorm(x, mean = results$mu[3], sd = results$sigma[3]), col = "blue", lwd = 2)
# Below is the same procedure as above for KRI289xFF
results <- normalmixEM(d_KRI289xFF, k=3, mean.constr = c(0.3551109, NA, 1.463648), epsilon=1e-2, maxit=400)
summary(results)
# summary of normalmixEM object:
# comp 1 comp 2 comp 3
# lambda 0.8657570 0.0462086 0.0880345
# mu 0.3551109 0.7079029 1.4636480
# sigma 0.0938401 0.2205595 0.2075946
plot(results, which=2) # this gives a plot with the mixture components similar to Fig 5D, but the histogram scale is not log adjusted
# Plot histogram of mixture components with a log scale
x <- sort(results$x)
a <- hist(results$x, plot = FALSE)
a$counts = log10(a$counts)
maxy <- max(max(a$density), 0.3989 * results$lambda/results$sigma)
plot(a, ylab='Density', xlab='Rf: Relative Mobility', ylim = c(0, maxy), xlim=c(0,2.0), main=NULL)
lines(x, results$lambda[1] * dnorm(x, mean = results$mu[1], sd = results$sigma[1]), col = "red", lwd = 2)
lines(x, results$lambda[2] * dnorm(x, mean = results$mu[2], sd = results$sigma[2]), col = "green", lwd = 2)
lines(x, results$lambda[3] * dnorm(x, mean = results$mu[3], sd = results$sigma[3]), col = "blue", lwd = 2)
#If you made it this far, let me know how this data package was useful for you. Very interested in how people will use open data and open code. Please contact me at the following email address:
me ((at)) samdiazmunoz.org
-Samuel Díaz-Muñoz