Simulations to estimate the variance associated with the estimator of p (frequency of the reference allele within the pool)
setwd("/Users/nicolasrode/Dropbox/GBS/Article/Dryad")
list.files()
## [1] "DataHoechst.csv"
## [2] "DataPicogreen.csv"
## [3] "DNAconcentration_estimation_script_Fig.2-3.Rmd"
## [4] "Effect of the number of sample, technical and reading replicate on CVbefore and CVafter (Hoechst).pdf"
## [5] "Fig. 3 Effect of the number of sample, technical and reading replicate on CVbefore (Picogreen).pdf"
## [6] "Fig. S5 Correlation tissue mass - DNA yield Hoechst vs PicoGreen.pdf"
## [7] "Fig.1_SEpool_SEindividual.pdf"
## [8] "Fig.1Fig.S1-4.html"
## [9] "Fig.2-3.html"
## [10] "Figure 2 Effect of the number of independent samples from the same individual on CVbefore.pdf"
## [11] "Figure S1 Estep1-3 Lambda equals 20 or 50.pdf"
## [12] "Figure S2 Estep1-3 Lambda equals 20 or 50.pdf"
## [13] "Figure S3 Estep1-3 Multiplicative overdispersion equals 1 or 3.pdf"
## [14] "Figure S4 Vstep1-3 Multiplicative overdispersion equals 1 or 3.pdf"
## [15] "PipetteMetrology.csv"
## [16] "Simulation_script_Fig.1Fig.S1-4.Rmd"
library(ggplot2)
library(gridExtra)
library(RColorBrewer)
## Parameters for the graphic
N1=1
N2=100
## Size of axis labels
cextextaxis = 12
## Size of axis title
cextitleaxis = 20
## Function baster on Eq 9 in Rode et al 2017 and Eq/ 2 in Gautier et al 2013
ratioSDpoolSDind <- function(x,alpha=alphai,lambdapool=lambdapooli,CV=CVi,Nind=Nindi,LambdaInd=LambdaIndi){
valpool=(1 + CV^2 * ((2*x-1)/(2*x))*(1-(1+lambdapool*(1+alpha))/lambdapool^2)+(2*x-1)*(1+lambdapool*(1+alpha))/lambdapool^2)/(2*x)
valind=(1 +(1+LambdaInd*(1+alphai))/LambdaInd^2)/(2*Nind)
val=sqrt(valpool/valind)
return(val)
}
## initial values
CVi=50/100
lambdapooli=100
alphai=0.5
Nindi=20
LambdaIndi=5
# plot
## Parameters for the graphic
N1=1
N2=100
## Size of axis labels
cextextaxis = 12
## Size of axis title
cextitleaxis = 20
# Palette de couleur
my_colors = brewer.pal(3, "Set1")
## Values for CV
CV1=20/100
CV2=50/100
CV3=100/100
## Values for alpha
Alpha1=0.2
Alpha2=0.5
Alpha3=1
# plot
LambdaPool1=50
LambdaPool2=100
LambdaPool3=200
## Thickness of the lines
cex1 <- 0.5
cex2 <- 1
cex3 <- 1.5
## Size of letters
lettersize=7
my.labs <- list(bquote(paste(CV==.(CV1*100)," %")) , bquote(paste(CV==.(CV2*100)," %")) , bquote(paste(CV==.(CV3*100)," %")) )
CVplot <- ggplot(data.frame(x=c(N1, N2)), aes(x)) +
stat_function(fun=ratioSDpoolSDind, args=list(CV=CV1), aes(colour = paste("CV = ",CV1,sep="")), size = cex1, linetype="solid") +
stat_function(fun=ratioSDpoolSDind, args=list(CV=CV2), aes(colour = paste("CV = ",CV2,sep="")), size = cex2, linetype="solid") +
stat_function(fun=ratioSDpoolSDind, args=list(CV=CV3), aes(colour = paste("CV = ",CV3,sep="")), size = cex3, linetype="solid") +
theme(legend.text=element_text(size=15), legend.position = c(.7, .9), legend.title=element_blank(), axis.text.x =element_text(color = "black", size = cextextaxis),axis.text.y =element_text(color = "black", size = cextextaxis),axis.title.x = element_text(face="bold", colour="black", size=cextitleaxis),axis.title.y = element_text(face="bold", colour="black", size=cextitleaxis) )+
scale_y_continuous("SE pool / SE individual",limits = c(0.5, 1.5)) +
scale_x_continuous("Pool sample size (N)") +
scale_colour_manual(values=my_colors, labels=my.labs)+
guides(colour = guide_legend("Type", override.aes = list( size = c(cex1,cex2,cex3))))+
## See: http://ggplot2.tidyverse.org/reference/annotate.html
annotate("text", x = 5.5, y=1.5, label = "(a)",size=lettersize)+
geom_hline(yintercept = 1, color="black", linetype="dotted")
my.labs <- list(bquote(alpha==.(Alpha1)) , bquote(alpha==.(Alpha2)) , bquote(alpha==.(Alpha3)) )
alphaplot <- ggplot(data.frame(x=c(N1, N2)), aes(x)) +
stat_function(fun=ratioSDpoolSDind, args=list(alpha=Alpha1), aes(colour = "Alpha1"), size = cex1, linetype="solid") +
stat_function(fun=ratioSDpoolSDind, args=list(alpha=Alpha2), aes(colour = "Alpha2"), size = cex2, linetype="solid") +
stat_function(fun=ratioSDpoolSDind, args=list(alpha=Alpha3), aes(colour = "Alpha3"), size = cex3, linetype="solid") +
theme(legend.text=element_text(size=15), legend.position = c(.73, .9), legend.title=element_blank(), axis.text.x =element_text(color = "black", size = cextextaxis),axis.text.y =element_text(color = "black", size = cextextaxis),axis.title.x = element_text(face="bold", colour="black", size=cextitleaxis),axis.title.y = element_text(face="bold", colour="black", size=cextitleaxis) )+
scale_y_continuous("",limits = c(0.5, 1.5)) +
scale_x_continuous("Pool sample size (N)") +
scale_colour_manual(values=my_colors, labels=my.labs)+
guides(colour = guide_legend("Type", override.aes = list( size = c(cex1,cex2,cex3))))+
annotate("text", x = 5.5, y=1.5, label = "(b)",size=lettersize)+
geom_hline(yintercept = 1, color="black", linetype="dotted")
## Test with lambda = 200
n <- N1:N2
plot(n,sapply(n,ratioSDpoolSDind,lambdapool=LambdaPool3), ylab= "SE pool/SE individual")
my.labs <- list(bquote(paste(lambda==.(LambdaPool1)," X")) , bquote(paste(lambda==.(LambdaPool2)," X")) , bquote(paste(lambda==.(LambdaPool3)," X")) )
lambdaplot <- ggplot(data.frame(x=c(N1, N2)), aes(x)) +
stat_function(fun=ratioSDpoolSDind, args=list(lambdapool=LambdaPool1), aes(colour = "Lambda1"), size = cex1, linetype="solid")+
stat_function(fun=ratioSDpoolSDind, args=list(lambdapool=LambdaPool2), aes(colour = "Lambda2"), size = cex2, linetype="solid") +
stat_function(fun=ratioSDpoolSDind, args=list(lambdapool=LambdaPool3), aes(colour = "Lambda3"), size = cex3, linetype="solid") +
theme(legend.text=element_text(size=15), legend.position = c(.73, .9), legend.title=element_blank(), axis.text.x =element_text(color = "black", size = cextextaxis),axis.text.y =element_text(color = "black", size = cextextaxis),axis.title.x = element_text(face="bold", colour="black", size=cextitleaxis),axis.title.y = element_text(face="bold", colour="black", size=cextitleaxis) )+
scale_y_continuous("",limits = c(0.5, 1.5)) +
scale_x_continuous("Pool sample size (N)") +
scale_colour_manual(values=my_colors, labels=my.labs)+
guides(colour = guide_legend("Type", override.aes = list( size = c(cex1,cex2,cex3))))+
annotate("text", x = 5.5, y=1.5, label = "(c)",size=lettersize)+
geom_hline(yintercept = 1, color="black", linetype="dotted")
## Text with legend
x1 <- 50
y1 <- 0.98
y2 <- 0.75
y1b <- 1.05
y2b <- 1.25
leg <- ggplot(data.frame(x=c(N1, N2)), aes(x)) +
#theme(panel.background = element_rect(fill = 'white', colour = 'white'))+
theme(axis.line=element_blank(),axis.text.x=element_blank(),
axis.text.y=element_blank(),axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),legend.position="none",
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())+
scale_y_continuous("",limits = c(0.5, 1.5)) +
scale_x_continuous("",limits = c(1, 100)) +
geom_segment(aes(x = x1, y = y1, xend = x1, yend = y2),arrow=arrow(length=unit(0.3,"cm"),ends = "last",type="closed"),lwd=2.5)+
geom_segment(aes(x = x1, y = y2b, xend = x1, yend = y1b),arrow=arrow(length=unit(0.3,"cm"),ends = "first",type="closed"),lwd=2.5)+
annotate("text", x = 55, y=1.3, label = "Ind-seq preferable",size=lettersize)+
annotate("text", x = 55, y=0.70, label = "Pool-seq preferable",size=lettersize)
pdf(file="Fig.1_SEpool_SEindividual.pdf", width = 29.7/2, height = 21/2)
grid.arrange(CVplot, alphaplot,lambdaplot,leg, ncol = 4)
## Warning: Removed 8 rows containing missing values (geom_path).
## Warning: Removed 10 rows containing missing values (geom_path).
## Warning: Removed 16 rows containing missing values (geom_path).
## Warning: Removed 9 rows containing missing values (geom_path).
## Warning: Removed 10 rows containing missing values (geom_path).
## Warning: Removed 11 rows containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_path).
## Warning: Removed 10 rows containing missing values (geom_path).
## Warning: Removed 9 rows containing missing values (geom_path).
dev.off()
## quartz_off_screen
## 2
#dev.copy(png,file=paste(getwd(),"Fig.1_SEpool_SEindivdual.png",sep="/")); dev.off()
#dev.copy(pdf,file=paste(getwd(),"Fig. 1 SSEpool/SEindivdual.pdf",sep="/")); dev.off()
## Standard deviation of the mass per individual sample
sig <- 0.1
## seed: seed used to initialize the random number generator
## mu: average DNA yield for one individual
## sigma: standard deviation in the DNA yield of one individual
## p: frequency of the reference allele
## N: total number of individuals sampled
simsampunique<- function(seed, p, N, mu, sigma=sig){
set.seed(seed)
## Number of individuals with reference allele sampled among a total of N individuals
nsim <- rbinom(n=1,size=N, prob=p)
if(nsim > 0 & nsim < N){
#nsim=6
## Mass of each individual sample
## Normal distribution
#Msim <- rnorm(N,mean=mu,sd=sigma)
## Gamma distribution
#Msim <- rgamma(N,shape=mu^2/sigma^2,scale=sigma^2/mu)
## Lognormal distribution
Msim <- rlnorm(N, meanlog = log(mu)-log(((sigma^2)/mu^2)+1)/2, sdlog = sqrt(log(((sigma^2)/mu^2)+1)))
## Proportion of focal genotype in the sample
DNAprop <- sum(Msim[1:nsim])/sum(Msim)
}else{
DNAprop <- nsim/N
}
return(DNAprop)
}
## Parallelization over many (10000) replicates
simsamp<- function(p=0.5,N=10,mu,sigma=sig,nrep=10000){
probsim <- sapply(1:nrep, simsampunique, mu=mu, p=p, N=N)
probsim
return(list(mean(probsim),sd(probsim)))
}
## Test with one example
simsamp(mu=10)
## [[1]]
## [1] 0.5002259
##
## [[2]]
## [1] 0.15792
#simsamp(mu=1,nrep=10000)
## To vectorize the simulations
##sapply(musim,simsamp,p=0.1)
## Coefficient of variation ranging from 0.1 to 1
musim <- sig/(1:10/10)
## Frequency of the reference allele
prob <- c(0.01,0.1, 0.25,0.5)
## Sample size
Ntot <- c(20,100)
## Creates empty dataframe
simquantsd <- data.frame(row.names=c("prob","N","mu","CV","mean","sd"))
## Loop to compute the frequency of the reference allele in the mass of the pool for each combination of parameters
for (Ntotsim in Ntot){
for (probsim in prob){
print(probsim)
sim <- sapply(musim,simsamp,p=probsim,N=Ntotsim)
sim <- matrix(unlist(sim),ncol=2,byrow=T)
simquantsd <- rbind(simquantsd,cbind(rep(probsim,length(musim)),rep(Ntotsim,length(musim)),musim,sig/musim,sim))
}
}
## [1] 0.01
## [1] 0.1
## [1] 0.25
## [1] 0.5
## [1] 0.01
## [1] 0.1
## [1] 0.25
## [1] 0.5
## Convert as dataframe
simquantsd <- as.data.frame(simquantsd)
colnames(simquantsd) <- c("prob","N","mu","CV","mean","sd")
## Using the analytical approximation (see V_(step1-2) Supp. Inf. p4)
simquantsd$approx<- sqrt((simquantsd$prob * (1-simquantsd$prob)/simquantsd$N)*(1+((simquantsd$N-1)/simquantsd$N)*(simquantsd$CV)^2))
simquantsd$probnum <- simquantsd$prob
simquantsd$prob <- as.factor(simquantsd$prob)
simquantsd$frequency <- factor(simquantsd$prob,levels=sort(levels(simquantsd$prob),decreasing = T))
simquantsd$color <- as.factor(simquantsd$prob)
## Graph for the expected frequency:
gg20mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean <- gg20mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle("N = 20")
gg20mean
gg100mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean <-gg100mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle("N = 100")
gg100mean
grid.arrange(gg20mean, gg100mean, ncol = 2)
## Variance
gg20 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20 <- gg20 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle("N = 20")
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20
gg100 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100 <-gg100 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle("N = 100")
gg100
grid.arrange(gg20, gg100, ncol = 2)
## Simulation parametrized using the mean of the Poisson law that generates the truncated Poisson
rtpois <- function(n, lambda=5) {
qpois(runif(n, dpois(0, lambda), 1), lambda)}
## Zero truncated negative binomial
rtnbinom <- function(n, lambda=5, sdlambda=1.00001) {
qnbinom(runif(n, dnbinom(0, size=lambda/(sdlambda-1), mu=lambda), 1), size=lambda/(sdlambda-1),mu=lambda)}
## Standard deviation for the mass per individual sample (e.g. mg)
sig1 <- 0.1
## Standard deviation for the DNA yield per mass unit (e.g. ng DNA / mg mass)
sig2 <- 0.01
## seed: seed used to initialize the random number generator
## p: frequency of the reference allele
## N: total number of individuals sampled
## mu: average DNA yield for one individual
## sigma1: Standard deviation for the mass per individual sample (e.g. mg)
## sigma2: Standard deviation for the DNA yield per mass unit (e.g. ng DNA / mg mass)
## lambdasim: average sequencing depth
## sdlambdasim: multiplicative overdispersion (s) for the negative binomial as parametrize in Gautier et al 2013
simsampunique<- function(seed, p, N, mu, sigma1=sig1, sigma2=sig2, ratio=1, lambdasim=5, sdlambdasim=1){
set.seed(seed)
## Number of individuals of focal genotype in the sample
nsim <- rbinom(n=1,size=N, prob=p)
#nsim=6
## Mass of each individual sample
if(nsim > 0 & nsim < N){
## Mean and variance for the mass recovered for each individual sample
## Normal distribution
# Msim <- rnorm(N,mean=mu,sd=sigma1)
## Gamma distribution
#Msim <- rgamma(N,shape=mu^2/sigma^2,scale=sigma^2/mu)
#Msim <- rgamma(N,shape=mu^2/sigma1^2,scale=sigma1^2/mu)
## Lognormal distribution
## Mean and variance of the first lognormal equals to mu and sigma1^2
location1 = log(mu)-log(((sigma1/mu)^2)+1)/2
scale1 = sqrt(log(((sigma1/mu)^2)+1))
Msim <- rlnorm(N, meanlog = location1, sdlog = scale1)
## Mean and variance for DNA yield per mass unit
## Mean and variance of the second lognormal
scale2=sqrt(log(0.5*(sqrt(4*sigma2^2+1)+1)))
Yieldsim <- rlnorm(N, meanlog = 0, sdlog = scale2)
## Multiply the mass by the DNA yield per mass unit
Msim <- Msim*Yieldsim
## Sample the total number of reads
if(sdlambdasim==1){
## Poisson distribution
Totreads <- rtpois(n=1,lambda=lambdasim)
}else{
## With neg binomial
Totreads <- rtnbinom(n=1, lambda=lambdasim, sdlambda=sdlambdasim)
}
## Proportion of focal genotype in the sample
if(sum(Msim)<=0|sum(Msim[1:nsim])<=0|sum(Msim[(nsim+1):N])<=0){
allelefreq <- NA
}else{
numfocalreads <- ifelse(sum(Msim[1:nsim]) <= 0,0,rbinom(n=1,size=Totreads,prob=sum(Msim[1:nsim])/sum(Msim)))
#print(c(seed,numfocalreads/Totreads))
allelefreq <- c(numfocalreads/Totreads)
}
}else{
allelefreq <- nsim/N
}
return(allelefreq)
}
## Function to parallelize over many replicates
simsamp<- function(p=0.5, N=10, mu, sigma1=sig1, sigma2=sig2, lambdasim=5, sdlambdasim=1, nrep=10000){
probvec <- sapply(1:nrep, simsampunique, mu=mu, p=p, N=N, lambdasim=lambdasim, sdlambdasim=sdlambdasim, sigma1=sigma1, sigma2=sigma2)
probvec
return(list(mean(na.omit(probvec)),sd(na.omit(probvec))))
}
## Test with one example
simsamp(mu=0.1)
## [[1]]
## [1] 0.4967572
##
## [[2]]
## [1] 0.3058073
#simsamp(mu=1,nrep=10000)
## To vectorize the simulations
##sapply(musim,simsamp,p=0.1)
#sdlambda=c(1,3)
#lambda=c(20)
simvarstep1to3 <- function(sdlambda=c(1), lambda=c(20, 50)){
## Coefficient of variation ranging from 0.1 to 1
musim <- 0.1/c(0.001,(1:10/10))
## Coefficient of variation ranging from 0.1 to 1
## Mean and variance for the mass recovered for each individual sample
location1=log(musim)-log(((sig1/musim)^2)+1)/2
scale1=sqrt(log(((sig1/musim)^2)+1))
location1=log(musim)-(scale1^2)/2
## Mean and variance for DNA yield per mass unit
scale2=sqrt(log(0.5*(sqrt(4*sig2^2+1)+1)))
scale=sqrt(scale1^2+scale2^2)
## Analytical formula based on logNormal resulting from the product of the first and second lognormal (see Note 1 at the bottom of Supp. Inf. S1 p4)
#vareq <- (exp(scale^2)-1)*exp(2*location1+scale^2)
#vareq2 <- (1/2)*(sqrt(4*sig2^2+1)+1)*((1/2)*(sqrt(4*sig2^2+1)+1)*(sig1^2+musim^2)-musim^2)
#vareq3 <- sig1^2*sig2^2+musim^2*sig2^2+((sig1^2)/2)*(sqrt(4*sig2^2+1)+1)
#data.frame(vareq,vareq2,vareq3)
## Compute CV to verify it is between 0.01 and 1
##sqrt(vareq)/musim
## Focal frequency
prob <- c(0.01,0.1, 0.25,0.5)
## Sample size
Ntot <- c(20,200)
## Creates empty dataframe
simquantsd <- data.frame(row.names=c("prob","N","mu","mean","sd","lambda","sdlambda","sig1","sig2"))
## Loop to compute the frequency of the reference allele in the pool of reads for each combination of parameters
for (Ntotsim in Ntot){
for (lambdasim in lambda){
for (sdlambdasim in sdlambda){
for (probsim in prob){
print(c(Ntotsim,lambdasim,sdlambdasim,probsim))
sim <- sapply(musim,simsamp,p=probsim,N=Ntotsim,lambdasim=lambdasim,sdlambdasim=sdlambdasim)
sim <- matrix(unlist(sim),ncol=2,byrow=T)
simquantsd <- rbind(simquantsd,cbind(rep(probsim,length(musim)),rep(Ntotsim,length(musim)),musim,sim,rep(lambdasim,length(musim)),rep(sdlambdasim,length(musim)),rep(sig1,length(musim)),rep(sig2,length(musim))))
}
}
}
}
simquantsd <- as.data.frame(simquantsd)
colnames(simquantsd) <- c("prob","N","mu","mean","sd","lambda","sdlambda","sig1","sig2")
## Analytical formula based on logNormal resulting from the product of the first and second lognormal (see Note 1 at the bottom of Supp. Inf. S1 p4)
simquantsd$location1 <- log(simquantsd$mu)-log(((simquantsd$sig1/simquantsd$mu)^2)+1)/2
simquantsd$scale1 <- sqrt(log(((simquantsd$sig1/simquantsd$mu)^2)+1))
simquantsd$scale2 <- sqrt(log(0.5*(sqrt(4*simquantsd$sig2^2+1)+1)))
simquantsd$scale <- sqrt(simquantsd$scale1^2+simquantsd$scale2^2)
## Compute the variance in individual contribution to the DNA pool
simquantsd$vareq <- (exp(simquantsd$scale^2)-1)*exp(2*simquantsd$location1+simquantsd$scale^2)
## Compute the coeffcicent of variation of individual contribution to the DNA pool
simquantsd$CV <- sqrt(simquantsd$vareq)/simquantsd$mu
## Expecteation for 1/r (i.e. 1 over the total number of reads at a given position, provided that at least one read has been sequenced)
## See Eq 1 p1 in Supp. Inf. of Gautier et al 2013
simquantsd$lambdavar <- (simquantsd$lambda+simquantsd$sdlambda)/simquantsd$lambda^2
## Using the analytical approximation (see V_(step1-2) Supp. Inf. p4)
simquantsd$approx <- sqrt((simquantsd$prob * (1-simquantsd$prob)/simquantsd$N)*(1+((simquantsd$N-1) *(1-simquantsd$lambdavar)*(simquantsd$CV)^2)/simquantsd$N+((simquantsd$N-1)*simquantsd$lambdavar)))
## Allele frequencies ad factors or a as numeric variables
simquantsd$probnum <- simquantsd$prob
simquantsd$prob <- as.factor(simquantsd$prob)
simquantsd$frequency <- factor(simquantsd$prob,levels=sort(levels(simquantsd$prob),decreasing = T))
simquantsd$color <- as.factor(simquantsd$prob)
#head(simquantsd)
head(simquantsd[simquantsd$N==Ntot[1],])
return(simquantsd)
}
#pdf(file="Read coverage variance lognormal error lambda=20,50.pdf")
sdlambda = c(1)
lambda = c(20,50)
simquantsd <- simvarstep1to3(sdlambda = c(1), lambda = c(20,50))
## [1] 20.00 20.00 1.00 0.01
## [1] 20.0 20.0 1.0 0.1
## [1] 20.00 20.00 1.00 0.25
## [1] 20.0 20.0 1.0 0.5
## [1] 20.00 50.00 1.00 0.01
## [1] 20.0 50.0 1.0 0.1
## [1] 20.00 50.00 1.00 0.25
## [1] 20.0 50.0 1.0 0.5
## [1] 2e+02 2e+01 1e+00 1e-02
## [1] 200.0 20.0 1.0 0.1
## [1] 200.00 20.00 1.00 0.25
## [1] 200.0 20.0 1.0 0.5
## [1] 2e+02 5e+01 1e+00 1e-02
## [1] 200.0 50.0 1.0 0.1
## [1] 200.00 50.00 1.00 0.25
## [1] 200.0 50.0 1.0 0.5
## Mean for lambda=20
gg20mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean <- gg20mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
gg20mean
gg100mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean <-gg100mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",1,sep="")))
gg100mean
grid.arrange(gg20mean, gg100mean, ncol = 2)
## Mean for lambda=50
gg20mean50 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean50 <- gg20mean50 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",50,", s = ",1,sep="")))
gg20mean50
gg100mean50 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean50 <-gg100mean50 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",50,", s = ",1,sep="")))
gg100mean50
grid.arrange(gg20mean50, gg100mean50, ncol = 2)
grid.arrange(gg20mean, gg100mean,gg20mean50, gg100mean50, ncol = 2)
dev.copy(pdf,file=paste(getwd(),"Figure S1 Estep1-3 Lambda equals 20 or 50.pdf",sep="/"));dev.off()
## pdf
## 3
## quartz_off_screen
## 2
## Variance for lambda=20
gg20 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20 <- gg20 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20
gg100 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100 <-gg100 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",1,sep="")))
gg100
grid.arrange(gg20, gg100, ncol = 2)
## Variance for lambda=50
gg2050 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg2050 <- gg2050 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 20, ",lambda," = 50, s = 1",sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg2050
gg10050 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg10050 <-gg10050 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 200, ",lambda," = 50, s = 1",sep="")))
gg10050
grid.arrange(gg2050, gg10050, ncol = 2)
grid.arrange(gg20,gg100,gg2050, gg10050, ncol = 2, nrow = 2)
dev.copy(pdf,file=paste(getwd(),"Figure S2 Estep1-3 Lambda equals 20 or 50.pdf",sep="/"));dev.off()
## pdf
## 3
## quartz_off_screen
## 2
#dev.off()
lambda = c(20)
sdlambda = c(1,3)
simquantsd <- simvarstep1to3(lambda = c(20),sdlambda = c(1,3))
## [1] 20.00 20.00 1.00 0.01
## [1] 20.0 20.0 1.0 0.1
## [1] 20.00 20.00 1.00 0.25
## [1] 20.0 20.0 1.0 0.5
## [1] 20.00 20.00 3.00 0.01
## [1] 20.0 20.0 3.0 0.1
## [1] 20.00 20.00 3.00 0.25
## [1] 20.0 20.0 3.0 0.5
## [1] 2e+02 2e+01 1e+00 1e-02
## [1] 200.0 20.0 1.0 0.1
## [1] 200.00 20.00 1.00 0.25
## [1] 200.0 20.0 1.0 0.5
## [1] 2e+02 2e+01 3e+00 1e-02
## [1] 200.0 20.0 3.0 0.1
## [1] 200.00 20.00 3.00 0.25
## [1] 200.0 20.0 3.0 0.5
#pdf(file="Read coverage variance lognormal error sdlambda=1,3.pdf")
## Mean sdlambda=1
gg20mean <- ggplot() + geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean <- gg20mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
gg20mean
gg100mean <- ggplot() + geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean <-gg100mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s =",1,sep="")))
gg100mean
grid.arrange(gg20mean, gg100mean, ncol = 2)
## Mean sdlambda=3
gg20mean3 <- ggplot() + geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean3 <- gg20mean3 + theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",3,sep="")))
gg20mean3
gg100mean3 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean3 <-gg100mean3 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",3,sep="")))
gg100mean3
grid.arrange(gg20mean3, gg100mean3, ncol = 2)
grid.arrange(gg20mean, gg100mean,gg20mean3, gg100mean3, ncol = 2, nrow = 2)
dev.copy(pdf,file=paste(getwd(),"Figure S3 Estep1-3 Multiplicative overdispersion equals 1 or 3.pdf",sep="/"));dev.off()
## pdf
## 3
## quartz_off_screen
## 2
## Variance sdlambda=1
gg20 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20 <- gg20 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20
gg100 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100 <-gg100 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",1,sep="")))
gg100
grid.arrange(gg20, gg100, ncol = 2)
## Variance sdlambda=3
gg20sdlambda3 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20sdlambda3 <- gg20sdlambda3 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 20, ",lambda," = 20, s = 3",sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20sdlambda3
gg100sdlambda3 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100sdlambda3 <-gg100sdlambda3 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 200, ",lambda," = 20, s = 3",sep="")))
gg100sdlambda3
grid.arrange(gg20sdlambda3,gg100sdlambda3,nrow = 2)
grid.arrange(gg20, gg100,gg20sdlambda3,gg100sdlambda3,nrow = 2, ncol = 2)
#dev.off()
dev.copy(pdf,file=paste(getwd(),"Figure S4 Vstep1-3 Multiplicative overdispersion equals 1 or 3.pdf",sep="/"));dev.off()
## pdf
## 3
## quartz_off_screen
## 2