Simulations to estimate the variance associated with the estimator of p (frequency of the reference allele within the pool)


1 Setworking directory

setwd("/Users/nicolasrode/Dropbox/GBS/Article/Dryad")
list.files()
##  [1] "DataHoechst.csv"                                                                                      
##  [2] "DataPicogreen.csv"                                                                                    
##  [3] "DNAconcentration_estimation_script_Fig.2-3.Rmd"                                                       
##  [4] "Effect of the number of sample, technical and reading replicate on CVbefore and CVafter (Hoechst).pdf"
##  [5] "Fig. 3 Effect of the number of sample, technical and reading replicate on CVbefore  (Picogreen).pdf"  
##  [6] "Fig. S5 Correlation tissue mass - DNA yield Hoechst vs PicoGreen.pdf"                                 
##  [7] "Fig.1_SEpool_SEindividual.pdf"                                                                        
##  [8] "Fig.1Fig.S1-4.html"                                                                                   
##  [9] "Fig.2-3.html"                                                                                         
## [10] "Figure 2 Effect of the number of independent samples from the same individual on CVbefore.pdf"        
## [11] "Figure S1 Estep1-3 Lambda equals 20 or 50.pdf"                                                        
## [12] "Figure S2 Estep1-3 Lambda equals 20 or 50.pdf"                                                        
## [13] "Figure S3 Estep1-3 Multiplicative overdispersion equals 1 or 3.pdf"                                   
## [14] "Figure S4 Vstep1-3 Multiplicative overdispersion equals 1 or 3.pdf"                                   
## [15] "PipetteMetrology.csv"                                                                                 
## [16] "Simulation_script_Fig.1Fig.S1-4.Rmd"
library(ggplot2)
library(gridExtra)
library(RColorBrewer)

2 Figure 1

2.1 Parameters

## Parameters for the graphic
N1=1
N2=100
  
## Size of axis labels
cextextaxis = 12
## Size of axis title
cextitleaxis = 20   


## Function baster on Eq 9 in Rode et al 2017 and Eq/ 2 in Gautier et al 2013
ratioSDpoolSDind <- function(x,alpha=alphai,lambdapool=lambdapooli,CV=CVi,Nind=Nindi,LambdaInd=LambdaIndi){

   valpool=(1 + CV^2 * ((2*x-1)/(2*x))*(1-(1+lambdapool*(1+alpha))/lambdapool^2)+(2*x-1)*(1+lambdapool*(1+alpha))/lambdapool^2)/(2*x)
   valind=(1 +(1+LambdaInd*(1+alphai))/LambdaInd^2)/(2*Nind)
   val=sqrt(valpool/valind)
  return(val)

}

## initial values

CVi=50/100
lambdapooli=100
alphai=0.5
Nindi=20
LambdaIndi=5

    # plot
## Parameters for the graphic
N1=1
N2=100
  
## Size of axis labels
cextextaxis = 12
## Size of axis title
cextitleaxis = 20   

# Palette de couleur
my_colors = brewer.pal(3, "Set1") 
## Values for CV
CV1=20/100
CV2=50/100
CV3=100/100
## Values for alpha
Alpha1=0.2
Alpha2=0.5
Alpha3=1

    # plot

LambdaPool1=50
LambdaPool2=100
LambdaPool3=200

## Thickness of the lines
cex1 <- 0.5
cex2 <- 1
cex3 <- 1.5

## Size of letters
lettersize=7

2.2 Graph

    my.labs <- list(bquote(paste(CV==.(CV1*100)," %")) , bquote(paste(CV==.(CV2*100)," %")) , bquote(paste(CV==.(CV3*100)," %")) )
CVplot <-   ggplot(data.frame(x=c(N1, N2)), aes(x)) +
    stat_function(fun=ratioSDpoolSDind, args=list(CV=CV1),  aes(colour = paste("CV = ",CV1,sep="")), size = cex1, linetype="solid") +
        stat_function(fun=ratioSDpoolSDind, args=list(CV=CV2),  aes(colour = paste("CV = ",CV2,sep="")), size = cex2, linetype="solid") +
        stat_function(fun=ratioSDpoolSDind, args=list(CV=CV3),  aes(colour = paste("CV = ",CV3,sep="")), size = cex3, linetype="solid") +
        theme(legend.text=element_text(size=15), legend.position = c(.7, .9), legend.title=element_blank(), axis.text.x =element_text(color = "black", size = cextextaxis),axis.text.y =element_text(color = "black", size = cextextaxis),axis.title.x = element_text(face="bold", colour="black", size=cextitleaxis),axis.title.y = element_text(face="bold", colour="black", size=cextitleaxis) )+
      scale_y_continuous("SE pool / SE individual",limits = c(0.5, 1.5)) +
      scale_x_continuous("Pool sample size (N)") +
        scale_colour_manual(values=my_colors, labels=my.labs)+
      guides(colour = guide_legend("Type", override.aes = list( size = c(cex1,cex2,cex3))))+
  ## See: http://ggplot2.tidyverse.org/reference/annotate.html
      annotate("text", x = 5.5, y=1.5, label = "(a)",size=lettersize)+
      geom_hline(yintercept = 1, color="black", linetype="dotted")




    my.labs <- list(bquote(alpha==.(Alpha1)) , bquote(alpha==.(Alpha2)) , bquote(alpha==.(Alpha3)) )
alphaplot <-    ggplot(data.frame(x=c(N1, N2)), aes(x)) +  
        stat_function(fun=ratioSDpoolSDind, args=list(alpha=Alpha1),  aes(colour = "Alpha1"), size = cex1, linetype="solid") +
        stat_function(fun=ratioSDpoolSDind, args=list(alpha=Alpha2),  aes(colour = "Alpha2"), size = cex2, linetype="solid") +
        stat_function(fun=ratioSDpoolSDind, args=list(alpha=Alpha3),  aes(colour = "Alpha3"), size = cex3, linetype="solid") +
        theme(legend.text=element_text(size=15), legend.position = c(.73, .9), legend.title=element_blank(), axis.text.x =element_text(color = "black", size = cextextaxis),axis.text.y =element_text(color = "black", size = cextextaxis),axis.title.x = element_text(face="bold", colour="black", size=cextitleaxis),axis.title.y = element_text(face="bold", colour="black", size=cextitleaxis) )+
      scale_y_continuous("",limits = c(0.5, 1.5)) +
      scale_x_continuous("Pool sample size (N)") +
        scale_colour_manual(values=my_colors, labels=my.labs)+
      guides(colour = guide_legend("Type", override.aes = list( size = c(cex1,cex2,cex3))))+
      annotate("text", x = 5.5, y=1.5, label = "(b)",size=lettersize)+
      geom_hline(yintercept = 1, color="black", linetype="dotted") 
    



## Test with lambda = 200
n <- N1:N2
plot(n,sapply(n,ratioSDpoolSDind,lambdapool=LambdaPool3), ylab= "SE pool/SE individual")

    my.labs <- list(bquote(paste(lambda==.(LambdaPool1)," X")) , bquote(paste(lambda==.(LambdaPool2)," X")) , bquote(paste(lambda==.(LambdaPool3)," X")) )
lambdaplot <-   ggplot(data.frame(x=c(N1, N2)), aes(x)) +  
        stat_function(fun=ratioSDpoolSDind, args=list(lambdapool=LambdaPool1), aes(colour = "Lambda1"), size = cex1, linetype="solid")+ 
        stat_function(fun=ratioSDpoolSDind, args=list(lambdapool=LambdaPool2), aes(colour = "Lambda2"), size = cex2, linetype="solid") +
        stat_function(fun=ratioSDpoolSDind, args=list(lambdapool=LambdaPool3), aes(colour = "Lambda3"), size = cex3, linetype="solid") +
        theme(legend.text=element_text(size=15), legend.position = c(.73, .9), legend.title=element_blank(), axis.text.x =element_text(color = "black", size = cextextaxis),axis.text.y =element_text(color = "black", size = cextextaxis),axis.title.x = element_text(face="bold", colour="black", size=cextitleaxis),axis.title.y = element_text(face="bold", colour="black", size=cextitleaxis) )+
        scale_y_continuous("",limits = c(0.5, 1.5)) +
        scale_x_continuous("Pool sample size (N)") +
          scale_colour_manual(values=my_colors, labels=my.labs)+
        guides(colour = guide_legend("Type", override.aes = list( size = c(cex1,cex2,cex3))))+
        annotate("text", x = 5.5, y=1.5, label = "(c)",size=lettersize)+
        geom_hline(yintercept = 1, color="black", linetype="dotted") 
    

## Text with legend
 x1 <- 50
 y1 <- 0.98
 y2 <- 0.75
 y1b <- 1.05
 y2b <- 1.25
 
leg <- ggplot(data.frame(x=c(N1, N2)), aes(x)) +
#theme(panel.background = element_rect(fill = 'white', colour = 'white'))+
   theme(axis.line=element_blank(),axis.text.x=element_blank(),
          axis.text.y=element_blank(),axis.ticks=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),legend.position="none",
          panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
          panel.grid.minor=element_blank(),plot.background=element_blank())+
  scale_y_continuous("",limits = c(0.5, 1.5)) +
  scale_x_continuous("",limits = c(1, 100)) +
  geom_segment(aes(x = x1, y = y1, xend = x1, yend = y2),arrow=arrow(length=unit(0.3,"cm"),ends = "last",type="closed"),lwd=2.5)+
  geom_segment(aes(x = x1, y = y2b, xend = x1, yend = y1b),arrow=arrow(length=unit(0.3,"cm"),ends = "first",type="closed"),lwd=2.5)+
 annotate("text", x = 55, y=1.3, label = "Ind-seq preferable",size=lettersize)+
annotate("text", x = 55, y=0.70, label = "Pool-seq preferable",size=lettersize)




pdf(file="Fig.1_SEpool_SEindividual.pdf", width = 29.7/2, height = 21/2)



grid.arrange(CVplot, alphaplot,lambdaplot,leg, ncol = 4)
## Warning: Removed 8 rows containing missing values (geom_path).
## Warning: Removed 10 rows containing missing values (geom_path).
## Warning: Removed 16 rows containing missing values (geom_path).
## Warning: Removed 9 rows containing missing values (geom_path).
## Warning: Removed 10 rows containing missing values (geom_path).
## Warning: Removed 11 rows containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_path).
## Warning: Removed 10 rows containing missing values (geom_path).
## Warning: Removed 9 rows containing missing values (geom_path).
dev.off()
## quartz_off_screen 
##                 2
#dev.copy(png,file=paste(getwd(),"Fig.1_SEpool_SEindivdual.png",sep="/")); dev.off()
#dev.copy(pdf,file=paste(getwd(),"Fig. 1 SSEpool/SEindivdual.pdf",sep="/")); dev.off()

3 Comparison of simulations to analytical predictions for Vstep1-2

3.1 Function to compute the proportion of the total mass with the reference allele within the grinded pool

## Standard deviation of the mass per individual sample
sig <- 0.1

## seed: seed used to initialize the random number generator
## mu: average DNA yield for one individual
## sigma: standard deviation in the DNA yield of one individual
## p: frequency of the reference allele
## N: total number of individuals sampled

simsampunique<- function(seed, p, N, mu, sigma=sig){
  
    set.seed(seed)
    ## Number of individuals with reference allele sampled among a total of N individuals
    nsim <- rbinom(n=1,size=N, prob=p)
    if(nsim > 0 & nsim < N){
      #nsim=6
      ## Mass of each individual sample
      ## Normal distribution
      #Msim <- rnorm(N,mean=mu,sd=sigma)
      ## Gamma distribution
      #Msim <- rgamma(N,shape=mu^2/sigma^2,scale=sigma^2/mu)
      ## Lognormal distribution
      Msim <- rlnorm(N, meanlog = log(mu)-log(((sigma^2)/mu^2)+1)/2, sdlog = sqrt(log(((sigma^2)/mu^2)+1)))
      ## Proportion of focal genotype in the sample
      DNAprop <- sum(Msim[1:nsim])/sum(Msim)
    }else{
      DNAprop <- nsim/N
    }
    return(DNAprop)
  }

3.2 Function to compute the mean and standard deviation across many samples of the frequency of the mass with reference allele within the grinded pool

## Parallelization over many (10000) replicates
simsamp<- function(p=0.5,N=10,mu,sigma=sig,nrep=10000){
 probsim <- sapply(1:nrep, simsampunique, mu=mu, p=p, N=N)
 probsim
 return(list(mean(probsim),sd(probsim)))
}

## Test with one example
simsamp(mu=10)
## [[1]]
## [1] 0.5002259
## 
## [[2]]
## [1] 0.15792

3.3 Simulations

#simsamp(mu=1,nrep=10000)
## To vectorize the simulations
##sapply(musim,simsamp,p=0.1)

## Coefficient of variation ranging from 0.1 to 1
musim <- sig/(1:10/10)
## Frequency of the reference allele
prob <- c(0.01,0.1, 0.25,0.5)
## Sample size
Ntot <- c(20,100)

## Creates empty dataframe
simquantsd <- data.frame(row.names=c("prob","N","mu","CV","mean","sd"))

## Loop to compute the frequency of the reference allele in the mass of the pool for each combination of parameters
for (Ntotsim in Ntot){
  for (probsim in prob){
    print(probsim)
    sim <- sapply(musim,simsamp,p=probsim,N=Ntotsim)
    sim <- matrix(unlist(sim),ncol=2,byrow=T)
    simquantsd <- rbind(simquantsd,cbind(rep(probsim,length(musim)),rep(Ntotsim,length(musim)),musim,sig/musim,sim))
 }
}
## [1] 0.01
## [1] 0.1
## [1] 0.25
## [1] 0.5
## [1] 0.01
## [1] 0.1
## [1] 0.25
## [1] 0.5
## Convert as dataframe
simquantsd <- as.data.frame(simquantsd)
colnames(simquantsd) <- c("prob","N","mu","CV","mean","sd")

## Using the analytical approximation (see V_(step1-2) Supp. Inf. p4)
simquantsd$approx<- sqrt((simquantsd$prob * (1-simquantsd$prob)/simquantsd$N)*(1+((simquantsd$N-1)/simquantsd$N)*(simquantsd$CV)^2))
simquantsd$probnum <- simquantsd$prob

simquantsd$prob <- as.factor(simquantsd$prob)
simquantsd$frequency <- factor(simquantsd$prob,levels=sort(levels(simquantsd$prob),decreasing = T))
simquantsd$color <- as.factor(simquantsd$prob)

3.4 Graphs

## Graph for the expected frequency: 
gg20mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean <- gg20mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle("N = 20")
gg20mean

gg100mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean <-gg100mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle("N = 100")
gg100mean

grid.arrange(gg20mean, gg100mean, ncol = 2)

## Variance
gg20 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20 <- gg20 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle("N = 20")
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20

gg100 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100 <-gg100 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle("N = 100")
gg100

grid.arrange(gg20, gg100, ncol = 2)

4 Comparison of simulations to analytical predictions for Vstep1-3

4.1 Function with to sample the total number of reads

## Simulation parametrized using the mean of the Poisson law that generates the truncated Poisson
rtpois <- function(n, lambda=5) {
            qpois(runif(n, dpois(0, lambda), 1), lambda)}

## Zero truncated negative binomial
rtnbinom <- function(n, lambda=5, sdlambda=1.00001) {
            qnbinom(runif(n, dnbinom(0, size=lambda/(sdlambda-1), mu=lambda), 1), size=lambda/(sdlambda-1),mu=lambda)}

4.2 Function to compute the propotion of reads with the reference allele within the pool of reads

## Standard deviation for the mass per individual sample (e.g. mg)

sig1 <- 0.1
## Standard deviation for the DNA yield per mass unit (e.g. ng DNA / mg mass)
sig2 <- 0.01

## seed: seed used to initialize the random number generator
## p: frequency of the reference allele
## N: total number of individuals sampled
## mu: average DNA yield for one individual
## sigma1: Standard deviation for the mass per individual sample (e.g. mg)
## sigma2: Standard deviation for the DNA yield per mass unit (e.g. ng DNA / mg mass)
## lambdasim: average sequencing depth
## sdlambdasim: multiplicative overdispersion (s) for the negative binomial as parametrize in Gautier et al 2013

simsampunique<- function(seed, p, N, mu, sigma1=sig1, sigma2=sig2, ratio=1, lambdasim=5, sdlambdasim=1){
  
    set.seed(seed)
    ## Number of individuals of focal genotype in the sample
    nsim <- rbinom(n=1,size=N, prob=p)
    #nsim=6
    ## Mass of each individual sample
    if(nsim > 0 & nsim < N){
    ## Mean and variance for the mass recovered for each individual sample
      ## Normal distribution
      # Msim <- rnorm(N,mean=mu,sd=sigma1)
      ## Gamma distribution
      #Msim <- rgamma(N,shape=mu^2/sigma^2,scale=sigma^2/mu)
      #Msim <- rgamma(N,shape=mu^2/sigma1^2,scale=sigma1^2/mu)
      
      ## Lognormal distribution
      ## Mean and variance of the first lognormal equals to mu and sigma1^2
      location1 = log(mu)-log(((sigma1/mu)^2)+1)/2
      scale1 = sqrt(log(((sigma1/mu)^2)+1))
      Msim <- rlnorm(N, meanlog = location1, sdlog = scale1)

    ## Mean and variance for DNA yield per mass unit
      ## Mean and variance of the second lognormal
      scale2=sqrt(log(0.5*(sqrt(4*sigma2^2+1)+1)))
      Yieldsim <- rlnorm(N, meanlog = 0, sdlog = scale2)
      
    ## Multiply the mass by the DNA yield per mass unit
      Msim <- Msim*Yieldsim
      
    ## Sample the total number of reads
      if(sdlambdasim==1){
        ## Poisson distribution
        Totreads <- rtpois(n=1,lambda=lambdasim)
      
      }else{
        ## With neg binomial
        Totreads <- rtnbinom(n=1, lambda=lambdasim, sdlambda=sdlambdasim)
      }
      
    ## Proportion of focal genotype in the sample
      if(sum(Msim)<=0|sum(Msim[1:nsim])<=0|sum(Msim[(nsim+1):N])<=0){
        allelefreq <- NA
      }else{
        numfocalreads <- ifelse(sum(Msim[1:nsim]) <= 0,0,rbinom(n=1,size=Totreads,prob=sum(Msim[1:nsim])/sum(Msim)))
        #print(c(seed,numfocalreads/Totreads))
        allelefreq <- c(numfocalreads/Totreads)
      }
    }else{
      allelefreq <- nsim/N
    }
    return(allelefreq)
}

4.3 Function to compute the mean and standard deviation of the frequency of the reference allele across many samples

## Function to parallelize over many replicates
simsamp<- function(p=0.5, N=10, mu, sigma1=sig1, sigma2=sig2, lambdasim=5, sdlambdasim=1, nrep=10000){
 probvec <- sapply(1:nrep, simsampunique, mu=mu, p=p, N=N, lambdasim=lambdasim, sdlambdasim=sdlambdasim, sigma1=sigma1, sigma2=sigma2)
 probvec
 return(list(mean(na.omit(probvec)),sd(na.omit(probvec))))
}

## Test with one example
simsamp(mu=0.1)
## [[1]]
## [1] 0.4967572
## 
## [[2]]
## [1] 0.3058073

4.4 Simulations

#simsamp(mu=1,nrep=10000)
## To vectorize the simulations
##sapply(musim,simsamp,p=0.1)


#sdlambda=c(1,3)

#lambda=c(20)


simvarstep1to3 <- function(sdlambda=c(1), lambda=c(20, 50)){

## Coefficient of variation ranging from 0.1 to 1
musim <- 0.1/c(0.001,(1:10/10))

## Coefficient of variation ranging from 0.1 to 1
## Mean and variance for the mass recovered for each individual sample
location1=log(musim)-log(((sig1/musim)^2)+1)/2
scale1=sqrt(log(((sig1/musim)^2)+1))
location1=log(musim)-(scale1^2)/2


## Mean and variance for DNA yield per mass unit
scale2=sqrt(log(0.5*(sqrt(4*sig2^2+1)+1)))
scale=sqrt(scale1^2+scale2^2)

## Analytical formula based on logNormal resulting from the product of the first and second lognormal (see Note 1 at the bottom of Supp. Inf. S1 p4)
#vareq <- (exp(scale^2)-1)*exp(2*location1+scale^2)
#vareq2 <- (1/2)*(sqrt(4*sig2^2+1)+1)*((1/2)*(sqrt(4*sig2^2+1)+1)*(sig1^2+musim^2)-musim^2)
#vareq3 <- sig1^2*sig2^2+musim^2*sig2^2+((sig1^2)/2)*(sqrt(4*sig2^2+1)+1)
#data.frame(vareq,vareq2,vareq3)

## Compute CV to verify it is between 0.01 and 1
##sqrt(vareq)/musim

## Focal frequency
prob <- c(0.01,0.1, 0.25,0.5)
## Sample size
Ntot <- c(20,200)


## Creates empty dataframe
simquantsd <- data.frame(row.names=c("prob","N","mu","mean","sd","lambda","sdlambda","sig1","sig2"))


## Loop to compute the frequency of the reference allele in the pool of reads for each combination of parameters
for (Ntotsim in Ntot){
  for (lambdasim in lambda){
    for (sdlambdasim in sdlambda){
      for (probsim in prob){
        print(c(Ntotsim,lambdasim,sdlambdasim,probsim))
        sim <- sapply(musim,simsamp,p=probsim,N=Ntotsim,lambdasim=lambdasim,sdlambdasim=sdlambdasim)
        sim <- matrix(unlist(sim),ncol=2,byrow=T)
        simquantsd <- rbind(simquantsd,cbind(rep(probsim,length(musim)),rep(Ntotsim,length(musim)),musim,sim,rep(lambdasim,length(musim)),rep(sdlambdasim,length(musim)),rep(sig1,length(musim)),rep(sig2,length(musim))))
     }
    }
  }
}

simquantsd <- as.data.frame(simquantsd)
colnames(simquantsd) <- c("prob","N","mu","mean","sd","lambda","sdlambda","sig1","sig2")

## Analytical formula based on logNormal resulting from the product of the first and second lognormal (see Note 1 at the bottom of Supp. Inf. S1 p4)
simquantsd$location1 <- log(simquantsd$mu)-log(((simquantsd$sig1/simquantsd$mu)^2)+1)/2
simquantsd$scale1 <- sqrt(log(((simquantsd$sig1/simquantsd$mu)^2)+1))
simquantsd$scale2 <- sqrt(log(0.5*(sqrt(4*simquantsd$sig2^2+1)+1)))
simquantsd$scale <- sqrt(simquantsd$scale1^2+simquantsd$scale2^2)
## Compute the variance in individual contribution to the DNA pool
simquantsd$vareq <- (exp(simquantsd$scale^2)-1)*exp(2*simquantsd$location1+simquantsd$scale^2)

## Compute the coeffcicent of variation of individual contribution to the DNA pool
simquantsd$CV <- sqrt(simquantsd$vareq)/simquantsd$mu


## Expecteation for 1/r (i.e. 1 over the total number of reads at a given position, provided that at least one read has been sequenced)
## See Eq 1 p1 in Supp. Inf. of Gautier et al 2013
simquantsd$lambdavar <- (simquantsd$lambda+simquantsd$sdlambda)/simquantsd$lambda^2

## Using the analytical approximation (see V_(step1-2) Supp. Inf. p4)
simquantsd$approx <- sqrt((simquantsd$prob * (1-simquantsd$prob)/simquantsd$N)*(1+((simquantsd$N-1) *(1-simquantsd$lambdavar)*(simquantsd$CV)^2)/simquantsd$N+((simquantsd$N-1)*simquantsd$lambdavar)))



## Allele frequencies ad factors or a as numeric variables
simquantsd$probnum <- simquantsd$prob
simquantsd$prob <- as.factor(simquantsd$prob)
simquantsd$frequency <- factor(simquantsd$prob,levels=sort(levels(simquantsd$prob),decreasing = T))
simquantsd$color <- as.factor(simquantsd$prob)

#head(simquantsd)
head(simquantsd[simquantsd$N==Ntot[1],])

return(simquantsd)

}

4.5 Figure S1-2 (lambda=20, 50)

#pdf(file="Read coverage variance lognormal error lambda=20,50.pdf")

sdlambda = c(1)
lambda = c(20,50)

simquantsd <- simvarstep1to3(sdlambda = c(1), lambda = c(20,50))
## [1] 20.00 20.00  1.00  0.01
## [1] 20.0 20.0  1.0  0.1
## [1] 20.00 20.00  1.00  0.25
## [1] 20.0 20.0  1.0  0.5
## [1] 20.00 50.00  1.00  0.01
## [1] 20.0 50.0  1.0  0.1
## [1] 20.00 50.00  1.00  0.25
## [1] 20.0 50.0  1.0  0.5
## [1] 2e+02 2e+01 1e+00 1e-02
## [1] 200.0  20.0   1.0   0.1
## [1] 200.00  20.00   1.00   0.25
## [1] 200.0  20.0   1.0   0.5
## [1] 2e+02 5e+01 1e+00 1e-02
## [1] 200.0  50.0   1.0   0.1
## [1] 200.00  50.00   1.00   0.25
## [1] 200.0  50.0   1.0   0.5
## Mean for lambda=20
gg20mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean <- gg20mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
gg20mean

gg100mean <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean <-gg100mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",1,sep="")))
gg100mean

grid.arrange(gg20mean, gg100mean, ncol = 2)

## Mean for lambda=50
gg20mean50 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean50 <- gg20mean50 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",50,", s = ",1,sep="")))
gg20mean50

gg100mean50 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean50 <-gg100mean50 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",50,", s = ",1,sep="")))
gg100mean50

grid.arrange(gg20mean50, gg100mean50, ncol = 2)

grid.arrange(gg20mean, gg100mean,gg20mean50, gg100mean50, ncol = 2)

dev.copy(pdf,file=paste(getwd(),"Figure S1 Estep1-3 Lambda equals 20 or 50.pdf",sep="/"));dev.off()
## pdf 
##   3
## quartz_off_screen 
##                 2
## Variance for lambda=20
gg20 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20 <- gg20 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20

gg100 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100 <-gg100 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",1,sep="")))
gg100

grid.arrange(gg20, gg100, ncol = 2)

## Variance for lambda=50
gg2050 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg2050 <- gg2050 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[2]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 20, ",lambda," = 50, s = 1",sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg2050

gg10050 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg10050 <-gg10050 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 200, ",lambda," = 50, s = 1",sep="")))
gg10050

grid.arrange(gg2050, gg10050, ncol = 2)

grid.arrange(gg20,gg100,gg2050, gg10050, ncol = 2, nrow = 2)

dev.copy(pdf,file=paste(getwd(),"Figure S2 Estep1-3 Lambda equals 20 or 50.pdf",sep="/"));dev.off()
## pdf 
##   3
## quartz_off_screen 
##                 2
#dev.off()

4.6 Figure S3-4 (multiplicative overdispersion=1 or 3)

lambda = c(20)
sdlambda = c(1,3)
simquantsd <- simvarstep1to3(lambda = c(20),sdlambda = c(1,3))
## [1] 20.00 20.00  1.00  0.01
## [1] 20.0 20.0  1.0  0.1
## [1] 20.00 20.00  1.00  0.25
## [1] 20.0 20.0  1.0  0.5
## [1] 20.00 20.00  3.00  0.01
## [1] 20.0 20.0  3.0  0.1
## [1] 20.00 20.00  3.00  0.25
## [1] 20.0 20.0  3.0  0.5
## [1] 2e+02 2e+01 1e+00 1e-02
## [1] 200.0  20.0   1.0   0.1
## [1] 200.00  20.00   1.00   0.25
## [1] 200.0  20.0   1.0   0.5
## [1] 2e+02 2e+01 3e+00 1e-02
## [1] 200.0  20.0   3.0   0.1
## [1] 200.00  20.00   3.00   0.25
## [1] 200.0  20.0   3.0   0.5
#pdf(file="Read coverage variance lognormal error sdlambda=1,3.pdf")

## Mean sdlambda=1
gg20mean <- ggplot() + geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean <- gg20mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
gg20mean

gg100mean <- ggplot() + geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean <-gg100mean +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s =",1,sep="")))
gg100mean

grid.arrange(gg20mean, gg100mean, ncol = 2)

## Mean sdlambda=3
gg20mean3 <- ggplot() + geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg20mean3 <- gg20mean3 + theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",3,sep="")))
gg20mean3

gg100mean3 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = mean, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(mu[hat(pi)] ),limits = c(0, 0.6))
gg100mean3 <-gg100mean3 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=probnum, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",3,sep="")))
gg100mean3

grid.arrange(gg20mean3, gg100mean3, ncol = 2)

grid.arrange(gg20mean, gg100mean,gg20mean3, gg100mean3, ncol = 2, nrow = 2)

dev.copy(pdf,file=paste(getwd(),"Figure S3 Estep1-3 Multiplicative overdispersion equals 1 or 3.pdf",sep="/"));dev.off()
## pdf 
##   3
## quartz_off_screen 
##                 2
## Variance sdlambda=1
gg20 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20 <- gg20 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 20,", ", lambda," = ",20,", s = ",1,sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20

gg100 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100 <-gg100 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[1],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = ", 200,", ", lambda," = ",20,", s = ",1,sep="")))
gg100

grid.arrange(gg20, gg100, ncol = 2)

## Variance sdlambda=3
gg20sdlambda3 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg20sdlambda3 <- gg20sdlambda3 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[1]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 20, ",lambda," = 20, s = 3",sep="")))
#gg + guides(linetype=guide_legend(title="Haplotype\nfrequency"))+geom_line(data=simquantsd, aes(x=CV, y=approx, group = prob,col=frequency))+ scale_colour_grey(end=0.9)
gg20sdlambda3

gg100sdlambda3 <- ggplot() +geom_point(data = simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x = CV, y = sd, group = prob,col=frequency)) + scale_x_continuous("Coefficient of variation") + scale_y_continuous(expression(sigma[hat(pi)] ),limits = c(0, 0.2))
gg100sdlambda3 <-gg100sdlambda3 +theme_bw()+geom_line(data=simquantsd[simquantsd$N==Ntot[2]&simquantsd$lambda==lambda[1]&simquantsd$sdlambda==sdlambda[2],], aes(x=CV, y=approx, group = prob,col=frequency))+ ggtitle(expression(paste("N = 200, ",lambda," = 20, s = 3",sep="")))
gg100sdlambda3

grid.arrange(gg20sdlambda3,gg100sdlambda3,nrow = 2)

grid.arrange(gg20, gg100,gg20sdlambda3,gg100sdlambda3,nrow = 2, ncol = 2)

#dev.off()
dev.copy(pdf,file=paste(getwd(),"Figure S4 Vstep1-3 Multiplicative overdispersion equals 1 or 3.pdf",sep="/"));dev.off()
## pdf 
##   3
## quartz_off_screen 
##                 2