# ------------------------------ # Sampling error correction idea # Theory # definition: var(x) = E(x^2) - E(x)^2 # x and y are sister species measured without error, and Var(x) = Var(y) # All means are 0, ie E(x), E(y) = 0 E( (x-y)^2 ) = E( x^2 -2xy + y^2) = E(x^2) -E(2xy) + E(y^2) = Var(x) + Var(y) = 2 Var(x) # So E( (x-y)^2 )/2 = Var(x) # x + e is x measured with sampling error, y + f is y measured with sampling error # All means are 0, ie E(x), E(y), E(e), E(f) = 0 # e and f are sampling error E( ((x + e) - (y + f))^2 ) = E( (x + e - y - f)^2 ) (x + e - y - f)(x + e - y - f) = E(x^2 + 2xe - 2xy - 2xf + e^2 - 2ef + y^2 + 2yf + f^2) = E(x^2) + E(2xe) - E(2xy) - E(2xf) + E(e^2) - E(2ef) + E(y^2) + E(2yf) + E(f^2) = E(x^2) + E(e^2) + E(y^2) + E(f^2) = Var(x) + Var(y) + Var(e) + Var(f) = 2Var(x) + Var(e) + Var(f) # So E( ((x + e) - (y + f))^2 )/2 = Var(x) + (Var(e) + Var(f))/2 [ E( ((x + e) - (y + f))^2 ) - Var(e) - Var(f) ]/2 = Var(x) # You want to estimate (u1-u2)^2 between a pair of species, where u1 and u2 are true species means in some trait. # Use log scale. # You decide to estimate using sample means (xbar1-xbar2)^2 # On average, (xbar1-xbar2)^2 will be larger than (u1-u2)^2 because of sampling error in estimating the means. # In other words, (xbar1-xbar2)^2 is biased. # To calculate the bias, let v be the estimated variance in the trait within both species. # Estimate it using the pooled sample variance, or the Error Mean Square from an anova comparing means of all species. # n is the sample size for each species. # The variance of the sample mean is v/n. (This is the square of the familiar standard error.) # The bias in the estimate of (u1-u2)^2 is 2*v/n. # On average, (xbar1-xbar2)^2 - 2*v/n will equal (u1-u2)^2 # NB: if sample sizes unequal then use (xbar1-xbar2)^2 - v/n1 - v/n2 # A reasonable measure of the difference between species is to take the square root of the bias-corrected difference: # sqrt( (xbar1-xbar2)^2 - 2*v/n ), which is on the original (log) scale. # Proof by simulation in R. I'll assume that the variance within species, v, is known # since it is based on a large sample of species. u1<-10 u2<-20 print(u1-u2)^2 # Difference between true means, squared v = 25 sd = sqrt(v) n=4 dsquared = vector(length=10000) # initialize vector for results for(i in 1:10000){ xbar1 = mean(rnorm(n, u1, sd)) xbar2 = mean(rnorm(n, u2, sd)) dsquared[i] <- (xbar1-xbar2)^2 } print(mean(dsquared)) # will be larger than 100=(u1-u2)^2 print(mean(dsquared) - 2*v/n) # will be equal to (u1-u2)^2 # ------------------------- # Simulating the Brownian motion process # to see if I can recover the correct slope parameter # Var(y) = (b^2)t t <- seq(0.001,10,0.001) # range of times length(t) # [1] 10000 b2 <- 0.5 # True value of b^2 y1 <- rnorm(10000, sd=sqrt( (b2)*t )) # rnorm with variance equal to (b^2)t y2 <- rnorm(10000, sd=sqrt( (b2)*t )) # ditto plot((y1-y2)^2 ~ t, pch = ".") z <- lm( I((y1-y2)^2) ~ t - 1) # unweighted regression - estimate is off by factor of 2 lines(predict(z) ~ t) coef(z) # t # 1.000189 # Divide response variable by 2, ie model ((y1 - y2)^2 )/2 z <- lm( I((1/2)*(y1-y2)^2) ~ t - 1) # unweighted regression coef(z) # t # 0.5000945 z <- lm( I((1/2)*(y1-y2)^2) ~ t - 1, weights = 1/t) # Variance of residuals assumed proportional to time coef(z) # t # 0.5000021 # ------------------------- # Simulating the O-U process # to see if I can recover the correct parameters # (b^2)/2a (1 - exp(-2at)) t <- seq(0.001,10,0.001) # range of times length(t) # [1] 10000 b2 <- 0.5 # True value of b^2 a <- 1 # true value of alpha y1 <- rnorm(10000, sd=sqrt( ((b2)/(2*a)) * (1-exp(-2*a*t)) )) # rnorm with variance equal to (b^2)t y2 <- rnorm(10000, sd=sqrt( ((b2)/(2*a)) * (1-exp(-2*a*t)) )) # ditto plot((y1-y2)^2 ~ t, pch = ".") z <- nls( I((1/2)*(y1-y2)^2) ~ ((b2)/(2*a)) * (1-exp(-2*a*t)), start = list(a=2, b2=1)) plot( I((1/2)*(y1-y2)^2) ~ t, pch = ".") lines(predict(z) ~ t) coef(z) # a b2 # 1.031089 0.499602 # Plotting curves for models to be used to analyze the bird sister species data # ------------------ # Brownian motion # Var(y) = (b^2)t # squared difference (xbar1-xbar2)^2 is proportional to t # (y1 - y2)/sqrt(2) ~ b sqrt(t) # or xbar1-xbar2 # t <- seq(0,10,0.01) b2 <- 0.5 y2 <- b2*t plot(y2~ t, type = "l") plot(sqrt(y2) ~ t, type = "l") # Ornstein Uhlenbeck # Formula from: # Cooper, N., G. H. Thomas, C. Venditti, A. Meade, and R. P. Freckleton. 2016. # A cautionary note on the use of Ornstein Uhlenbeck models in macroevolutionary studies. # Biological Journal of the Linnean Society 118: 64-77. # Var(y) = (b^2)/2a (1 - exp(-2at)) t <- seq(0,10,0.01) a <- 0.5 b2 <- 0.5 y2 <- ((b2^2)/(2*a)) * (1 - exp(-2*a*t)) plot(y2~ t, type = "l") plot(sqrt(y2) ~ t, type = "l")