library(tidyverse)
library(magrittr)
library(readxl)
library(openxlsx)
library(lubridate)
library(cowplot)
library(lsmeans)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4
Define functions for histogram and QQ plot:
norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
hist.x <- sort(na.omit(data.vec))
hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
labs(x = xlab, y = "Frequency")
}
draw.qq <- function(data.vec, xlab){ # function to draw QQ-plot
y <- sort(na.omit(data.vec))
probs <- (1:length(y))/(length(y) + 1)
norm.quantiles = qnorm(probs, mean(y), sd(y))
df.qq <- data.frame(y = y, nq = norm.quantiles)
p <- ggplot(df.qq, aes(x = nq, y = y)) + geom_point() +
geom_abline(slope = 1, intercept = 0, color='red') +
labs(x = "Normal Quantiles", y = xlab) +
geom_abline(slope = 1, intercept = 0, color='red')
}
Preparation of Wenjie’s Experiment_4 flowering data.
# Read in indiv plant data from excel.
in_name <- "wenjie_fourth-2020-05-18b.xlsx"
infile <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name)
df <- read_excel(infile, sheet = 1)
dim(df)
## [1] 2304 8
df2 <- df %>% rename(RefPop = GenoNum,
Experiment = ExptNum) %>%
mutate(RackNum = as.factor(RackNum),
RepNum = as.factor(RepNum)) %>%
select(Experiment, FlrAge, PltNum, RefPop, RepNum, RackNum, Alive2)
Prepare data:
# Alive2 == 1 means alive after cold
# Alive2 == 0 means dead at end of cold; FlrAge is NA
df2$Alive2 <- ifelse(df2$Alive2 == 2, 1, df2$Alive2) # Alive at time 2
(NumPlanted <- nrow(df2))
## [1] 2304
df2 <- filter(df2, Alive2 > 0)
(NumSurvived <- sum(df2$Alive2))
## [1] 1509
df3 <- filter(df2, FlrAge > 0) %>% drop_na() # Exclude if FlrAge is missing
df3 <- filter(df3, FlrAge > 0) # Delete one very early flowering outlier ***
(NumFlowered <- nrow(df3))
## [1] 1303
glimpse(df3)
## Rows: 1,303
## Columns: 7
## $ Experiment <chr> "W4", "W4", "W4", "W4", "W4", "W4", "W4", "W4", "W4", "W4"…
## $ FlrAge <dbl> 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15…
## $ PltNum <dbl> 866, 1634, 338, 1106, 770, 1510, 300, 684, 1068, 999, 432,…
## $ RefPop <chr> "RP059", "RP059", "RP452", "RP452", "RP455", "RP621", "RP6…
## $ RepNum <fct> 3, 5, 1, 3, 3, 4, 1, 2, 3, 3, 2, 1, 4, 6, 5, 3, 2, 4, 5, 1…
## $ RackNum <fct> 19, 35, 8, 24, 17, 32, 7, 15, 23, 21, 9, 6, 27, 43, 36, 21…
## $ Alive2 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
(maxFlrAge <- max(df3$FlrAge))
## [1] 72
(num_genos <- n_distinct(df3$RefPop, na.rm = TRUE)) # Number of flowering genotypes)
## [1] 356
Now, histogram and QQ-plot with original and log-transformed Flower_Age:
p1 <- norm.hist(df3$FlrAge, "Flower_Age")
p2 <- draw.qq(df3$FlrAge, "Flower_Age")
plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The original flowering distribution deviates from normality
df3 <- df3 %>% mutate(LogFlrAge = log10(FlrAge))
names(df3)
## [1] "Experiment" "FlrAge" "PltNum" "RefPop" "RepNum"
## [6] "RackNum" "Alive2" "LogFlrAge"
p1 <- norm.hist(df3$LogFlrAge, "Log10FlrAge")
p2 <- draw.qq(df3$LogFlrAge, "Log10FlrAge")
plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The log transformed flowering distribution has improved normality. For consistency among experiments, we will analyze this.
Mean flowering time in Wenjie’s fourth experiment (original scale):
(FlrInfo <- select(df3, FlrAge) %>%
summarize_all(list(mean = mean, sd = sd), na.rm = TRUE))
## # A tibble: 1 x 2
## mean sd
## <dbl> <dbl>
## 1 25.5 7.97
Summary: The original number of plants was 2304. Of these, 1509 survived until the end of cold treatment. The number of flowering plants was 1303. Surviving plants that did not flower total 206, or 13.6514%. (This reflects campus-wide lockdown due to COVID19.) Latest flowering ocurred at 72 days after cold treatment. The number of genotypes that flowered is 356. Average flowering age is 25.4904 +/- 0.2208.
Compute random effects ANOVA:
# See review on formulas:
# https://conjugateprior.org/2013/01/formulae-in-r-anova/
mod <- lmer(LogFlrAge ~ (1 | RefPop) + (1 | RackNum), data=df3)
summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: LogFlrAge ~ (1 | RefPop) + (1 | RackNum)
## Data: df3
##
## REML criterion at convergence: -2528
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.116 -0.497 -0.051 0.424 5.443
##
## Random effects:
## Groups Name Variance Std.Dev.
## RefPop (Intercept) 0.00909 0.0953
## RackNum (Intercept) 0.00037 0.0192
## Residual 0.00465 0.0682
## Number of obs: 1303, groups: RefPop, 356; RackNum, 48
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.39e+00 6.14e-03 2.48e+02 226 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RackNum nested in RepNum, but has miniscule effects, so didn't use RepNum
Get BLUPs
blup <- ranef(mod) # get BLUPs
RP.blup <- as.vector(blup$RefPop[, 1]) # vector of BLUPs for each genotype
# get RefPop means
df.byRefPop <- df3 %>% group_by(RefPop)
RP.means <- summarize(df.byRefPop, RPmean = mean(LogFlrAge))
RP_IDs <- RP.means$RefPop # vector of RefPop IDs
RP.means <- as.vector(RP.means$RPmean) #vector of genotype means
df.mn.blup <- tibble(ID = RP_IDs, RP_meanW4 = RP.means, RP_blupW4 = RP.blup)
dim(df.mn.blup)
## [1] 356 3
cor(RP.means, RP.blup) # Correlation between BLUPs and RefPop means
## [1] 0.9931
Conclusion: BLUP estimates are highly correlated with genotype means.
# histogram of BLUPs:
norm.hist(df.mn.blup$RP_blupW4, "LogFlrAge BLUPs")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Output BLUP data frame
file_name <- "Wenjie_fourth_LogFlr_BLUPS.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_name)
write_tsv(df.mn.blup, out_name)
Variance components, Coefficient of Determination, & Heritability
Residual Var Comp is taken from summary()
printout. Change manually if the data or model change.
# Residual variance component for this model only
# Change this value if the model or data are changed
vc.Resid <- 0.00465 # This is model specific! ***
Next, compute R-squared and Heritability
vcs <- summary(mod)$varcor
(vc.RefPop <- vcs$RefPop[1,1])
## [1] 0.009091
(vc.RackNum <- vcs$RackNum[1,1])
## [1] 0.0003701
(vc.Explained <- sum(vc.RefPop + vc.RackNum)) # total of variance components
## [1] 0.009461
(vc.Tot <- (vc.Explained + vc.Resid))
## [1] 0.01411
(R2 <- vc.Explained / vc.Tot) # R-squared
## [1] 0.6705
(H2 <- vc.RefPop / vc.Tot) # heritability
## [1] 0.6442
Conclusions:
The model explains 67.0467% of variation in log-transformed age at first flowering. Heritability of LogFlrAge = 64.424%.