library(tidyverse)
library(magrittr)
library(readxl)
library(openxlsx)
library(lubridate)
library(cowplot)
library(lsmeans)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4

Define functions for histogram and QQ plot:

norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
  hist.x <- sort(na.omit(data.vec))
  hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
  hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
  df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
  ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
    geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
    labs(x = xlab, y = "Frequency")
}

draw.qq <- function(data.vec, xlab){ # function to draw QQ-plot
  y <- sort(na.omit(data.vec))
  probs <- (1:length(y))/(length(y) + 1)
  norm.quantiles = qnorm(probs, mean(y), sd(y))
  df.qq <- data.frame(y = y, nq = norm.quantiles)
  p <- ggplot(df.qq, aes(x = nq, y = y)) + geom_point() + 
    geom_abline(slope = 1, intercept = 0, color='red') +
    labs(x = "Normal Quantiles", y = xlab) +
    geom_abline(slope = 1, intercept = 0, color='red')
}

Preparation of Wenjie’s Experiment_3 flowering data.

# Read in indiv plant data from excel
# Alive 2==1 means alive after cold; Alive 2==0 means dead after cold; 
# Alive 3==1 means alive and flowered; 
# Alive 3==0 means dead during wait for flowering; 
# Alive 3==NA/blank means they are dead after cold.
# Same categories are identified using the following:

in_name <- "wenjie_third-2020-05-18b.xlsx"
infile <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name)
df <- read_excel(infile, sheet = 1)
dim(df)
## [1] 2154    8
df2 <- df %>% rename(RefPop     = GenoNum,
                     Experiment = ExptNum) %>% 
              mutate(RackNum    = as.factor(RackNum),
                     RepNum     = as.factor(RepNum)) %>% 
              select(Experiment, FlrAge, PltNum, RefPop, RepNum, RackNum, Alive2)

# Correct typo in data file:
df2$RefPop <- ifelse(df2$RefPop == "P330", "RP330", df2$RefPop)
names(df2)
## [1] "Experiment" "FlrAge"     "PltNum"     "RefPop"     "RepNum"    
## [6] "RackNum"    "Alive2"
(NumPlanted <- nrow(df2))
## [1] 2154

Prepare data:

df2$Alive2 <- ifelse(df2$Alive2 == 2, 1, df2$Alive2) # Alive at time 2
df2 <- filter(df2, Alive2 >= 0)
(NumSurvived <- sum(df2$Alive2))
## [1] 1227

df3 <- filter(df2, FlrAge > 0) # Exclude if FlrAge is nonpositive
(NumFlowered <- nrow(df3))
## [1] 1203
glimpse(df3)
## Rows: 1,203
## Columns: 7
## $ Experiment <chr> "W3", "W3", "W3", "W3", "W3", "W3", "W3", "W3", "W3", "W3"…
## $ FlrAge     <dbl> 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14…
## $ PltNum     <dbl> 925, 1357, 61, 458, 975, 1719, 1424, 951, 832, 1690, 26, 1…
## $ RefPop     <chr> "RP278", "RP278", "RP278", "RP453", "RP507", "RP659", "RP0…
## $ RepNum     <fct> 3, 4, 1, 2, 3, 4, 4, 3, 2, 4, 1, 4, 2, 5, 4, 3, 5, 3, 4, 3…
## $ RackNum    <fct> 20, 29, 2, 10, 21, 36, 30, 20, 18, 36, 1, 28, 16, 43, 30, …
## $ Alive2     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
(maxFlrAge <- max(df3$FlrAge))
## [1] 95
(num_genos <- n_distinct(df3$RefPop, na.rm = TRUE)) # Number of flowering genotypes)
## [1] 329

Now, histogram and QQ-plot with original and log-transformed Flower_Age:

p1 <- norm.hist(df3$FlrAge, "Flower_Age")
p2 <- draw.qq(df3$FlrAge, "Flower_Age")

plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The original flowering distribution deviates from normality

df3 <- df3 %>% mutate(LogFlrAge = log10(FlrAge))
names(df3)
## [1] "Experiment" "FlrAge"     "PltNum"     "RefPop"     "RepNum"    
## [6] "RackNum"    "Alive2"     "LogFlrAge"

p1 <- norm.hist(df3$LogFlrAge, "Log10FlrAge")
p2 <- draw.qq(df3$LogFlrAge, "Log10FlrAge")

plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The log transformed flowering distribution has improved normality. For consistency among experiments, we will analyze this.


Save data file for GxE analysis

df3.W3 <- df3 %>% select(Experiment, RefPop, RackNum, LogFlrAge)
file_name <- "W3_plants_LogFlr.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_name)
write_tsv(df3.W3, out_name)

Mean flowering time in Wenjie’s third experiment (original scale):

(FlrInfo <- select(df3, FlrAge) %>% 
  summarize_all(list(mean = mean, sd = sd), na.rm = TRUE))
## # A tibble: 1 x 2
##    mean    sd
##   <dbl> <dbl>
## 1  26.6  11.8

Summary: The original number of plants was 2154. Of these, 1227 survived until the end of cold treatment. The number of flowering plants was 1203. Surviving plants that did not flower total 24, or 1.956%. Latest flowering ocurred at 95 days after cold treatment. The number of genotypes that flowered is 329. Average flowering age is 26.6309 +/- 0.3388.


Compute random effects ANOVA:

# See review on formulas: 
# https://conjugateprior.org/2013/01/formulae-in-r-anova/

mod <- lmer(LogFlrAge ~ (1 | RefPop) +  (1 | RackNum), data=df3)
summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: LogFlrAge ~ (1 | RefPop) + (1 | RackNum)
##    Data: df3
## 
## REML criterion at convergence: -1786
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -4.761 -0.367 -0.090  0.191  5.586 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  RefPop   (Intercept) 0.011816 0.1087  
##  RackNum  (Intercept) 0.000402 0.0201  
##  Residual             0.007800 0.0883  
## Number of obs: 1203, groups:  RefPop, 329; RackNum, 45
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept) 1.41e+00   7.25e-03 2.15e+02     194   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# RackNum nested in RepNum, but has miniscule effects, so didn't use RepNum 

Get BLUPs

blup <- ranef(mod) # get BLUPs
RP.blup <- as.vector(blup$RefPop[, 1]) # vector of BLUPs for each genotype

# get RefPop means
df.byRefPop <- df3 %>% group_by(RefPop)
RP.means <- summarize(df.byRefPop, RPmean = mean(LogFlrAge))
RP_IDs <- RP.means$RefPop              # vector of RefPop IDs
RP.means <- as.vector(RP.means$RPmean) #vector of genotype means

df.mn.blup <- tibble(ID = RP_IDs, RP_meanW3 = RP.means, RP_blupW3 = RP.blup)
dim(df.mn.blup)
## [1] 329   3

cor(RP.means, RP.blup) # Correlation between BLUPs and RefPop means
## [1] 0.9886

Conclusion: BLUP estimates are highly correlated with genotype means.


# histogram of BLUPs:
norm.hist(df.mn.blup$RP_blupW3, "LogFlrAge BLUPs")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


#Output BLUP data frame
file_name <- "Wenjie_third_LogFlr_BLUPS.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_name)
write_tsv(df.mn.blup, out_name)

Variance components, Coefficient of Determination, & Heritability

Residual Var Comp is taken from summary() printout. Change manually if the data or model change.

# Residual variance component for this model only
# Change this value if the model or data are changed

vc.Resid <- 0.007800 # This is model specific! ***

Next, compute R-squared and Heritability

vcs <- summary(mod)$varcor
(vc.RefPop <- vcs$RefPop[1,1])
## [1] 0.01182
(vc.RackNum  <- vcs$RackNum[1,1])
## [1] 0.0004022
(vc.Explained <- vc.RefPop + vc.RackNum) # total of variance components
## [1] 0.01222
(vc.Tot <-  (vc.Explained + vc.Resid))
## [1] 0.02002
(R2 <- vc.Explained / vc.Tot) # R-squared
## [1] 0.6104
(H2 <- vc.RefPop / vc.Tot) # heritability
## [1] 0.5903

Conclusions:

The model explains 61.0354% of variation in log-transformed age at first flowering. Heritability of LogFlrAge = 59.0262%.