library(tidyverse)
library(magrittr)
library(readxl)
library(openxlsx)
library(lubridate)
library(cowplot)
library(lsmeans)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4

Define functions for histogram and QQ plot:

norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
  hist.x <- sort(na.omit(data.vec))
  hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
  hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
  df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
  ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
    geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
    labs(x = xlab, y = "Frequency")
}

draw.qq <- function(data.vec, xlab){ # function to draw QQ-plot
  y <- sort(na.omit(data.vec))
  probs <- (1:length(y))/(length(y) + 1)
  norm.quantiles = qnorm(probs, mean(y), sd(y))
  df.qq <- data.frame(y = y, nq = norm.quantiles)
  p <- ggplot(df.qq, aes(x = nq, y = y)) + geom_point() + 
    geom_abline(slope = 1, intercept = 0, color='red') +
    labs(x = "Normal Quantiles", y = xlab) +
    geom_abline(slope = 1, intercept = 0, color='red')
}

Preparation of Emily Chan’s flowering data. Data originally from GWAS_induced_data_20170619.csv. A flowering date typo has been corrected in the following Excel file. For that individual, the year was 2018, not 2017. Now corrected in GWAS_induced_data_20170619.TMO.03.xlsx

# Read in indiv plant data from excel
in_name <- "GWAS_induced_data_20170619.TMO.03.xlsx"
infile <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name)
df <- read_excel(infile, sheet = 1)
names(df)
##  [1] "Row"           "Set"           "ID"            "RefPop"       
##  [5] "Coldroom"      "Treatment"     "Replicate"     "Block"        
##  [9] "Position"      "Pre_Bench"     "Post_Bench"    "Pre_Date"     
## [13] "Pre_Obs"       "Alive1"        "Pre_Cold_ROS"  "Out_Cold_Date"
## [17] "Treatment_2"   "Post_Date"     "Post_Obs"      "Alive2"       
## [21] "Post_Cold_ROS" "Longest_Leaf"  "Flower_date"   "Height"       
## [25] "Inflr_Num"     "Mini_ROS"      "Date_Image"    "Notes"
NumPlanted <- nrow(df)

Prepare data:

df2 <- select(df, RefPop, Treatment_2, Set, Replicate, Block, 
              Alive2, Out_Cold_Date, Flower_date, Height)

df2$Alive2 <- ifelse(df2$Alive2 == 2, 1, df2$Alive2) # Alive at time 2
df2 <- filter(df2, Alive2 >= 0)
NumSurvived <- sum(df2$Alive2)

df2 <- filter(df2, Flower_date >= 0) # Exclude if Flower_Date is missing
NumFlowered <- nrow(df2)

df3 <- df2 %>% mutate(Out_Cold_Date = ymd(Out_Cold_Date),
                      Flower_Date   = ymd(Flower_date),
                      Flower_Age    = as.numeric(difftime(Flower_Date, Out_Cold_Date,
                                                 units = "days"))) %>%
       select(-c(Flower_date, Flower_Date, Out_Cold_Date))

(num_genos <- n_distinct(df3$RefPop, na.rm = TRUE)) # Number of flowering genotypes)
## [1] 380
(maxFlrAge <- max(df3$Flower_Age))
## [1] 39

Next, examine relationships of these variables.

# Figure out relationships among experimental variables:
# Not executed because has long output
df3 %>% group_by(Treatment_2) %>% summarize(N = n()) # 2 treatments

df3 %>% group_by(Set) %>% summarize(N = n()) # 5 sets

df3 %>% group_by(Replicate) %>% summarize(N = n()) # 5 reps

df.blk <- df3 %>% group_by(Block) %>% summarize(N = n()) 
nrow(df.blk) # 100 blocks

mytable <- table(df3$Set, df3$Replicate) 
ftable(mytable) # Set and Replicate are cross-classified

mytable <- table(df3$Set, df3$Treatment_2) 
ftable(mytable) # Set and Treatment_2 are cross-classified

mytable <- table(df3$Replicate, df3$Treatment_2) 
ftable(mytable) # Replicate and Treatment_2 are cross-classified

mytable <- table(df3$Block, df3$Replicate) 
ftable(mytable) # Block is nested within Replicate

mytable <- table(df3$Block, df3$Set) 
ftable(mytable) # Block is nested within Set

mytable <- table(df3$Block, df3$Treatment_2) 
ftable(mytable) # Block and Treatment_2 are cross-classified

Therefore:

Set and Replicate are cross-classified

Block is nested within Replicate

Block is nested within Set

Grouping variables to be analyzed are: Set, Replicate, Block within Rep:Set

All of these are random.

Treatments were imposed after flowering, so there is no effect of treatment. It is ignored in this analysis.


Now, histogram and QQ-plot with original and log-transformed Flower_Age:

p1 <- norm.hist(df3$Flower_Age, "Flower_Age")
p2 <- draw.qq(df3$Flower_Age, "Flower_Age")

plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The original flowering distribution deviates from normality

df3 <- df3 %>% mutate(LogFlrAge = log10(Flower_Age))

p1 <- norm.hist(df3$LogFlrAge, "Log10FlrAge")
p2 <- draw.qq(df3$LogFlrAge, "Log10FlrAge")

plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The log transformed flowering distribution has improved normality


Save data file for GxE analysis

# names(df3)
# glimpse(df3)

df3.E1 <- df3 %>% mutate(Experiment = "E1") %>% 
                  rename(RackNum = Block)
# names(df3.E1)

df3.E1 <- df3.E1 %>% select(Experiment, RefPop, RackNum, LogFlrAge)
file_name <- "E1_plants_LogFlr.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_name)
write_tsv(df3.E1, out_name)

Mean flowering time in Emily’s Experiment 1 (original scale):

(FlrInfo <- select(df3, Flower_Age) %>% 
  summarize_all(list(mean = mean, sd = sd), na.rm = TRUE))
## # A tibble: 1 x 2
##    mean    sd
##   <dbl> <dbl>
## 1  20.0  3.33

Summary: The original number of plants was 4800. Of these, 3419 survived until the end of cold treatment. The number of flowering plants was 3222. Surviving plants that did not flower total 197, or 5.7619%. Latest flowering ocurred at 39 days after cold treatment. The number of genotypes that flowered is 380. Average flowering age is 20.0062 +/- 0.0587.


# This block can output data for other analyses
df4 <- df3 %>% select(LogFlrAge, RefPop, Set, Block, Replicate) %>%
               rename(ExptlRep = Replicate) %>% drop_na
glimpse(df4)
## Rows: 3,220
## Columns: 5
## $ LogFlrAge <dbl> 1.255, 1.230, 1.255, 1.230, 1.230, 1.230, 1.230, 1.255, 1.2…
## $ RefPop    <chr> "RP003", "RP003", "RP003", "RP003", "RP003", "RP003", "RP00…
## $ Set       <chr> "Set1", "Set1", "Set1", "Set1", "Set1", "Set1", "Set1", "Se…
## $ Block     <chr> "1A", "1B", "2C", "2D", "3A", "4B", "5C", "16B", "16D", "17…
## $ ExptlRep  <chr> "Rep1", "Rep1", "Rep2", "Rep2", "Rep3", "Rep4", "Rep5", "Re…

# file_name <- "Expt1_Emily_LogFlr_df4.txt"
# out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_name)
# write_tsv(df4, out_name)

Compute random effects ANOVA:

# See review on formulas: 
# https://conjugateprior.org/2013/01/formulae-in-r-anova/

# Replicate had miniscule effects, so not used in this analysis

mod <- lmer(LogFlrAge ~ (1 | RefPop) + (1 | Set) + (1 | Set:Block), data=df4)

blup <- ranef(mod) # get BLUPs
RP.blup <- as.vector(blup$RefPop[, 1]) # vector of BLUPs for each genotype

# get RefPop means
df.byRefPop <- df4 %>% group_by(RefPop)
RP.means <- summarize(df.byRefPop, RPmean = mean(LogFlrAge))
RP_IDs <- RP.means$RefPop              # vector of RefPop IDs
RP.means <- as.vector(RP.means$RPmean) #vector of genotype means

df.mn.blup <- tibble(ID = RP_IDs, RP_meanE1 = RP.means, RP_blupE1 = RP.blup)
dim(df.mn.blup)
## [1] 380   3

cor(RP.means, RP.blup) # Correlation between BLUPs and RefPop means
## [1] 0.9855

Conclusion: BLUP estimates are highly correlated with genotype means.


# histogram of BLUPs:
norm.hist(df.mn.blup$RP_blupE1, "LogFlrAge BLUPs")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


#Output BLUP data frame
file_name <- "Expt1_Emily_LogFlr_BLUPS.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_name)
write_tsv(df.mn.blup, out_name)

summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: LogFlrAge ~ (1 | RefPop) + (1 | Set) + (1 | Set:Block)
##    Data: df4
## 
## REML criterion at convergence: -11880
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -4.980 -0.487 -0.059  0.430  8.352 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  RefPop    (Intercept) 3.94e-03 0.06274 
##  Set:Block (Intercept) 9.35e-05 0.00967 
##  Set       (Intercept) 1.59e-04 0.01262 
##  Residual              9.24e-04 0.03040 
## Number of obs: 3220, groups:  RefPop, 380; Set:Block, 100; Set, 5
## 
## Fixed effects:
##             Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)   1.2977     0.0066 5.1040     197  4.2e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variance components, Coefficient of Determination, & Heritability:

Residual Var Comp is taken from summary() printout. Change manually if the data or model change.

# Residual variance component for this model and data only!!
# Change this value if the model or data are changed

vc.Resid <- 9.24e-04 # This is model specific! ***
vcs <- summary(mod)$varcor
# str(vcs)
(vc.RefPop <- vcs$RefPop[1,1])
## [1] 0.003936
(vc.Set    <- vcs$Set[1,1])
## [1] 0.0001592
(vc.Block  <- vcs$`Set:Block`[1,1])
## [1] 9.351e-05
(vc.Explained <- sum(vc.RefPop + vc.Set + vc.Block)) # total of variance components
## [1] 0.004188
(vc.Tot <-  (vc.Explained + vc.Resid))
## [1] 0.005112
(R2 <- vc.Explained / vc.Tot) # R-squared
## [1] 0.8193
(H2 <- vc.RefPop / vc.Tot) # heritability
## [1] 0.7698

Conclusions:

The model explains 81.9264% of variation in log-transformed age at first flowering. Heritability of LogFlrAge = 76.9839%.