Preparation of Baosheng’s flowering data Set 1.

library(tidyverse)
library(readxl)
library(openxlsx)
library(lubridate)
library(cowplot)
library(lsmeans)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4

Define functions for histogram and QQ plot:

norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
  hist.x <- sort(na.omit(data.vec))
  hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
  hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
  df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
  ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
    geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
    labs(x = xlab, y = "Frequency")
}

draw.qq <- function(data.vec, xlab){ # function to draw QQ-plot
  y <- sort(na.omit(data.vec))
  probs <- (1:length(y))/(length(y) + 1)
  norm.quantiles = qnorm(probs, mean(y), sd(y))
  df.qq <- data.frame(y = y, nq = norm.quantiles)
  p <- ggplot(df.qq, aes(x = nq, y = y)) + geom_point() + 
    geom_abline(slope = 1, intercept = 0, color='red') +
    labs(x = "Normal Quantiles", y = xlab) +
    geom_abline(slope = 1, intercept = 0, color='red')
}

Name input and output files, then input flowering data

# Two steps are required to analyze each new data set.
# First, edit the Set number as 1, 3, 4, 5, or 6
# Second, to compute R2 & H2, run once, then update the residual var comp: line 194

# Assign Set info here. 
# Do not use Set2, which has too many families with no flowering plants

this.set <- "5"                                # Change here
in_name  <- "BSW_flowered_Set5.txt"            # Change here
out_name <- "Expt_BSW_Set5_LogFlr_BLUPS.txt"   # Change here

infile <- file.path(path.expand("~"),"FlrGWAS","output", in_name)
df <- read_tsv(infile)
## Parsed with column specification:
## cols(
##   RefPop = col_character(),
##   Set = col_character(),
##   Rep = col_character(),
##   Pos = col_character(),
##   FlrDat = col_double()
## )
num.rows <- nrow(df)
names(df)
## [1] "RefPop" "Set"    "Rep"    "Pos"    "FlrDat"

# How many LTM & SAD12?
filter(df, RefPop == "LTM" | RefPop == "SAD12") %>% 
               group_by(RefPop) %>% summarize(N = n())
## # A tibble: 2 x 2
##   RefPop     N
##   <chr>  <int>
## 1 LTM       99
## 2 SAD12    104
# Rename LTM & SAD12 to RP IDs
df$RefPop <- ifelse(df$RefPop == "LTM", "RP072", df$RefPop)
df$RefPop <- ifelse(df$RefPop == "SAD12", "RP067", df$RefPop)

df$Flat <- str_c(df$Rep, df$Set) # Assign name of each Flat

num.tot <- nrow(df)
df3 <- df %>% drop_na() 
num.alive <- nrow(df3)

df4 <- df3 %>% mutate(FlrAge    = FlrDat, # Already adjusted for vernalization dates 
                      LogFlrAge = log10(FlrAge)) %>% select(-FlrDat)
num_genos <- n_distinct(df4$RefPop, na.rm = TRUE) # Number of flowering genotypes)

min.flrdat  <- min(df4$FlrAge)
max.flrdat  <- max(df4$FlrAge)
NumFlowered <- nrow(df4)

Flowering time is counted as the number of days after the end of cold vernalization.

The full model does not converge, so we test for effects of RefPop and Flat.


Now, histogram and QQ-plot with original and log-transformed Flower_Age:

p1 <- norm.hist(df4$FlrAge, "Flower_Age")
p2 <- draw.qq(df4$FlrAge, "Flower_Age")

plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The original flowering distribution deviates from normality

p1 <- norm.hist(df4$LogFlrAge, "Log10FlrAge")
p2 <- draw.qq(df4$LogFlrAge, "Log10FlrAge")

plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The log transformed flowering distribution has improved normality


Mean flowering time (original scale):

(FlrInfo <- select(df4, FlrAge) %>% 
  summarize_all(list(mean = mean, sd = sd), na.rm = TRUE))
## # A tibble: 1 x 2
##    mean    sd
##   <dbl> <dbl>
## 1  19.5  3.56

Summary: The original number of plants was 2343. The number of flowering plants was 2208. Latest flowering ocurred at 33 days after cold treatment. The number of genotypes that flowered is 432. Minimum flowering age is 8, and maximum flowering age equals 33. Average flowering age is 19.4887 +/- 0.0758. 432 genotypes flowered.


Compute random effects ANOVA:

# See review on formulas: 
# https://conjugateprior.org/2013/01/formulae-in-r-anova/

mod <- lmer(LogFlrAge ~ (1 | RefPop) + (1 | Flat), data=df4)

blup <- ranef(mod) # get BLUPs
RP.blup <- as.vector(blup$RefPop[, 1]) # vector of BLUPs for each genotype
 
# get RefPop means
df.byRefPop <- df4 %>% group_by(RefPop)
RP.means <- summarize(df.byRefPop, RPmean = mean(LogFlrAge))
RP_IDs <- RP.means$RefPop              # vector of RefPop IDs
RP.means <- as.vector(RP.means$RPmean) #vector of genotype means

df.mn.blup <- tibble(ID = RP_IDs, RP_mean = RP.means, RP_blup = RP.blup)
names(df.mn.blup) <- c("ID", str_c("RP_mean", this.set), str_c("RP_blup", this.set))
dim(df.mn.blup)
## [1] 432   3

cor(RP.means, RP.blup) # Correlation between BLUPs and RefPop means
## [1] 0.9901

Conclusion: BLUP estimates are highly correlated with genotype means.

# histogram of BLUPs
# norm.hist(df.mn.blup$RP_blupB1, "LogFlrAge BLUPs")

#Output BLUP data frame
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", out_name)
write_tsv(df.mn.blup, out_name)

summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: LogFlrAge ~ (1 | RefPop) + (1 | Flat)
##    Data: df4
## 
## REML criterion at convergence: -5977
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -7.320 -0.458 -0.036  0.474  4.665 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  RefPop   (Intercept) 0.004002 0.0633  
##  Flat     (Intercept) 0.000238 0.0154  
##  Residual             0.002513 0.0501  
## Number of obs: 2208, groups:  RefPop, 432; Flat, 25
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  1.28009    0.00449 64.91977     285   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variance components, Coefficient of Determination, & Heritability:

Residual Var Comp is taken from summary() printout. Change manually if the data or model change.

# Copy the value of the residual variance component into below
# Residual variance component for this model and data only!!
# Change this value if the model or data are changed

vc.Resid <- 0.002513 # This is model specific! ***        # Change here

Get variance components, R-squared, and heritability:

vcs <- summary(mod)$varcor
# str(vcs)
(vc.RefPop <- vcs$RefPop[1,1])
## [1] 0.004002
(vc.Flat    <- vcs$Flat[1,1])
## [1] 0.0002383
(vc.Explained <- sum(vc.RefPop + vc.Flat)) # total of variance components
## [1] 0.00424
(vc.Tot <-  (vc.Explained + vc.Resid))
## [1] 0.006753
(R2 <- vc.Explained / vc.Tot) # R-squared
## [1] 0.6279
(H2 <- vc.RefPop / vc.Tot) # heritability
## [1] 0.5926

Conclusions:

The model explains 62.7881% of variation in log-transformed age at first flowering. Heritability of LogFlrAge = 59.2596%.