library(readxl)
library(openxlsx)
library(car)
library(broom)
library(ggmap)
library(magrittr)
library(tidyverse)
library(cowplot)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4

Functions to check normality

norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
  hist.x <- sort(na.omit(data.vec))
  hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
  hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
  df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
  ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
    geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
    labs(x = xlab, y = "Frequency")
}

draw.qq <- function(data.vec, xlab){ # function to draw QQ-plot
  y <- sort(na.omit(data.vec))
  probs <- (1:length(y))/(length(y) + 1)
  norm.quantiles = qnorm(probs, mean(y), sd(y))
  df.qq <- data.frame(y = y, nq = norm.quantiles)
  p <- ggplot(df.qq, aes(x = nq, y = y)) + geom_point() + 
    geom_abline(slope = 1, intercept = 0, color='red') +
    labs(x = "Normal Quantiles", y = xlab) +
    geom_abline(slope = 1, intercept = 0, color='red')
}

Input and merge data files

Input qtl-SNPs

Stricta genotypes are imputed, so only retro has missing SNP genotypes. Derived / ancestral is unknown for qtl6.1 & qtl7.1. But all other QTLs have the most common allele as ancestral, so we hypothesize that is true for qtl6.1. ALT allele is most common for qtl7.1. We hypothesize that this may be ancestral.

in_name1 <- "Flowering_SNPs_transpose.xlsx"
infile1 <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name1)
df.SNPs <- read_excel(infile1, sheet = 1, na = "NA")
df.SNPs <- df.SNPs %>% rename(ID = Name)
names(df.SNPs)
##  [1] "ID"     "qtl1.1" "qtl2.1" "qtl3.1" "qtl3.2" "qtl4.1" "qtl6.1" "qtl6.2"
##  [9] "qtl7.1" "qtl7.2"

Input LogFlr BLUPs for experiment E1


in_name2 <- "Expt1_Emily_LogFlr_BLUPS.txt"
infile2 <- file.path(path.expand("~"),"FlrGWAS","output", in_name2)
df.E1 <- read_tsv(infile2)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanE1 = col_double(),
##   RP_blupE1 = col_double()
## )
df.E1 <- df.E1 %>% select(ID, RP_blupE1)
df.E1 <- df.E1 %>% rename(Log_FlrDatE1 = RP_blupE1)
names(df.E1)
## [1] "ID"           "Log_FlrDatE1"

Input Bioclim info:

file_in <- "ID_PopGroup_worldclim_PCA.txt"
in_name3 <- file.path(path.expand("~"),"FlrGWAS", "output", file_in)
df.bioclim <- read_tsv(in_name3)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ID = col_character(),
##   PopGroup = col_character()
## )
## See spec(...) for full column specifications.
df.bioclim <- df.bioclim %>% 
  select(ID, PopGroup, elev_m, lons, lats, bio1, bio2, bio3, bio4, bio5, bio6, 
         bio7, bio8, bio9, bio10, bio11, bio12, bio13, bio14, bio15, 
         bio16, bio17, bio18, bio19, PC1, PC2, PC3, PC4, PC5)
names(df.bioclim)
##  [1] "ID"       "PopGroup" "elev_m"   "lons"     "lats"     "bio1"    
##  [7] "bio2"     "bio3"     "bio4"     "bio5"     "bio6"     "bio7"    
## [13] "bio8"     "bio9"     "bio10"    "bio11"    "bio12"    "bio13"   
## [19] "bio14"    "bio15"    "bio16"    "bio17"    "bio18"    "bio19"   
## [25] "PC1"      "PC2"      "PC3"      "PC4"      "PC5"

We don’t use the following file, but here is the info, if needed:

in_name4 <- "Bstricta_RefPop_20_02_21.xlsx"
in_name4 <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name4)
df.RP <- read_excel(in_name4, sheet = 1, na = "NA")
df.RP <- df.RP %>% mutate(prefix = str_sub(ID, 1, 2))
df.RP <- df.RP %>% filter(prefix == "RP")
df.RP <- df.RP %>% select(ID, PopGroup, Latitude, Longitude, Elev_m)
names(df.RP)

Input LogFlr BLUPs for experiment W3. (Use later for GxE.)

in_name5 <- "Wenjie_third_LogFlr_BLUPS.txt"
(infile5 <- file.path(path.expand("~"),"FlrGWAS","output", in_name5))
## [1] "/Users/tmo1/FlrGWAS/output/Wenjie_third_LogFlr_BLUPS.txt"

df.W3 <- read_tsv(infile5)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanW3 = col_double(),
##   RP_blupW3 = col_double()
## )
df.W3 <- df.W3 %>% select(ID, RP_blupW3)
df.W3 <- df.W3 %>% rename(Log_FlrDatW3 = RP_blupW3)
names(df.W3)
## [1] "ID"           "Log_FlrDatW3"

Merge three dataframes

df.temp1 <- left_join(df.SNPs, df.E1)
## Joining, by = "ID"
names(df.temp1)
##  [1] "ID"           "qtl1.1"       "qtl2.1"       "qtl3.1"       "qtl3.2"      
##  [6] "qtl4.1"       "qtl6.1"       "qtl6.2"       "qtl7.1"       "qtl7.2"      
## [11] "Log_FlrDatE1"
df2 <- left_join(df.temp1, df.bioclim)
## Joining, by = "ID"
df2 <- mutate(df2, PopGroup = as.factor(PopGroup))
dim(df2); names(df2)
## [1] 518  39
##  [1] "ID"           "qtl1.1"       "qtl2.1"       "qtl3.1"       "qtl3.2"      
##  [6] "qtl4.1"       "qtl6.1"       "qtl6.2"       "qtl7.1"       "qtl7.2"      
## [11] "Log_FlrDatE1" "PopGroup"     "elev_m"       "lons"         "lats"        
## [16] "bio1"         "bio2"         "bio3"         "bio4"         "bio5"        
## [21] "bio6"         "bio7"         "bio8"         "bio9"         "bio10"       
## [26] "bio11"        "bio12"        "bio13"        "bio14"        "bio15"       
## [31] "bio16"        "bio17"        "bio18"        "bio19"        "PC1"         
## [36] "PC2"          "PC3"          "PC4"          "PC5"

Data transformations and summary statistics

Pool heterozygotes with ALT homozygotes, so that SNP is 0 (REF) or 1 (ALT).

# SNP genotypes: join hets and Alt homozygotes, scored as 1.
# 0 is ancestral allele
# 1 is homozygous or heterozygous for derived allele 
# Except: derived / ancestral is unknown for qtl6.1 & qtl7.1
names(df2)[2:10] # Correct homozygote / heterozygote for these columns
## [1] "qtl1.1" "qtl2.1" "qtl3.1" "qtl3.2" "qtl4.1" "qtl6.1" "qtl6.2" "qtl7.1"
## [9] "qtl7.2"
for(this.col in 2:10) {
  for(this.row in 2:nrow(df2)) {
    if(df2[this.row, this.col] == 2) {
      df2[this.row, this.col] <- 1
    }
  }
}
df2$qtl7.1 <- ifelse(df2$qtl7.1 == 0, 1, 0)

Get genotype frequencies for entire data set:

# Allele freqs for entire data set:
qtls <- df2 %>% 
  filter(ID != "Retro") %>% drop_na() %>% 
  select(qtl1.1, qtl2.1, qtl3.1, qtl3.2, 
         qtl4.1, qtl6.1, qtl6.2, qtl7.1, qtl7.2) %>% summarise_all(mean)
qtls
## # A tibble: 1 x 9
##   qtl1.1 qtl2.1 qtl3.1 qtl3.2 qtl4.1 qtl6.1 qtl6.2 qtl7.1 qtl7.2
##    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 0.0465  0.150 0.0432 0.0266 0.0299  0.103 0.0764  0.159 0.0465

Get genotype freqs by group:

by_grp <- group_by(df2, PopGroup)
## Warning: Factor `PopGroup` contains implicit NA, consider using
## `forcats::fct_explicit_na`

grp.freqs <- summarise(by_grp, 
               fr1.1 = mean(qtl1.1, na.rm = TRUE),
               fr2.1 = mean(qtl2.1, na.rm = TRUE),
               fr3.1 = mean(qtl3.1, na.rm = TRUE),
               fr3.2 = mean(qtl3.2, na.rm = TRUE),
               fr4.1 = mean(qtl4.1, na.rm = TRUE),
               fr6.1 = mean(qtl6.1, na.rm = TRUE),
               fr6.2 = mean(qtl6.2, na.rm = TRUE),
               fr7.1 = mean(qtl7.1, na.rm = TRUE),
               fr7.2 = mean(qtl7.2, na.rm = TRUE))
grp.freqs
## # A tibble: 6 x 10
##   PopGroup  fr1.1 fr2.1   fr3.1   fr3.2   fr4.1 fr6.1  fr6.2  fr7.1  fr7.2
##   <fct>     <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1 ADM      0      0     0       0       0       0     0      0      0     
## 2 COL      0.147  0.358 0.2     0.105   0.126   0.474 0.305  0.0105 0.221 
## 3 NOR      0.0154 0     0       0       0       0     0      0      0     
## 4 UTA      0.0305 0.168 0.00763 0.00763 0.00763 0     0.0458 0.0153 0.0305
## 5 WES      0.0351 0     0       0       0       0     0      0.930  0     
## 6 <NA>     0.0519 0.110 0.0779  0.0909  0.188   0.222 0.214  0.137  0.208

Conclusion: COL & UTA are the only groups with much polymorphism. Shown here:

grp.freqs[c(2,4),]
## # A tibble: 2 x 10
##   PopGroup  fr1.1 fr2.1   fr3.1   fr3.2   fr4.1 fr6.1  fr6.2  fr7.1  fr7.2
##   <fct>     <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1 COL      0.147  0.358 0.2     0.105   0.126   0.474 0.305  0.0105 0.221 
## 2 UTA      0.0305 0.168 0.00763 0.00763 0.00763 0     0.0458 0.0153 0.0305

Create dataframes containing only COL, only UTA, and COL+UTA

df.COL <- df2 %>% filter(PopGroup == "COL")
dim(df.COL)
## [1] 95 39

df.UTA <- df2 %>% filter(PopGroup == "UTA")
dim(df.UTA)
## [1] 131  39

df.COL.UTA <- df2 %>% filter(PopGroup == "COL" | PopGroup == "UTA") %>% 
  mutate(qtl6.2 = as.factor(qtl6.2),
         qtl7.2 = as.factor(qtl7.2))
dim(df.COL.UTA)
## [1] 226  39

Effects of QTLs on flowering date

ANCOVA using COL+UTA, with all QTLs

fit.Flr.QTL  <- aov(Log_FlrDatE1~PopGroup+qtl1.1+qtl2.1+qtl3.1+qtl3.2+qtl4.1+qtl6.1+qtl6.2+qtl7.1+qtl7.2,
                    df.COL.UTA)
Anova(fit.Flr.QTL, type="III")
## Anova Table (Type III tests)
## 
## Response: Log_FlrDatE1
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept) 0.0473   1   26.22 8.0e-07 ***
## PopGroup    0.0176   1    9.73 0.00212 ** 
## qtl1.1      0.0051   1    2.80 0.09612 .  
## qtl2.1      0.0148   1    8.18 0.00475 ** 
## qtl3.1      0.0004   1    0.23 0.62985    
## qtl3.2      0.0024   1    1.33 0.25072    
## qtl4.1      0.0112   1    6.22 0.01354 *  
## qtl6.1      0.0134   1    7.43 0.00708 ** 
## qtl6.2      0.0254   1   14.09 0.00024 ***
## qtl7.1      0.0031   1    1.72 0.19193    
## qtl7.2      0.0302   1   16.72 6.6e-05 ***
## Residuals   0.3142 174                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The following five QTLs are significant in this model: qtl2.1, qtl4.1, qtl6.1, qtl6.2, and qtl7.2.


Get R-squared for the complete model:

fit1 <- lm(Log_FlrDatE1~PopGroup+qtl1.1+qtl2.1+qtl3.1+qtl3.2+qtl4.1+qtl6.1+qtl6.2+qtl7.1+qtl7.2,
            df.COL.UTA)
summary(fit1)
## 
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup + qtl1.1 + qtl2.1 + qtl3.1 + 
##     qtl3.2 + qtl4.1 + qtl6.1 + qtl6.2 + qtl7.1 + qtl7.2, data = df.COL.UTA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11574 -0.02895 -0.00088  0.02330  0.14638 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.04183    0.00817   -5.12  8.0e-07 ***
## PopGroupUTA  0.02749    0.00881    3.12  0.00212 ** 
## qtl1.1      -0.02275    0.01360   -1.67  0.09612 .  
## qtl2.1       0.02295    0.00803    2.86  0.00475 ** 
## qtl3.1       0.00649    0.01345    0.48  0.62985    
## qtl3.2       0.02320    0.02013    1.15  0.25072    
## qtl4.1       0.04512    0.01809    2.49  0.01354 *  
## qtl6.1       0.03536    0.01297    2.73  0.00708 ** 
## qtl6.21      0.04338    0.01156    3.75  0.00024 ***
## qtl7.1       0.03357    0.02563    1.31  0.19193    
## qtl7.21      0.05513    0.01348    4.09  6.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0425 on 174 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.418,  Adjusted R-squared:  0.385 
## F-statistic: 12.5 on 10 and 174 DF,  p-value: 3.08e-16

# ---------------------------------------------

# Get R-squared with PopGroup only
fit2  <- lm(Log_FlrDatE1~PopGroup, df.COL.UTA)
summary(fit2)
## 
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup, data = df.COL.UTA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.14543 -0.03478 -0.00254  0.02723  0.16640 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.01080    0.00666    1.62    0.107  
## PopGroupUTA -0.01716    0.00827   -2.08    0.039 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0537 on 183 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.023,  Adjusted R-squared:  0.0177 
## F-statistic: 4.31 on 1 and 183 DF,  p-value: 0.0393

These nine QTLs explain 39.5% of genetic variation for flowering time in experiment E1. Other polygenic variation accounts for 60.5% of genetic variance for flowering time.

In experiment E1, heritability of LogFlrDat is 77.0% of total variaton, so these nine QTLs explain 30.415% of phenotypic variation.


In COL, what are the effect sizes of qtl6.2 & qtl7.2 on Log_FlrDatE1?

df.COL.nomiss <- df.COL %>% drop_na()
fit.2qtl  <- lm(Log_FlrDatE1 ~ qtl6.2 + qtl7.2, df.COL.nomiss)
coeffs <- fit.2qtl$coefficients
coeffs
## (Intercept)      qtl6.2      qtl7.2 
##    -0.02020     0.06834     0.07758
sd.fd <- sd(df.COL.nomiss$Log_FlrDatE1) 
sd.fd
## [1] 0.06855

effect.qtl6.2 <- coeffs[2] - coeffs[1]
effect.qtl6.2.sd <- effect.qtl6.2 / sd.fd 

effect.qtl7.2 <- coeffs[3] - coeffs[1]
effect.qtl7.2.sd <- effect.qtl7.2 / sd.fd 

In experiment E1, qtl6.2 has effect size of 1.2917 standard deviations for Log_FlrDatE1. The derived allele delays flowering.

In experiment E1, qtl7.2 has effect size of 1.4264 standard deviations for Log_FlrDatE1. The derived allele delays flowering.

Notice that we have accounded for LD between the two QTLs.


Geographic locations of alleles

Geographic location of qtl6.2 derived genotypes for COL+UTA:

df.QTLs <- select(df.COL.UTA, ID, PopGroup, Log_FlrDatE1, qtl1.1, qtl2.1, 
                   qtl3.1, qtl3.2, qtl4.1, qtl6.2, qtl7.2,  
                   lats, lons, elev_m, PC1, PC2, PC3, PC4, PC5, bio6) %>% 
            mutate(qtl6.2 = as.character(qtl6.2),
                   qtl7.2 = as.character(qtl7.2)) %>% 
            drop_na() %>% arrange(qtl6.2)

myMap <- get_stamenmap(bbox = c(left=-117.5, bottom=36.2, right=-101.5, top=49), 
                       zoom = 5, maptype = "toner-background")
## Source : http://tile.stamen.com/toner-background/5/5/10.png
## Source : http://tile.stamen.com/toner-background/5/6/10.png
## Source : http://tile.stamen.com/toner-background/5/5/11.png
## Source : http://tile.stamen.com/toner-background/5/6/11.png
## Source : http://tile.stamen.com/toner-background/5/5/12.png
## Source : http://tile.stamen.com/toner-background/5/6/12.png
ggmap(myMap) + theme_void() +
  geom_point(aes(x = lons, y = lats, colour = qtl6.2), data = df.QTLs)


Geographic location of qtl6.2 derived genotypes for COL+UTA:

df.QTLs <- arrange(df.QTLs, qtl6.2)

MyMap <- get_stamenmap(bbox = c(left=-117.5, bottom=36.2, right=-101.5, top=49), 
                       zoom = 5, maptype = "toner-background")
MyMap <- ggmap(myMap) + theme_void() +
  geom_point(aes(x = lons, y = lats, colour = qtl6.2), data = df.QTLs) +
  theme_classic() + scale_colour_manual(values = c("red", "blue", "green")) +
  scale_fill_manual(values = c("red", "blue", "green"))
ggsave("MyMap.png")
## Saving 7 x 5 in image

Geographic location of qtl7.2 derived genotypes for COL+UTA:

df.QTLs <- arrange(df.QTLs, qtl7.2)

MyMap <- get_stamenmap(bbox = c(left=-117.5, bottom=36.2, right=-101.5, top=49), 
                       zoom = 5, maptype = "toner-background")
MyMap <- ggmap(myMap) + theme_void() +
  geom_point(aes(x = lons, y = lats, colour = qtl7.2), data = df.QTLs) +
  theme_classic() + scale_colour_manual(values = c("red", "blue", "green")) +
  scale_fill_manual(values = c("red", "blue", "green"))
ggsave("MyMap.17.2.png")
## Saving 7 x 5 in image
MyMap


LD between qtl6.2 and qtl7.2

Notice that genotypes at 1tl6.2 and qtl7.2 seem to be correlated. Check using cross-tabs for genotypes at qtl6.2 and qtl7.2:

(locus.pair <- xtabs(~ qtl6.2 + qtl7.2, df.QTLs) %>% addmargins)
##       qtl7.2
## qtl6.2   0   1 Sum
##    0   153   9 162
##    1    18   5  23
##    Sum 171  14 185

Both loci are ancestral in 82.7027% of accessions. In 14.5946% of genotypes one locus is ancestral, and the other is derived. Both loci are derived in 2.7027% of accessions. For both QTLs, the derived allele delays flowering.

Are these two loci statistically independent?

chi2.QTLs <- chisq.test(df.QTLs$qtl6.2, df.QTLs$qtl7.2, correct = FALSE)
## Warning in chisq.test(df.QTLs$qtl6.2, df.QTLs$qtl7.2, correct = FALSE): Chi-
## squared approximation may be incorrect
chi2.results <- glance(chi2.QTLs) 
chi2.results
## # A tibble: 1 x 4
##   statistic p.value parameter method                    
##       <dbl>   <dbl>     <int> <chr>                     
## 1      7.54 0.00603         1 Pearson's Chi-squared test

Genotypes at these two loci are significantly associated.

Effects of QTLs on flowering


What is the effect of these two flowering QTLs on Log_FlrDatE1 in COL+UTA?

# ANCOVA with COL+UTA
fit.Flr.QTL  <- aov(Log_FlrDatE1~PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.Flr.QTL, type="III")
## Anova Table (Type III tests)
## 
## Response: Log_FlrDatE1
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept)  0.015   1    6.94  0.0092 ** 
## PopGroup     0.001   1    0.65  0.4197    
## qtl6.2       0.079   1   37.65 5.2e-09 ***
## qtl7.2       0.049   1   23.17 3.1e-06 ***
## Residuals    0.380 181                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Highly significant prediction of flowering time for both qtl6.2 & qtl7.2.


Get R-squared for the two QTLs:

# Get R-squared with PopGroup and 2 QTLs
fit1  <- lm(Log_FlrDatE1~PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
summary(fit1)
## 
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup + qtl6.2 + qtl7.2, data = df.COL.UTA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11720 -0.03427  0.00069  0.02043  0.15185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.01742    0.00661   -2.63   0.0092 ** 
## PopGroupUTA  0.00614    0.00759    0.81   0.4197    
## qtl6.21      0.06644    0.01083    6.14  5.2e-09 ***
## qtl7.21      0.06410    0.01332    4.81  3.1e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0458 on 181 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.295,  Adjusted R-squared:  0.284 
## F-statistic: 25.3 on 3 and 181 DF,  p-value: 1.05e-13

# ---------------------------------------------

# Get R-squared with PopGroup only
fit2  <- lm(Log_FlrDatE1~PopGroup, df.COL.UTA)
summary(fit2)
## 
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup, data = df.COL.UTA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.14543 -0.03478 -0.00254  0.02723  0.16640 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.01080    0.00666    1.62    0.107  
## PopGroupUTA -0.01716    0.00827   -2.08    0.039 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0537 on 183 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.023,  Adjusted R-squared:  0.0177 
## F-statistic: 4.31 on 1 and 183 DF,  p-value: 0.0393

The two QTLs explain 27.2% of genetic variation in flowering time.


Is abiotic environment at source populations associated with flowering time?

At the level of quantitative traits, what are the effects of the Bioclim PCs on Log_FlrDatE1 in COL+UTA? (The Bioclim PCs and loadings were created in Bioclim_Pop_Group.html.)

# ANCOVA with F-ratios
fit.Flr.PCs  <- aov(Log_FlrDatE1~PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, df.COL.UTA)
Anova(fit.Flr.PCs, type="III")
## Anova Table (Type III tests)
## 
## Response: Log_FlrDatE1
##             Sum Sq  Df F value Pr(>F)   
## (Intercept)  0.000   1    0.20 0.6591   
## PopGroup     0.002   1    0.80 0.3731   
## PC1          0.012   1    4.81 0.0295 * 
## PC2          0.000   1    0.11 0.7439   
## PC3          0.026   1   10.23 0.0016 **
## PC4          0.001   1    0.32 0.5734   
## PC5          0.008   1    3.19 0.0757 . 
## Residuals    0.453 178                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Get slopes for the Bioclim PCs on Log_FlrDatE1 in COL+UTA:

# Slope of PCs
slopes.Flr.PCs  <- lm(Log_FlrDatE1~PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, df.COL.UTA)
summary(slopes.Flr.PCs)
## 
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup + PC1 + PC2 + PC3 + PC4 + 
##     PC5, data = df.COL.UTA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.15257 -0.03252 -0.00404  0.02413  0.17180 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.004323   0.009781   -0.44   0.6591   
## PopGroupUTA  0.010426   0.011676    0.89   0.3731   
## PC1         -0.003481   0.001587   -2.19   0.0295 * 
## PC2          0.000807   0.002465    0.33   0.7439   
## PC3          0.008789   0.002748    3.20   0.0016 **
## PC4         -0.002660   0.004715   -0.56   0.5734   
## PC5          0.008928   0.004997    1.79   0.0757 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0504 on 178 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.161,  Adjusted R-squared:  0.133 
## F-statistic: 5.69 on 6 and 178 DF,  p-value: 1.96e-05

At the level of quantitative traits, PC1 & PC3 have significant effect on Log_FlrDatE1. And the entire model is highly significant: F-statistic: 5.69 on 6 and 178 DF, p-value: 1.96e-05.


Are QTL genotypes associated with PCs for ancestral abiotic environment?

For the two largest QTLs, are they associated with PC1 - PC5?

PC1:

# QTLs not significant. Not run.
fit.PC1.2QTLs  <- aov(PC1 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC1.2QTLs, type="III")
## Anova Table (Type III tests)
## 
## Response: PC1
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept)     92   1   13.25 0.00034 ***
## PopGroup       103   1   14.87 0.00015 ***
## qtl6.2           1   1    0.15 0.69920    
## qtl7.2           3   1    0.43 0.51328    
## Residuals     1538 222                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

qtl6.2 and qtl7.2 are not associated with PC1 ancestral climate.


PC2:

# QTL 7.2 is marginally significant. Not run.
fit.PC2.2QTLs  <- aov(PC2 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC2.2QTLs, type="III")
## Anova Table (Type III tests)
## 
## Response: PC2
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept)      0   1    0.05   0.816    
## PopGroup        49   1   17.80 3.6e-05 ***
## qtl6.2           8   1    3.11   0.079 .  
## qtl7.2          11   1    4.09   0.044 *  
## Residuals      606 222                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

qtl7.2 has marginal association with PC2 ancestral climate (P = 0.044)


PC3:

# QTLs not significant. Not run.
fit.PC3.2QTLs  <- aov(PC3 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC3.2QTLs, type="III")
## Anova Table (Type III tests)
## 
## Response: PC3
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept)     25   1   11.80 0.00071 ***
## PopGroup       168   1   78.14   3e-16 ***
## qtl6.2           8   1    3.56 0.06032 .  
## qtl7.2           2   1    1.10 0.29546    
## Residuals      477 222                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

qtl6.2 and qtl7.2 are not associated with PC3 ancestral climate.


PC4:

fit.PC4.2QTLs  <- lm(PC4 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC4.2QTLs, type="III")
## Anova Table (Type III tests)
## 
## Response: PC4
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept)   57.8   1   61.76 1.7e-13 ***
## PopGroup      24.1   1   25.77 8.1e-07 ***
## qtl6.2         8.6   1    9.22  0.0027 ** 
## qtl7.2         8.2   1    8.73  0.0035 ** 
## Residuals    207.8 222                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

qtl6.2 and qtl7.2 are significantly associated with PC4 ancestral climate.

Main components of PC4:

BIO8 = Mean Temperature of Wettest Quarter

BIO18 = Precipitation of Warmest Quarter


PC5:

# QTLs not significant. Not run.
fit.PC5.2QTLs  <- lm(PC5 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC5.2QTLs, type="III")
## Anova Table (Type III tests)
## 
## Response: PC5
##             Sum Sq  Df F value  Pr(>F)    
## (Intercept)   33.6   1   35.96 8.1e-09 ***
## PopGroup      31.1   1   33.34 2.6e-08 ***
## qtl6.2         0.1   1    0.13    0.72    
## qtl7.2         1.7   1    1.77    0.18    
## Residuals    207.2 222                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

qtl6.2 and qtl7.2 are not associated with PC5 ancestral climate.


Conclusion: Genotypes at qtl6.2 & qtl7.2 are only associated with PC4.

qtl6.2 is negatively associated with PC4

qtl7.2 is positively associated with PC4

PC4 is negatively associated with BIO8 (Mean Temperature of Wettest Quarter) & BIO18 (Precipitation of Warmest Quarter):


Are qtl6.2 + qtl7.2 associated for components of PC4 (bio8 or bio18)?

Bio8:

fit.bio8.2QTLs  <- lm(bio8 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
summary(fit.bio8.2QTLs)
## 
## Call:
## lm(formula = bio8 ~ PopGroup + qtl6.2 + qtl7.2, data = df.COL.UTA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -188.0  -77.5   16.6   58.1  133.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    68.61       9.78    7.02  2.7e-11 ***
## PopGroupUTA   -58.51      11.61   -5.04  9.6e-07 ***
## qtl6.21         5.39      15.93    0.34     0.74    
## qtl7.21       -17.91      18.01   -0.99     0.32    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.7 on 222 degrees of freedom
## Multiple R-squared:  0.117,  Adjusted R-squared:  0.105 
## F-statistic: 9.78 on 3 and 222 DF,  p-value: 4.34e-06

No significant association


Bio18

fit.bio18.2QTLs  <- lm(bio18 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
summary(fit.bio18.2QTLs)
## 
## Call:
## lm(formula = bio18 ~ PopGroup + qtl6.2 + qtl7.2, data = df.COL.UTA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62.03 -11.21  -0.49  13.70  39.97 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   157.03       2.28   68.76   <2e-16 ***
## PopGroupUTA   -36.82       2.71  -13.58   <2e-16 ***
## qtl6.21        11.73       3.72    3.15   0.0018 ** 
## qtl7.21        -2.73       4.21   -0.65   0.5167    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.4 on 222 degrees of freedom
## Multiple R-squared:  0.541,  Adjusted R-squared:  0.535 
## F-statistic: 87.3 on 3 and 222 DF,  p-value: <2e-16

qtl6.2 is significantly associated with bio18 = Precipitation of Warmest Quarter (P = 0.0018). Genotypes that flower early come from places with drier summers.

Check residuals from this model:

p1 <- norm.hist(fit.bio18.2QTLs$residuals, "Residuals")
p2 <- draw.qq(fit.bio18.2QTLs$residuals, "fit$residuals")
plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Residuals are well behaved. Genotypes at qtl6.2 are strongly associated with bio18 (Precipitation of Warmest Quarter):


# Create column with polarity as ancestral or derived
df.COL.UTA <- df.COL.UTA %>%  mutate(Genotype = as.character(qtl6.2))
df.COL.UTA$Genotype <- if_else(df.COL.UTA$Genotype == "0", "Ancestral", "Derived")

ggplot(df.COL.UTA, aes(x = bio18, fill = Genotype, color = Genotype)) +
  geom_histogram(alpha=0.6) +
  ggtitle( "Derived allele at qtl6.2 is associated with wetter summers") +
  labs(x = "Precipitation of Warmest Quarter (Bio18)", y = "Frequency") +
  theme_classic() + scale_colour_manual(values = c("red", "blue", "green")) +
  scale_fill_manual(values = c("red", "blue", "green"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


ggsave("Hist_qtl6.2.png")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Genotypes with the derived allele are found in environments with wetter summers.


Next, compare flowering time in environments E1 and W3:

Controlling for PopGroup in COL+UTA, how similar is flowering time in E1 and W3?.

df.CU.E1W3 <- df.COL.UTA %>% select(ID, PopGroup, Log_FlrDatE1, qtl6.2, qtl7.2)
df.CU.E1W3 <- left_join(df.CU.E1W3, df.W3) %>% drop_na()
## Joining, by = "ID"
dim(df.CU.E1W3)
## [1] 119   6
corr.E1W3 <- cor(df.CU.E1W3$Log_FlrDatE1, df.CU.E1W3$Log_FlrDatW3)  

FlrDat.1.3 <- ggplot(df.CU.E1W3) + 
  geom_jitter(aes(x = Log_FlrDatE1, y = Log_FlrDatW3, color = PopGroup), 
              size=1.75, alpha=0.8, width = 0.02, height = 0.02) + theme_classic() +
  scale_colour_manual(values = c("red", "blue", "green")) +
  scale_fill_manual(values = c("red", "blue", "green")) +
  labs(x = "Age at First Flower in E1 (Log10)", 
       y = "Age at First Flower in W3 (Log10)")

FlrDat.1.3

ggsave("FlrDat.E1.W3.png")
## Saving 7 x 5 in image

The overall correlation between genotype mean flowering times in E1 & W3 = 0.4208.


Test for QxE between E1 & W3

First, convert to tall format for ANCOVA:

df.COL.UTA.Flr <- pivot_longer(df.CU.E1W3,
                    -c(ID, PopGroup, qtl6.2, qtl7.2), # pivot cols to long format
                    names_to = "Envirn", 
                    values_to = "FlrDat")
dim(df.COL.UTA.Flr)
## [1] 238   6

Is there significant QxE between environments E1 and W3 for the two largest QTLs?

# Fixed effects ANCOVA with interaction, for COL+UTA accessions
# drop1 -- Gives Type III sums of squares. 
options(contrasts = c("contr.sum","contr.poly"))
model <- lm(FlrDat ~ PopGroup + Envirn + qtl6.2 + qtl7.2 + qtl6.2:Envirn + qtl7.2:Envirn,
            data=df.COL.UTA.Flr)
drop1(model, .~., test="F")
## Single term deletions
## 
## Model:
## FlrDat ~ PopGroup + Envirn + qtl6.2 + qtl7.2 + qtl6.2:Envirn + 
##     qtl7.2:Envirn
##               Df Sum of Sq  RSS   AIC F value  Pr(>F)    
## <none>                     1.41 -1207                    
## PopGroup       1    0.0029 1.41 -1208    0.48 0.48965    
## Envirn         1    0.0038 1.41 -1208    0.62 0.43026    
## qtl6.2         1    0.0568 1.47 -1199    9.30 0.00256 ** 
## qtl7.2         1    0.0769 1.49 -1196   12.60 0.00047 ***
## Envirn:qtl6.2  1    0.0189 1.43 -1205    3.10 0.07953 .  
## Envirn:qtl7.2  1    0.0003 1.41 -1209    0.05 0.81657    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

These large effect QTLs do not show significant QxE (P > 0.05).


# This is an alternative way to ask the same question. Results are very similar.
fit  <- aov(FlrDat~PopGroup + Envirn + qtl6.2 + qtl7.2 + qtl6.2:Envirn + qtl7.2:Envirn,
            df.COL.UTA.Flr)
Anova(fit, type="III")
(tidy_aov <- tidy(fit))

References:

Sasaki, E., et al. (2015). “”Missing" G x E Variation Controls Flowering Time in Arabidopsis thaliana." PLoS Genetics 11(10).