library(readxl)
library(openxlsx)
library(car)
library(broom)
library(ggmap)
library(magrittr)
library(tidyverse)
library(cowplot)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4
norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
hist.x <- sort(na.omit(data.vec))
hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
labs(x = xlab, y = "Frequency")
}
draw.qq <- function(data.vec, xlab){ # function to draw QQ-plot
y <- sort(na.omit(data.vec))
probs <- (1:length(y))/(length(y) + 1)
norm.quantiles = qnorm(probs, mean(y), sd(y))
df.qq <- data.frame(y = y, nq = norm.quantiles)
p <- ggplot(df.qq, aes(x = nq, y = y)) + geom_point() +
geom_abline(slope = 1, intercept = 0, color='red') +
labs(x = "Normal Quantiles", y = xlab) +
geom_abline(slope = 1, intercept = 0, color='red')
}
Input qtl-SNPs
Stricta genotypes are imputed, so only retro has missing SNP genotypes. Derived / ancestral is unknown for qtl6.1 & qtl7.1. But all other QTLs have the most common allele as ancestral, so we hypothesize that is true for qtl6.1. ALT allele is most common for qtl7.1. We hypothesize that this may be ancestral.
in_name1 <- "Flowering_SNPs_transpose.xlsx"
infile1 <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name1)
df.SNPs <- read_excel(infile1, sheet = 1, na = "NA")
df.SNPs <- df.SNPs %>% rename(ID = Name)
names(df.SNPs)
## [1] "ID" "qtl1.1" "qtl2.1" "qtl3.1" "qtl3.2" "qtl4.1" "qtl6.1" "qtl6.2"
## [9] "qtl7.1" "qtl7.2"
Input LogFlr BLUPs for experiment E1
in_name2 <- "Expt1_Emily_LogFlr_BLUPS.txt"
infile2 <- file.path(path.expand("~"),"FlrGWAS","output", in_name2)
df.E1 <- read_tsv(infile2)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanE1 = col_double(),
## RP_blupE1 = col_double()
## )
df.E1 <- df.E1 %>% select(ID, RP_blupE1)
df.E1 <- df.E1 %>% rename(Log_FlrDatE1 = RP_blupE1)
names(df.E1)
## [1] "ID" "Log_FlrDatE1"
Input Bioclim info:
file_in <- "ID_PopGroup_worldclim_PCA.txt"
in_name3 <- file.path(path.expand("~"),"FlrGWAS", "output", file_in)
df.bioclim <- read_tsv(in_name3)
## Parsed with column specification:
## cols(
## .default = col_double(),
## ID = col_character(),
## PopGroup = col_character()
## )
## See spec(...) for full column specifications.
df.bioclim <- df.bioclim %>%
select(ID, PopGroup, elev_m, lons, lats, bio1, bio2, bio3, bio4, bio5, bio6,
bio7, bio8, bio9, bio10, bio11, bio12, bio13, bio14, bio15,
bio16, bio17, bio18, bio19, PC1, PC2, PC3, PC4, PC5)
names(df.bioclim)
## [1] "ID" "PopGroup" "elev_m" "lons" "lats" "bio1"
## [7] "bio2" "bio3" "bio4" "bio5" "bio6" "bio7"
## [13] "bio8" "bio9" "bio10" "bio11" "bio12" "bio13"
## [19] "bio14" "bio15" "bio16" "bio17" "bio18" "bio19"
## [25] "PC1" "PC2" "PC3" "PC4" "PC5"
We don’t use the following file, but here is the info, if needed:
in_name4 <- "Bstricta_RefPop_20_02_21.xlsx"
in_name4 <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name4)
df.RP <- read_excel(in_name4, sheet = 1, na = "NA")
df.RP <- df.RP %>% mutate(prefix = str_sub(ID, 1, 2))
df.RP <- df.RP %>% filter(prefix == "RP")
df.RP <- df.RP %>% select(ID, PopGroup, Latitude, Longitude, Elev_m)
names(df.RP)
Input LogFlr BLUPs for experiment W3. (Use later for GxE.)
in_name5 <- "Wenjie_third_LogFlr_BLUPS.txt"
(infile5 <- file.path(path.expand("~"),"FlrGWAS","output", in_name5))
## [1] "/Users/tmo1/FlrGWAS/output/Wenjie_third_LogFlr_BLUPS.txt"
df.W3 <- read_tsv(infile5)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanW3 = col_double(),
## RP_blupW3 = col_double()
## )
df.W3 <- df.W3 %>% select(ID, RP_blupW3)
df.W3 <- df.W3 %>% rename(Log_FlrDatW3 = RP_blupW3)
names(df.W3)
## [1] "ID" "Log_FlrDatW3"
Merge three dataframes
df.temp1 <- left_join(df.SNPs, df.E1)
## Joining, by = "ID"
names(df.temp1)
## [1] "ID" "qtl1.1" "qtl2.1" "qtl3.1" "qtl3.2"
## [6] "qtl4.1" "qtl6.1" "qtl6.2" "qtl7.1" "qtl7.2"
## [11] "Log_FlrDatE1"
df2 <- left_join(df.temp1, df.bioclim)
## Joining, by = "ID"
df2 <- mutate(df2, PopGroup = as.factor(PopGroup))
dim(df2); names(df2)
## [1] 518 39
## [1] "ID" "qtl1.1" "qtl2.1" "qtl3.1" "qtl3.2"
## [6] "qtl4.1" "qtl6.1" "qtl6.2" "qtl7.1" "qtl7.2"
## [11] "Log_FlrDatE1" "PopGroup" "elev_m" "lons" "lats"
## [16] "bio1" "bio2" "bio3" "bio4" "bio5"
## [21] "bio6" "bio7" "bio8" "bio9" "bio10"
## [26] "bio11" "bio12" "bio13" "bio14" "bio15"
## [31] "bio16" "bio17" "bio18" "bio19" "PC1"
## [36] "PC2" "PC3" "PC4" "PC5"
Pool heterozygotes with ALT homozygotes, so that SNP is 0 (REF) or 1 (ALT).
# SNP genotypes: join hets and Alt homozygotes, scored as 1.
# 0 is ancestral allele
# 1 is homozygous or heterozygous for derived allele
# Except: derived / ancestral is unknown for qtl6.1 & qtl7.1
names(df2)[2:10] # Correct homozygote / heterozygote for these columns
## [1] "qtl1.1" "qtl2.1" "qtl3.1" "qtl3.2" "qtl4.1" "qtl6.1" "qtl6.2" "qtl7.1"
## [9] "qtl7.2"
for(this.col in 2:10) {
for(this.row in 2:nrow(df2)) {
if(df2[this.row, this.col] == 2) {
df2[this.row, this.col] <- 1
}
}
}
df2$qtl7.1 <- ifelse(df2$qtl7.1 == 0, 1, 0)
Get genotype frequencies for entire data set:
# Allele freqs for entire data set:
qtls <- df2 %>%
filter(ID != "Retro") %>% drop_na() %>%
select(qtl1.1, qtl2.1, qtl3.1, qtl3.2,
qtl4.1, qtl6.1, qtl6.2, qtl7.1, qtl7.2) %>% summarise_all(mean)
qtls
## # A tibble: 1 x 9
## qtl1.1 qtl2.1 qtl3.1 qtl3.2 qtl4.1 qtl6.1 qtl6.2 qtl7.1 qtl7.2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0465 0.150 0.0432 0.0266 0.0299 0.103 0.0764 0.159 0.0465
Get genotype freqs by group:
by_grp <- group_by(df2, PopGroup)
## Warning: Factor `PopGroup` contains implicit NA, consider using
## `forcats::fct_explicit_na`
grp.freqs <- summarise(by_grp,
fr1.1 = mean(qtl1.1, na.rm = TRUE),
fr2.1 = mean(qtl2.1, na.rm = TRUE),
fr3.1 = mean(qtl3.1, na.rm = TRUE),
fr3.2 = mean(qtl3.2, na.rm = TRUE),
fr4.1 = mean(qtl4.1, na.rm = TRUE),
fr6.1 = mean(qtl6.1, na.rm = TRUE),
fr6.2 = mean(qtl6.2, na.rm = TRUE),
fr7.1 = mean(qtl7.1, na.rm = TRUE),
fr7.2 = mean(qtl7.2, na.rm = TRUE))
grp.freqs
## # A tibble: 6 x 10
## PopGroup fr1.1 fr2.1 fr3.1 fr3.2 fr4.1 fr6.1 fr6.2 fr7.1 fr7.2
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ADM 0 0 0 0 0 0 0 0 0
## 2 COL 0.147 0.358 0.2 0.105 0.126 0.474 0.305 0.0105 0.221
## 3 NOR 0.0154 0 0 0 0 0 0 0 0
## 4 UTA 0.0305 0.168 0.00763 0.00763 0.00763 0 0.0458 0.0153 0.0305
## 5 WES 0.0351 0 0 0 0 0 0 0.930 0
## 6 <NA> 0.0519 0.110 0.0779 0.0909 0.188 0.222 0.214 0.137 0.208
Conclusion: COL & UTA are the only groups with much polymorphism. Shown here:
grp.freqs[c(2,4),]
## # A tibble: 2 x 10
## PopGroup fr1.1 fr2.1 fr3.1 fr3.2 fr4.1 fr6.1 fr6.2 fr7.1 fr7.2
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 COL 0.147 0.358 0.2 0.105 0.126 0.474 0.305 0.0105 0.221
## 2 UTA 0.0305 0.168 0.00763 0.00763 0.00763 0 0.0458 0.0153 0.0305
Create dataframes containing only COL, only UTA, and COL+UTA
df.COL <- df2 %>% filter(PopGroup == "COL")
dim(df.COL)
## [1] 95 39
df.UTA <- df2 %>% filter(PopGroup == "UTA")
dim(df.UTA)
## [1] 131 39
df.COL.UTA <- df2 %>% filter(PopGroup == "COL" | PopGroup == "UTA") %>%
mutate(qtl6.2 = as.factor(qtl6.2),
qtl7.2 = as.factor(qtl7.2))
dim(df.COL.UTA)
## [1] 226 39
ANCOVA using COL+UTA, with all QTLs
fit.Flr.QTL <- aov(Log_FlrDatE1~PopGroup+qtl1.1+qtl2.1+qtl3.1+qtl3.2+qtl4.1+qtl6.1+qtl6.2+qtl7.1+qtl7.2,
df.COL.UTA)
Anova(fit.Flr.QTL, type="III")
## Anova Table (Type III tests)
##
## Response: Log_FlrDatE1
## Sum Sq Df F value Pr(>F)
## (Intercept) 0.0473 1 26.22 8.0e-07 ***
## PopGroup 0.0176 1 9.73 0.00212 **
## qtl1.1 0.0051 1 2.80 0.09612 .
## qtl2.1 0.0148 1 8.18 0.00475 **
## qtl3.1 0.0004 1 0.23 0.62985
## qtl3.2 0.0024 1 1.33 0.25072
## qtl4.1 0.0112 1 6.22 0.01354 *
## qtl6.1 0.0134 1 7.43 0.00708 **
## qtl6.2 0.0254 1 14.09 0.00024 ***
## qtl7.1 0.0031 1 1.72 0.19193
## qtl7.2 0.0302 1 16.72 6.6e-05 ***
## Residuals 0.3142 174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The following five QTLs are significant in this model: qtl2.1, qtl4.1, qtl6.1, qtl6.2, and qtl7.2.
Get R-squared for the complete model:
fit1 <- lm(Log_FlrDatE1~PopGroup+qtl1.1+qtl2.1+qtl3.1+qtl3.2+qtl4.1+qtl6.1+qtl6.2+qtl7.1+qtl7.2,
df.COL.UTA)
summary(fit1)
##
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup + qtl1.1 + qtl2.1 + qtl3.1 +
## qtl3.2 + qtl4.1 + qtl6.1 + qtl6.2 + qtl7.1 + qtl7.2, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11574 -0.02895 -0.00088 0.02330 0.14638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.04183 0.00817 -5.12 8.0e-07 ***
## PopGroupUTA 0.02749 0.00881 3.12 0.00212 **
## qtl1.1 -0.02275 0.01360 -1.67 0.09612 .
## qtl2.1 0.02295 0.00803 2.86 0.00475 **
## qtl3.1 0.00649 0.01345 0.48 0.62985
## qtl3.2 0.02320 0.02013 1.15 0.25072
## qtl4.1 0.04512 0.01809 2.49 0.01354 *
## qtl6.1 0.03536 0.01297 2.73 0.00708 **
## qtl6.21 0.04338 0.01156 3.75 0.00024 ***
## qtl7.1 0.03357 0.02563 1.31 0.19193
## qtl7.21 0.05513 0.01348 4.09 6.6e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0425 on 174 degrees of freedom
## (41 observations deleted due to missingness)
## Multiple R-squared: 0.418, Adjusted R-squared: 0.385
## F-statistic: 12.5 on 10 and 174 DF, p-value: 3.08e-16
# ---------------------------------------------
# Get R-squared with PopGroup only
fit2 <- lm(Log_FlrDatE1~PopGroup, df.COL.UTA)
summary(fit2)
##
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14543 -0.03478 -0.00254 0.02723 0.16640
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01080 0.00666 1.62 0.107
## PopGroupUTA -0.01716 0.00827 -2.08 0.039 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0537 on 183 degrees of freedom
## (41 observations deleted due to missingness)
## Multiple R-squared: 0.023, Adjusted R-squared: 0.0177
## F-statistic: 4.31 on 1 and 183 DF, p-value: 0.0393
These nine QTLs explain 39.5% of genetic variation for flowering time in experiment E1. Other polygenic variation accounts for 60.5% of genetic variance for flowering time.
In experiment E1, heritability of LogFlrDat is 77.0% of total variaton, so these nine QTLs explain 30.415% of phenotypic variation.
In COL, what are the effect sizes of qtl6.2 & qtl7.2 on Log_FlrDatE1?
df.COL.nomiss <- df.COL %>% drop_na()
fit.2qtl <- lm(Log_FlrDatE1 ~ qtl6.2 + qtl7.2, df.COL.nomiss)
coeffs <- fit.2qtl$coefficients
coeffs
## (Intercept) qtl6.2 qtl7.2
## -0.02020 0.06834 0.07758
sd.fd <- sd(df.COL.nomiss$Log_FlrDatE1)
sd.fd
## [1] 0.06855
effect.qtl6.2 <- coeffs[2] - coeffs[1]
effect.qtl6.2.sd <- effect.qtl6.2 / sd.fd
effect.qtl7.2 <- coeffs[3] - coeffs[1]
effect.qtl7.2.sd <- effect.qtl7.2 / sd.fd
In experiment E1, qtl6.2 has effect size of 1.2917 standard deviations for Log_FlrDatE1. The derived allele delays flowering.
In experiment E1, qtl7.2 has effect size of 1.4264 standard deviations for Log_FlrDatE1. The derived allele delays flowering.
Notice that we have accounded for LD between the two QTLs.
Geographic location of qtl6.2 derived genotypes for COL+UTA:
df.QTLs <- select(df.COL.UTA, ID, PopGroup, Log_FlrDatE1, qtl1.1, qtl2.1,
qtl3.1, qtl3.2, qtl4.1, qtl6.2, qtl7.2,
lats, lons, elev_m, PC1, PC2, PC3, PC4, PC5, bio6) %>%
mutate(qtl6.2 = as.character(qtl6.2),
qtl7.2 = as.character(qtl7.2)) %>%
drop_na() %>% arrange(qtl6.2)
myMap <- get_stamenmap(bbox = c(left=-117.5, bottom=36.2, right=-101.5, top=49),
zoom = 5, maptype = "toner-background")
## Source : http://tile.stamen.com/toner-background/5/5/10.png
## Source : http://tile.stamen.com/toner-background/5/6/10.png
## Source : http://tile.stamen.com/toner-background/5/5/11.png
## Source : http://tile.stamen.com/toner-background/5/6/11.png
## Source : http://tile.stamen.com/toner-background/5/5/12.png
## Source : http://tile.stamen.com/toner-background/5/6/12.png
ggmap(myMap) + theme_void() +
geom_point(aes(x = lons, y = lats, colour = qtl6.2), data = df.QTLs)
Geographic location of qtl6.2 derived genotypes for COL+UTA:
df.QTLs <- arrange(df.QTLs, qtl6.2)
MyMap <- get_stamenmap(bbox = c(left=-117.5, bottom=36.2, right=-101.5, top=49),
zoom = 5, maptype = "toner-background")
MyMap <- ggmap(myMap) + theme_void() +
geom_point(aes(x = lons, y = lats, colour = qtl6.2), data = df.QTLs) +
theme_classic() + scale_colour_manual(values = c("red", "blue", "green")) +
scale_fill_manual(values = c("red", "blue", "green"))
ggsave("MyMap.png")
## Saving 7 x 5 in image
Geographic location of qtl7.2 derived genotypes for COL+UTA:
df.QTLs <- arrange(df.QTLs, qtl7.2)
MyMap <- get_stamenmap(bbox = c(left=-117.5, bottom=36.2, right=-101.5, top=49),
zoom = 5, maptype = "toner-background")
MyMap <- ggmap(myMap) + theme_void() +
geom_point(aes(x = lons, y = lats, colour = qtl7.2), data = df.QTLs) +
theme_classic() + scale_colour_manual(values = c("red", "blue", "green")) +
scale_fill_manual(values = c("red", "blue", "green"))
ggsave("MyMap.17.2.png")
## Saving 7 x 5 in image
MyMap
Notice that genotypes at 1tl6.2 and qtl7.2 seem to be correlated. Check using cross-tabs for genotypes at qtl6.2 and qtl7.2:
(locus.pair <- xtabs(~ qtl6.2 + qtl7.2, df.QTLs) %>% addmargins)
## qtl7.2
## qtl6.2 0 1 Sum
## 0 153 9 162
## 1 18 5 23
## Sum 171 14 185
Both loci are ancestral in 82.7027% of accessions. In 14.5946% of genotypes one locus is ancestral, and the other is derived. Both loci are derived in 2.7027% of accessions. For both QTLs, the derived allele delays flowering.
Are these two loci statistically independent?
chi2.QTLs <- chisq.test(df.QTLs$qtl6.2, df.QTLs$qtl7.2, correct = FALSE)
## Warning in chisq.test(df.QTLs$qtl6.2, df.QTLs$qtl7.2, correct = FALSE): Chi-
## squared approximation may be incorrect
chi2.results <- glance(chi2.QTLs)
chi2.results
## # A tibble: 1 x 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 7.54 0.00603 1 Pearson's Chi-squared test
Genotypes at these two loci are significantly associated.
What is the effect of these two flowering QTLs on Log_FlrDatE1 in COL+UTA?
# ANCOVA with COL+UTA
fit.Flr.QTL <- aov(Log_FlrDatE1~PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.Flr.QTL, type="III")
## Anova Table (Type III tests)
##
## Response: Log_FlrDatE1
## Sum Sq Df F value Pr(>F)
## (Intercept) 0.015 1 6.94 0.0092 **
## PopGroup 0.001 1 0.65 0.4197
## qtl6.2 0.079 1 37.65 5.2e-09 ***
## qtl7.2 0.049 1 23.17 3.1e-06 ***
## Residuals 0.380 181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Highly significant prediction of flowering time for both qtl6.2 & qtl7.2.
Get R-squared for the two QTLs:
# Get R-squared with PopGroup and 2 QTLs
fit1 <- lm(Log_FlrDatE1~PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
summary(fit1)
##
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup + qtl6.2 + qtl7.2, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11720 -0.03427 0.00069 0.02043 0.15185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01742 0.00661 -2.63 0.0092 **
## PopGroupUTA 0.00614 0.00759 0.81 0.4197
## qtl6.21 0.06644 0.01083 6.14 5.2e-09 ***
## qtl7.21 0.06410 0.01332 4.81 3.1e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0458 on 181 degrees of freedom
## (41 observations deleted due to missingness)
## Multiple R-squared: 0.295, Adjusted R-squared: 0.284
## F-statistic: 25.3 on 3 and 181 DF, p-value: 1.05e-13
# ---------------------------------------------
# Get R-squared with PopGroup only
fit2 <- lm(Log_FlrDatE1~PopGroup, df.COL.UTA)
summary(fit2)
##
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14543 -0.03478 -0.00254 0.02723 0.16640
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01080 0.00666 1.62 0.107
## PopGroupUTA -0.01716 0.00827 -2.08 0.039 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0537 on 183 degrees of freedom
## (41 observations deleted due to missingness)
## Multiple R-squared: 0.023, Adjusted R-squared: 0.0177
## F-statistic: 4.31 on 1 and 183 DF, p-value: 0.0393
The two QTLs explain 27.2% of genetic variation in flowering time.
At the level of quantitative traits, what are the effects of the Bioclim PCs on Log_FlrDatE1 in COL+UTA? (The Bioclim PCs and loadings were created in Bioclim_Pop_Group.html.)
# ANCOVA with F-ratios
fit.Flr.PCs <- aov(Log_FlrDatE1~PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, df.COL.UTA)
Anova(fit.Flr.PCs, type="III")
## Anova Table (Type III tests)
##
## Response: Log_FlrDatE1
## Sum Sq Df F value Pr(>F)
## (Intercept) 0.000 1 0.20 0.6591
## PopGroup 0.002 1 0.80 0.3731
## PC1 0.012 1 4.81 0.0295 *
## PC2 0.000 1 0.11 0.7439
## PC3 0.026 1 10.23 0.0016 **
## PC4 0.001 1 0.32 0.5734
## PC5 0.008 1 3.19 0.0757 .
## Residuals 0.453 178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Get slopes for the Bioclim PCs on Log_FlrDatE1 in COL+UTA:
# Slope of PCs
slopes.Flr.PCs <- lm(Log_FlrDatE1~PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, df.COL.UTA)
summary(slopes.Flr.PCs)
##
## Call:
## lm(formula = Log_FlrDatE1 ~ PopGroup + PC1 + PC2 + PC3 + PC4 +
## PC5, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15257 -0.03252 -0.00404 0.02413 0.17180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.004323 0.009781 -0.44 0.6591
## PopGroupUTA 0.010426 0.011676 0.89 0.3731
## PC1 -0.003481 0.001587 -2.19 0.0295 *
## PC2 0.000807 0.002465 0.33 0.7439
## PC3 0.008789 0.002748 3.20 0.0016 **
## PC4 -0.002660 0.004715 -0.56 0.5734
## PC5 0.008928 0.004997 1.79 0.0757 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0504 on 178 degrees of freedom
## (41 observations deleted due to missingness)
## Multiple R-squared: 0.161, Adjusted R-squared: 0.133
## F-statistic: 5.69 on 6 and 178 DF, p-value: 1.96e-05
At the level of quantitative traits, PC1 & PC3 have significant effect on Log_FlrDatE1. And the entire model is highly significant: F-statistic: 5.69 on 6 and 178 DF, p-value: 1.96e-05.
For the two largest QTLs, are they associated with PC1 - PC5?
PC1:
# QTLs not significant. Not run.
fit.PC1.2QTLs <- aov(PC1 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC1.2QTLs, type="III")
## Anova Table (Type III tests)
##
## Response: PC1
## Sum Sq Df F value Pr(>F)
## (Intercept) 92 1 13.25 0.00034 ***
## PopGroup 103 1 14.87 0.00015 ***
## qtl6.2 1 1 0.15 0.69920
## qtl7.2 3 1 0.43 0.51328
## Residuals 1538 222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qtl6.2 and qtl7.2 are not associated with PC1 ancestral climate.
PC2:
# QTL 7.2 is marginally significant. Not run.
fit.PC2.2QTLs <- aov(PC2 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC2.2QTLs, type="III")
## Anova Table (Type III tests)
##
## Response: PC2
## Sum Sq Df F value Pr(>F)
## (Intercept) 0 1 0.05 0.816
## PopGroup 49 1 17.80 3.6e-05 ***
## qtl6.2 8 1 3.11 0.079 .
## qtl7.2 11 1 4.09 0.044 *
## Residuals 606 222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qtl7.2 has marginal association with PC2 ancestral climate (P = 0.044)
PC3:
# QTLs not significant. Not run.
fit.PC3.2QTLs <- aov(PC3 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC3.2QTLs, type="III")
## Anova Table (Type III tests)
##
## Response: PC3
## Sum Sq Df F value Pr(>F)
## (Intercept) 25 1 11.80 0.00071 ***
## PopGroup 168 1 78.14 3e-16 ***
## qtl6.2 8 1 3.56 0.06032 .
## qtl7.2 2 1 1.10 0.29546
## Residuals 477 222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qtl6.2 and qtl7.2 are not associated with PC3 ancestral climate.
PC4:
fit.PC4.2QTLs <- lm(PC4 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC4.2QTLs, type="III")
## Anova Table (Type III tests)
##
## Response: PC4
## Sum Sq Df F value Pr(>F)
## (Intercept) 57.8 1 61.76 1.7e-13 ***
## PopGroup 24.1 1 25.77 8.1e-07 ***
## qtl6.2 8.6 1 9.22 0.0027 **
## qtl7.2 8.2 1 8.73 0.0035 **
## Residuals 207.8 222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qtl6.2 and qtl7.2 are significantly associated with PC4 ancestral climate.
Main components of PC4:
BIO8 = Mean Temperature of Wettest Quarter
BIO18 = Precipitation of Warmest Quarter
PC5:
# QTLs not significant. Not run.
fit.PC5.2QTLs <- lm(PC5 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
Anova(fit.PC5.2QTLs, type="III")
## Anova Table (Type III tests)
##
## Response: PC5
## Sum Sq Df F value Pr(>F)
## (Intercept) 33.6 1 35.96 8.1e-09 ***
## PopGroup 31.1 1 33.34 2.6e-08 ***
## qtl6.2 0.1 1 0.13 0.72
## qtl7.2 1.7 1 1.77 0.18
## Residuals 207.2 222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qtl6.2 and qtl7.2 are not associated with PC5 ancestral climate.
Conclusion: Genotypes at qtl6.2 & qtl7.2 are only associated with PC4.
qtl6.2 is negatively associated with PC4
qtl7.2 is positively associated with PC4
PC4 is negatively associated with BIO8 (Mean Temperature of Wettest Quarter) & BIO18 (Precipitation of Warmest Quarter):
Bio8:
fit.bio8.2QTLs <- lm(bio8 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
summary(fit.bio8.2QTLs)
##
## Call:
## lm(formula = bio8 ~ PopGroup + qtl6.2 + qtl7.2, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -188.0 -77.5 16.6 58.1 133.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.61 9.78 7.02 2.7e-11 ***
## PopGroupUTA -58.51 11.61 -5.04 9.6e-07 ***
## qtl6.21 5.39 15.93 0.34 0.74
## qtl7.21 -17.91 18.01 -0.99 0.32
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.7 on 222 degrees of freedom
## Multiple R-squared: 0.117, Adjusted R-squared: 0.105
## F-statistic: 9.78 on 3 and 222 DF, p-value: 4.34e-06
No significant association
Bio18
fit.bio18.2QTLs <- lm(bio18 ~ PopGroup + qtl6.2 + qtl7.2, df.COL.UTA)
summary(fit.bio18.2QTLs)
##
## Call:
## lm(formula = bio18 ~ PopGroup + qtl6.2 + qtl7.2, data = df.COL.UTA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.03 -11.21 -0.49 13.70 39.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 157.03 2.28 68.76 <2e-16 ***
## PopGroupUTA -36.82 2.71 -13.58 <2e-16 ***
## qtl6.21 11.73 3.72 3.15 0.0018 **
## qtl7.21 -2.73 4.21 -0.65 0.5167
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.4 on 222 degrees of freedom
## Multiple R-squared: 0.541, Adjusted R-squared: 0.535
## F-statistic: 87.3 on 3 and 222 DF, p-value: <2e-16
qtl6.2 is significantly associated with bio18 = Precipitation of Warmest Quarter (P = 0.0018). Genotypes that flower early come from places with drier summers.
Check residuals from this model:
p1 <- norm.hist(fit.bio18.2QTLs$residuals, "Residuals")
p2 <- draw.qq(fit.bio18.2QTLs$residuals, "fit$residuals")
plot_grid(p1, p2, nrow = 1, labels = c("A", "B")) # join & print figures
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Residuals are well behaved. Genotypes at qtl6.2 are strongly associated with bio18 (Precipitation of Warmest Quarter):
# Create column with polarity as ancestral or derived
df.COL.UTA <- df.COL.UTA %>% mutate(Genotype = as.character(qtl6.2))
df.COL.UTA$Genotype <- if_else(df.COL.UTA$Genotype == "0", "Ancestral", "Derived")
ggplot(df.COL.UTA, aes(x = bio18, fill = Genotype, color = Genotype)) +
geom_histogram(alpha=0.6) +
ggtitle( "Derived allele at qtl6.2 is associated with wetter summers") +
labs(x = "Precipitation of Warmest Quarter (Bio18)", y = "Frequency") +
theme_classic() + scale_colour_manual(values = c("red", "blue", "green")) +
scale_fill_manual(values = c("red", "blue", "green"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("Hist_qtl6.2.png")
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Genotypes with the derived allele are found in environments with wetter summers.
Controlling for PopGroup in COL+UTA, how similar is flowering time in E1 and W3?.
df.CU.E1W3 <- df.COL.UTA %>% select(ID, PopGroup, Log_FlrDatE1, qtl6.2, qtl7.2)
df.CU.E1W3 <- left_join(df.CU.E1W3, df.W3) %>% drop_na()
## Joining, by = "ID"
dim(df.CU.E1W3)
## [1] 119 6
corr.E1W3 <- cor(df.CU.E1W3$Log_FlrDatE1, df.CU.E1W3$Log_FlrDatW3)
FlrDat.1.3 <- ggplot(df.CU.E1W3) +
geom_jitter(aes(x = Log_FlrDatE1, y = Log_FlrDatW3, color = PopGroup),
size=1.75, alpha=0.8, width = 0.02, height = 0.02) + theme_classic() +
scale_colour_manual(values = c("red", "blue", "green")) +
scale_fill_manual(values = c("red", "blue", "green")) +
labs(x = "Age at First Flower in E1 (Log10)",
y = "Age at First Flower in W3 (Log10)")
FlrDat.1.3
ggsave("FlrDat.E1.W3.png")
## Saving 7 x 5 in image
The overall correlation between genotype mean flowering times in E1 & W3 = 0.4208.
First, convert to tall format for ANCOVA:
df.COL.UTA.Flr <- pivot_longer(df.CU.E1W3,
-c(ID, PopGroup, qtl6.2, qtl7.2), # pivot cols to long format
names_to = "Envirn",
values_to = "FlrDat")
dim(df.COL.UTA.Flr)
## [1] 238 6
Is there significant QxE between environments E1 and W3 for the two largest QTLs?
# Fixed effects ANCOVA with interaction, for COL+UTA accessions
# drop1 -- Gives Type III sums of squares.
options(contrasts = c("contr.sum","contr.poly"))
model <- lm(FlrDat ~ PopGroup + Envirn + qtl6.2 + qtl7.2 + qtl6.2:Envirn + qtl7.2:Envirn,
data=df.COL.UTA.Flr)
drop1(model, .~., test="F")
## Single term deletions
##
## Model:
## FlrDat ~ PopGroup + Envirn + qtl6.2 + qtl7.2 + qtl6.2:Envirn +
## qtl7.2:Envirn
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 1.41 -1207
## PopGroup 1 0.0029 1.41 -1208 0.48 0.48965
## Envirn 1 0.0038 1.41 -1208 0.62 0.43026
## qtl6.2 1 0.0568 1.47 -1199 9.30 0.00256 **
## qtl7.2 1 0.0769 1.49 -1196 12.60 0.00047 ***
## Envirn:qtl6.2 1 0.0189 1.43 -1205 3.10 0.07953 .
## Envirn:qtl7.2 1 0.0003 1.41 -1209 0.05 0.81657
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
These large effect QTLs do not show significant QxE (P > 0.05).
# This is an alternative way to ask the same question. Results are very similar.
fit <- aov(FlrDat~PopGroup + Envirn + qtl6.2 + qtl7.2 + qtl6.2:Envirn + qtl7.2:Envirn,
df.COL.UTA.Flr)
Anova(fit, type="III")
(tidy_aov <- tidy(fit))
Sasaki, E., et al. (2015). “”Missing" G x E Variation Controls Flowering Time in Arabidopsis thaliana." PLoS Genetics 11(10).