library(tidyverse)
library(readxl)
library(openxlsx)
library(raster)
library(sp)
library(GGally)
library(lme4)
library(lmerTest) # import lmerTest *AFTER* lme4

Questions:

  1. What climatic variables are associated with flowering time for genotypes grown in common garden?

  2. How well does climatic variation predict the genetic mean flowering time in Boechera stricta?

  3. How do these patterns in B. stricta compare with similar patterns in Arabidopsis thaliana?


Get RefPop data

in_name1 <- "RefPop_lat_lon.txt"
infile1 <- file.path(path.expand("~"),"FlrGWAS","output", in_name1)
df.locations <- read_tsv(infile1)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   LogFlrDat = col_double(),
##   Elev_m = col_double(),
##   lat = col_double(),
##   lon = col_double()
## )
glimpse(df.locations)
## Rows: 402
## Columns: 5
## $ ID        <chr> "RP001", "RP003", "RP004", "RP008", "RP009", "RP010", "RP01…
## $ LogFlrDat <dbl> 1.544, 1.397, 1.529, 1.595, 1.306, 1.374, 1.541, 1.443, 1.4…
## $ Elev_m    <dbl> 2587, 2642, 2128, 2400, 2624, 2799, 3071, 2439, 2072, 2018,…
## $ lat       <dbl> 38.51, 44.62, 44.45, 40.44, 43.97, 44.62, 40.68, 41.84, 43.…
## $ lon       <dbl> -109.3, -114.5, -115.0, -111.6, -114.7, -114.5, -110.9, -11…

# Split into vectors to use in bioclim download
id <- as.vector(df.locations$ID)
RefPop <- as.vector(df.locations$ID) # same vector, different name
logflrdat <- as.vector(df.locations$LogFlrDat)
elev_m <- as.vector(df.locations$Elev_m)
lats <- as.vector(df.locations$lat)
lons <- as.vector(df.locations$lon)

lonlat <- cbind(lons, lats)
# head(lonlat)
(num.pts <- nrow(lonlat)) # Number of geographic points
## [1] 402

Get bio data, either from file, or from online. Choose code block for eval = TRUE

Get from local data file:

# This code block reads data from pre-built file, assembled by `input.Bioclim.R` 
in_name2 <- "RefPop_bio.txt"
infile2 <- file.path(path.expand("~"),"FlrGWAS","output", in_name2)
df.bio <- read_tsv(infile2)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   RefPop = col_character()
## )
## See spec(...) for full column specifications.
# glimpse(df.bio)
print("Pre-built data from local file")
## [1] "Pre-built data from local file"

Or get from online:

# This code block gets data online. Not run!
pts <- SpatialPoints(lonlat)
# class(pts)
showDefault(pts)

crdref <- CRS('+proj=longlat +datum=WGS84')
pts <- SpatialPoints(lonlat, proj4string=crdref)
pts

# https://www.rdocumentation.org/packages/raster/versions/3.1-5/topics/getData
# 2020-06-24: tried res= 0.5, but fails
# r <- getData('worldclim', var="bio", res=0.5, lon=-109.43, lat=41.35) # gives error

r <- getData("worldclim",var="bio",res=2.5)
r <- r[[1:19]]
values <- extract(r, pts)

df.bio <- cbind.data.frame(id, elev_m, logflrdat, coordinates(pts), values)
df.bio <- as_tibble(df.bio)
print("Data from online file")

Info on df.bio

dim(df.bio)
## [1] 402  24
names(df.bio)
##  [1] "RefPop"    "elev_m"    "logflrdat" "lons"      "lats"      "bio1"     
##  [7] "bio2"      "bio3"      "bio4"      "bio5"      "bio6"      "bio7"     
## [13] "bio8"      "bio9"      "bio10"     "bio11"     "bio12"     "bio13"    
## [19] "bio14"     "bio15"     "bio16"     "bio17"     "bio18"     "bio19"

Get PCA

# PCA on zero-centered, standardized correlation matrix
# Bio1 .. Bio19, from columns 6 .. 24
bio.pca <- prcomp(df.bio[, 6:24], center=TRUE, retx=TRUE, scale.=TRUE)

summary(bio.pca)
## Importance of components:
##                          PC1   PC2   PC3   PC4    PC5    PC6    PC7     PC8
## Standard deviation     2.685 2.055 1.671 1.388 1.0671 0.9793 0.6351 0.37189
## Proportion of Variance 0.379 0.222 0.147 0.101 0.0599 0.0505 0.0212 0.00728
## Cumulative Proportion  0.379 0.602 0.749 0.850 0.9100 0.9605 0.9817 0.98895
##                            PC9    PC10    PC11    PC12    PC13    PC14    PC15
## Standard deviation     0.28826 0.22022 0.15751 0.14204 0.13135 0.08449 0.06551
## Proportion of Variance 0.00437 0.00255 0.00131 0.00106 0.00091 0.00038 0.00023
## Cumulative Proportion  0.99333 0.99588 0.99719 0.99825 0.99916 0.99953 0.99976
##                           PC16    PC17    PC18     PC19
## Standard deviation     0.05529 0.03362 0.02038 1.04e-15
## Proportion of Variance 0.00016 0.00006 0.00002 0.00e+00
## Cumulative Proportion  0.99992 0.99998 1.00000 1.00e+00

PC1 - PC5 explain 91% of variation.


Get factor loadings

V <- bio.pca$rotation # eigenvectors
L <- diag(bio.pca$sdev) # diag matrix w/sqrts of eigenvalues on diag.
bio.loadings <- V %*% L

# Save loadings to text file
df.loadings <- as_tibble(bio.loadings[, 1:5]) # Only PC1 - PC5
## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
head(df.loadings)
## # A tibble: 6 x 5
##       V1      V2     V3      V4     V5
##    <dbl>   <dbl>  <dbl>   <dbl>  <dbl>
## 1 -0.624 -0.665  -0.274 -0.0876  0.277
## 2 -0.558  0.435   0.382  0.391   0.405
## 3 -0.251 -0.0830  0.755  0.238   0.310
## 4 -0.321  0.636  -0.540  0.0674  0.120
## 5 -0.860 -0.186  -0.286  0.188   0.236
## 6 -0.211 -0.916  -0.169 -0.201  -0.108

# Save loadings for further examination
file_out <- "loadings.bio.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.loadings, out_name)

Loadings for PC1 - PC5 have been written to “loadings.bio.txt”.


Get PCA scores for each genotype

df.pcs <- cbind.data.frame(id, bio.pca$x[, 1:5])
df.pcs <- as_tibble(df.pcs) %>% rename(RefPop = id)

df.tot <- left_join(df.bio, df.pcs)
## Joining, by = "RefPop"
## Warning: Column `RefPop` joining character vector and factor, coercing into
## character vector
names(df.tot)
##  [1] "RefPop"    "elev_m"    "logflrdat" "lons"      "lats"      "bio1"     
##  [7] "bio2"      "bio3"      "bio4"      "bio5"      "bio6"      "bio7"     
## [13] "bio8"      "bio9"      "bio10"     "bio11"     "bio12"     "bio13"    
## [19] "bio14"     "bio15"     "bio16"     "bio17"     "bio18"     "bio19"    
## [25] "PC1"       "PC2"       "PC3"       "PC4"       "PC5"

The following analyses are incomplete until we include Structure_Group as a categorical predictor in the models.

First, check number of genos with missing group info

# Read in all of the RefPop genotypes. This include our PopGroup info.
in_name3 <- "Bs517_20200508_to_Tom5_trim.xlsx"
in_name3 <- file.path(path.expand("~"),"FlrGWAS","Flr_Data", in_name3)
df.RP <- read_excel(in_name3, sheet = 1, na = "NA")
names(df.RP)
## [1] "ID"       "PopGroup"

# Read in the genotypes used in Wenjie's flowering experiments
in_name4 <- "LogFlr_BLUPs_Five_Expts.txt"
in_name4 <- file.path(path.expand("~"),"FlrGWAS","output", in_name4)
df.flr.genos <- read_tsv(in_name4, na = "NA")
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_blupE1 = col_double(),
##   RP_blupW1 = col_double(),
##   RP_blupW2 = col_double(),
##   RP_blupW3 = col_double(),
##   RP_blupW4 = col_double()
## )

# Get IDs for genotypes used in Wenjie's flowering experiments
# Get just the ID column (given problems with dplyr)
df.flr.genos <- as_tibble(df.flr.genos$ID)
names(df.flr.genos)[1] <- "ID"
dim(df.flr.genos)
## [1] 479   1

# Merge genos in Wenjie's experiment with other RefPop info
# These are just the genos used in her experiment
df.flr.genos <- left_join(df.flr.genos, df.RP, by = "ID")
names(df.flr.genos)
## [1] "ID"       "PopGroup"
dim(df.flr.genos)
## [1] 479   2

# How many accessions in each PopGroup?
(Num.accessions <- nrow(df.flr.genos))
## [1] 479

(Num.each.group <- df.flr.genos %>% group_by(PopGroup) %>% count())
## # A tibble: 6 x 2
## # Groups:   PopGroup [6]
##   PopGroup     n
##   <chr>    <int>
## 1 ADM         18
## 2 COL        137
## 3 NOR         97
## 4 UTA        139
## 5 WES         73
## 6 <NA>        15

# Save data
file_out <- "Wenjie_flrdat_genotypes.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.flr.genos, out_name)

RefPop information for the flowering genos has been written to “Wenjie_flrdat_genotypes.txt”.

Join data files for analysis

names(df.tot)[1] <- "ID"
df.tot <- left_join(df.tot, df.RP) %>% na.omit()
## Joining, by = "ID"
names(df.tot)
##  [1] "ID"        "elev_m"    "logflrdat" "lons"      "lats"      "bio1"     
##  [7] "bio2"      "bio3"      "bio4"      "bio5"      "bio6"      "bio7"     
## [13] "bio8"      "bio9"      "bio10"     "bio11"     "bio12"     "bio13"    
## [19] "bio14"     "bio15"     "bio16"     "bio17"     "bio18"     "bio19"    
## [25] "PC1"       "PC2"       "PC3"       "PC4"       "PC5"       "PopGroup"
dim(df.tot)
## [1] 391  30

# Save merged data
file_out <- "ID_PopGroup_worldclim_PCA.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.tot, out_name)

What environmental factors predict flowering time?

df.5pcs <- df.tot[c("PC1", "PC2", "PC3", "PC4", "PC5")]
ggpairs(df.5pcs)

cor(df.5pcs)
##           PC1       PC2       PC3       PC4       PC5
## PC1  1.000000 -0.002853  0.003361  0.006688 -0.004612
## PC2 -0.002853  1.000000 -0.007680 -0.005601  0.013174
## PC3  0.003361 -0.007680  1.000000  0.006170 -0.011025
## PC4  0.006688 -0.005601  0.006170  1.000000  0.003122
## PC5 -0.004612  0.013174 -0.011025  0.003122  1.000000

These 5 PCs are uncorrelated and have approximately linear relationships.


Prediction of flowering time based on latitude and elevation:

options(contrasts = c("contr.sum","contr.poly"))
regress.lat.elev <- lm(logflrdat ~ PopGroup + lats + elev_m, data=df.tot)
drop1(regress.lat.elev, .~., test="F")
## Single term deletions
## 
## Model:
## logflrdat ~ PopGroup + lats + elev_m
##          Df Sum of Sq  RSS   AIC F value  Pr(>F)    
## <none>                4.41 -1740                    
## PopGroup  4     0.191 4.60 -1731    4.16  0.0026 ** 
## lats      1     0.110 4.52 -1732    9.59  0.0021 ** 
## elev_m    1     0.311 4.72 -1715   27.11 3.1e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Highly significant effects of PopGroup, Latitude, and elevation.

Now, summary to check R-squared:

summary(regress.lat.elev)
## 
## Call:
## lm(formula = logflrdat ~ PopGroup + lats + elev_m, data = df.tot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4104 -0.0728 -0.0019  0.0597  0.3641 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.29e+00   1.96e-01   11.69  < 2e-16 ***
## PopGroup1    1.91e-04   2.14e-02    0.01   0.9929    
## PopGroup2    4.02e-02   1.38e-02    2.90   0.0039 ** 
## PopGroup3   -3.41e-02   1.24e-02   -2.75   0.0062 ** 
## PopGroup4   -5.74e-03   1.21e-02   -0.48   0.6343    
## lats        -1.14e-02   3.69e-03   -3.10   0.0021 ** 
## elev_m      -1.16e-04   2.23e-05   -5.21  3.1e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.107 on 384 degrees of freedom
## Multiple R-squared:  0.112,  Adjusted R-squared:  0.0985 
## F-statistic: 8.11 on 6 and 384 DF,  p-value: 2.92e-08

Latitude + elevation explain 11.2% of genetic variation in flowering time. Flowering is later at higher elevation and more northerly sites.


Does abiotic climate correlate with plasticty of flowering time in W1 vs. W3?

# Get PopGroup info for merge (workaround for error with select())
IDs <- df.tot$ID
PGs <- df.tot$PopGroup
PC1 <- df.tot$PC1
PC2 <- df.tot$PC2
PC3 <- df.tot$PC3
PC4 <- df.tot$PC4
PC5 <- df.tot$PC5
df.7col <- data.frame(IDs, PGs, PC1, PC2, PC3, PC4, PC5)
df.7col <- unique(df.7col) %>% rename(RefPop   = IDs,
                                      PopGroup = PGs)

# Get plasticity of flowering date for each genotype
plast.name <- file.path(path.expand("~"),"FlrGWAS", "output", "plastic.W1.W3.tsv")
df.plast <- read_tsv(plast.name)
## Parsed with column specification:
## cols(
##   RefPop = col_character(),
##   LogFlrDatW1 = col_double(),
##   LogFlrDatW3 = col_double(),
##   deltaFlrDat = col_double()
## )

# Merge with PopGroup info
df.plast <- left_join(df.plast, df.7col)
## Joining, by = "RefPop"
## Warning: Column `RefPop` joining character vector and factor, coercing into
## character vector

options(contrasts = c("contr.sum","contr.poly"))
mod.plast <- lm(deltaFlrDat ~ PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, data=df.plast)
drop1(mod.plast, .~., test="F")
## Single term deletions
## 
## Model:
## deltaFlrDat ~ PopGroup + PC1 + PC2 + PC3 + PC4 + PC5
##          Df Sum of Sq   RSS  AIC F value Pr(>F)  
## <none>                0.694 -447                 
## PopGroup  4    0.0796 0.773 -445    2.44  0.053 .
## PC1       1    0.0041 0.698 -449    0.51  0.479  
## PC2       1    0.0243 0.718 -446    2.98  0.088 .
## PC3       1    0.0159 0.710 -447    1.94  0.167  
## PC4       1    0.0251 0.719 -446    3.08  0.083 .
## PC5       1    0.0000 0.694 -449    0.00  0.957  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

PopGroup has a marginal effect of plasticity of flowering time. However, the sample sizes are small, and this does not seem promising.


Prediction of flowering time based on 5 PCs as predictors:

options(contrasts = c("contr.sum","contr.poly"))
pc.regress1 <- lm(logflrdat ~ PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, data=df.tot)
drop1(pc.regress1, .~., test="F")
## Single term deletions
## 
## Model:
## logflrdat ~ PopGroup + PC1 + PC2 + PC3 + PC4 + PC5
##          Df Sum of Sq  RSS   AIC F value  Pr(>F)    
## <none>                4.26 -1747                    
## PopGroup  4    0.0554 4.32 -1750    1.24 0.29360    
## PC1       1    0.0354 4.30 -1746    3.17 0.07600 .  
## PC2       1    0.1384 4.40 -1737   12.38 0.00049 ***
## PC3       1    0.0290 4.29 -1746    2.59 0.10821    
## PC4       1    0.1153 4.38 -1739   10.31 0.00144 ** 
## PC5       1    0.1262 4.39 -1738   11.28 0.00086 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Highly significant effects of PC2, PC4, and PC5. PopGroup is not significant. (After accounting for climate, PopGroup provides no additional influence on flowering time.)

Leaving out PC1 and PC3 (not significant in regression) and PC5 (no substantial loadings), we are left with:

Positive correlation with flowering time:

BIO1 = Annual Mean Temperature

BIO6 = Min Temperature of Coldest Month

BIO8 = Mean Temperature of Wettest Quarter

BIO11 = Mean Temperature of Coldest Quarter

BIO18 = Precipitation of Warmest Quarter

Negative correlation flowering time:

BIO4 = Temperature Seasonality (standard deviation *100)

BIO7 = Temperature Annual Range (BIO5-BIO6)

Now, summary to check R-squared:

summary(pc.regress1)
## 
## Call:
## lm(formula = logflrdat ~ PopGroup + PC1 + PC2 + PC3 + PC4 + PC5, 
##     data = df.tot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3989 -0.0663 -0.0073  0.0616  0.3751 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.51524    0.00692  218.82  < 2e-16 ***
## PopGroup1    0.00669    0.02109    0.32  0.75115    
## PopGroup2   -0.00452    0.01475   -0.31  0.75960    
## PopGroup3   -0.01682    0.01342   -1.25  0.21058    
## PopGroup4   -0.00740    0.01260   -0.59  0.55707    
## PC1         -0.00391    0.00220   -1.78  0.07600 .  
## PC2         -0.01083    0.00308   -3.52  0.00049 ***
## PC3          0.00650    0.00403    1.61  0.10821    
## PC4         -0.01559    0.00485   -3.21  0.00144 ** 
## PC5          0.02023    0.00602    3.36  0.00086 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.106 on 381 degrees of freedom
## Multiple R-squared:  0.143,  Adjusted R-squared:  0.122 
## F-statistic: 7.04 on 9 and 381 DF,  p-value: 2e-09

14.3% of genetic variation in flowering time under common garden conditions is explained by PC2, PC4, and PC5.


Prediction of flowering time based on relevant Bio indicators:

form <- "logflrdat ~ PopGroup+bio1+bio6+bio8+bio11+bio18+bio4+bio7+lats+elev_m"
options(contrasts = c("contr.sum","contr.poly"))
pc.regress2 <- lm(form, data=df.tot)
drop1(pc.regress2, .~., test="F")
## Single term deletions
## 
## Model:
## logflrdat ~ PopGroup + bio1 + bio6 + bio8 + bio11 + bio18 + bio4 + 
##     bio7 + lats + elev_m
##          Df Sum of Sq  RSS   AIC F value  Pr(>F)    
## <none>                4.00 -1764                    
## PopGroup  4    0.0073 4.00 -1771    0.17 0.95288    
## bio1      1    0.0007 4.00 -1766    0.07 0.79840    
## bio6      1    0.0001 4.00 -1766    0.01 0.91912    
## bio8      1    0.0469 4.04 -1762    4.43 0.03598 *  
## bio11     1    0.0007 4.00 -1766    0.07 0.79208    
## bio18     1    0.1617 4.16 -1751   15.26 0.00011 ***
## bio4      1    0.0010 4.00 -1766    0.09 0.76399    
## bio7      1    0.0032 4.00 -1766    0.30 0.58169    
## lats      1    0.0329 4.03 -1763    3.10 0.07892 .  
## elev_m    1    0.1760 4.17 -1749   16.61 5.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

PopGroup is not significant. Significant prediction by bio18, bio8, & elevation.

BIO18 = Precipitation of Warmest Quarter

BIO8 = Mean Temperature of Wettest Quarter

Both Bio8 and Bio18 have positive loadings and positive slopes. Therefore, in common garden, genotypes flower later from places where there is more summer rainfall and warmer temperature in the wetest quarter.

Now, summary to check R-squared:

summary(pc.regress2)
## 
## Call:
## lm(formula = form, data = df.tot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3862 -0.0669 -0.0039  0.0580  0.3574 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.81e+00   4.50e-01    6.23  1.2e-09 ***
## PopGroup1    1.03e-02   2.13e-02    0.48  0.62906    
## PopGroup2   -6.49e-05   1.69e-02    0.00  0.99694    
## PopGroup3   -1.07e-02   1.42e-02   -0.75  0.45242    
## PopGroup4    2.67e-03   1.37e-02    0.20  0.84532    
## bio1         1.25e-03   4.89e-03    0.26  0.79840    
## bio6         2.86e-04   2.81e-03    0.10  0.91912    
## bio8         1.73e-04   8.24e-05    2.10  0.03598 *  
## bio11       -1.60e-03   6.06e-03   -0.26  0.79208    
## bio18        1.33e-03   3.41e-04    3.91  0.00011 ***
## bio4        -2.21e-05   7.37e-05   -0.30  0.76399    
## bio7        -7.76e-04   1.41e-03   -0.55  0.58169    
## lats        -1.48e-02   8.38e-03   -1.76  0.07892 .  
## elev_m      -1.84e-04   4.52e-05   -4.08  5.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.103 on 377 degrees of freedom
## Multiple R-squared:  0.196,  Adjusted R-squared:  0.168 
## F-statistic: 7.07 on 13 and 377 DF,  p-value: 2.53e-12

19.6% of genetic variation in flowering time under common garden conditions is explained by significant effects of Bio8 (Mean Temperature of Wettest Quarter), Bio18 (Precipitation of Warmest Quarter), and elevation.


PC1 has 9 loadings with abs values >= 0.6. Now test these 9 contributors to PC1, plus latitude and elevation:

form <- "logflrdat ~ lats + elev_m + bio1 + bio5 + bio10 + bio12 + bio13 + bio14 + bio16 + bio17 + bio19"
pc1.regress <- lm(form , data = df.tot)
summary(pc1.regress)
## 
## Call:
## lm(formula = form, data = df.tot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3910 -0.0698 -0.0032  0.0592  0.3565 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.17e+00   4.30e-01    7.36  1.1e-12 ***
## lats        -2.26e-02   5.91e-03   -3.83  0.00015 ***
## elev_m      -2.09e-04   4.47e-05   -4.68  4.0e-06 ***
## bio1         1.11e-03   1.57e-03    0.71  0.47932    
## bio5        -1.12e-03   1.14e-03   -0.99  0.32483    
## bio10       -6.08e-04   1.48e-03   -0.41  0.68129    
## bio12        1.53e-04   3.79e-04    0.40  0.68672    
## bio13        3.81e-03   3.00e-03    1.27  0.20550    
## bio14        4.78e-03   3.16e-03    1.52  0.13024    
## bio16       -2.64e-04   1.32e-03   -0.20  0.84089    
## bio17       -5.54e-04   1.32e-03   -0.42  0.67466    
## bio19       -1.48e-03   2.74e-04   -5.40  1.1e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.102 on 379 degrees of freedom
## Multiple R-squared:  0.199,  Adjusted R-squared:  0.176 
## F-statistic: 8.56 on 11 and 379 DF,  p-value: 1.52e-13

18.5% of genetic variation in flowering time under common garden conditions is explained by latitude and elevation, plus the Bioclim variables that load on PC1. Only one of the Bio columns is significant.


PC2 has 5 loadings with abs values >= 0.6. Now test these 5 contributors to PC2, plus latitude and elevation:

form <- "logflrdat ~ lats + elev_m + bio1 + bio4 + bio6 + bio7 + bio11"
pc2.regress <- lm(form , data = df.tot)
summary(pc2.regress)
## 
## Call:
## lm(formula = form, data = df.tot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4118 -0.0749 -0.0007  0.0611  0.3704 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.48e+00   4.29e-01    5.78  1.5e-08 ***
## lats         1.78e-03   7.47e-03    0.24  0.81231    
## elev_m      -1.57e-04   4.48e-05   -3.49  0.00053 ***
## bio1         1.70e-03   4.86e-03    0.35  0.72732    
## bio4         6.64e-05   7.18e-05    0.93  0.35535    
## bio6        -7.78e-03   2.32e-03   -3.35  0.00088 ***
## bio7        -4.81e-03   1.16e-03   -4.13  4.4e-05 ***
## bio11        6.83e-03   6.00e-03    1.14  0.25543    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.106 on 383 degrees of freedom
## Multiple R-squared:  0.127,  Adjusted R-squared:  0.111 
## F-statistic: 7.94 on 7 and 383 DF,  p-value: 5.26e-09

12.2% of genetic variation in flowering time under common garden conditions is explained by latitude and elevation, plus the Bioclim variables that load on PC2. Two of the Bio columns are significant.


Using the significant predictors from the previous two models, plus PopGroup:

form <- "logflrdat ~ PopGroup + lats + elev_m + bio6 + bio7 + bio19"
options(contrasts = c("contr.sum","contr.poly"))
pc.regress2 <- lm(form, data=df.tot)
drop1(pc.regress2, .~., test="F")
## Single term deletions
## 
## Model:
## logflrdat ~ PopGroup + lats + elev_m + bio6 + bio7 + bio19
##          Df Sum of Sq  RSS   AIC F value  Pr(>F)    
## <none>                4.17 -1755                    
## PopGroup  4    0.1084 4.28 -1753    2.47 0.04402 *  
## lats      1    0.1664 4.34 -1742   15.19 0.00011 ***
## elev_m    1    0.2717 4.44 -1733   24.81 9.6e-07 ***
## bio6      1    0.0556 4.23 -1752    5.08 0.02479 *  
## bio7      1    0.1493 4.32 -1744   13.64 0.00025 ***
## bio19     1    0.1295 4.30 -1745   11.83 0.00065 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Or alternatively:

form <- "logflrdat ~ PopGroup + lats + elev_m + bio6 + bio7 + bio19"
pc.min.regress <- lm(form , data = df.tot)
summary(pc.min.regress)
## 
## Call:
## lm(formula = form, data = df.tot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3935 -0.0677 -0.0061  0.0593  0.3536 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.57e+00   4.14e-01    8.61  < 2e-16 ***
## PopGroup1    1.07e-03   2.10e-02    0.05  0.95937    
## PopGroup2    3.34e-02   1.38e-02    2.41  0.01635 *  
## PopGroup3   -2.69e-02   1.24e-02   -2.17  0.03036 *  
## PopGroup4   -1.95e-03   1.25e-02   -0.16  0.87548    
## lats        -1.88e-02   4.82e-03   -3.90  0.00011 ***
## elev_m      -2.22e-04   4.45e-05   -4.98  9.6e-07 ***
## bio6        -1.46e-03   6.48e-04   -2.25  0.02479 *  
## bio7        -2.22e-03   6.02e-04   -3.69  0.00025 ***
## bio19       -5.20e-04   1.51e-04   -3.44  0.00065 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.105 on 381 degrees of freedom
## Multiple R-squared:  0.16,   Adjusted R-squared:  0.141 
## F-statistic: 8.09 on 9 and 381 DF,  p-value: 5.28e-11

Biological conclusions:

13.4% of genetic variation for flowering time in common garden conditions is explained by temperature (minimum temp in coldest month [Bio6] and annual temperature range [Bio7]), as well as latitude, elevation, and winter precipitation [Bio19].

Direction of effects: genotypes flower earlier at higher elevation and more northerly latitude (i.e., colder climates and shorter growing season.) Places with colder winters [Bio6] or wider annual temperature range have earlier flowering genotypes. Boechera stricta habitats with higher levels of winter precipitation [Bio19] have earlier flowering genotypes. Thus, areas with more winter snow are associated with earlier flowering genotypes.

Arabidopsis in Spain shows the opposite relationship between elevation and flowering time, where they flower earlier at lower elevation (Montesinos-Navarro, 2011). Probably this is because the growing season is shorter at low elevation, where drought comes earlier in the year.

Montesinos-Navarro, A., et al. (2011). Arabidopsis thaliana populations show clinal variation in a climatic gradient associated with altitude." New Phytologist 189: 282-294.