Program to merge the BLUPS from each experiment. Then compute correlations among envirns, and PCA for flowering experiments.

The Zscore between W2 & W3 data sets is NOT used.

library(tidyverse)
library(GGally)
library(cowplot)

norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
  hist.x <- sort(na.omit(data.vec))
  hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
  hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
  df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
  ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
    geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
    labs(x = xlab, y = "Frequency")
}

Input genotype means and BLUPs for each experiment for Log Flowering Age:

# Input file names shown here:
in_name1 <- "Expt1_Emily_LogFlr_BLUPS.txt"
infile1 <- file.path(path.expand("~"),"FlrGWAS","output", in_name1)
df1 <- read_tsv(infile1)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanE1 = col_double(),
##   RP_blupE1 = col_double()
## )
dim(df1)
## [1] 380   3

in_name2 <- "Wenjie_first_LogFlr_BLUPS.txt"
infile2 <- file.path(path.expand("~"),"FlrGWAS","output", in_name2)
df2 <- read_tsv(infile2)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanW1 = col_double(),
##   RP_blupW1 = col_double()
## )
dim(df2)
## [1] 402   3

in_name3 <- "Wenjie_second_LogFlr_BLUPS.txt"
infile3 <- file.path(path.expand("~"),"FlrGWAS","output", in_name3)
df3 <- read_tsv(infile3)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanW2 = col_double(),
##   RP_blupW2 = col_double()
## )
dim(df3)
## [1] 356   3

in_name4 <- "Wenjie_third_LogFlr_BLUPS.txt"
infile4 <- file.path(path.expand("~"),"FlrGWAS","output", in_name4)
df4 <- read_tsv(infile4)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanW3 = col_double(),
##   RP_blupW3 = col_double()
## )
dim(df4)
## [1] 329   3

in_name5 <- "Wenjie_fourth_LogFlr_BLUPS.txt"
infile5 <- file.path(path.expand("~"),"FlrGWAS","output", in_name5)
df5 <- read_tsv(infile5)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_meanW4 = col_double(),
##   RP_blupW4 = col_double()
## )
dim(df5)
## [1] 356   3

in_name6 <- "Expt_BSW_Set1_LogFlr_BLUPS.txt"
infile6 <- file.path(path.expand("~"),"FlrGWAS","output", in_name6)
df6 <- read_tsv(infile6)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_mean1 = col_double(),
##   RP_blup1 = col_double()
## )
df6 <- df6 %>% select(ID, RP_blup1)
dim(df6)
## [1] 384   2

# We don't use Set2, becasue too many families lacked flowering plants

in_name7 <- "Expt_BSW_Set3_LogFlr_BLUPS.txt"
infile7 <- file.path(path.expand("~"),"FlrGWAS","output", in_name7)
df7 <- read_tsv(infile7)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_mean3 = col_double(),
##   RP_blup3 = col_double()
## )
df7 <- df7 %>% select(ID, RP_blup3)
dim(df7)
## [1] 390   2

in_name8 <- "Expt_BSW_Set4_LogFlr_BLUPS.txt"
infile8 <- file.path(path.expand("~"),"FlrGWAS","output", in_name8)
df8 <- read_tsv(infile8)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_mean4 = col_double(),
##   RP_blup4 = col_double()
## )
df8 <- df8 %>% select(ID, RP_blup4)
dim(df8)
## [1] 430   2

in_name9 <- "Expt_BSW_Set5_LogFlr_BLUPS.txt"
infile9 <- file.path(path.expand("~"),"FlrGWAS","output", in_name9)
df9 <- read_tsv(infile9)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_mean5 = col_double(),
##   RP_blup5 = col_double()
## )
df9 <- df9 %>% select(ID, RP_blup5)
dim(df9)
## [1] 432   2

in_name10 <- "Expt_BSW_Set6_LogFlr_BLUPS.txt"
infile10 <- file.path(path.expand("~"),"FlrGWAS","output", in_name10)
df10 <- read_tsv(infile10)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   RP_mean6 = col_double(),
##   RP_blup6 = col_double()
## )
df10 <- df10 %>% select(ID, RP_blup6)
dim(df10)
## [1] 424   2

Merge results from each experiment:

df12           <- full_join(df1, df2, by = "ID")
df123          <- full_join(df12, df3, by = "ID")
df1234         <- full_join(df123, df4, by = "ID")
df12345        <- full_join(df1234, df5, by = "ID")
df123456       <- full_join(df12345, df6, by = "ID")
df1234567      <- full_join(df123456, df7, by = "ID")
df12345678     <- full_join(df1234567, df8, by = "ID")
df123456789    <- full_join(df12345678, df9, by = "ID")
df12345678910  <- full_join(df123456789, df10, by = "ID")

Prepare and save BLUP data frame

df.blups <- df12345678910 %>% 
                   select("ID", "RP_blupE1", "RP_blupW1", "RP_blupW2", 
                          "RP_blupW3", "RP_blupW4", "RP_blup1", "RP_blup3", 
                          "RP_blup4", "RP_blup5", "RP_blup6")
names(df.blups)
##  [1] "ID"        "RP_blupE1" "RP_blupW1" "RP_blupW2" "RP_blupW3" "RP_blupW4"
##  [7] "RP_blup1"  "RP_blup3"  "RP_blup4"  "RP_blup5"  "RP_blup6"

#Output merged BLUP data frame
file_out <- "LogFlr_BLUPs_10_Expts.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.blups, out_name)
num_genos <- nrow(df.blups)

# Get genotypes that have flowering data in all environments
df.blups.all <- na.omit(df.blups)
(num_genos_in_all <- nrow(df.blups.all))
## [1] 133

The total number of genotypes used is 488. The number of genotypes with data from all environments is 133.


Correlation of genotype BLUPs across environments:

df.BLUPs.merged <- df.blups %>% select(-ID)

names(df.BLUPs.merged) <- c("E1", "W1", "W2", "W3", "W4", "B1", "B3", "B4", "B5", "B6")
  
(corrs <- cor(df.BLUPs.merged, use = "pairwise.complete.obs"))
##        E1     W1     W2     W3     W4     B1     B3     B4     B5     B6
## E1 1.0000 0.6009 0.5418 0.5501 0.7625 0.6079 0.7031 0.7201 0.6896 0.6866
## W1 0.6009 1.0000 0.5001 0.4347 0.6438 0.5232 0.6319 0.5788 0.6131 0.6060
## W2 0.5418 0.5001 1.0000 0.3844 0.6205 0.5202 0.6350 0.5712 0.5816 0.5640
## W3 0.5501 0.4347 0.3844 1.0000 0.5594 0.4401 0.4312 0.5128 0.4371 0.4809
## W4 0.7625 0.6438 0.6205 0.5594 1.0000 0.5522 0.6852 0.6879 0.6968 0.6696
## B1 0.6079 0.5232 0.5202 0.4401 0.5522 1.0000 0.6165 0.6173 0.6099 0.6035
## B3 0.7031 0.6319 0.6350 0.4312 0.6852 0.6165 1.0000 0.6946 0.7224 0.6736
## B4 0.7201 0.5788 0.5712 0.5128 0.6879 0.6173 0.6946 1.0000 0.8129 0.7606
## B5 0.6896 0.6131 0.5816 0.4371 0.6968 0.6099 0.7224 0.8129 1.0000 0.8192
## B6 0.6866 0.6060 0.5640 0.4809 0.6696 0.6035 0.6736 0.7606 0.8192 1.0000

min.corrs <- min(corrs)

ggpairs(df.BLUPs.merged)


ggsave("ggpairs.png")
## Saving 7 x 5 in image

Environments W2 & W3 have the lowest correlation, r = 0.3844.

Conclusion: Environment W3 shows the lowest correlation with other environments. And flowering of plants in their first versus second season does not show unusually low correlations.


Get dataframe with W2 & W3 data sets:

# Dataframe for least similar experiments, W2 & W3:
df.W2W3 <- df12345678910 %>% select("ID", "RP_blupW2", "RP_blupW3")
df.W2W3 <- na.omit(df.W2W3) # Only non-missing families

glimpse(df.W2W3)
## Rows: 244
## Columns: 3
## $ ID        <chr> "RP003", "RP008", "RP009", "RP010", "RP012", "RP014", "RP01…
## $ RP_blupW2 <dbl> -0.092047, 0.013621, -0.106506, -0.062587, -0.060899, -0.02…
## $ RP_blupW3 <dbl> -0.078790, 0.074965, -0.071868, -0.141099, -0.075685, -0.07…
num.genos.W2W3 <- nrow(df.W2W3) # number of genotypes in common

For W2 & W3, the number of genotypes in common is 244.

Get difference between standarized (z-score) flowering times in W2 and W3 (NOT USED).

df.W2W3 <- df.W2W3 %>% rename(W2 = RP_blupW2,
                              W3 = RP_blupW3) %>%
                       mutate(zW2      = (W2 - mean(W2)) / sd(W2),
                              zW3      = (W3 - mean(W3)) / sd(W3),
                              diffW2W3 = zW2 - zW3)
df.W2W3 <- df.W2W3 %>% mutate(Zdiff    = diffW2W3 / sd(diffW2W3))

norm.hist(df.W2W3$Zdiff, "Differences between Standarized flowering ages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

For Zdiff, mean = 0 and sd = 1. Next, output the standardized difference:

df.W2W3  <- df.W2W3 %>% select(ID, Zdiff)
file_out <- "LogFlr_BLUPs_Zscore_W2W3.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.W2W3, out_name)

Now, get dataframe with W1 & W3 data sets:

# Dataframe for experiments W1 & W3:
df.W1W3 <- df12345678910 %>% select("ID", "RP_blupW1", "RP_blupW3")
df.W1W3 <- na.omit(df.W1W3) # Only non-missing families

glimpse(df.W1W3)
## Rows: 275
## Columns: 3
## $ ID        <chr> "RP003", "RP008", "RP009", "RP010", "RP012", "RP014", "RP01…
## $ RP_blupW1 <dbl> -0.094013, 0.071776, -0.134949, -0.107275, 0.019756, -0.054…
## $ RP_blupW3 <dbl> -0.078790, 0.074965, -0.071868, -0.141099, -0.075685, -0.07…
num.genos.W1W3 <- nrow(df.W1W3) # number of genotypes in common

For W1 & W3, the number of genotypes in common is 275.

Get difference between standarized (z-score) flowering times in W1 and W3:

df.W1W3 <- df.W1W3 %>% rename(W1 = RP_blupW1,
                              W3 = RP_blupW3) %>%
                       mutate(zW1      = (W1 - mean(W1)) / sd(W1),
                              zW3      = (W3 - mean(W3)) / sd(W3),
                              diffW1W3 = zW1 - zW3)
df.W1W3 <- df.W1W3 %>% mutate(Zdiff    = diffW1W3 / sd(diffW1W3))

norm.hist(df.W1W3$Zdiff, "Differences between Standarized flowering ages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

For Zdiff, mean = 0 and sd = 1. Next, output the standardized difference:

df.W1W3  <- df.W1W3 %>% select(ID, Zdiff)
file_out <- "LogFlr_BLUPs_Zscore_W1W3.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.W1W3, out_name)

Next, we do Principal Components Analysis on the BLUPs for flowering time in these six environments, based on the zero-centered, standardized correlation matrix.

# PCA on zero-centered, standardized correlation matrix
blup.pca <- prcomp(df.blups.all[, 2:11], center=TRUE, retx=TRUE, scale.=TRUE)
    # center=TRUE mean centers the data
    # retx=TRUE returns the PC scores
    # to do PCA on the correlation matrix set scale.=TRUE
    #    -- notice the period after scale!

summary(blup.pca)
## Importance of components:
##                          PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
## Standard deviation     2.592 0.9147 0.6906 0.6601 0.6333 0.5683 0.5028 0.4613
## Proportion of Variance 0.672 0.0837 0.0477 0.0436 0.0401 0.0323 0.0253 0.0213
## Cumulative Proportion  0.672 0.7554 0.8032 0.8467 0.8868 0.9191 0.9444 0.9657
##                          PC9   PC10
## Standard deviation     0.436 0.3910
## Proportion of Variance 0.019 0.0153
## Cumulative Proportion  0.985 1.0000

Get factor loadings

V <- blup.pca$rotation # eigenvectors
L <- diag(blup.pca$sdev) # diag mtx w/sqrts of eigenvalues on diag.

blup.loadings <- V %*% L
blup.loadings[, 1:5]
##             [,1]      [,2]     [,3]     [,4]     [,5]
## RP_blupE1 0.8512 -0.006459 -0.12113  0.08656 -0.06521
## RP_blupW1 0.7814 -0.045633  0.50340  0.29863 -0.02606
## RP_blupW2 0.7891  0.113872  0.14139 -0.55029 -0.08168
## RP_blupW3 0.4639 -0.875398 -0.09077 -0.06233  0.01161
## RP_blupW4 0.8656 -0.040427  0.18090 -0.06883  0.18265
## RP_blup1  0.8229  0.047240 -0.21592  0.12629 -0.45118
## RP_blup3  0.8784  0.061052  0.11649 -0.03279 -0.18654
## RP_blup4  0.8762  0.166991 -0.21144  0.01475  0.14665
## RP_blup5  0.8921  0.136017 -0.17257  0.07877  0.17799
## RP_blup6  0.8851  0.036402 -0.11496  0.06480  0.25388

PC1 (67.2% of var) is positively loaded on flowering time in all environments.

PC2 (8.4% of var) identifies a contrast between flowering times in W3 versus other environments.

PC3 (4.8% of var) identifies a contrast between flowering times in W1 versus B1 and B4.


Prepare pairwise plots of PC1, PC2, and PC3

pca.scores.df <- as.data.frame(blup.pca$x)[, 1:3]
head(pca.scores.df)
##       PC1      PC2      PC3
## 1 -4.0628 -0.05986  0.09553
## 2  2.6888 -0.57693  0.20058
## 3 -3.3935 -0.11459 -0.31889
## 4 -2.1473  0.10405  0.85464
## 5 -1.1907  0.60518 -0.10637
## 6 -0.9231 -0.54707 -0.44067
# glimpse(pca.scores.df)
# summarise_all(pca.scores.df, list(mean, sd))
summarise_all(pca.scores.df, list(min, max))
##   PC1_fn1 PC2_fn1 PC3_fn1 PC1_fn2 PC2_fn2 PC3_fn2
## 1  -6.083  -4.802   -1.81   6.176   2.403   2.242
# cor(pca.scores.df)

coord.system <- coord_fixed(ratio=1, xlim=c(-6.5,6.5),ylim=c(-6.5,6.5))

pc1v2 <-
  pca.scores.df %>%
  ggplot(aes(x = PC1, y= PC2)) +
  geom_point() +
  coord.system + theme_classic()

pc1v3 <-
  pca.scores.df %>%
  ggplot(aes(x = PC1, y= PC3)) +
  geom_point() +
  coord.system + theme_classic() 

pc2v3 <-
  pca.scores.df %>%
  ggplot(aes(x = PC2, y= PC3)) +
  geom_point() +
  coord.system + theme_classic() 

cowplot::plot_grid(pc1v2, pc1v3, pc2v3, nrow = 1)

In the plots above we show the position of each genotype in the new principal component axes (a rotation of axes, viewing from a new direction).


Prepare for biplot

loadings.1and2 <-
  data.frame(blup.loadings[,1:2]) %>%
  rename(PC1.loading = X1, PC2.loading = X2)

loadings.1and2$variable <- c("EC", "W1", "W2", "W3", "W4", "B1", "B3", "B4", "B5", "B6")

loadings.1and2
##           PC1.loading PC2.loading variable
## RP_blupE1      0.8512   -0.006459       EC
## RP_blupW1      0.7814   -0.045633       W1
## RP_blupW2      0.7891    0.113872       W2
## RP_blupW3      0.4639   -0.875398       W3
## RP_blupW4      0.8656   -0.040427       W4
## RP_blup1       0.8229    0.047240       B1
## RP_blup3       0.8784    0.061052       B3
## RP_blup4       0.8762    0.166991       B4
## RP_blup5       0.8921    0.136017       B5
## RP_blup6       0.8851    0.036402       B6

Draw biplot

stretch = 4 # How much to stretch the experiment arrows for clarity
pc1v2.biplot <-
  pc1v2 +
  geom_segment(data=loadings.1and2,
               aes(x = 0, y = 0, xend = stretch*PC1.loading, 
                                 yend = stretch*PC2.loading),
               color='red', 
               arrow = arrow(angle=15, length=unit(0.1,"inches"))) # +
  # Labels are too clustered, so do not include
  # geom_text(data=loadings.1and2,
  #           aes(x = stretch*PC1.loading, y = stretch*PC2.loading, label=variable),
  #           color='red', nudge_x = 0.5, nudge_y = 0)
 
pc1v2.biplot


biplot.file <- file.path(path.expand("~"),"FlrGWAS","output", "BiPlot.png")
png(file=biplot.file, width=600, height=600)
dev.off()
## quartz_off_screen 
##                 2

This figure shows both the BLUPs and the environmental influences on flowering in a single plot. We can see that PC1 is highly correlated with effects of all environments except W3, which has rather different effects on flowering time. (For visibility, arrows have been stretched by a factor of 4.) Together, PC1 + PC2 summarize 75.5% of genetic variance in flowering time.