Program to merge the BLUPS from each experiment. Then compute correlations among envirns, and PCA for flowering experiments.
The Zscore between W2 & W3 data sets is NOT used.
library(tidyverse)
library(GGally)
library(cowplot)
norm.hist <- function(data.vec, xlab){ # function to draw normal density plot
hist.x <- sort(na.omit(data.vec))
hist.seq <- seq(min(hist.x), max(hist.x), length.out=length(hist.x))
hist.d <- dnorm(hist.seq, mean(hist.x), sd(hist.x))
df.hist <- data.frame(x = hist.x, seq = hist.seq, d = hist.d)
ggplot(df.hist) + geom_histogram(aes(x = x, y = ..density..)) +
geom_line(aes(x = seq, y = d), colour = "red", size = 1.2) +
labs(x = xlab, y = "Frequency")
}
Input genotype means and BLUPs for each experiment for Log Flowering Age:
# Input file names shown here:
in_name1 <- "Expt1_Emily_LogFlr_BLUPS.txt"
infile1 <- file.path(path.expand("~"),"FlrGWAS","output", in_name1)
df1 <- read_tsv(infile1)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanE1 = col_double(),
## RP_blupE1 = col_double()
## )
dim(df1)
## [1] 380 3
in_name2 <- "Wenjie_first_LogFlr_BLUPS.txt"
infile2 <- file.path(path.expand("~"),"FlrGWAS","output", in_name2)
df2 <- read_tsv(infile2)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanW1 = col_double(),
## RP_blupW1 = col_double()
## )
dim(df2)
## [1] 402 3
in_name3 <- "Wenjie_second_LogFlr_BLUPS.txt"
infile3 <- file.path(path.expand("~"),"FlrGWAS","output", in_name3)
df3 <- read_tsv(infile3)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanW2 = col_double(),
## RP_blupW2 = col_double()
## )
dim(df3)
## [1] 356 3
in_name4 <- "Wenjie_third_LogFlr_BLUPS.txt"
infile4 <- file.path(path.expand("~"),"FlrGWAS","output", in_name4)
df4 <- read_tsv(infile4)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanW3 = col_double(),
## RP_blupW3 = col_double()
## )
dim(df4)
## [1] 329 3
in_name5 <- "Wenjie_fourth_LogFlr_BLUPS.txt"
infile5 <- file.path(path.expand("~"),"FlrGWAS","output", in_name5)
df5 <- read_tsv(infile5)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_meanW4 = col_double(),
## RP_blupW4 = col_double()
## )
dim(df5)
## [1] 356 3
in_name6 <- "Expt_BSW_Set1_LogFlr_BLUPS.txt"
infile6 <- file.path(path.expand("~"),"FlrGWAS","output", in_name6)
df6 <- read_tsv(infile6)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_mean1 = col_double(),
## RP_blup1 = col_double()
## )
df6 <- df6 %>% select(ID, RP_blup1)
dim(df6)
## [1] 384 2
# We don't use Set2, becasue too many families lacked flowering plants
in_name7 <- "Expt_BSW_Set3_LogFlr_BLUPS.txt"
infile7 <- file.path(path.expand("~"),"FlrGWAS","output", in_name7)
df7 <- read_tsv(infile7)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_mean3 = col_double(),
## RP_blup3 = col_double()
## )
df7 <- df7 %>% select(ID, RP_blup3)
dim(df7)
## [1] 390 2
in_name8 <- "Expt_BSW_Set4_LogFlr_BLUPS.txt"
infile8 <- file.path(path.expand("~"),"FlrGWAS","output", in_name8)
df8 <- read_tsv(infile8)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_mean4 = col_double(),
## RP_blup4 = col_double()
## )
df8 <- df8 %>% select(ID, RP_blup4)
dim(df8)
## [1] 430 2
in_name9 <- "Expt_BSW_Set5_LogFlr_BLUPS.txt"
infile9 <- file.path(path.expand("~"),"FlrGWAS","output", in_name9)
df9 <- read_tsv(infile9)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_mean5 = col_double(),
## RP_blup5 = col_double()
## )
df9 <- df9 %>% select(ID, RP_blup5)
dim(df9)
## [1] 432 2
in_name10 <- "Expt_BSW_Set6_LogFlr_BLUPS.txt"
infile10 <- file.path(path.expand("~"),"FlrGWAS","output", in_name10)
df10 <- read_tsv(infile10)
## Parsed with column specification:
## cols(
## ID = col_character(),
## RP_mean6 = col_double(),
## RP_blup6 = col_double()
## )
df10 <- df10 %>% select(ID, RP_blup6)
dim(df10)
## [1] 424 2
Merge results from each experiment:
df12 <- full_join(df1, df2, by = "ID")
df123 <- full_join(df12, df3, by = "ID")
df1234 <- full_join(df123, df4, by = "ID")
df12345 <- full_join(df1234, df5, by = "ID")
df123456 <- full_join(df12345, df6, by = "ID")
df1234567 <- full_join(df123456, df7, by = "ID")
df12345678 <- full_join(df1234567, df8, by = "ID")
df123456789 <- full_join(df12345678, df9, by = "ID")
df12345678910 <- full_join(df123456789, df10, by = "ID")
Prepare and save BLUP data frame
df.blups <- df12345678910 %>%
select("ID", "RP_blupE1", "RP_blupW1", "RP_blupW2",
"RP_blupW3", "RP_blupW4", "RP_blup1", "RP_blup3",
"RP_blup4", "RP_blup5", "RP_blup6")
names(df.blups)
## [1] "ID" "RP_blupE1" "RP_blupW1" "RP_blupW2" "RP_blupW3" "RP_blupW4"
## [7] "RP_blup1" "RP_blup3" "RP_blup4" "RP_blup5" "RP_blup6"
#Output merged BLUP data frame
file_out <- "LogFlr_BLUPs_10_Expts.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.blups, out_name)
num_genos <- nrow(df.blups)
# Get genotypes that have flowering data in all environments
df.blups.all <- na.omit(df.blups)
(num_genos_in_all <- nrow(df.blups.all))
## [1] 133
The total number of genotypes used is 488. The number of genotypes with data from all environments is 133.
Correlation of genotype BLUPs across environments:
df.BLUPs.merged <- df.blups %>% select(-ID)
names(df.BLUPs.merged) <- c("E1", "W1", "W2", "W3", "W4", "B1", "B3", "B4", "B5", "B6")
(corrs <- cor(df.BLUPs.merged, use = "pairwise.complete.obs"))
## E1 W1 W2 W3 W4 B1 B3 B4 B5 B6
## E1 1.0000 0.6009 0.5418 0.5501 0.7625 0.6079 0.7031 0.7201 0.6896 0.6866
## W1 0.6009 1.0000 0.5001 0.4347 0.6438 0.5232 0.6319 0.5788 0.6131 0.6060
## W2 0.5418 0.5001 1.0000 0.3844 0.6205 0.5202 0.6350 0.5712 0.5816 0.5640
## W3 0.5501 0.4347 0.3844 1.0000 0.5594 0.4401 0.4312 0.5128 0.4371 0.4809
## W4 0.7625 0.6438 0.6205 0.5594 1.0000 0.5522 0.6852 0.6879 0.6968 0.6696
## B1 0.6079 0.5232 0.5202 0.4401 0.5522 1.0000 0.6165 0.6173 0.6099 0.6035
## B3 0.7031 0.6319 0.6350 0.4312 0.6852 0.6165 1.0000 0.6946 0.7224 0.6736
## B4 0.7201 0.5788 0.5712 0.5128 0.6879 0.6173 0.6946 1.0000 0.8129 0.7606
## B5 0.6896 0.6131 0.5816 0.4371 0.6968 0.6099 0.7224 0.8129 1.0000 0.8192
## B6 0.6866 0.6060 0.5640 0.4809 0.6696 0.6035 0.6736 0.7606 0.8192 1.0000
min.corrs <- min(corrs)
ggpairs(df.BLUPs.merged)
ggsave("ggpairs.png")
## Saving 7 x 5 in image
Environments W2 & W3 have the lowest correlation, r = 0.3844.
Conclusion: Environment W3 shows the lowest correlation with other environments. And flowering of plants in their first versus second season does not show unusually low correlations.
Get dataframe with W2 & W3 data sets:
# Dataframe for least similar experiments, W2 & W3:
df.W2W3 <- df12345678910 %>% select("ID", "RP_blupW2", "RP_blupW3")
df.W2W3 <- na.omit(df.W2W3) # Only non-missing families
glimpse(df.W2W3)
## Rows: 244
## Columns: 3
## $ ID <chr> "RP003", "RP008", "RP009", "RP010", "RP012", "RP014", "RP01…
## $ RP_blupW2 <dbl> -0.092047, 0.013621, -0.106506, -0.062587, -0.060899, -0.02…
## $ RP_blupW3 <dbl> -0.078790, 0.074965, -0.071868, -0.141099, -0.075685, -0.07…
num.genos.W2W3 <- nrow(df.W2W3) # number of genotypes in common
For W2 & W3, the number of genotypes in common is 244.
Get difference between standarized (z-score) flowering times in W2 and W3 (NOT USED).
df.W2W3 <- df.W2W3 %>% rename(W2 = RP_blupW2,
W3 = RP_blupW3) %>%
mutate(zW2 = (W2 - mean(W2)) / sd(W2),
zW3 = (W3 - mean(W3)) / sd(W3),
diffW2W3 = zW2 - zW3)
df.W2W3 <- df.W2W3 %>% mutate(Zdiff = diffW2W3 / sd(diffW2W3))
norm.hist(df.W2W3$Zdiff, "Differences between Standarized flowering ages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For Zdiff, mean = 0 and sd = 1. Next, output the standardized difference:
df.W2W3 <- df.W2W3 %>% select(ID, Zdiff)
file_out <- "LogFlr_BLUPs_Zscore_W2W3.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.W2W3, out_name)
Now, get dataframe with W1 & W3 data sets:
# Dataframe for experiments W1 & W3:
df.W1W3 <- df12345678910 %>% select("ID", "RP_blupW1", "RP_blupW3")
df.W1W3 <- na.omit(df.W1W3) # Only non-missing families
glimpse(df.W1W3)
## Rows: 275
## Columns: 3
## $ ID <chr> "RP003", "RP008", "RP009", "RP010", "RP012", "RP014", "RP01…
## $ RP_blupW1 <dbl> -0.094013, 0.071776, -0.134949, -0.107275, 0.019756, -0.054…
## $ RP_blupW3 <dbl> -0.078790, 0.074965, -0.071868, -0.141099, -0.075685, -0.07…
num.genos.W1W3 <- nrow(df.W1W3) # number of genotypes in common
For W1 & W3, the number of genotypes in common is 275.
Get difference between standarized (z-score) flowering times in W1 and W3:
df.W1W3 <- df.W1W3 %>% rename(W1 = RP_blupW1,
W3 = RP_blupW3) %>%
mutate(zW1 = (W1 - mean(W1)) / sd(W1),
zW3 = (W3 - mean(W3)) / sd(W3),
diffW1W3 = zW1 - zW3)
df.W1W3 <- df.W1W3 %>% mutate(Zdiff = diffW1W3 / sd(diffW1W3))
norm.hist(df.W1W3$Zdiff, "Differences between Standarized flowering ages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For Zdiff, mean = 0 and sd = 1. Next, output the standardized difference:
df.W1W3 <- df.W1W3 %>% select(ID, Zdiff)
file_out <- "LogFlr_BLUPs_Zscore_W1W3.txt"
out_name <- file.path(path.expand("~"),"FlrGWAS", "output", file_out)
write_tsv(df.W1W3, out_name)
Next, we do Principal Components Analysis on the BLUPs for flowering time in these six environments, based on the zero-centered, standardized correlation matrix.
# PCA on zero-centered, standardized correlation matrix
blup.pca <- prcomp(df.blups.all[, 2:11], center=TRUE, retx=TRUE, scale.=TRUE)
# center=TRUE mean centers the data
# retx=TRUE returns the PC scores
# to do PCA on the correlation matrix set scale.=TRUE
# -- notice the period after scale!
summary(blup.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 2.592 0.9147 0.6906 0.6601 0.6333 0.5683 0.5028 0.4613
## Proportion of Variance 0.672 0.0837 0.0477 0.0436 0.0401 0.0323 0.0253 0.0213
## Cumulative Proportion 0.672 0.7554 0.8032 0.8467 0.8868 0.9191 0.9444 0.9657
## PC9 PC10
## Standard deviation 0.436 0.3910
## Proportion of Variance 0.019 0.0153
## Cumulative Proportion 0.985 1.0000
Get factor loadings
V <- blup.pca$rotation # eigenvectors
L <- diag(blup.pca$sdev) # diag mtx w/sqrts of eigenvalues on diag.
blup.loadings <- V %*% L
blup.loadings[, 1:5]
## [,1] [,2] [,3] [,4] [,5]
## RP_blupE1 0.8512 -0.006459 -0.12113 0.08656 -0.06521
## RP_blupW1 0.7814 -0.045633 0.50340 0.29863 -0.02606
## RP_blupW2 0.7891 0.113872 0.14139 -0.55029 -0.08168
## RP_blupW3 0.4639 -0.875398 -0.09077 -0.06233 0.01161
## RP_blupW4 0.8656 -0.040427 0.18090 -0.06883 0.18265
## RP_blup1 0.8229 0.047240 -0.21592 0.12629 -0.45118
## RP_blup3 0.8784 0.061052 0.11649 -0.03279 -0.18654
## RP_blup4 0.8762 0.166991 -0.21144 0.01475 0.14665
## RP_blup5 0.8921 0.136017 -0.17257 0.07877 0.17799
## RP_blup6 0.8851 0.036402 -0.11496 0.06480 0.25388
PC1 (67.2% of var) is positively loaded on flowering time in all environments.
PC2 (8.4% of var) identifies a contrast between flowering times in W3 versus other environments.
PC3 (4.8% of var) identifies a contrast between flowering times in W1 versus B1 and B4.
Prepare pairwise plots of PC1, PC2, and PC3
pca.scores.df <- as.data.frame(blup.pca$x)[, 1:3]
head(pca.scores.df)
## PC1 PC2 PC3
## 1 -4.0628 -0.05986 0.09553
## 2 2.6888 -0.57693 0.20058
## 3 -3.3935 -0.11459 -0.31889
## 4 -2.1473 0.10405 0.85464
## 5 -1.1907 0.60518 -0.10637
## 6 -0.9231 -0.54707 -0.44067
# glimpse(pca.scores.df)
# summarise_all(pca.scores.df, list(mean, sd))
summarise_all(pca.scores.df, list(min, max))
## PC1_fn1 PC2_fn1 PC3_fn1 PC1_fn2 PC2_fn2 PC3_fn2
## 1 -6.083 -4.802 -1.81 6.176 2.403 2.242
# cor(pca.scores.df)
coord.system <- coord_fixed(ratio=1, xlim=c(-6.5,6.5),ylim=c(-6.5,6.5))
pc1v2 <-
pca.scores.df %>%
ggplot(aes(x = PC1, y= PC2)) +
geom_point() +
coord.system + theme_classic()
pc1v3 <-
pca.scores.df %>%
ggplot(aes(x = PC1, y= PC3)) +
geom_point() +
coord.system + theme_classic()
pc2v3 <-
pca.scores.df %>%
ggplot(aes(x = PC2, y= PC3)) +
geom_point() +
coord.system + theme_classic()
cowplot::plot_grid(pc1v2, pc1v3, pc2v3, nrow = 1)
In the plots above we show the position of each genotype in the new principal component axes (a rotation of axes, viewing from a new direction).
Prepare for biplot
loadings.1and2 <-
data.frame(blup.loadings[,1:2]) %>%
rename(PC1.loading = X1, PC2.loading = X2)
loadings.1and2$variable <- c("EC", "W1", "W2", "W3", "W4", "B1", "B3", "B4", "B5", "B6")
loadings.1and2
## PC1.loading PC2.loading variable
## RP_blupE1 0.8512 -0.006459 EC
## RP_blupW1 0.7814 -0.045633 W1
## RP_blupW2 0.7891 0.113872 W2
## RP_blupW3 0.4639 -0.875398 W3
## RP_blupW4 0.8656 -0.040427 W4
## RP_blup1 0.8229 0.047240 B1
## RP_blup3 0.8784 0.061052 B3
## RP_blup4 0.8762 0.166991 B4
## RP_blup5 0.8921 0.136017 B5
## RP_blup6 0.8851 0.036402 B6
Draw biplot
stretch = 4 # How much to stretch the experiment arrows for clarity
pc1v2.biplot <-
pc1v2 +
geom_segment(data=loadings.1and2,
aes(x = 0, y = 0, xend = stretch*PC1.loading,
yend = stretch*PC2.loading),
color='red',
arrow = arrow(angle=15, length=unit(0.1,"inches"))) # +
# Labels are too clustered, so do not include
# geom_text(data=loadings.1and2,
# aes(x = stretch*PC1.loading, y = stretch*PC2.loading, label=variable),
# color='red', nudge_x = 0.5, nudge_y = 0)
pc1v2.biplot
biplot.file <- file.path(path.expand("~"),"FlrGWAS","output", "BiPlot.png")
png(file=biplot.file, width=600, height=600)
dev.off()
## quartz_off_screen
## 2
This figure shows both the BLUPs and the environmental influences on flowering in a single plot. We can see that PC1 is highly correlated with effects of all environments except W3, which has rather different effects on flowering time. (For visibility, arrows have been stretched by a factor of 4.) Together, PC1 + PC2 summarize 75.5% of genetic variance in flowering time.