--- title: "Insights into the genetic basis of predator-induced response in Daphnia" author: "VT, JHN, AE, MC, Universitaet Hamburg" date: "August 2020" output: html_document --- This analysis starts with the read counts after HTSeq-analysis. #### Differential gene expression analysis (DESeq2 analysis) This Rmd-file describes the differential gene expression analysis of two clonal lines (genotypes M6 & M9) of **Daphnia galeata** being exposed to two environmental conditions: kairomone-free water (control) or fish kairomone water (fish) as described in the manuscript. ```{r setup, include=FALSE} #setwd() source("https://bioconductor.org/biocLite.R") biocLite( c( "limma", "edgeR", "DESeq2", "vsn" ) ) biocLite("GenomeInfoDb") library("GenomeInfoDb") library("DESeq2") #vignette("DESeq2") ####Set options to save the figure output of the analysis #knitr::opts_chunk$set(fig.path = ) ####get package informations #sessionInfo() #citation(package="DESeq2") ``` ### 1. Preparation of raw data (HTseq) ##### 1.1 Create a DESeq-Count Table ```{r CountTable, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #Input1: count tables from HTSeq directory<- "Input/read_counts/" #Input2: Load Samplesheet SampleSheet<-read.csv("Input/read_counts/SampleSheet.csv", sep=",") SampleSheetM6<-read.csv("Input/read_counts/SampleSheetM6.csv", sep=",") SampleSheetM9<-read.csv("Input/read_counts/SampleSheetM9.csv", sep=",") #Create DESeq-CountTable #Reference level has been assigned by calling the design as ~environment ddsHTSeq1 <- DESeqDataSetFromHTSeqCount(sampleTable = SampleSheet, directory = directory, design= ~ environment) #[1] 32903 12 ddsHTSeq2<- DESeqDataSetFromHTSeqCount(sampleTable = SampleSheet, directory = directory, design= ~ genotype) #[1] 32903 12 #Create DESeq-Counttable GxE ddsHTSeq3<- DESeqDataSetFromHTSeqCount(sampleTable = SampleSheet, directory = directory, design= ~ genotype + environment + genotype:environment) #[1] 32903 12 #create a DESeq-CountTable per genotype #M6 ddsHTSeq6 <- DESeqDataSetFromHTSeqCount(sampleTable = SampleSheetM6, directory = directory, design= ~ environment) #[1] 32903 6 #M9 ddsHTSeq9 <- DESeqDataSetFromHTSeqCount(sampleTable = SampleSheetM9, directory = directory, design= ~ environment) #[1] 32903 6 ``` ##### 1.2 Pre-filtering of low count genes All transcripts with a read count lower than 12 are excluded. ```{r PreFilter, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #read counts >12 ddsHTSeq1<- ddsHTSeq1[ rowSums(counts(ddsHTSeq1)) > 11, ] # [1] 23982 12 ddsHTSeq2<-ddsHTSeq2[ rowSums(counts(ddsHTSeq2)) > 11, ] ##[1] 23982 12 ddsHTSeq3<-ddsHTSeq3[ rowSums(counts(ddsHTSeq3)) > 11, ] ##[1] 23982 12 ddsHTSeq6<-ddsHTSeq6[ rowSums(counts(ddsHTSeq6)) > 11, ] ##[1] 21740 12 ddsHTSeq9<-ddsHTSeq9[ rowSums(counts(ddsHTSeq9)) > 11, ] ##[1] 21813 12 ``` ### 2. Differential Gene Expression The differential expression analysis is based on the negative binomial (a.k.a. Gamma-Poisson) distribution. We accounted for multiple testing correction by applying posthoc filtering using a p-adjust value of 0.05 to reduce the false discovery rate (FDR) (Benjamini & Hochberg 1995). #### 2.1 Are there differentially expressed transcripts between environments? ```{r, DiffExpr_E, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #input: pre-filtered count table, design ~ environment #default analysis with the steps: #1. estimation of size factor: estimateSizeFactors #2. estimation of dispersion: estimateDispersions #3. negative binomial glm fitting and Wald statistics: nbinomWaldTest ddsHTSeq_de1<- DESeq(ddsHTSeq1) summary(ddsHTSeq_de1) res_de1<- results(ddsHTSeq_de1, na.rm=TRUE, contrast= c("environment", "fish", "control")) resultsNames(ddsHTSeq_de1) #[1] "Intercept" "environment_fish_vs_control" summary(res_de1) # no DETs, 115 outliers mcols(res_de1)$description #environment fish vs control #remove NAs res_de1<- res_de1[!is.na(res_de1$padj),] #order transcripts in results by the smallest adjusted p value: resOrdered1 <-res_de1[order(res_de1$padj),] dim(resOrdered1) #[1] 23867 6 #filter genes with log2foldchange >=1 & adjusted p value <0.05 resFiltered1<- res_de1[abs(res_de1$log2FoldChange)>=1 & (res_de1$padj < 0.05), na.rm=TRUE ] dim(resFiltered1) #[1] 0 6 => no differentially expressed transcripts by environment summary(resFiltered1) #=> no differentially expressed transcripts by environment ``` No, there are no differentially expressed transcripts (hereafter: DETs) between environments (control vs. fish). #### 2.2 Are there differentially expressed transcripts between genotypes? ```{r, DiffExpr_G, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} ddsHTSeq_de2<- DESeq(ddsHTSeq2) res_de2<- results(ddsHTSeq_de2, contrast=c("genotype", "M6","M9"), na.rm=TRUE) summary(res_de2) resultsNames(ddsHTSeq_de2) #[1] "Intercept" "genotype_M9_vs_M6" #remove NAs res_de2<- res_de2[!is.na(res_de2$padj),] #filter transcripts with log2foldchange >=1 & adjusted p value <0.05 resFiltered2<- res_de2[abs(res_de2$log2FoldChange)>=1 & (res_de2$padj < 0.05), na.rm=TRUE ] dim(resFiltered2) #[1] 5283 6 summary(resFiltered2, alpha=0.05) #up:2228, down:3055 #create an output folder with results #write.csv(as.data.frame(resFiltered2), file="Output/1_DETs_byGenotype.csv") write.table(DETsbyG, file="Output/1_DETs_byGenotype.txt", row.names = FALSE, col.names = FALSE) mcols(res_de2)$description #genotype M6 vs M9 ``` Yes, there are 5,283 DETs between genotypes M6 and M9; 2,228 were upregulated (42%) and 3,055 were downregulated (58%). ```{r DETs_G_bin, include=FALSE, echo=FALSE} #bin DETs in 4 categories # foldchange <2 resFiltered2.2<- res_de2[abs(res_de2$log2FoldChange)>=1 & abs(res_de2$log2FoldChange)<=2 & (res_de2$padj < 0.05), na.rm=TRUE ] dim(resFiltered2.2) #[1] 1964 6 summary(resFiltered2.2, alpha=0.05) #up:743, down:1221 write.csv(as.data.frame(resFiltered2.2), file="Output/1_DETs_byGenotypeFC2.csv") # foldchange 2-4 resFiltered2.4<- res_de2[abs(res_de2$log2FoldChange)>=2 & abs(res_de2$log2FoldChange)<=4 & (res_de2$padj < 0.05), na.rm=TRUE ] dim(resFiltered2.4) #[1] 1486 6 summary(resFiltered2.4, alpha=0.05) #up:630, down:856 write.csv(as.data.frame(resFiltered2.4), file="Output/1_DETs_byGenotypeFC4.csv") # foldchange 4-6 resFiltered2.6<- res_de2[abs(res_de2$log2FoldChange)>=4 & abs(res_de2$log2FoldChange)<=6 & (res_de2$padj < 0.05), na.rm=TRUE ] dim(resFiltered2.6) #[1] 927 6 summary(resFiltered2.6, alpha=0.05) #up:410, down:517 write.csv(as.data.frame(resFiltered2.6), file="Output/1_DETs_byGenotypeFC6.csv") # foldchange >6 resFiltered2.7<- res_de2[abs(res_de2$log2FoldChange)>=6 & (res_de2$padj < 0.05), na.rm=TRUE ] dim(resFiltered2.7) #[1] 906 6 summary(resFiltered2.7, alpha=0.05) #up:445, down:461 write.csv(as.data.frame(resFiltered2.7), file="Output/1_DETs_byGenotypeFC7.csv") ``` DETS were binned in 4 categories of foldchanges to estimate the strength of expression differences. #### 2.3 Are there differentially expressed transcripts between environment within each genotype? ```{r, DiffExpr_E_M6, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} ddsHTSeq_de6<- DESeq(ddsHTSeq6) res_de6<- results(ddsHTSeq_de6, contrast= c("environment","fish", "control"), na.rm=TRUE) resultsNames(ddsHTSeq_de6) #[1] "Intercept" "environment_fish_vs_control" summary(res_de6, alpha=0.05) #remove NAs res_de6<- res_de6[!is.na(res_de6$padj),] #filter transcripts with log2foldchange >=1 & adjusted p value <0.05 resFiltered6<- res_de6[abs(res_de6$log2FoldChange)>=1 & (res_de6$padj < 0.05), na.rm=TRUE ] dim(resFiltered6) #[1] 30 6 summary(resFiltered6, alpha=0.05) #up:3, down:27 write.csv(as.data.frame(resFiltered6), file="Output/1_DETs_M6.csv") mcols(res_de6)$description #environment fish vs control ``` There are 30 DETs between environment within genotype M6. When exposed to fish 3 were upregulated (10%), while 27 were downregulated (90%). ```{r DETs_M6_bin, include=FALSE, echo=FALSE} #bin DETs in 4 categories # foldchange <2 resFiltered6.2<- res_de6[abs(res_de6$log2FoldChange)>=1 & abs(res_de6$log2FoldChange)<=2 & (res_de6$padj < 0.05), na.rm=TRUE ] dim(resFiltered6.2) #[1] 11 6 summary(resFiltered6.2, alpha=0.05) #up:3, down:8 write.csv(as.data.frame(resFiltered6.2), file="Output/1_DETs_M6FC2.csv") # foldchange 2-4 resFiltered6.4<- res_de6[abs(res_de6$log2FoldChange)>=2 & abs(res_de6$log2FoldChange)<=4 & (res_de6$padj < 0.05), na.rm=TRUE ] dim(resFiltered6.4) #[1] 11 6 summary(resFiltered6.4, alpha=0.05) #up:0, down:11 write.csv(as.data.frame(resFiltered6.4), file="Output/1_DETs_M6FC4.csv") # foldchange 4-6 resFiltered6.6<- res_de6[abs(res_de6$log2FoldChange)>=4 & abs(res_de6$log2FoldChange)<=6 & (res_de6$padj < 0.05), na.rm=TRUE ] dim(resFiltered6.6) #[1] 6 6 summary(resFiltered6.6, alpha=0.05) #up:0, down:6 write.csv(as.data.frame(resFiltered6.6), file="Output/1_DETs_M6FC6.csv") # foldchange >6 resFiltered6.7<- res_de6[abs(res_de6$log2FoldChange)>=6 & (res_de6$padj < 0.05), na.rm=TRUE ] dim(resFiltered6.7) #[1] 2 6 summary(resFiltered6.7, alpha=0.05) #up:0, down:2 write.csv(as.data.frame(resFiltered6.7), file="Output/1_DETs_M6FC7.csv") ``` DETs were binned in 4 categories of foldchanges to estimate the strength of expression differences. ```{r, DiffExpr_E_M9, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} ddsHTSeq_de9<- DESeq(ddsHTSeq9) summary(ddsHTSeq_de9) res_de9<- results(ddsHTSeq_de9, contrast= c("environment","fish", "control"), na.rm=TRUE) resultsNames(ddsHTSeq_de9) #[1] "Intercept" "environment_fish_vs_control" summary(res_de9) #remove NAs res_de9<- res_de9[!is.na(res_de9$padj),] #filter transcripts with log2foldchange >=1 & adjusted p value <0.05 resFiltered9<- res_de9[abs(res_de9$log2FoldChange)>=1 & (res_de9$padj < 0.05), na.rm=TRUE ] dim(resFiltered9) #[1] 57 6 summary(resFiltered9, alpha=0.05) #up:21, down:36 write.csv(as.data.frame(resFiltered9), file="Output/1_DETs_M9.csv") ``` There are 57 DETs between environment within genotype M9. 21 were upregulated (37%), while 36 were downregulated (63%). ```{r DETs_M9_bin, include=FALSE, echo=FALSE} #bin DETs in 4 categories # foldchange <2 resFiltered9.2<- res_de9[abs(res_de9$log2FoldChange)>=1 & abs(res_de9$log2FoldChange)<=2 & (res_de9$padj < 0.05), na.rm=TRUE ] dim(resFiltered9.2) #[1] 24 6 summary(resFiltered9.2, alpha=0.05) #up:16, down:8 write.csv(as.data.frame(resFiltered9.2), file="Output/1_DETs_M9FC2.csv") # foldchange 2-4 resFiltered9.4<- res_de9[abs(res_de9$log2FoldChange)>=2 & abs(res_de9$log2FoldChange)<=4 & (res_de9$padj < 0.05), na.rm=TRUE ] dim(resFiltered9.4) #[1] 27 6 summary(resFiltered9.4, alpha=0.05) #up:5, down:22 write.csv(as.data.frame(resFiltered9.4), file="Output/1_DETs_M9FC4.csv") # foldchange 4-6 resFiltered9.6<- res_de9[abs(res_de9$log2FoldChange)>=4 & abs(res_de9$log2FoldChange)<=6 & (res_de9$padj < 0.05), na.rm=TRUE ] dim(resFiltered9.6) #[1] 5 6 summary(resFiltered9.6, alpha=0.05) #up:0, down:5 write.csv(as.data.frame(resFiltered9.6), file="Output/1_DETs_M9FC6.csv") # foldchange >6 resFiltered9.7<- res_de9[abs(res_de9$log2FoldChange)>=6 & (res_de9$padj < 0.05), na.rm=TRUE ] dim(resFiltered9.7) #[1] 1 6 summary(resFiltered9.7, alpha=0.05) #up:0, down:1 write.csv(as.data.frame(resFiltered9.7), file="Output/1_DETs_M9FC7.csv") ``` DETs were binned in 4 categories of foldchanges to estimate the strength of expression differences. #### 2.4 Is there a Genotype x Environment (GxE) interaction effect? ```{r, DiffExpr_GxE_a, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} ddsHTSeq_de3<- DESeq(ddsHTSeq3) resultsNames(ddsHTSeq_de3) #"genotype_M9_vs_M6" ,"environment_fish_vs_control", "genotypeM9.environmentfish" # design ~ genotype + environment + genotype:environment #reference level: alphabetic => here M6 and control #could be set with the function relevel() #a. the environment effect for the 1st genotype: M6 (the main effect) res_M6<- results(ddsHTSeq_de3, contrast=c("environment", "fish", "control"), na.rm=TRUE) summary(res_M6, alpha=0.05) res_M6<- res_M6[!is.na(res_M6$padj),] resFilteredM6<- res_M6[abs(res_M6$log2FoldChange)>=1 & (res_M6$padj < 0.05), ] dim(resFilteredM6) #[1] 4 6 summary(resFilteredM6, alpha=0.05)#up: 1, 25% , down: 3, 75% write.csv(as.data.frame(resFilteredM6), file="Output/1_DETs_E_M6.csv") ``` ```{r DiffExpr_GxE_b1, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #b1.environment effect for the 2nd genotype: M9 (by definition: main effect + interaction term (environment effect in 2nd genotype compared to 1st genotype)) res_M9<- results(ddsHTSeq_de3,list(c("environment_fish_vs_control", "genotypeM9.environmentfish")), na.rm=TRUE) summary(res_M9, alpha=0.05) res_M9<- res_M9[!is.na(res_M9$padj),] resFilteredM9<- res_M9[abs(res_M9$log2FoldChange)>=1 & (res_M9$padj < 0.05), ] dim(resFilteredM9) #[1] 68 6 summary(resFilteredM9, alpha=0.05)# up: 29, 43%; down= 39, 57% write.csv(as.data.frame(resFilteredM9), file="Output/1_DETs_E_M9.csv") ``` ```{r DiffExpr_GxE_b2, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #b2 difference between genotype M6 and M9 res_M6vsM9<- results(ddsHTSeq_de3,contrast=c("genotype", "M6", "M9"), na.rm=TRUE) summary(res_M6vsM9, alpha=0.05) res_M6vsM9<- res_M6vsM9[!is.na(res_M6vsM9$padj),] resFilteredM6vsM9<- res_M6vsM9[abs(res_M6vsM9$log2FoldChange)>=1 & (res_M6vsM9$padj < 0.05), ] dim(resFilteredM6vsM9) #[1] 4687 6 summary(resFilteredM6vsM9, alpha=0.05)# up: 1990, 42%; down= 2697, 58% write.csv(as.data.frame(resFilteredM6vsM9), file="Output/1_DETs_M6vsM9.csv") ``` ```{r DiffExpr_GxE_b3, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #b3 with environment, what is the difference between the genotypes? res_M6vsM9_E<- results(ddsHTSeq_de3, list(c("genotype_M9_vs_M6", "genotypeM9.environmentfish")), na.rm=TRUE) summary(res_M6vsM9_E, alpha=0.05) res_M6vsM9_E<- res_M6vsM9_E[!is.na(res_M6vsM9_E$padj),] resFilteredM6vsM9_E<- res_M6vsM9_E[abs(res_M6vsM9_E$log2FoldChange)>=1 & (res_M6vsM9_E$padj < 0.05), ] dim(resFilteredM6vsM9_E) #[1] 3820 6 summary(resFilteredM6vsM9_E, alpha=0.05)# up: 2016, 53%; down= 1804, 47% write.csv(as.data.frame(resFilteredM6vsM9_E), file="Output/1_DETs_M6vsM9_E.csv") ``` ```{r DiffExpr_GxE_c, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE} #c.interaction term: is the environment effect different across genotypes? #different responses in genotypes res_GxE<- results(ddsHTSeq_de3, name="genotypeM9.environmentfish", na.rm=TRUE) summary(res_GxE, alpha=0.05) res_GxE<- res_GxE[!is.na(res_GxE$padj),] resFilteredGxE<- res_GxE[abs(res_GxE$log2FoldChange)>=1 & (res_GxE$padj < 0.05), ] dim(resFilteredGxE) #[1] 22 6 summary(resFilteredGxE, alpha=0.05)# up: 7, 32%; down= 15, 68% write.csv(as.data.frame(resFilteredGxE), file="Output/1_DETs_byGxE.csv") ``` design ~ genotype + environment + genotype:environment a. environemnt effect for genotype M6 There are 4 DETs for M6 between environment. When exposed to fish 1 transcript was upregulated and 3 downregulated. [environment fish vs control] b1. the environment effect for genotype M9 There are 68 differentially expressed transcripts. When exposed to fish 29 (43%) transcripts were upregulated and 39 (57%) downregulated. [environment_fish_vs_control+genotypeM9.environmentfish effect] b2. differences between genotypes (control environment) There are 4687 DETs between genotypes. Genotype M6 has 1990 upregulated transcripts (42%) and 2697 down regulated transcripts (58%) compared to M9. [genotype M6 vs M9] b3. difference between the genotypes (fish environment) There are 3820 DETs in respect to environment. Genotype M6 has 2016 upregulated transcripts (53%) and 1804 down regulated transcripts (47%) compared to M9. [genotype_M9_vs_M6+genotypeM9.environmentfish effect] c. GxE interaction term: does the environment effect differ across genotypes? There are 22 differentially expressed transcripts. Fish exposed genotype M9 had 7 upregulated transcripts (32%) and 15 downregulated (68%). [genotypeM9.environmentfish] ```{r, Bins.GxE_a, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE } #bin DETs in 4 categories # foldchange <2 resFilteredM6.2<- res_M6[abs(res_M6$log2FoldChange)>=1 & abs(res_M6$log2FoldChange)<=2 & (res_M6$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6.2) #[1] 1 6 summary(resFilteredM6.2, alpha=0.05) #up:0, down:1 write.csv(as.data.frame(resFilteredM6.2), file="Output/1_DETs_M6_GxE_FC2.csv") # foldchange 2-4 resFilteredM6.4<- res_M6[abs(res_M6$log2FoldChange)>=2 & abs(res_M6$log2FoldChange)<=4 & (res_M6$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6.4) #[1] 2 6 summary(resFilteredM6.4, alpha=0.05) #up:0, down:2 write.csv(as.data.frame(resFilteredM6.4), file="Output/1_DETs_M6_GxE_FC4.csv") # foldchange 4-6 resFilteredM6.6<- res_M6[abs(res_M6$log2FoldChange)>=4 & abs(res_M6$log2FoldChange)<=6 & (res_M6$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6.6) #[1] 0 6 summary(resFilteredM6.6, alpha=0.05) #up:0, down:0 #write.csv(as.data.frame(resFilteredM6.6), file="Output/1_DETs_M6_GxE_FC6.csv") # foldchange >6 resFilteredM6.7<- res_M6[abs(res_M6$log2FoldChange)>=6 & (res_M6$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6.7) #[1] 1 6 summary(resFilteredM6.7, alpha=0.05) #up:1, down:0 write.csv(as.data.frame(resFilteredM6.7), file="Output/1_DETs_M6_GxE_FC7.csv") ``` ```{r, Bins.GxE_b1, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE } #bin DETs in 4 categories # foldchange <2 resFilteredM9.2<- res_M9[abs(res_M9$log2FoldChange)>=1 & abs(res_M9$log2FoldChange)<=2 & (res_M9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM9.2) #[1] 45 6 summary(resFilteredM9.2, alpha=0.05) #up:22, down:23 write.csv(as.data.frame(resFilteredM9.2), file="Output/1_DETs_M9_GxE_FC2.csv") # foldchange 2-4 resFilteredM9.4<- res_M9[abs(res_M9$log2FoldChange)>=2 & abs(res_M9$log2FoldChange)<=4 & (res_M9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM9.4) #[1] 16 6 summary(resFilteredM9.4, alpha=0.05) #up:5, down:11 write.csv(as.data.frame(resFilteredM9.4), file="Output/1_DETs_M9_GxE_FC4.csv") # foldchange 4-6 resFilteredM9.6<- res_M9[abs(res_M9$log2FoldChange)>=4 & abs(res_M9$log2FoldChange)<=6 & (res_M9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM9.6) #[1] 6 6 summary(resFilteredM9.6, alpha=0.05) #up:1, down:5 write.csv(as.data.frame(resFilteredM9.6), file="Output/1_DETs_M9_GxE_FC6.csv") # foldchange >6 resFilteredM9.7<- res_M9[abs(res_M9$log2FoldChange)>=6 & (res_M9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM9.7) #[1] 1 6 summary(resFilteredM9.7, alpha=0.05) #up:1, down:0 write.csv(as.data.frame(resFilteredM9.7), file="Output/1_DETs_M9_GxE_FC7.csv") ``` ```{r, Bins.GxE_b2, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE } #bin DETs in 4 categories # foldchange <2 resFilteredM6vsM9.2<- res_M6vsM9[abs(res_M6vsM9$log2FoldChange)>=1 & abs(res_M6vsM9$log2FoldChange)<=2 & (res_M6vsM9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9.2) #[1] 1624 6 summary(resFilteredM6vsM9.2, alpha=0.05) #up:633, down:991 write.csv(as.data.frame(resFilteredM6vsM9.2), file="Output/1_DETs_M6vsM9_FC2.csv") # foldchange 2-4 resFilteredM6vsM9.4<- res_M6vsM9[abs(res_M6vsM9$log2FoldChange)>=2 & abs(res_M6vsM9$log2FoldChange)<=4 & (res_M6vsM9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9.4) #[1] 1204 6 summary(resFilteredM6vsM9.4, alpha=0.05) #up:494, down:710 write.csv(as.data.frame(resFilteredM6vsM9.4), file="Output/1_DETs_M6vsM9_FC4.csv") # foldchange 4-6 resFilteredM6vsM9.6<- res_M6vsM9[abs(res_M6vsM9$log2FoldChange)>=4 & abs(res_M6vsM9$log2FoldChange)<=6 & (res_M6vsM9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9.6) #[1] 899 6 summary(resFilteredM6vsM9.6, alpha=0.05) #up:405, down:494 write.csv(as.data.frame(resFilteredM6vsM9.6), file="Output/1_DETs_M6vsM9_FC6.csv") # foldchange >6 resFilteredM6vsM9.7<- res_M6vsM9[abs(res_M6vsM9$log2FoldChange)>=6 & (res_M6vsM9$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9.7) #[1] 960 6 summary(resFilteredM6vsM9.7, alpha=0.05) #up:458, down:502 write.csv(as.data.frame(resFilteredM6vsM9.7), file="Output/1_DETs_M6vsM9_FC7.csv") ``` ```{r, Bins.GxE_b3, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE } #bin DETs in 4 categories # foldchange <2 resFilteredM6vsM9_E.2<- res_M6vsM9_E[abs(res_M6vsM9_E$log2FoldChange)>=1 & abs(res_M6vsM9_E$log2FoldChange)<=2 & (res_M6vsM9_E$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9_E.2) #[1] 1114 6 summary(resFilteredM6vsM9_E.2, alpha=0.05) #up:611, down:503 write.csv(as.data.frame(resFilteredM6vsM9_E.2), file="Output/1_DETs_M6vsM9_E_FC2.csv") # foldchange 2-4 resFilteredM6vsM9_E.4<- res_M6vsM9_E[abs(res_M6vsM9_E$log2FoldChange)>=2 & abs(res_M6vsM9_E$log2FoldChange)<=4 & (res_M6vsM9_E$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9_E.4) #[1] 915 6 summary(resFilteredM6vsM9_E.4, alpha=0.05) #up:478, down:437 write.csv(as.data.frame(resFilteredM6vsM9_E.4), file="Output/1_DETs_M6vsM9_E_FC4.csv") # foldchange 4-6 resFilteredM6vsM9_E.6<- res_M6vsM9_E[abs(res_M6vsM9_E$log2FoldChange)>=4 & abs(res_M6vsM9_E$log2FoldChange)<=6 & (res_M6vsM9_E$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9_E.6) #[1] 826 6 summary(resFilteredM6vsM9_E.6, alpha=0.05) #up:428, down:398 write.csv(as.data.frame(resFilteredM6vsM9_E.6), file="Output/1_DETs_M6vsM9_E_FC6.csv") # foldchange >6 resFilteredM6vsM9_E.7<- res_M6vsM9_E[abs(res_M6vsM9_E$log2FoldChange)>=6 & (res_M6vsM9_E$padj < 0.05), na.rm=TRUE ] dim(resFilteredM6vsM9_E.7) #[1] 965 6 summary(resFilteredM6vsM9_E.7, alpha=0.05) #up:499, down:466 write.csv(as.data.frame(resFilteredM6vsM9_E.7), file="Output/1_DETs_M6vsM9_E_FC7.csv") ``` ```{r, Bins.GxE_c, warning=FALSE, message=FALSE, echo=FALSE, include=FALSE } #bin DETs in 4 categories # foldchange <2 resFilteredGxE.2<- res_GxE[abs(res_GxE$log2FoldChange)>=1 & abs(res_GxE$log2FoldChange)<=2 & (res_GxE$padj < 0.05), na.rm=TRUE ] dim(resFilteredGxE.2) #[1] 11 6 summary(resFilteredGxE.2, alpha=0.05) #up:3, down:8 write.csv(as.data.frame(resFilteredGxE.2), file="Output/1_DETs_GxE_FC2.csv") # foldchange 2-4 resFilteredGxE.4<- res_GxE[abs(res_GxE$log2FoldChange)>=2 & abs(res_GxE$log2FoldChange)<=4 & (res_GxE$padj < 0.05), na.rm=TRUE ] dim(resFilteredGxE.4) #[1] 6 6 summary(resFilteredGxE.4, alpha=0.05) #up:4, down:2 write.csv(as.data.frame(resFilteredGxE.4), file="Output/1_DETs_GxE_FC4.csv") # foldchange 4-6 resFilteredGxE.6<- res_GxE[abs(res_GxE$log2FoldChange)>=4 & abs(res_GxE$log2FoldChange)<=6 & (res_GxE$padj < 0.05), na.rm=TRUE ] dim(resFilteredGxE.6) #[1] 4 6 summary(resFilteredGxE.6, alpha=0.05) #up:0, down:4 write.csv(as.data.frame(resFilteredGxE.6), file="Output/1_DETs_GxE_FC6.csv") # foldchange >6 resFilteredGxE.7<-res_GxE[abs(res_GxE$log2FoldChange)>=6 & (res_GxE$padj < 0.05), na.rm=TRUE ] dim(resFilteredGxE.7) #[1] 1 6 summary(resFilteredGxE.7, alpha=0.05) #up:0, down:1 write.csv(as.data.frame(resFilteredGxE.7), file="Output/1_DETs_GxE_FC7.csv") ``` ### 3. Exploring and Exporting Results #### 3.1 MA-plots MA-plots show the log2 fold changes attributable to a given variable over the mean of normalized counts. Points will be colored red if the adjusted p-value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down. They allow us to see how the shrinkage of fold changes works for transcripts with low counts. ```{r, MAplot_E, warning=FALSE, message=FALSE, fig.cap="MAplot showing log2 fold change over normalized mean counts by 'environment'", include=FALSE} #png('Figures/MAplot_E.png') plotMA(res_de1, main="log2 fold change by 'environment' ", ylim=c(-5,5)) #dev.off() ############################################################################ #to interactively detect the row number of individual transcripts by clicking on the plot. #One can then recover the transcript identifiers by saving the resulting indices: #idx<- identify(res_de$baseMean, res_de$log2FoldChange) #rownames(res_de)[idx] ``` ```{r, MAplot_G, warning=FALSE, message=FALSE, fig.cap="MAplot showing log2 fold change over normalized mean counts by 'genotype' ", include=FALSE} #png('Figures/MAplot_G.png') plotMA(res_de2, main="log2 fold change by 'genotype' ", ylim=c(-7,7)) #dev.off() ``` ```{r, MAplot_M6, include=FALSE, warning=FALSE, message=FALSE, fig.cap="MAplot showing log2 fold change over normalized mean counts by 'genotype M6' "} #png('Figures/MAplot_M6.png') plotMA(res_de6, main="log2 fold change by 'environment' ", ylim=c(-6,6)) #dev.off() ``` ```{r, MAplot_M9, warning=FALSE, message=FALSE, fig.cap="MAplot showing log2 fold change over normalized mean counts by 'genotype M9' ", include=FALSE} #png('Figures/MAplot_M9.png') plotMA(res_de9, main="log2 fold change by 'environment' ", ylim=c(-6,6)) #dev.off() ``` #### 3.2 Plot Read Counts ##### 3.2.1 ... per transcript ```{r, plotCounts_single.transcript, warning=FALSE, message=FALSE, fig.cap="Normalized counts of a single transcript", include=FALSE} #examine counts for a single transcript #png('Figures/plotCounts_single_transcript.png') plotCounts(ddsHTSeq_de1, gene=which.min(res_de1$padj), intgroup="environment") #dev.off() ``` ```{r, setup.ggplot, echo= FALSE, include=FALSE} library("ggplot2") ``` ##### 3.2.2 ... per environment ```{r, plotCounts_E_E, warning=FALSE, message=FALSE, fig.cap="Normalized counts for all transcripts by environment", include=FALSE} d_E <- plotCounts(ddsHTSeq_de1, gene=which.min(res_de1$padj), intgroup="environment", returnData=TRUE) ggplot(d_E, aes(x=environment, y=count)) + theme_bw() + geom_point(position=position_jitter(w=0.1,h=0)) + geom_text(aes(label=SampleSheet$genotype),hjust=0, vjust=0)+ scale_y_log10(breaks=c(25,100,400)) #ggsave("Figures/plotCounts_E_E.png") ``` ##### 3.2.3 ... per genotype ```{r, plotCounts_E_G, warning=FALSE, message=FALSE, fig.cap="Normalized counts for all transcripts by genotype", include=FALSE} d_G <- plotCounts(ddsHTSeq_de1, gene=which.min(res_de1$padj), intgroup="genotype", returnData=TRUE) ggplot(d_G, aes(x=genotype, y=count)) + geom_point(position=position_jitter(w=0.1,h=0)) + theme_bw() + geom_text(aes(label=SampleSheet$environment),hjust=0, vjust=0) + scale_y_log10(breaks=c(25,100,400)) ``` ##### 3.2.4 ... per replicates ```{r, plotCounts_E_rep, warning=FALSE, message=FALSE, , fig.cap="Normalized counts for all transcripts by replicates", include=FALSE} d_rep <- plotCounts(ddsHTSeq_de1, gene=which.min(res_de1$padj), intgroup="replicate", returnData=TRUE) ggplot(d_rep, aes(x=replicate, y=count)) + geom_point(position=position_jitter(w=0.1,h=0)) + theme_bw() + geom_text(aes(label=SampleSheet$genotype),hjust=0, vjust=0) + scale_y_log10(breaks=c(25,100,400)) ``` #### 3.2.5 .... per environment per genotype ```{r, plotCounts_E_M6, warning=FALSE, message=FALSE, fig.cap="Normalized counts for all transcripts by environment", include=FALSE} d_E6 <- plotCounts(ddsHTSeq_de6, gene=which.min(res_de6$padj), intgroup="environment", returnData=TRUE) ggplot(d_E6, aes(x=environment, y=count)) + theme_bw() + geom_point(position=position_jitter(w=0.1,h=0)) + geom_text(aes(label=SampleSheetM6$genotype),hjust=0, vjust=0)+ scale_y_log10(breaks=c(25,100,400)) ``` ```{r, plotCounts_E_M9, warning=FALSE, message=FALSE, fig.cap="Normalized counts for all transcripts by environment", include=FALSE} d_E9 <- plotCounts(ddsHTSeq_de9, gene=which.min(res_de9$padj), intgroup="environment", returnData=TRUE) ggplot(d_E9, aes(x=environment, y=count)) + theme_bw() + geom_point(position=position_jitter(w=0.1,h=0)) + geom_text(aes(label=SampleSheetM9$genotype),hjust=0, vjust=0)+ scale_y_log10(breaks=c(25,100,400)) ``` #### 3.3 Plot p-value distribution of DET's by genotype How to interpret a p-value histogram see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/ When you have thousands of p-values create a histogram of p-values first. Do this before you perform multiple hypothesis test correction, false discovery rate control, or any other means of interpreting your many p-values. This graph gives you an immediate sense of how your test behaved across all your hypotheses, and immediately diagnoses some potential problems. Here: Scenario A: Anti-conservative p-values (Hooray!) => set of well behaved p-values, uniform distribution. That peak close to 0 is where your alternative hypotheses live- along with some potential false positives. Notice that there are plenty of null hypotheses that appear at low p-values, so you can't just say 'call all p-values less than .05 significant' without thinking, or you'll get lots of false discoveries. Notice also that some alternative hypotheses end up with high p-values: those are the hypotheses you won't be able to detect with your test (false negatives). The job of any multiple hypothesis test correction is to figure out where best to place the cutoff for significance. Now, just how many of your hypotheses are alternative rather than null? You can get a sense of this from a histogram by looking at how tall the peak on the left is: the taller the peak, the more p-values are close to 0 and therefore significant. Similarly, the 'depth' of the histogram on the right side shows how many of your p-values are null. (From RNA-seq analysis course) ```{r, histogram.pvalues, warning=FALSE, echo=FALSE, message=FALSE, include=FALSE} hist(res_de2$pvalue[res_de2$baseMean > 1], breaks = 0:20/20, col = "grey50", border = "white") ``` #### 3.4 Customized PCA Plot ##### 3.4.1 Transform count data ```{r, transformation, warning=FALSE, message=FALSE, include=FALSE} #input: all, design ~ environment rld1 <- tryCatch(rlog(ddsHTSeq_de1), error = function(e) { rlog(ddsHTSeq_de1, fitType = 'mean') }) #input: all, design ~ genotype rld2 <- tryCatch(rlog(ddsHTSeq_de2), error = function(e) { rlog(ddsHTSeq_de2, fitType = 'mean') }) #input: M6, design ~ environment rld6 <- tryCatch(rlog(ddsHTSeq_de6), error = function(e) { rlog(ddsHTSeq_de6, fitType = 'mean') }) #input: M9, design ~ environment rld9 <- tryCatch(rlog(ddsHTSeq_de9), error = function(e) { rlog(ddsHTSeq_de9, fitType = 'mean') }) ``` ##### 3.4.2 Plot PCA ```{r, plotPCA_E, warning=FALSE, message=FALSE, include=FALSE} genotype<- SampleSheet$genotype png('Figures/1_plotPCA_E.png', res=100) plotPCA(rld1, intgroup = "environment") + theme_bw() + scale_colour_manual(name="", values = c("control"="orange", "fish"="blue")) + geom_text(aes(label= genotype),hjust=0, vjust=1 ) + geom_point(aes(shape= genotype), size=4) scale_shape_manual(values=c("M6"=21, "M9"=24)) dev.off() ``` 93% of the variance of read counts is explained by PC1 (83%) and PC2 (10%). There is no clear clustering of read counts per environment. Instead there is clear clustering per genotype. ```{r, plotPCA_M6, warning=FALSE, message=FALSE, fig.cap="PCA plot for genotype M6", include=FALSE} png('Figures/1_plotPCA_M6.png') plotPCA(rld6, intgroup = "environment") + theme_bw() + scale_colour_manual(name="", values = c("control"="grey", "fish"="black")) + #geom_text(aes(label= genotype),hjust=0, vjust=1 ) + #geom_point(aes(shape= genotype), size=4) scale_shape_manual(values=c("control"=21, "fish"=24)) dev.off() ``` 74% of the variance of read counts in genotype M6 is explained by PC1 (51%) and PC2 (23%). There is no clear clustering of read counts per environment. ```{r, plotPCA_M9, warning=FALSE, message=FALSE, fig.cap="PCA plot for genotype M9", include=FALSE} png('Figures/1_plotPCA_M9.png') plotPCA(rld9, intgroup = "environment") + theme_bw() + scale_colour_manual(name="", values = c("control"="grey", "fish"="black")) + #geom_text(aes(label= genotype),hjust=0, vjust=1 ) + #geom_point(aes(shape= genotype), size=4) scale_shape_manual(values=c("control"=21, "fish"=24)) dev.off() ``` 83% of the variance of read counts in genotype M9 is explained by PC1 (65%) and PC2 (18%). There is no clear clustering of read counts per environment. ### 4. Data Transformation and Visualization We transformed the data by using the function varianceStabilizingTransformation(), which stabilized the variance during transformation including the option blind =FALSE. This option causes that differences between genotypes and environments will not contribute to the expected variance mean trend of the experiment. This transformed dataset is used for subsequent gene co-expression network analysis (WGCNA). ```{r, vst, warning=FALSE, message=FALSE, include=FALSE} vst1 <- varianceStabilizingTransformation(ddsHTSeq1) vst<- assay(vst1) head(vst) write.csv(as.data.frame(vst), file="Output/1_Sample_counts_vst.csv") dim(vst) #23982 12 ``` ##### 4.1 Effects of Transformation on Variance ```{r, effect1, warning=FALSE, message=FALSE, fig.show='hide', include=FALSE} library("vsn") meanSdPlot(assay(vst1)) ``` ##### 4.2 Data quality assessment by sample clustering and visualization Our purpose is the detection of differential expressed transcripts. In Particular we are looking for samples whose experimental environment suffered from an anormality that renders the data points obtained from these particular samples detrimental to our purpose. Heatmap: Sample To Sample Distance ```{r heatmap_E, warning=FALSE, message=FALSE, fig.cap="Sample to Sample Distance of DET's", include=FALSE} sampleDists1 <- dist(t(assay(vst1))) library("RColorBrewer") library("pheatmap") sampleDistMatrix1 <- as.matrix(sampleDists1) rownames(sampleDistMatrix1) <- paste(vst1$environment, vst1$genotype, sep="-") colnames(sampleDistMatrix1) <- NULL colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) #png('Figures/heatmap_sample_to_sample_distance.png', res=600) pheatmap(sampleDistMatrix1, clustering_distance_rows=sampleDists1, clustering_distance_cols=sampleDists1, col=colors) #dev.off() ``` #### 5. Extract all enviroment-related DETs ```{r, total.DETs_E, include=FALSE} library(dplyr) #One-Factor analysis #M6 DETs_M6<- read.csv(file="Output/1_DETs_M6.csv") nrow(DETs_M6) #30 #DETs_M6 <- as.character(DETs_M6$X) DETs_M6$gene.set = "M6" #View(DETs_M6) DETs_M6_ordered <-DETs_M6[order(DETs_M6$log2FoldChange),] #View(DETs_M6_ordered) DETs_M6_ordered2 <- as.character(DETs_M6_ordered$X) #M9 DETs_M9<- read.csv(file="Output/1_DETs_M9.csv") nrow(DETs_M9) #57 DETs_M9$gene.set = "M9" DETs_M9_ordered <-DETs_M9[order(DETs_M9$log2FoldChange),] DETs_M9_ordered2 <- as.character(DETs_M9_ordered$X) #Two-Factor analysis #M6 DETs_E_M6<- read.csv(file="Output/1_DETs_E_M6.csv") nrow(DETs_E_M6) #4 DETs_E_M6$gene.set = "M6.2" DETs_E_M6_ordered <-DETs_E_M6[order(DETs_E_M6$log2FoldChange),] DETs_E_M6_ordered2 <- as.character(DETs_E_M6_ordered$X) #M9 DETs_E_M9<- read.csv(file="Output/1_DETs_E_M9.csv") nrow(DETs_E_M9) #68 DETs_E_M9$gene.set = "M9.2" DETs_E_M9_ordered <-DETs_E_M9[order(DETs_E_M9$log2FoldChange),] DETs_E_M9_ordered2 <- as.character(DETs_E_M9_ordered$X) # exclude overlap of DETs in one- and two-factor analysis #M6 M6<- union(DETs_M6_ordered2, DETs_E_M6_ordered2) length(M6) #32 unique DETs write.table(M6, file="Output/1_uniqueDETs_M6.txt", row.names = FALSE, col.names = FALSE) #M9 M9<- union(DETs_M9_ordered2, DETs_E_M9_ordered2) length(M9) #93 unique DETs write.table(M9, file="Output/1_uniqueDETs_M9.txt", row.names = FALSE, col.names = FALSE) #all DETs_E<- c(M6, M9) length(DETs_E) #125 write.csv(DETs_E, file="Output/1_DETs_E.csv") DETs_E2<- as.data.frame(DETs_E) # create transcript list with all information of DET #all DETs between genotypes all<- rbind(DETs_M6_ordered, DETs_E_M6_ordered, DETs_M9_ordered, DETs_E_M9_ordered) nrow(all) #159 all_ordered <-all[order(all$log2FoldChange),] colnames(all_ordered)[1] <- "transcript" write.csv(all_ordered, file="Output/1_all_DETs_ordered.csv") #exclude duplicates #159 (all) - 125 (unique) =34 to exclude ``` In total we identified 125 DETs related to the exposure to fish kairomones. ```{r, venny} #Use a different program called venny to make a Venn diagram to extract which transcripts are shared between the genotypes and environments. Therefore insert the transcript lists of interest into the onlinetool. #venny #version: 2.1.0 #reference: Oliveros, J.C. (2007-2015) Venny. An interactive tool for comparing lists with Venn's diagrams. http://bioinfogp.cnb.csic.es/tools/venny/index.html ```