#Enrichment_test.R #updated 8.1.2018 #by Kohta Yoshida #Gene lists of test and background group are loaded as tdata and bdata, respectively. #In the files, gene id of stickleback are listed and separated with newlines. tdata<-read.table("test_gene_list",header=F) bdata<-read.table("background_gene_list",header=F) #Tab delimited text data for annotaion of GO terms to all stickleback genes is loaded as pdata. #That contains all GO lists in each row. pdata<-read.table("mart83_GO_compiled.txt",header=T,sep="\t",quote="") ##Function for enrichment test## enrichment_test<-function(bgocolumn,tgocolumn){ hyper_two_tail_midP<-function(tt,bt,bf,nt){ dprob<-dhyper(tt,bt,bf,nt) count<-0 prob<-0 while(count <= nt){ lprob<-dhyper(count,bt,bf,nt) if(lprob < dprob){ prob <- prob+lprob }else{ if(lprob==dprob){ prob<-prob+lprob/2 } } count<-count+1 } return(prob) } bnum<-length(bgocolumn) bgolists<-strsplit(as.character(bgocolumn)," /// ") bgovec<-unlist(bgolists) bcounttable<-table(bgovec) tested_golist<-unique(names(bcounttable)[bcounttable >= 5]) tnum<-length(tgocolumn) tgolists<-strsplit(as.character(tgocolumn)," /// ") tgovec<-unlist(tgolists) tcounttable<-table(tgovec) bt<-c() tt<-c() p_value<-c() for(i in 1:length(tested_golist)){ goname<-tested_golist[i] bt[i]<-bcounttable[names(bcounttable)==goname] if(length(which(names(tcounttable)==goname))>0){ tt[i]<-tcounttable[names(tcounttable)==goname] }else{ tt[i]<-0 } bf<-bnum-bt[i] p_value[i]<-hyper_two_tail_midP(tt[i],bt[i],bf,tnum) } q_value<-p.adjust(p_value,method="BH") odds_in_test<-tt/(tnum-tt) odds_in_others<-(bt-tt)/(bnum-bt+tt) odds_ratio<-odds_in_test/odds_in_others wdata<-data.frame(Go=tested_golist,Count_in_test=tt,Count_in_back=bt,Odds_ratio=odds_ratio,p_value=p_value,q_value=q_value) return(wdata) } ##Indexes of genes of test group and background group## bind<-match(bdata$V1,pdata$Gene_ID) bind<-bind[is.na(bind)==F] tind<-match(tdata$V1,pdata$Gene_ID) tind<-tind[is.na(tind)==F] ##Enrichment test for GO slim and 3 GO domains## # 4 resultant dataframes are produced. bgocolumn_input<-pdata$GO_slim[bind] tgocolumn_input<-pdata$GO_slim[tind] GO_slim_result<-enrichment_test(bgocolumn_input,tgocolumn_input) bgocolumn_input<-pdata$GO_biological_process[bind] tgocolum_inputn<-pdata$GO_biological_process[tind] GO_BP_result<-enrichment_test(bgocolumn_input,tgocolumn_input) bgocolumn_input<-pdata$GO_molecular_function[bind] tgocolumn_input<-pdata$GO_molecular_function[tind] GO_MF_result<-enrichment_test(bgocolumn_input,tgocolumn_input) bgocolumn_input<-pdata$GO_cellular_component[bind] tgoclumn_input<-pdata$GO_cellular_component[tind] GO_CC_result<-enrichment_test(bgocolumn_input,tgocolumn_input) ##Option:adding gene lists## gene_on_go<-function(go,tgocolumn){ fgvec<-grep(go,tgocolumn) genelist<-pdata[tind,][fgvec,1] gl_in_str<-paste(genelist,collapse=",") return(gl_in_str) } #example bgocolumn_input<-pdata$GO_slim[bind] tgoclumn_input<-pdata$GO_slim[tind] focal_go<-GO_slim_result[GO_slim_result$q_value<0.05,1] gl_on_focalgo<-sapply(focal_go,gene_on_go,tgoclumn_input) new.GO_slim_result<-cbind(GO_slim_result[GO_slim_result$q_value<0.05,1:6],gl_on_focalgo)