## Checking Blast2GO results
## A Hipp, 22 April 2013

## read in the datasets that Elisabeth sent
## NOTE(review): the data-source labels are assigned positionally, so they
## rely on the directory holding exactly three files whose alphabetical order
## matches `source.labels` -- confirm against the directory contents.
blast.files <- list.files("./2013-04-09.data/", full.names = TRUE)
source.labels <- c("est.others", "quercus.all", "ref.seq.RNA")
if (length(blast.files) != length(source.labels)) {
  stop(
    "Expected ", length(source.labels),
    " BLAST result files in ./2013-04-09.data/, found ", length(blast.files),
    call. = FALSE
  )
}
oak.blasts <- lapply(blast.files, read.delim, as.is = TRUE)
names(oak.blasts) <- source.labels

## tag every row with the dataset it came from; rep() keeps this valid even
## for a file that was read in with zero rows
for (i in names(oak.blasts)) {
  oak.blasts[[i]]$data.source <- rep(i, nrow(oak.blasts[[i]]))
}

## columns retained when the three tables are stacked into one
concat.fields <- c(
  "query", "subject", "subject.accessions", "evalue",
  "query.start", "query.end", "subject.start", "subject.end",
  "bit.score", "Seq..Description", "Seq..Length", "X.Hits",
  "min..eValue", "mean.Similarity", "X.GOs", "GOs",
  "Enzyme.Codes", "data.source"
)

## stack the per-source tables; unname() keeps rbind() from prefixing the row
## names with the list names, so row names stay 1..n
oak.blasts <- do.call(
  rbind,
  unname(lapply(oak.blasts, function(x) x[concat.fields]))
)

## one summary row per query locus
unique.loci <- sort(unique(oak.blasts$query))
stats.of.interest <- c(
  "number of hits",
  "number of unique descriptions", "descriptions concatenated",
  "number of unique GOs", "GO terms concatenated",
  "QUERCUS", "EST.OTHERS", "REF.SEQ.RNA"
)
## character matrix: the counts end up stored as strings next to the
## concatenated text columns
locus.summary.mat <- matrix(
  NA_character_,
  length(unique.loci), length(stats.of.interest),
  dimnames = list(as.character(unique.loci), stats.of.interest)
)

## group row indices by locus once, instead of rescanning the whole table for
## every locus; split() also ignores rows whose query is NA, which a logical
## `query == i` subset would have turned into all-NA rows
query.labels <- as.character(oak.blasts$query)
rows.by.locus <- split(seq_along(query.labels), query.labels)

## whole-string placeholders for a missing description: "NA", "N/A", or either
## one wrapped in dashes such as "---NA---" (presumably how Blast2GO marks a
## missing description -- confirm). Matching the whole string matters: a
## substring match on "NA" would also discard real descriptions such as
## "DNA polymerase" or "NADH dehydrogenase".
placeholder.pattern <- "^\\s*-*N/?A-*\\s*$"

for (i in as.character(unique.loci)) {
  temp <- oak.blasts[rows.by.locus[[i]], , drop = FALSE]

  ## hits per data source, and in total
  locus.summary.mat[i, "QUERCUS"] <- sum(temp$data.source == "quercus.all")
  locus.summary.mat[i, "EST.OTHERS"] <- sum(temp$data.source == "est.others")
  locus.summary.mat[i, "REF.SEQ.RNA"] <- sum(temp$data.source == "ref.seq.RNA")
  locus.summary.mat[i, "number of hits"] <- nrow(temp)

  ## unique descriptions, minus missing-value placeholders and "0"
  temp.desc <- unique(temp$Seq..Description)
  missing <- is.na(temp.desc) |
    grepl(placeholder.pattern, temp.desc) |
    temp.desc %in% "0"
  temp.desc <- temp.desc[!missing]
  locus.summary.mat[i, "number of unique descriptions"] <- length(temp.desc)
  locus.summary.mat[i, "descriptions concatenated"] <-
    paste(temp.desc, collapse = "||")

  ## unique GO terms: each GOs cell is split on "; ", and the empty string
  ## and "-" are discarded
  GOs <- sort(unique(unlist(strsplit(as.character(temp$GOs), "; "))))
  GOs <- GOs[!GOs %in% c("", "-")]
  locus.summary.mat[i, "number of unique GOs"] <- length(GOs)
  locus.summary.mat[i, "GO terms concatenated"] <- paste(GOs, collapse = "||")
}