# survey_data_metadata_management.R
#
# Summarises the survey questions on data and metadata handling (Q25-Q32)
# and writes the corresponding figures to the "plots/" directory.
#
# Relies on objects and packages that are not created in this file:
#   * `data_clean` - the cleaned survey responses, indexed here by column
#     position.
#   * ggplot(), melt(), ddply(), mapvalues(), brewer.pal() and unit() must
#     already be attached (presumably ggplot2, reshape2, plyr, RColorBrewer
#     and grid -- confirm against the script that loads the libraries).

#### Working with data and metadata ####

#### Q25. Does your lab use online data ####
summary(data_clean[, 90])

#### Q26. Does your lab have a website ####
summary(data_clean[, 91])

# Compare online data use to having a website
compwebdata <- data_clean[, 90:91]
# Keep only respondents with a non-empty answer to both questions; which()
# also discards rows where either comparison is NA.
compwebdata <- compwebdata[which(compwebdata[, 1] != "" &
                                   compwebdata[, 2] != ""), ]

# 2 x 2 cross-tabulation; first letter = column 1 (Q25), second letter =
# column 2 (Q26), n = "No", y = "Yes".
nn <- sum(compwebdata[, 1] == "No" & compwebdata[, 2] == "No", na.rm = TRUE)
ny <- sum(compwebdata[, 1] == "No" & compwebdata[, 2] == "Yes", na.rm = TRUE)
yn <- sum(compwebdata[, 1] == "Yes" & compwebdata[, 2] == "No", na.rm = TRUE)
yy <- sum(compwebdata[, 1] == "Yes" & compwebdata[, 2] == "Yes", na.rm = TRUE)

#### Q28. Data formats used ####
dformats <- data_clean[, 94:100]
names(dformats) <- c("ASCII", "WordProcessing", "Spreadsheets", "RDBS",
                     "Machine", "Semantic", "Spatial")

# Drop respondents who left every format blank. A logical keep-mask is used
# instead of `-which(...)`: when no row is entirely NA, `-which()` evaluates
# to `-integer(0)` and would silently drop *all* rows.
dformats <- dformats[rowSums(!is.na(dformats)) > 0, ]

# Recode every column to a numeric indicator: 1 where a response is present,
# NA where it is missing. This yields the same values as relabelling a
# single-level factor to "1", and also works for character columns.
dformats[] <- lapply(dformats, function(col) {
  as.numeric(ifelse(is.na(col), NA, 1))
})

dformat_colsums <- colSums(dformats[, 1:7], na.rm = TRUE)
# t(t(x)) turns the named vector into a one-column matrix so that the format
# names become row names of the data frame.
dfcolsums <- as.data.frame(t(t(dformat_colsums)))
dfcolsums2 <- data.frame(type = row.names(dfcolsums), count = dfcolsums[, 1])
dfcolsums2$percent <- round(dfcolsums2$count / nrow(dformats) * 100, 1)

plot_datafileformat <- ggplot(dfcolsums2,
                              aes(x = reorder(type, -count), y = count)) +
  geom_bar(stat = "identity", color = "black", fill = "#0DA6F0") +
  geom_text(aes(label = paste0(count, "\n (", round(percent, 0), "%)"),
                y = count),
            size = 2.5, vjust = -.2) +
  xlab("Data file formats used") +
  ylab("Number of research groups") +
  coord_cartesian(ylim = c(0, 62)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1,
                                   family = "Helvetica", size = 11,
                                   color = "black"),
        axis.text.y = element_text(family = "Helvetica", size = 11,
                                   color = "black"),
        plot.margin = unit(c(0.5, 0, 0, 0.5), units = "cm"))
plot_datafileformat

# `plot =` is passed explicitly so the right figure is saved even when the
# script is source()d and the line above does not auto-print.
# The colour model is spelled "cmyk"; the original "cymk" is not one of the
# models documented for postscript().
ggsave(filename = "plots/plot_datafileformat.eps",
       plot = plot_datafileformat,
       width = 2.8, height = 3, bg = "white", colormodel = "cmyk")

# Number of data formats selected per respondent. NA marks "not selected",
# so it has to be ignored here just as in the column totals above.
dformat_rowsums <- rowSums(dformats, na.rm = TRUE)

#### Q29. Metadata formats used ####
mformats <- data_clean[, 102:110]
names(mformats) <- c("Handwritten", "ASCII", "WordProcessing", "Spreadsheets",
                     "RDBS", "EML", "FGDC", "SensorML", "WaterML")

# Drop respondents who left every metadata format blank (logical mask, see
# the note in the Q28 section).
mformats <- mformats[rowSums(!is.na(mformats)) > 0, ]

# Same 1 / NA indicator recoding as for the data formats.
mformats[] <- lapply(mformats, function(col) {
  as.numeric(ifelse(is.na(col), NA, 1))
})

mformats_colsums <- colSums(mformats[, 1:9], na.rm = TRUE)
mdfcolsums <- as.data.frame(t(t(mformats_colsums)))
mdfcolsums2 <- data.frame(type = row.names(mdfcolsums),
                          count = mdfcolsums[, 1])
mdfcolsums2$percent <- round(mdfcolsums2$count / nrow(mformats) * 100, 1)

plot_metadatafileformat <- ggplot(mdfcolsums2,
                                  aes(x = reorder(type, -count), y = count)) +
  geom_bar(stat = "identity", color = "black", fill = "#0DA6F0") +
  geom_text(aes(label = paste0(count, "\n (", round(percent, 0), "%)"),
                y = count),
            size = 2.5, vjust = -.2) +
  xlab("Metadata file formats used") +
  ylab("Number of research groups") +
  coord_cartesian(ylim = c(0, 62)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1,
                                   family = "Helvetica", size = 11,
                                   color = "black"),
        axis.text.y = element_text(family = "Helvetica", size = 11,
                                   color = "black"),
        plot.margin = unit(c(0.5, 0, 0, 0.5), units = "cm"))

ggsave(filename = "plots/plot_metadatafileformat.eps",
       plot = plot_metadatafileformat,
       width = 3, height = 3, bg = "white", colormodel = "cmyk")

# Number of metadata formats selected per respondent (NA = not selected).
mformat_rowsums <- rowSums(mformats, na.rm = TRUE)

#### Q30. Does your lab use automated scientific workflows ####
summary(data_clean[, 112])
data_clean[, 113]

#### Q31. Does your lab use ontologies or scientific workflows to ####
# standardize metadata
summary(data_clean[, 114])
data_clean[, 115]

#### Q32. Does your lab deposit data into data repositories ####

# --- Collapsed to No / Yes ---
datarepositories <- data_clean[, c(3, 116:120)]
# Drop respondents with no answer for any of the five repository types
# (columns 2-6); logical mask instead of `-which(...)`, see the Q28 note.
datarepositories <-
  datarepositories[rowSums(!is.na(datarepositories[, 2:6])) > 0, ]
names(datarepositories) <- c("RespondentID", "Network", "JournalDB",
                             "DataCenter", "University", "LabArchives")

# Long format: one row per respondent x repository type; the respondent id
# is dropped afterwards, leaving `variable` and `value`.
datarepositories2 <- melt(datarepositories, id.vars = "RespondentID")
datarepositories2 <- datarepositories2[, 2:3]
datarepositories2$value <- as.factor(datarepositories2$value)
# Any non-zero deposit frequency counts as "Yes".
datarepositories2$value <- mapvalues(
  datarepositories2$value,
  from = c("No data sets", "< 5 data sets per year",
           "5-20 data sets per year", "> 20 data sets per year"),
  to = c("No", "Yes", "Yes", "Yes")
)
#datarepositories2$value <- factor(datarepositories2$value,levels(datarepositories2$value)[c(2,1)])

# One row per (variable, value) combination with its frequency.
datarepositories2 <- unique(ddply(datarepositories2, c("variable", "value"),
                                  function(x) data.frame(x, count = nrow(x))))
# NOTE(review): 77 is a hard-coded denominator, presumably the number of
# respondents to this question -- confirm it still matches the data.
datarepositories2$percent <- round(datarepositories2$count / 77 * 100, 1)

plot_repositoryuse <- ggplot(datarepositories2,
                             aes(x = variable, y = percent, fill = value)) +
  # Pre-computed percentages are drawn as bars; geom_bar(stat = "identity")
  # matches the other plots in this file.
  geom_bar(stat = "identity", colour = "transparent", width = 0.7) +
  coord_flip(ylim = c(0, 100)) +
  xlab("") +
  ylab("Percent respondents") +
  ggtitle("Where data are stored/shared") +
  scale_x_discrete(
    limits = rev(c("LabArchives", "University", "DataCenter", "JournalDB",
                   "Network")),
    labels = rev(c("Lab Archives", "University Archives",
                   "Regional Data Center", "Journal Data Repository",
                   "Network Data Repository"))
  ) +
  scale_fill_manual(name = "", values = rev(brewer.pal(5, "RdBu"))[c(1, 5)]) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1,
                                   family = "Helvetica", size = 12,
                                   color = "black"),
        axis.text.y = element_text(family = "Helvetica", size = 12,
                                   color = "black"),
        legend.text = element_text(family = "Helvetica", size = 12,
                                   color = "black"))

ggsave(filename = "plots/plot_repositoryuse.jpg", plot = plot_repositoryuse,
       width = 9, height = 3, units = "in", dpi = 300)

# --- All 4 levels ---
datarepositories <- data_clean[, c(3, 116:120)]
datarepositories <-
  datarepositories[rowSums(!is.na(datarepositories[, 2:6])) > 0, ]
names(datarepositories) <- c("RespondentID", "Network", "JournalDB",
                             "DataCenter", "University", "LabArchives")

datarepositories2 <- melt(datarepositories, id.vars = "RespondentID")
datarepositories2 <- datarepositories2[, 2:3]
datarepositories2$value <- as.factor(datarepositories2$value)
datarepositories2 <- unique(ddply(datarepositories2, c("variable", "value"),
                                  function(x) data.frame(x, count = nrow(x))))
# NOTE(review): same hard-coded denominator as above -- confirm.
datarepositories2$percent <- round(datarepositories2$count / 77 * 100, 1)

# Order and relabel the answer categories by *name*. Picking levels by
# position (levels(...)[c(5, 4, 1, 3, 2)]) depends on how the current locale
# sorts "<", ">" and digits, and can attach the short labels to the wrong
# categories. `exclude = NULL` keeps missing answers as their own level,
# shown as "NA".
datarepositories2$value <- factor(
  datarepositories2$value,
  levels = c(NA, "No data sets", "< 5 data sets per year",
             "5-20 data sets per year", "> 20 data sets per year"),
  labels = c("NA", "0", "<5", "5-20", ">20"),
  exclude = NULL
)
datarepositories2 <- datarepositories2[order(datarepositories2$variable,
                                             datarepositories2$value), ]

plot_repositoryuse_all_levels <-
  ggplot(datarepositories2, aes(x = variable, y = percent, fill = value)) +
  geom_bar(stat = "identity", colour = "gray70", width = 0.7) +
  coord_flip(ylim = c(0, 100)) +
  xlab("") +
  ylab("Percent respondents") +
  scale_x_discrete(
    limits = rev(c("LabArchives", "University", "DataCenter", "JournalDB",
                   "Network")),
    labels = rev(c("Lab Archives", "University Archives",
                   "Regional Data Center", "Journal Data Repository",
                   "Network Data Repository"))
  ) +
  scale_fill_manual(name = "# data sets/year",
                    values = brewer.pal(5, "Blues")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0, hjust = .5,
                                   family = "Helvetica", size = 12,
                                   color = "black"),
        axis.text.y = element_text(family = "Helvetica", size = 12,
                                   color = "black"),
        legend.text = element_text(family = "Helvetica", size = 12,
                                   color = "black"),
        legend.title = element_text(family = "Helvetica", size = 12,
                                    color = "black", face = "plain"),
        legend.position = "top",
        plot.margin = unit(c(0, 0.5, 0, 0), units = "cm"))
plot_repositoryuse_all_levels

ggsave(filename = "plots/plot_repositoryuse_all_levels.eps",
       plot = plot_repositoryuse_all_levels,
       width = 6.5, height = 2.3, bg = "white", colormodel = "cmyk")