# --- Setup: packages, data load, and derived columns -------------------------
# NOTE(review): install.packages() runs unconditionally on every execution;
# harmless but slow and needs network access.
install.packages(c('ggplot2', 'reshape2', 'plyr', 'car', 'tibble', 'ggfortify')) 
library(ggplot2) 
library(reshape2) 
library(plyr) 
library(car) 
library(ggfortify) 

# Load the scraped repository dataset (one row per GitHub repository).
repos_data <- read.csv("repos_data_rfcc.csv") 
summary(repos_data) 

# Measure the number of issues, security issues, and fraction of issues that
# are security issues in our entire dataset.
sum(repos_data$num_issues) 
sum(repos_data$num_security_issues) 
sum(repos_data$num_security_issues) / sum(repos_data$num_issues) 

# Add up total churn: lines added + lines deleted, both for the
# unreviewed-only subset and for all changes.
repos_data$churn_unreviewed_total <- repos_data$churn_unreviewed_lines_added + repos_data$churn_unreviewed_lines_deleted 
repos_data$churn_total <- repos_data$churn_total_lines_added + repos_data$churn_total_lines_deleted 

# Convert GitHub timestamp strings to datetimes.
# Assumes timestamps look like "2015-01-01T12:34:56+00:00" (the "+00:00" is
# matched literally) -- TODO(review): confirm against the CSV; GitHub's API
# also emits the trailing-"Z" form, which this format would fail to parse.
stringToTime <- function(s) { strptime(s, format="%Y-%m-%dT%T+00:00", tz="GMT") } 

# Calculate age as a time difference in seconds.
# (as.numeric() on a difftime honors the units= argument.)
repos_data$age <- as.numeric(stringToTime(repos_data$pushed_at) - stringToTime(repos_data$created_at), units='secs') 

# We notice that some repos have negative age. Let's inspect.
repos_data[repos_data$age <= 0, c('owner', 'name', 'created_at', 'pushed_at')] 
# Drop repos with negative age as bad. 
# Keep only repositories with a strictly positive age (which() drops both
# FALSE and NA rows, matching subset() semantics).
repos_data <- repos_data[which(repos_data$age > 0), ]

# Filter to make sure that we meet the original population requirement
# (but not restricted to 2015, so slightly less stringent).
too_few_issues <- repos_data[which(repos_data$num_issues < 5), ]
nrow(too_few_issues)
too_few_contribs <- repos_data[which(repos_data$num_contributors < 4), ]
nrow(too_few_contribs)

# Perform the actual filtering.
nrow(repos_data)  # rows before
repos_data <- repos_data[which(repos_data$num_issues >= 5), ]
repos_data <- repos_data[which(repos_data$num_contributors >= 4), ]
nrow(repos_data)  # rows after

# Additionally require num_prs > 0, since such rows just get dropped as NA
# later in the models anyway.
nrow(repos_data)
repos_data <- repos_data[which(repos_data$num_prs > 0), ]
nrow(repos_data)

# Drop repositories that don't appear to have any code (no primary language);
# these are often text-only repositories (RFCs, markdown, etc.).
nrow(repos_data[which(repos_data$language1 == ''), ])  # rows with no primary language
repos_data <- repos_data[which(repos_data$language1 != ''), ]
nrow(repos_data)

# Eyeball two suspicious secondary-language labels.
repos_data[repos_data$language2 == "Mercury",]
repos_data[repos_data$language2 == "AGS Script",]

# "AGS Script" is a misdetection here -> blank it out.
hits <- !is.na(repos_data$language2) & repos_data$language2 == "AGS Script"
if (any(hits)) {
  repos_data[hits,]$language2 <- ""
}
# Recode "M" as Matlab.
# NOTE(review): the original trailing comment said this is "actually Markdown,
# which is text, so set to empty" -- that contradicts the "Matlab" recode;
# confirm which is intended.
hits <- !is.na(repos_data$language2) & repos_data$language2 == "M"
if (any(hits)) {
  repos_data[hits,]$language2 <- "Matlab"
}
# --- Clean up mislabeled GitHub language detections ---------------------------
# GitHub's linguist sometimes misclassifies data/text files as exotic
# languages. Recode those labels; "" means "no language".

# Recode `from` -> `to` in the given language column of `df`, skipping NA
# entries and doing nothing when there are no matches (an unguarded zero-row
# assignment would error). Returns the modified data frame.
recode_language <- function(df, column, from, to) {
  hits <- !is.na(df[[column]]) & df[[column]] == from
  if (any(hits)) {
    df[hits, column] <- to
  }
  df
}

# "GCC Machine Description" and "Logos" are misfires on data files: blank them
# out of every language slot.
for (lang_col in c("language1", "language2", "language3")) {
  repos_data <- recode_language(repos_data, lang_col, "GCC Machine Description", "")
  repos_data <- recode_language(repos_data, lang_col, "Logos", "")
}
repos_data <- recode_language(repos_data, "language1", "Game Maker Language", "")
# "IDL" in this dataset is actually QMake project files.
repos_data <- recode_language(repos_data, "language2", "IDL", "QMake")
repos_data <- recode_language(repos_data, "language3", "IDL", "QMake")
repos_data <- recode_language(repos_data, "language2", "Slash", "")
repos_data <- recode_language(repos_data, "language3", "Component Pascal", "")
repos_data <- recode_language(repos_data, "language3", "DIGITAL Command Language", "")

# Some special cases, fixed per repository.
# FIX: the original assignments were unguarded, so the script errored if the
# repo had already been filtered out upstream (zero-row replacement); guard
# with which()/length() like the recodes above.
fix <- which(repos_data$owner == "aard-pwfa" & repos_data$name == "E200_data")
if (length(fix) > 0) {
  repos_data[fix, "language2"] <- "Matlab"
}
fix <- which(repos_data$owner == "danielbierwirth" & repos_data$name == "DropboxBrowser")
if (length(fix) > 0) {
  repos_data[fix, "language2"] <- "Objective-C"
}

# Load in our language info data (memory-safety / is-programming flags per
# language name).
lang_info <- read.csv("lang-info.csv", stringsAsFactors=FALSE)
# Convert from factors to strings... 
# --- Join language metadata onto each repo's three language slots ------------
# Defensive conversion in case read.csv produced factors.
lang_info$name <- as.character(lang_info$name)
repos_data$language1 <- as.character(repos_data$language1)
repos_data$language2 <- as.character(repos_data$language2)
repos_data$language3 <- as.character(repos_data$language3)

# Look up `attribute` in lang_info for each language name in `langs`.
# Unmatched names (e.g. "") yield a zero-length subset, which as.character()
# renders as a string like "logical(0)" -- the downstream memory-safety test
# compares these strings, so keep the lapply/as.character formulation intact.
lookup_lang_attr <- function(langs, attribute) {
  as.character(lapply(langs, function (x) lang_info[lang_info$name == x, attribute]))
}

# Add new cols for each slot i in 1..3:
# - language<i>.is.mem.safe
# - language<i>.is.programming
# (FIX: was six copy-pasted lapply blocks; deduplicated into a loop with the
# same column creation order.)
for (i in 1:3) {
  lang_col <- paste0("language", i)
  repos_data[[paste0(lang_col, ".is.mem.safe")]] <-
    lookup_lang_attr(repos_data[[lang_col]], "is.mem.safe")
  repos_data[[paste0(lang_col, ".is.programming")]] <-
    lookup_lang_attr(repos_data[[lang_col]], "is.programming")
}

# Filter repositories where NONE of the three language slots holds a real
# programming language.
nrow(repos_data)
# TRUE where slot i is empty, missing, or flagged as not a programming language.
not_programming <- function(i) {
  lang <- repos_data[[paste0("language", i)]]
  flag <- repos_data[[paste0("language", i, ".is.programming")]]
  lang == "" | is.na(lang) | flag == "FALSE"
}
mask <- not_programming(1) & not_programming(2) & not_programming(3)
repos_data <- repos_data[!mask,]
nrow(repos_data)

# We say a repo is memory safe if all of its languages are memory safe
# (Written this way to count MAYBE as TRUE for our purposes here.) 
# Flag a repo as memory safe (1/0) when all of its language slots are memory
# safe. The flags are strings ("TRUE"/"FALSE"/"MAYBE"/unmatched); comparing
# against 'FALSE' deliberately counts MAYBE and unknown as safe.
repos_data$is.mem.safe <- as.numeric(
  (repos_data$language1.is.mem.safe != 'FALSE') &
  (repos_data$language2.is.mem.safe != 'FALSE') &
  (repos_data$language3.is.mem.safe != 'FALSE'))
nrow(subset(repos_data, is.mem.safe == 1))

# Count primary languages and their sizes across all remaining repositories.
# We look at the top languages.
numsum <- function(x) { sum(as.numeric(x)) }
counts <- count(repos_data$language1)
bytes <- aggregate(repos_data$language1_bytes,
                   by=list(Language=repos_data$language1), FUN=numsum)
# NOTE(review): relies on plyr::count() and aggregate() ordering groups the
# same way (both sort by language name); a merge() would be sturdier.
counts$bytes <- bytes$x
head(counts[order(-counts$freq),], 10)
summary(repos_data)

# Count the number of repositories, the total number of issues, and the total
# number of PRs.
nrow(repos_data)
sum(repos_data$num_issues)
sum(repos_data$num_prs)

# Count the number of distinct languages across all three slots.
# (FIX: `=` -> `<-`; the original indexed `langs[0:3]`, where the 0 is a
# silent no-op -- the frame has exactly these three columns.)
langs <- subset(repos_data, select=c('language1', 'language2', 'language3'))
langs_long <- stack(lapply(langs, as.character))
length(unique(langs_long$values))

# Density plot of each raw variable: all heavily skewed, motivating the log
# transforms below.
par(mfrow=c(4,4))
vars <- c('num_prs_self_reviewed','num_security_issues', 'num_forks', 'size',
          'num_stars', 'num_watchers', 'num_contributors',
          'commenters_per_pr_mean', 'num_issues', 'num_prs', 'age',
          'is.mem.safe', 'churn_unreviewed_total', 'churn_total')
for(v in vars) {
  plot(density(na.omit(repos_data[, v])), main=v)
}
# Correlation matrix (pairwise) using Spearman due to non-normality of raw variables. 
# Spearman (pairwise-complete) correlations on the raw variables.
cor(repos_data[, c('num_security_issues', 'num_prs_self_reviewed', 'num_forks',
                   'size', 'num_stars', 'num_watchers', 'num_contributors',
                   'num_issues', 'age', 'is.mem.safe', 'churn_unreviewed_total',
                   'churn_total', 'commenters_per_pr_mean')],
    use = "pairwise.complete.obs", method = "spearman")

# Log (natural) normalize skewed variables: <v>_log = log(<v> + 1).
# (FIX: was 14 copy-pasted assignments; looped with the same column order.)
skewed_vars <- c('num_forks', 'size', 'num_stars', 'num_watchers',
                 'num_contributors', 'num_prs', 'age',
                 'churn_unreviewed_total', 'churn_total',
                 'commenters_per_issue_mean', 'commenters_per_pr_mean',
                 'num_prs_self_reviewed', 'num_issues', 'num_security_issues')
for (v in skewed_vars) {
  repos_data[[paste0(v, "_log")]] <- log(repos_data[[v]] + 1)
}

# More participation metrics: total comments per PR = issue-style comments
# plus review comments, each also log-normalized.
repos_data$pr_comments_mean <- repos_data$pr_issue_comments_mean + repos_data$pr_review_comments_mean
repos_data$pr_comments_mean_log <- log(repos_data$pr_comments_mean + 1)
repos_data$pr_issue_comments_mean_log <- log(repos_data$pr_issue_comments_mean + 1)
repos_data$pr_review_comments_mean_log <- log(repos_data$pr_review_comments_mean + 1)

# Inspect the transformed distributions.
par(mfrow=c(5,4))
vars <- c('num_prs_self_reviewed_log', 'num_security_issues_log',
          'num_forks_log', 'size_log', 'num_stars_log', 'num_watchers_log',
          'num_contributors_log', 'commenters_per_issue_mean_log',
          'commenters_per_pr_mean_log', 'num_issues_log', 'num_prs_log',
          'age_log', 'is.mem.safe', 'churn_unreviewed_total_log',
          'churn_total_log', 'pr_comments_mean_log',
          'pr_issue_comments_mean_log', 'pr_review_comments_mean_log')
for(v in vars) {
  plot(density(na.omit(repos_data[, v])), main=v)
}

# Spearman correlation matrix (pairwise) on the log-normalized data.
vars <- c('num_prs_self_reviewed_log', 'num_security_issues_log',
          'num_forks_log', 'size_log', 'num_stars_log', 'num_watchers_log',
          'num_contributors_log', 'commenters_per_pr_mean_log',
          'num_issues_log', 'num_prs_log', 'age_log', 'is.mem.safe',
          'churn_unreviewed_total_log', 'churn_total_log',
          'pr_comments_mean_log', 'pr_issue_comments_mean_log',
          'pr_review_comments_mean_log')
cor(repos_data[, vars], method='spearman', use = "pairwise.complete.obs")

# Same, restricted to the variable subset used in the models below.
vars <- c('num_prs_self_reviewed_log', 'num_security_issues_log', 'size_log',
          'num_stars_log', 'num_contributors_log',
          'commenters_per_pr_mean_log', 'num_issues_log', 'num_prs_log',
          'age_log', 'is.mem.safe', 'churn_total_log', 'pr_comments_mean_log',
          'pr_review_comments_mean_log')
cor(repos_data[, vars], method='spearman', use = "pairwise.complete.obs")

# (RQ1) Regression: log total issues ~ log factors (including PRs).
# FIX: the size term was written `repos_data$ size_log` inside the formula;
# with data=repos_data the fit happens to be identical, but it breaks
# predict()/subsetting and is inconsistent with every other term, so use the
# bare column name.
rq1.model <- lm(num_issues_log ~ num_prs_self_reviewed_log + size_log +
                  num_contributors_log + churn_total_log + num_stars_log +
                  age_log + factor(is.mem.safe),
                data=repos_data)
summary(rq1.model)

# (RQ1 Alternate) Regression on log total issues ~ log churn coverage factors.
# We check whether using unreviewed churn instead affects the model (it does not).
rq1.alt <- lm(num_issues_log ~ churn_unreviewed_total_log + churn_total_log +
                num_prs_log + size_log + num_contributors_log + num_stars_log +
                age_log + factor(is.mem.safe),
              data=repos_data)
summary(rq1.alt)

# Evaluate Collinearity
vif(rq1.model) # variance inflation factors
any(vif(rq1.model) > 5) # problem? 
# Diagnostic plots for RQ1.
autoplot(rq1.model, ncol = 2)

# (RQ2) Regression: log security issues ~ log factors (including PRs).
rq2.model <- lm(num_security_issues_log ~ num_prs_self_reviewed_log +
                  size_log + num_contributors_log + num_prs_log +
                  churn_total_log + num_stars_log + age_log +
                  factor(is.mem.safe),
                data=repos_data)
summary(rq2.model)

# (RQ2 alternate) log security issues ~ log churn coverage factors.
# We check whether using unreviewed churn instead affects the model (it does not).
rq2.1 <- lm(num_security_issues_log ~ churn_unreviewed_total_log +
              churn_total_log + num_prs_log + size_log +
              num_contributors_log + num_stars_log + age_log +
              factor(is.mem.safe),
            data=repos_data)
summary(rq2.1)

# Collinearity check for RQ2.
rq2.vifs <- vif(rq2.model)
rq2.vifs            # variance inflation factors
any(rq2.vifs > 5)   # problem?

# Diagnostic plots for RQ2.
autoplot(rq2.model, ncol = 2)

# (RQ3) Participation factors vs. overall issues.
rq3.model <- lm(num_issues_log ~ commenters_per_pr_mean_log +
                  pr_review_comments_mean_log + num_prs_log + size_log +
                  num_contributors_log + churn_total_log + num_stars_log +
                  age_log + factor(is.mem.safe),
                data=repos_data)
summary(rq3.model)

# Collinearity check for RQ3.
rq3.vifs <- vif(rq3.model)
rq3.vifs            # variance inflation factors
any(rq3.vifs > 5)   # problem?

# Diagnostic plots for RQ3.
autoplot(rq3.model, ncol = 2)

# (RQ4) Participation factors vs. security issues.
rq4.model <- lm(num_security_issues_log ~ commenters_per_pr_mean_log +
                  pr_review_comments_mean_log + size_log +
                  num_contributors_log + num_stars_log + age_log +
                  churn_total_log + num_prs_log + factor(is.mem.safe),
                data=repos_data)
summary(rq4.model)

# Collinearity check for RQ4.
rq4.vifs <- vif(rq4.model)
rq4.vifs            # variance inflation factors
any(rq4.vifs > 5)   # problem?

# Diagnostic plots for RQ4.
autoplot(rq4.model, ncol = 2)

# Record the R and package versions used for this analysis.
sessionInfo()