---
title: "Abby's SILAC data analysis"
output: html_notebook
---

This analysis refers to the 3 day SILAC timecourse (0, 1, 2, 3 days) Abby did using C2C12 cells. You may want to start by creating a new R Project inside the directory where you have the data for this analysis.

Here are the libraries we'll need for this script:

```{r message=FALSE, warning=FALSE, results='hide'}
library(tidyverse)
library(readr)
library(tidyr)
library(broom)
library(minpack.lm)
#library(nls2)
#library(multidplyr)
#library(fields)
#library(clue)
#library(readxl)
#library(ggplot2)
#library(reshape2)
```

# Data cleanup and filtering:

We'll begin by importing and cleaning the raw data. It can be found in 4 separate .txt files inside the raw data files folder.

```{r message=FALSE, warning=FALSE, results='hide'}
dir.create("RESULTS")

# Can be changed to csv or other if needed
extension <- "txt"
fileNames <- Sys.glob(paste("*.", extension, sep = ""))

for (fileNumber in 1:length(fileNames)) {
  # Build the output file names for this input file
  aggFileName <- paste("RESULTS/mean-", sub(paste("\\.", extension, sep = ""), "", fileNames[fileNumber]), ".", extension, sep = "")
  OriginalFileName <- paste("RESULTS/Original-", sub(paste("\\.", extension, sep = ""), "", fileNames[fileNumber]), ".", extension, sep = "")
  FilteredFileName <- paste("RESULTS/Filtered-", sub(paste("\\.", extension, sep = ""), "", fileNames[fileNumber]), ".", extension, sep = "")

  MHSK_22 <- read_delim(fileNames[fileNumber], "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE, comment = "H\t")
  # MHSK_22 <- read_delim(paste0("Mito/",fileNames[fileNumber]), "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE, comment = "H\t")

  # Recover the header lines (skipped by the comment argument above) to use as column names
  conn <- file(fileNames[fileNumber], open = "r")
  linn <- readLines(conn)
  pat <- "H,PLINE,LOCUS.*|H,SLINE,UNIQUE.*|H PLINE LOCUS.*|H SLINE UNIQUE.*"
  MHSKhead <- do.call(rbind.data.frame, strsplit(grep(pattern = pat, x = linn, value = T), "[,|\t]"))
  rm(conn)
  rm(linn)
  rm(pat)

  # Carry each protein ("P") line's ID and description down to its peptide ("S") lines
  ProtName <- c("foo")
  Descript <- c("bar")
  MHSK_22$UniprotID <- ""
  MHSK_22$Description <- ""
  MHSKhead[] <- sapply(MHSKhead, as.character)
  ProtN <- grep(pattern = "LOCUS", x = MHSKhead)[1] - 1
  DescN <- grep(pattern = "DESCRIPTION", x = MHSKhead)[1] - 1
  for (i in 1:dim(MHSK_22)[1]) {
    if (MHSK_22$X1[i] == "P") {
      ProtName <- MHSK_22[i, ProtN]
      Descript <- MHSK_22[i, DescN]
    } else if (MHSK_22$X1[i] == "S") {
      MHSK_22[i, "UniprotID"] <- ProtName
      MHSK_22[i, "Description"] <- Descript
    }
  }
  rm(ProtName)
  rm(Descript)

  # Drop the protein lines and the all-"N/A" columns, then name the remaining columns from the header
  MHSK_22 <- MHSK_22[MHSK_22$X1 != "P", ]
  rmcols <- grep(pattern = "N/A", x = MHSK_22[1, ])
  MHSK_22 <- MHSK_22 %>% select(-rmcols)
  replace_col <- dim(select(MHSK_22, starts_with("X")))[2]
  colnames(MHSK_22)[1:replace_col] <- as.character(MHSKhead[2, c(2:(replace_col + 1))])
  # Keep an eye here: depending on the database the search was done against, you may need to change the indices on the second part of the assignment (MHSKhead).
  rm(MHSKhead)

  write.table(MHSK_22, OriginalFileName, append = FALSE, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

  # This is where the filtering happens, so if any changes are necessary you can make them by altering what we have below.
  # Note that the regression factor filter is taking out a lot.
  MHSK_22filt <- MHSK_22 %>%
    filter(AREA_RATIO != 1 & PROFILE_SCORE >= 0.8) %>%
    filter(AREA_RATIO >= 9 | (AREA_RATIO <= 0.111 & REGRESSION_FACTOR > 0) | (between(AREA_RATIO, 0.111, 9) & REGRESSION_FACTOR >= 0.8))
  rm(MHSK_22)

  # Here we subset the table to keep only the columns of interest.
  # We also create a new column where we calculate the percent heavy from the area ratios.
  MHSK_22sub <- MHSK_22filt %>%
    select(UNIQUE, SEQUENCE, AREA_RATIO, UniprotID, Description) %>%
    filter(AREA_RATIO != Inf) %>%
    mutate(PercentHeavy = 100 * (1 / (1 + AREA_RATIO)))
  rm(MHSK_22filt)

  write.table(MHSK_22sub, FilteredFileName, append = FALSE, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

  # Now we create another table where we aggregate the data at the protein level.
  # We calculate the mean, standard deviation, and peptide count for each protein.
  aggdata22 <- MHSK_22sub %>%
    group_by(UniprotID, Description) %>%
    summarise(mean = mean(PercentHeavy), sd = sd(PercentHeavy), N = length(PercentHeavy))
  rm(MHSK_22sub)

  write.table(aggdata22, aggFileName, append = FALSE, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
  rm(aggdata22)
}
```

Now that we have imported, cleaned up, and saved the clean data tables, let's clean the environment and re-import only the tables of interest here.

```{r}
rm(list=ls())
```

Write a function to load and combine the data for different timepoints:

```{r message=FALSE, warning=FALSE, paged.print=FALSE}
readFun <- function(filename) {
  # read in the data
  data <- read_delim(filename, "\t", escape_double = FALSE, trim_ws = TRUE)
  # add a file column containing the file name
  data$file <- filename
  return(data)
}
```

We set the working directory for our files and run the data import function:

```{r message=FALSE, warning=FALSE, paged.print=FALSE}
#setwd("/Users/abigailbuchwalter/PROTEOMICS/LM proteome wide 3-17-19/RESULTS")

# Apply readFun
All_data <- plyr::ldply(.data = list.files(pattern = "Filtered-*"), .fun = readFun)

# Remember to change the timepoints here according to your experimental design.
Time_d = c(0, 1, 2, 3)
Samples <- cbind(Files = unique(All_data$file), Time_d)

# Add time points according to file names.
All_data$Time_d <- plyr::mapvalues(All_data$file, Samples[,1], Samples[,2])
```

We can see how many peptides we had for each timepoint below:

```{r message=FALSE, warning=FALSE}
table(All_data$file, All_data$Time_d)
```

We can also check how the proportion of heavy label for the peptides changes over time:

```{r message=FALSE, warning=FALSE}
ggplot(data=All_data, aes(x=as.factor(Time_d), y=PercentHeavy)) + geom_boxplot()
```

We'll do this for the peptide-level data and also for the protein-level data. We want to calculate fits for individual peptides and average those values to obtain protein-level calculations with a standard error. So we'll import the tables of aggregate data too, also created above.

We set the working directory for our files and run the data import function:

```{r message=FALSE, warning=FALSE}
# If this gives a warning indicating it cannot change the working directory, do that by hand and then run this chunk by pasting it in the console.
#setwd("/data/users_data/collaborations/JC_SK_AB/AB 3D myotube SILAC for Ju 2/raw data files/RESULTS")

# Apply readFun
All_P_data <- plyr::ldply(.data = list.files(pattern = "mean-*"), .fun = readFun)

Samples <- cbind(Samples, Files2 = unique(All_P_data$file))

# Add time points:
All_P_data$Time_d <- plyr::mapvalues(All_P_data$file, Samples[,3], Samples[,2])
```

We can see how many proteins we had for each timepoint below:

```{r message=FALSE, warning=FALSE}
table(All_P_data$file, All_P_data$Time_d)
```

We can also check the proportion of heavy label for the proteins over time:

```{r message=FALSE, warning=FALSE}
ggplot(data=All_P_data, aes(x=as.factor(Time_d), y=mean)) + geom_boxplot()
```

Now let's filter out any proteins that do not have at least 2 peptides in a given timepoint, and that are not present in at least 3 timepoints:

```{r message=FALSE, warning=FALSE}
All_P_data2 <- filter(All_P_data, N > 1)
test <- plyr::count(All_P_data2$UniprotID)
test <- filter(test, freq > 2)
All_P_data2 <- All_P_data2[All_P_data2$UniprotID %in% test$x, ]
rm(test)
```

### Test filtering for peptides present in at least 3 timepoints (Ju addition 3/20/19)

```{r}
filterpep <- All_data %>%
  select(UniprotID, Description, SEQUENCE, UNIQUE, Time_d) %>%
  group_by(UniprotID, Description, SEQUENCE, UNIQUE) %>%
  tally() %>%
  filter(n < 3) %>%
  select(UniprotID, Description, SEQUENCE, UNIQUE) %>%
  distinct()

All_data_filt <- anti_join(x = All_data, y = filterpep)

# The above can still be combined with the previous protein filter (1st one) to ensure there were at least 2 peptides per timepoint if you'd like. Or that can be done later too.

# Just checking how much we lose:
All_data %>% select(UniprotID, Description, SEQUENCE, UNIQUE) %>% group_by(UniprotID, Description, SEQUENCE, UNIQUE) %>% tally() %>% distinct() %>% nrow()
All_data_filt %>% select(UniprotID, Description, SEQUENCE, UNIQUE) %>% group_by(UniprotID, Description, SEQUENCE, UNIQUE) %>% tally() %>% distinct() %>% nrow()
```

Same filter as above for the peptide level data:

```{r message=FALSE, warning=FALSE}
All_data2 <- All_data[All_data$UniprotID %in% All_P_data2$UniprotID, ]
```

OF NOTE: We already lost almost half the proteins present using the filters above. Just thought I should mention that in case you guys think those are too stringent.

Save the tables:

```{r message=FALSE, warning=FALSE}
#setwd("/data/users_data/collaborations/JC_SK_AB/AB 3D myotube SILAC for Ju 2/raw data files/RESULTS")
write.csv(All_data2, "filtered peptide data for half-life.csv")
write.csv(All_P_data2, "filtered protein data for half-life.csv")
```

Further filter to only reviewed Uniprot IDs. This is done manually by pulling the list of Uniprot IDs off the above tables, running them through Uniprot, exporting reviewed IDs only, and matching with the code below.
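For the manual step, Uniprot's ID mapping tool just needs a plain list of IDs, one per line. A minimal sketch of how you might export that list is below; the output file name is hypothetical (it isn't read anywhere else in this script), and the reviewed-ID files used in the matching chunks below still come back from Uniprot itself.

```{r eval=FALSE}
# Write one Uniprot ID per line for upload to Uniprot's ID mapping tool.
# "IDs for uniprot mapping.txt" is a placeholder name.
writeLines(unique(All_P_data2$UniprotID), "IDs for uniprot mapping.txt")
```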
Peptide level - Table S1:

```{r}
library(readr)
Dat <- read_csv("filtered peptide data for half-life.csv")
ref_list <- read_csv("peptide data Uniprot reviewed IDs.csv", col_names = FALSE)
ref_list_IDs <- ref_list[,"X1"]
sorted <- Dat$UniprotID %in% ref_list_IDs$X1
subset <- Dat[sorted,]
write_csv(subset, "Table S1 filtered peptide data for half-life reviewed only.csv")
```

Protein level - Table S2:

```{r}
library(readr)
Dat <- read_csv("filtered protein data for half-life.csv")
ref_list <- read_csv("protein data Uniprot reviewed IDs.csv", col_names = FALSE)
ref_list_IDs <- ref_list[,"X1"]
sorted <- Dat$UniprotID %in% ref_list_IDs$X1
subset <- Dat[sorted,]
write_csv(subset, "Table S2 filtered protein data for half-life reviewed only.csv")
```

# Fitting NLS and LM for comparison:

We'll begin by averaging the values of the percent heavy for peptides with the exact same sequence mapped to the same protein from the same timepoint. We need to do this because we have removed the other variables (like CS state) that could differentiate them, and also because I just think that it makes sense.

# Abby note: I don't think I agree with this. I think that each individual spec count should be treated as an independent observation. This may be collapsing down the influence of some "good" peptides that fly well and were observed multiple times.
# I want to go forward with peptide-level and protein-level analysis on All_P_data2, AKA "filtered protein data for half-life"

```{r}
All_data2_avg <- All_data2 %>%
  rename(time = Time_d, ratio = PercentHeavy) %>%
  mutate(time = as.numeric(time))

All_P_data2_avg <- All_P_data2 %>%
  rename(time = Time_d, ratio = mean) %>%
  mutate(time = as.numeric(time))
```

Since Shefali had mentioned before that she would like to have the half-lives even for the proteins and peptides that did not pass the filters above, we'll use the entire dataset to fit the models as well. So we'll do the same averaging here.

```{r}
All_data_avg <- All_data %>%
  rename(time = Time_d, ratio = PercentHeavy) %>%
  mutate(time = as.numeric(time))

All_P_data_avg <- All_P_data %>%
  rename(time = Time_d, ratio = mean) %>%
  mutate(time = as.numeric(time))
```

OF NOTE: The data indicated with a '2' is the filtered dataset (e.g. All_data2_avg) while the data with no number is the unfiltered data (e.g. All_data_avg). Also, the data indicated with a '_P_' is the aggregate protein-level data (e.g. All_P_data_avg), and the data without the P is the peptide-level data (above).

First I'm creating a function that will fit the linear model to the log(Percent heavy) over time. This is the equation for the model: $\log(y) = \beta_1 + \alpha t$, where $y$ is the percent heavy, $\beta_1$ is the intercept, $t$ is time, and $\alpha$ is the rate of degradation.

This first round will consider each peptide individually, without taking into account whether 2 peptides belong to the same protein. This is only going to be useful so you can check what the Thalf estimate for each peptide was. For example, in case there is an obviously divergent Thalf estimate for a single peptide that is not unique, we could check whether that peptide most likely belongs to another protein and consider eliminating it. The linear model fit will be done for the peptide-level data and also for the aggregate protein-level data.
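For reference, the half-life estimate `ThalfEst = log(2)/-alpha` computed inside the functions below follows directly from the model above:

$$
\log(y) = \beta_1 + \alpha t
\;\Rightarrow\;
y(t) = e^{\beta_1} e^{\alpha t}
\;\Rightarrow\;
\frac{y(t_{1/2})}{y(0)} = e^{\alpha t_{1/2}} = \frac{1}{2}
\;\Rightarrow\;
t_{1/2} = \frac{\log 2}{-\alpha},
$$

which is positive whenever the percent heavy is actually decaying ($\alpha < 0$).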
Create the functions:

For the peptide level:

```{r message=FALSE, warning=FALSE}
FitLMdegPep <- function(Curve) {
  # Fit one linear model per peptide (grouped by sequence and protein)
  FitDataLM <- Curve %>%
    group_by(UNIQUE, SEQUENCE, UniprotID, Description) %>%
    do(fit = lm(log(ratio) ~ time, data = .))

  # Extract intercept and slope (alpha), then estimate the half-life
  FitDataLM1 <- FitDataLM %>%
    tidy(fit) %>%
    select(UNIQUE, SEQUENCE, UniprotID, Description, term, estimate) %>%
    spread(term, estimate) %>%
    rename(intercept = `(Intercept)`, alpha = time) %>%
    mutate(ThalfEst = log(2)/-alpha)

  # FitDataLM2 <- FitDataLM %>% glance(fit) %>%
  #   rename(p_val = p.value) %>%
  #   select(UNIQUE, SEQUENCE, UniprotID, Description, term, p_val) %>%
  #   spread(term, p_val) %>%
  #   select(UNIQUE, SEQUENCE, UniprotID, Description, time) %>%
  #   rename(p_val_alpha = time) %>%
  #   bind_cols(NRMSE = FitDataLMrmse/100)

  # Compute error metrics for each fit, back-transformed to the original (non-log) scale
  ErrorsModelr <- function(modlist) {
    MAPE = modelr::mape(model = modlist, data = modlist$model %>% rename("ratio" = "log(ratio)") %>% mutate(ratio = exp(ratio)))
    RSAE = modelr::rsae(model = modlist, data = modlist$model %>% rename("ratio" = "log(ratio)") %>% mutate(ratio = exp(ratio)))
    Rsquared = modelr::rsquare(model = modlist, data = modlist$model %>% rename("ratio" = "log(ratio)") %>% mutate(ratio = exp(ratio)))
    Errdf <- cbind(MAPE, RSAE, Rsquared)
    return(Errdf)
  }

  FitDataLMErr <- as_tibble(t(sapply(FitDataLM$fit, ErrorsModelr)))
  colnames(FitDataLMErr) <- c("MAPE", "RSAE", "Rsqr")
  FitDataLM3 <- cbind(ungroup(FitDataLM1), FitDataLMErr)
  return(FitDataLM3)
}
```

For the protein level:

```{r}
FitLMdegProt <- function(Curve) {
  # Same as above, but one linear model per protein
  FitDataLM <- Curve %>%
    group_by(UniprotID, Description) %>%
    do(fit = lm(log(ratio) ~ time, data = .))

  FitDataLM1 <- FitDataLM %>%
    tidy(fit) %>%
    select(UniprotID, Description, term, estimate) %>%
    spread(term, estimate) %>%
    rename(intercept = `(Intercept)`, alpha = time) %>%
    mutate(ThalfEst = log(2)/-alpha)

  ErrorsModelr <- function(modlist) {
    MAPE = modelr::mape(model = modlist, data = modlist$model %>% rename("ratio" = "log(ratio)") %>% mutate(ratio = exp(ratio)))
    RSAE = modelr::rsae(model = modlist, data = modlist$model %>% rename("ratio" = "log(ratio)") %>% mutate(ratio = exp(ratio)))
    Rsquared = modelr::rsquare(model = modlist, data = modlist$model %>% rename("ratio" = "log(ratio)") %>% mutate(ratio = exp(ratio)))
    Errdf <- cbind(MAPE, RSAE, Rsquared)
    return(Errdf)
  }

  FitDataLMErr <- as_tibble(t(sapply(FitDataLM$fit, ErrorsModelr)))
  colnames(FitDataLMErr) <- c("MAPE", "RSAE", "Rsqr")
  FitDataLM3 <- cbind(ungroup(FitDataLM1), FitDataLMErr)
  return(FitDataLM3)
}
```

Fit the linear models to the data:

```{r message=FALSE, warning=FALSE, include=FALSE, results='hide'}
# The peptide level functions take quite a while to run, so we'll save them for later. This is only useful in case we need to check if a specific peptide is messing up our protein level data.
#Pep2_LM <- FitLMdegPep(All_data2_avg)
#Prot2_LM <- FitLMdegProt(All_P_data2_avg)
Pep_LM <- FitLMdegPep(All_data_avg)
# Run below if you'd like data for proteins that did not make the filter cut.
#Prot_LM <- FitLMdegProt(All_P_data_avg)
```

Quick check on the values we get from the LMs (note: Prot2_LM and Prot_LM require uncommenting the corresponding fits above):

```{r}
Prot2_LM %>%
  #select(UniprotID, ends_with("RSAE")) %>%
  mutate(MAPE = log(MAPE), ThalfEst = log(ThalfEst)) %>%
  rename("log_MAPE" = "MAPE", "log_ThalfEst" = "ThalfEst") %>%
  gather(key = Measurements, value = Values, -UniprotID, -Description) %>%
  ggplot(aes(x = Values)) +
  geom_histogram(bins = 100) +
  facet_wrap(. ~ Measurements, scales = "free")
```

```{r}
Prot_LM %>%
  #select(UniprotID, ends_with("RSAE")) %>%
  mutate(MAPE = log(MAPE), ThalfEst = log(ThalfEst)) %>%
  rename("log_MAPE" = "MAPE", "log_ThalfEst" = "ThalfEst") %>%
  gather(key = Measurements, value = Values, -UniprotID, -Description) %>%
  ggplot(aes(x = Values)) +
  geom_histogram(bins = 100) +
  facet_wrap(. ~ Measurements, scales = "free")
```

OF NOTE: So far the only filter was on the initial data, from the peptides and proteins. We kept the data for proteins that had peptides present in at least 3 timepoints and also had at least 2 peptides for each timepoint. This is important here because we'll soon use a second round of filters, where we use the error values for the model fits to eliminate more proteins.

We can also save these complete data sets before filtering further:

```{r}
#setwd("/data/users_data/collaborations/JC_SK_AB/AB 3D myotube SILAC for Ju 2/model fits")
write_csv(Pep_LM, "LM peptide no filter.csv")
# write_csv(Pep_LM_Comb, "LM peptide grouped by protein no filter.csv")
#write_csv(Pep2_LM, "LM peptide filter 1.csv")
# write_csv(Pep2_LM_Comb, "LM peptide grouped by protein filter 1.csv")
#write_csv(Prot_LM, "LM aggregate protein no filter.csv")
#write_csv(Prot2_LM, "LM aggregate protein filter 1.csv")
```

We want to compute average protein-level values from the individual peptide fits that pass quality filters. We need to first filter by Rsqr >= 0.8, remove NAs in the Rsqr column, and remove peptides with negative half-lives (where are these coming from?? — they arise whenever the fitted slope alpha is positive, since ThalfEst = log(2)/-alpha). Note that removing peptides that start at day 0 less than 90% old might help too.

```{r}
# Note: Pep2_LM comes from the commented-out FitLMdegPep(All_data2_avg) call above.
Pep2_LM_filt <- Pep2_LM[Pep2_LM$Rsqr >= 0.8 | is.na(Pep2_LM$Rsqr), ]
Pep2_LM_filt2 <- Pep2_LM_filt[Pep2_LM_filt$ThalfEst > 0 | is.na(Pep2_LM_filt$ThalfEst), ]
Pep2_LM_filtclean <- drop_na(Pep2_LM_filt2, Rsqr)
```

Then we need to average these values and report the standard deviation for the rate, the protein-level half-life, and the protein-level Rsqr:

```{r}
library(dplyr)
Prot_from_pept_LM <- summarise(group_by(Pep2_LM_filtclean, UniprotID),
                               alpha_mean = mean(alpha),
                               sd = sd(alpha),
                               ThalfEst = mean(ThalfEst),
                               Rsqr = mean(Rsqr))
```

One more level of filtering: remove proteins that only have 1 peptide with a good fit. These will have NA as the standard deviation for alpha. This is a pretty stringent criterion.

```{r}
Prot_from_pept_LM_2peptidefits <- drop_na(Prot_from_pept_LM, sd)
```

```{r}
write_csv(Prot_from_pept_LM, "LM protein averaged from peptide data.csv")
write_csv(Prot_from_pept_LM_2peptidefits, "LM protein averaged >2 peptides w fit.csv")
```

# Annotate the data:

First, subset the proteome-level data by removing redundant proteins and retaining only reviewed Uniprot IDs. Manually map all Uniprot IDs in "LM protein averaged from peptide data.csv" in Uniprot; pull out the subset of IDs that are reviewed. We subset those as done below.
```{r}
library(readr)
Dat <- read_csv("LM protein averaged from peptide data.csv")
ref_list <- read_csv("LM results Uniprot reviewed IDs.csv", col_names = FALSE)
ref_list_IDs <- ref_list[,"X1"]
sorted <- Dat$UniprotID %in% ref_list_IDs$X1
subset <- Dat[sorted,]
write_csv(subset, "LM protein averaged from peptide data reviewed only.csv")
```

```{r}
Dat2 <- read_csv("LM protein averaged >2 peptides w fit.csv")
reflist_2 <- read_csv("LM results 2pepts Uniprot reviewed IDs.csv", col_names = FALSE)
reflist_2 <- reflist_2[,"X1"]
sorted2 <- Dat2$UniprotID %in% reflist_2$X1
subset2 <- Dat2[sorted2,]
write_csv(subset2, "LM protein averaged >2 peptides reviewed only.csv")
```

With this cleaned-up proteome-level dataset, we want to report some proteome-level statistics, including:

1. median %heavy
2. mean Thalf
3. median Thalf
4. standard deviation
5. some meaningful statistical estimate of spread (quintiles or deciles, perhaps) so that we can make statements about a calculated half-life being in the top 10% or bottom 10%, etc.

Cleaning up the protein-level %heavy dataset to reviewed protein IDs that had good fits only:

```{r}
ref_list <- read_csv("LM results Uniprot reviewed IDs.csv", col_names = FALSE)
ref_list_IDs <- ref_list[,"X1"]
sorted <- All_P_data2_avg$UniprotID %in% ref_list_IDs$X1
subset <- All_P_data2_avg[sorted,]
write_csv(subset, "proteins reviewed only.csv")
```

Cross-referencing reviewed proteins with good fits against the larger subset of reviewed proteins with good coverage from above (Table S2) to see which proteins didn't get a half-life prediction. We may also want to report this in a supplemental table.

```{r}
Dat3 <- read_csv("Table S2 filtered protein data for half-life reviewed only.csv")
ref_list <- read_csv("LM results Uniprot reviewed IDs.csv", col_names = FALSE)
ref_list_IDs <- ref_list[,"X1"]
sorted <- !(Dat3$UniprotID %in% ref_list_IDs$X1)
subset <- Dat3[sorted,]
write_csv(subset, "reviewed proteins with no fit.csv")
```

Median percent heavy for the entire proteome (plot this in Figure 1E):

```{r}
All_P_data2_avg %>%
  spread(key = time, value = ratio) %>%
  rename("day0" = "0", "day1" = "1", "day2" = "2", "day3" = "3") %>%
  dplyr::select(day0, day1, day2, day3) %>%
  summarise_all(median, na.rm = T)
```

Next, the proteome-level values:

1. mean Thalf
2. median Thalf
3. standard deviation
4. some meaningful statistical estimate of spread (quintiles or deciles, perhaps) so that we can make statements about a calculated half-life being in the top 10% or bottom 10%, etc.

Report these values in Table S5.

```{r}
mean(subset2$ThalfEst)
mean(subset2$alpha_mean)
median(subset2$ThalfEst)
median(subset2$alpha_mean)
mean(subset2$Rsqr)
median(subset2$Rsqr)
sd(subset2$ThalfEst)
sd(subset2$alpha_mean)

# ntile with n = 10 assigns deciles (the column was originally misnamed "quintile")
subset2 %>% mutate(decile = ntile(ThalfEst, 10)) -> subset_deciles
# Note: this overwrites the "reviewed only" file written earlier, now with the decile column added.
write_csv(subset_deciles, "LM protein averaged >2 peptides reviewed only.csv")
```

Checking peptides:

```{r}
All_data2_avg %>%
  filter(UniprotID == "O71033" & SEQUENCE == "R.KLEAGIR.G") %>%
  ggplot(aes(x = time, y = log(ratio), color = SEQUENCE)) +
  geom_point() +
  geom_line()
```

We'll begin by annotating the data according to Abby's lists of UniprotIDs for proteins present in different cellular compartments.
```{r}
#setwd("/data/users_data/collaborations/JC_SK_AB/AB 3D myotube SILAC for Ju 2/curated Uniprot lists")
NET <- read_csv("mouse_NET_IDs .csv", col_names = "UniprotID")
NET <- cbind(NET, CC = rep("NET", length(NET$UniprotID)))
IntFil <- read_csv("mouse_intermedfilament_IDs.csv", col_names = "UniprotID")
IntFil <- cbind(IntFil, CC = rep("IntFil", length(IntFil$UniprotID)))
INM <- read_csv("mouse_INM_IDs .csv", col_names = "UniprotID")
INM <- cbind(INM, CC = rep("INM", length(INM$UniprotID)))
ER <- read_csv("mouse_ER_IDs .csv", col_names = "UniprotID")
ER <- cbind(ER, CC = rep("ER", length(ER$UniprotID)))
Hist <- read_csv("Histone_IDs.csv", col_names = c("UniprotID", "ENTRY-NAME"),
                 col_types = cols(X3 = col_skip(), X4 = col_skip(), X5 = col_skip(),
                                  X6 = col_skip(), X7 = col_skip()),
                 skip = 1)
Hist$CC <- rep("histone", length(Hist$UniprotID))

# if (!requireNamespace("BiocManager"))
#   install.packages("BiocManager")
# BiocManager::install()
#
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("UniProt.ws", version = "3.8")

library(UniProt.ws)
up <- UniProt.ws(10090)

# Pull entry names from Uniprot for each compartment list (mouse, taxon 10090)
IntFil <- inner_join(IntFil, select(up, IntFil$UniprotID, "ENTRY-NAME", "UNIPROTKB"), by = c("UniprotID" = "UNIPROTKB"))
ER <- inner_join(ER, select(up, ER$UniprotID, "ENTRY-NAME", "UNIPROTKB"), by = c("UniprotID" = "UNIPROTKB"))
INM <- inner_join(INM, select(up, INM$UniprotID, "ENTRY-NAME", "UNIPROTKB"), by = c("UniprotID" = "UNIPROTKB"))
NET <- inner_join(NET, select(up, NET$UniprotID, "ENTRY-NAME", "UNIPROTKB"), by = c("UniprotID" = "UNIPROTKB"))

# From the paper figures I'm not sure what the histones are for, so I'll keep them in a separate table for now.
Abby_anno <- rbind(IntFil, ER, INM, NET)
rm(INM)
rm(IntFil)
rm(ER)
rm(NET)
```

Add the combined LM or NLS data to the annotations (note: Pep2_LM_Comb and Pep2comb_NLS_ensemble come from fits that are not run in this notebook):

```{r}
Anno_pep2_LM <- inner_join(Abby_anno, Pep2_LM_Comb)
Anno_pep2_NLSensbl <- inner_join(Abby_anno, Pep2comb_NLS_ensemble)
```

Save tables with the annotations of interest:

```{r}
setwd("/data/users_data/collaborations/JC_SK_AB/AB 3D myotube SILAC for Ju 2/model fits")
write_csv(Anno_pep2_LM, "Linear Model peptide combined by protein Abby's annotations ERROR NOT FINAL.csv")
write_csv(Prot2_NLS_ensemble, "NLS ensemble peptide combined by protein Abby's annotations ERROR NOT FINAL.csv")
```

```{r}
Anno_pep2_NLSensbl %>%
  ggplot(aes(x = CC, y = ThalfEst)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.5) +
  ylim(0, 20) +
  theme_classic()
```

ANOVA is not significant:

```{r}
summary(aov(formula = ThalfEst ~ CC, data = Anno_pep2_NLSensbl))
```

```{r}
Anno_pep2_LM %>%
  ggplot(aes(x = CC, y = ThalfEst)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.5) +
  ylim(0, 12) +
  theme_classic()
```

ANOVA is not significant:

```{r}
summary(aov(formula = ThalfEst ~ CC, data = Anno_pep2_LM))
```

Below are examples of how to subset the data for a given UniprotID or peptide sequence, how to fit single LM or NLS models, how to quickly calculate model error and protein Thalf for single proteins, etc.
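A note for reading the NLS examples: `stats::SSasymp(input, Asym, R0, lrc)` parameterizes the asymptotic regression model as

$$
\mathrm{ratio}(t) = \mathrm{ratio}_f + (\mathrm{ratio}_0 - \mathrm{ratio}_f)\, e^{-\exp(\texttt{log\_alpha})\, t},
$$

so `log_alpha` is the natural log of the rate constant, and under that parameterization the half-life of the decay toward the asymptote is $\log(2)/\exp(\texttt{log\_alpha})$. The quick Thalf checks in the chunks below instead compute `log(2)/-log_alpha`, so treat those numbers as rough sanity checks rather than definitive half-lives.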
```{r}
testNLS <- nls(ratio ~ SSasymp(time, ratio_f, ratio_0, log_alpha),
               data = filter(All_data2_avg, UniprotID == "A0A075DCB1"))
qpcR::RMSE(testNLS)
# Thalf
log(2)/-tidy(testNLS)[3,2]
```

```{r}
testLM <- lm(log(ratio) ~ time, filter(All_data2_avg, UniprotID == "A0A075DCB1"))
exp(qpcR::RMSE(testLM))
# Thalf
log(2)/-tidy(testLM)[2,2]
```

```{r}
All_data2_avg %>% filter(UniprotID == "A0A075DCB1")
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "QO1320") %>%
  ggplot(aes(x = time, y = log(ratio), color = SEQUENCE)) +
  geom_point() +
  geom_line()
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "A0A075DCB1") %>%
  ggplot(aes(x = time, y = log(ratio))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "A0A075DCB1") %>%
  ggplot(aes(x = time, y = ratio)) +
  geom_point() +
  geom_smooth(method = "nls", formula = y ~ SSasymp(x, ratio_f, ratio_0, log_alpha), se = FALSE)
```

```{r}
testNLSpep <- nls(ratio ~ SSasymp(time, ratio_f, ratio_0, log_alpha),
                  data = filter(All_data2_avg, UniprotID == "A0A075DCB1" & SEQUENCE == "K.TSLTLLDLIWLEK.I"))
qpcR::RMSE(testNLSpep)
exp(qpcR::RMSE(testLM))
# Thalf
log(2)/-tidy(testNLSpep)[3,2]
```

```{r}
testLMpep <- lm(log(ratio) ~ time, filter(All_data2_avg, UniprotID == "A0A075DCB1" & SEQUENCE == "K.TSLTLLDLIWLEK.I"))
exp(qpcR::RMSE(testLMpep))
exp(qpcR::RMSE(testLM))
# Thalf
log(2)/-tidy(testLMpep)[2,2]
```

```{r}
All_data2_avg %>% filter(UniprotID == "A0A075DCB1" & SEQUENCE == "K.TSLTLLDLIWLEK.I")
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "A0A075DCB1" & SEQUENCE == "K.TSLTLLDLIWLEK.I") %>%
  ggplot(aes(x = time, y = log(ratio), color = SEQUENCE)) +
  geom_point() +
  geom_line()
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "A0A075DCB1" & SEQUENCE == "K.TSLTLLDLIWLEK.I") %>%
  ggplot(aes(x = time, y = log(ratio))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "A0A075DCB1" & SEQUENCE == "K.TSLTLLDLIWLEK.I") %>%
  ggplot(aes(x = time, y = ratio)) +
  geom_point() +
  geom_smooth(method = "nls", formula = y ~ SSasymp(x, ratio_f, ratio_0, log_alpha), se = FALSE)
```

Q922A3

```{r}
testNLS <- nls(ratio ~ SSasymp(time, ratio_f, ratio_0, log_alpha),
               data = filter(All_data2_avg, UniprotID == "Q922A3"))
qpcR::RMSE(testNLS)
# Thalf
log(2)/-tidy(testNLS)[3,2]
```

```{r}
testLM <- lm(log(ratio) ~ time, filter(All_data2_avg, UniprotID == "Q922A3"))
exp(qpcR::RMSE(testLM))
# Thalf
log(2)/-tidy(testLM)[2,2]
```

```{r}
All_data2_avg %>% filter(UniprotID == "Q922A3")
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "P48678") %>%
  ggplot(aes(x = time, y = ratio, color = SEQUENCE)) +
  geom_point() +
  geom_line() +
  theme(legend.position = "none")
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "P48678") %>%
  ggplot(aes(x = time, y = log(ratio))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
```

```{r}
All_data2_avg %>%
  filter(UniprotID == "P48678") %>%
  ggplot(aes(x = time, y = ratio)) +
  geom_point() +
  geom_smooth(method = "nls", formula = y ~ SSasymp(x, ratio_f, ratio_0, log_alpha), se = FALSE)
```

Histone H4

```{r}
testNLS <- nls(ratio ~ SSasymp(time, ratio_f, ratio_0, log_alpha),
               data = filter(All_data2_avg, UniprotID == "Q922A3"),
               control = nls.control(maxiter = 100, minFactor = 1/2048))
qpcR::RMSE(testNLS)
# Thalf
log(2)/-tidy(testNLS)[3,2]
```

```{r}
testLM <- lm(log(ratio) ~ time, filter(All_data2_avg, UniprotID == "Q922A3"))
exp(qpcR::RMSE(testLM))
# Thalf
log(2)/-tidy(testLM)[2,2]
```
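If you'd rather pull the half-life directly from the SSasymp rate constant (see the parameterization note above), a minimal helper sketch is below. `thalf_from_ssasymp` is a hypothetical name, not something used elsewhere in this notebook.

```{r eval=FALSE}
# Hypothetical helper: half-life from a fitted SSasymp model, using
# t_half = log(2) / exp(log_alpha), where log_alpha is the log rate constant.
thalf_from_ssasymp <- function(fit) {
  log(2) / exp(coef(fit)[["log_alpha"]])
}

# Example usage with one of the fits above:
# thalf_from_ssasymp(testNLS)
```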