R-script as supplementary information to the publication

Proteomic fingerprinting enables quantitative biodiversity assessments of species and ontogenetic stages in Calanus congeners (Copepoda, Crustacea) from the Arctic Ocean

published in

Molecular Ecology Resources

Sven Rossel[1] (0000-0002-1187-346X), Patricia Kaiser[2] (OrcID: 0000-0002-9517-3515), Maya Bode-Dalby[2] (OrcID: 0000-0001-8002-0746), Jasmin Renz[3], Silke Laakmann[4,5] (OrcID: 0000-0003-3273-7907), Holger Auel[2], Wilhelm Hagen[2], Pedro Martínez Arbizu[1] (OrcID: 0000-0002-0891-1154), Janna Peters[3] (OrcID: 0000-0002-0941-8257)

[1] German Centre for Marine Biodiversity Research (DZMB), Senckenberg Research Institute, 26382 Wilhelmshaven, Germany

[2] Universität Bremen, BreMarE - Bremen Marine Ecology, Marine Zoology, 28334 Bremen, Germany

[3] German Centre for Marine Biodiversity Research (DZMB), Senckenberg Research Institute, 20146 Hamburg, Germany

[4] Helmholtz Institute for Functional Marine Biodiversity at the University of Oldenburg (HIFMB), 26129 Oldenburg, Germany

[5] Alfred Wegener Institute, Helmholtz-Centre for Polar and Marine Research (AWI), 27570 Bremerhaven, Germany

Stage level

Load installed R-packages needed for spectra processing and analyses.

library("MALDIquant")
library("MALDIquantForeign")
library("splus2R")
library("plyr")
library("vegan")
library("pvclust")
library("Rtsne")
library("randomForest")
library("RFtools")
library("reshape2")
library("ggplot2")

Set parameters for plotting PDFs:

pdfHeight <- 14
pdfWidth <- 24

Import Raw Spectra

rawSpectra <- importBrukerFlex("D:\\Your\\path\\to\\the\\folder\\containing\\spectra",removeEmptySpectra=TRUE)

Import reference table including information on species, sampling and stage,

referenceTable <- read.csv("ReferenceTable.csv", sep=";", row.names = NULL)

Applying reference table to raw data to store it in the objects.

rawSpectra <- lapply(rawSpectra, function(s) {
id <- metaData(s)$sampleName
idx <- which(id == referenceTable$id)
if (length(idx) != 1) {
stop("Found multiple matches for id: ", id, " (",
paste0(idx, collapse=", "), ")") }
metaData(s)$id <- referenceTable$id[idx]
metaData(s)$species <- as.character(referenceTable$Species[idx])
metaData(s)$stage <- as.character(referenceTable$Stage[idx])
metaData(s)$depth <- as.character(referenceTable$Depth[idx])
metaData(s)$station <- as.character(referenceTable$Station[idx])
metaData(s)$maldi_species <- as.character(referenceTable$maldi_spec[idx])
metaData(s)$dilution <- as.character(referenceTable$dilution[idx])
metaData(s)$dilution2 <- as.character(referenceTable$dilution2[idx])
return(s)
})

Choose a species to investigate in more detail.

rawSpectra2 <- rawSpectra[which(sapply(rawSpectra, function(x)metaData(x)$maldi_species)=="C_finmarchicus")]

Processing of raw data as described in the publication.

trimmedSpectra <- trim(rawSpectra2, c(2000, 20000))
transformedSpectra <- transformIntensity(trimmedSpectra, method="sqrt")
smoothedSpectra <- smoothIntensity(transformedSpectra, method="SavitzkyGolay",halfWindowSize=10)
baselineCorrectedSpectra <- removeBaseline(smoothedSpectra, method="SNIP",iterations=10)
normalizedSpectra <- calibrateIntensity(baselineCorrectedSpectra, method="TIC")
SampleId <- sapply(normalizedSpectra, function(x)metaData(x)$sampleName)
averagedSpectra <- averageMassSpectra(normalizedSpectra, labels=SampleId,method="mean")

Peak-picking settings and binning of masses across the data set.

peaks <- detectPeaks(averagedSpectra, halfWindowSize=13, SNR=12)
tolerance<- 0.002
bin_method <- "strict" 
binned_peaks <- peaks
binning_progress <- vector()
number_peaks_progress <- 0
repeat { 
  binned_peaks <-  binPeaks(binned_peaks, tolerance=tolerance, method=bin_method)
  binned_peak_matrix <- as.matrix(intensityMatrix(binned_peaks))
  number_peaks <- numCols(binned_peak_matrix)
  if(number_peaks==number_peaks_progress) {
    break
  }
  number_peaks_progress <- number_peaks
  binning_progress  <- append(binning_progress,number_peaks)
}
binning_progress
## [1] 405 373 371

Create intensity matrix from binned-peaks object.

featureMatrix <- intensityMatrix(binned_peaks)

Extracting information from binned-peaks object.

binned_id<- sapply(peaks, function(x)metaData(x)$id)
binned_species <- sapply(peaks, function(x)metaData(x)$species)
binned_stage <- sapply(peaks, function(x)metaData(x)$stage)
binned_depth <- sapply(peaks, function(x)metaData(x)$depth)
binned_station <- sapply(peaks, function(x)metaData(x)$station)
binned_maldi <- sapply(peaks, function(x)metaData(x)$maldi_species)
binned_dilution <- sapply(peaks, function(x)metaData(x)$dilution)
binned_dilution2 <- sapply(peaks, function(x)metaData(x)$dilution2)

Setting a signal-to-noise ratio threshold during peak picking results in the loss of mass peaks which are not noise but are below the SNR. The following code retains peaks below the initial SNR which where found in other specimens as valid peaks above the SNR. However, a final SNR of 1.75 is kept.

meannoise <- sapply(averagedSpectra, function (singleSpectrum) {
  noise <- estimateNoise(singleSpectrum)
  returnValue <- apply(noise,2,mean)
  return(returnValue[2])
})

SNRcalc <- featureMatrix/as.vector(meannoise)
SNRraw <- SNRcalc^2
noise_df <- as.data.frame(meannoise)
noise_df$binnedId2 <- binned_id
means_Noise <- ddply(noise_df, c("binnedId2"), summarise, mean =mean(meannoise))
PM_num_7 <- apply(featureMatrix, 2, function(x) ifelse(x < 1.75*means_Noise$mean, 0, x))

Adding labels to feature matrix and replacing NAs by 0.

labels <- paste(binned_id,binned_species,binned_stage, sep=":")
rownames(PM_num_7) <- labels
featureMatrix2 <- PM_num_7
featureMatrix2[is.na(featureMatrix2)] <- 0
featureMatrix2 <- data.frame(featureMatrix2)

Applying hellinger transformation to data matrix. It was shown to improve classification success.

dat_hell = data.frame(decostand(featureMatrix2,method='hellinger'))
write.table(dat_hell, file="Hellinger_Matrix_C_finmarchicus.txt", sep=",", col.names = NA)

Carry out hierachical clustering analysis including bootstrapping.

pv2 <- pvclust(t(dat_hell), nboot=100,
      method.hclust = "ward.D",
      method.dist = "euclidean")
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.

Plot clustering analysis with initial morphological identification.

plot(pv2,cex=0.2, float=0.01,cex.pv=0.2)

Export clustering analysis as .pdf or .svg file.

pdf(file= "Clusteranalysis_stage_C_finmarchicus.pdf", 
    height=pdfHeight, width=pdfWidth)
par(mar=c(6, 3, 3, 2),cex=0.4)
plot(pv2,float=0.0025, main="Euclidean + ward")
dev.off()
## png 
##   2
svg(file="Clusteranalysis_stage_C_finmarchicus.svg", height=pdfHeight*2, width=pdfWidth*2, pointsize=10)
plot(pv2, main="Euclidean + ward")
dev.off()
## png 
##   2

Plot data in a t-SNE plot using stages.

tsneR_raw = Rtsne(dat_hell,perplexity=10,theta=0.0,dims=2, max_iter=4000,check_duplicates = FALSE)
pch2=c(rep(21:25,5)[as.factor(binned_stage)])
bg2 = sample(colors(),20)[as.factor(binned_stage)]
for_legend1 <- paste(pch2, bg2, sep=".")
for_legend2 <- unique(for_legend1)
for_legend3 <- as.numeric(gsub("\\..*","",for_legend2))
for_legend4 <- as.character(gsub("^.*\\.","", for_legend2))
plot(tsneR_raw$Y, pch=pch2, cex=3,bg=bg2, main="TSNE stage")
legend("topleft",legend= unique(binned_stage),pch=for_legend3, pt.bg = for_legend4, horiz=FALSE, cex=1.25, pt.cex=2)

png(file="Tsne_Raw_Data_C_finmarchicus.png",width = 1200, height = 1200)
plot(tsneR_raw$Y, pch=pch2, cex=3,bg=bg2, main="TSNE Raw")
legend("topleft",legend= unique(binned_stage),pch=for_legend3, pt.bg = for_legend4, horiz=FALSE, cex=1.25, pt.cex=2)
dev.off()
## png 
##   2

Creating a random Forest model on stage level for this species. The confusion matrix is showing the class error within the model.

dat_hell2 <- cbind.data.frame(binned_stage=as.factor(binned_stage), dat_hell)
rf <- randomForest(binned_stage ~.,data=dat_hell2,proximity=T,importance=T,ntree= 2000,mtry=35,sampsize=c(rep(min(table(dat_hell2$binned_stage)),length(unique(dat_hell2$binned_stage))))) 
rf$confusion
##     C1 C2 C3 C4 C5 C6f class.error
## C1   9  1  0  0  0   0         0.1
## C2   0 10  0  0  0   0         0.0
## C3   0  0  9  0  1   0         0.1
## C4   0  0  1  8  1   0         0.2
## C5   0  0  0  2  8   0         0.2
## C6f  0  0  0  0  0  11         0.0

Identifying the most important mass peaks for stage differentiation within a random Forest model using the Gini Index. Here, the six most important Peaks are depicted.

rf_imp <- data.frame(importance(rf))
rf_imp2 <- rf_imp[rev(order(rf_imp$MeanDecreaseGini)),]
head(rf_imp2)
##                           C1        C2       C3       C4       C5       C6f MeanDecreaseAccuracy
## X5913.50096138092 18.6957259 11.314499 7.204994 4.489121 5.806948  5.953288             18.22637
## X6250.31661474909 -0.7382738  3.273257 7.383095 8.336069 1.239739 17.040850             16.88741
## X4474.67749682034  9.0590105  9.430402 8.119846 9.682775 7.257162  5.092708             12.44569
## X13414.1899818229  6.4148912  1.059211 6.793936 8.766349 7.742226  8.996803             13.12414
## X4526.23686542591  4.6759263  5.072908 4.788294 8.886873 9.450490 12.122793             13.37637
## X4414.60756216033 -0.9215762  5.151634 7.467393 5.196687 9.608269  5.768777             11.44852
##                   MeanDecreaseGini
## X5913.50096138092         1.719486
## X6250.31661474909         1.396647
## X4474.67749682034         1.119286
## X13414.1899818229         1.118432
## X4526.23686542591         1.073928
## X4414.60756216033         1.061469

Choosing a number of important peaks (here 10) and save the column number

b <- c()
for(i in 1:10){
  a <- which(colnames(dat_hell)==rownames(rf_imp2)[i])
  b <- c(b,a)
  }

Extract the most important peaks and plot them in a heat map.

dat_hell_figure <- dat_hell[,c(b)]  #Extract peaks by column number
dat_hell_figure2 <- dat_hell_figure
label_new <- paste(binned_stage,binned_id, sep="_")
rownames(dat_hell_figure2) <- label_new
dat_hell_figure2$id <- label_new
grafik_melt <- melt(dat_hell_figure2, id.vars = c("id"))
grafik_melt2 <- grafik_melt
grafik_melt2[,2] <- gsub("X","",grafik_melt2[,2]) 
grafik_melt2[,2] <- as.numeric(as.character(grafik_melt2[,2]))
grafik_melt2[,2] <- round(grafik_melt2[,2], digits=0)
grafik_melt2[,2] <- as.factor(as.character(grafik_melt2[,2]))

p <- ggplot(grafik_melt2, aes(x=variable,y = id))+ 
  geom_tile(aes(fill = value),colour='white')+
  scale_fill_gradientn(colours=c("white","cyan3","blue4"))+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
p

ggsave(file="Important_stage_peaks_C_finmarchicus.svg", plot=p, width=15, height=8)
ggsave(file="Important_stage_peaks_C_finmarchicus.pdf", plot=p, width=15, height=8)
grafik_melt2$stage <- gsub("\\_.*","",grafik_melt2$id)
f <- ggplot(grafik_melt2, aes(x=variable, y=value,fill=stage)) + 
  geom_boxplot()+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
f

ggsave(file="Important_stage_peaks_C_finmarchicus_boxplots.pdf", plot=f, width=25, height=8)

Calculating Euclidean distances and creating a boxplot between developmental stages.

dist_stages <- vegdist(dat_hell, method="euclidean")
group_stages <- data.frame(as.matrix(dist_stages), binned_stage)
colnames(group_stages)[ncol(group_stages)] <- "stage"
dist_melt <- melt(group_stages, id.vars =c("stage"))

vsstage2 <- c()
for(i in 1:length(binned_stage)){
  vsstage <- as.character(rep(binned_stage[i],length(binned_stage)))
  vsstage2 <- c(vsstage2,vsstage)
}
dist_melt$vs <- vsstage2
dist_C1_c2 <- c(subset(dist_melt, dist_melt$stage=="C1" & dist_melt$vs =="C2"),subset(dist_melt, dist_melt$stage=="C2" & dist_melt$vs =="C1"))
dist_C1_c3 <- c(subset(dist_melt, dist_melt$stage=="C1" & dist_melt$vs =="C3"),subset(dist_melt, dist_melt$stage=="C3" & dist_melt$vs =="C1"))
dist_C1_c4 <- c(subset(dist_melt, dist_melt$stage=="C1" & dist_melt$vs =="C4"),subset(dist_melt, dist_melt$stage=="C4" & dist_melt$vs =="C1"))
dist_C1_c5 <- c(subset(dist_melt, dist_melt$stage=="C1" & dist_melt$vs =="C5"),subset(dist_melt, dist_melt$stage=="C5" & dist_melt$vs =="C1"))
dist_C1_c6f <- c(subset(dist_melt, dist_melt$stage=="C1" & dist_melt$vs =="C6f"),subset(dist_melt, dist_melt$stage=="C6f" & dist_melt$vs =="C1"))
boxplot(dist_C1_c2$value,dist_C1_c3$value,dist_C1_c5$value,dist_C1_c5$value,dist_C1_c6f$value, names=c("C1 vs. C2","C1 vs. C3","C1 vs. C4","C1 vs. C5","C1 vs. C6f"),ylab = "Euclidean distances",xlab="Stage comparisons", col=c("darkorchid1","darkslateblue","deepskyblue","aquamarine3","chartreuse"))