############################################################################# ## Lin & Denis 2019 - Acknowledging differences: number, characteristics, ## ## and distribution of marine benthic communities along Taiwan coast ## ## Ecosphere 10(7):e02803, doi:10.1002/ecs2.2803 ## ############################################################################# # packages library(vegan) library(ggplot2) library(plyr) library(tidyverse) library(readxl) library(gclus) library(mvabund) library(ade4) library(MASS) library(cluster) library(tidyr) library(stats) library(factoextra) library(labdsv) library(Rtsne) rm(list=ls()) setwd('') ### data imporation data.fct <- read_excel("Lin&Denis_dataset.xlsx", col_names = TRUE) # read the data matrix data.fct. <- data.fct[,c(-1,-2)] # remove the OTU & category columns data.fct.1 <- aggregate(.~fct, as.data.frame(data.fct.), FUN=sum) # aggregate to functional level row.names(data.fct.1) <- data.fct.1[ ,1] # assign row.names to transects data.fct.1[ ,1] <- NULL # empty first row data.fct.t <-as.data.frame(t(data.fct.1)) # transprose the matrix hell.trans <- as.data.frame(decostand(data.fct.t, "hell")) # Hellinger transformation ### k-means clustering hell.cas <- cascadeKM(hell.trans, inf.gr=2, sup.gr=15, iter=1000, criterion = 'ssi') # looking at the best number of partitions from 2~15 using ssi criterion plot(hell.cas, sortg =T) # plot the cascade result: 7 are the optimum number of clusters set.seed(3) kmeans7 <- kmeans(hell.trans, centers= substr(names(hell.cas$result['ssi',])[which.max(hell.cas$result['ssi',])],1,1), nstart=1000, iter.max=1000) # k-means 7 clusters mrpp(hell.trans, kmeans7$cluster, permutations = 9999)# permutation test kmeans7bc <- mapvalues(kmeans7$cluster, c("1", "2","3","4","5","6","7"), c("BC1", "BC5", "BC3", "BC7", "BC4", "BC2", "BC6")) # assign each cluster to a BC col <- c( "#00CCFF", # light blue "#0066FF", # deep blue "#66FF33", # light green "#339900", # dark green "#FF0000", # light red "#990000", # dark red "#999999") # gray set.seed(3) tsne <- Rtsne(as.matrix(hell.trans), check_duplicates=FALSE, pca=TRUE, perplexity=15, theta=0.5, dims=2) # t-SNE analysis tsne_2d <- as.data.frame(tsne$Y) # two dimension matrix tsne_2d$cl_kmeans <- factor(kmeans7bc) # combine with k-means cluster matrix colnames(tsne_2d)<- c("D1","D2","cl_kmeans") # add column name of the matrix ggplot(tsne_2d, aes_string(x="D1", y="D2", color=tsne_2d$cl_kmeans)) + # plot tSNE geom_point(size=3)+ guides(colour=guide_legend(override.aes=list(size=4))) + scale_colour_manual(values = col)+ theme_light(base_size=10) + theme_bw() + theme(axis.title.x = element_text(size=12), axis.title.y = element_text(size=12), axis.text.x = element_text(size=8), axis.text.y = element_text(size=8), legend.text = element_text(size = 15), legend.direction = "horizontal", legend.position = "bottom", legend.box = "horizontal") dev.off() ### distribution of BCs spa <- as.data.frame(kmeans7bc) # data preparation spa <- rownames_to_column(spa) # split the row name into several columns spa <- as.data.frame(separate(spa, rowname, c('Region','Site', 'Depth', 'Tran'))) # assign the information of transects to row name spa$Site <- factor(as.factor(spa$Site), levels = c('KI','BC','DBS','GW','SL','HCK','XW','LK','MBT')) # assign sites spa$Depth <- factor(as.factor(spa$Depth), levels = c('40','10')) # assign depths colnames(spa) <- c('Region','Site', 'Depth', 'Tran','groups') # assign column names to the matrix ggplot(spa, aes(x = 1, y= Depth)) + # plot distribution of BCs geom_dotplot(aes(fill= factor(groups)), binaxis="y", stackdir="center", method = 'histodot', stackgroups=T, binwidth=.25, dotsize=.4, binpositions="all") + theme_minimal() + theme(plot.title = element_text(size=20), axis.title.x = element_text(size=20), axis.title.y = element_text(size=20), legend.text = element_text(size = 20), legend.position = 'bottom', legend.box.background = element_rect())+ scale_fill_manual(values = col, name = 'Benthic community') + theme(text = element_text(size=20))+ theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) + scale_x_continuous(breaks=NULL) + facet_grid(.~Site) + theme(strip.text.x = element_text(size = 20), axis.text = element_text(size=20))+ labs(y = 'Depth (m)') dev.off() ### Dufrene and Legendre indicator species analysis for each BC siv <- indval(hell.trans, kmeans7bc) # look for the indicator of each BC (DL species indicator analysis) gr <- siv$maxcls[siv$pval<=0.05] # find the class each species has maximum indicator value iv <- siv$indcls[siv$pval<=0.05] # find the indicator value for each species to its maximum class pv <- siv$pval[siv$pval<=0.05] # find the probability of obtaining indicator values as high as observed over the specified number of iterations fr <- apply(hell.trans>0, 2, sum)[siv$pval<=0.05] # calculate the frequency of the species fidg <- data.frame(group=gr, indval=iv, pvalue=pv, freq=fr) # table containing significant indicator (p<0.05) and associated values (fidg <- fidg[order(fidg$group, -fidg$indval),]) # order table by groups write.csv(fidg, 'Indval_das.csv') ### percentage composition of each BC by 51 functional groups ### kmeans7bcper <- as.numeric(mapvalues(kmeans7bc, c("BC1", "BC2", "BC3", "BC4", "BC5", "BC6", "BC7"), c("1", "2","3","4","5","6","7"))) # assign the name to each BC a <- cbind(kmeans7bcper, data.fct.t) # combine the result of the cluster with the original matrix per.BC <- NULL# a loop calculating the percentage composition (using 51 functional groups) of each BC for (i in 1:max(a[,1])){ clu_g <- a[a[,1]==i,-1] b <-colSums(clu_g) c <- 100*(clu_g/sum(b)) d<- (round(b/sum(b)*100,digits=1)) e<- round(apply(c,2,sd),digits = 1) per.BC <- cbind(per.BC,cbind(d,e)) } colnames(per.BC)<- paste0("G",rep(1:7,each=2),"_",c(rep(c("mean","SD"),7))) # assign the column name to the matrix of BC composition write.csv(per.BC, 'per.BC.csv') ### percentage composition of each BC by major taxonomic groups data.fct.otu <- data.fct[,c(-2,-3)] # remove the otu & fct column from the original matrix data.maj <- aggregate(.~category, as.data.frame(data.fct.otu), FUN=sum) # aggregate the column by major taxonomic groups row.names(data.maj) <- data.maj[,1] # assign row names data.maj.t <- as.data.frame(t(data.maj[,-1])) # transpose the matrix f <- cbind(kmeans7bcper, data.maj.t) # combine the result of the cluster with the matrix per.maj.BC <- NULL # a loop calculating the percentage composition (using major taxonomic groups) of each BC for (i in 1:max(f[,1])){ clu_g <- f[f[,1]==i,-1] g <-colSums(clu_g) h <- 100*(clu_g/sum(g)) i<- (round(g/sum(g)*100,digits=1)) j<- round(apply(h,2,sd),digits = 1) per.maj.BC<-cbind(per.maj.BC,cbind(i,j)) } colnames(per.maj.BC)<- paste0("G",rep(1:7,each=2),"_",c(rep(c("mean","SD"),7))) # assign column names to the matrix of BC composition write.csv(per.maj.BC,'per.maj.BC.csv') ### test regional and depth factors region <- table(spa$groups,spa$Region) # built a matix containg the BC as row and region as column chisq.test(region) # region is a significant factor to distinguish BC, and x2=178 depth <- table(spa$groups,spa$Depth) # built a matrix containg the BC as row and depth as column chisq.test(depth) # depth is a significant factor to distinguish BC, and x2=76.691