################################################################################################## ################################ R CODE RELATED TO GUENSER ET AL. ################################ ### "WHEN LESS IS MORE AND MORE IS LESS: THE IMPACT OF SAMPLING EFFORT ON SPECIES DELINEATION" ### ################################################################################################## # MFA #### # setwd("[INSERT YOUR WORKING DIRECTORY]") library(FactoMineR) dataRep <- read.table(file = "data_for_MFA.csv",h=T,sep=";",dec=".") afmRep <- MFA(dataRep[,2:142],group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=13,# here ncp is chosen in the light of BSM results name.group=c("ratio","angle","cusp","latbend","abview"), ind.sup = c(160:184)) # Broken-stick model (BSM) #### #First: do a MFA with ncp = 100 to have as many eigenvalues as possible #Then, do BSM doPcaSignif <- function(eigvals) { acron <- character() criterion <- character() nsigncomp <- numeric() n <- length(eigvals) pcvars <- 100 * eigvals / sum(eigvals) # Broken stick model (MacArthur 1957). bsm <- data.frame(j = seq(1:n), p = 0) bsm$p[1] <- 1 / n for(i in 2:n) bsm$p[i] <- bsm$p[i-1] + (1/(n + 1 - i)) bsm$p <- 100 * bsm$p / n bsvars <- bsm$p[n:1] acron <- c(acron, "BSM") criterion <- c(criterion, "broken stick model") nsigncomp <- c(nsigncomp, length(which(pcvars >= bsvars))) dp <- data.frame(acron, criterion, nsigncomp) } pcasign <- doPcaSignif(afmRep$eig[,1]) print(pcasign) # plot MFA #### library(dplyr) library(ggplot2) # Focusing on taxonomy #### coord_afm <- data.frame(dataRep$Taxonomy[1:159],afmRep$ind$coord[,1],afmRep$ind$coord[,2], afmRep$ind$coord[,3],afmRep$ind$coord[,4]) names(coord_afm) <- c("Taxonomy","Dim1","Dim2","Dim3","Dim4") coord_afm$Taxonomy <- factor(coord_afm$Taxonomy, levels = c("cristagalli","posterolongatus","waageni", "pakistanensis", "aff_cristagalli", "aff_posterolongatus","aff_waageni", "aff_pakistanensis","indet")) AFM <- ggplot(data=coord_afm,aes(x=coord_afm$Dim1, y=coord_afm$Dim2, colour=coord_afm$Taxonomy))+ scale_colour_manual(values=c("blue","deeppink4","mediumspringgreen","goldenrod", "blue","deeppink4","mediumspringgreen","goldenrod", "grey60"), labels=c("Ns. cristagalli","Ns. posterolongatus","Ns. waageni","Nv. pakistanensis", "Ns. aff. cristagalli","Ns. aff. posterolongatus","Ns. aff. waageni", "Nv. aff. pakistanensis","Undetermined specimen"))+ geom_point(aes(shape=coord_afm$Taxonomy),size=3)+ scale_shape_manual(values=c(17,17,17,17,16,16,16,16,16), labels=c("Ns. cristagalli","Ns. posterolongatus","Ns. waageni","Nv. pakistanensis", "Ns. aff. cristagalli","Ns. aff. posterolongatus","Ns. aff. waageni", "Nv. aff. pakistanensis","Undetermined specimen"))+ theme(panel.background = element_rect(fill = "transparent", colour = "transparent", size = NULL, linetype = "solid", color = "transparent", inherit.blank = FALSE))+ theme(panel.grid.major = element_line(size = 0.25, linetype = "solid", colour = "grey90"))+ theme(panel.grid.minor = element_line(color="transparent"))+ theme(strip.background = element_rect(fill = "red", colour = "red", size = NULL, linetype = "solid", color = NULL, inherit.blank = FALSE))+ theme(legend.title = element_text(face = "bold",size=15))+ theme(legend.text = element_text(face = "italic",size=15))+ theme(legend.key = element_blank())+ theme(axis.text.x = element_text(size=15), axis.ticks = element_line(size=1), axis.text.y = element_text(size=15), axis.title.x = element_text(size=15), axis.title.y = element_text(size=15))+ labs(colour="Legend",shape="Legend",x="Dim1 = XXX %", y="Dim2 = XXX %") AFM ggsave(file="MFA_1-2.svg", plot=AFM, width=10, height=10) # Calculation of error measurement #### # Euclidean distances between each individual on the 13 axes of MFA Dist <- data.frame() for(i in 1:159){ for (j in 1:159){ D=0 for (k in 1:13){ d=(afm_total[i,k]-afm_total[j,k])^2 D=d+D } Dist[i,j]=sqrt(D) } } # The minimum distance for each individual Distmin <- matrix(nrow=1,ncol=159) for (i in 1:159){ Distmin[1,i] <- sort(Dist[,i],decreasing=T)[159-1] } # Euclidean distance between replicas and the average #One data set per replica rep17 <- rbind(afm_total[17,], afm_total[160:164,]) rep21 <- rbind(afm_total[21,], afm_total[165:169,]) rep78 <- rbind(afm_total[78,], afm_total[170:174,]) rep89 <- rbind(afm_total[89,], afm_total[175:179,]) rep98 <- rbind(afm_total[98,], afm_total[180:184,]) #function for average distance between average rep and replicas mean_rep <- function(data){ DistRep <- data.frame() for (j in 2:6) { D=0 for (k in 1:13){ d=(data[1,k]-data[j,k])^2 D=d+D } DistRep [j,1]<-sqrt(D) } Mean <- mean(DistRep[2:6,]) } #Calculate this average distance for each replica mean17 <- mean_rep(rep17) mean21 <- mean_rep(rep21) mean78 <- mean_rep(rep78) mean89 <- mean_rep(rep89) mean98 <- mean_rep(rep98) #Distribution of minimal pair-wise distances between all individuals hist(Distmin) quantile(Distmin,0.01) #The replicas average distances are among the 1% of minimal distances: GOOD! # Clustering #### library(FactoMineR) hcpc <- HCPC(afmRep, min=2, consol=F) # Sub-sampling #### ## Random sub-sampling before mfa clust.iter <- function(df, percent, iter, group, type, ncp) { nclust <- rep(NA, iter) for (i in 1:iter) { tryCatch({ #Allows to skip errors s <- sample(x=1:dim(df)[1], size=dim(df)[1]*percent, replace=F) sdf <- df[s,] mfa <- MFA(base=sdf, group=group, type=type, ncp=ncp, graph=F) hclust <- HCPC(res=mfa, nb.clust=-1, min=2, graph=F) nclust[i] <- hclust$call$t$nb.clust }, error=function(e){}) } nclust } clust10 <- clust.iter(df=dataRep[1:159,4:144], percent=0.1, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust20 <- clust.iter(df=dataRep[1:159,4:144], percent=0.2, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust30 <- clust.iter(df=dataRep[1:159,4:144], percent=0.3, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust40 <- clust.iter(df=dataRep[1:159,4:144], percent=0.4, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust50 <- clust.iter(df=dataRep[1:159,4:144], percent=0.5, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust60 <- clust.iter(df=dataRep[1:159,4:144], percent=0.6, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust70 <- clust.iter(df=dataRep[1:159,4:144], percent=0.7, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust80 <- clust.iter(df=dataRep[1:159,4:144], percent=0.8, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) clust90 <- clust.iter(df=dataRep[1:159,4:144], percent=0.9, iter=1000, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) ## Systematist-like sub-sampling: starting from identified specimens clust.iter6 <- function(df, nb, sp, group, type, ncp) { #sp=vector of species abbreviations, nb=number of 'closest' individuals to select mfa <- MFA(base=df, group=group, type=type, ncp=ncp, graph=F) a<-mfa$global.pca$ind$coord starters <- matrix(NA, 4, 10) starters[1,] <- apply(matrix(a[which(sp=="cristagalli"),], nrow=length(which(sp=="cristagalli"))), 2, mean) starters[2,] <- apply(matrix(a[which(sp=="pakistanensis"),], nrow=length(which(sp=="pakistanensis"))), 2, mean) starters[3,] <- apply(matrix(a[which(sp=="posterolongatus"),], nrow=length(which(sp=="posterolongatus"))), 2, mean) starters[4,] <- apply(matrix(a[which(sp=="waageni"),], nrow=length(which(sp=="waageni"))), 2, mean) dis <- dist(rbind(starters, as.matrix(a)), diag=T, upper=T) mat <- as.matrix(dis) nclust <- rep(NA, nb) percent <- rep(NA, nb) for (i in 2:nb) { closest <- apply(mat[-c(1:4),1:4],2, order)[1:i,] sdf <- data.frame(a[unique(c(closest)),]) percent[i] <- length(unique(c(closest)))/159*100 hclust <- HCPC(res=sdf, nb.clust=-1, min=2, graph=F) nclust[i] <- hclust$call$t$nb.clust } list(nclust=nclust, percent=percent) } dataRep2 <- read.csv2(file="for_mix_rep_moy_sansWL.csv", dec=".", sep=";") clust690 <- clust.iter6(df=dataRep2[1:159,4:144], nb=155, sp=dataRep2$Taxonomy, group=c(1,2,3,60,75), type=c("s","s","n","c","c"), ncp=10) ## Combining the results ls.clust <- list(clust10,clust20, clust30, clust40, clust50, clust60, clust70, clust80, clust90) mean.clust <- lapply(ls.clust, mean, na.rm=T) sd.clust <- lapply(ls.clust, sd, na.rm=T) high <- unlist(mean.clust)+unlist(sd.clust) low <- unlist(mean.clust)-unlist(sd.clust) svg(filename = paste("clusters.svg"), width = 8, height = 10, pointsize = 12) par(mar=c(4,4,2,2)) plot(seq(10, 90, by=10), mean.clust, type="l", xlim=c(12,88), ylim=c(1,7), main="HCPC resample early", las=1, xlab="Percentage of specimens included", ylab="N° clusters found", mgp=c(2,1,0)) polygon(c(seq(10, 90, by=10),seq(90, 10, by=-10)), c(high,low[9:1]), col="grey", border=NA) lines(seq(10, 90, by=10), mean.clust, lwd=3) lines(x=clust690$percent, y=clust690$nclust, col="red") dev.off()