# This is the R code for bootstrapping across sites # Load libraries library(stringr) library(plyr) library(apcluster) library(cluster) library(mclust) library(ggplot2) library(ggpubr) library(forcats) library(matlab) library(cowplot) library(dplyr) library(flextable) library(umap) # Read in data female.mfcc.values <- read.csv('ClinkandKlinck2020GibbonFemaleData.csv') call.dist <- as.data.frame(table(female.mfcc.values$female.id)) newsubset <- subset(call.dist,Freq > 6) length(unique(newsubset$Var1)) # Remove females female.mfcc.values <- droplevels(female.mfcc.values[female.mfcc.values$female.id %in% newsubset$Var1,] ) length(unique(female.mfcc.values$female.id)) nrow(female.mfcc.values) table(female.mfcc.values$site,female.mfcc.values$female.id) # Part 1. Bootstrapping across sites ---------------------------------------------- # Part 1a. Clustering using affinity propagation new.site.index <- unique(female.mfcc.values$site) clustering.df.affinity <- data.frame() for(a in 1:20){ for (b in 1:5) { #print(b) if(b != 6){ mfcc.data.subset <- droplevels(subset(female.mfcc.values, site == new.site.index[b])) site <- new.site.index[b] } else{ mfcc.data.subset <- female.mfcc.values site <- 'All' } for(c in 1:5){ #print(c) n.calls.site <- nrow(mfcc.data.subset) subset75 <- n.calls.site * 0.8 mfcc.data.subset.reduced <- mfcc.data.subset[round(runif(subset75,min=1,max=n.calls.site)),] n.females.site <- length(unique(mfcc.data.subset.reduced$female.id)) n.calls.inc <- nrow(mfcc.data.subset.reduced) if (length(unique(mfcc.data.subset.reduced$female.id)) > 1) { cluster.df <- apcluster::apcluster( negDistMat(r = 2), mfcc.data.subset.reduced[, -c(1:3, 181, 182)], maxits = 100000, convits = 10000, nonoise = T ) sil <- cluster::silhouette(x = cluster.df@idx, dist = dist(mfcc.data.subset.reduced[, -c(1:3, 181, 182)])) sil.coef <- summary(sil)$avg.width match.cluster.df <- cbind.data.frame(mfcc.data.subset.reduced$female.id[cluster.df@exemplars], cluster.df@exemplars) colnames(match.cluster.df) <- c('female', 'cluster.id') duplicated.clusters <- match.cluster.df[duplicated(match.cluster.df$female),] unique.duplicated <- unique(duplicated.clusters$female) if(length(unique.duplicated) >0){ updated.duplicated.df <- data.frame() for(g in 1:length(unique.duplicated)){ temp.duplicated <- droplevels(subset(match.cluster.df,female==unique.duplicated[g])) female.id <- unique.duplicated[g] cluster.list <- list() for(e in 1:nrow(temp.duplicated)){ cluster.index <- which(cluster.df@exemplars==temp.duplicated$cluster.id[e]) cluster.length <- length(cluster.df@clusters[[cluster.index]]) cluster.list[[e]] <- cluster.length } for(f in 1:nrow(temp.duplicated)){ if( f== which.max(unlist(cluster.list))) { temp.duplicated$female[f] <- female.id } else{ temp.duplicated$female[f] <- 'NA' } } updated.duplicated.df <- rbind.data.frame(updated.duplicated.df,temp.duplicated) } temp.matchdf <- match.cluster.df[! match.cluster.df$female %in% unique.duplicated, ] match.cluster.df <- rbind.data.frame(temp.matchdf,updated.duplicated.df) match.cluster.df$female <- fct_explicit_na(match.cluster.df$female, na_level = "None") } mfcc.data.subset.reduced$affinity.cluster <- cluster.df@idx female.mfcc.affinity.added <- data.frame() for (a in 1:length(cluster.df@exemplars)) { female.match <- subset(match.cluster.df, cluster.id ==cluster.df@exemplars[[a]]) #print(female.match) data.frame.subset <- subset(mfcc.data.subset.reduced,affinity.cluster==female.match$cluster.id) data.frame.subset$affinity.id <- rep(female.match$female, nrow(data.frame.subset)) female.mfcc.affinity.added <- rbind.data.frame(female.mfcc.affinity.added, data.frame.subset) } female.index <- unique(female.mfcc.affinity.added$female.id) female.mfcc.affinity.added.adjusted <- data.frame() for (d in 1:length(female.index)) { temp.female.cluster.df <- droplevels(subset(female.mfcc.affinity.added, female.id == female.index[d])) cluster.female.table <- table(temp.female.cluster.df$affinity.id) cluster.most <- attr(cluster.female.table, "dimnames")[[1]] [(which.max(cluster.female.table))] for (i in 1:nrow(temp.female.cluster.df)) { if (temp.female.cluster.df$female.id[i] != cluster.most) { temp.female.cluster.df$affinity.id[i] = "NA" } } temp.female.cluster.df$affinity.id <- fct_explicit_na(temp.female.cluster.df$affinity.id, na_level = "None") female.mfcc.affinity.added.adjusted <- rbind.data.frame(female.mfcc.affinity.added.adjusted, temp.female.cluster.df) } list.vals <- list() for (i in 1:nrow(female.mfcc.affinity.added.adjusted)) { matching.vals <- ifelse( female.mfcc.affinity.added.adjusted$female.id[i] == female.mfcc.affinity.added.adjusted$affinity.id[i], 1, 0 ) list.vals[[i]] <- matching.vals } list.vals <- lapply(list.vals, function(x) replace(x, is.na(x %% 1 == 0), 0)) rand.index <- mclust::adjustedRandIndex( female.mfcc.affinity.added.adjusted$female.id, female.mfcc.affinity.added.adjusted$affinity.id ) nmi.val <- aricode::NMI( female.mfcc.affinity.added.adjusted$female.id, female.mfcc.affinity.added.adjusted$affinity.id ) n.clusters <- length(unique(cluster.df@exemplars)) percent <- sum(unlist(list.vals)) / nrow(female.mfcc.affinity.added.adjusted) percent.correct.affinity <- cbind.data.frame(site, percent, n.females.site,n.calls.inc, n.clusters, rand.index, nmi.val, sil.coef) print(percent.correct.affinity) clustering.df.affinity <- rbind.data.frame(clustering.df.affinity, percent.correct.affinity) # # write.csv(clustering.df.affinity, # 'clustering.df.affinityactual.site.80percent.October2020final.all.csv') } } } } str(clustering.df.affinity) ggboxplot(data=clustering.df.affinity,x='site',y='percent') # K-mediods # Part 1b. Clustering using k-medoids new.site.index <- unique(female.mfcc.values$site) percent.correct.pam <- data.frame() clustering.df.pam <- data.frame() for(a in 1:20){ for (b in 1:5) { #print(b) if(b != 6){ mfcc.data.subset <- droplevels(subset(female.mfcc.values, site == new.site.index[b])) site <- new.site.index[b] } else{ mfcc.data.subset <- female.mfcc.values site <- 'All' } for(c in 1:5){ #print(c) n.calls.site <- nrow(mfcc.data.subset) subset75 <- n.calls.site * 0.8 mfcc.data.subset.reduced <- droplevels( mfcc.data.subset[round(runif(subset75,min=1,max=n.calls.site)),]) n.females.site <- length(unique(mfcc.data.subset.reduced$female.id)) n.calls.inc <- nrow(mfcc.data.subset.reduced) if (length(unique(mfcc.data.subset.reduced$female.id)) > 2) { sil.list <- list() for (z in 2:53) { if (z < nrow(mfcc.data.subset.reduced)) { # print(z) pr4 <- pam(mfcc.data.subset.reduced[, -c(1:3, 181, 182)], z) sil.sum <- summary(silhouette(pr4)) sil.list[[z]] <- (sil.sum$avg.width) } } kmedoids.top.model <- pam(mfcc.data.subset[, -c(1:3, 181, 182)], k = (which.max(unlist(sil.list)) + 1)) mfcc.data.subset$pamclust <- kmedoids.top.model$clustering female.id.kmedoids.df <- data.frame() cluster.index <- length(unique(mfcc.data.subset$pamclust)) for (d in 1:cluster.index) { temp.kmedoids.df <- droplevels(subset(mfcc.data.subset, pamclust == d)) cluster.number.table <- table(temp.kmedoids.df$female.id, temp.kmedoids.df$pamclust) female.most.likely <- attr(cluster.number.table, "dimnames")[[1]] [(which.max(cluster.number.table))] if (female.most.likely %in% female.id.kmedoids.df$female.id.added ) { female.most.likely <- 'NA' } female.id.added <- rep(female.most.likely, nrow(temp.kmedoids.df)) new.cluster.table <- cbind.data.frame(temp.kmedoids.df, female.id.added) female.id.kmedoids.df <- rbind.data.frame(female.id.kmedoids.df, new.cluster.table) } female.index <- unique(female.id.kmedoids.df$female.id) female.id.kmedoids.df.adjusted <- data.frame() for (d in 1:length(female.index)) { temp.female.kmedoids.df <- droplevels(subset(female.id.kmedoids.df, female.id == female.index[d])) cluster.female.table <- table(temp.female.kmedoids.df$pamclust) cluster.most <- attr(cluster.female.table, "dimnames")[[1]] [(which.max(cluster.female.table))] for (i in 1:nrow(temp.female.kmedoids.df)) { if (temp.female.kmedoids.df$pamclust[i] != cluster.most) { temp.female.kmedoids.df$female.id.added[i] = "NA" } } female.id.kmedoids.df.adjusted <- rbind.data.frame(female.id.kmedoids.df.adjusted, temp.female.kmedoids.df) } list.vals <- list() for (i in 1:nrow(female.id.kmedoids.df.adjusted)) { matching.vals <- ifelse( female.id.kmedoids.df.adjusted$female.id[i] == female.id.kmedoids.df.adjusted$female.id.added[i], 1, 0 ) list.vals[[i]] <- matching.vals } list.vals <- lapply(list.vals, function(x) replace(x, is.na(x %% 1 == 0), 0)) female.id.kmedoids.df.adjusted$female.id.added <- fct_explicit_na(female.id.kmedoids.df.adjusted$female.id.added, na_level = "None") rand.index <- mclust::adjustedRandIndex( female.id.kmedoids.df.adjusted$female.id, female.id.kmedoids.df.adjusted$female.id.added ) nmi.val <- aricode::NMI( female.id.kmedoids.df.adjusted$female.id, female.id.kmedoids.df.adjusted$female.id.added ) percent <- sum(unlist(list.vals)) / nrow(female.id.kmedoids.df.adjusted) sil.coef <- max(unlist(sil.list)) n.clusters <- length(unique( kmedoids.top.model$id.med)) n.calls <- nrow(mfcc.data.subset) percent.correct.pam <- cbind.data.frame( site, percent, n.females.site,n.calls.inc, n.clusters, rand.index, nmi.val, sil.coef ) print(percent.correct.pam) clustering.df.pam <- rbind.data.frame(clustering.df.pam, percent.correct.pam) # write.csv(clustering.df.pam, 'clustering.df.pamactual.site.80percent.October2020.final.csv') } } } } # Part 1c. Clustering using Gaussian mixture models new.site.index <- unique(female.mfcc.values$site) percent.correct.mclust <- data.frame() clustering.df.mclust <- data.frame() for(a in 1:20){ for (b in 1:5) { #print(b) if(b != 6){ mfcc.data.subset <- droplevels(subset(female.mfcc.values, site == new.site.index[b])) site <- new.site.index[b] } else{ mfcc.data.subset <- female.mfcc.values site <- 'All' } for(c in 1:5){ #print(c) n.calls.site <- nrow(mfcc.data.subset) subset75 <- n.calls.site * 0.8 mfcc.data.subset.reduced <- droplevels(mfcc.data.subset[round(runif(subset75,min=1,max=n.calls.site)),]) n.females.site <- length(unique(mfcc.data.subset.reduced$female.id)) n.calls.inc <- nrow(mfcc.data.subset.reduced) if (length(unique(mfcc.data.subset.reduced$female.id)) > 2) { gaussian.top.model <- Mclust( mfcc.data.subset.reduced[, -c(1:3, 180, 181, 182)], G = 1:53, c("EII", "VII", "EEI", "EVI", "VEI", "VVI"), verbose = F ) sil <- cluster::silhouette(x = gaussian.top.model$classification, dist = dist(mfcc.data.subset.reduced[, -c(1:3, 180, 181, 182)])) mfcc.data.subset.reduced$mclustclust <- gaussian.top.model$classification female.id.mclust.df <- data.frame() cluster.index <- length(unique(mfcc.data.subset.reduced$mclustclust)) for (d in 1:cluster.index) { temp.mclust.df <- droplevels(subset(mfcc.data.subset.reduced, mclustclust == d)) cluster.number.table <- table(temp.mclust.df$female.id, temp.mclust.df$mclustclust) female.most.likely <- attr(cluster.number.table, "dimnames")[[1]] [(which.max(cluster.number.table))] if (female.most.likely %in% female.id.mclust.df$female.id.added) { female.most.likely <- 'NA' } female.id.added <- rep(female.most.likely, nrow(temp.mclust.df)) new.cluster.table <- cbind.data.frame(temp.mclust.df, female.id.added) female.id.mclust.df <- rbind.data.frame(female.id.mclust.df, new.cluster.table) } female.index <- unique(female.id.mclust.df$female.id) female.id.mclust.df.adjusted <- data.frame() for (d in 1:length(female.index)) { temp.female.mclust.df <- droplevels(subset(female.id.mclust.df, female.id == female.index[d])) cluster.female.table <- table(temp.female.mclust.df$mclustclust) cluster.most <- attr(cluster.female.table, "dimnames")[[1]] [(which.max(cluster.female.table))] for (i in 1:nrow(temp.female.mclust.df)) { if (temp.female.mclust.df$mclustclust[i] != cluster.most) { temp.female.mclust.df$female.id.added[i] <- 'NA' } } female.id.mclust.df.adjusted <- rbind.data.frame(female.id.mclust.df.adjusted, temp.female.mclust.df) } library(forcats) female.id.mclust.df.adjusted$female.id.added <- fct_explicit_na(female.id.mclust.df.adjusted$female.id.added, na_level = "None") list.vals <- list() for (i in 1:nrow(female.id.mclust.df.adjusted)) { matching.vals <- ifelse( female.id.mclust.df.adjusted$female.id[i] == female.id.mclust.df.adjusted$female.id.added[i], 1, 0 ) list.vals[[i]] <- matching.vals } list.vals <- lapply(list.vals, function(x) replace(x, is.na(x %% 1 == 0), 0)) percent <- sum(unlist(list.vals)) / nrow(female.id.mclust.df.adjusted) rand.index <- mclust::adjustedRandIndex( female.id.mclust.df.adjusted$female.id, female.id.mclust.df.adjusted$female.id.added ) nmi.val <- aricode::NMI( female.id.mclust.df.adjusted$female.id, female.id.mclust.df.adjusted$female.id.added ) if(is.na(sil) != 'TRUE'){ sil.coef <- (summary(sil)$avg.width) } else{ sil.coef <- 'NA' } n.clusters <- length(unique(mfcc.data.subset.reduced$mclustclust)) percent.correct.mclust <- cbind.data.frame(site, percent, n.females.site,n.calls.inc, n.clusters, rand.index, nmi.val, sil.coef) print(percent.correct.mclust) clustering.df.mclust <- rbind.data.frame(clustering.df.mclust, percent.correct.mclust) write.csv(clustering.df.mclust, 'clustering.df.mclustactual.site.80percent.Oct2020.final.csv') } } } } # Part 2. Bootstrapping by site plots ------------------------------------- # Read in the data clustering.df.affinity <- read.csv('clustering.df.affinityactual.site.80percent.October2020final.all.csv') colnames.tables <- colnames(clustering.df.affinity) percent.correct.mod.affinity <- cbind.data.frame(clustering.df.affinity, rep('affinity', nrow(clustering.df.affinity))) colnames(percent.correct.mod.affinity) <- c(colnames.tables, 'method') percent.correct.pam <- read.csv('clustering.df.pamactual.site.80percent.October2020.final.csv') percent.correct.mod.pam <- cbind.data.frame(percent.correct.pam, rep('pam', nrow(percent.correct.pam))) colnames(percent.correct.mod.pam) <- c(colnames.tables, 'method') percent.correct.mclust <- read.csv('clustering.df.mclustactual.site.80percent.Oct2020.final.csv') percent.correct.mod.mclust <- cbind.data.frame(percent.correct.mclust, rep('mclust', nrow(percent.correct.mclust))) colnames(percent.correct.mod.mclust) <- c(colnames.tables, 'method') combined.nmi <- rbind.data.frame( percent.correct.mod.affinity, percent.correct.mod.pam, percent.correct.mod.mclust ) combined.nmi$rand.index <- as.numeric(combined.nmi$rand.index) combined.nmi$nmi.val <- as.numeric(combined.nmi$nmi.val) combined.nmi$n.females.site <- as.numeric(combined.nmi$n.females.site) # Deviation plots for.deviation.plot.nobootstrap <- combined.nmi for.deviation.plot.nobootstrap$deviation <- as.numeric(for.deviation.plot.nobootstrap$n.clusters) - as.numeric(for.deviation.plot.nobootstrap$n.females.site) for.deviation.plot.nobootstrap$deviation <- as.numeric(for.deviation.plot.nobootstrap$n.clusters) - as.numeric(for.deviation.plot.nobootstrap$n.females.site) for.deviation.plot.nobootstrap$method <- plyr::revalue( for.deviation.plot.nobootstrap$method, c( 'affinity' = 'Affinity', 'pam' = 'K-medoids', 'mclust' = 'Gaussian' ) ) for.deviation.plot.nobootstrap$method <- factor(for.deviation.plot.nobootstrap$method, levels = c('Affinity', 'Gaussian', 'K-medoids')) levels(for.deviation.plot.nobootstrap$method) levels(for.deviation.plot.nobootstrap$site) <- c( 'Deramakot \n (N=8)', 'Danum Valley \n (N=12) ', 'Imbak Canyon \n (N=8)', 'Maliau Basin \n (N=3)', 'Kalabakan \n (N=22)' ) for.deviation.plot.nobootstrap <- for.deviation.plot.nobootstrap[order(for.deviation.plot.nobootstrap$n.females.site), ] deviation.actual <- ggboxplot( data = for.deviation.plot.nobootstrap, x = 'site', y = 'deviation', fill = 'method', width = 0.5 ) + xlab('') + ylab('Deviation from actual number \n of females') + theme(legend.title = element_blank()) + scale_fill_manual(values = rev(matlab::jet.colors(3))) + guides(fill = F) for.deviation.plot.nobootstrap$percent <- round(as.numeric(for.deviation.plot.nobootstrap$percent), 2) for.deviation.plot.nobootstrap$percent <- for.deviation.plot.nobootstrap$percent*100 percent.actual <- ggboxplot( data = for.deviation.plot.nobootstrap, x = 'site', y = 'percent', fill = 'method', width = 0.5 ) + xlab('') + ylab('Percent correct \n classification') + theme(legend.title = element_blank()) + scale_fill_manual(values = rev(matlab::jet.colors(3))) cowplot::plot_grid( percent.actual, deviation.actual, nrow = 2, labels = c('A', 'B'), label_x = 0.97 ) # Part 3. Code for Table 3 ------------------------------------------------ for.deviation.plot.nobootstrap$rand.index <- round(for.deviation.plot.nobootstrap$rand.index, digits = 2) for.deviation.plot.nobootstrap$nmi.val <- round(for.deviation.plot.nobootstrap$nmi.val, digits = 2) for.deviation.plot.nobootstrap$sil.coef <- round(as.numeric(for.deviation.plot.nobootstrap$sil.coef), digits = 2) summary.plot <- for.deviation.plot.nobootstrap %>% group_by(site,method) %>% dplyr::summarize(Percent=mean(percent),Percent.SD=sd(percent), Silhouette=mean(sil.coef),Silhouette.sd=sd(sil.coef), NMI=mean(nmi.val),NMI.sd=sd(nmi.val)) summary.plot[,3:8] <- round(summary.plot[,3:8], digits = 2) summary.plot$Percent.comb <- paste(summary.plot$Percent,' ± ', summary.plot$Percent.SD, sep='') summary.plot$Silhouette.comb <- paste(summary.plot$Silhouette,' ± ', summary.plot$Silhouette.sd, sep='') summary.plot$NMI.comb <- paste(summary.plot$NMI,' ± ', summary.plot$NMI.sd, sep='') summary.plot <- summary.plot[,c(1,2,9:11)] summary.plot <- rename(summary.plot, c("Site"= "site", "Clustering \n Method"="method", 'Percent correct \n (mean ± sd)' = 'Percent.comb' , 'Silhouette \n (mean ± sd)' = 'Silhouette.comb' , 'NMI \n (mean ± sd)' = 'NMI.comb' )) myft <- flextable( (summary.plot)) myft <- width(myft, width = 1) myft <- bold(myft, part = "header") myft <- merge_v(myft, j = c("Site") ) myft save_as_docx(myft,path='/Users/denasmacbook/UnsupervisedGibbonClassification/Table3.docx') # Part 4. UMAP Plot of gibbon calls --------------------------------------- female.umap <- umap::umap(female.mfcc.values[, -c(1:3, 181, 182)],labels=as.factor(female.mfcc.values$female.id),controlscale=TRUE,scale=3) # Part 5a: Affinty propagation clustering cluster.df.all <- apcluster::apcluster( negDistMat(r = 2), female.mfcc.values[, -c(1:3, 181, 182)], maxits = 100000, convits = 10000, nonoise = T ) match.cluster.df.all <- cbind.data.frame(female.mfcc.values$female.id[cluster.df.all@exemplars], cluster.df.all@exemplars) colnames(match.cluster.df.all) <- c('female', 'cluster.id') duplicated.clusters <- match.cluster.df.all[duplicated(match.cluster.df.all$female),] unique.duplicated <- unique(duplicated.clusters$female) updated.duplicated.df <- data.frame() for(g in 1:length(unique.duplicated)){ temp.duplicated <- droplevels(subset(match.cluster.df.all,female==unique.duplicated[g])) female.id <- unique.duplicated[g] cluster.list <- list() for(e in 1:nrow(temp.duplicated)){ cluster.index <- which(cluster.df.all@exemplars==temp.duplicated$cluster.id[e]) cluster.length <- length(cluster.df.all@clusters[[cluster.index]]) cluster.list[[e]] <- cluster.length } for(f in 1:nrow(temp.duplicated)){ if( f== which.max(unlist(cluster.list))) { temp.duplicated$female[f] <- female.id } else{ temp.duplicated$female[f] <- 'NA' } } updated.duplicated.df <- rbind.data.frame(updated.duplicated.df,temp.duplicated) } temp.matchdf <- match.cluster.df.all[! match.cluster.df.all$female %in% unique.duplicated, ] match.cluster.df.all <- rbind.data.frame(temp.matchdf,updated.duplicated.df) match.cluster.df.all$female <- fct_explicit_na(match.cluster.df.all$female, na_level = "None") female.mfcc.values$affinity.cluster <- cluster.df.all@idx female.mfcc.affinity.added <- data.frame() for (a in 1:length(cluster.df.all@exemplars)) { female.match <- subset(match.cluster.df.all, cluster.id ==cluster.df.all@exemplars[[a]]) #print(female.match) data.frame.subset <- subset(female.mfcc.values,affinity.cluster==female.match$cluster.id) data.frame.subset$affinity.id <- rep(female.match$female, nrow(data.frame.subset)) female.mfcc.affinity.added <- rbind.data.frame(female.mfcc.affinity.added, data.frame.subset) } female.index <- unique(female.mfcc.affinity.added$female.id) female.mfcc.affinity.added.adjusted <- data.frame() for (d in 1:length(female.index)) { temp.female.cluster.df.all <- droplevels(subset(female.mfcc.affinity.added, female.id == female.index[d])) cluster.female.table <- table(temp.female.cluster.df.all$affinity.id) cluster.most <- attr(cluster.female.table, "dimnames")[[1]] [(which.max(cluster.female.table))] for (i in 1:nrow(temp.female.cluster.df.all)) { if (temp.female.cluster.df.all$female.id[i] != cluster.most) { temp.female.cluster.df.all$affinity.id[i] = "NA" } } temp.female.cluster.df.all$affinity.id <- fct_explicit_na(temp.female.cluster.df.all$affinity.id, na_level = "None") female.mfcc.affinity.added.adjusted <- rbind.data.frame(female.mfcc.affinity.added.adjusted, temp.female.cluster.df.all) } list.vals <- list() for (i in 1:nrow(female.mfcc.affinity.added.adjusted)) { matching.vals <- ifelse( female.mfcc.affinity.added.adjusted$female.id[i] == female.mfcc.affinity.added.adjusted$affinity.id[i], 'Yes', 'No' ) list.vals[[i]] <- matching.vals } list.vals <- lapply(list.vals, function(x) replace(x, is.na(x), 'No')) female.mfcc.affinity.added.adjusted$match <- unlist(list.vals) recorder.id.df.affinity <- cbind.data.frame( female.umap$layout[,1:2], female.mfcc.affinity.added.adjusted$female.id, female.mfcc.affinity.added.adjusted$match ) colnames(recorder.id.df.affinity) <- c("Comp.1", "Comp.2", "recorder.id", 'match') recorder.id.df.affinity$recorder.id <- as.factor(recorder.id.df.affinity$recorder.id) color.vals <- matlab::jet.colors(2) nrow(subset(recorder.id.df.affinity,match=='Yes'))/nrow(recorder.id.df.affinity) recorder.id.df.affinity$match <- as.factor(recorder.id.df.affinity$match) my_plot_affinity <- ggplot(data = recorder.id.df.affinity, aes( x = Comp.1, y = Comp.2, colour = match, shape = match )) + geom_point(size = 3) + scale_color_manual(values = c('red', 'blue')) + theme_bw() + ggtitle('Affinity (65% correct)') + labs(shape = "Correct \n classification?", color = "Correct \n classification?")+ xlab('UMAP: Dim 1')+ylab('UMAP: Dim 2') my_plot_affinity # Part 5b. All females k-medoids female.mfcc.values <- dplyr::select(female.mfcc.values, -c(affinity.cluster)) sil.list <- list() for (z in 2:100) { print(z) pr4 <- pam(female.mfcc.values[, -c(1:3, 180, 181, 182,183)], z) sil.sum <- summary(silhouette(pr4)) sil.list[[z]] <- (sil.sum$avg.width) } kmedoids.top.model.all <- pam(female.mfcc.values[, -c(1:3, 180, 181, 182)], k = (which.max(unlist(sil.list)) + 1)) female.mfcc.with.snr.pam <- female.mfcc.values female.mfcc.with.snr.pam$pamclust <- kmedoids.top.model.all$clustering female.id.kmedoids.df <- data.frame() cluster.index <- length(unique(female.mfcc.with.snr.pam$pamclust)) for (d in 1:cluster.index) { temp.kmedoids.df <- droplevels(subset(female.mfcc.with.snr.pam, pamclust == d)) cluster.number.table <- table(temp.kmedoids.df$female.id, temp.kmedoids.df$pamclust) female.most.likely <- attr(cluster.number.table, "dimnames")[[1]] [(which.max(cluster.number.table))] female.id.added <- rep(female.most.likely, nrow(temp.kmedoids.df)) new.cluster.table <- cbind.data.frame(temp.kmedoids.df, female.id.added) female.id.kmedoids.df <- rbind.data.frame(female.id.kmedoids.df, new.cluster.table) } female.index <- unique(female.id.kmedoids.df$female.id) female.id.kmedoids.all.df.adjusted <- data.frame() for (d in 1:length(female.index)) { temp.female.kmedoids.all.df <- droplevels(subset(female.id.kmedoids.df, female.id == female.index[d])) kmedoids.all.female.table <- table(temp.female.kmedoids.all.df$pamclust) kmedoids.all.most <- attr(kmedoids.all.female.table, "dimnames")[[1]] [(which.max(kmedoids.all.female.table))] if (kmedoids.all.most %in% temp.female.kmedoids.all.df$female.id.added) { female.most.likely <- 'NA' } for (i in 1:nrow(temp.female.kmedoids.all.df)) { if (temp.female.kmedoids.all.df$pamclust[i] != kmedoids.all.most) { temp.female.kmedoids.all.df$female.id.added[i] = "NA" } } female.id.kmedoids.all.df.adjusted <- rbind.data.frame(female.id.kmedoids.all.df.adjusted, temp.female.kmedoids.all.df) } list.vals <- list() for (i in 1:nrow(female.id.kmedoids.all.df.adjusted)) { matching.vals <- ifelse( female.id.kmedoids.all.df.adjusted$female.id[i] == female.id.kmedoids.all.df.adjusted$female.id.added[i], 'Yes', 'No' ) list.vals[[i]] <- matching.vals } list.vals <- lapply(list.vals, function(x) replace(x, is.na(x), 'No')) female.id.kmedoids.df$match <- unlist(list.vals) recorder.id.df.kmedoids <- cbind.data.frame(female.umap$layout[,1:2], female.id.kmedoids.df$female.id, female.id.kmedoids.df$match) colnames(recorder.id.df.kmedoids) <- c("Comp.1", "Comp.2", "recorder.id", 'match') recorder.id.df.kmedoids$recorder.id <- as.factor(recorder.id.df.kmedoids$recorder.id) color.vals <- matlab::jet.colors(length(unique(recorder.id.df.kmedoids$recorder.id))) nrow(subset(recorder.id.df.kmedoids,match=='Yes'))/nrow(recorder.id.df.kmedoids) recorder.id.df.kmedoids$match <- as.factor(recorder.id.df.kmedoids$match) my_plot_kmedoids <- ggplot(data = recorder.id.df.kmedoids, aes( x = Comp.1, y = Comp.2, colour = match, shape = match )) + geom_point(size = 3) + scale_color_manual(values = c('red', 'blue')) + theme_bw() + ggtitle('K-medoids (17% correct)') + labs(shape = "Correct \n classification?", color = "Correct \n classification?") + xlab('UMAP: Dim 1')+ylab('UMAP: Dim 2') my_plot_kmedoids # Part 5c. All females Gaussian mixture models allfemales.mclustmodel <- Mclust(female.mfcc.values[, -c(1:3, 180, 181, 182)], G = 1:100, c("EII", "VII", "EEI", "EVI", "VEI", "VVI")) mclust.all.data <- female.mfcc.values mclust.all.data$mclust.cluster <- allfemales.mclustmodel$classification female.id.mclust.df <- data.frame() cluster.index <- length(unique(mclust.all.data$mclust.cluster)) for (d in 1:cluster.index) { print(d) temp.mclust.df <- droplevels(subset(mclust.all.data, mclust.cluster == d)) cluster.number.table <- table(temp.mclust.df$female.id, temp.mclust.df$mclust.cluster) female.most.likely <- attr(cluster.number.table, "dimnames")[[1]] [(which.max(cluster.number.table))] if (female.most.likely %in% female.id.mclust.df$female.id.added) { female.most.likely <- 'NA' } female.id.added <- rep(female.most.likely, nrow(temp.mclust.df)) new.cluster.table <- cbind.data.frame(temp.mclust.df, female.id.added) female.id.mclust.df <- rbind.data.frame(female.id.mclust.df, new.cluster.table) } female.index <- unique(female.id.mclust.df$female.id) female.id.mclust.all.df.adjusted <- data.frame() for (d in 1:length(female.index)) { temp.female.mclust.all.df <- droplevels(subset(female.id.mclust.df, female.id == female.index[d])) mclust.all.female.table <- table(temp.female.mclust.all.df$mclust.cluster) mclust.all.most <- attr(mclust.all.female.table, "dimnames")[[1]] [(which.max(mclust.all.female.table))] for (i in 1:nrow(temp.female.mclust.all.df)) { if (temp.female.mclust.all.df$mclust.cluster[i] != mclust.all.most) { temp.female.mclust.all.df$female.id.added[i] = "NA" } } female.id.mclust.all.df.adjusted <- rbind.data.frame(female.id.mclust.all.df.adjusted, temp.female.mclust.all.df) } list.vals <- list() for (i in 1:nrow(female.id.mclust.all.df.adjusted)) { matching.vals <- ifelse( female.id.mclust.all.df.adjusted$female.id[i] == female.id.mclust.all.df.adjusted$female.id.added[i], 'Yes', 'No' ) list.vals[[i]] <- matching.vals } list.vals <- lapply(list.vals, function(x) replace(x, is.na(x), 'No')) female.id.mclust.df$match <- unlist(list.vals) recorder.id.df.gaussian <- cbind.data.frame(female.umap$layout[,1:2], female.id.mclust.df$female.id, female.id.mclust.df$match) colnames(recorder.id.df.gaussian) <- c("Comp.1", "Comp.2", "recorder.id", 'match') recorder.id.df.gaussian$recorder.id <- as.factor(recorder.id.df.gaussian$recorder.id) color.vals <- matlab::jet.colors(length(unique(recorder.id.df.gaussian$recorder.id))) recorder.id.df.gaussian$match <- as.factor(recorder.id.df.gaussian$match) nrow(subset(recorder.id.df.gaussian,match=='Yes'))/nrow(recorder.id.df.gaussian) my_plot_mclust <- ggplot(data = recorder.id.df.gaussian, aes( x = Comp.1, y = Comp.2, colour = match, shape = match )) + geom_point(size = 3) + scale_color_manual(values = c('red', 'blue')) + theme_bw() + ggtitle('Gaussian (53% correct)') + labs(shape = "Correct \n classification?", color = "Correct \n classification?") + xlab('UMAP: Dim 1')+ylab('UMAP: Dim 2') my_plot_mclust # Plot color by female ID plot.for.females <- recorder.id.df.gaussian plot.for.females$shape.dummy <- rep('A',nrow(recorder.id.df.gaussian)) my_plot_females <- ggplot(data = plot.for.females, aes( x = Comp.1, y = Comp.2, colour = recorder.id,shape = shape.dummy )) + geom_point(size = 3) + scale_color_manual(values = matlab::jet.colors (length(unique(recorder.id.df.gaussian$recorder.id)))) + theme_bw() + ggtitle('Actual Female Identity (N=53)') + guides(color=FALSE) + xlab('UMAP: Dim 1')+ylab('UMAP: Dim 2')+ labs(shape = "Correct \n classification?")# + theme(legend.position = "none") my_plot_females # Combine all plots cowplot::plot_grid(my_plot_females,my_plot_affinity, my_plot_mclust, my_plot_kmedoids, labels = c('A','B','C','D'), label_x = 0.75 )