# =========================== # # # Jenkins et al. 2020 Ecology and Evolution # R script for Corsica lobster assignment analysis # # Description: # Run and visualise a DAPC analysis # Assign Corsica lobsters to the Atlantic or the Mediterranean # # Notes before execution: # 1. Make sure all required R packages are installed. # 2. Set working directory to the location of this R script. # # =========================== # # Load libraries library(adegenet) library(assignPOP) library(ggplot2) library(dplyr) library(reshape2) library(gridExtra) # ----------------- # # # Data import and preparation # # ----------------- # # Import SNP genotypes load("./Data/all_filt_geno.RData") snpdata # Merge Laz/Tar and Sar13 and Sar17 popNames(snpdata) = gsub("Tar", "Laz", popNames(snpdata)) popNames(snpdata) = gsub("Sar13", "Sar", popNames(snpdata)) popNames(snpdata) = gsub("Sar17", "Sar", popNames(snpdata)) summary(snpdata$pop) # ----------------- # # # Discriminant analysis of principal components # # ----------------- # # Run cross validation to find the optimal number of PCs to retain in DAPC set.seed(123) # x = tab(snpdata, NA.method="mean") # xval = xvalDapc(x, snpdata$pop, result="groupMean", parallel="snow", ncpus=2) # Number of PCs with best stats. Lower score = better # xval$`Number of PCs Achieving Highest Mean Success` # xval$`Number of PCs Achieving Lowest MSE` # xval$`Root Mean Squared Error by Number of PCs of PCA` # 70 PCs to retain # Run a DAPC using population IDs as priors dapc1 = dapc(snpdata, snpdata$pop, n.pca=70, n.da=3) # Analyse how much percent of genetic variance is explained by each axis percent = dapc1$eig/sum(dapc1$eig)*100 percent barplot(percent, ylab="Percent of genetic variance explained by eigenvectors", names.arg=round(percent, 2)) # Create dataframe with PC info df_pca = as.data.frame(dapc1$ind.coord) # Add a column containing individuals df_pca = cbind(rownames(df_pca), df_pca) # Rename first three columns colnames(df_pca) = c("Indiv","PC1","PC2","PC3") # Add a column with the population IDs df_pca$pop = snpdata$pop # Flip axis by multiplying by minus 1 # df_pca[ , 2:4] = df_pca[ , 2:4]*-1 # ----------------- # # # Visualise DAPC # # ----------------- # # Function to add country labels to population labels addregion = function(x){ # If pop label is present function will output the region if(x=="Ale"|x=="The"|x=="Tor"|x=="Sky") y = " Aegean Sea " if(x=="Sar"|x=="Laz") y = " Central Mediterranean " if(x=="Csa") y = " Corsica " if(x=="Jer") y = " Atlantic " if(x=="Idr16"|x=="Idr17"|x=="Bre") y = " Atlantic " if(x=="Hel") y = " Atlantic " if(x=="Cor"|x=="Hoo"|x=="Kil"|x=="Mul"|x=="Ven") y = " Atlantic " if(x=="Oos") y = " Atlantic " if(x=="Tro"|x=="Ber"|x=="Flo"|x=="Sin") y = " Atlantic " if(x=="Gul"|x=="Kav"|x=="Lys") y = " Atlantic " if(x=="Vig") y = " Atlantic " if(x=="Brd"|x=="Cro"|x=="Eye"|x=="Heb"|x=="Iom"|x=="Ios"|x=="Loo"|x=="Lyn"|x=="Ork"|x=="Pad"|x=="Pem"|x=="She"|x=="Sbs"|x=="Sul") y = " Atlantic " return(y) } df_pca$region = sapply(df_pca$pop, addregion) unique(df_pca$region) # Reorder levels for plotting df_pca$region = factor(df_pca$region, levels=c(" Aegean Sea ", " Central Mediterranean ", " Atlantic ", " Corsica ")) # Calculate centroid position for each population centroid = aggregate(cbind(PC1, PC2, PC3) ~ pop, data=df_pca, FUN=mean) centroid$region = sapply(centroid$pop, addregion) # Find and store coordinate info required to draw segments segs = merge(df_pca, setNames(centroid, c("pop","oPC1S1","oPC2S2","oPC3S3")), by = "pop", sort = FALSE) # Colour scheme cols = c("#E31A1C","#FDB462","#377EB8","yellow") # ggplot2 theme dapctheme = theme(legend.title = element_blank(), axis.text.y = element_text(colour="black", size=14), axis.text.x = element_text(colour="black", size=14), axis.title = element_text(colour="black", size=14), legend.position = "top", legend.text = element_text(size=15), legend.key = element_blank(), legend.key.size = unit(0.7,"cm"), legend.box.spacing = unit(0, "cm"), panel.grid = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour="black", fill=NA, size=1) ) # Scatter plot axis 1 vs 2 dapc1_plt = ggplot(df_pca, aes(x=PC1, y=PC2))+ geom_hline(yintercept = 0)+ geom_vline(xintercept = 0)+ # spiders geom_segment(data=subset(segs, pop!="Csa"), aes(xend=oPC1S1, yend=oPC2S2,colour=region), show.legend=FALSE)+ # geom_point(data=subset(df_pca, pop!="Csa"), aes(fill=region), shape=21, # size=3, show.legend=T)+ geom_point(data=df_pca, aes(fill=region), shape=21, size=3, show.legend=T)+ # centroids geom_label(data=subset(centroid, pop!="Csa"), size=5, aes(label=pop, fill=region), show.legend=FALSE)+ # Corsica labels geom_label(data=subset(df_pca, pop=="Csa"), aes(label=Indiv), fill="yellow", size=4, fontface="bold")+ scale_fill_manual(values=cols)+ scale_color_manual(values=cols)+ labs(x=paste("Axis 1 (",format(round(percent[1], 1), nsmall=1)," %)", sep=""))+ labs(y=paste("Axis 2 (",format(round(percent[2], 1), nsmall=1)," %)", sep=""))+ dapctheme+ # Remove double legend guides(colour = FALSE) dapc1_plt # ggsave("dapc.png", width=10, height=8, dpi=600) # ----------------- # # # Corsica assignment analysis # # ----------------- # # Import Corsica genotypes corsica_geno = read.Genepop("./Data/corsica_filt_geno.gen") corsica_geno # Import baseline assignment data from Jenkins et al. 2019 # https://doi.org/10.5061/dryad.2v1kr38 # "lobster_basin_baseline.gen" in "Assignment_genotype_data.zip" baseline_dataset = read.Genepop("./Data/lobster_basin_baseline.gen", pop.names=c("Atlantic","Mediterranean")) baseline_dataset # Perform assignment test on Corsica individuals using the SVM model set.seed(123) corsica_assign = assign.X(x1 = baseline_dataset, x2 = corsica_geno, dir = "Corsica_assignment_svn/", model = "svm", mplot = TRUE) # Model predictions corsica_pred = corsica_assign$data[1:20,] %>% arrange(Ind.ID) subset(corsica_pred, select = c("Ind.ID","pred.pop")) # Extract assignment probabilities assign_prop = read.table("Corsica_assignment_svn/AssignmentResult.txt", header=TRUE) %>% arrange(Ind.ID) assign_prop write.csv(assign_prop, file="./Corsica_assignment_svn/Corsica_assignments.csv") # Transform data to long format assign_prop = melt(assign_prop, id.vars = c("Ind.ID","pred.pop")) # ggplot theme for barplot ggtheme = theme( axis.title = element_text(colour="black", size=15), axis.text.y = element_text(colour="black", size=12), axis.text.x = element_text(colour="black", size=12), panel.background = element_blank(), legend.title = element_blank(), legend.text = element_text(size=18), legend.position = "top" ) # Barplot assign_bar = ggplot(data=assign_prop, aes(x=Ind.ID, y=value, fill=variable))+ geom_bar(stat="identity", show.legend = F)+ scale_y_continuous(expand=c(0,0))+ scale_fill_manual("Origin", values=c("#377EB8","#FDB462"), labels=c(" Atlantic "," Mediterranean "))+ ylab("Assignment probability")+ scale_x_discrete(labels = substr(assign_prop$Ind.ID, 4, 5))+ xlab("Individual")+ ggtheme assign_bar # ggsave("assign_bar.png", width=12, height=6.5, dpi=600) # ----------------- # # # Composite figure # # ----------------- # # ggtheme for combined plot ggthemeC = theme( axis.title = element_text(colour="black", size=14), legend.text = element_text(colour="black", size=14), plot.title = element_blank() ) # Combine plots using ggarrange plt = grid.arrange(dapc1_plt + ggthemeC + guides(fill = guide_legend(override.aes = list(size = 5))) + labs(tag="A"), assign_bar + ggthemeC + labs(tag="B"), nrow = 3, layout_matrix = rbind(1,1,2)) # Export figure dev.copy(png, "Fig3.png", width=10, height=10, units="in", res=600) dev.off() dev.copy(tiff, "Fig3.tiff", width=10, height=10, units="in", res=600) dev.off() dev.copy2pdf(file="Fig3.pdf", width=10, height=10) dev.off() # ----------------- # # # Sardinia assignment analysis # # ----------------- # # Import Sardinia genotypes sardinia_geno = read.Genepop("./Data/sardinia_geno.gen") sardinia_geno # Import baseline dataset excluding Sardinia samples baseline_excSar = read.Genepop("./Data/lobster_basin_baseline_excSardinia.gen", pop.names=c("Atlantic","Mediterranean")) baseline_excSar # Perform assignment test on Corsica individuals using the SVM model # Function will ask to 'use common features for assignment' # Input a "Y" (yes) into the R console set.seed(123) sardinia_assign = assign.X(x1 = baseline_excSar, x2 = sardinia_geno, dir = "Sardinia_assignment_svn/", model = "svm", mplot = TRUE) # Model predictions sardinia_pred = sardinia_assign$data[1:20,] %>% arrange(Ind.ID) subset(sardinia_pred, select = c("Ind.ID","pred.pop")) # Extract assignment proportions assign_sar = read.table("Sardinia_assignment_svn/AssignmentResult.txt", header=TRUE) %>% arrange(Ind.ID) assign_sar write.csv(assign_sar, file="./Sardinia_assignment_svn/Sardinia_assignments.csv") # Transform data to long format assign_sar = melt(assign_sar, id.vars = c("Ind.ID","pred.pop")) # Barplot sar_bar = ggplot(data=assign_sar, aes(x=Ind.ID, y=value, fill=variable))+ geom_bar(stat="identity", show.legend = F)+ scale_y_continuous(expand=c(0,0))+ scale_fill_manual("Origin", values=c("#377EB8","#FDB462"), labels=c(" Atlantic "," Mediterranean "))+ ylab("Assignment probability")+ xlab("Individual")+ theme( axis.title = element_text(colour="black", size=15), axis.title.x = element_blank(), axis.text.y = element_text(colour="black", size=12), axis.text.x = element_text(colour="black", size=12, angle = 90, vjust = 0.5), panel.background = element_blank(), legend.title = element_blank(), legend.text = element_text(size=18), legend.position = "top" ) sar_bar # ggsave("A2_sardinia_assignments.png", width=10, height=4, dpi=600) # ggsave("A2_sardinia_assignments.pdf", width=10, height=4)