### STRUCTURE ### # script for population STRUCTURE analysis using package 'StrataG' # needed packages # install from GitHub #devtools::install_github('ericarcher/strataG', build_vignettes = TRUE) #if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("SNPRelate") library(strataG) library(adegenet) library(dartR) library(tidyverse) library(scatterpie) library(sf) library(cowplot) library(dplyr) library(leaflet) library(leaflet.minicharts) library(ggsn) library(ggpubr) library(parallel) library(doParallel) # load filtered gl dataset gl <- readRDS("data/gl.filt_no_related_ind.rds") # data in this slot needs to be present for the report LD function to work gl$loc.all #gl$loc.all <- rep("G/C",nLoc(gl)) # get data ready for LD report gl$position <- gl$other$loc.metrics$ChromPos_Lichenostomus_HeHo_v1 gl$chromosome <- as.factor(gl$other$loc.metrics$Chrom_Lichenostomus_HeHo_v1) # create loci in linkage disequilibrium report ld_res <- gl.report.ld.map(gl,ld_max_pairwise = 10000000) # filter using report gl.filt_LD <- gl.filter.ld(x=gl,ld_report = ld_res) # 12269 loci retained after filtering # save out file saveRDS(gl.filt_LD, "data/gl.filt_no_related_ind_LD.rds") # convert genlight object to {adegenet} genind pard_genind <- gl2gi(gl.filt_LD, v=0) # convert genid to gtypes to work in StrataG gt.data <- genind2gtypes(pard_genind) saveRDS(gt.data, "data/gt.data_for_structure.rds") ### STRUCTURE ### # The structureRun function formats the data, writes input files, runs the external executable for STRUCTURE, then reads and passes the output files into an R list structure. # Run STRUCTURE for k = 1 to 8 # noadmix = default is TRUE (1), when set to FALSE (0) allows for an assumption of admixture among your populations # FREQSCORR = default is FALSE (0) (I always set this to 1 as a model of correlated allele freqs among populations is realistic if there is some gene flow. However, Wang reckons the model is more powerful with uncorrelated allele freqs. Maybe try Wang's version but you could always rerun it afterwards with correlated allele freqs) # ALPHA default is 1.0 (set to 1/K, or at least 1/ 'expected number of pops'), but we set to 0.12 (divided 1.0/n of k (in our case 8) # define UNIFPRIORALPHA 1 (set at 0 - this is the parameter in the Wang et al paper that enables alpha to vary among populations) # define STARTATPOPINFO 0 (0 is probably OK but 1 will start the simulations with individuals split into their sampled population. If your runs are long enough, the data will be resolved properly anyway) - STRUCTURE <- structureRun(gt.data, k.range = 1:8, num.k.rep = 20, burnin = 100000, numreps = 100000,noadmix = FALSE, freqscorr = TRUE, unifprioralpha = FALSE, alpha = 0.125, n_cores = 5, delete.files = FALSE, label = "STRUCTURE_corr_freq_related_ind_remov") saveRDS(STRUCTURE, "data/STRUCTURE_related_ind_remov.rds") # Display diagnostics plots and table from Evanno et al (2005) evanno(STRUCTURE) # new analysis # The clumpp function aggregates STRUCTURE runs for a single value of K and is visualized with the structurePlot function which uses the GGPLOT2 graphics package # Run CLUMPP to aggregate the results of k = 3 (decided after exploratory visualisation) STRUCTURE.clumpp3 <-clumpp (STRUCTURE, k = 3) # Show distribution of group membership probabilities by strata in a barplot # check how many individuals remained in each pop now that multiple nestlings from a brood got removed summary(pop(pard_genind)) # turn the column id into a factor and reorder the populations, so I can have them plotted in this order (not alphabetically which is ggplot's default) STRUCTURE.clumpp3$orig.pop <- factor(STRUCTURE.clumpp3$orig.pop, levels=c("Maria", "Tinderbox", "North Bruny", "South Bruny", "Partridge", "Southport")) # create colour pallete col <- c("#40BC72FF", "#EDE51BFF","#31688EFF") # plot p <- structurePlotAlt(STRUCTURE.clumpp3, horiz = FALSE, scale.x.labels = c("SP (3)", "P (12)" , "SB (16)", "NB (113)", "T(25)", "M (79)"), col = col) p <- p + theme(axis.text=element_text(size=12), axis.title=element_text(size=12), axis.text.x = element_text(angle = 45, hjust = 1), legend.text=element_text(size=14)) # now plot mean pop ancestry coefficients on a map # First for the "best" according to evanno's plot for the analysis in which freqscorr = TRUE # get ancestry coefficients matrix for a cluster of 3 (according to evanno's) Qmatrix_T <- STRUCTURE.clumpp3 [,c("id", "orig.pop", "Group.1", "Group.2", "Group.3")] # create a matrix by pop Qmatrix_pop_T <- Qmatrix_T %>% group_by(orig.pop) %>% summarise(Group.1 = mean (Group.1), Group.2 = mean (Group.2), Group.3 = mean (Group.3)) # coordinates for each "pop" # Maria = -42.637536, 148.074719 # Tinderbox = -43.038770, 147.322325 # North Bruny = -43.131711, 147.374027 # South Bruny = -43.392666, 147.249700 # Partridge Island = -43.390106, 147.099930 # Southport = -43.424706, 146.973252 # creating a data frame with coordinates for each pop and sample size # create vectors orig.pop <- c("Maria", "Tinderbox", "North Bruny", "South Bruny", "Partridge", "Southport") lat <- c(-42.637536, -43.038770, -43.131711, -43.392666, -43.390106, -43.424706) lon <- c(148.074719, 147.322325, 147.374027, 147.249700, 147.099930, 146.973252) samp.size <- c(79, 25, 114, 16, 12, 3) pie.radius <- c(2.5,2,3,2,1.5,1) # this will represent sample size in a more sensible way # now make data frame pop.coord <- data.frame(orig.pop, lat, lon, samp.size, pie.radius) # join Q.matrix dataframe and coordinates Qmatrix_pop_T <- dplyr::inner_join(Qmatrix_pop_T, pop.coord, by="orig.pop") # load shapefile of Tasmania tas_shp <- read_sf("tasmania/TAS_STATE_POLYGON_shp.shp") # check that the shapefile has a coordinate reference system (CRS) st_crs(tas_shp) #EPSG 4283 - GDA94 # have a look at the shapefile, so I can get the coordinates to zoom in to ggplot()+ geom_sf(data=tas_shp) # crop only sampled area, so I can zoom in later sampled_area <- st_crop(tas_shp, xmin = 146.7, xmax = 149, ymin = -44.7, ymax = -42.4) # extract bounding box of the study area study_area_bb = st_as_sfc(st_bbox(sampled_area)) # plot first map which will be the inset map1_c3 <- ggplot() + geom_sf(data = tas_shp, fill = "white") + geom_sf(data = study_area_bb, fill = NA, color = "red", size = 1.2) + theme_void()+ theme(panel.border = element_rect(colour="black", fill = NA), panel.background = element_rect(fill = "white")) # plot that will be used in the multi-panel (without legend) map2_c3 <- ggplot() + geom_sf(data=sampled_area, color = "dark grey")+ geom_scatterpie(data = Qmatrix_pop_T, aes(lon, lat, r = pie.radius/50), cols = c("Group.1", "Group.2", "Group.3"), pie_scale = 1.5) + scale_fill_manual(values=col)+ # colour pallete created above for the ancestor membership plot theme_bw() + theme(plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), axis.line = element_line(color = 'black'), legend.position = "none", axis.text=element_text(size=14),axis.title=element_text(size=14)) + labs(x = "Longitude", y = "Latitude") + north(sampled_area, location = "topright", symbol = 4) + # adds arrow scalebar(sampled_area, x.min = 147.9, x.max = 148.2, y.min = 45, y.max = 43,dist = 8, dist_unit = "km", transform = T, location = "bottomleft") # now join inset and main map pop_cluster3 <- ggdraw() + draw_plot(map2_c3) + draw_plot(map1_c3, x = 0.72, y = 0.15, width = 0.25, height = 0.25) # x = 0.65, y = 0.12 #pop_cluster3 # now make a 2 panel plot with map and membership probability barplot png('plots/STRUCTURE_cluster3.v2.png', width=15, height=8, units='in',res=300, type="cairo") ggarrange(ggarrange(p, pop_cluster3, ncol = 2, labels = c("A", "B"))) dev.off() #### Plot ancestry coefficients for individuals for freqT # get coord and individual id id_coord <- gt.data@other$genind$ind.metrics [, c("id", "lat", "lon", "original_id")] id_coord$id <- as.character(id_coord$id) # join individual Q matrix and coordinates Qmatrix_freqT <- dplyr::inner_join(STRUCTURE.clumpp3, id_coord, by="id") # create new columns to store jittered lat and lon, so overlapping points can be seen (still needs to zoom in tough) Qmatrix_freqT$latitude <- jitter(Qmatrix_freqT$lat, factor = 20) #120 and 80 SB; #100 and 60 NB; 15 and 6 Partridge Qmatrix_freqT$longitude <- jitter(Qmatrix_freqT$lon, factor = 10) #20 and 10 Southport; 80 and 40 others # interactive plot leaflet(data=Qmatrix_freqT)%>% addTiles()%>% addMinicharts(Qmatrix_freqT$longitude, Qmatrix_freqT$latitude, type = "pie", chartdata = Qmatrix_freqT[, c("Group.1", "Group.2", "Group.3")], popup=popupArgs(supValues = Qmatrix_freqT[,c("original_id", "id")]), colorPalette = col, # colours created at the beginning of the code width = 20, transitionTime = 0)