## Scripts to simulate hybridization and backcrossing for ADMIXTURE analyses ## using bi-allelic SNPs (input file is .ped format) # 1st create datasets without missing values > Missing data is replaced by # randomly chosing an allele from the population impute.gen <- function(x, y = unique(x[,1])[1], z = unique(x[,1])[2]){ # version 0.2 written by Volker Bahn, May 2015, volker.bahn@wright.edu # Copyright 2015 Volker Bahn # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # For a copy of the GNU General Public License see # . # x is the dataset. Loci are in columns 7 to end. Column one has population id # y is the identifier for population one, z the identifier for population two # The function is limited to two populations # Missing data are zeros. They are replaced by randomly sampling from the column # they are in with replacement # This is slow with large datasets but only has to be run once # Check for two populations if (length(unique(x[,1])) != 2) stop("Not two populations") # Defining population rows pop1 <- x[,1] == y pop2 <- x[,1] == z # Looping through columns for (i in 7:(dim(x)[2])){ # Identify missing values in population 1 pos <- which(x[,i] == 0 & pop1) # Replacing all missing values with random sample with replacement within pop 1 if (length(pos) != 0) x[pos, i] <- sample(x[pop1 & x[,i] != 0, i], length(pos), replace = T) # Identify missing values in population 2 pos <- which(x[,i] == 0 & pop2) # if no missing values are found in pop 2, move on to next column (loop) if (length(pos) == 0) next # Replacing all missing values with random sample with replacement within pop 2 x[pos, i] <- sample(x[pop2 & x[,i] != 0, i], length(pos), replace = T) } # Output imputed dataset x } ################## end of function impute.gen() ############################### sim.hybrid <- function(x, y = unique(x[,1])[1], z = unique(x[,1])[2], frange = 10, n = 10){ # version 0.2 written by Volker Bahn, April 2015, volker.bahn@wright.edu # Copyright 2015 Volker Bahn # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # For a copy of the GNU General Public License see # . # # x is the dataset. The population id needs to be in column 1 # y is the identifier for population one, z the identifier for population two # The function is limited to two populations # Loci are in columns 7 to 536 (**need to change the last number to match number # of columns in input ped file), which is the last column. # Data contains no columns of class "factor", only character vectors # y and z default to the 1. and 2. unique character string encountered in column 1 # frange is the number of generations to be simulated # n is the number of individuals to be simulated # Each simulated individual will be forced to be a hybrid in F1 # In subsequent generations, half of these hybrids will back cross into pop 1 # and half into pop 2 # For each locus, alleles will be random draws from the two different populations # for F1. In subsequent generations, individuals will stay in a single population # In each generation positions of alleles on loci A or B are randomized # Output is a file identical in columns to the input, but Family.ID will be # the pop the individual ended up in after hybridization, sex will be all # male and individual will be named according to the following scheme: # F1.h.1 (for generation 1, hybrid, individual 1) # Identifyer for F2 and beyond: F2.GRSC.2 (for generation 2, back cross to pop # GRSC, and individual 2). # There is no accomodation for missing values (or the zero used in original data) # They need to be dealt with before hand. # Results dataframe results <- x[1:(frange*n),] # Check for two populations if (length(unique(x[,1])) != 2) stop("Not two populations") # Defining population rows pop1 <- which(x[,1] == y) pop2 <- which(x[,1] == z) # Loop through simulated individuals for (i in 1:n){ # Pick random allele from A or B rand.allele <- c(sample(c(T, F), 536, replace = T)) # Don't pick from the first 6 columns (do not contain alleles) rand.allele[1:6] <- F # Pick first allele from random individual of population 1 results[i, rand.allele] <- x[sample(pop1,1), rand.allele] # Chance the excluded first 6 columns before using the whole vector flipped rand.allele[1:6] <- T # Pick the other allele from an individual of population 2 results[i, !rand.allele] <- x[sample(pop2,1), !rand.allele] } # Second and subsequent generations need a different loop # Need to split the simulated individuals in half and back cross frange-1 times # Since the original back cross population doesn't change, # all generations are done at once # Loop for simulated individuals for (i in 1:n){ # Loop for subsequent generations (after the F1 hybridization) for (j in 1:(frange - 1)){ # randomize which allele will be picked, A or B rand.allele <- c(sample(c(T, F), 536, replace = T)) # Don't pick from the first 6 columns (do not contain alleles) rand.allele[1:6] <- F # Get one half of alleles from the hybrid results[i+j*n, rand.allele] <- results[i+(j-1)*n, rand.allele] # Get other half of alleles from the new home population if (i > n/2) foo <- pop2 else foo <- pop1 # Flip the first six columns before using the whole vector flipped rand.allele[1:6] <- T # Get other half of alleles from the new home population results[i+j*n, !rand.allele] <- x[sample(foo, 1), !rand.allele] }} # Add indivdual labels as described above gen <- paste0("F", 1:frange) results[,2] <- paste0(rep(gen, each = n), ".", c(rep("h", n), rep(c(y, z), times = frange - 1, each = n/2)), ".", rep(1:10, frange)) # Also fix up other columns in results as described above results[,1] <- c(rep("hybrid", n), rep(c(y, z), times = frange - 1, each = n/2)) results[,5] <- 1 # merge to original data and output foo <- rbind(w, results) foo } ################## end of function sim.hybrid() ###############################