## Scripts to simulate hybridization and backcrossing for ADMIXTURE analyses
## using bi-allelic SNPs (input file is .ped format)
# 1st create datasets without missing values > Missing data is replaced by
# randomly chosing an allele from the population
impute.gen <- function(x, y = unique(x[,1])[1], z = unique(x[,1])[2]){
# version 0.2 written by Volker Bahn, May 2015, volker.bahn@wright.edu
# Copyright 2015 Volker Bahn
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# For a copy of the GNU General Public License see
# .
# x is the dataset. Loci are in columns 7 to end. Column one has population id
# y is the identifier for population one, z the identifier for population two
# The function is limited to two populations
# Missing data are zeros. They are replaced by randomly sampling from the column
# they are in with replacement
# This is slow with large datasets but only has to be run once
# Check for two populations
if (length(unique(x[,1])) != 2) stop("Not two populations")
# Defining population rows
pop1 <- x[,1] == y
pop2 <- x[,1] == z
# Looping through columns
for (i in 7:(dim(x)[2])){
# Identify missing values in population 1
pos <- which(x[,i] == 0 & pop1)
# Replacing all missing values with random sample with replacement within pop 1
if (length(pos) != 0) x[pos, i] <- sample(x[pop1 & x[,i] != 0, i],
length(pos), replace = T)
# Identify missing values in population 2
pos <- which(x[,i] == 0 & pop2)
# if no missing values are found in pop 2, move on to next column (loop)
if (length(pos) == 0) next
# Replacing all missing values with random sample with replacement within pop 2
x[pos, i] <- sample(x[pop2 & x[,i] != 0, i], length(pos), replace = T)
}
# Output imputed dataset
x
}
################## end of function impute.gen() ###############################
sim.hybrid <- function(x, y = unique(x[,1])[1], z = unique(x[,1])[2],
frange = 10, n = 10){
# version 0.2 written by Volker Bahn, April 2015, volker.bahn@wright.edu
# Copyright 2015 Volker Bahn
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# For a copy of the GNU General Public License see
# .
#
# x is the dataset. The population id needs to be in column 1
# y is the identifier for population one, z the identifier for population two
# The function is limited to two populations
# Loci are in columns 7 to 536 (**need to change the last number to match number
# of columns in input ped file), which is the last column.
# Data contains no columns of class "factor", only character vectors
# y and z default to the 1. and 2. unique character string encountered in column 1
# frange is the number of generations to be simulated
# n is the number of individuals to be simulated
# Each simulated individual will be forced to be a hybrid in F1
# In subsequent generations, half of these hybrids will back cross into pop 1
# and half into pop 2
# For each locus, alleles will be random draws from the two different populations
# for F1. In subsequent generations, individuals will stay in a single population
# In each generation positions of alleles on loci A or B are randomized
# Output is a file identical in columns to the input, but Family.ID will be
# the pop the individual ended up in after hybridization, sex will be all
# male and individual will be named according to the following scheme:
# F1.h.1 (for generation 1, hybrid, individual 1)
# Identifyer for F2 and beyond: F2.GRSC.2 (for generation 2, back cross to pop
# GRSC, and individual 2).
# There is no accomodation for missing values (or the zero used in original data)
# They need to be dealt with before hand.
# Results dataframe
results <- x[1:(frange*n),]
# Check for two populations
if (length(unique(x[,1])) != 2) stop("Not two populations")
# Defining population rows
pop1 <- which(x[,1] == y)
pop2 <- which(x[,1] == z)
# Loop through simulated individuals
for (i in 1:n){
# Pick random allele from A or B
rand.allele <- c(sample(c(T, F), 536, replace = T))
# Don't pick from the first 6 columns (do not contain alleles)
rand.allele[1:6] <- F
# Pick first allele from random individual of population 1
results[i, rand.allele] <- x[sample(pop1,1), rand.allele]
# Chance the excluded first 6 columns before using the whole vector flipped
rand.allele[1:6] <- T
# Pick the other allele from an individual of population 2
results[i, !rand.allele] <- x[sample(pop2,1), !rand.allele]
}
# Second and subsequent generations need a different loop
# Need to split the simulated individuals in half and back cross frange-1 times
# Since the original back cross population doesn't change,
# all generations are done at once
# Loop for simulated individuals
for (i in 1:n){
# Loop for subsequent generations (after the F1 hybridization)
for (j in 1:(frange - 1)){
# randomize which allele will be picked, A or B
rand.allele <- c(sample(c(T, F), 536, replace = T))
# Don't pick from the first 6 columns (do not contain alleles)
rand.allele[1:6] <- F
# Get one half of alleles from the hybrid
results[i+j*n, rand.allele] <- results[i+(j-1)*n, rand.allele]
# Get other half of alleles from the new home population
if (i > n/2) foo <- pop2 else foo <- pop1
# Flip the first six columns before using the whole vector flipped
rand.allele[1:6] <- T
# Get other half of alleles from the new home population
results[i+j*n, !rand.allele] <- x[sample(foo, 1), !rand.allele]
}}
# Add indivdual labels as described above
gen <- paste0("F", 1:frange)
results[,2] <- paste0(rep(gen, each = n), ".", c(rep("h", n), rep(c(y, z),
times = frange - 1, each = n/2)), ".", rep(1:10, frange))
# Also fix up other columns in results as described above
results[,1] <- c(rep("hybrid", n), rep(c(y, z), times = frange - 1, each = n/2))
results[,5] <- 1
# merge to original data and output
foo <- rbind(w, results)
foo
}
################## end of function sim.hybrid() ###############################