#wouldn't run for all ponds at once, had to do separately #read in genotype data genos <- read.csv("snps_long_filtered.csv", header = T) genos$loc <- paste(genos$CHROM, genos$POS,sep="_") #identifies parental individuals pare <- geno[grepl("Pa", geno$pop),] parents <- unique(pare$pop) #removes grand parents from the genotype file no_pare <- genos[!genos$pop %in% parents,] #create a dataframe that counts the number of individuals represented at each locus tester <- ddply(no_pare, "loc", summarise, nind = length(unique(variable))) #subset out loci with more than 1130 individuals covered, this gives 1799 markers tester2 <- subset(tester, tester$nind > 1130) # use this to cut down all markers to only these high data loci from the original genos file subs <- tester2$loc #subset out only those sites in subs (those with lots of data) geno_parentage <- no_pare[no_pare$loc %in% subs,] #reformat data to work in MasterBayes library(reshape2) test <- dcast(geno_parentage, loc ~ variable, value.var='value') #unmelt data to make it wide test2 <- as.data.frame(t(test[,-1])) #transpose dataframe n <- test$loc #extract column headings names(test2) <- n #makes the extracted headings into the column headings test2$id <- row.names(test2) #make row names into their own column rownames(test2) <- NULL #drop row names test3 <- do.call(cbind, lapply(test2[,1:1799], function(i) do.call(rbind, strsplit(as.character(i), split= '')))) #split genotypes into separate columns test4 <- as.data.frame(cbind(test3,test2[, 1800]))#rebind in the individual (id) column n <- colnames(test2) #extract the column names from the original file n2 <- rep(n, each=2) #duplicate the loci names n3 <- n2[-3600] #remove extra duplicated ID heading names(test4) <- n3 #sets the extracted headings as the column headings #MasterBayes Pedigree Analysis library(MasterBayes) #read in the sex data (phenotype) file for MasterBayes fishP <- read.csv("ind_sex2.csv", header = T) #Pond 5 PP5 <- fishP[grepl("_P5_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP5) # offspring, sex and id are implicit #read in genotype file fishG <- test4 GP5 <- fishG[grepl("_P5_", fishG$id),] GdPfish <- GdataPed(GP5, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond5 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond5 <- cbind(P_Pond5, temp) #Pond 6 PP6 <- fishP[grepl("_P6_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP6) # offspring, sex and id are implicit GP6 <- fishG[grepl("_P6_", fishG$id),] GdPfish <- GdataPed(GP6, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond6 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond6 <- cbind(P_Pond6, temp) #Pond 7 fishG <- test4 PP7 <- fishP[grepl("_P7_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP7) # offspring, sex and id are implicit GP7 <- fishG[grepl("_P7_", fishG$id),] GdPfish <- GdataPed(GP7, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond7 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond7 <- cbind(P_Pond7, temp) #column2 is females #Pond 8 PP8 <- fishP[grepl("_P8_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP8) # offspring, sex and id are implicit GP8 <- fishG[grepl("_P8_", fishG$id),] GdPfish <- GdataPed(GP8, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond8 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond8 <- cbind(P_Pond8, temp) #Pond 10 PP10 <- fishP[grepl("_P10_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP10) # offspring, sex and id are implicit GP10 <- fishG[grepl("_P10_", fishG$id),] GdPfish <- GdataPed(GP10, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond10 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond10 <- cbind(P_Pond10, temp) #Pond 11 PP11 <- fishP[grepl("_P11_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP11) # offspring, sex and id are implicit GP11 <- fishG[grepl("_P11_", fishG$id),] GdPfish <- GdataPed(GP11, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) P_Pond11 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond11 <- cbind(P_Pond11, temp) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) #Pond 12 PP12 <- fishP[grepl("_P12_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP12) # offspring, sex and id are implicit GP12 <- fishG[grepl("_P12_", fishG$id),] GdPfish <- GdataPed(GP12, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond12 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond12 <- cbind(P_Pond12, temp) #Pond 14 PP14 <- fishP[grepl("_P14_", fishP$id),] res1fish <- expression(varPed(x = "offspring", restrict = 0)) PdPfish <- PdataPed(formula = list(res1fish), data = PP14) # offspring, sex and id are implicit GP14 <- fishG[grepl("_P14_", fishG$id),] GdPfish <- GdataPed(GP14, perlocus = FALSE, marker.type = "MSW") #set up design matrix gX <- getXlist(PdPfish, GdPfish, E1=0.02, E2=0.02, mm.tol=40) # Get the maximum likelihood pedigree F2pedigree.ml <- MLE.ped(gX) # The most likely parent combinations for F2's table(F2pedigree.ml$P[,2], F2pedigree.ml$P[,3]) P_Pond14 <- as.data.frame(F2pedigree.ml$P) temp <- as.data.frame(F2pedigree.ml$prob) P_Pond14 <- cbind(P_Pond14, temp) ####################### #Merge all of the P_Pond (pedigree files) ped_merge <- read.csv("ind_sex.csv", header = T) all_ped <- do.call("rbind", list(P_Pond5, P_Pond6, P_Pond7, P_Pond8, P_Pond9, P_Pond10, P_Pond11, P_Pond12, P_Pond13, P_Pond14 )) write.csv(all_ped,"ped_input.csv") #in excel I added in the Pure benthic and limnetic grandparents as the parents of the F1s (except ponds 9 and 13). I also added the pure parents to the end of the file since we have sequence data for them. I also changed the parents to zero if the assignment probability was less tha 0.85 ##### #This is for pursuing markers that are fixed in the parentals #subset out the parents pare <- geno_parentage[grepl("Pa", geno_parentage$pop),] translate <- c("AA" = 2,"CC" = 2, "GG" = 2, "TT"=2, "AC" = 1, "AG" = 1, "AT" = 1, "CG" = 1, "CT" = 1, "CA" = 1,"GC" = 1, "GT" = 1, "GA" = 1, "TA" = 1, "TG" = 1, "TC" = 1) pare$geno <- as.numeric(translate[as.character(pare$value)]) temp <- ddply(pare, "loc", summarise, geno = mean(geno)) table(temp$geno) temp2 <- subset(temp, temp $geno ==2) subs <- temp2 $loc #subset out only those sites with homozygotes homo_pare <- pare[pare $loc %in% subs,] translate <- c("AA" = 1,"CC" = 2, "GG" = 3, "TT"=4) homo_pare$homo <- as.numeric(translate[as.character(homo_pare$value)]) temp <- ddply(homo_pare, "loc", summarise, geno = mean(homo))