#set working directory #setwd("C:\\Users\\Jeremiah\\Dropbox\\Campanula mating system\\RAD\\Geography") #Read in modified vcf setwd("C:/Users/Jeremiah/Documents") df <- read.csv("bellflower.vcf", header=T, sep="\t", stringsAsFactors=F) # Drop extraneous columns # ID,REF,ALT,QUAL,FILTER,INFO,FORMAT # Format = CATG. Strip off "GT:DP:" df <- df[,-(3:9)] df$POS <- as.numeric(df$POS) colnames(df)[1] = "CHROM" df$CHROM <- gsub(":", "_", df$CHROM) # Simplify formatting for allele calls df <- as.data.frame(apply(df,2,function(x) sub(".*:","",x)),stringsAsFactors=F) df$POS <- as.numeric(df$POS) rownames(df) = make.names(paste(df$CHROM,df$POS,sep="-"), unique=TRUE) # Get list of all individuals in appalachian mountains # Worth examining robustness of result to all app pops here # change depending on the ancestral range #for western origin analysis appal.pops <- c("AL2012", "AL79", "ALBG","IN46", "KY51", "OH119", "OH64", "PA27", "TN19", "MD5", "WV72", "VA73") #for full analysis #appal.pops <- c("MD5","WV72","VA73") a<-lapply(appal.pops,function(x) grepl(x,colnames(df))) appal.df <- cbind(df[,1:2], df[,which(Reduce("+",a)>0)]) # Decompose combined CATG strings into separate calls string.cutter <- function(string.vector,position) { result <- lapply(string.vector,function(y) strsplit(y, split=",")[[1]][position]) return(as.numeric(result)) } # Make a 3 dimensional array of reads. # Arrays are referenced by [row,column,depth] appal.array.cols <- colnames(appal.df)[-c(1,2)] appal.array.rows <- rownames(appal.df) appal.array.names <- c("C","A","T","G") appal.array.vector <- unlist(lapply(1:4, function (pos) apply(appal.df[,-c(1,2)], 2, function(x) string.cutter(x,pos)))) appal.array.dims <- c(nrow(appal.df),ncol(appal.df)-2,4) appal.array <- array(appal.array.vector,dim=appal.array.dims,dimnames=list(appal.array.rows,appal.array.cols,appal.array.names)) # Clean up rm(appal.array.vector) # Remove any NA's that might result from missing data. appal.array[is.na(appal.array)] <- 0 # Add across rows for all C's, A's, ect. to get four columns # Identify reference allele at each site by majority rule ref <- data.frame(C=apply(appal.array[,,1],1,sum),A=apply(appal.array[,,2],1,sum),T=apply(appal.array[,,3],1,sum),G=apply(appal.array[,,4],1,sum)) ref$major.pos <- max.col(ref, ties.method = "random") ref$major.call <- colnames(ref)[ref$major.pos] # Cull errors by retaining rows with read depth >= 20 ref<-ref[apply(ref[,1:4],1,sum)>20,]