#Script to Estimate pairwise correlations library(reshape2) library(dplyr) t <- read.csv("pacific_fst_2017.csv") #min 3 variable sites head(t) colkeep <- c(15,24,25) #keep population indicator, locus and fst columns tt <- t[,colkeep] #subset tt2 <- dcast(tt, lg_bp_group ~ pop_comp, value.var = 'fst') #make wide format tt3 <- tt2[,2:29] # drop locus column, which was used for grouping #Both of these methods for handling missing data work, but they give slightly different results. In the manual it warns that "pairwise.complete.obs, only works with the pearson methods and that it can result in correlation matrices which are not semi-definite". #cor(combined2, use = "complete.obs") #I don't think this is optimal b/c it is dropping a lot of data p <- cor(tt3, use = "pairwise.complete.obs") d <- as.dist(p) #extract this as a distance matrix (get rid of repetition) m <- as.matrix(d) m2 <- melt(m)[melt(upper.tri(m))$value,] head(m2) m2$comp <- paste(m2$Var1, m2$Var2) #indicates populations being compared head(m2) #write.csv(m2,file="cor_out2.csv") #at this point I currated the ecology and geography categories by hand. #this is the resulting file: cor_out3b.csv #check p.values after correction for multiple testing library(Hmisc) tt4 <- as.matrix(tt3) a <- rcorr(tt4, type="pearson") b <- as.data.frame(a[3]) d <- as.dist(b) m <- as.matrix(d) m2 <- melt(m)[melt(upper.tri(m))$value,] head(m2) corrected <- p.adjust(m2$value,method="fdr") min(corrected) max(corrected) #permutation of ecology and geography library(ggplot2) corin <- read.csv("cor_out3b.csv") head(corin) corin$combined <- paste(corin$Ecology, corin$Geography, sep = "_") corin$combined2 <- factor(corin$combined, levels = c(" Diff_Ecology_ Diff_Ocean", " Same_Ecology_ Diff_Ocean", " Diff_Ecology_ Same_Ocean", " Same_Ecology_ Same_Ocean")) #Geography test b <- ggplot(corin, aes(Geography,Correlation)) b + geom_boxplot() boxplot(corin$Correlation ~ corin$combined) rowstat <- c(NA) perm <- 10000 #perm <- 10 #test run emp <- lm(corin$Correlation ~ corin$Geography) empa <- summary(emp) empb <- coef(empa)[, "t value"] empc <- empb[2] for(i in 1:perm){ x <- as.data.frame(sample(corin$Geography, replace=FALSE)) y <- cbind(x, corin$Correlation) z <- lm(y[,2]~ y[,1]) a <- summary(z) b <- coef(a)[, "t value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval ##pvalue= 9.999e-05 #Ecology test b <- ggplot(corin, aes(Ecology,Correlation)) b + geom_boxplot() emp <- lm(corin$Correlation ~ corin$Ecology) empa <- summary(emp) empb <- coef(empa)[, "t value"] empc <- empb[2] rowstat <- c(NA) perm <- 10000 for(i in 1:perm){ x <- as.data.frame(sample(corin$Ecology, replace=FALSE)) y <- cbind(x, corin$Correlation) z <- lm(y[,2]~ y[,1]) a <- summary(z) b <- coef(a)[, "t value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval #Test for an interaction between ecology and geography b <- ggplot(corin, aes(combined,Correlation)) b + geom_boxplot() emp <- lm(corin$Correlation ~ corin$Ecology + corin$Ecology:corin$Geography) empa <- anova(emp) empb <- empa[,"F value"] empc <- empb[2] rowstat <- c(NA) perm <- 10000 for(i in 1:perm){ x <- as.data.frame(sample(corin$Ecology, replace=FALSE)) k <- as.data.frame(sample(corin$Geography, replace=FALSE)) j <- cbind(x,k) y <- cbind(j, corin$Correlation) z <- lm(y[,3]~ y[,1] + (y[,1]):(y[,2])) a <- anova(z) b <- a[,"F value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval #control for geography #Test for an interaction between ecology and geography test <- subset(corin,corin$Geography == " Same_Ocean") b <- ggplot(test, aes(combined,Correlation)) b + geom_boxplot() emp <- lm(test$Correlation ~ test$Ecology) empa <- summary(emp) empb <- coef(empa)[, "t value"] empc <- empb[2] rowstat <- c(NA) perm <- 10000 for(i in 1:perm){ x <- as.data.frame(sample(test$Ecology, replace=FALSE)) y <- cbind(x, test$Correlation) z <- lm(y[,2]~ y[,1]) a <- summary(z) b <- coef(a)[, "t value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval test <- subset(corin,corin$Geography == " Diff_Ocean") b <- ggplot(test, aes(combined,Correlation)) b + geom_boxplot() emp <- lm(test$Correlation ~ test$Ecology) empa <- summary(emp) empb <- coef(empa)[, "t value"] empc <- empb[2] rowstat <- c(NA) perm <- 10000 for(i in 1:perm){ x <- as.data.frame(sample(test$Ecology, replace=FALSE)) y <- cbind(x, test$Correlation) z <- lm(y[,2]~ y[,1]) a <- summary(z) b <- coef(a)[, "t value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval #parapatric vs allo comparison b <- ggplot(corin, aes(APS2,Correlation)) b + geom_boxplot() emp <- lm(corin$Correlation ~ corin$APS2) empa <- summary(emp) empb <- coef(empa)[, "t value"] empc <- empb[2] rowstat <- c(NA) perm <- 10000 for(i in 1:perm){ x <- as.data.frame(sample(corin$APS2, replace=FALSE)) y <- cbind(x, corin$Correlation) z <- lm(y[,2]~ y[,1]) a <- summary(z) b <- coef(a)[, "t value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval #9.999e-05 #within non-allopatric j <- subset(corin, corin$APS != "Allopatric") b <- ggplot(j, aes(APS,Correlation)) b + geom_boxplot() emp <- lm(j$Correlation ~ j$APS) empa <- summary(emp) empb <- coef(empa)[, "t value"] empc <- empb[2] rowstat <- c(NA) perm <- 10000 for(i in 1:perm){ x <- as.data.frame(sample(j$APS, replace=FALSE)) y <- cbind(x, j$Correlation) z <- lm(y[,2]~ y[,1]) a <- summary(z) b <- coef(a)[, "t value"] c1 <- b[2] rowstat[[i]] <- c1 } rowstat2 <-data.frame(rowstat) rowstat3 <- t(rowstat2) final <- cbind(rowstat3,empc) pval_fun<-function(perm,h){(length(perm[perm>=h])+1)/(10001)} pval<-data.frame() for (r in 1:nrow(final)) { perm<-final[r,1:10000] h<-final[r, 10001] #empirical value pvalh<-pval_fun(perm,h) pval<-rbind(pval,pvalh) } pval