---
title: "Size Analysis"
author: "Iliana Medina & Emma Sherrat"
date: "7 November 2016"
output: html_document
---

**1.** Script to extract data for **SIZE** analysis

**2.** This first code chooses comparisons of a cuckoo and its host **WITHIN** the same clutch

```{r}
library(ggplot2)
library(lme4)
library(car)

# Folder that holds the input CSV. The former `setwd("")` placeholder stops
# with "cannot change working directory" when left empty, so the file is
# located through an explicit path instead. Edit `data_dir` if the data live
# somewhere other than the current working directory.
data_dir <- "."

# File with egg size (measured as egg length) for each egg.
sizes_eggs <- read.csv(file.path(data_dir, "Average size for random_new.csv"))
if (interactive()) {
  # Spreadsheet viewer is only useful (and only safe) in an interactive session.
  View(sizes_eggs)
}

# Data frame called 'distance' listing all egg size values in a single column
# named "size".
distance <- as.data.frame(cbind(sizes_eggs$size))
colnames(distance) <- c("size")
# Row names are the egg codes: clutch identification plus egg identification
# (C, H1, H2).
rownames(distance) <- sizes_eggs$code

# Matrix of differences in size (i.e., egg length) for every possible paired
# combination of eggs. These differences are called 'distance'.
matrix2 <- as.matrix(dist(distance, upper = TRUE))

# For host eggs number 1 (H1)
# row.ind lists the row names of every first host egg (labelled H1 within each
# clutch). All of these row names have clutch ID and H1 at the end.
row.ind <- grep("H1", rownames(matrix2), value = TRUE)
# sub() swaps the H1 label for C to obtain the code of the cuckoo egg, and
# match() finds the column position of that cuckoo egg in the matrix (NA when
# no such cuckoo egg exists).
col.ind <- match(sub("H1", "C", row.ind), colnames(matrix2))
# Data frame with the size distance between C and H1 from the same clutch.
# Each (host row, cuckoo column) pair is read directly with an index matrix.
# Taking diag() of the full host-by-cuckoo sub-matrix gives the same values
# but breaks when only one host egg matches: the sub-matrix collapses to a
# single number and diag() then returns an identity matrix.
your.df1 <- data.frame(
  sample = colnames(matrix2)[col.ind],
  dist = matrix2[cbind(match(row.ind, rownames(matrix2)), col.ind)]
)
# For host eggs number 2 (H2)
# Same pairing as for H1: find every H2 row, derive the matching cuckoo code by
# replacing H2 with C, and look up that cuckoo's column (NA when absent).
row.ind <- grep("H2", rownames(matrix2), value = TRUE)
col.ind <- match(sub("H2", "C", row.ind), colnames(matrix2))
# Each (host row, cuckoo column) pair is read directly with an index matrix.
# Taking diag() of the full host-by-cuckoo sub-matrix gives the same values
# but breaks when only one host egg matches: the sub-matrix collapses to a
# single number and diag() then returns an identity matrix.
your.df2 <- data.frame(
  sample = colnames(matrix2)[col.ind],
  dist = matrix2[cbind(match(row.ind, rownames(matrix2)), col.ind)]
)

# For host eggs number 3 (H3)
row.ind <- grep("H3", rownames(matrix2), value = TRUE)
col.ind <- match(sub("H3", "C", row.ind), colnames(matrix2))
your.df3 <- data.frame(
  sample = colnames(matrix2)[col.ind],
  dist = matrix2[cbind(match(row.ind, rownames(matrix2)), col.ind)]
)
```

Build dataset combining distances of host-cuckoo within clutches

```{r}
# Combines all distances of cuckoos vs eggs of their real hosts, i.e. all
# cuckoo-to-host egg matches within the same clutch. A cuckoo label appears on
# two rows if two host eggs were present in its clutch. Rows containing NA
# (host eggs with no matching cuckoo egg) are dropped.
real_distances <- na.omit(rbind(your.df1, your.df2, your.df3))

# Attach information on nest type etc.: merge the distances with sizes_eggs,
# matching the "sample" column (cuckoo egg code) against the "code" column
# (clutch identification and egg identification). all.x = TRUE keeps every
# distance row even when no match is found.
dist_ok <- merge(
  real_distances, sizes_eggs,
  by.x = c("sample"), by.y = c("code"),
  all.x = TRUE
)

# Final dataset of distances for analyses and graphs. Single-bracket indexing
# of a data frame selects COLUMNS, so this keeps the first 9 columns.
dist_final <- dist_ok[1:9]

# Variable that combines nest type and cuckoo species, for the graph.
combined <- paste(dist_final$Nest, dist_final$CuckooSp)
dist_final <- cbind(dist_final, combined)
```

**3.** **Graphs**

```{r,fig.width=4, fig.height=4}
# Plots pair-wise differences between all individual cuckoo and host eggs. Each
# box shows the spread of differences in size between a cuckoo egg and each
# individual host egg in its clutch, by nest type and cuckoo species.
p <- ggplot(dist_final, aes(factor(Nest), log(dist))) # creates empty plot
p +
  geom_boxplot(aes(fill = factor(CuckooSp))) +
  geom_jitter() +
  labs(
    x = "Nest type",
    y = "Distance in size between cuckoo egg and host egg (log)"
  )
```

```{r}
library(reshape)
library(stringi)

# Long-format table of the distances between every egg whose code matches
# `cuckoo_tag` and every egg whose code matches `host_tag`.
#
# dist_matrix: square distance matrix with egg codes as row and column names.
# cuckoo_tag, host_tag: patterns passed to grep() to pick the rows (cuckoo
#   eggs) and the columns (host eggs).
# label: value written in the `Name` column of every row.
#
# Returns a data frame with columns X1 (cuckoo egg), X2 (host egg), value
# (distance), Name (label) and Count (running row number). Name and Count are
# sized from the melted table itself, so no row total has to be hard-coded,
# and drop = FALSE keeps the selection a matrix even when a single egg matches.
pairwise_distances <- function(dist_matrix, cuckoo_tag, host_tag, label) {
  cuckoo_rows <- grep(cuckoo_tag, colnames(dist_matrix))
  host_cols <- grep(host_tag, rownames(dist_matrix))
  pairs <- melt(dist_matrix[cuckoo_rows, host_cols, drop = FALSE])
  cbind(
    pairs,
    data.frame(
      Name = rep(label, nrow(pairs)),
      Count = seq_len(nrow(pairs)),
      stringsAsFactors = FALSE
    )
  )
}

# Column called size for distance values based on egg size.
colnames(distance) <- c("size")
# Row names based on code C1.
rownames(distance) <- sizes_eggs$code_c1
# Each distance value between any two eggs (can be same or different clutches)
# as a matrix.
matrix2 <- as.matrix(dist(distance, upper = TRUE))
if (interactive()) {
  # Spreadsheet viewer is only useful (and only safe) in an interactive session.
  View(matrix2)
}

### For first cuckoo species ####
# All combinations of cuckoo eggs ("C1") and eggs of their own hosts ("O1") for
# Cacomantis flabelliformis. Column X1 lists the cuckoo eggs and column X2
# lists the host eggs. Every row is labelled "AGAINST_OWN_closed": the host
# eggs belong to species parasitised by this cuckoo, and these hosts build
# closed nests.
all_dist_C1 <- pairwise_distances(matrix2, "C1", "O1", "AGAINST_OWN_closed")
# All possible combinations of cuckoo eggs from Cacomantis flabelliformis with
# host eggs from species they do not parasitise ("R1"). The column called
# 'Name' lists all these combinations as "RANDOM", as in reality these hosts
# are not parasitised by this cuckoo species.
all_dist_C1_2 <- pairwise_distances(matrix2, "C1", "R1", "RANDOM")

#### For second cuckoo species #####
rownames(distance) <- sizes_eggs$code_c2
# `matrix` (the dist object) is kept because the original script defined it;
# the distances are now computed once and reused for matrix2.
matrix <- dist(distance, upper = TRUE)
matrix2 <- as.matrix(matrix)
# Cuckoo eggs from Cacomantis variolosus ("C2") against eggs of their hosts
# with open nests ("O2"); hosts with closed nests are excluded here. Every row
# is labelled "AGAINST_OWN_open".
all_dist_C2 <- pairwise_distances(matrix2, "C2", "O2", "AGAINST_OWN_open")
# Cuckoo eggs from Cacomantis variolosus ("C2") against eggs of their hosts
# with closed nests ("OB2").
all_dist_C2_3 <- pairwise_distances(matrix2, "C2", "OB2", "AGAINST_OWN_closed")
# Cuckoo eggs from Cacomantis variolosus ("C2") against host eggs not belonging
# to this cuckoo species ("R2").
all_dist_C2_2 <- pairwise_distances(matrix2, "C2", "R2", "RANDOM")

### For third cuckoo species ####
rownames(distance) <- sizes_eggs$code_c3
matrix <- dist(distance, upper = TRUE)
matrix2 <- as.matrix(matrix)
# Cuckoo eggs from Cuculus pallidus ("C3") against eggs of their hosts ("O3").
# All their hosts build open nests.
all_dist_C3 <- pairwise_distances(matrix2, "C3", "O3", "AGAINST_OWN_open")
# Cuckoo eggs from Cuculus pallidus ("C3") against host eggs not belonging to
# this cuckoo species ("R3").
all_dist_C3_2 <- pairwise_distances(matrix2, "C3", "R3", "RANDOM")

#### Combine all the distances ####
# BUT this table is only useful for the RANDOM distances. Get real distances
# from the previous dataset, because this one contains pairs within the same
# cuckoo sp. but from different clutches.
all_dist <- rbind(
  all_dist_C1, all_dist_C1_2,
  all_dist_C2_3, all_dist_C2, all_dist_C2_2,
  all_dist_C3, all_dist_C3_2
)
# Five columns are present; the fifth (Count) used to be left with an NA name
# because only four names were supplied.
colnames(all_dist) <- c("clutch1", "clutch2", "dist", "Name", "Count")
if (interactive()) {
  View(all_dist)
}

# Turn the host egg code in clutch2 into a cuckoo egg code: drop its last four
# characters, remove every "H", then append "C".
# NOTE(review): this assumes the code_c* labels end in a four-character suffix
# and that the result equals a value of sizes_eggs$code - confirm on the data.
all_dist$clutch2 <- as.factor(stri_sub(all_dist$clutch2, 1, -5))
all_dist$clutch2 <- as.factor(gsub("H", "", as.character(all_dist$clutch2)))
all_dist$clutch2 <- as.factor(paste0(all_dist$clutch2, "C"))

# Attach the egg information for clutch1 (matched on code_c1), then for
# clutch2 (matched on code). all.x = TRUE keeps every distance row.
# NOTE(review): the column drops below are positional and therefore depend on
# the exact column layout of sizes_eggs - confirm before changing the CSV.
all_info <- merge(
  all_dist, sizes_eggs,
  by.x = "clutch1", by.y = "code_c1", all.x = TRUE
)
all_info <- all_info[, -(9:16), drop = FALSE]
all_info_c1 <- merge(
  all_info, sizes_eggs,
  by.x = "clutch2", by.y = "code", all.x = TRUE
)
all_info_c1 <- all_info_c1[, -(17:25), drop = FALSE]
# Column 5 is the Count helper column; 9 to 15 are further unused columns.
all_info_c1 <- all_info_c1[, -c(5, 9, 10, 11, 12, 13, 14, 15), drop = FALSE]
colnames(all_info_c1) <- c(
  "clutch2", "clutch1", "dist", "Name",
  "Nest", "CuckooGen", "CuckooSp", "HostSp"
)
```