#setwd("C:/Documents and Settings/evjelmerp/My Documents/sc_crows/SIMUL") library(base); library(MASS) datasets <- list.files(pattern="readcounts_") ## Set parameters (shape=shape param for gamma dist of dispersion values; rate= idem but is rate param; logfc = different log fold change values to generate data for): shape <- 0.85 # Inferred for flycatchers: 17 scale <- 0.5 # Inferred for flycatchers: 17 logfc<-c(rep(0,50),rep(0.5,20),rep(1,16),rep(2,8),rep(3,4),rep(4,2)) for (dataset in datasets) { reads <- read.table(paste(dataset),header=T) breads <- reads$breads # Create a vector with the read counts per gene. spA.counts <- data.frame(); spB.counts <- data.frame() # Create empty dataframes that will be filled with generated count data. size <- sample(rgamma(length(breads),shape=shape,scale=scale),length(breads),replace=FALSE) # dispersion parameter print(paste("***starting with",dataset)) readcounts.a <- breads; i <- 0 for (readcount.a in readcounts.a) { # For each readcount in the simulated data, i <- i+1 ## Generate counts for 10 ind according to NB distribution ("Size" is the dispersion, "mu" is the mean): spA <- rnbinom(n=10,size=1/size[i],mu=readcount.a/10) ## Not more counts than what is available should be generated!: # while (sum(spA)>a.value) { spA <- rnbinom(n=10,size=1/size[i],mu=a.value/10) } spA <- c(sum(spA),readcount.a,spA) spA.counts <- rbind(spA.counts,spA) } # Collect the counts. print("readcounts.a done") readcounts.b <- breads; i <- 0 for (readcount.b in readcounts.b) { i <- i+1 ## Same for a second species, taking the desired LogFoldChange into account: lfc <- sample(logfc,1) # Take a random LFC spB <- rnbinom(n=10,size=1/size[i],mu=((readcount.b/(2^lfc))/10)) # while (sum(spB)>a[i]) { spB <- rnbinom(n=10,size=1/size[i],mu=((sum(spA.counts[i,3:12]))/(2^lfc))/10) } spB <- c(sum(spB),spB,lfc,1/size[i]) spB.counts <- rbind(spB.counts,spB) } print("readcounts.b done") ## Generate table with counts: spAB.counts <- cbind(spA.counts,spB.counts) # Create a dataframe with all the counts and some sums. colsA <- vector(); for (i in 1:10) {colsA <- c(colsA,paste("spA_",i,sep=""))} colsB <- vector(); for (i in 1:10) {colsB <- c(colsB,paste("spB_",i,sep=""))} colnames(spAB.counts) <- c("spA.sum","breads",colsA, "spB.sum",colsB,"lfc","dispersion") AdivbyB <- spAB.counts$spA.sum/spAB.counts$spB.sum # This column is to check if the desired LFC has been obtained. spAB.counts <- cbind(AdivbyB,spAB.counts) ## Shuffle such that it is random whether species A or B has the higher count: highlow <- c(rep("high",length(breads)/2),rep("low",length(breads)/2)) # Species A shouldn't always have the high counts, so these lines shuffle A and B. randomhighlow <- sample(highlow,length(breads),replace=FALSE) # Vector as long as the nr of genes, with "high" and "low" in random order. z <- rep(NA,length(breads),sep="") shuffled.counts <- data.frame(i1=z,i2=z,i3=z,i4=z,i5=z,i6=z,i7=z,i8=z,i9=z,i10=z,i11=z,i12=z,i13=z,i14=z,i15=z,i16=z,i17=z,i18=z,i19=z,i20=z) i<-0 for (highOrLow in randomhighlow) { i<-i+1 if ( highOrLow == "high" ) { shuffled.counts[i,1:10] <- spAB.counts[i,4:13]; shuffled.counts[i,11:20] <- spAB.counts[i,15:24] } else { shuffled.counts[i,1:10] <- spAB.counts[i,15:24]; shuffled.counts[i,11:20] <- spAB.counts[i,4:13] } } ## Create final table: bGene <- reads$bGene; btrans <- reads$btrans; LFC <- spAB.counts$lfc; dispersion <- spAB.counts$dispersion shuffled.counts <- cbind(bGene,btrans,LFC,dispersion, shuffled.counts) write.matrix(shuffled.counts,paste("new",dataset,sep="")) }