-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfunction_NBsimSingleCell.R
More file actions
158 lines (145 loc) · 6.82 KB
/
function_NBsimSingleCell.R
File metadata and controls
158 lines (145 loc) · 6.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
## From zingeR package
## Citation:
## Van den Berge, K., Perraudeau, F., Soneson, C., Love, M. I., Risso, D.,
## Vert, J.-P., Robinson, M. D., Dudoit, S. & Clement, L.
## "Observation weights unlock bulk RNA-seq tools for zero inflation and
## single-cell applications." Genome Biology 19, 24 (2018).
## doi:10.1186/s13059-018-1406-4
## Simulate a zero-inflated negative binomial (NB) single-cell RNA-seq count
## matrix, using feature-level parameters estimated from a real dataset
## (via getDatasetZTNB), optionally adding differential expression (DE)
## between two groups.
##
## Arguments:
##   dataset      count matrix used to estimate simulation parameters
##                (ignored when `params` is supplied).
##   group        factor (or coercible) giving the group of each library;
##                DE is simulated between its first level and the rest.
##   nTags        number of features (genes) to simulate; if NULL, the
##                number of features of the estimated dataset is used.
##   nlibs        number of libraries (cells); defaults to length(group).
##   lib.size     optional vector of library sizes; if NULL, sampled with
##                replacement from the estimated dataset's library sizes.
##   drop.low.lambda, drop.extreme.dispersion
##                passed through to getDatasetZTNB.
##   pUp          probability that a DE feature's fold change favors the
##                first group level (see fcDir below).
##   foldDiff     fold change(s) for DE features (scalar or vector over ind);
##                all-1 means no DE.
##   verbose      print progress messages.
##   ind          indices of the features to make differentially expressed.
##   params       precomputed parameter object from getDatasetZTNB
##                (skips re-estimation).
##   cpm          abundance measure to use: "AveLogCPM" (default) or "aCpm".
##   max.dispersion, min.dispersion
##                bounds applied to the sampled NB dispersions.
##   normalizeLambda
##                if TRUE, rescale the sampled lambdas to sum to 1.
##
## Returns a DGEList-based object containing the simulated `counts`,
## the pre-dropout `counts0`, the dropout indicator matrix `dropout`
## (0 = dropout), DE bookkeeping (`indDE`, `indNonDE`, per-feature
## `foldDiff` as group2/group1), and the sampled simulation parameters.
NBsimSingleCell <- function(dataset, group, nTags = 10000, nlibs = length(group), lib.size = NULL, drop.low.lambda = TRUE, drop.extreme.dispersion = 0.1, pUp=.5, foldDiff=3, verbose=TRUE, ind=NULL, params=NULL, cpm="AveLogCPM", max.dispersion=400, min.dispersion=0.1, normalizeLambda=FALSE)
{
  require(edgeR) # for expandAsMatrix() and the DGEList class
  group = as.factor(group)
  expit = function(x) exp(x)/(1+exp(x)) # inverse logit
  logit = function(x) log(x/(1-x))
  ## sample.fun: draw per-feature (AveLogCPM, lambda, dispersion,
  ## zero-proportion) tuples with replacement from the estimated dataset
  ## parameters, clamp dispersions, and expand lambda/dispersion to
  ## nTags x nlibs matrices.
  sample.fun <- function(object)
  {
    nlibs <- object$nlibs
    nTags <- object$nTags
    AveLogCPM <- object$dataset$dataset.AveLogCPM
    dispersion <- object$dataset$dataset.dispersion
    lambda <- object$dataset$dataset.lambda
    ## BUGFIX: read from the function argument `object`, not from the
    ## enclosing-frame variable `dat` (equivalent at call time, but the
    ## closure should not depend on the caller's local names).
    propZeroGene <- object$dataset$dataset.propZeroGene
    id_r <- sample(length(AveLogCPM), nTags, replace = TRUE)
    object$AveLogCPM <- AveLogCPM[id_r]
    Lambda <- lambda[id_r]
    if(normalizeLambda) Lambda <- Lambda/sum(Lambda) # normalize so they all sum to 1
    Dispersion <- dispersion[id_r]
    ## clamp sampled dispersions to [min.dispersion, max.dispersion]
    Dispersion[Dispersion>max.dispersion] = max.dispersion
    Dispersion[Dispersion<min.dispersion] = min.dispersion
    propZeroGene <- propZeroGene[id_r]
    Lambda <- expandAsMatrix(Lambda, dim = c(nTags, nlibs))
    object$Lambda <- Lambda
    Dispersion <- expandAsMatrix(Dispersion, dim = c(nTags, nlibs))
    object$Dispersion <- Dispersion
    object$propZeroGene <- propZeroGene
    object
  }
  ## diff.fun: introduce differential expression for the features in `ind`
  ## by multiplying the two groups' means by sqrt(foldDiff) in opposite
  ## directions, so the total between-group fold change is foldDiff.
  diff.fun <- function(object)
  {
    group <- object$group
    pUp <- object$pUp
    foldDiff <- object$foldDiff
    Lambda <- object$Lambda
    nTags <- object$nTags
    g <- group == levels(group)[1]
    if(length(ind)>0 & !all(foldDiff==1)) {
      ## fcDir = +1 with probability pUp (fold change favors the first
      ## group level), -1 otherwise.
      fcDir <- sample(c(-1,1), length(ind), prob=c(1-pUp,pUp), replace=TRUE)
      Lambda[ind,g] <- Lambda[ind,g]*exp(log(foldDiff)/2*fcDir)
      Lambda[ind,!g] <- Lambda[ind,!g]*exp(log(foldDiff)/2*(-fcDir))
      object$Lambda <- Lambda
      object$indDE <- ind
      object$indNonDE <- (1:nTags)[-ind]
      foldDiff[fcDir==1] <- 1/foldDiff[fcDir==1]
      object$foldDiff <- foldDiff # group2 / group1
    }
    if(all(foldDiff==1)) object$indDE <- NA
    object
  }
  ## sim.fun: simulate NB counts and superimpose excess ("dropout") zeros
  ## so each gene's expected zero proportion matches the fitted zero model.
  sim.fun <- function(object)
  {
    Lambda <- object$Lambda
    Dispersion <- object$Dispersion
    nTags <- object$nTags
    nlibs <- object$nlibs
    lib.size <- object$lib.size
    ## BUGFIX: read from `object`, not the enclosing-frame `dat`
    ## (equivalent at call time; see sample.fun).
    zeroFit <- object$dataset$dataset.propZeroFit
    propZeroGene <- object$propZeroGene
    ## clamp away from 0/1 so logit() below stays finite
    propZeroGene[propZeroGene==1] <- 1-1e-4
    propZeroGene[propZeroGene==0] <- 1e-4
    design <- object$design
    avLogCpm <- object$AveLogCPM
    mids <- object$dataset$dataset.mids
    breaks <- object$dataset$dataset.breaks
    ## Matrix of zero probabilities: predict on the link scale for every
    ## (gene, library) pair, shift each gene's row so its mean matches the
    ## gene-level observed zero proportion, then back-transform.
    libPredict = rep(log(lib.size), each=length(avLogCpm))
    cpmPredict = rep(avLogCpm, length(lib.size))
    zeroProbMatLink = matrix(predict(zeroFit, newdata=data.frame(logLibHlp=libPredict, midsHlp=cpmPredict), type="link"), byrow=FALSE, ncol=nlibs, nrow=nTags)
    meanDiff = rowMeans(zeroProbMatLink)-logit(propZeroGene)
    zeroProbMat = expit(sweep(zeroProbMatLink,1,meanDiff,"-"))
    ## Simulate negative binomial counts. Genes whose NB zero probability
    ## falls short of the target zero probability get extra zeros injected
    ## ("dropout genes"); their means are inflated first to compensate for
    ## the counts lost to dropout.
    mu = sweep(Lambda,2,lib.size,"*")
    zeroProbNegBin = matrix(dnbinom(0, mu=mu, size=1/Dispersion), nrow=nTags, ncol=nlibs, byrow=FALSE)
    expectedZeroProbablityNegBinomial = rowMeans(zeroProbNegBin)
    dropoutGenes = expectedZeroProbablityNegBinomial < rowMeans(zeroProbMat)
    adjustment = zeroProbMat*mu
    mu[dropoutGenes,] = mu[dropoutGenes,]+adjustment[dropoutGenes,]
    mu[mu<0.1] = 0.1
    counts = matrix(rnbinom(n=nTags*nlibs, mu=mu, size=1/Dispersion), nrow=nTags, ncol=nlibs, byrow=FALSE)
    counts0 = counts # keep the counts before dropout is applied
    ## (the duplicate recomputation of dropoutGenes that was here was
    ## removed; it was identical to the one above and involved no RNG)
    message(paste0("Adding extra zeros w.r.t. NB for ",sum(dropoutGenes)," genes"))
    ## dropout matrix is 0 where a count is zeroed out, 1 elsewhere
    dropoutMatrix = 1-matrix(rbinom(n=nTags*nlibs, size=1, prob=zeroProbMat), nrow=nTags, ncol=nlibs, byrow=FALSE)
    dropoutMatrix[!dropoutGenes,] = 1
    ## avoid features that are dropped out in every library: redraw until
    ## every row keeps at least one non-dropout entry
    allDropoutId <- which(rowSums(dropoutMatrix)==0)
    while(length(allDropoutId)>0 ){
      dropoutMatrix[allDropoutId,] = 1-matrix(rbinom(n=length(allDropoutId)*nlibs, size=1, prob=zeroProbMat[allDropoutId,]), nrow=length(allDropoutId), ncol=nlibs, byrow=FALSE)
      allDropoutId <- which(rowSums(dropoutMatrix)==0)
    }
    ## a count that is already zero is not considered a dropout
    dropoutMatrix[counts==0 & dropoutMatrix==0]=1
    counts = counts*dropoutMatrix
    object$dropout = dropoutMatrix
    ## resample counts for features that ended up all-zero after dropout
    zeroCountsId <- which(rowSums(counts)==0)
    while(length(zeroCountsId)>0 ){
      counts[zeroCountsId,] = matrix(rnbinom(n=length(zeroCountsId)*nlibs, mu=mu[zeroCountsId,], size=1/Dispersion[zeroCountsId,]), nrow=length(zeroCountsId), ncol=nlibs, byrow=FALSE)
      counts0[zeroCountsId,] = counts[zeroCountsId,]
      counts[zeroCountsId,]=counts[zeroCountsId,]*dropoutMatrix[zeroCountsId,]
      zeroCountsId <- which(rowSums(counts)==0)
    }
    ## name features and samples, store results, return
    rownames(counts) <- paste("ids", 1:nTags, sep = "")
    colnames(counts) <- paste("sample",1:nlibs,sep="")
    object$counts <- counts
    object$counts0 <- counts0
    object
  }
  if(verbose) message("Preparing dataset.\n")
  ## estimate feature-level parameters unless precomputed ones were given
  if(is.null(params)){
    dataset <- getDatasetZTNB(counts = dataset, drop.extreme.dispersion = drop.extreme.dispersion, drop.low.lambda = drop.low.lambda)
  } else {
    dataset <- params
  }
  dat <- new("DGEList", list(dataset = dataset, nTags = nTags, lib.size = lib.size, nlibs = nlibs, group = group, design = model.matrix(~group), pUp = pUp, foldDiff = foldDiff))
  if(cpm=="aCpm") dat$dataset$dataset.AveLogCPM = dat$dataset$dataset.aCpm
  if(is.null(dat$lib.size)){
    dat$lib.size <- sample(dataset$dataset.lib.size, nlibs, replace=TRUE)}
  if(is.null(nTags)) dat$nTags <- dat$dataset$dataset.nTags
  if(verbose) message("Sampling.\n")
  dat <- sample.fun(dat)
  if(verbose) message("Calculating differential expression.\n")
  dat <- diff.fun(dat)
  if(verbose) message("Simulating data.\n")
  dat <- sim.fun(dat)
  dat
}