-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfunction_NBsimSingleCell.R
More file actions
158 lines (145 loc) · 6.82 KB
/
function_NBsimSingleCell.R
File metadata and controls
158 lines (145 loc) · 6.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
## From zingeR package
## Citation:
## Van den Berge, K., Perraudeau, F., Soneson, C., Love, M. I., Risso, D.,
## Vert, J.-P., Robinson, M. D., Dudoit, S. & Clement, L.
## "Observation weights unlock bulk RNA-seq tools for zero inflation and
## single-cell applications." Genome Biology 19, 24 (2018).
## doi:10.1186/s13059-018-1406-4
## Simulate a zero-inflated negative binomial (NB) single-cell RNA-seq count
## matrix, using feature-level parameters estimated from a real dataset
## (via getDatasetZTNB), optionally adding differential expression (DE)
## between two groups.
##
## Arguments:
##   dataset      count matrix used to estimate simulation parameters
##                (ignored when `params` is supplied).
##   group        factor (or coercible) giving the group of each library;
##                DE is simulated between its first level and the rest.
##   nTags        number of features (genes) to simulate; if NULL, the
##                number of features of the estimated dataset is used.
##   nlibs        number of libraries (cells); defaults to length(group).
##   lib.size     optional vector of library sizes; if NULL, sampled with
##                replacement from the estimated dataset's library sizes.
##   drop.low.lambda, drop.extreme.dispersion
##                passed through to getDatasetZTNB.
##   pUp          probability that a DE feature's fold change favors the
##                first group level (see fcDir below).
##   foldDiff     fold change(s) for DE features (scalar or vector over ind);
##                all-1 means no DE.
##   verbose      print progress messages.
##   ind          indices of the features to make differentially expressed.
##   params       precomputed parameter object from getDatasetZTNB
##                (skips re-estimation).
##   cpm          abundance measure to use: "AveLogCPM" (default) or "aCpm".
##   max.dispersion, min.dispersion
##                bounds applied to the sampled NB dispersions.
##   normalizeLambda
##                if TRUE, rescale the sampled lambdas to sum to 1.
##
## Returns a DGEList-based object containing the simulated `counts`,
## the pre-dropout `counts0`, the dropout indicator matrix `dropout`
## (0 = dropout), DE bookkeeping (`indDE`, `indNonDE`, per-feature
## `foldDiff` as group2/group1), and the sampled simulation parameters.
NBsimSingleCell <- function(dataset, group, nTags = 10000, nlibs = length(group), lib.size = NULL, drop.low.lambda = TRUE, drop.extreme.dispersion = 0.1, pUp=.5, foldDiff=3, verbose=TRUE, ind=NULL, params=NULL, cpm="AveLogCPM", max.dispersion=400, min.dispersion=0.1, normalizeLambda=FALSE)
{
  require(edgeR) # for expandAsMatrix() and the DGEList class
  group = as.factor(group)
  expit = function(x) exp(x)/(1+exp(x)) # inverse logit
  logit = function(x) log(x/(1-x))
  ## sample.fun: draw per-feature (AveLogCPM, lambda, dispersion,
  ## zero-proportion) tuples with replacement from the estimated dataset
  ## parameters, clamp dispersions, and expand lambda/dispersion to
  ## nTags x nlibs matrices.
  sample.fun <- function(object)
  {
    nlibs <- object$nlibs
    nTags <- object$nTags
    AveLogCPM <- object$dataset$dataset.AveLogCPM
    dispersion <- object$dataset$dataset.dispersion
    lambda <- object$dataset$dataset.lambda
    ## BUGFIX: read from the function argument `object`, not from the
    ## enclosing-frame variable `dat` (equivalent at call time, but the
    ## closure should not depend on the caller's local names).
    propZeroGene <- object$dataset$dataset.propZeroGene
    id_r <- sample(length(AveLogCPM), nTags, replace = TRUE)
    object$AveLogCPM <- AveLogCPM[id_r]
    Lambda <- lambda[id_r]
    if(normalizeLambda) Lambda <- Lambda/sum(Lambda) # normalize so they all sum to 1
    Dispersion <- dispersion[id_r]
    ## clamp sampled dispersions to [min.dispersion, max.dispersion]
    Dispersion[Dispersion>max.dispersion] = max.dispersion
    Dispersion[Dispersion<min.dispersion] = min.dispersion
    propZeroGene <- propZeroGene[id_r]
    Lambda <- expandAsMatrix(Lambda, dim = c(nTags, nlibs))
    object$Lambda <- Lambda
    Dispersion <- expandAsMatrix(Dispersion, dim = c(nTags, nlibs))
    object$Dispersion <- Dispersion
    object$propZeroGene <- propZeroGene
    object
  }
  ## diff.fun: introduce differential expression for the features in `ind`
  ## by multiplying the two groups' means by sqrt(foldDiff) in opposite
  ## directions, so the total between-group fold change is foldDiff.
  diff.fun <- function(object)
  {
    group <- object$group
    pUp <- object$pUp
    foldDiff <- object$foldDiff
    Lambda <- object$Lambda
    nTags <- object$nTags
    g <- group == levels(group)[1]
    if(length(ind)>0 & !all(foldDiff==1)) {
      ## fcDir = +1 with probability pUp (fold change favors the first
      ## group level), -1 otherwise.
      fcDir <- sample(c(-1,1), length(ind), prob=c(1-pUp,pUp), replace=TRUE)
      Lambda[ind,g] <- Lambda[ind,g]*exp(log(foldDiff)/2*fcDir)
      Lambda[ind,!g] <- Lambda[ind,!g]*exp(log(foldDiff)/2*(-fcDir))
      object$Lambda <- Lambda
      object$indDE <- ind
      object$indNonDE <- (1:nTags)[-ind]
      foldDiff[fcDir==1] <- 1/foldDiff[fcDir==1]
      object$foldDiff <- foldDiff # group2 / group1
    }
    if(all(foldDiff==1)) object$indDE <- NA
    object
  }
  ## sim.fun: simulate NB counts and superimpose excess ("dropout") zeros
  ## so each gene's expected zero proportion matches the fitted zero model.
  sim.fun <- function(object)
  {
    Lambda <- object$Lambda
    Dispersion <- object$Dispersion
    nTags <- object$nTags
    nlibs <- object$nlibs
    lib.size <- object$lib.size
    ## BUGFIX: read from `object`, not the enclosing-frame `dat`
    ## (equivalent at call time; see sample.fun).
    zeroFit <- object$dataset$dataset.propZeroFit
    propZeroGene <- object$propZeroGene
    ## clamp away from 0/1 so logit() below stays finite
    propZeroGene[propZeroGene==1] <- 1-1e-4
    propZeroGene[propZeroGene==0] <- 1e-4
    design <- object$design
    avLogCpm <- object$AveLogCPM
    mids <- object$dataset$dataset.mids
    breaks <- object$dataset$dataset.breaks
    ## Matrix of zero probabilities: predict on the link scale for every
    ## (gene, library) pair, shift each gene's row so its mean matches the
    ## gene-level observed zero proportion, then back-transform.
    libPredict = rep(log(lib.size), each=length(avLogCpm))
    cpmPredict = rep(avLogCpm, length(lib.size))
    zeroProbMatLink = matrix(predict(zeroFit, newdata=data.frame(logLibHlp=libPredict, midsHlp=cpmPredict), type="link"), byrow=FALSE, ncol=nlibs, nrow=nTags)
    meanDiff = rowMeans(zeroProbMatLink)-logit(propZeroGene)
    zeroProbMat = expit(sweep(zeroProbMatLink,1,meanDiff,"-"))
    ## Simulate negative binomial counts. Genes whose NB zero probability
    ## falls short of the target zero probability get extra zeros injected
    ## ("dropout genes"); their means are inflated first to compensate for
    ## the counts lost to dropout.
    mu = sweep(Lambda,2,lib.size,"*")
    zeroProbNegBin = matrix(dnbinom(0, mu=mu, size=1/Dispersion), nrow=nTags, ncol=nlibs, byrow=FALSE)
    expectedZeroProbablityNegBinomial = rowMeans(zeroProbNegBin)
    dropoutGenes = expectedZeroProbablityNegBinomial < rowMeans(zeroProbMat)
    adjustment = zeroProbMat*mu
    mu[dropoutGenes,] = mu[dropoutGenes,]+adjustment[dropoutGenes,]
    mu[mu<0.1] = 0.1
    counts = matrix(rnbinom(n=nTags*nlibs, mu=mu, size=1/Dispersion), nrow=nTags, ncol=nlibs, byrow=FALSE)
    counts0 = counts # keep the counts before dropout is applied
    ## (the duplicate recomputation of dropoutGenes that was here was
    ## removed; it was identical to the one above and involved no RNG)
    message(paste0("Adding extra zeros w.r.t. NB for ",sum(dropoutGenes)," genes"))
    ## dropout matrix is 0 where a count is zeroed out, 1 elsewhere
    dropoutMatrix = 1-matrix(rbinom(n=nTags*nlibs, size=1, prob=zeroProbMat), nrow=nTags, ncol=nlibs, byrow=FALSE)
    dropoutMatrix[!dropoutGenes,] = 1
    ## avoid features that are dropped out in every library: redraw until
    ## every row keeps at least one non-dropout entry
    allDropoutId <- which(rowSums(dropoutMatrix)==0)
    while(length(allDropoutId)>0 ){
      dropoutMatrix[allDropoutId,] = 1-matrix(rbinom(n=length(allDropoutId)*nlibs, size=1, prob=zeroProbMat[allDropoutId,]), nrow=length(allDropoutId), ncol=nlibs, byrow=FALSE)
      allDropoutId <- which(rowSums(dropoutMatrix)==0)
    }
    ## a count that is already zero is not considered a dropout
    dropoutMatrix[counts==0 & dropoutMatrix==0]=1
    counts = counts*dropoutMatrix
    object$dropout = dropoutMatrix
    ## resample counts for features that ended up all-zero after dropout
    zeroCountsId <- which(rowSums(counts)==0)
    while(length(zeroCountsId)>0 ){
      counts[zeroCountsId,] = matrix(rnbinom(n=length(zeroCountsId)*nlibs, mu=mu[zeroCountsId,], size=1/Dispersion[zeroCountsId,]), nrow=length(zeroCountsId), ncol=nlibs, byrow=FALSE)
      counts0[zeroCountsId,] = counts[zeroCountsId,]
      counts[zeroCountsId,]=counts[zeroCountsId,]*dropoutMatrix[zeroCountsId,]
      zeroCountsId <- which(rowSums(counts)==0)
    }
    ## name features and samples, store results, return
    rownames(counts) <- paste("ids", 1:nTags, sep = "")
    colnames(counts) <- paste("sample",1:nlibs,sep="")
    object$counts <- counts
    object$counts0 <- counts0
    object
  }
  if(verbose) message("Preparing dataset.\n")
  ## estimate feature-level parameters unless precomputed ones were given
  if(is.null(params)){
    dataset <- getDatasetZTNB(counts = dataset, drop.extreme.dispersion = drop.extreme.dispersion, drop.low.lambda = drop.low.lambda)
  } else {
    dataset <- params
  }
  dat <- new("DGEList", list(dataset = dataset, nTags = nTags, lib.size = lib.size, nlibs = nlibs, group = group, design = model.matrix(~group), pUp = pUp, foldDiff = foldDiff))
  if(cpm=="aCpm") dat$dataset$dataset.AveLogCPM = dat$dataset$dataset.aCpm
  if(is.null(dat$lib.size)){
    dat$lib.size <- sample(dataset$dataset.lib.size, nlibs, replace=TRUE)}
  if(is.null(nTags)) dat$nTags <- dat$dataset$dataset.nTags
  if(verbose) message("Sampling.\n")
  dat <- sample.fun(dat)
  if(verbose) message("Calculating differential expression.\n")
  dat <- diff.fun(dat)
  if(verbose) message("Simulating data.\n")
  dat <- sim.fun(dat)
  dat
}