Zuzu/spsimseq_test.R at main · BenjaminATaylor/Zuzu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
library(SPsimSeq)

# load the Zhang bulk RNA-seq data (availabl with the package)
data("zhang.data.sub")
# filter genes with sufficient expression (important step to avoid bugs)
zhang.counts <- zhang.data.sub$counts # Despite the implication of the documentation, these are indeed counts
MYCN.status  <- zhang.data.sub$MYCN.status #The grouping variable

set.seed(123) #Set seed for reproducibility
# simulate data
sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
                          group = MYCN.status, n.genes = 3000, batch.config = 1,
                          group.config = c(0.5, 0.5), tot.samples = ncol(zhang.counts),
                          pDE = 0.1, lfc.thrld = 0.5, result.format = "list", return.details = TRUE)

sim.data.bulk1 <- sim.data.bulk$sim.data.list[[1]]
head(sim.data.bulk1$counts[, seq_len(5)])  # count data

head(sim.data.bulk1$colData)        # sample info
head(sim.data.bulk1$rowData)        # gene info

geneDens = evaluateDensities(sim.data.bulk, newData = rownames(zhang.counts)[1])
#This returns for every sample, the midpoints (mids) and associated densities (gy)

# compare the distributions of the mean expressions, variability,
# and fraction of zero counts per gene
library(LSD) # for generating heatmap plots
# normalize counts for comparison
Y0.log.cpm <- log2(edgeR::cpm(zhang.counts)+1)
Y1.log.cpm <- log2(edgeR::cpm(sim.data.bulk1$counts)+1)
Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
rowVars <- function(X){apply(X, 1, var, na.rm=TRUE)}
rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=log(colSums(zhang.counts)),
             simulated.data=log(sim.data.bulk1$colData$sim.Lib.Size)),
        main="library size")
boxplot(list(real.data=rowMeans(Y0.log.cpm),
             simulated.data=rowMeans(Y1.log.cpm)),
        main="mean expression of genes")
boxplot(list(real.data=rowVars(Y0.log.cpm),
             simulated.data=rowVars(Y1.log.cpm)),
        main="variance of gene expressions")

# compare the relationship between the mean and variability
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowCVs(Y0.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
            colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM",
            ylab="coefficients of variation", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowCVs(Y1.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
            main="SPsimSeq", xlab="mean log2-CPM", ylab="coefficients of variation",
            cexplot=0.5, alpha = 60, colpal="bl2gr2rd", cex.lab=1.25)

n.gride <- 1000
min.g   <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g   <- seq(0, 20, length.out = n.gride+1)[-1]
mid.g   <- (min.g+max.g)/2
f.real  <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x)))
  y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x)))
  y
})

sm1 <- loess(I(f.SPsim-f.real)~mid.g)
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
     cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3)


# compare the correlation between genes and samples
cor.mat.Y0 <- cor(t(Y0.log.cpm))
cor.mat.Y1 <- cor(t(Y1.log.cpm))
cor.vec.Y0 <- cor.mat.Y0[upper.tri(cor.mat.Y0)]
cor.vec.Y1 <- cor.mat.Y1[upper.tri(cor.mat.Y1)]
par(mfrow=c(1,3), mar=c(4,4,3.5,1))
hist(cor.vec.Y0, nclass = 30, probability = TRUE,
     border="gray", col="steelblue1", main="real data", xlab="Genewise correlations",
     ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
hist(cor.vec.Y1, nclass = 30, probability = TRUE, border="gray",
     col="steelblue1",  main="SPsimSeq", xlab="Genewise correlations",
     ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
plot(seq(-1, 1, 0.1), seq(-1, 1, 0.1), type="n", xlab="quantile (real data)",
     ylab="quantile (simulated data)",  main="correlation quantile-quantile plot")
abline(0, 1, col="gray")
points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.001)),
       col="blue", pch=20, cex=1.5, cex.lab=1.25)


#### For our data ####
# To make SPsimSeq work, we need to ensure even sample sizes and remove genes with zero counts
evensample = ncol(counts.tmp) - (ncol(counts.tmp) %% 2)
counts.tmp = counts.true[-c(which(rowSums(counts.true) == 0)),]
spsimseq = SPsimSeq(n.sim = 1,
                    s.data = as.matrix(counts.tmp),
                    group = metadata.true$phenotype,
                    n.genes = nrow(counts.tmp),
                    batch.config = 1,
                    group.config = c(0.5, 0.5),
                    tot.samples = evensample,
                    pDE = 0.1,
                    lfc.thrld = 0.5,
                    result.format = "list",
                    return.details = TRUE)