I've run into 2 errors using groHMM, and below is the patch with unit tests to reproduce the issues and the changes to fix both of them.
You can save the patch below and apply it to the root directory of the groHMM package using the unix patch command shown below (shell output is prefixed with #>). Don't forget to add R/utils.R and the files in tests/ to Bioconductor's subversion repo after applying the patch.
diff --git a/DESCRIPTION b/DESCRIPTION
index c0b2349..7a771a2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -8,6 +8,8 @@ Authors@R: c(person("Charles G.", "Danko", role = c("aut"),
email="minho.chae@gmail.com"),
person("Andre", "Martins", role = c("ctb"),
email="alm253@cornell.edu"),
+ person("Pariksheet", "Nanda", role = c("ctb"),
+ email="pariksheet.nanda@uconn.edu"),
person("W. Lee", "Kraus", role = c("aut", "cre"),
email="lee.kraus@utsouthwestern.edu"))
Author: Charles G. Danko, Minho Chae, Andre Martins, W. Lee Kraus
@@ -26,6 +28,7 @@ Suggests:
GenomicFeatures,
edgeR,
org.Hs.eg.db,
+ testthat,
TxDb.Hsapiens.UCSC.hg19.knownGene
Description: A pipeline for the analysis of GRO-seq data.
URL: https://github.com/Kraus-Lab/groHMM
@@ -33,3 +36,4 @@ BugReports: https://github.com/Kraus-Lab/groHMM/issues
License: GPL-3
biocViews: Sequencing, Software
LazyLoad: yes
+NeedsCompilation: yes
diff --git a/R/breakTranscriptsOnGenes.R b/R/breakTranscriptsOnGenes.R
index e71870b..41f69ce 100644
--- a/R/breakTranscriptsOnGenes.R
+++ b/R/breakTranscriptsOnGenes.R
@@ -97,6 +97,10 @@ breakInterval <- function(gr, brPos, gap=5, strand="+") {
#' bPlus <- breakTranscriptsOnGenes(tx, annox, strand="+")
breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
threshold=0.8, gap=5, plot=FALSE) {
+ ## Validate inputs
+ tx <- .normArgRanges(tx, errorOnEmpty=TRUE)
+ annox <- .normArgRanges(annox)
+
tx <- tx[strand(tx) == strand,]
annox <- annox[strand(annox) == strand,]
mcols(tx)$status <- "NA"
@@ -147,12 +151,19 @@ breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
}
}
+ okTrans <- setdiff(1:length(tx), dupTrans)
+
+ if (length(bT) == 0) {
+ message("No join errors detected in transcripts")
+ all <- tx[okTrans, ]
+ return(all[order(as.character(seqnames(all)), start(all)), ])
+ }
+
mcols(bT)$status <- "broken"
mcols(bT)$ID <- paste(seqnames(bT), "_", start(bT), strand(bT), sep="")
- okTrans <- setdiff(1:length(tx), dupTrans)
all <- c(tx[okTrans,], bT)
- cat(length(unique(ol.df$trans)), " transcripts are broken into ",
- length(bT), "\n")
+ message(length(unique(ol.df$trans)),
+ " transcripts are broken into ", length(bT))
return(all[order(as.character(seqnames(all)), start(all)),])
}
@@ -183,6 +194,10 @@ breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
#' combined <- combineTranscripts(tx, annox)
combineTranscripts <- function(tx, annox, geneSize=1000, threshold=0.8,
plot=FALSE) {
+ ## Validate inputs
+ tx <- .normArgRanges(tx, errorOnEmpty=TRUE)
+ annox <- .normArgRanges(annox)
+
annox <- annox[width(annox) > geneSize,]
ol <- findOverlaps(tx, annox)
ol.df <- data.frame(trans=queryHits(ol), gene=subjectHits(ol),
@@ -222,11 +237,18 @@ combineTranscripts <- function(tx, annox, geneSize=1000, threshold=0.8,
}
}
- cat(NROW(ol.df), " transcripts are combined to ", NROW(cT), "\n")
+ okTrans <- setdiff(seq_along(tx), ol.df$trans)
+
+ if (length(cT) == 0) {
+ message("No break errors detected in transcripts")
+ all <- tx[okTrans, ]
+ return(all[order(as.character(seqnames(all)), start(all)), ])
+ }
+
+ message(NROW(ol.df), " transcripts are combined to ", NROW(cT))
mcols(cT)$ID <- paste(seqnames(cT), "_", start(cT), strand(cT), sep="")
mcols(cT)$status <- "combined"
- okTrans <- setdiff(seq_along(tx), ol.df$trans)
all <- c(tx[okTrans,], cT)
return(all[order(as.character(seqnames(all)), start(all)),])
}
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 0000000..760decd
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,12 @@
+.normArgRanges <- function(ranges_, errorOnEmpty=FALSE) {
+ var_ <- deparse(substitute(ranges_))
+ if (is(ranges_, "GRanges") || is(ranges_, "IRanges")) {
+ if (length(ranges_) == 0)
+ if (!errorOnEmpty)
+ warning(paste0("'", var_, "' is empty"))
+ else
+ stop(paste0("'", var_, "' cannot be empty"))
+ return(ranges_)
+ }
+ stop(paste0("'", var_, "' must be a GRanges or an IRanges object"))
+}
diff --git a/R/windowAnalysis.R b/R/windowAnalysis.R
index e0ba9af..1818fee 100644
--- a/R/windowAnalysis.R
+++ b/R/windowAnalysis.R
@@ -48,6 +48,8 @@
#' # Fp <- windowAnalysis(S0mR1, strand="+", windowSize=50)
windowAnalysis <- function(reads, strand="*", windowSize=stepSize,
stepSize=windowSize, chrom=NULL, limitPCRDups=FALSE, ...) {
+ reads <- .normArgRanges(reads, errorOnEmpty=TRUE)
+
if (!(windowSize > 0 & (windowSize <= max(end(reads)))))
stop("'windowSize' is out of range!")
@@ -57,6 +59,7 @@ windowAnalysis <- function(reads, strand="*", windowSize=stepSize,
if (!is.null(chrom))
reads <- reads[seqnames(reads) == chrom,]
+ seqlevels(reads) <- seqlevelsInUse(reads)
readsList <- split(reads, seqnames(reads))
if (limitPCRDups) {
warning("Using limitPCRDups assumes all reads are the same size!
@@ -81,7 +84,6 @@ windowAnalysis <- function(reads, strand="*", windowSize=stepSize,
})
H <- mclapply(readsList, function(x) {
- seqlevels(x) <- seqlevelsInUse(x)
cov <- coverage(x)[[1]]
to <- (length(cov) %/% windowSize)*windowSize
starts <- seq(1, to, stepSize)
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..ad01d80
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(groHMM)
+
+test_check("groHMM")
diff --git a/tests/testthat/test_gene_boundary_corrections.R b/tests/testthat/test_gene_boundary_corrections.R
new file mode 100644
index 0000000..506f000
--- /dev/null
+++ b/tests/testthat/test_gene_boundary_corrections.R
@@ -0,0 +1,93 @@
+library(groHMM)
+context("Gene boundary corrections")
+
+## Fixtures
+tx <- GRanges(c("chr7:1000-30000"), strand="+")
+annox <- GRanges(c("chr7:1000-11000",
+ "chr7:20000-30000"), strand="+")
+txC <- annox
+annoxC <- tx
+numTxBr <- function(annox) length(ranges(breakTranscriptsOnGenes(tx, annox)))
+numTxCb <- function(txC) length(ranges(combineTranscripts(txC, annoxC)))
+
+test_that("Inputs 'tx' and 'annox' are validated", {
+ expect_error(breakTranscriptsOnGenes(NULL, annox),
+ "'tx'", ignore.case = TRUE)
+ expect_error(breakTranscriptsOnGenes(GRanges(), annox),
+ "'tx'", ignore.case = TRUE)
+ expect_error(combineTranscripts(NULL, annox),
+ "'tx'", ignore.case = TRUE)
+ expect_error(combineTranscripts(GRanges(), annox),
+ "'tx'", ignore.case = TRUE)
+
+ expect_error(breakTranscriptsOnGenes(tx, NULL),
+ "'annox'", ignore.case = TRUE)
+ expect_warning(breakTranscriptsOnGenes(tx, GRanges()),
+ "'annox'", ignore.case = TRUE)
+ expect_error(combineTranscripts(tx, NULL),
+ "'annox'", ignore.case = TRUE)
+ expect_warning(combineTranscripts(tx, GRanges()),
+ "'annox'", ignore.case = TRUE)
+})
+
+test_that("breakTranscriptsOnGenes returns 'gap' sized IRanges", {
+ gap <- formals(breakTranscriptsOnGenes)$gap
+
+ expect_equal(ranges(breakTranscriptsOnGenes(tx, annox)[1]),
+ narrow(ranges(GRanges(c("chr7:1000-20000"))),
+ end=-gap - 1))
+})
+
+test_that("Gene repair functions honor 'geneSize'", {
+ geneSize <- formals(breakTranscriptsOnGenes)$geneSize
+ annoxShortest <- resize(annox, geneSize + 1)
+ annoxTooShort <- narrow(annoxShortest, end=-2)
+
+ expect_equal(numTxBr(annoxShortest), 2)
+ expect_equal(numTxBr(annoxTooShort), 1)
+
+ geneSize <- formals(combineTranscripts)$geneSize
+ txShortest <- resize(txC, geneSize + 1)
+ txTooShort <- narrow(txShortest, end=-2)
+
+ expect_equal(numTxCb(txShortest), 1)
+ expect_equal(numTxCb(txTooShort), 1)
+})
+
+test_that("breakTranscriptsOnGenes honors 'threshold'", {
+ threshold <- formals(breakTranscriptsOnGenes)$threshold
+ annoxMinOverlap <- shift(annox, width(annox) * (1 - threshold))
+ annoxInsuffOverlap <- shift(annox, width(annox) * (1.05 - threshold))
+
+ expect_equal(numTxBr(annoxMinOverlap), 2)
+ ## The second gene now does not overlap with the transcript.
+ expect_equal(numTxBr(annoxInsuffOverlap), 1)
+})
+
+test_that("breakInterval always returns an S4 GRanges object when bounded", {
+ ## Empty objects
+ expect_s4_class(breakInterval(GRanges(), NULL), "GRanges")
+ expect_s4_class(breakInterval(tx, NULL), "GRanges")
+ expect_s4_class(breakInterval(tx, start(annox[2])), "GRanges")
+ expect_equal(length(breakInterval(GRanges(), NULL)), 0)
+ expect_equal(length(breakInterval(tx, NULL)), 1)
+ expect_equal(length(breakInterval(tx, start(annox[2]))), 2)
+})
+
+test_that("breakInterval errors when 'brPos' is out of 'gr' Range", {
+ expect_error(breakInterval(tx, start(tx) - 500))
+ expect_error(breakInterval(tx, end(tx) + 500))
+})
+
+test_that("breakInterval honors 'strand'", {
+ gap <- formals(breakInterval)$gap
+ brPos <- start(annox[2])
+
+ res <- breakInterval(tx, brPos, strand="+")
+ expect_equal(end(res)[1], brPos - gap)
+ expect_equal(start(res)[2], brPos)
+
+ res <- breakInterval(tx, brPos, strand="-")
+ expect_equal(end(res)[1], brPos)
+ expect_equal(start(res)[2], brPos + gap)
+})
diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R
new file mode 100644
index 0000000..fe7d025
--- /dev/null
+++ b/tests/testthat/test_utils.R
@@ -0,0 +1,35 @@
+library(groHMM)
+context("Utilities")
+
+# Fixtures
+reads <- GRanges(c("chr7:1000-30000"), strand="+")
+
+test_that("windowAnalysis returns list", {
+ result <- windowAnalysis(reads = reads, windowSize = 100)
+ expect_is(result, "list")
+ lapply(result, expect_is, "Rle")
+})
+
+test_that("windowAnalysis returns correct Rle value", {
+ expected <- list(chr7 = Rle(c(0, 1, 100), c(9, 1, 290)))
+ result <- windowAnalysis(reads = reads, windowSize = 100)
+ expect_equal(expected, result)
+})
+
+test_that("windowAnalysis throws error with empty GRanges", {
+ reads <- GRanges()
+ expect_error(windowAnalysis(reads = reads, windowSize = 100), "cannot be empty")
+})
+
+test_that("windowAnalysis allows empty seqlevels", {
+ # Add empty seqlevels to reads.
+ require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+ hg19_seqinfo <- seqinfo(TxDb.Hsapiens.UCSC.hg19.knownGene)
+ idx_has <- which(names(hg19_seqinfo) %in% names(seqinfo(reads)))
+ idx <- union(idx_has, 1:length(hg19_seqinfo))
+ seqinfo(reads) <- hg19_seqinfo[names(hg19_seqinfo)[idx]]
+ # Set warnings as errors to catch any mclapply errors.
+ on.exit(options(warn = getOption("warn")))
+ options(warn = 2)
+ expect_is(windowAnalysis(reads = reads, windowSize = 100, strand="-"), "list")
+})
I've run into 2 errors using groHMM, and below is the patch with unit tests to reproduce the issues and the changes to fix both of them.
The errors one encounters are:
breakTranscriptsOnGenes()fails when no breaks are found:windowAnalysis()fails when a strand is chosen, but the seqname (i.e. chromosome) has no reads:You can save the patch below and apply it to the root directory of the groHMM package using the unix
patchcommand shown below (shell output is prefixed with#>). Don't forget to addR/utils.Rand the files intests/to Bioconductor's subversion repo after applying the patch.Also feel free to run the new unit test suite:
Patch: