Bug fixes for breakTranscriptsOnGenes() and windowAnalysis()

I've run into 2 errors using groHMM, and below is the patch with unit tests to reproduce the issues and the changes to fix both of them.

The errors one encounters are:

1.  `breakTranscriptsOnGenes()` fails when no breaks are found:
    ``` R
    Error in GRangesList(transcriptsBroken) :
      all elements in '...' must be GRanges objects
    ```

2.  `windowAnalysis()` fails when a strand is chosen, but the seqname (i.e. chromosome) has no reads:
    ``` R
    $chrM                                                                                                                                                                                     
    [1] "Error in normalizeDoubleBracketSubscript(i, x, exact = exact, error.if.nomatch = FALSE) : \n  subscript is out of bounds\n"                                                          
    attr(,"class")                                                                                                                                                                            
    [1] "try-error"                                                                                                                                                                           
    attr(,"condition")                                                                                                                                                                        
    <simpleError in normalizeDoubleBracketSubscript(i, x, exact = exact, error.if.nomatch = FALSE): subscript is out of bounds>
    ```

You can save the patch below and apply it to the root directory of the groHMM package using the unix `patch` command shown below (shell output is prefixed with `#>`).  Don't forget to add `R/utils.R` and the files in `tests/` to Bioconductor's subversion repo after applying the patch.

``` shell
pwd
#> /tmp/groHMM
patch -p1 -i /tmp/fix-breakTranscriptsOnGenes-and-windowAnalysis.patch
#> patching file DESCRIPTION
#> patching file R/breakTranscriptsOnGenes.R
#> patching file R/utils.R
#> patching file R/windowAnalysis.R
#> patching file tests/testthat.R
#> patching file tests/testthat/test_gene_boundary_corrections.R
#> patching file tests/testthat/test_utils.R
```

Also feel free to run the new unit test suite:

``` R
install.packages("devtools")
# Make sure you are in the groHMM directory
devtools::test()
```

Patch:

``` diff
diff --git a/DESCRIPTION b/DESCRIPTION
index c0b2349..7a771a2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -8,6 +8,8 @@ Authors@R: c(person("Charles G.", "Danko", role = c("aut"),
                 email="minho.chae@gmail.com"),
             person("Andre", "Martins", role = c("ctb"),
                 email="alm253@cornell.edu"),
+	    person("Pariksheet", "Nanda", role = c("ctb"),
+	        email="pariksheet.nanda@uconn.edu"),
             person("W. Lee", "Kraus", role = c("aut", "cre"),
                 email="lee.kraus@utsouthwestern.edu"))
 Author: Charles G. Danko, Minho Chae, Andre Martins, W. Lee Kraus
@@ -26,6 +28,7 @@ Suggests:
     GenomicFeatures,
     edgeR,
     org.Hs.eg.db,
+    testthat,
     TxDb.Hsapiens.UCSC.hg19.knownGene
 Description: A pipeline for the analysis of GRO-seq data.
 URL: https://github.com/Kraus-Lab/groHMM
@@ -33,3 +36,4 @@ BugReports: https://github.com/Kraus-Lab/groHMM/issues
 License: GPL-3
 biocViews: Sequencing, Software
 LazyLoad: yes
+NeedsCompilation: yes
diff --git a/R/breakTranscriptsOnGenes.R b/R/breakTranscriptsOnGenes.R
index e71870b..41f69ce 100644
--- a/R/breakTranscriptsOnGenes.R
+++ b/R/breakTranscriptsOnGenes.R
@@ -97,6 +97,10 @@ breakInterval <- function(gr, brPos, gap=5, strand="+") {
 #' bPlus <- breakTranscriptsOnGenes(tx, annox, strand="+")
 breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000, 
     threshold=0.8, gap=5, plot=FALSE) {
+    ## Validate inputs
+    tx <- .normArgRanges(tx, errorOnEmpty=TRUE)
+    annox <- .normArgRanges(annox)
+
     tx <- tx[strand(tx) == strand,]
     annox <- annox[strand(annox) == strand,]
     mcols(tx)$status <- "NA"
@@ -147,12 +151,19 @@ breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
         }
     }
 
+    okTrans <- setdiff(1:length(tx), dupTrans)
+
+    if (length(bT) == 0) {
+        message("No join errors detected in transcripts")
+        all <- tx[okTrans, ]
+        return(all[order(as.character(seqnames(all)), start(all)), ])
+    }
+
     mcols(bT)$status <- "broken"
     mcols(bT)$ID <- paste(seqnames(bT), "_", start(bT), strand(bT), sep="")
-    okTrans <- setdiff(1:length(tx), dupTrans)
     all <- c(tx[okTrans,], bT)
-    cat(length(unique(ol.df$trans)), " transcripts are broken into ", 
-        length(bT), "\n")
+    message(length(unique(ol.df$trans)),
+            " transcripts are broken into ", length(bT))
 
     return(all[order(as.character(seqnames(all)), start(all)),])
 }
@@ -183,6 +194,10 @@ breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
 #' combined <- combineTranscripts(tx, annox)
 combineTranscripts <- function(tx, annox, geneSize=1000, threshold=0.8, 
     plot=FALSE) {
+    ## Validate inputs
+    tx <- .normArgRanges(tx, errorOnEmpty=TRUE)
+    annox <- .normArgRanges(annox)
+
     annox <- annox[width(annox) > geneSize,]
     ol <- findOverlaps(tx, annox)
     ol.df <- data.frame(trans=queryHits(ol), gene=subjectHits(ol),
@@ -222,11 +237,18 @@ combineTranscripts <- function(tx, annox, geneSize=1000, threshold=0.8,
         }
     }
 
-    cat(NROW(ol.df), " transcripts are combined to ", NROW(cT), "\n")
+    okTrans <- setdiff(seq_along(tx), ol.df$trans)
+
+    if (length(cT) == 0) {
+        message("No break errors detected in transcripts")
+        all <- tx[okTrans, ]
+        return(all[order(as.character(seqnames(all)), start(all)), ])
+    }
+
+    message(NROW(ol.df), " transcripts are combined to ", NROW(cT))
     mcols(cT)$ID <- paste(seqnames(cT), "_", start(cT), strand(cT), sep="")
     mcols(cT)$status <- "combined"
 
-    okTrans <- setdiff(seq_along(tx), ol.df$trans)
     all <- c(tx[okTrans,], cT)
     return(all[order(as.character(seqnames(all)), start(all)),])
 }
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 0000000..760decd
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,12 @@
+.normArgRanges <- function(ranges_, errorOnEmpty=FALSE) {
+    var_ <- deparse(substitute(ranges_))
+    if (is(ranges_, "GRanges") || is(ranges_, "IRanges")) {
+        if (length(ranges_) == 0)
+            if (!errorOnEmpty)
+                warning(paste0("'", var_, "' is empty"))
+            else
+                stop(paste0("'", var_, "' cannot be empty"))
+        return(ranges_)
+    }
+    stop(paste0("'", var_, "' must be a GRanges or an IRanges object"))
+}
diff --git a/R/windowAnalysis.R b/R/windowAnalysis.R
index e0ba9af..1818fee 100644
--- a/R/windowAnalysis.R
+++ b/R/windowAnalysis.R
@@ -48,6 +48,8 @@
 #' # Fp <- windowAnalysis(S0mR1, strand="+", windowSize=50)
 windowAnalysis <- function(reads, strand="*", windowSize=stepSize, 
     stepSize=windowSize, chrom=NULL, limitPCRDups=FALSE, ...) {
+    reads <- .normArgRanges(reads, errorOnEmpty=TRUE)
+
     if (!(windowSize > 0 & (windowSize <= max(end(reads)))))
         stop("'windowSize' is out of range!")
 
@@ -57,6 +59,7 @@ windowAnalysis <- function(reads, strand="*", windowSize=stepSize,
     if (!is.null(chrom))  
         reads <- reads[seqnames(reads) == chrom,]
 
+    seqlevels(reads) <- seqlevelsInUse(reads)
     readsList <- split(reads, seqnames(reads))
     if (limitPCRDups) {
         warning("Using limitPCRDups assumes all reads are the same size!  
@@ -81,7 +84,6 @@ windowAnalysis <- function(reads, strand="*", windowSize=stepSize,
     })
 
     H <- mclapply(readsList, function(x) {
-            seqlevels(x) <- seqlevelsInUse(x)
             cov <- coverage(x)[[1]]
             to <- (length(cov) %/% windowSize)*windowSize
             starts <- seq(1, to, stepSize)
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..ad01d80
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(groHMM)
+
+test_check("groHMM")
diff --git a/tests/testthat/test_gene_boundary_corrections.R b/tests/testthat/test_gene_boundary_corrections.R
new file mode 100644
index 0000000..506f000
--- /dev/null
+++ b/tests/testthat/test_gene_boundary_corrections.R
@@ -0,0 +1,93 @@
+library(groHMM)
+context("Gene boundary corrections")
+
+## Fixtures
+tx <- GRanges(c("chr7:1000-30000"), strand="+")
+annox <- GRanges(c("chr7:1000-11000",
+                   "chr7:20000-30000"), strand="+")
+txC <- annox
+annoxC <- tx
+numTxBr <- function(annox) length(ranges(breakTranscriptsOnGenes(tx, annox)))
+numTxCb <- function(txC) length(ranges(combineTranscripts(txC, annoxC)))
+
+test_that("Inputs 'tx' and 'annox' are validated", {
+    expect_error(breakTranscriptsOnGenes(NULL, annox),
+                 "'tx'", ignore.case = TRUE)
+    expect_error(breakTranscriptsOnGenes(GRanges(), annox),
+                 "'tx'", ignore.case = TRUE)
+    expect_error(combineTranscripts(NULL, annox),
+                 "'tx'", ignore.case = TRUE)
+    expect_error(combineTranscripts(GRanges(), annox),
+                 "'tx'", ignore.case = TRUE)
+
+    expect_error(breakTranscriptsOnGenes(tx, NULL),
+                 "'annox'", ignore.case = TRUE)
+    expect_warning(breakTranscriptsOnGenes(tx, GRanges()),
+                   "'annox'", ignore.case = TRUE)
+    expect_error(combineTranscripts(tx, NULL),
+                 "'annox'", ignore.case = TRUE)
+    expect_warning(combineTranscripts(tx, GRanges()),
+                   "'annox'", ignore.case = TRUE)
+})
+
+test_that("breakTranscriptsOnGenes returns 'gap' sized IRanges", {
+    gap <- formals(breakTranscriptsOnGenes)$gap
+
+    expect_equal(ranges(breakTranscriptsOnGenes(tx, annox)[1]),
+                 narrow(ranges(GRanges(c("chr7:1000-20000"))),
+                        end=-gap - 1))
+})
+
+test_that("Gene repair functions honor 'geneSize'", {
+    geneSize <- formals(breakTranscriptsOnGenes)$geneSize
+    annoxShortest <- resize(annox, geneSize + 1)
+    annoxTooShort <- narrow(annoxShortest, end=-2)
+
+    expect_equal(numTxBr(annoxShortest), 2)
+    expect_equal(numTxBr(annoxTooShort), 1)
+
+    geneSize <- formals(combineTranscripts)$geneSize
+    txShortest <- resize(txC, geneSize + 1)
+    txTooShort <- narrow(txShortest, end=-2)
+
+    expect_equal(numTxCb(txShortest), 1)
+    expect_equal(numTxCb(txTooShort), 1)
+})
+
+test_that("breakTranscriptsOnGenes honors 'threshold'", {
+    threshold <- formals(breakTranscriptsOnGenes)$threshold
+    annoxMinOverlap <- shift(annox, width(annox) * (1 - threshold))
+    annoxInsuffOverlap <- shift(annox, width(annox) * (1.05 - threshold))
+
+    expect_equal(numTxBr(annoxMinOverlap), 2)
+    ## The second gene now does not overlap with the transcript.
+    expect_equal(numTxBr(annoxInsuffOverlap), 1)
+})
+
+test_that("breakInterval always returns an S4 GRanges object when bounded", {
+    ## Empty objects
+    expect_s4_class(breakInterval(GRanges(), NULL), "GRanges")
+    expect_s4_class(breakInterval(tx, NULL), "GRanges")
+    expect_s4_class(breakInterval(tx, start(annox[2])), "GRanges")
+    expect_equal(length(breakInterval(GRanges(), NULL)), 0)
+    expect_equal(length(breakInterval(tx, NULL)), 1)
+    expect_equal(length(breakInterval(tx, start(annox[2]))), 2)
+})
+
+test_that("breakInterval errors when 'brPos' is out of 'gr' Range", {
+    expect_error(breakInterval(tx, start(tx) - 500))
+    expect_error(breakInterval(tx, end(tx) + 500))
+})
+
+test_that("breakInterval honors 'strand'", {
+    gap <- formals(breakInterval)$gap
+    brPos <- start(annox[2])
+
+    res <- breakInterval(tx, brPos, strand="+")
+    expect_equal(end(res)[1], brPos - gap)
+    expect_equal(start(res)[2], brPos)
+
+    res <- breakInterval(tx, brPos, strand="-")
+    expect_equal(end(res)[1], brPos)
+    expect_equal(start(res)[2], brPos + gap)
+})
diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R
new file mode 100644
index 0000000..fe7d025
--- /dev/null
+++ b/tests/testthat/test_utils.R
@@ -0,0 +1,35 @@
+library(groHMM)
+context("Utilities")
+
+# Fixtures
+reads <- GRanges(c("chr7:1000-30000"), strand="+")
+
+test_that("windowAnalysis returns list", {
+  result <- windowAnalysis(reads = reads, windowSize = 100)
+  expect_is(result, "list")
+  lapply(result, expect_is, "Rle")
+})
+
+test_that("windowAnalysis returns correct Rle value", {
+  expected <- list(chr7 = Rle(c(0, 1, 100), c(9, 1, 290)))
+  result <- windowAnalysis(reads = reads, windowSize = 100)
+  expect_equal(expected, result)
+})
+
+test_that("windowAnalysis throws error with empty GRanges", {
+  reads <- GRanges()
+  expect_error(windowAnalysis(reads = reads, windowSize = 100), "cannot be empty")
+})
+
+test_that("windowAnalysis allows empty seqlevels", {
+  # Add empty seqlevels to reads.
+  require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+  hg19_seqinfo <- seqinfo(TxDb.Hsapiens.UCSC.hg19.knownGene)
+  idx_has <- which(names(hg19_seqinfo) %in% names(seqinfo(reads)))
+  idx <- union(idx_has, 1:length(hg19_seqinfo))
+  seqinfo(reads) <- hg19_seqinfo[names(hg19_seqinfo)[idx]]
+  # Set warnings as errors to catch any mclapply errors.
+  on.exit(options(warn = getOption("warn")))
+  options(warn = 2)
+  expect_is(windowAnalysis(reads = reads, windowSize = 100, strand="-"), "list")
+})

```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Bug fixes for breakTranscriptsOnGenes() and windowAnalysis() #2

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Bug fixes for breakTranscriptsOnGenes() and windowAnalysis() #2

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions