From 2c6d338157bd6543b30626ad5e1a08b5ccf131dc Mon Sep 17 00:00:00 2001 From: aviezerl Date: Sun, 12 Apr 2026 19:27:39 +0300 Subject: [PATCH 1/4] fix: direction="below" with bidirect=TRUE now takes max across strands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When direction="below" and bidirect=TRUE, the edit distance was incorrectly taking the minimum across forward and reverse strands. Since a genomic substitution changes both strands simultaneously, disrupting a motif site requires bringing *both* strands below the threshold — so the harder strand (max) should determine the answer. For direction="above", min remains correct: a match on either strand suffices. Also rewrites the edit distance documentation into a dedicated section with clear explanation of direction/strand semantics, parameter reference, and worked examples using the examples DB. --- DESCRIPTION | 2 +- NEWS.md | 5 + R/vtrack.R | 121 ++++++++++- man/gvtrack.create.Rd | 121 ++++++++++- src/PWMEditDistanceScorer.cpp | 188 ++++++++++++------ tests/testthat/test-pwm-edit-distance-below.R | 65 +++++- 6 files changed, 424 insertions(+), 78 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d5f3611d..40f4d1ec 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: misha Title: Toolkit for Analysis of Genomic Data -Version: 5.6.9 +Version: 5.6.10 Authors@R: c( person("Misha", "Hoichman", , "misha@hoichman.com", role = "aut"), person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 124ca7e2..6f315fca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# misha 5.6.10 + +* Fixed `direction="below"` with `bidirect=TRUE` taking min across strands instead of max. A genomic substitution changes both strands, so disrupting a motif requires both strands to fall below the threshold. +* Improved edit distance documentation: dedicated section with direction/strand semantics, parameter reference, and worked examples. + # misha 5.6.9 * Added `k` parameter to `gsynth.train()` to configure the Markov order (1-8, default 5). diff --git a/R/vtrack.R b/R/vtrack.R index ee40546e..32fd257e 100644 --- a/R/vtrack.R +++ b/R/vtrack.R @@ -644,12 +644,70 @@ #' \item \code{score.thresh}: Threshold for \code{pwm.count}. Anchors with log-likelihood >= \code{score.thresh} are counted; only one count per genomic start. #' \item Spatial weighting (\code{spat_factor}, \code{spat_bin}, \code{spat_min}, \code{spat_max}): optional position-dependent weights applied in log-space. Provide a positive numeric vector \code{spat_factor}; \code{spat_bin} (integer > 0) defines bin width; \code{spat_min}/\code{spat_max} restrict the scanning window. #' \item \code{pwm.max.pos}: Positions are reported 1-based relative to the final scan window (after iterator shifts and spatial trimming). Ties resolve to the most 5' anchor; the forward strand wins ties at the same coordinate. Values are signed when \code{bidirect = TRUE} (positive for forward, negative for reverse). -#' \item \code{score.thresh}: For edit distance functions, this is the PWM log-likelihood target that the algorithm tries to reach via substitutions. The edit distance is the minimum number of single-base changes needed for the window to achieve this score. -#' \item \code{max_edits}: For edit distance functions only. Optional positive integer setting the maximum search depth. If reaching the threshold requires more than \code{max_edits} substitutions, NA is returned. When NULL (default), exact computation is used. -#' \item \code{max_indels}: For \code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, and \code{pwm.max.edit_distance} only. Not supported by LSE variants. Optional non-negative integer specifying the maximum number of insertions and deletions allowed (default 0, substitutions only). When > 0, a banded Needleman-Wunsch DP is used to find the minimum total edits (substitutions + indels) to reach the score threshold. Typical values are 1-2. -#' \item \code{score.min}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is below \code{score.min} are skipped (edit distance returns NA). Improves performance by avoiding expensive computation on low-scoring windows. For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). When \code{direction = "below"} and \code{score.min} is NULL, it defaults to \code{score.thresh} — windows already scoring below the threshold trivially need 0 edits, so filtering them out gives the useful semantics of "how many edits to disrupt this motif match". Use \code{score.min = -Inf} to disable this. -#' \item \code{score.max}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is above \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, enables efficient regime-specific queries (e.g., only positions with score in [\code{score.min}, \code{score.max}]). For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). -#' \item \code{direction}: For edit distance functions only. Direction of the edit distance query: \code{"above"} (default) finds the minimum edits to bring the score above \code{score.thresh}; \code{"below"} finds the minimum edits to bring the score below \code{score.thresh}. +#' } +#' +#' \strong{Edit distance notes} +#' +#' The edit distance functions answer "how many single-base changes are needed to reach a target +#' PWM score?" Each genomic window (same length as the PSSM) gets an edit distance: the minimum +#' number of substitutions (and optionally indels) required to bring its PWM log-likelihood score +#' to the target threshold. The virtual track returns the minimum edit distance across all windows +#' in the iterator interval. +#' +#' \strong{Direction.} The \code{direction} parameter controls which side of the threshold to target: +#' \itemize{ +#' \item \code{"above"} (default): minimum edits to make the score \strong{reach or exceed} +#' \code{score.thresh}. Use this to ask "how close is this sequence to becoming a motif match?" +#' A window already scoring above the threshold needs 0 edits. +#' \item \code{"below"}: minimum edits to make the score \strong{fall below} +#' \code{score.thresh}. Use this to ask "how fragile is this motif match — how many mutations +#' would disrupt it?" A window already scoring below the threshold needs 0 edits. +#' } +#' +#' \strong{Strand handling differs by direction.} When \code{bidirect = TRUE}, both strands are +#' scanned at each genomic position. The way their edit distances are combined depends on +#' the direction: +#' \itemize{ +#' \item \code{direction = "above"}: takes the \strong{minimum} across both strands. A motif +#' match on \emph{either} strand is sufficient (e.g., for TF binding), so the easier strand +#' wins. +#' \item \code{direction = "below"}: takes the \strong{maximum} across both strands. A single +#' genomic substitution changes both strands simultaneously (the forward base and its reverse +#' complement). To truly disrupt a binding site you must bring \emph{both} strands below the +#' threshold, so the harder strand determines the answer. For example, if the forward strand +#' needs 3 edits to go below the threshold and the reverse needs 1, the result is 3 — after +#' just 1 edit the forward strand would still have a match. +#' } +#' Then, across all windows in the iterator interval, the minimum is reported (the position +#' where it is easiest to create or disrupt a match). +#' +#' \strong{Parameters:} +#' \itemize{ +#' \item \code{score.thresh}: The target PWM log-likelihood score. Edit distance counts the +#' minimum substitutions needed for a window to cross this threshold (above or below it, +#' depending on \code{direction}). +#' \item \code{max_edits}: Optional positive integer. Cap the search depth: if reaching the +#' threshold requires more than \code{max_edits} edits, NA is returned for that window. +#' Default NULL (exact computation, no cap). +#' \item \code{max_indels}: Optional non-negative integer (default 0). When > 0, allows +#' insertions and deletions in addition to substitutions, using a banded Needleman-Wunsch DP. +#' Typical values are 1-2. Only supported by the non-LSE variants +#' (\code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, \code{pwm.max.edit_distance}). +#' \item \code{score.min}: Optional numeric filter. Windows whose current PWM score is below +#' \code{score.min} are skipped (edit distance returns NA for that window). Default NULL. +#' +#' \strong{Special default for \code{direction = "below"}}: when \code{score.min} is NULL and +#' \code{direction = "below"}, it automatically defaults to \code{score.thresh}. This filters +#' out windows already scoring below the threshold (which would trivially need 0 edits), +#' giving the useful semantics of "how many edits to disrupt this existing match." Set +#' \code{score.min = -Inf} explicitly to disable this filtering. +#' +#' For LSE variants, the filter applies to the aggregate LSE score, not individual windows. +#' \item \code{score.max}: Optional numeric filter. Windows whose current PWM score is above +#' \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, +#' this enables regime-specific queries (e.g., only windows scoring in a given range). +#' Default NULL (no filter). For LSE variants, applies to the aggregate LSE score. +#' \item \code{direction}: \code{"above"} (default) or \code{"below"}. See Direction above. #' } #' #' \strong{Spatial weighting} @@ -875,6 +933,57 @@ #' spat_max = 450 #' ) #' gextract("window_spatial_pwm", gintervals(1, 0, 10000), iterator = 500) +#' +#' # --- Edit distance examples --- +#' # (Using the same 4-position PSSM defined above) +#' +#' # First, check the PWM score distribution to pick sensible thresholds +#' gvtrack.create("pwm_score", NULL, "pwm.max", pssm = pssm) +#' head(gextract("pwm_score", gintervals(1, 500, 520), iterator = 1)) +#' # Scores range from ~-8.3 (poor) to ~-0.8 (strong match) +#' +#' # --- direction = "above": how many edits to CREATE a motif match? --- +#' # "How many substitutions until this window scores >= -5?" +#' gvtrack.create("edist_above", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -5 +#' ) +#' # At 1bp: each position gets its own edit distance (0 if already matching) +#' head(gextract(c("pwm_score", "edist_above"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' +#' # --- direction = "below": how many edits to DISRUPT a motif match? --- +#' # "How many substitutions to push the score below -5?" +#' # By default, windows already below -5 are filtered out (score.min +#' # defaults to score.thresh), so only existing matches are considered. +#' gvtrack.create("edist_below", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -5, direction = "below" +#' ) +#' # Positions where score > -5 get an edit distance; others get NA +#' head(gextract(c("pwm_score", "edist_above", "edist_below"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' +#' # --- max_edits: cap the search depth --- +#' # Return NA if more than 2 substitutions are needed +#' gvtrack.create("edist_max2", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -5, max_edits = 2 +#' ) +#' # Positions needing 3+ edits now return NA instead of 3 +#' head(gextract(c("pwm_score", "edist_above", "edist_max2"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' +#' # --- Aggregation over intervals --- +#' # With a larger iterator, the minimum edit distance across the +#' # interval is returned. +#' gextract(c("pwm_score", "edist_above", "edist_below"), +#' gintervals(1, 0, 2000), +#' iterator = 200 +#' ) #' @seealso \code{\link{gvtrack.iterator}}, \code{\link{gvtrack.iterator.2d}}, \code{\link{gvtrack.filter}} #' @export gvtrack.create gvtrack.create <- function(vtrack = NULL, src = NULL, func = NULL, params = NULL, dim = NULL, sshift = NULL, eshift = NULL, filter = NULL, ...) { diff --git a/man/gvtrack.create.Rd b/man/gvtrack.create.Rd index 7d2827b1..022238cd 100644 --- a/man/gvtrack.create.Rd +++ b/man/gvtrack.create.Rd @@ -182,12 +182,70 @@ The sections below provide additional notes for motif, interval, k-mer, and mask \item \code{score.thresh}: Threshold for \code{pwm.count}. Anchors with log-likelihood >= \code{score.thresh} are counted; only one count per genomic start. \item Spatial weighting (\code{spat_factor}, \code{spat_bin}, \code{spat_min}, \code{spat_max}): optional position-dependent weights applied in log-space. Provide a positive numeric vector \code{spat_factor}; \code{spat_bin} (integer > 0) defines bin width; \code{spat_min}/\code{spat_max} restrict the scanning window. \item \code{pwm.max.pos}: Positions are reported 1-based relative to the final scan window (after iterator shifts and spatial trimming). Ties resolve to the most 5' anchor; the forward strand wins ties at the same coordinate. Values are signed when \code{bidirect = TRUE} (positive for forward, negative for reverse). - \item \code{score.thresh}: For edit distance functions, this is the PWM log-likelihood target that the algorithm tries to reach via substitutions. The edit distance is the minimum number of single-base changes needed for the window to achieve this score. - \item \code{max_edits}: For edit distance functions only. Optional positive integer setting the maximum search depth. If reaching the threshold requires more than \code{max_edits} substitutions, NA is returned. When NULL (default), exact computation is used. - \item \code{max_indels}: For \code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, and \code{pwm.max.edit_distance} only. Not supported by LSE variants. Optional non-negative integer specifying the maximum number of insertions and deletions allowed (default 0, substitutions only). When > 0, a banded Needleman-Wunsch DP is used to find the minimum total edits (substitutions + indels) to reach the score threshold. Typical values are 1-2. - \item \code{score.min}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is below \code{score.min} are skipped (edit distance returns NA). Improves performance by avoiding expensive computation on low-scoring windows. For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). When \code{direction = "below"} and \code{score.min} is NULL, it defaults to \code{score.thresh} — windows already scoring below the threshold trivially need 0 edits, so filtering them out gives the useful semantics of "how many edits to disrupt this motif match". Use \code{score.min = -Inf} to disable this. - \item \code{score.max}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is above \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, enables efficient regime-specific queries (e.g., only positions with score in [\code{score.min}, \code{score.max}]). For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). - \item \code{direction}: For edit distance functions only. Direction of the edit distance query: \code{"above"} (default) finds the minimum edits to bring the score above \code{score.thresh}; \code{"below"} finds the minimum edits to bring the score below \code{score.thresh}. +} + +\strong{Edit distance notes} + +The edit distance functions answer "how many single-base changes are needed to reach a target +PWM score?" Each genomic window (same length as the PSSM) gets an edit distance: the minimum +number of substitutions (and optionally indels) required to bring its PWM log-likelihood score +to the target threshold. The virtual track returns the minimum edit distance across all windows +in the iterator interval. + +\strong{Direction.} The \code{direction} parameter controls which side of the threshold to target: +\itemize{ + \item \code{"above"} (default): minimum edits to make the score \strong{reach or exceed} + \code{score.thresh}. Use this to ask "how close is this sequence to becoming a motif match?" + A window already scoring above the threshold needs 0 edits. + \item \code{"below"}: minimum edits to make the score \strong{fall below} + \code{score.thresh}. Use this to ask "how fragile is this motif match — how many mutations + would disrupt it?" A window already scoring below the threshold needs 0 edits. +} + +\strong{Strand handling differs by direction.} When \code{bidirect = TRUE}, both strands are +scanned at each genomic position. The way their edit distances are combined depends on +the direction: +\itemize{ + \item \code{direction = "above"}: takes the \strong{minimum} across both strands. A motif + match on \emph{either} strand is sufficient (e.g., for TF binding), so the easier strand + wins. + \item \code{direction = "below"}: takes the \strong{maximum} across both strands. A single + genomic substitution changes both strands simultaneously (the forward base and its reverse + complement). To truly disrupt a binding site you must bring \emph{both} strands below the + threshold, so the harder strand determines the answer. For example, if the forward strand + needs 3 edits to go below the threshold and the reverse needs 1, the result is 3 — after + just 1 edit the forward strand would still have a match. +} +Then, across all windows in the iterator interval, the minimum is reported (the position +where it is easiest to create or disrupt a match). + +\strong{Parameters:} +\itemize{ + \item \code{score.thresh}: The target PWM log-likelihood score. Edit distance counts the + minimum substitutions needed for a window to cross this threshold (above or below it, + depending on \code{direction}). + \item \code{max_edits}: Optional positive integer. Cap the search depth: if reaching the + threshold requires more than \code{max_edits} edits, NA is returned for that window. + Default NULL (exact computation, no cap). + \item \code{max_indels}: Optional non-negative integer (default 0). When > 0, allows + insertions and deletions in addition to substitutions, using a banded Needleman-Wunsch DP. + Typical values are 1-2. Only supported by the non-LSE variants + (\code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, \code{pwm.max.edit_distance}). + \item \code{score.min}: Optional numeric filter. Windows whose current PWM score is below + \code{score.min} are skipped (edit distance returns NA for that window). Default NULL. + + \strong{Special default for \code{direction = "below"}}: when \code{score.min} is NULL and + \code{direction = "below"}, it automatically defaults to \code{score.thresh}. This filters + out windows already scoring below the threshold (which would trivially need 0 edits), + giving the useful semantics of "how many edits to disrupt this existing match." Set + \code{score.min = -Inf} explicitly to disable this filtering. + + For LSE variants, the filter applies to the aggregate LSE score, not individual windows. + \item \code{score.max}: Optional numeric filter. Windows whose current PWM score is above + \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, + this enables regime-specific queries (e.g., only windows scoring in a given range). + Default NULL (no filter). For LSE variants, applies to the aggregate LSE score. + \item \code{direction}: \code{"above"} (default) or \code{"below"}. See Direction above. } \strong{Spatial weighting} @@ -398,6 +456,57 @@ gvtrack.create( spat_max = 450 ) gextract("window_spatial_pwm", gintervals(1, 0, 10000), iterator = 500) + +# --- Edit distance examples --- +# (Using the same 4-position PSSM defined above) + +# First, check the PWM score distribution to pick sensible thresholds +gvtrack.create("pwm_score", NULL, "pwm.max", pssm = pssm) +head(gextract("pwm_score", gintervals(1, 500, 520), iterator = 1)) +# Scores range from ~-8.3 (poor) to ~-0.8 (strong match) + +# --- direction = "above": how many edits to CREATE a motif match? --- +# "How many substitutions until this window scores >= -5?" +gvtrack.create("edist_above", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -5 +) +# At 1bp: each position gets its own edit distance (0 if already matching) +head(gextract(c("pwm_score", "edist_above"), + gintervals(1, 500, 520), + iterator = 1 +)) + +# --- direction = "below": how many edits to DISRUPT a motif match? --- +# "How many substitutions to push the score below -5?" +# By default, windows already below -5 are filtered out (score.min +# defaults to score.thresh), so only existing matches are considered. +gvtrack.create("edist_below", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -5, direction = "below" +) +# Positions where score > -5 get an edit distance; others get NA +head(gextract(c("pwm_score", "edist_above", "edist_below"), + gintervals(1, 500, 520), + iterator = 1 +)) + +# --- max_edits: cap the search depth --- +# Return NA if more than 2 substitutions are needed +gvtrack.create("edist_max2", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -5, max_edits = 2 +) +# Positions needing 3+ edits now return NA instead of 3 +head(gextract(c("pwm_score", "edist_above", "edist_max2"), + gintervals(1, 500, 520), + iterator = 1 +)) + +# --- Aggregation over intervals --- +# With a larger iterator, the minimum edit distance across the +# interval is returned. +gextract(c("pwm_score", "edist_above", "edist_below"), + gintervals(1, 0, 2000), + iterator = 200 +) } \seealso{ \code{\link{gvtrack.info}}, \code{\link{gvtrack.iterator}}, diff --git a/src/PWMEditDistanceScorer.cpp b/src/PWMEditDistanceScorer.cpp index 2a0337a9..fdff818d 100644 --- a/src/PWMEditDistanceScorer.cpp +++ b/src/PWMEditDistanceScorer.cpp @@ -723,6 +723,13 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const const bool has_score_filter = has_score_min || has_score_max; const bool apply_score_filter = has_score_filter && (m_max_indels == 0); + // For direction=BELOW with bidirectional scanning: a genomic substitution + // affects both strands, so we need enough edits to bring the *harder* strand + // below the threshold. Take max across strands at each position, then min + // across positions. (For ABOVE, min across strands is correct: any one strand + // reaching the threshold suffices.) + const bool below_bidirect = is_below() && scan_forward && scan_reverse && need_min; + const size_t max_start = std::min(interval_length, target_length - motif_length + 1); if (max_start == 0 || (!scan_forward && !scan_reverse)) { return metrics; @@ -808,91 +815,146 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const // directly, avoiding two function calls (compute_window_edits → passes_prefilter) // for the 90%+ of windows that get rejected. For subs-only, shift is always 0, // so the inner loop is tight: just B bidx lookups + 1 viable table read per block. - if (scan_forward) { - if (need_min) { + if (below_bidirect) { + // direction=BELOW + bidirectional: compute both strands, take max + // (prefilter is always off for BELOW, so no pigeonhole check needed) + float fwd_edits = std::numeric_limits::quiet_NaN(); + { bool pass_score_filter = true; if (apply_score_filter) { float logp = compute_window_pwm_score(window_start, /*reverse=*/false); if (has_score_min && logp < m_score_min) pass_score_filter = false; if (has_score_max && logp > m_score_max) pass_score_filter = false; } - if (pass_score_filter) { - const int* bidx = fwd_bidx.data() + offset; - bool prefilter_pass = true; - if (m_use_prefilter && m_max_indels == 0) { - prefilter_pass = false; - for (const auto& block : m_prefilter_blocks) { - int block_len = (int)block.columns.size(); - int hash = 0; - bool valid = true; - for (int j = 0; j < block_len; j++) { - int b = bidx[block.columns[j]]; - if (b >= 4) { valid = false; break; } - hash += b << (2 * j); - } - if (valid && block.viable[hash]) { - prefilter_pass = true; - break; - } - } - } - if (prefilter_pass) { - float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/false); - maybe_update_min(edits, offset, +1); - } + fwd_edits = compute_window_edits(fwd_bidx.data() + offset, seq_avail, /*reverse=*/false); } } - if (need_pwm) { - float logp = 0.0f; - std::string::const_iterator it = seq.begin() + offset; - m_pssm.calc_like(it, logp); - maybe_update_pwm(logp, offset, +1); - } - } - if (scan_reverse) { - if (need_min) { + float rev_edits = std::numeric_limits::quiet_NaN(); + { bool pass_score_filter = true; if (apply_score_filter) { float logp = compute_window_pwm_score(window_start, /*reverse=*/true); if (has_score_min && logp < m_score_min) pass_score_filter = false; if (has_score_max && logp > m_score_max) pass_score_filter = false; } - if (pass_score_filter) { - const int* bidx = rev_bidx.data() + offset; - bool prefilter_pass = true; - if (m_use_prefilter && m_max_indels == 0) { - prefilter_pass = false; - const int L_val = static_cast(motif_length); - for (const auto& block : m_prefilter_blocks) { - int block_len = (int)block.columns.size(); - int hash = 0; - bool valid = true; - for (int j = 0; j < block_len; j++) { - int seq_idx = L_val - 1 - block.columns[j]; - int b = bidx[seq_idx]; - if (b >= 4) { valid = false; break; } - hash += b << (2 * j); - } - if (valid && block.viable[hash]) { - prefilter_pass = true; - break; + rev_edits = compute_window_edits(rev_bidx.data() + offset, seq_avail, /*reverse=*/true); + } + } + + // Combine: max of non-NaN values (NaN = strand filtered out / no match) + bool have_fwd = !std::isnan(fwd_edits); + bool have_rev = !std::isnan(rev_edits); + if (have_fwd || have_rev) { + float combined; + int combined_dir; + if (have_fwd && have_rev) { + // Both strands have matches — take the harder one to disrupt + if (fwd_edits >= rev_edits) { + combined = fwd_edits; + combined_dir = +1; + } else { + combined = rev_edits; + combined_dir = -1; + } + } else if (have_fwd) { + combined = fwd_edits; + combined_dir = +1; + } else { + combined = rev_edits; + combined_dir = -1; + } + maybe_update_min(combined, offset, combined_dir); + } + } else if (scan_forward || scan_reverse) { + if (scan_forward) { + if (need_min) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + + if (pass_score_filter) { + const int* bidx = fwd_bidx.data() + offset; + bool prefilter_pass = true; + if (m_use_prefilter && m_max_indels == 0) { + prefilter_pass = false; + for (const auto& block : m_prefilter_blocks) { + int block_len = (int)block.columns.size(); + int hash = 0; + bool valid = true; + for (int j = 0; j < block_len; j++) { + int b = bidx[block.columns[j]]; + if (b >= 4) { valid = false; break; } + hash += b << (2 * j); + } + if (valid && block.viable[hash]) { + prefilter_pass = true; + break; + } } } + if (prefilter_pass) { + float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/false); + maybe_update_min(edits, offset, +1); + } } - if (prefilter_pass) { - float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/true); - maybe_update_min(edits, offset, -1); - } + } + if (need_pwm) { + float logp = 0.0f; + std::string::const_iterator it = seq.begin() + offset; + m_pssm.calc_like(it, logp); + maybe_update_pwm(logp, offset, +1); } } - if (need_pwm) { - float logp_rc = 0.0f; - std::string::const_iterator it2 = seq.begin() + offset; - m_pssm.calc_like_rc(it2, logp_rc); - maybe_update_pwm(logp_rc, offset, -1); + + if (scan_reverse) { + if (need_min) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + + if (pass_score_filter) { + const int* bidx = rev_bidx.data() + offset; + bool prefilter_pass = true; + if (m_use_prefilter && m_max_indels == 0) { + prefilter_pass = false; + const int L_val = static_cast(motif_length); + for (const auto& block : m_prefilter_blocks) { + int block_len = (int)block.columns.size(); + int hash = 0; + bool valid = true; + for (int j = 0; j < block_len; j++) { + int seq_idx = L_val - 1 - block.columns[j]; + int b = bidx[seq_idx]; + if (b >= 4) { valid = false; break; } + hash += b << (2 * j); + } + if (valid && block.viable[hash]) { + prefilter_pass = true; + break; + } + } + } + if (prefilter_pass) { + float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/true); + maybe_update_min(edits, offset, -1); + } + } + } + if (need_pwm) { + float logp_rc = 0.0f; + std::string::const_iterator it2 = seq.begin() + offset; + m_pssm.calc_like_rc(it2, logp_rc); + maybe_update_pwm(logp_rc, offset, -1); + } } } } diff --git a/tests/testthat/test-pwm-edit-distance-below.R b/tests/testthat/test-pwm-edit-distance-below.R index 27764a64..c40575d7 100644 --- a/tests/testthat/test-pwm-edit-distance-below.R +++ b/tests/testthat/test-pwm-edit-distance-below.R @@ -226,13 +226,15 @@ test_that("pwm.edit_distance direction=below bidirectional considers both strand iterator = test_interval ) - # Bidirectional should return minimum of both strands + # For direction=below, bidirectional takes max across strands at each + # position (need to disrupt *both* strands), then min across positions. + # This gives bidi >= max(fwd, rev) in general. fwd <- result$edist_below_fwd[1] rev <- result$edist_below_rev[1] bidi <- result$edist_below_bidi[1] if (!is.na(fwd) && !is.na(rev)) { - expect_equal(bidi, min(fwd, rev), tolerance = 1e-6) + expect_true(bidi >= max(fwd, rev) - 1e-6) } else if (!is.na(fwd)) { expect_equal(bidi, fwd, tolerance = 1e-6) } else if (!is.na(rev)) { @@ -240,6 +242,65 @@ test_that("pwm.edit_distance direction=below bidirectional considers both strand } }) +test_that("pwm.edit_distance direction=below bidirectional takes max across strands per position", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + # Use a wider interval with 1bp iterator to check per-position behavior + test_interval <- gintervals(1, 200, 260) + threshold <- -5.0 + + gvtrack.create("edist_below_fwd_1bp", NULL, + func = "pwm.edit_distance", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = 1, extend = FALSE, prior = 0 + ) + gvtrack.create("edist_below_rev_1bp", NULL, + func = "pwm.edit_distance", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = -1, extend = FALSE, prior = 0 + ) + gvtrack.create("edist_below_bidi_1bp", NULL, + func = "pwm.edit_distance", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = TRUE, extend = FALSE, prior = 0 + ) + + result <- gextract( + c("edist_below_fwd_1bp", "edist_below_rev_1bp", "edist_below_bidi_1bp"), + test_interval, + iterator = 1 + ) + + # At each 1bp position: bidi should be max of fwd and rev (per-position) + # when both are non-NA, and equal to the non-NA one when only one is present + for (i in seq_len(nrow(result))) { + fwd <- result$edist_below_fwd_1bp[i] + rev <- result$edist_below_rev_1bp[i] + bidi <- result$edist_below_bidi_1bp[i] + + if (!is.na(fwd) && !is.na(rev)) { + expect_equal(bidi, max(fwd, rev), + tolerance = 1e-6, + info = paste("position", i, "fwd=", fwd, "rev=", rev, "bidi=", bidi) + ) + } else if (!is.na(fwd)) { + expect_equal(bidi, fwd, tolerance = 1e-6) + } else if (!is.na(rev)) { + expect_equal(bidi, rev, tolerance = 1e-6) + } else { + expect_true(is.na(bidi)) + } + } +}) + test_that("pwm.edit_distance.pos and pwm.max.edit_distance work with direction=below", { remove_all_vtracks() From a1b1b5dd4f453e5db91a2d299d189bbd0039cfa0 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Sun, 12 Apr 2026 22:40:22 +0300 Subject: [PATCH 2/4] fix: remove hidden score.min default for direction="below" Previously, direction="below" silently set score.min = score.thresh when score.min was NULL. This hidden default was a footgun: users who pre-screened for strong matches and then ran edit distance would get unexpected NAs without realizing score.min was being set behind their back. score.min now defaults to NULL (no filter) for both directions, matching score.max behavior. Users who want filtering must set score.min explicitly. Also updates docs to clearly explain score.min vs score.thresh: score.min is a pre-filter (which windows to evaluate), score.thresh is the target (what score to reach). --- NEWS.md | 1 + R/vtrack.R | 78 ++++++++++--------- man/gvtrack.create.Rd | 65 ++++++++++------ tests/testthat/test-pwm-edit-distance-below.R | 20 ++--- 4 files changed, 95 insertions(+), 69 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6f315fca..c364389e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # misha 5.6.10 * Fixed `direction="below"` with `bidirect=TRUE` taking min across strands instead of max. A genomic substitution changes both strands, so disrupting a motif requires both strands to fall below the threshold. +* **Breaking**: removed the hidden default that set `score.min = score.thresh` when `direction = "below"`. `score.min` now defaults to NULL (no filter) for both directions. Set `score.min` explicitly to filter windows. * Improved edit distance documentation: dedicated section with direction/strand semantics, parameter reference, and worked examples. # misha 5.6.9 diff --git a/R/vtrack.R b/R/vtrack.R index 32fd257e..0a47bb1f 100644 --- a/R/vtrack.R +++ b/R/vtrack.R @@ -332,13 +332,6 @@ } } - # For direction="below", default score.min to score.thresh: windows already - # scoring below the threshold trivially need 0 edits and would dominate the - # minimum, making the result always 0. - if (direction == "below" && is.null(score.min)) { - score.min <- score.thresh - } - # Set params with processed values list( pssm = pssm, @@ -433,12 +426,6 @@ } } - # For direction="below", default score.min to score.thresh (see comment - # in .vtrack_params_pwm_edit_distance for rationale). - if (direction == "below" && is.null(score.min)) { - score.min <- score.thresh - } - # Set params with processed values list( pssm = pssm, @@ -683,9 +670,22 @@ #' #' \strong{Parameters:} #' \itemize{ -#' \item \code{score.thresh}: The target PWM log-likelihood score. Edit distance counts the -#' minimum substitutions needed for a window to cross this threshold (above or below it, -#' depending on \code{direction}). +#' \item \code{score.thresh}: The target PWM log-likelihood score that the edit distance +#' algorithm tries to reach. For \code{direction = "above"}, this is the score to reach +#' or exceed; for \code{direction = "below"}, this is the score to fall below. +#' \item \code{score.min}, \code{score.max}: Optional numeric pre-filters on the \strong{current} +#' (unedited) PWM score. Windows scoring outside the [\code{score.min}, \code{score.max}] +#' range are skipped (edit distance returns NA). Default NULL (no filter). +#' These are independent of \code{score.thresh} — they control \emph{which windows to +#' evaluate}, not the target score. +#' +#' \strong{Typical usage with \code{direction = "below"}}: use \code{score.min} to restrict +#' to windows that currently have a motif match. For example, with \code{score.thresh = -18} +#' and \code{score.min = -14}, only windows scoring >= -14 are considered, and the edit +#' distance tells you how many edits to push each one below -18. Without \code{score.min}, +#' windows already scoring below -18 will return 0 edits. +#' +#' For LSE variants, the filters apply to the aggregate LSE score, not individual windows. #' \item \code{max_edits}: Optional positive integer. Cap the search depth: if reaching the #' threshold requires more than \code{max_edits} edits, NA is returned for that window. #' Default NULL (exact computation, no cap). @@ -693,20 +693,6 @@ #' insertions and deletions in addition to substitutions, using a banded Needleman-Wunsch DP. #' Typical values are 1-2. Only supported by the non-LSE variants #' (\code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, \code{pwm.max.edit_distance}). -#' \item \code{score.min}: Optional numeric filter. Windows whose current PWM score is below -#' \code{score.min} are skipped (edit distance returns NA for that window). Default NULL. -#' -#' \strong{Special default for \code{direction = "below"}}: when \code{score.min} is NULL and -#' \code{direction = "below"}, it automatically defaults to \code{score.thresh}. This filters -#' out windows already scoring below the threshold (which would trivially need 0 edits), -#' giving the useful semantics of "how many edits to disrupt this existing match." Set -#' \code{score.min = -Inf} explicitly to disable this filtering. -#' -#' For LSE variants, the filter applies to the aggregate LSE score, not individual windows. -#' \item \code{score.max}: Optional numeric filter. Windows whose current PWM score is above -#' \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, -#' this enables regime-specific queries (e.g., only windows scoring in a given range). -#' Default NULL (no filter). For LSE variants, applies to the aggregate LSE score. #' \item \code{direction}: \code{"above"} (default) or \code{"below"}. See Direction above. #' } #' @@ -954,17 +940,37 @@ #' )) #' #' # --- direction = "below": how many edits to DISRUPT a motif match? --- -#' # "How many substitutions to push the score below -5?" -#' # By default, windows already below -5 are filtered out (score.min -#' # defaults to score.thresh), so only existing matches are considered. +#' # +#' # score.thresh is the target: "push the score below this value" +#' # score.min is a pre-filter: "only consider windows currently scoring above this" +#' # These are independent — score.min controls WHICH windows to evaluate, +#' # score.thresh controls WHAT score to target. +#' # +#' # Example: among windows scoring >= -5 (strong matches), +#' # how many edits to push below -7 (disrupt the match)? #' gvtrack.create("edist_below", NULL, "pwm.edit_distance", -#' pssm = pssm, score.thresh = -5, direction = "below" +#' pssm = pssm, score.thresh = -7, score.min = -5, +#' direction = "below" #' ) -#' # Positions where score > -5 get an edit distance; others get NA #' head(gextract(c("pwm_score", "edist_above", "edist_below"), #' gintervals(1, 500, 520), #' iterator = 1 #' )) +#' # score >= -5 (e.g. -2.7): gets edit distance (2 edits to go below -7) +#' # score < -5 (e.g. -6.4): NA (filtered out by score.min) +#' +#' # Without score.min, ALL windows are evaluated — windows already +#' # below -7 return 0 edits: +#' gvtrack.create("edist_below_all", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -7, direction = "below" +#' ) +#' head(gextract(c("pwm_score", "edist_below", "edist_below_all"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' # score >= -5: same result as edist_below (2 edits) +#' # score between -7 and -5 (e.g. -6.4): edist_below=NA, edist_below_all=0 +#' # score < -7: both return 0 (already below threshold) #' #' # --- max_edits: cap the search depth --- #' # Return NA if more than 2 substitutions are needed @@ -980,7 +986,7 @@ #' # --- Aggregation over intervals --- #' # With a larger iterator, the minimum edit distance across the #' # interval is returned. -#' gextract(c("pwm_score", "edist_above", "edist_below"), +#' gextract(c("pwm_score", "edist_above", "edist_below_all"), #' gintervals(1, 0, 2000), #' iterator = 200 #' ) diff --git a/man/gvtrack.create.Rd b/man/gvtrack.create.Rd index 022238cd..1fbba3ef 100644 --- a/man/gvtrack.create.Rd +++ b/man/gvtrack.create.Rd @@ -221,9 +221,22 @@ where it is easiest to create or disrupt a match). \strong{Parameters:} \itemize{ - \item \code{score.thresh}: The target PWM log-likelihood score. Edit distance counts the - minimum substitutions needed for a window to cross this threshold (above or below it, - depending on \code{direction}). + \item \code{score.thresh}: The target PWM log-likelihood score that the edit distance + algorithm tries to reach. For \code{direction = "above"}, this is the score to reach + or exceed; for \code{direction = "below"}, this is the score to fall below. + \item \code{score.min}, \code{score.max}: Optional numeric pre-filters on the \strong{current} + (unedited) PWM score. Windows scoring outside the [\code{score.min}, \code{score.max}] + range are skipped (edit distance returns NA). Default NULL (no filter). + These are independent of \code{score.thresh} — they control \emph{which windows to + evaluate}, not the target score. + + \strong{Typical usage with \code{direction = "below"}}: use \code{score.min} to restrict + to windows that currently have a motif match. For example, with \code{score.thresh = -18} + and \code{score.min = -14}, only windows scoring >= -14 are considered, and the edit + distance tells you how many edits to push each one below -18. Without \code{score.min}, + windows already scoring below -18 will return 0 edits. + + For LSE variants, the filters apply to the aggregate LSE score, not individual windows. \item \code{max_edits}: Optional positive integer. Cap the search depth: if reaching the threshold requires more than \code{max_edits} edits, NA is returned for that window. Default NULL (exact computation, no cap). @@ -231,20 +244,6 @@ where it is easiest to create or disrupt a match). insertions and deletions in addition to substitutions, using a banded Needleman-Wunsch DP. Typical values are 1-2. Only supported by the non-LSE variants (\code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, \code{pwm.max.edit_distance}). - \item \code{score.min}: Optional numeric filter. Windows whose current PWM score is below - \code{score.min} are skipped (edit distance returns NA for that window). Default NULL. - - \strong{Special default for \code{direction = "below"}}: when \code{score.min} is NULL and - \code{direction = "below"}, it automatically defaults to \code{score.thresh}. This filters - out windows already scoring below the threshold (which would trivially need 0 edits), - giving the useful semantics of "how many edits to disrupt this existing match." Set - \code{score.min = -Inf} explicitly to disable this filtering. - - For LSE variants, the filter applies to the aggregate LSE score, not individual windows. - \item \code{score.max}: Optional numeric filter. Windows whose current PWM score is above - \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, - this enables regime-specific queries (e.g., only windows scoring in a given range). - Default NULL (no filter). For LSE variants, applies to the aggregate LSE score. \item \code{direction}: \code{"above"} (default) or \code{"below"}. See Direction above. } @@ -477,17 +476,37 @@ head(gextract(c("pwm_score", "edist_above"), )) # --- direction = "below": how many edits to DISRUPT a motif match? --- -# "How many substitutions to push the score below -5?" -# By default, windows already below -5 are filtered out (score.min -# defaults to score.thresh), so only existing matches are considered. +# +# score.thresh is the target: "push the score below this value" +# score.min is a pre-filter: "only consider windows currently scoring above this" +# These are independent — score.min controls WHICH windows to evaluate, +# score.thresh controls WHAT score to target. +# +# Example: among windows scoring >= -5 (strong matches), +# how many edits to push below -7 (disrupt the match)? gvtrack.create("edist_below", NULL, "pwm.edit_distance", - pssm = pssm, score.thresh = -5, direction = "below" + pssm = pssm, score.thresh = -7, score.min = -5, + direction = "below" ) -# Positions where score > -5 get an edit distance; others get NA head(gextract(c("pwm_score", "edist_above", "edist_below"), gintervals(1, 500, 520), iterator = 1 )) +# score >= -5 (e.g. -2.7): gets edit distance (2 edits to go below -7) +# score < -5 (e.g. -6.4): NA (filtered out by score.min) + +# Without score.min, ALL windows are evaluated — windows already +# below -7 return 0 edits: +gvtrack.create("edist_below_all", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -7, direction = "below" +) +head(gextract(c("pwm_score", "edist_below", "edist_below_all"), + gintervals(1, 500, 520), + iterator = 1 +)) +# score >= -5: same result as edist_below (2 edits) +# score between -7 and -5 (e.g. -6.4): edist_below=NA, edist_below_all=0 +# score < -7: both return 0 (already below threshold) # --- max_edits: cap the search depth --- # Return NA if more than 2 substitutions are needed @@ -503,7 +522,7 @@ head(gextract(c("pwm_score", "edist_above", "edist_max2"), # --- Aggregation over intervals --- # With a larger iterator, the minimum edit distance across the # interval is returned. -gextract(c("pwm_score", "edist_above", "edist_below"), +gextract(c("pwm_score", "edist_above", "edist_below_all"), gintervals(1, 0, 2000), iterator = 200 ) diff --git a/tests/testthat/test-pwm-edit-distance-below.R b/tests/testthat/test-pwm-edit-distance-below.R index c40575d7..650f461d 100644 --- a/tests/testthat/test-pwm-edit-distance-below.R +++ b/tests/testthat/test-pwm-edit-distance-below.R @@ -402,7 +402,7 @@ test_that("pwm.edit_distance direction=below with score.min/score.max filtering" test_interval <- gintervals(1, 200, 240) threshold <- -5.0 - # Default (no score.min) — for direction="below", defaults to score.thresh + # Default (no score.min) — no filtering, same as score.min = -Inf gvtrack.create("edist_below_default", NULL, func = "pwm.edit_distance", pssm = pssm, score.thresh = threshold, @@ -410,20 +410,20 @@ test_that("pwm.edit_distance direction=below with score.min/score.max filtering" bidirect = FALSE, extend = FALSE, prior = 0 ) - # Explicit score.min = score.thresh — should match the default - gvtrack.create("edist_below_explicit_thresh", NULL, + # Explicit score.min = -Inf — should match the default + gvtrack.create("edist_below_no_filter", NULL, func = "pwm.edit_distance", pssm = pssm, score.thresh = threshold, - score.min = threshold, + score.min = -Inf, direction = "below", bidirect = FALSE, extend = FALSE, prior = 0 ) - # With -Inf score.min (disables filtering, old behavior) - gvtrack.create("edist_below_lowfilt", NULL, + # Explicit score.min = score.thresh — filters windows already below threshold + gvtrack.create("edist_below_filtered", NULL, func = "pwm.edit_distance", pssm = pssm, score.thresh = threshold, - score.min = -Inf, + score.min = threshold, direction = "below", bidirect = FALSE, extend = FALSE, prior = 0 ) @@ -438,13 +438,13 @@ test_that("pwm.edit_distance direction=below with score.min/score.max filtering" ) result <- gextract( - c("edist_below_default", "edist_below_explicit_thresh", "edist_below_lowfilt", "edist_below_highfilt"), + c("edist_below_default", "edist_below_no_filter", "edist_below_filtered", "edist_below_highfilt"), test_interval, iterator = test_interval ) - # Default should match explicit score.min = score.thresh - expect_equal(result$edist_below_default[1], result$edist_below_explicit_thresh[1], tolerance = 1e-6) + # Default should match explicit score.min = -Inf (no filter) + expect_equal(result$edist_below_default[1], result$edist_below_no_filter[1], tolerance = 1e-6) # High filter should either be NA or >= default result if (!is.na(result$edist_below_highfilt[1]) && !is.na(result$edist_below_default[1])) { From ac043bd7ad333f8a1a1fd8d998e0e74e18f5b8ea Mon Sep 17 00:00:00 2001 From: aviezerl Date: Mon, 13 Apr 2026 12:49:40 +0300 Subject: [PATCH 3/4] feat: add pwm.n_mutations virtual track function Count single-base substitutions that independently cross a PWM score threshold. For each window, computes how many (position, alt_base) pairs would each individually change the score past the threshold. Returns 0 if threshold already satisfied, NA if no single edit suffices. Builds on existing PWMEditDistanceScorer infrastructure (gain tables, direction handling, score filtering). Subs-only, no indels. R API: gvtrack.create("name", NULL, "pwm.n_mutations", pssm=..., score.thresh=..., direction=..., bidirect=..., score.min=..., score.max=...) Tests: 20 tests, 94 assertions covering correctness, direction, bidirectional, score filters, integration, and parameter validation. --- R/vtrack.R | 93 ++- man/gvtrack.create.Rd | 6 + src/PWMEditDistanceScorer.cpp | 207 +++++-- src/PWMEditDistanceScorer.h | 7 +- src/SequenceVarProcessor.cpp | 15 +- src/TrackExpressionVars.cpp | 81 ++- src/TrackExpressionVars.h | 10 +- tests/testthat/test-pwm-n-mutations.R | 800 ++++++++++++++++++++++++++ 8 files changed, 1172 insertions(+), 47 deletions(-) create mode 100644 tests/testthat/test-pwm-n-mutations.R diff --git a/R/vtrack.R b/R/vtrack.R index 0a47bb1f..36d62855 100644 --- a/R/vtrack.R +++ b/R/vtrack.R @@ -348,6 +348,89 @@ ) } +#' Validate and process PWM n_mutations function parameters +#' @noRd +.vtrack_params_pwm_n_mutations <- function(func, params, dots) { + if (!is.null(params)) { + if (!is.list(params) || !("pssm" %in% names(params))) { + stop("pwm.n_mutations requires a list with at least 'pssm' matrix parameter") + } + dots <- params + } + + if (!("pssm" %in% names(dots))) { + stop("pwm.n_mutations requires a 'pssm' matrix parameter") + } + if (!("score.thresh" %in% names(dots))) { + stop("pwm.n_mutations requires a 'score.thresh' parameter") + } + + pssm <- dots$pssm + score.thresh <- dots$score.thresh + score.min <- dots$score.min + score.max <- dots$score.max + bidirect <- if (!is.null(dots$bidirect)) dots$bidirect else TRUE + prior <- if (!is.null(dots$prior)) dots$prior else 0.01 + extend <- if (!is.null(dots$extend)) dots$extend else TRUE + strand <- if (!is.null(dots$strand)) dots$strand else 1 + direction <- if (!is.null(dots$direction)) dots$direction else "above" + + pssm <- .coerce_pssm_matrix( + pssm, + numeric_msg = "PSSM must be a numeric matrix or data frame with numeric columns", + ncol_msg = "PSSM must have columns named A, C, G, T", + colnames_msg = "PSSM must have columns named A, C, G, T" + ) + + if (!is.numeric(prior) || prior < 0 || prior > 1) { + stop("prior must be a number between 0 and 1") + } + + if (!is.logical(bidirect)) { + stop("bidirect must be TRUE or FALSE") + } + + if (!is.logical(extend) && !(is.numeric(extend) && length(extend) == 1 && extend == as.integer(extend) && extend >= 0)) { + stop("extend must be TRUE, FALSE, or a non-negative integer") + } + + if (strand != 1 && strand != -1) { + stop("strand must be 1 or -1") + } + + if (!direction %in% c("above", "below")) { + stop("direction must be 'above' or 'below'") + } + + if (!is.numeric(score.thresh) || length(score.thresh) != 1) { + stop("score.thresh must be a single numeric value") + } + + if (!is.null(score.min)) { + if (!is.numeric(score.min) || length(score.min) != 1) { + stop("score.min must be NULL or a single numeric value") + } + } + + if (!is.null(score.max)) { + if (!is.numeric(score.max) || length(score.max) != 1) { + stop("score.max must be NULL or a single numeric value") + } + } + + list( + pssm = pssm, + score.thresh = score.thresh, + score.min = score.min, + score.max = score.max, + bidirect = bidirect, + prior = prior, + extend = extend, + strand = strand, + direction = direction + ) +} + #' Validate and process PWM LSE edit distance function parameters #' @noRd .vtrack_params_pwm_edit_distance_lse <- function(func, params, dots) { @@ -486,6 +569,7 @@ pwm.max.edit_distance = .vtrack_params_pwm_edit_distance, pwm.edit_distance.lse = .vtrack_params_pwm_edit_distance_lse, pwm.edit_distance.lse.pos = .vtrack_params_pwm_edit_distance_lse, + pwm.n_mutations = .vtrack_params_pwm_n_mutations, neighbor.count = .vtrack_params_neighbor_count ) @@ -495,7 +579,8 @@ "kmer.count", "kmer.frac", "masked.count", "masked.frac", "pwm.edit_distance", "pwm.edit_distance.pos", "pwm.max.edit_distance", - "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos" + "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos", + "pwm.n_mutations" ) #' Creates a new virtual track @@ -597,6 +682,12 @@ #' NULL (sequence) \tab pwm.max.edit_distance \tab pssm, score.thresh, max_edits, max_indels, score.min, score.max, direction, bidirect, prior, extend, strand \tab Edit distance at the best-scoring PWM window (same location as pwm.max/pwm.max.pos). \cr #' } #' +#' \strong{Mutation count summarizers} +#' \tabular{llll}{ +#' Source \tab func \tab Key params \tab Description \cr +#' NULL (sequence) \tab pwm.n_mutations \tab pssm, score.thresh, score.min, score.max, direction, bidirect, prior, extend, strand \tab Number of single-base substitutions that independently cross \code{score.thresh}. Returns 0 if threshold already satisfied, NA if no single edit suffices. \cr +#' } +#' #' \strong{LSE edit distance summarizers} #' \tabular{llll}{ #' Source \tab func \tab Key params \tab Description \cr diff --git a/man/gvtrack.create.Rd b/man/gvtrack.create.Rd index 1fbba3ef..9b8404f0 100644 --- a/man/gvtrack.create.Rd +++ b/man/gvtrack.create.Rd @@ -148,6 +148,12 @@ interval after all modifier adjustments. NULL (sequence) \tab pwm.max.edit_distance \tab pssm, score.thresh, max_edits, max_indels, score.min, score.max, direction, bidirect, prior, extend, strand \tab Edit distance at the best-scoring PWM window (same location as pwm.max/pwm.max.pos). \cr } +\strong{Mutation count summarizers} +\tabular{llll}{ + Source \tab func \tab Key params \tab Description \cr + NULL (sequence) \tab pwm.n_mutations \tab pssm, score.thresh, score.min, score.max, direction, bidirect, prior, extend, strand \tab Number of single-base substitutions that independently cross \code{score.thresh}. Returns 0 if threshold already satisfied, NA if no single edit suffices. \cr +} + \strong{LSE edit distance summarizers} \tabular{llll}{ Source \tab func \tab Key params \tab Description \cr diff --git a/src/PWMEditDistanceScorer.cpp b/src/PWMEditDistanceScorer.cpp index fdff818d..2dce00b1 100644 --- a/src/PWMEditDistanceScorer.cpp +++ b/src/PWMEditDistanceScorer.cpp @@ -326,6 +326,8 @@ float PWMEditDistanceScorer::score_interval(const GInterval& interval, return metrics.min_edits_position; case Mode::PWM_MAX_EDITS: return metrics.best_pwm_edits; + case Mode::N_MUTATIONS: + return metrics.n_mutations; default: return std::numeric_limits::quiet_NaN(); } @@ -702,6 +704,51 @@ float PWMEditDistanceScorer::compute_window_edits(const int* bidx, int seq_avail return compute_heuristic(bidx, reverse, m_max_edits); } +float PWMEditDistanceScorer::compute_n_mutations(const int* bidx, bool reverse) { + int L = m_pssm.length(); + + // Compute current window score using the flat lookup table + double score = 0; + for (int i = 0; i < L; i++) { + int col = reverse ? (L - 1 - i) : i; + int base = bidx[i]; + if (base >= 4) { + score += m_score_table[col][4]; + } else { + score += m_score_table[col][base]; + } + } + + // If threshold already satisfied, return 0 + if (threshold_satisfied(score)) return 0.0f; + + // Compute deficit + double deficit = compute_deficit(score); + + // Count single-base substitutions that independently cross threshold + int count = 0; + for (int i = 0; i < L; i++) { + int col = reverse ? (L - 1 - i) : i; + int current_base = bidx[i]; + if (current_base >= 4) continue; // skip N bases + + for (int b = 0; b < 4; b++) { + if (b == current_base) continue; + + double delta; + if (is_below()) { + delta = m_score_table[col][current_base] - m_score_table[col][b]; + } else { + delta = m_score_table[col][b] - m_score_table[col][current_base]; + } + + if (delta >= deficit) count++; + } + } + + return (count > 0) ? static_cast(count) : std::numeric_limits::quiet_NaN(); +} + PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const std::string& seq, size_t interval_length) { @@ -715,7 +762,8 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const const bool scan_forward = should_scan_forward(); const bool scan_reverse = should_scan_reverse(); - const bool need_min = (m_mode != Mode::PWM_MAX_EDITS); + const bool need_n_mutations = (m_mode == Mode::N_MUTATIONS); + const bool need_min = (m_mode != Mode::PWM_MAX_EDITS && !need_n_mutations); const bool need_min_pos = (m_mode == Mode::MIN_EDITS_POSITION); const bool need_pwm = (m_mode == Mode::PWM_MAX_EDITS); const bool has_score_min = !std::isnan(m_score_min); @@ -728,7 +776,7 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const // below the threshold. Take max across strands at each position, then min // across positions. (For ABOVE, min across strands is correct: any one strand // reaching the threshold suffices.) - const bool below_bidirect = is_below() && scan_forward && scan_reverse && need_min; + const bool below_bidirect = is_below() && scan_forward && scan_reverse && (need_min || need_n_mutations); const size_t max_start = std::min(interval_length, target_length - motif_length + 1); if (max_start == 0 || (!scan_forward && !scan_reverse)) { @@ -792,6 +840,16 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const } }; + auto maybe_update_n_mutations = [&](float nmut) { + if (!need_n_mutations || std::isnan(nmut)) { + return; + } + // Track maximum n_mutations across qualifying windows + if (std::isnan(metrics.n_mutations) || nmut > metrics.n_mutations) { + metrics.n_mutations = nmut; + } + }; + for (size_t offset = 0; offset < max_start; ++offset) { // Sliding N-count maintenance (for offset > 0, slide the window by 1) if (use_n_skip && offset > 0) { @@ -818,55 +876,100 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const if (below_bidirect) { // direction=BELOW + bidirectional: compute both strands, take max // (prefilter is always off for BELOW, so no pigeonhole check needed) - float fwd_edits = std::numeric_limits::quiet_NaN(); - { - bool pass_score_filter = true; - if (apply_score_filter) { - float logp = compute_window_pwm_score(window_start, /*reverse=*/false); - if (has_score_min && logp < m_score_min) pass_score_filter = false; - if (has_score_max && logp > m_score_max) pass_score_filter = false; - } - if (pass_score_filter) { - fwd_edits = compute_window_edits(fwd_bidx.data() + offset, seq_avail, /*reverse=*/false); + if (need_min) { + float fwd_edits = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + fwd_edits = compute_window_edits(fwd_bidx.data() + offset, seq_avail, /*reverse=*/false); + } } - } - float rev_edits = std::numeric_limits::quiet_NaN(); - { - bool pass_score_filter = true; - if (apply_score_filter) { - float logp = compute_window_pwm_score(window_start, /*reverse=*/true); - if (has_score_min && logp < m_score_min) pass_score_filter = false; - if (has_score_max && logp > m_score_max) pass_score_filter = false; - } - if (pass_score_filter) { - rev_edits = compute_window_edits(rev_bidx.data() + offset, seq_avail, /*reverse=*/true); + float rev_edits = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + rev_edits = compute_window_edits(rev_bidx.data() + offset, seq_avail, /*reverse=*/true); + } } - } - // Combine: max of non-NaN values (NaN = strand filtered out / no match) - bool have_fwd = !std::isnan(fwd_edits); - bool have_rev = !std::isnan(rev_edits); - if (have_fwd || have_rev) { - float combined; - int combined_dir; - if (have_fwd && have_rev) { - // Both strands have matches — take the harder one to disrupt - if (fwd_edits >= rev_edits) { + // Combine: max of non-NaN values (NaN = strand filtered out / no match) + bool have_fwd = !std::isnan(fwd_edits); + bool have_rev = !std::isnan(rev_edits); + if (have_fwd || have_rev) { + float combined; + int combined_dir; + if (have_fwd && have_rev) { + // Both strands have matches — take the harder one to disrupt + if (fwd_edits >= rev_edits) { + combined = fwd_edits; + combined_dir = +1; + } else { + combined = rev_edits; + combined_dir = -1; + } + } else if (have_fwd) { combined = fwd_edits; combined_dir = +1; } else { combined = rev_edits; combined_dir = -1; } - } else if (have_fwd) { - combined = fwd_edits; - combined_dir = +1; - } else { - combined = rev_edits; - combined_dir = -1; + maybe_update_min(combined, offset, combined_dir); + } + } + if (need_n_mutations) { + // N_MUTATIONS + BELOW + bidirect: compute both strands, take max across strands + float fwd_nmut = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + fwd_nmut = compute_n_mutations(fwd_bidx.data() + offset, /*reverse=*/false); + } + } + + float rev_nmut = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + rev_nmut = compute_n_mutations(rev_bidx.data() + offset, /*reverse=*/true); + } + } + + // Combine: max of non-NaN values across strands + bool have_fwd = !std::isnan(fwd_nmut); + bool have_rev = !std::isnan(rev_nmut); + if (have_fwd || have_rev) { + float combined; + if (have_fwd && have_rev) { + combined = std::max(fwd_nmut, rev_nmut); + } else if (have_fwd) { + combined = fwd_nmut; + } else { + combined = rev_nmut; + } + maybe_update_n_mutations(combined); } - maybe_update_min(combined, offset, combined_dir); } } else if (scan_forward || scan_reverse) { if (scan_forward) { @@ -904,6 +1007,18 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const } } } + if (need_n_mutations) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + float nmut = compute_n_mutations(fwd_bidx.data() + offset, /*reverse=*/false); + maybe_update_n_mutations(nmut); + } + } if (need_pwm) { float logp = 0.0f; std::string::const_iterator it = seq.begin() + offset; @@ -949,6 +1064,18 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const } } } + if (need_n_mutations) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + float nmut = compute_n_mutations(rev_bidx.data() + offset, /*reverse=*/true); + maybe_update_n_mutations(nmut); + } + } if (need_pwm) { float logp_rc = 0.0f; std::string::const_iterator it2 = seq.begin() + offset; diff --git a/src/PWMEditDistanceScorer.h b/src/PWMEditDistanceScorer.h index 6d426b66..66cccdfe 100644 --- a/src/PWMEditDistanceScorer.h +++ b/src/PWMEditDistanceScorer.h @@ -36,7 +36,8 @@ class PWMEditDistanceScorer : public GenomeSeqScorer enum class Mode { MIN_EDITS, MIN_EDITS_POSITION, - PWM_MAX_EDITS + PWM_MAX_EDITS, + N_MUTATIONS }; enum class Direction { @@ -82,6 +83,7 @@ class PWMEditDistanceScorer : public GenomeSeqScorer float get_last_pwm_max_logp() const { return m_last_metrics.best_pwm_logp; } float get_last_pwm_max_edits() const { return m_last_metrics.best_pwm_edits; } float get_last_pwm_max_pos() const { return m_last_metrics.best_pwm_position; } + float get_last_n_mutations() const { return m_last_metrics.n_mutations; } int get_max_indels() const { return m_max_indels; } private: @@ -96,6 +98,8 @@ class PWMEditDistanceScorer : public GenomeSeqScorer float best_pwm_position = std::numeric_limits::quiet_NaN(); size_t best_pwm_index = 0; int best_pwm_direction = 1; + + float n_mutations = std::numeric_limits::quiet_NaN(); }; DnaPSSM m_pssm; @@ -210,6 +214,7 @@ class PWMEditDistanceScorer : public GenomeSeqScorer inline bool should_scan_reverse() const; inline char complement_base(char base) const; inline float compute_window_edits(const int* bidx, int seq_avail, bool reverse); + float compute_n_mutations(const int* bidx, bool reverse); /** * Get the effective PSSM score and gain for a motif position aligned with a sequence base. diff --git a/src/SequenceVarProcessor.cpp b/src/SequenceVarProcessor.cpp index eba2320a..d3aeed14 100644 --- a/src/SequenceVarProcessor.cpp +++ b/src/SequenceVarProcessor.cpp @@ -25,7 +25,8 @@ namespace { enum class EditDistAggMode { MIN_EDITS, // PWM_EDIT_DISTANCE, PWM_EDIT_DISTANCE_LSE MIN_EDITS_POS, // PWM_EDIT_DISTANCE_POS, PWM_EDIT_DISTANCE_LSE_POS - MAX_EDITS // PWM_MAX_EDIT_DISTANCE + MAX_EDITS, // PWM_MAX_EDIT_DISTANCE + MAX_SCORE // PWM_N_MUTATIONS (take maximum score across parts) }; // Per-part scoring result for edit distance aggregation @@ -44,6 +45,8 @@ EditDistAggMode get_agg_mode(TrackExpressionVars::Track_var::Val_func func) { return EditDistAggMode::MIN_EDITS_POS; case TrackExpressionVars::Track_var::PWM_MAX_EDIT_DISTANCE: return EditDistAggMode::MAX_EDITS; + case TrackExpressionVars::Track_var::PWM_N_MUTATIONS: + return EditDistAggMode::MAX_SCORE; default: return EditDistAggMode::MIN_EDITS; } @@ -96,6 +99,16 @@ double aggregate_edit_distance_parts( } return found ? best_edits : numeric_limits::quiet_NaN(); } + case EditDistAggMode::MAX_SCORE: { + // Take maximum score across parts (for N_MUTATIONS) + double max_score = numeric_limits::quiet_NaN(); + for (const auto &r : parts) { + if (std::isnan(max_score) || (!std::isnan(r.score) && r.score > max_score)) { + max_score = r.score; + } + } + return max_score; + } } return numeric_limits::quiet_NaN(); // unreachable } diff --git a/src/TrackExpressionVars.cpp b/src/TrackExpressionVars.cpp index 91e3e5d2..ded1e8e7 100644 --- a/src/TrackExpressionVars.cpp +++ b/src/TrackExpressionVars.cpp @@ -63,7 +63,8 @@ const char *TrackExpressionVars::Track_var::FUNC_NAMES[TrackExpressionVars::Trac "exists", "size", "sample", "sample.pos.abs", "sample.pos.relative", "first", "first.pos.abs", "first.pos.relative", "last", "last.pos.abs", "last.pos.relative", "pwm.edit_distance", "pwm.edit_distance.pos", "pwm.max.edit_distance", - "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos"}; + "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos", + "pwm.n_mutations"}; const char *TrackExpressionVars::Interv_var::FUNC_NAMES[TrackExpressionVars::Interv_var::NUM_FUNCS] = { "distance", "distance.center", "distance.edge", "coverage", "neighbor.count" }; @@ -790,6 +791,84 @@ void TrackExpressionVars::add_vtrack_var(const string &vtrack, SEXP rvtrack) var.percentile = numeric_limits::quiet_NaN(); var.requires_pv = false; + // Attach filter if present + attach_filter_to_var(rvtrack, vtrack, var); + return; + } else if (func == "pwm.n_mutations") { + // N_MUTATIONS mode: count single-base substitutions that independently cross threshold + m_track_vars.push_back(Track_var()); + Track_var &var = m_track_vars.back(); + var.var_name = vtrack; + var.val_func = Track_var::PWM_N_MUTATIONS; + var.track_n_imdf = nullptr; + var.seq_imdf1d = nullptr; + + SEXP rparams = get_rvector_col(rvtrack, "params", vtrack.c_str(), false); + + // Parse PWM parameters using helper struct + TrackExprParams::PWMParams pwm_params = TrackExprParams::PWMParams::parse(rparams, vtrack); + + // Extract score.min parameter (optional, default NaN = no filter) + float score_min = std::numeric_limits::quiet_NaN(); + if (Rf_isNewList(rparams)) { + int sm_idx = findListElementIndex(rparams, "score.min"); + if (sm_idx >= 0) { + SEXP rscore_min = VECTOR_ELT(rparams, sm_idx); + if (!Rf_isNull(rscore_min)) { + if (Rf_isReal(rscore_min) && Rf_length(rscore_min) == 1) { + score_min = static_cast(REAL(rscore_min)[0]); + } else if (Rf_isInteger(rscore_min) && Rf_length(rscore_min) == 1) { + score_min = static_cast(INTEGER(rscore_min)[0]); + } else { + verror("score.min parameter must be NULL or a single numeric value for vtrack %s", vtrack.c_str()); + } + } + } + } + + // Extract score.max parameter (optional, default NaN = no filter) + float score_max = std::numeric_limits::quiet_NaN(); + if (Rf_isNewList(rparams)) { + int smx_idx = findListElementIndex(rparams, "score.max"); + if (smx_idx >= 0) { + SEXP rscore_max = VECTOR_ELT(rparams, smx_idx); + if (!Rf_isNull(rscore_max)) { + if (Rf_isReal(rscore_max) && Rf_length(rscore_max) == 1) { + score_max = static_cast(REAL(rscore_max)[0]); + } else if (Rf_isInteger(rscore_max) && Rf_length(rscore_max) == 1) { + score_max = static_cast(INTEGER(rscore_max)[0]); + } else { + verror("score.max parameter must be NULL or a single numeric value for vtrack %s", vtrack.c_str()); + } + } + } + } + + auto direction = parse_direction_param(rparams, vtrack); + + // Construct scorer with N_MUTATIONS mode, no max_edits or max_indels + var.pwm_edit_distance_scorer = std::make_unique( + pwm_params.core.pssm, + &m_shared_seqfetch, + static_cast(pwm_params.core.score_thresh), + -1, // max_edits: not applicable for n_mutations + pwm_params.extend_flag, + static_cast(pwm_params.core.strand_mode), + PWMEditDistanceScorer::Mode::N_MUTATIONS, + score_min, + 0, // max_indels: not applicable for n_mutations + score_max, + direction + ); + + // Parse optional iterator modifier + Iterator_modifier1D imdf1d; + parse_imdf(rvtrack, vtrack, &imdf1d, NULL); + var.seq_imdf1d = add_imdf(imdf1d); + + var.percentile = numeric_limits::quiet_NaN(); + var.requires_pv = false; + // Attach filter if present attach_filter_to_var(rvtrack, vtrack, var); return; diff --git a/src/TrackExpressionVars.h b/src/TrackExpressionVars.h index 7d2136d7..4dee978f 100644 --- a/src/TrackExpressionVars.h +++ b/src/TrackExpressionVars.h @@ -157,6 +157,7 @@ class TrackExpressionVars { PWM_MAX_EDIT_DISTANCE, PWM_EDIT_DISTANCE_LSE, PWM_EDIT_DISTANCE_LSE_POS, + PWM_N_MUTATIONS, NUM_FUNCS }; @@ -410,7 +411,8 @@ inline bool TrackExpressionVars::is_seq_variable(unsigned ivar) const { m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_POS || m_track_vars[ivar].val_func == Track_var::PWM_MAX_EDIT_DISTANCE || m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_LSE || - m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_LSE_POS; + m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_LSE_POS || + m_track_vars[ivar].val_func == Track_var::PWM_N_MUTATIONS; } // Helper methods to check variable function types @@ -423,7 +425,8 @@ inline bool TrackExpressionVars::is_sequence_based_function(Track_var::Val_func func == Track_var::PWM_EDIT_DISTANCE_POS || func == Track_var::PWM_MAX_EDIT_DISTANCE || func == Track_var::PWM_EDIT_DISTANCE_LSE || - func == Track_var::PWM_EDIT_DISTANCE_LSE_POS; + func == Track_var::PWM_EDIT_DISTANCE_LSE_POS || + func == Track_var::PWM_N_MUTATIONS; } inline bool TrackExpressionVars::is_pwm_function(Track_var::Val_func func) { @@ -436,7 +439,8 @@ inline bool TrackExpressionVars::is_pwm_edit_distance_function(Track_var::Val_fu func == Track_var::PWM_EDIT_DISTANCE_POS || func == Track_var::PWM_MAX_EDIT_DISTANCE || func == Track_var::PWM_EDIT_DISTANCE_LSE || - func == Track_var::PWM_EDIT_DISTANCE_LSE_POS; + func == Track_var::PWM_EDIT_DISTANCE_LSE_POS || + func == Track_var::PWM_N_MUTATIONS; } inline bool TrackExpressionVars::is_pwm_lse_edit_distance_function(Track_var::Val_func func) { diff --git a/tests/testthat/test-pwm-n-mutations.R b/tests/testthat/test-pwm-n-mutations.R new file mode 100644 index 00000000..a0f08d6a --- /dev/null +++ b/tests/testthat/test-pwm-n-mutations.R @@ -0,0 +1,800 @@ +create_isolated_test_db() + +# --- R reference implementation for pwm.n_mutations --- +# For a single motif-length window: count single-base substitutions that +# independently cross the score threshold. +# Returns 0 if threshold already satisfied, NA if no single edit suffices. +manual_pwm_n_mutations <- function(seq, pssm_mat, threshold, direction = "above") { + L <- nrow(pssm_mat) + bases <- c("A", "C", "G", "T") + seq_bases <- strsplit(seq, "")[[1]] + + if (length(seq_bases) < L) { + return(NA_real_) + } + + # Compute current score + score <- 0 + for (i in 1:L) { + b <- seq_bases[i] + bidx <- match(b, bases) + if (is.na(bidx)) { + # N base: use the score from the LUT (index 4 maps to min) + score <- score + min(log(pssm_mat[i, ])) + } else { + score <- score + log(pssm_mat[i, bidx]) + } + } + + # Check if threshold already satisfied + if (direction == "above" && score >= threshold) return(0) + if (direction == "below" && score <= threshold) return(0) + + # Compute deficit + deficit <- if (direction == "above") threshold - score else score - threshold + + # Count single-base substitutions that independently cross threshold + count <- 0 + for (i in 1:L) { + current_base <- seq_bases[i] + current_idx <- match(current_base, bases) + if (is.na(current_idx)) next # skip N bases + + current_log <- log(pssm_mat[i, current_idx]) + + for (alt_base in setdiff(bases, current_base)) { + alt_idx <- match(alt_base, bases) + alt_log <- log(pssm_mat[i, alt_idx]) + + delta <- if (direction == "above") { + alt_log - current_log + } else { + current_log - alt_log + } + + # Handle NaN from -Inf - (-Inf) or similar + if (is.na(delta) || !is.finite(delta)) next + if (delta >= deficit - 1e-12) count <- count + 1 + } + } + + if (count == 0) return(NA_real_) + return(count) +} + +# Scan across all windows in a sequence, returning max n_mutations +manual_pwm_n_mutations_scan <- function(seq, pssm_mat, threshold, direction = "above") { + L <- nrow(pssm_mat) + n <- nchar(seq) + if (n < L) return(NA_real_) + + best <- NA_real_ + for (start in 1:(n - L + 1)) { + window_seq <- substr(seq, start, start + L - 1) + cand <- manual_pwm_n_mutations(window_seq, pssm_mat, threshold, direction) + if (is.na(best) || (!is.na(cand) && cand > best)) { + best <- cand + } + } + best +} + +# Reverse complement helper +revcomp <- function(seq) { + comp <- c(A = "T", C = "G", G = "C", T = "A", N = "N") + bases <- strsplit(toupper(seq), "")[[1]] + paste0(rev(comp[bases]), collapse = "") +} + +# ============================================================================ +# Basic correctness tests +# ============================================================================ + +test_that("pwm.n_mutations returns 0 when threshold already satisfied (direction=above)", { + remove_all_vtracks() + + pssm <- create_test_pssm() # AC motif + + # Find position with "AC" pattern + full_seq <- toupper(gseq.extract(gintervals(1, 200, 300))) + ac_pos <- gregexpr("AC", full_seq)[[1]][1] + + if (ac_pos > 0) { + abs_pos <- 200 + ac_pos - 1 + test_interval <- gintervals(1, abs_pos, abs_pos + 2) + + # Threshold at 0 — an AC window scores log(1)+log(1) = 0, satisfying >= 0 + threshold <- 0.0 + gvtrack.create("nmut_perfect", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_perfect", test_interval, iterator = test_interval) + expect_equal(result$nmut_perfect[1], 0, tolerance = 1e-6) + } else { + skip("No AC pattern found in test region") + } +}) + +test_that("pwm.n_mutations returns correct count for 1-edit window (direction=above)", { + remove_all_vtracks() + + # Use a 3-position PSSM with clear preferences + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, # Strong A + 0.1, 0.8, 0.05, 0.05, # Strong C + 0.1, 0.05, 0.8, 0.05 # Strong G + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_interval <- gintervals(1, 200, 240) + seq <- toupper(gseq.extract(test_interval)) + + # Choose threshold that should allow single edits for some windows + threshold <- -3.0 + gvtrack.create("nmut_count", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_count", test_interval, iterator = test_interval) + + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_count[1])) + } else { + expect_equal(result$nmut_count[1], expected, tolerance = 1e-6) + } +}) + +test_that("pwm.n_mutations returns NA when no single edit suffices", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_intervals <- gintervals(1, 200, 240) + + # Impossibly high threshold — no single edit can reach it + threshold <- 100.0 + gvtrack.create("nmut_unreachable", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_unreachable", test_intervals, iterator = test_intervals) + expect_true(is.na(result$nmut_unreachable[1])) +}) + +test_that("pwm.n_mutations matches R reference on multiple intervals", { + remove_all_vtracks() + + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, # Strong A + 0.1, 0.8, 0.05, 0.05, # Strong C + 0.1, 0.05, 0.8, 0.05 # Strong G + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_intervals <- gintervals( + chrom = c(1, 1, 1), + start = c(200, 300, 400), + end = c(230, 330, 430) + ) + + threshold <- -3.5 + gvtrack.create("nmut_ref", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_ref", test_intervals, iterator = test_intervals) + + for (i in seq_len(nrow(test_intervals))) { + seq <- toupper(gseq.extract(test_intervals[i, ])) + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_ref[i]), + info = paste("Interval", i, "expected NA") + ) + } else { + expect_equal(result$nmut_ref[i], expected, + tolerance = 1e-6, + info = paste("Interval", i) + ) + } + } +}) + +test_that("pwm.n_mutations with 1bp iterator matches R reference per position", { + remove_all_vtracks() + + # Use a PSSM without zero entries to avoid -Inf complications + # (the C++ score table replaces log-zero with col_max for ABOVE direction) + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.05, 0.8, 0.1, 0.05, + 0.05, 0.05, 0.8, 0.1 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + motif_len <- nrow(pssm) + + test_interval <- gintervals(1, 200, 210) + threshold <- -3.0 + + gvtrack.create("nmut_1bp", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = TRUE, prior = 0 + ) + + result_1bp <- gextract("nmut_1bp", test_interval, iterator = 1) + + expect_true(nrow(result_1bp) > 0) + + for (idx in 1:min(5, nrow(result_1bp))) { + pos <- result_1bp$start[idx] + seq_window <- toupper(gseq.extract(gintervals(1, pos, pos + motif_len))) + expected <- manual_pwm_n_mutations(seq_window, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result_1bp$nmut_1bp[idx]), + info = paste("Position", pos) + ) + } else { + expect_equal(result_1bp$nmut_1bp[idx], expected, + tolerance = 1e-6, + info = paste("Position", pos) + ) + } + } +}) + +# ============================================================================ +# Direction tests +# ============================================================================ + +test_that("pwm.n_mutations direction=below returns 0 when score already below threshold", { + remove_all_vtracks() + + pssm <- create_test_pssm() # AC motif + + test_intervals <- gintervals(1, 200, 240) + + # Very high threshold — all windows score well below it + threshold <- 100.0 + gvtrack.create("nmut_below_already", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_below_already", test_intervals, iterator = test_intervals) + expect_equal(result$nmut_below_already[1], 0, tolerance = 1e-6) +}) + +test_that("pwm.n_mutations direction=below matches R reference", { + remove_all_vtracks() + + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.1, 0.8, 0.05, 0.05, + 0.1, 0.05, 0.8, 0.05 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_intervals <- gintervals( + chrom = c(1, 1, 1), + start = c(200, 300, 400), + end = c(230, 330, 430) + ) + + threshold <- -3.5 + gvtrack.create("nmut_below_ref", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_below_ref", test_intervals, iterator = test_intervals) + + for (i in seq_len(nrow(test_intervals))) { + seq <- toupper(gseq.extract(test_intervals[i, ])) + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "below") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_below_ref[i]), + info = paste("Interval", i, "expected NA") + ) + } else { + expect_equal(result$nmut_below_ref[i], expected, + tolerance = 1e-6, + info = paste("Interval", i) + ) + } + } +}) + +test_that("pwm.n_mutations direction=below returns NA when threshold unreachable", { + remove_all_vtracks() + + # Uniform PSSM: min score per position = log(0.25) ~ -1.386 + pssm <- matrix(c( + 0.25, 0.25, 0.25, 0.25, + 0.25, 0.25, 0.25, 0.25 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_intervals <- gintervals(1, 200, 240) + + # Threshold below the minimum possible score (-Inf can't be reached + # with a uniform PSSM since all entries are log(0.25)) + threshold <- -100.0 + gvtrack.create("nmut_below_impossible", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_below_impossible", test_intervals, iterator = test_intervals) + expect_true(is.na(result$nmut_below_impossible[1])) +}) + +# ============================================================================ +# Bidirectional tests +# ============================================================================ + +test_that("pwm.n_mutations bidirect=TRUE returns max across strands (direction=above)", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 260) + threshold <- -5.0 + + gvtrack.create("nmut_fwd", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, strand = 1, extend = FALSE, prior = 0 + ) + + gvtrack.create("nmut_rev", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, strand = -1, extend = FALSE, prior = 0 + ) + + gvtrack.create("nmut_bidi", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = TRUE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_fwd", "nmut_rev", "nmut_bidi"), + test_interval, + iterator = test_interval + ) + + fwd <- result$nmut_fwd[1] + rev <- result$nmut_rev[1] + bidi <- result$nmut_bidi[1] + + # For direction=above, bidirectional should take max across strands at each + # window position, since either strand crossing the threshold counts. + # The aggregation across windows is still max. + # So bidi >= max(fwd, rev) is expected (since both strands contribute). + if (!is.na(fwd) && !is.na(rev)) { + expect_true(bidi >= max(fwd, rev) - 1e-6, + info = paste("fwd=", fwd, "rev=", rev, "bidi=", bidi) + ) + } +}) + +test_that("pwm.n_mutations bidirect=TRUE with direction=below uses max across strands per position", { + remove_all_vtracks() + + # Use a non-degenerate PSSM so scores are finite and direction=below is meaningful + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.05, 0.8, 0.1, 0.05, + 0.05, 0.05, 0.8, 0.1 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + # PWM max scores in this region are around -3.4 to -9.0 + # Use threshold=-4.0 so some windows score above it (needing edits to go below) + test_interval <- gintervals(1, 200, 260) + threshold <- -4.0 + + gvtrack.create("nmut_below_fwd", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = 1, extend = TRUE, prior = 0 + ) + + gvtrack.create("nmut_below_rev", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = -1, extend = TRUE, prior = 0 + ) + + gvtrack.create("nmut_below_bidi", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = TRUE, extend = TRUE, prior = 0 + ) + + result <- gextract( + c("nmut_below_fwd", "nmut_below_rev", "nmut_below_bidi"), + test_interval, + iterator = 1 + ) + + # Verify we have data to compare + has_both <- !is.na(result$nmut_below_fwd) & !is.na(result$nmut_below_rev) + expect_true(any(has_both), info = "Should have at least some positions with both strands non-NA") + + # For direction=below + bidirect, each position combines strands via max + # (a genomic substitution affects both strands) + for (i in which(has_both)) { + fwd <- result$nmut_below_fwd[i] + rev <- result$nmut_below_rev[i] + bidi <- result$nmut_below_bidi[i] + + # Per-position combine = max of strands + expect_true(bidi >= max(fwd, rev) - 1e-6, + info = paste("pos", i, "fwd=", fwd, "rev=", rev, "bidi=", bidi) + ) + } +}) + +# ============================================================================ +# Score filter tests +# ============================================================================ + +test_that("pwm.n_mutations score.min filters out low-scoring windows", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 240) + threshold <- -3.0 + + # No filter + gvtrack.create("nmut_nofilt", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # With score.min = 0 (very restrictive — only perfect-scoring windows pass) + gvtrack.create("nmut_highfilt", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = 0.0, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_nofilt", "nmut_highfilt"), + test_interval, + iterator = test_interval + ) + + # High filter should either be NA (all windows filtered) or <= the unfiltered + # count (since fewer windows contribute). But since n_mutations is MAX across + # qualifying windows, filtering can only reduce or NA-ify the result. + if (!is.na(result$nmut_highfilt[1]) && !is.na(result$nmut_nofilt[1])) { + expect_true(result$nmut_highfilt[1] <= result$nmut_nofilt[1] + 1e-6) + } +}) + +test_that("pwm.n_mutations score.max filters out high-scoring windows", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 240) + threshold <- -5.0 + + # No filter + gvtrack.create("nmut_nomax", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # With score.max = very low value (filters most windows) + gvtrack.create("nmut_maxfilt", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.max = -100.0, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_nomax", "nmut_maxfilt"), + test_interval, + iterator = test_interval + ) + + # With such a restrictive score.max, most or all windows are filtered out + # Result should be NA or at most equal to the unfiltered result + if (!is.na(result$nmut_maxfilt[1]) && !is.na(result$nmut_nomax[1])) { + expect_true(result$nmut_maxfilt[1] <= result$nmut_nomax[1] + 1e-6) + } +}) + +test_that("pwm.n_mutations direction=below score.min=-Inf matches default for below", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 240) + threshold <- -5.0 + + # Default (no score.min) + gvtrack.create("nmut_below_default", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # Explicit score.min = -Inf + gvtrack.create("nmut_below_explicit", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_below_default", "nmut_below_explicit"), + test_interval, + iterator = test_interval + ) + + expect_equal(result$nmut_below_default[1], result$nmut_below_explicit[1], tolerance = 1e-6) +}) + +# ============================================================================ +# Edge case: N bases +# ============================================================================ + +test_that("pwm.n_mutations handles windows consistently with non-degenerate PSSM", { + remove_all_vtracks() + + # Use a PSSM without zero entries so R reference and C++ agree + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.05, 0.8, 0.1, 0.05 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_interval <- gintervals(1, 200, 240) + seq <- toupper(gseq.extract(test_interval)) + + threshold <- -3.0 + gvtrack.create("nmut_ntest", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_ntest", test_interval, iterator = test_interval) + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_ntest[1])) + } else { + expect_equal(result$nmut_ntest[1], expected, tolerance = 1e-6) + } +}) + +# ============================================================================ +# Integration: gscreen filtering +# ============================================================================ + +test_that("pwm.n_mutations works in gscreen expression", { + remove_all_vtracks() + + pssm <- create_test_pssm() + threshold <- -5.0 + + gvtrack.create("nmut_screen", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # Find intervals where n_mutations is not NA + screened <- gscreen("!is.na(nmut_screen)", + gintervals(1, 0, 5000), + iterator = 20 + ) + + # gscreen returns NULL when no intervals match; otherwise a data frame + if (!is.null(screened)) { + expect_s3_class(screened, "data.frame") + expect_true(all(c("chrom", "start", "end") %in% colnames(screened))) + } + + # Also verify gextract works with the same expression + extracted <- gextract("nmut_screen", gintervals(1, 0, 1000), iterator = 20) + expect_s3_class(extracted, "data.frame") + expect_true("nmut_screen" %in% colnames(extracted)) +}) + +test_that("pwm.n_mutations returns 0 for windows where threshold is already met (gextract integration)", { + remove_all_vtracks() + + pssm <- create_test_pssm() # AC motif + motif_len <- nrow(pssm) + + # Use extend=TRUE with 1bp iterator to check individual windows + # Very low threshold that most windows should already satisfy + threshold <- -20.0 + gvtrack.create("nmut_easy", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = TRUE, prior = 0 + ) + + result <- gextract("nmut_easy", gintervals(1, 200, 210), iterator = 1) + expect_true(nrow(result) > 0) + + # With such a low threshold, windows with non-N bases should score above it + # and return 0 + non_na_rows <- !is.na(result$nmut_easy) + if (any(non_na_rows)) { + expect_true(all(result$nmut_easy[non_na_rows] == 0), + info = "All qualifying windows should return 0 for a very low threshold" + ) + } +}) + +# ============================================================================ +# Parameter validation +# ============================================================================ + +test_that("pwm.n_mutations requires pssm parameter", { + remove_all_vtracks() + + expect_error( + gvtrack.create("nmut_nopssm", NULL, + func = "pwm.n_mutations", + score.thresh = -5.0 + ), + "pssm" + ) +}) + +test_that("pwm.n_mutations requires score.thresh parameter", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + expect_error( + gvtrack.create("nmut_nothresh", NULL, + func = "pwm.n_mutations", + pssm = pssm + ), + "score.thresh" + ) +}) + +test_that("pwm.n_mutations rejects invalid direction", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + expect_error( + gvtrack.create("nmut_baddir", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = -5.0, + direction = "sideways" + ), + "direction" + ) +}) + +# ============================================================================ +# Aggregation semantics: MAX across windows +# ============================================================================ + +test_that("pwm.n_mutations aggregates as MAX across qualifying windows", { + remove_all_vtracks() + + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.1, 0.8, 0.05, 0.05, + 0.1, 0.05, 0.8, 0.05 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + motif_len <- nrow(pssm) + + # Use a wide interval so there are many windows + test_interval <- gintervals(1, 200, 260) + seq <- toupper(gseq.extract(test_interval)) + + threshold <- -3.5 + gvtrack.create("nmut_agg", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_agg", test_interval, iterator = test_interval) + + # Compute per-window n_mutations manually and verify the MAX + n_windows <- nchar(seq) - motif_len + 1 + per_window <- numeric(0) + for (start in 1:n_windows) { + window_seq <- substr(seq, start, start + motif_len - 1) + cand <- manual_pwm_n_mutations(window_seq, pssm, threshold, "above") + per_window <- c(per_window, cand) + } + + # Max across all non-NA values + non_na <- per_window[!is.na(per_window)] + expected_max <- if (length(non_na) == 0) NA_real_ else max(non_na) + + if (is.na(expected_max)) { + expect_true(is.na(result$nmut_agg[1])) + } else { + expect_equal(result$nmut_agg[1], expected_max, tolerance = 1e-6) + } +}) + +# ============================================================================ +# Prior handling +# ============================================================================ + +test_that("pwm.n_mutations with prior=0 vs prior>0 gives different results", { + remove_all_vtracks() + + pssm <- create_test_pssm() # Has zeros in it + + test_interval <- gintervals(1, 200, 240) + threshold <- -3.0 + + gvtrack.create("nmut_prior0", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + gvtrack.create("nmut_prior01", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0.01 + ) + + result <- gextract(c("nmut_prior0", "nmut_prior01"), + test_interval, + iterator = test_interval + ) + + # With prior=0 on a PSSM that has zero entries, scores can be -Inf + # vs prior>0 where scores are always finite. Results should generally differ. + # This is a consistency check — at least one should be non-NA. + expect_true(!is.na(result$nmut_prior0[1]) || !is.na(result$nmut_prior01[1])) +}) From 26787437c42e1a1cab358e49c319ac08ac45dd91 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Mon, 13 Apr 2026 09:51:28 +0000 Subject: [PATCH 4/4] Style code (GHA) --- tests/testthat/test-pwm-n-mutations.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-pwm-n-mutations.R b/tests/testthat/test-pwm-n-mutations.R index a0f08d6a..f66c2055 100644 --- a/tests/testthat/test-pwm-n-mutations.R +++ b/tests/testthat/test-pwm-n-mutations.R @@ -27,8 +27,12 @@ manual_pwm_n_mutations <- function(seq, pssm_mat, threshold, direction = "above" } # Check if threshold already satisfied - if (direction == "above" && score >= threshold) return(0) - if (direction == "below" && score <= threshold) return(0) + if (direction == "above" && score >= threshold) { + return(0) + } + if (direction == "below" && score <= threshold) { + return(0) + } # Compute deficit deficit <- if (direction == "above") threshold - score else score - threshold @@ -58,7 +62,9 @@ manual_pwm_n_mutations <- function(seq, pssm_mat, threshold, direction = "above" } } - if (count == 0) return(NA_real_) + if (count == 0) { + return(NA_real_) + } return(count) } @@ -66,7 +72,9 @@ manual_pwm_n_mutations <- function(seq, pssm_mat, threshold, direction = "above" manual_pwm_n_mutations_scan <- function(seq, pssm_mat, threshold, direction = "above") { L <- nrow(pssm_mat) n <- nchar(seq) - if (n < L) return(NA_real_) + if (n < L) { + return(NA_real_) + } best <- NA_real_ for (start in 1:(n - L + 1)) {