diff --git a/DESCRIPTION b/DESCRIPTION index d5f3611d..40f4d1ec 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: misha Title: Toolkit for Analysis of Genomic Data -Version: 5.6.9 +Version: 5.6.10 Authors@R: c( person("Misha", "Hoichman", , "misha@hoichman.com", role = "aut"), person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 124ca7e2..c364389e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# misha 5.6.10 + +* Fixed `direction="below"` with `bidirect=TRUE` taking min across strands instead of max. A genomic substitution changes both strands, so disrupting a motif requires both strands to fall below the threshold. +* **Breaking**: removed the hidden default that set `score.min = score.thresh` when `direction = "below"`. `score.min` now defaults to NULL (no filter) for both directions. Set `score.min` explicitly to filter windows. +* Improved edit distance documentation: dedicated section with direction/strand semantics, parameter reference, and worked examples. + # misha 5.6.9 * Added `k` parameter to `gsynth.train()` to configure the Markov order (1-8, default 5). diff --git a/R/vtrack.R b/R/vtrack.R index ee40546e..36d62855 100644 --- a/R/vtrack.R +++ b/R/vtrack.R @@ -332,13 +332,6 @@ } } - # For direction="below", default score.min to score.thresh: windows already - # scoring below the threshold trivially need 0 edits and would dominate the - # minimum, making the result always 0. - if (direction == "below" && is.null(score.min)) { - score.min <- score.thresh - } - # Set params with processed values list( pssm = pssm, @@ -355,6 +348,89 @@ ) } +#' Validate and process PWM n_mutations function parameters +#' @noRd +.vtrack_params_pwm_n_mutations <- function(func, params, dots) { + if (!is.null(params)) { + if (!is.list(params) || !("pssm" %in% names(params))) { + stop("pwm.n_mutations requires a list with at least 'pssm' matrix parameter") + } + dots <- params + } + + if (!("pssm" %in% names(dots))) { + stop("pwm.n_mutations requires a 'pssm' matrix parameter") + } + if (!("score.thresh" %in% names(dots))) { + stop("pwm.n_mutations requires a 'score.thresh' parameter") + } + + pssm <- dots$pssm + score.thresh <- dots$score.thresh + score.min <- dots$score.min + score.max <- dots$score.max + bidirect <- if (!is.null(dots$bidirect)) dots$bidirect else TRUE + prior <- if (!is.null(dots$prior)) dots$prior else 0.01 + extend <- if (!is.null(dots$extend)) dots$extend else TRUE + strand <- if (!is.null(dots$strand)) dots$strand else 1 + direction <- if (!is.null(dots$direction)) dots$direction else "above" + + pssm <- .coerce_pssm_matrix( + pssm, + numeric_msg = "PSSM must be a numeric matrix or data frame with numeric columns", + ncol_msg = "PSSM must have columns named A, C, G, T", + colnames_msg = "PSSM must have columns named A, C, G, T" + ) + + if (!is.numeric(prior) || prior < 0 || prior > 1) { + stop("prior must be a number between 0 and 1") + } + + if (!is.logical(bidirect)) { + stop("bidirect must be TRUE or FALSE") + } + + if (!is.logical(extend) && !(is.numeric(extend) && length(extend) == 1 && extend == as.integer(extend) && extend >= 0)) { + stop("extend must be TRUE, FALSE, or a non-negative integer") + } + + if (strand != 1 && strand != -1) { + stop("strand must be 1 or -1") + } + + if (!direction %in% c("above", "below")) { + stop("direction must be 'above' or 'below'") + } + + if (!is.numeric(score.thresh) || length(score.thresh) != 1) { + stop("score.thresh must be a single numeric value") + } + + if (!is.null(score.min)) { + if (!is.numeric(score.min) || length(score.min) != 1) { + stop("score.min must be NULL or a single numeric value") + } + } + + if (!is.null(score.max)) { + if (!is.numeric(score.max) || length(score.max) != 1) { + stop("score.max must be NULL or a single numeric value") + } + } + + list( + pssm = pssm, + score.thresh = score.thresh, + score.min = score.min, + score.max = score.max, + bidirect = bidirect, + prior = prior, + extend = extend, + strand = strand, + direction = direction + ) +} + #' Validate and process PWM LSE edit distance function parameters #' @noRd .vtrack_params_pwm_edit_distance_lse <- function(func, params, dots) { @@ -433,12 +509,6 @@ } } - # For direction="below", default score.min to score.thresh (see comment - # in .vtrack_params_pwm_edit_distance for rationale). - if (direction == "below" && is.null(score.min)) { - score.min <- score.thresh - } - # Set params with processed values list( pssm = pssm, @@ -499,6 +569,7 @@ pwm.max.edit_distance = .vtrack_params_pwm_edit_distance, pwm.edit_distance.lse = .vtrack_params_pwm_edit_distance_lse, pwm.edit_distance.lse.pos = .vtrack_params_pwm_edit_distance_lse, + pwm.n_mutations = .vtrack_params_pwm_n_mutations, neighbor.count = .vtrack_params_neighbor_count ) @@ -508,7 +579,8 @@ "kmer.count", "kmer.frac", "masked.count", "masked.frac", "pwm.edit_distance", "pwm.edit_distance.pos", "pwm.max.edit_distance", - "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos" + "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos", + "pwm.n_mutations" ) #' Creates a new virtual track @@ -610,6 +682,12 @@ #' NULL (sequence) \tab pwm.max.edit_distance \tab pssm, score.thresh, max_edits, max_indels, score.min, score.max, direction, bidirect, prior, extend, strand \tab Edit distance at the best-scoring PWM window (same location as pwm.max/pwm.max.pos). \cr #' } #' +#' \strong{Mutation count summarizers} +#' \tabular{llll}{ +#' Source \tab func \tab Key params \tab Description \cr +#' NULL (sequence) \tab pwm.n_mutations \tab pssm, score.thresh, score.min, score.max, direction, bidirect, prior, extend, strand \tab Number of single-base substitutions that independently cross \code{score.thresh}. Returns 0 if threshold already satisfied, NA if no single edit suffices. \cr +#' } +#' #' \strong{LSE edit distance summarizers} #' \tabular{llll}{ #' Source \tab func \tab Key params \tab Description \cr @@ -644,12 +722,69 @@ #' \item \code{score.thresh}: Threshold for \code{pwm.count}. Anchors with log-likelihood >= \code{score.thresh} are counted; only one count per genomic start. #' \item Spatial weighting (\code{spat_factor}, \code{spat_bin}, \code{spat_min}, \code{spat_max}): optional position-dependent weights applied in log-space. Provide a positive numeric vector \code{spat_factor}; \code{spat_bin} (integer > 0) defines bin width; \code{spat_min}/\code{spat_max} restrict the scanning window. #' \item \code{pwm.max.pos}: Positions are reported 1-based relative to the final scan window (after iterator shifts and spatial trimming). Ties resolve to the most 5' anchor; the forward strand wins ties at the same coordinate. Values are signed when \code{bidirect = TRUE} (positive for forward, negative for reverse). -#' \item \code{score.thresh}: For edit distance functions, this is the PWM log-likelihood target that the algorithm tries to reach via substitutions. The edit distance is the minimum number of single-base changes needed for the window to achieve this score. -#' \item \code{max_edits}: For edit distance functions only. Optional positive integer setting the maximum search depth. If reaching the threshold requires more than \code{max_edits} substitutions, NA is returned. When NULL (default), exact computation is used. -#' \item \code{max_indels}: For \code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, and \code{pwm.max.edit_distance} only. Not supported by LSE variants. Optional non-negative integer specifying the maximum number of insertions and deletions allowed (default 0, substitutions only). When > 0, a banded Needleman-Wunsch DP is used to find the minimum total edits (substitutions + indels) to reach the score threshold. Typical values are 1-2. -#' \item \code{score.min}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is below \code{score.min} are skipped (edit distance returns NA). Improves performance by avoiding expensive computation on low-scoring windows. For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). When \code{direction = "below"} and \code{score.min} is NULL, it defaults to \code{score.thresh} — windows already scoring below the threshold trivially need 0 edits, so filtering them out gives the useful semantics of "how many edits to disrupt this motif match". Use \code{score.min = -Inf} to disable this. -#' \item \code{score.max}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is above \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, enables efficient regime-specific queries (e.g., only positions with score in [\code{score.min}, \code{score.max}]). For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). -#' \item \code{direction}: For edit distance functions only. Direction of the edit distance query: \code{"above"} (default) finds the minimum edits to bring the score above \code{score.thresh}; \code{"below"} finds the minimum edits to bring the score below \code{score.thresh}. +#' } +#' +#' \strong{Edit distance notes} +#' +#' The edit distance functions answer "how many single-base changes are needed to reach a target +#' PWM score?" Each genomic window (same length as the PSSM) gets an edit distance: the minimum +#' number of substitutions (and optionally indels) required to bring its PWM log-likelihood score +#' to the target threshold. The virtual track returns the minimum edit distance across all windows +#' in the iterator interval. +#' +#' \strong{Direction.} The \code{direction} parameter controls which side of the threshold to target: +#' \itemize{ +#' \item \code{"above"} (default): minimum edits to make the score \strong{reach or exceed} +#' \code{score.thresh}. Use this to ask "how close is this sequence to becoming a motif match?" +#' A window already scoring above the threshold needs 0 edits. +#' \item \code{"below"}: minimum edits to make the score \strong{fall below} +#' \code{score.thresh}. Use this to ask "how fragile is this motif match — how many mutations +#' would disrupt it?" A window already scoring below the threshold needs 0 edits. +#' } +#' +#' \strong{Strand handling differs by direction.} When \code{bidirect = TRUE}, both strands are +#' scanned at each genomic position. The way their edit distances are combined depends on +#' the direction: +#' \itemize{ +#' \item \code{direction = "above"}: takes the \strong{minimum} across both strands. A motif +#' match on \emph{either} strand is sufficient (e.g., for TF binding), so the easier strand +#' wins. +#' \item \code{direction = "below"}: takes the \strong{maximum} across both strands. A single +#' genomic substitution changes both strands simultaneously (the forward base and its reverse +#' complement). To truly disrupt a binding site you must bring \emph{both} strands below the +#' threshold, so the harder strand determines the answer. For example, if the forward strand +#' needs 3 edits to go below the threshold and the reverse needs 1, the result is 3 — after +#' just 1 edit the forward strand would still have a match. +#' } +#' Then, across all windows in the iterator interval, the minimum is reported (the position +#' where it is easiest to create or disrupt a match). +#' +#' \strong{Parameters:} +#' \itemize{ +#' \item \code{score.thresh}: The target PWM log-likelihood score that the edit distance +#' algorithm tries to reach. For \code{direction = "above"}, this is the score to reach +#' or exceed; for \code{direction = "below"}, this is the score to fall below. +#' \item \code{score.min}, \code{score.max}: Optional numeric pre-filters on the \strong{current} +#' (unedited) PWM score. Windows scoring outside the [\code{score.min}, \code{score.max}] +#' range are skipped (edit distance returns NA). Default NULL (no filter). +#' These are independent of \code{score.thresh} — they control \emph{which windows to +#' evaluate}, not the target score. +#' +#' \strong{Typical usage with \code{direction = "below"}}: use \code{score.min} to restrict +#' to windows that currently have a motif match. For example, with \code{score.thresh = -18} +#' and \code{score.min = -14}, only windows scoring >= -14 are considered, and the edit +#' distance tells you how many edits to push each one below -18. Without \code{score.min}, +#' windows already scoring below -18 will return 0 edits. +#' +#' For LSE variants, the filters apply to the aggregate LSE score, not individual windows. +#' \item \code{max_edits}: Optional positive integer. Cap the search depth: if reaching the +#' threshold requires more than \code{max_edits} edits, NA is returned for that window. +#' Default NULL (exact computation, no cap). +#' \item \code{max_indels}: Optional non-negative integer (default 0). When > 0, allows +#' insertions and deletions in addition to substitutions, using a banded Needleman-Wunsch DP. +#' Typical values are 1-2. Only supported by the non-LSE variants +#' (\code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, \code{pwm.max.edit_distance}). +#' \item \code{direction}: \code{"above"} (default) or \code{"below"}. See Direction above. #' } #' #' \strong{Spatial weighting} @@ -875,6 +1010,77 @@ #' spat_max = 450 #' ) #' gextract("window_spatial_pwm", gintervals(1, 0, 10000), iterator = 500) +#' +#' # --- Edit distance examples --- +#' # (Using the same 4-position PSSM defined above) +#' +#' # First, check the PWM score distribution to pick sensible thresholds +#' gvtrack.create("pwm_score", NULL, "pwm.max", pssm = pssm) +#' head(gextract("pwm_score", gintervals(1, 500, 520), iterator = 1)) +#' # Scores range from ~-8.3 (poor) to ~-0.8 (strong match) +#' +#' # --- direction = "above": how many edits to CREATE a motif match? --- +#' # "How many substitutions until this window scores >= -5?" +#' gvtrack.create("edist_above", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -5 +#' ) +#' # At 1bp: each position gets its own edit distance (0 if already matching) +#' head(gextract(c("pwm_score", "edist_above"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' +#' # --- direction = "below": how many edits to DISRUPT a motif match? --- +#' # +#' # score.thresh is the target: "push the score below this value" +#' # score.min is a pre-filter: "only consider windows currently scoring above this" +#' # These are independent — score.min controls WHICH windows to evaluate, +#' # score.thresh controls WHAT score to target. +#' # +#' # Example: among windows scoring >= -5 (strong matches), +#' # how many edits to push below -7 (disrupt the match)? +#' gvtrack.create("edist_below", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -7, score.min = -5, +#' direction = "below" +#' ) +#' head(gextract(c("pwm_score", "edist_above", "edist_below"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' # score >= -5 (e.g. -2.7): gets edit distance (2 edits to go below -7) +#' # score < -5 (e.g. -6.4): NA (filtered out by score.min) +#' +#' # Without score.min, ALL windows are evaluated — windows already +#' # below -7 return 0 edits: +#' gvtrack.create("edist_below_all", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -7, direction = "below" +#' ) +#' head(gextract(c("pwm_score", "edist_below", "edist_below_all"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' # score >= -5: same result as edist_below (2 edits) +#' # score between -7 and -5 (e.g. -6.4): edist_below=NA, edist_below_all=0 +#' # score < -7: both return 0 (already below threshold) +#' +#' # --- max_edits: cap the search depth --- +#' # Return NA if more than 2 substitutions are needed +#' gvtrack.create("edist_max2", NULL, "pwm.edit_distance", +#' pssm = pssm, score.thresh = -5, max_edits = 2 +#' ) +#' # Positions needing 3+ edits now return NA instead of 3 +#' head(gextract(c("pwm_score", "edist_above", "edist_max2"), +#' gintervals(1, 500, 520), +#' iterator = 1 +#' )) +#' +#' # --- Aggregation over intervals --- +#' # With a larger iterator, the minimum edit distance across the +#' # interval is returned. +#' gextract(c("pwm_score", "edist_above", "edist_below_all"), +#' gintervals(1, 0, 2000), +#' iterator = 200 +#' ) #' @seealso \code{\link{gvtrack.iterator}}, \code{\link{gvtrack.iterator.2d}}, \code{\link{gvtrack.filter}} #' @export gvtrack.create gvtrack.create <- function(vtrack = NULL, src = NULL, func = NULL, params = NULL, dim = NULL, sshift = NULL, eshift = NULL, filter = NULL, ...) { diff --git a/man/gvtrack.create.Rd b/man/gvtrack.create.Rd index 7d2827b1..9b8404f0 100644 --- a/man/gvtrack.create.Rd +++ b/man/gvtrack.create.Rd @@ -148,6 +148,12 @@ interval after all modifier adjustments. NULL (sequence) \tab pwm.max.edit_distance \tab pssm, score.thresh, max_edits, max_indels, score.min, score.max, direction, bidirect, prior, extend, strand \tab Edit distance at the best-scoring PWM window (same location as pwm.max/pwm.max.pos). \cr } +\strong{Mutation count summarizers} +\tabular{llll}{ + Source \tab func \tab Key params \tab Description \cr + NULL (sequence) \tab pwm.n_mutations \tab pssm, score.thresh, score.min, score.max, direction, bidirect, prior, extend, strand \tab Number of single-base substitutions that independently cross \code{score.thresh}. Returns 0 if threshold already satisfied, NA if no single edit suffices. \cr +} + \strong{LSE edit distance summarizers} \tabular{llll}{ Source \tab func \tab Key params \tab Description \cr @@ -182,12 +188,69 @@ The sections below provide additional notes for motif, interval, k-mer, and mask \item \code{score.thresh}: Threshold for \code{pwm.count}. Anchors with log-likelihood >= \code{score.thresh} are counted; only one count per genomic start. \item Spatial weighting (\code{spat_factor}, \code{spat_bin}, \code{spat_min}, \code{spat_max}): optional position-dependent weights applied in log-space. Provide a positive numeric vector \code{spat_factor}; \code{spat_bin} (integer > 0) defines bin width; \code{spat_min}/\code{spat_max} restrict the scanning window. \item \code{pwm.max.pos}: Positions are reported 1-based relative to the final scan window (after iterator shifts and spatial trimming). Ties resolve to the most 5' anchor; the forward strand wins ties at the same coordinate. Values are signed when \code{bidirect = TRUE} (positive for forward, negative for reverse). - \item \code{score.thresh}: For edit distance functions, this is the PWM log-likelihood target that the algorithm tries to reach via substitutions. The edit distance is the minimum number of single-base changes needed for the window to achieve this score. - \item \code{max_edits}: For edit distance functions only. Optional positive integer setting the maximum search depth. If reaching the threshold requires more than \code{max_edits} substitutions, NA is returned. When NULL (default), exact computation is used. - \item \code{max_indels}: For \code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, and \code{pwm.max.edit_distance} only. Not supported by LSE variants. Optional non-negative integer specifying the maximum number of insertions and deletions allowed (default 0, substitutions only). When > 0, a banded Needleman-Wunsch DP is used to find the minimum total edits (substitutions + indels) to reach the score threshold. Typical values are 1-2. - \item \code{score.min}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is below \code{score.min} are skipped (edit distance returns NA). Improves performance by avoiding expensive computation on low-scoring windows. For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). When \code{direction = "below"} and \code{score.min} is NULL, it defaults to \code{score.thresh} — windows already scoring below the threshold trivially need 0 edits, so filtering them out gives the useful semantics of "how many edits to disrupt this motif match". Use \code{score.min = -Inf} to disable this. - \item \code{score.max}: For edit distance functions only. Optional numeric filter. Windows whose PWM log-likelihood is above \code{score.max} are skipped (edit distance returns NA). Combined with \code{score.min}, enables efficient regime-specific queries (e.g., only positions with score in [\code{score.min}, \code{score.max}]). For LSE variants, the filter applies to the aggregate LSE score across all windows rather than individual window scores. Default NULL (no filter). - \item \code{direction}: For edit distance functions only. Direction of the edit distance query: \code{"above"} (default) finds the minimum edits to bring the score above \code{score.thresh}; \code{"below"} finds the minimum edits to bring the score below \code{score.thresh}. +} + +\strong{Edit distance notes} + +The edit distance functions answer "how many single-base changes are needed to reach a target +PWM score?" Each genomic window (same length as the PSSM) gets an edit distance: the minimum +number of substitutions (and optionally indels) required to bring its PWM log-likelihood score +to the target threshold. The virtual track returns the minimum edit distance across all windows +in the iterator interval. + +\strong{Direction.} The \code{direction} parameter controls which side of the threshold to target: +\itemize{ + \item \code{"above"} (default): minimum edits to make the score \strong{reach or exceed} + \code{score.thresh}. Use this to ask "how close is this sequence to becoming a motif match?" + A window already scoring above the threshold needs 0 edits. + \item \code{"below"}: minimum edits to make the score \strong{fall below} + \code{score.thresh}. Use this to ask "how fragile is this motif match — how many mutations + would disrupt it?" A window already scoring below the threshold needs 0 edits. +} + +\strong{Strand handling differs by direction.} When \code{bidirect = TRUE}, both strands are +scanned at each genomic position. The way their edit distances are combined depends on +the direction: +\itemize{ + \item \code{direction = "above"}: takes the \strong{minimum} across both strands. A motif + match on \emph{either} strand is sufficient (e.g., for TF binding), so the easier strand + wins. + \item \code{direction = "below"}: takes the \strong{maximum} across both strands. A single + genomic substitution changes both strands simultaneously (the forward base and its reverse + complement). To truly disrupt a binding site you must bring \emph{both} strands below the + threshold, so the harder strand determines the answer. For example, if the forward strand + needs 3 edits to go below the threshold and the reverse needs 1, the result is 3 — after + just 1 edit the forward strand would still have a match. +} +Then, across all windows in the iterator interval, the minimum is reported (the position +where it is easiest to create or disrupt a match). + +\strong{Parameters:} +\itemize{ + \item \code{score.thresh}: The target PWM log-likelihood score that the edit distance + algorithm tries to reach. For \code{direction = "above"}, this is the score to reach + or exceed; for \code{direction = "below"}, this is the score to fall below. + \item \code{score.min}, \code{score.max}: Optional numeric pre-filters on the \strong{current} + (unedited) PWM score. Windows scoring outside the [\code{score.min}, \code{score.max}] + range are skipped (edit distance returns NA). Default NULL (no filter). + These are independent of \code{score.thresh} — they control \emph{which windows to + evaluate}, not the target score. + + \strong{Typical usage with \code{direction = "below"}}: use \code{score.min} to restrict + to windows that currently have a motif match. For example, with \code{score.thresh = -18} + and \code{score.min = -14}, only windows scoring >= -14 are considered, and the edit + distance tells you how many edits to push each one below -18. Without \code{score.min}, + windows already scoring below -18 will return 0 edits. + + For LSE variants, the filters apply to the aggregate LSE score, not individual windows. + \item \code{max_edits}: Optional positive integer. Cap the search depth: if reaching the + threshold requires more than \code{max_edits} edits, NA is returned for that window. + Default NULL (exact computation, no cap). + \item \code{max_indels}: Optional non-negative integer (default 0). When > 0, allows + insertions and deletions in addition to substitutions, using a banded Needleman-Wunsch DP. + Typical values are 1-2. Only supported by the non-LSE variants + (\code{pwm.edit_distance}, \code{pwm.edit_distance.pos}, \code{pwm.max.edit_distance}). + \item \code{direction}: \code{"above"} (default) or \code{"below"}. See Direction above. } \strong{Spatial weighting} @@ -398,6 +461,77 @@ gvtrack.create( spat_max = 450 ) gextract("window_spatial_pwm", gintervals(1, 0, 10000), iterator = 500) + +# --- Edit distance examples --- +# (Using the same 4-position PSSM defined above) + +# First, check the PWM score distribution to pick sensible thresholds +gvtrack.create("pwm_score", NULL, "pwm.max", pssm = pssm) +head(gextract("pwm_score", gintervals(1, 500, 520), iterator = 1)) +# Scores range from ~-8.3 (poor) to ~-0.8 (strong match) + +# --- direction = "above": how many edits to CREATE a motif match? --- +# "How many substitutions until this window scores >= -5?" +gvtrack.create("edist_above", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -5 +) +# At 1bp: each position gets its own edit distance (0 if already matching) +head(gextract(c("pwm_score", "edist_above"), + gintervals(1, 500, 520), + iterator = 1 +)) + +# --- direction = "below": how many edits to DISRUPT a motif match? --- +# +# score.thresh is the target: "push the score below this value" +# score.min is a pre-filter: "only consider windows currently scoring above this" +# These are independent — score.min controls WHICH windows to evaluate, +# score.thresh controls WHAT score to target. +# +# Example: among windows scoring >= -5 (strong matches), +# how many edits to push below -7 (disrupt the match)? +gvtrack.create("edist_below", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -7, score.min = -5, + direction = "below" +) +head(gextract(c("pwm_score", "edist_above", "edist_below"), + gintervals(1, 500, 520), + iterator = 1 +)) +# score >= -5 (e.g. -2.7): gets edit distance (2 edits to go below -7) +# score < -5 (e.g. -6.4): NA (filtered out by score.min) + +# Without score.min, ALL windows are evaluated — windows already +# below -7 return 0 edits: +gvtrack.create("edist_below_all", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -7, direction = "below" +) +head(gextract(c("pwm_score", "edist_below", "edist_below_all"), + gintervals(1, 500, 520), + iterator = 1 +)) +# score >= -5: same result as edist_below (2 edits) +# score between -7 and -5 (e.g. -6.4): edist_below=NA, edist_below_all=0 +# score < -7: both return 0 (already below threshold) + +# --- max_edits: cap the search depth --- +# Return NA if more than 2 substitutions are needed +gvtrack.create("edist_max2", NULL, "pwm.edit_distance", + pssm = pssm, score.thresh = -5, max_edits = 2 +) +# Positions needing 3+ edits now return NA instead of 3 +head(gextract(c("pwm_score", "edist_above", "edist_max2"), + gintervals(1, 500, 520), + iterator = 1 +)) + +# --- Aggregation over intervals --- +# With a larger iterator, the minimum edit distance across the +# interval is returned. +gextract(c("pwm_score", "edist_above", "edist_below_all"), + gintervals(1, 0, 2000), + iterator = 200 +) } \seealso{ \code{\link{gvtrack.info}}, \code{\link{gvtrack.iterator}}, diff --git a/src/PWMEditDistanceScorer.cpp b/src/PWMEditDistanceScorer.cpp index 2a0337a9..2dce00b1 100644 --- a/src/PWMEditDistanceScorer.cpp +++ b/src/PWMEditDistanceScorer.cpp @@ -326,6 +326,8 @@ float PWMEditDistanceScorer::score_interval(const GInterval& interval, return metrics.min_edits_position; case Mode::PWM_MAX_EDITS: return metrics.best_pwm_edits; + case Mode::N_MUTATIONS: + return metrics.n_mutations; default: return std::numeric_limits::quiet_NaN(); } @@ -702,6 +704,51 @@ float PWMEditDistanceScorer::compute_window_edits(const int* bidx, int seq_avail return compute_heuristic(bidx, reverse, m_max_edits); } +float PWMEditDistanceScorer::compute_n_mutations(const int* bidx, bool reverse) { + int L = m_pssm.length(); + + // Compute current window score using the flat lookup table + double score = 0; + for (int i = 0; i < L; i++) { + int col = reverse ? (L - 1 - i) : i; + int base = bidx[i]; + if (base >= 4) { + score += m_score_table[col][4]; + } else { + score += m_score_table[col][base]; + } + } + + // If threshold already satisfied, return 0 + if (threshold_satisfied(score)) return 0.0f; + + // Compute deficit + double deficit = compute_deficit(score); + + // Count single-base substitutions that independently cross threshold + int count = 0; + for (int i = 0; i < L; i++) { + int col = reverse ? (L - 1 - i) : i; + int current_base = bidx[i]; + if (current_base >= 4) continue; // skip N bases + + for (int b = 0; b < 4; b++) { + if (b == current_base) continue; + + double delta; + if (is_below()) { + delta = m_score_table[col][current_base] - m_score_table[col][b]; + } else { + delta = m_score_table[col][b] - m_score_table[col][current_base]; + } + + if (delta >= deficit) count++; + } + } + + return (count > 0) ? static_cast(count) : std::numeric_limits::quiet_NaN(); +} + PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const std::string& seq, size_t interval_length) { @@ -715,7 +762,8 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const const bool scan_forward = should_scan_forward(); const bool scan_reverse = should_scan_reverse(); - const bool need_min = (m_mode != Mode::PWM_MAX_EDITS); + const bool need_n_mutations = (m_mode == Mode::N_MUTATIONS); + const bool need_min = (m_mode != Mode::PWM_MAX_EDITS && !need_n_mutations); const bool need_min_pos = (m_mode == Mode::MIN_EDITS_POSITION); const bool need_pwm = (m_mode == Mode::PWM_MAX_EDITS); const bool has_score_min = !std::isnan(m_score_min); @@ -723,6 +771,13 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const const bool has_score_filter = has_score_min || has_score_max; const bool apply_score_filter = has_score_filter && (m_max_indels == 0); + // For direction=BELOW with bidirectional scanning: a genomic substitution + // affects both strands, so we need enough edits to bring the *harder* strand + // below the threshold. Take max across strands at each position, then min + // across positions. (For ABOVE, min across strands is correct: any one strand + // reaching the threshold suffices.) + const bool below_bidirect = is_below() && scan_forward && scan_reverse && (need_min || need_n_mutations); + const size_t max_start = std::min(interval_length, target_length - motif_length + 1); if (max_start == 0 || (!scan_forward && !scan_reverse)) { return metrics; @@ -785,6 +840,16 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const } }; + auto maybe_update_n_mutations = [&](float nmut) { + if (!need_n_mutations || std::isnan(nmut)) { + return; + } + // Track maximum n_mutations across qualifying windows + if (std::isnan(metrics.n_mutations) || nmut > metrics.n_mutations) { + metrics.n_mutations = nmut; + } + }; + for (size_t offset = 0; offset < max_start; ++offset) { // Sliding N-count maintenance (for offset > 0, slide the window by 1) if (use_n_skip && offset > 0) { @@ -808,91 +873,215 @@ PWMEditDistanceScorer::ScanMetrics PWMEditDistanceScorer::evaluate_windows(const // directly, avoiding two function calls (compute_window_edits → passes_prefilter) // for the 90%+ of windows that get rejected. For subs-only, shift is always 0, // so the inner loop is tight: just B bidx lookups + 1 viable table read per block. - if (scan_forward) { + if (below_bidirect) { + // direction=BELOW + bidirectional: compute both strands, take max + // (prefilter is always off for BELOW, so no pigeonhole check needed) if (need_min) { - bool pass_score_filter = true; - if (apply_score_filter) { - float logp = compute_window_pwm_score(window_start, /*reverse=*/false); - if (has_score_min && logp < m_score_min) pass_score_filter = false; - if (has_score_max && logp > m_score_max) pass_score_filter = false; + float fwd_edits = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + fwd_edits = compute_window_edits(fwd_bidx.data() + offset, seq_avail, /*reverse=*/false); + } } - if (pass_score_filter) { - const int* bidx = fwd_bidx.data() + offset; - bool prefilter_pass = true; - if (m_use_prefilter && m_max_indels == 0) { - prefilter_pass = false; - for (const auto& block : m_prefilter_blocks) { - int block_len = (int)block.columns.size(); - int hash = 0; - bool valid = true; - for (int j = 0; j < block_len; j++) { - int b = bidx[block.columns[j]]; - if (b >= 4) { valid = false; break; } - hash += b << (2 * j); - } - if (valid && block.viable[hash]) { - prefilter_pass = true; - break; - } - } + float rev_edits = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; } - if (prefilter_pass) { - float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/false); - maybe_update_min(edits, offset, +1); + if (pass_score_filter) { + rev_edits = compute_window_edits(rev_bidx.data() + offset, seq_avail, /*reverse=*/true); } } + + // Combine: max of non-NaN values (NaN = strand filtered out / no match) + bool have_fwd = !std::isnan(fwd_edits); + bool have_rev = !std::isnan(rev_edits); + if (have_fwd || have_rev) { + float combined; + int combined_dir; + if (have_fwd && have_rev) { + // Both strands have matches — take the harder one to disrupt + if (fwd_edits >= rev_edits) { + combined = fwd_edits; + combined_dir = +1; + } else { + combined = rev_edits; + combined_dir = -1; + } + } else if (have_fwd) { + combined = fwd_edits; + combined_dir = +1; + } else { + combined = rev_edits; + combined_dir = -1; + } + maybe_update_min(combined, offset, combined_dir); + } } - if (need_pwm) { - float logp = 0.0f; - std::string::const_iterator it = seq.begin() + offset; - m_pssm.calc_like(it, logp); - maybe_update_pwm(logp, offset, +1); - } - } + if (need_n_mutations) { + // N_MUTATIONS + BELOW + bidirect: compute both strands, take max across strands + float fwd_nmut = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + fwd_nmut = compute_n_mutations(fwd_bidx.data() + offset, /*reverse=*/false); + } + } - if (scan_reverse) { - if (need_min) { - bool pass_score_filter = true; - if (apply_score_filter) { - float logp = compute_window_pwm_score(window_start, /*reverse=*/true); - if (has_score_min && logp < m_score_min) pass_score_filter = false; - if (has_score_max && logp > m_score_max) pass_score_filter = false; + float rev_nmut = std::numeric_limits::quiet_NaN(); + { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + rev_nmut = compute_n_mutations(rev_bidx.data() + offset, /*reverse=*/true); + } } - if (pass_score_filter) { - const int* bidx = rev_bidx.data() + offset; - bool prefilter_pass = true; - if (m_use_prefilter && m_max_indels == 0) { - prefilter_pass = false; - const int L_val = static_cast(motif_length); - for (const auto& block : m_prefilter_blocks) { - int block_len = (int)block.columns.size(); - int hash = 0; - bool valid = true; - for (int j = 0; j < block_len; j++) { - int seq_idx = L_val - 1 - block.columns[j]; - int b = bidx[seq_idx]; - if (b >= 4) { valid = false; break; } - hash += b << (2 * j); - } - if (valid && block.viable[hash]) { - prefilter_pass = true; - break; + // Combine: max of non-NaN values across strands + bool have_fwd = !std::isnan(fwd_nmut); + bool have_rev = !std::isnan(rev_nmut); + if (have_fwd || have_rev) { + float combined; + if (have_fwd && have_rev) { + combined = std::max(fwd_nmut, rev_nmut); + } else if (have_fwd) { + combined = fwd_nmut; + } else { + combined = rev_nmut; + } + maybe_update_n_mutations(combined); + } + } + } else if (scan_forward || scan_reverse) { + if (scan_forward) { + if (need_min) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + + if (pass_score_filter) { + const int* bidx = fwd_bidx.data() + offset; + bool prefilter_pass = true; + if (m_use_prefilter && m_max_indels == 0) { + prefilter_pass = false; + for (const auto& block : m_prefilter_blocks) { + int block_len = (int)block.columns.size(); + int hash = 0; + bool valid = true; + for (int j = 0; j < block_len; j++) { + int b = bidx[block.columns[j]]; + if (b >= 4) { valid = false; break; } + hash += b << (2 * j); + } + if (valid && block.viable[hash]) { + prefilter_pass = true; + break; + } } } + if (prefilter_pass) { + float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/false); + maybe_update_min(edits, offset, +1); + } + } + } + if (need_n_mutations) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/false); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; } - if (prefilter_pass) { - float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/true); - maybe_update_min(edits, offset, -1); + if (pass_score_filter) { + float nmut = compute_n_mutations(fwd_bidx.data() + offset, /*reverse=*/false); + maybe_update_n_mutations(nmut); } } + if (need_pwm) { + float logp = 0.0f; + std::string::const_iterator it = seq.begin() + offset; + m_pssm.calc_like(it, logp); + maybe_update_pwm(logp, offset, +1); + } } - if (need_pwm) { - float logp_rc = 0.0f; - std::string::const_iterator it2 = seq.begin() + offset; - m_pssm.calc_like_rc(it2, logp_rc); - maybe_update_pwm(logp_rc, offset, -1); + + if (scan_reverse) { + if (need_min) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + + if (pass_score_filter) { + const int* bidx = rev_bidx.data() + offset; + bool prefilter_pass = true; + if (m_use_prefilter && m_max_indels == 0) { + prefilter_pass = false; + const int L_val = static_cast(motif_length); + for (const auto& block : m_prefilter_blocks) { + int block_len = (int)block.columns.size(); + int hash = 0; + bool valid = true; + for (int j = 0; j < block_len; j++) { + int seq_idx = L_val - 1 - block.columns[j]; + int b = bidx[seq_idx]; + if (b >= 4) { valid = false; break; } + hash += b << (2 * j); + } + if (valid && block.viable[hash]) { + prefilter_pass = true; + break; + } + } + } + if (prefilter_pass) { + float edits = compute_window_edits(bidx, seq_avail, /*reverse=*/true); + maybe_update_min(edits, offset, -1); + } + } + } + if (need_n_mutations) { + bool pass_score_filter = true; + if (apply_score_filter) { + float logp = compute_window_pwm_score(window_start, /*reverse=*/true); + if (has_score_min && logp < m_score_min) pass_score_filter = false; + if (has_score_max && logp > m_score_max) pass_score_filter = false; + } + if (pass_score_filter) { + float nmut = compute_n_mutations(rev_bidx.data() + offset, /*reverse=*/true); + maybe_update_n_mutations(nmut); + } + } + if (need_pwm) { + float logp_rc = 0.0f; + std::string::const_iterator it2 = seq.begin() + offset; + m_pssm.calc_like_rc(it2, logp_rc); + maybe_update_pwm(logp_rc, offset, -1); + } } } } diff --git a/src/PWMEditDistanceScorer.h b/src/PWMEditDistanceScorer.h index 6d426b66..66cccdfe 100644 --- a/src/PWMEditDistanceScorer.h +++ b/src/PWMEditDistanceScorer.h @@ -36,7 +36,8 @@ class PWMEditDistanceScorer : public GenomeSeqScorer enum class Mode { MIN_EDITS, MIN_EDITS_POSITION, - PWM_MAX_EDITS + PWM_MAX_EDITS, + N_MUTATIONS }; enum class Direction { @@ -82,6 +83,7 @@ class PWMEditDistanceScorer : public GenomeSeqScorer float get_last_pwm_max_logp() const { return m_last_metrics.best_pwm_logp; } float get_last_pwm_max_edits() const { return m_last_metrics.best_pwm_edits; } float get_last_pwm_max_pos() const { return m_last_metrics.best_pwm_position; } + float get_last_n_mutations() const { return m_last_metrics.n_mutations; } int get_max_indels() const { return m_max_indels; } private: @@ -96,6 +98,8 @@ class PWMEditDistanceScorer : public GenomeSeqScorer float best_pwm_position = std::numeric_limits::quiet_NaN(); size_t best_pwm_index = 0; int best_pwm_direction = 1; + + float n_mutations = std::numeric_limits::quiet_NaN(); }; DnaPSSM m_pssm; @@ -210,6 +214,7 @@ class PWMEditDistanceScorer : public GenomeSeqScorer inline bool should_scan_reverse() const; inline char complement_base(char base) const; inline float compute_window_edits(const int* bidx, int seq_avail, bool reverse); + float compute_n_mutations(const int* bidx, bool reverse); /** * Get the effective PSSM score and gain for a motif position aligned with a sequence base. diff --git a/src/SequenceVarProcessor.cpp b/src/SequenceVarProcessor.cpp index eba2320a..d3aeed14 100644 --- a/src/SequenceVarProcessor.cpp +++ b/src/SequenceVarProcessor.cpp @@ -25,7 +25,8 @@ namespace { enum class EditDistAggMode { MIN_EDITS, // PWM_EDIT_DISTANCE, PWM_EDIT_DISTANCE_LSE MIN_EDITS_POS, // PWM_EDIT_DISTANCE_POS, PWM_EDIT_DISTANCE_LSE_POS - MAX_EDITS // PWM_MAX_EDIT_DISTANCE + MAX_EDITS, // PWM_MAX_EDIT_DISTANCE + MAX_SCORE // PWM_N_MUTATIONS (take maximum score across parts) }; // Per-part scoring result for edit distance aggregation @@ -44,6 +45,8 @@ EditDistAggMode get_agg_mode(TrackExpressionVars::Track_var::Val_func func) { return EditDistAggMode::MIN_EDITS_POS; case TrackExpressionVars::Track_var::PWM_MAX_EDIT_DISTANCE: return EditDistAggMode::MAX_EDITS; + case TrackExpressionVars::Track_var::PWM_N_MUTATIONS: + return EditDistAggMode::MAX_SCORE; default: return EditDistAggMode::MIN_EDITS; } @@ -96,6 +99,16 @@ double aggregate_edit_distance_parts( } return found ? best_edits : numeric_limits::quiet_NaN(); } + case EditDistAggMode::MAX_SCORE: { + // Take maximum score across parts (for N_MUTATIONS) + double max_score = numeric_limits::quiet_NaN(); + for (const auto &r : parts) { + if (std::isnan(max_score) || (!std::isnan(r.score) && r.score > max_score)) { + max_score = r.score; + } + } + return max_score; + } } return numeric_limits::quiet_NaN(); // unreachable } diff --git a/src/TrackExpressionVars.cpp b/src/TrackExpressionVars.cpp index 91e3e5d2..ded1e8e7 100644 --- a/src/TrackExpressionVars.cpp +++ b/src/TrackExpressionVars.cpp @@ -63,7 +63,8 @@ const char *TrackExpressionVars::Track_var::FUNC_NAMES[TrackExpressionVars::Trac "exists", "size", "sample", "sample.pos.abs", "sample.pos.relative", "first", "first.pos.abs", "first.pos.relative", "last", "last.pos.abs", "last.pos.relative", "pwm.edit_distance", "pwm.edit_distance.pos", "pwm.max.edit_distance", - "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos"}; + "pwm.edit_distance.lse", "pwm.edit_distance.lse.pos", + "pwm.n_mutations"}; const char *TrackExpressionVars::Interv_var::FUNC_NAMES[TrackExpressionVars::Interv_var::NUM_FUNCS] = { "distance", "distance.center", "distance.edge", "coverage", "neighbor.count" }; @@ -790,6 +791,84 @@ void TrackExpressionVars::add_vtrack_var(const string &vtrack, SEXP rvtrack) var.percentile = numeric_limits::quiet_NaN(); var.requires_pv = false; + // Attach filter if present + attach_filter_to_var(rvtrack, vtrack, var); + return; + } else if (func == "pwm.n_mutations") { + // N_MUTATIONS mode: count single-base substitutions that independently cross threshold + m_track_vars.push_back(Track_var()); + Track_var &var = m_track_vars.back(); + var.var_name = vtrack; + var.val_func = Track_var::PWM_N_MUTATIONS; + var.track_n_imdf = nullptr; + var.seq_imdf1d = nullptr; + + SEXP rparams = get_rvector_col(rvtrack, "params", vtrack.c_str(), false); + + // Parse PWM parameters using helper struct + TrackExprParams::PWMParams pwm_params = TrackExprParams::PWMParams::parse(rparams, vtrack); + + // Extract score.min parameter (optional, default NaN = no filter) + float score_min = std::numeric_limits::quiet_NaN(); + if (Rf_isNewList(rparams)) { + int sm_idx = findListElementIndex(rparams, "score.min"); + if (sm_idx >= 0) { + SEXP rscore_min = VECTOR_ELT(rparams, sm_idx); + if (!Rf_isNull(rscore_min)) { + if (Rf_isReal(rscore_min) && Rf_length(rscore_min) == 1) { + score_min = static_cast(REAL(rscore_min)[0]); + } else if (Rf_isInteger(rscore_min) && Rf_length(rscore_min) == 1) { + score_min = static_cast(INTEGER(rscore_min)[0]); + } else { + verror("score.min parameter must be NULL or a single numeric value for vtrack %s", vtrack.c_str()); + } + } + } + } + + // Extract score.max parameter (optional, default NaN = no filter) + float score_max = std::numeric_limits::quiet_NaN(); + if (Rf_isNewList(rparams)) { + int smx_idx = findListElementIndex(rparams, "score.max"); + if (smx_idx >= 0) { + SEXP rscore_max = VECTOR_ELT(rparams, smx_idx); + if (!Rf_isNull(rscore_max)) { + if (Rf_isReal(rscore_max) && Rf_length(rscore_max) == 1) { + score_max = static_cast(REAL(rscore_max)[0]); + } else if (Rf_isInteger(rscore_max) && Rf_length(rscore_max) == 1) { + score_max = static_cast(INTEGER(rscore_max)[0]); + } else { + verror("score.max parameter must be NULL or a single numeric value for vtrack %s", vtrack.c_str()); + } + } + } + } + + auto direction = parse_direction_param(rparams, vtrack); + + // Construct scorer with N_MUTATIONS mode, no max_edits or max_indels + var.pwm_edit_distance_scorer = std::make_unique( + pwm_params.core.pssm, + &m_shared_seqfetch, + static_cast(pwm_params.core.score_thresh), + -1, // max_edits: not applicable for n_mutations + pwm_params.extend_flag, + static_cast(pwm_params.core.strand_mode), + PWMEditDistanceScorer::Mode::N_MUTATIONS, + score_min, + 0, // max_indels: not applicable for n_mutations + score_max, + direction + ); + + // Parse optional iterator modifier + Iterator_modifier1D imdf1d; + parse_imdf(rvtrack, vtrack, &imdf1d, NULL); + var.seq_imdf1d = add_imdf(imdf1d); + + var.percentile = numeric_limits::quiet_NaN(); + var.requires_pv = false; + // Attach filter if present attach_filter_to_var(rvtrack, vtrack, var); return; diff --git a/src/TrackExpressionVars.h b/src/TrackExpressionVars.h index 7d2136d7..4dee978f 100644 --- a/src/TrackExpressionVars.h +++ b/src/TrackExpressionVars.h @@ -157,6 +157,7 @@ class TrackExpressionVars { PWM_MAX_EDIT_DISTANCE, PWM_EDIT_DISTANCE_LSE, PWM_EDIT_DISTANCE_LSE_POS, + PWM_N_MUTATIONS, NUM_FUNCS }; @@ -410,7 +411,8 @@ inline bool TrackExpressionVars::is_seq_variable(unsigned ivar) const { m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_POS || m_track_vars[ivar].val_func == Track_var::PWM_MAX_EDIT_DISTANCE || m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_LSE || - m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_LSE_POS; + m_track_vars[ivar].val_func == Track_var::PWM_EDIT_DISTANCE_LSE_POS || + m_track_vars[ivar].val_func == Track_var::PWM_N_MUTATIONS; } // Helper methods to check variable function types @@ -423,7 +425,8 @@ inline bool TrackExpressionVars::is_sequence_based_function(Track_var::Val_func func == Track_var::PWM_EDIT_DISTANCE_POS || func == Track_var::PWM_MAX_EDIT_DISTANCE || func == Track_var::PWM_EDIT_DISTANCE_LSE || - func == Track_var::PWM_EDIT_DISTANCE_LSE_POS; + func == Track_var::PWM_EDIT_DISTANCE_LSE_POS || + func == Track_var::PWM_N_MUTATIONS; } inline bool TrackExpressionVars::is_pwm_function(Track_var::Val_func func) { @@ -436,7 +439,8 @@ inline bool TrackExpressionVars::is_pwm_edit_distance_function(Track_var::Val_fu func == Track_var::PWM_EDIT_DISTANCE_POS || func == Track_var::PWM_MAX_EDIT_DISTANCE || func == Track_var::PWM_EDIT_DISTANCE_LSE || - func == Track_var::PWM_EDIT_DISTANCE_LSE_POS; + func == Track_var::PWM_EDIT_DISTANCE_LSE_POS || + func == Track_var::PWM_N_MUTATIONS; } inline bool TrackExpressionVars::is_pwm_lse_edit_distance_function(Track_var::Val_func func) { diff --git a/tests/testthat/test-pwm-edit-distance-below.R b/tests/testthat/test-pwm-edit-distance-below.R index 27764a64..650f461d 100644 --- a/tests/testthat/test-pwm-edit-distance-below.R +++ b/tests/testthat/test-pwm-edit-distance-below.R @@ -226,13 +226,15 @@ test_that("pwm.edit_distance direction=below bidirectional considers both strand iterator = test_interval ) - # Bidirectional should return minimum of both strands + # For direction=below, bidirectional takes max across strands at each + # position (need to disrupt *both* strands), then min across positions. + # This gives bidi >= max(fwd, rev) in general. fwd <- result$edist_below_fwd[1] rev <- result$edist_below_rev[1] bidi <- result$edist_below_bidi[1] if (!is.na(fwd) && !is.na(rev)) { - expect_equal(bidi, min(fwd, rev), tolerance = 1e-6) + expect_true(bidi >= max(fwd, rev) - 1e-6) } else if (!is.na(fwd)) { expect_equal(bidi, fwd, tolerance = 1e-6) } else if (!is.na(rev)) { @@ -240,6 +242,65 @@ test_that("pwm.edit_distance direction=below bidirectional considers both strand } }) +test_that("pwm.edit_distance direction=below bidirectional takes max across strands per position", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + # Use a wider interval with 1bp iterator to check per-position behavior + test_interval <- gintervals(1, 200, 260) + threshold <- -5.0 + + gvtrack.create("edist_below_fwd_1bp", NULL, + func = "pwm.edit_distance", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = 1, extend = FALSE, prior = 0 + ) + gvtrack.create("edist_below_rev_1bp", NULL, + func = "pwm.edit_distance", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = -1, extend = FALSE, prior = 0 + ) + gvtrack.create("edist_below_bidi_1bp", NULL, + func = "pwm.edit_distance", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = TRUE, extend = FALSE, prior = 0 + ) + + result <- gextract( + c("edist_below_fwd_1bp", "edist_below_rev_1bp", "edist_below_bidi_1bp"), + test_interval, + iterator = 1 + ) + + # At each 1bp position: bidi should be max of fwd and rev (per-position) + # when both are non-NA, and equal to the non-NA one when only one is present + for (i in seq_len(nrow(result))) { + fwd <- result$edist_below_fwd_1bp[i] + rev <- result$edist_below_rev_1bp[i] + bidi <- result$edist_below_bidi_1bp[i] + + if (!is.na(fwd) && !is.na(rev)) { + expect_equal(bidi, max(fwd, rev), + tolerance = 1e-6, + info = paste("position", i, "fwd=", fwd, "rev=", rev, "bidi=", bidi) + ) + } else if (!is.na(fwd)) { + expect_equal(bidi, fwd, tolerance = 1e-6) + } else if (!is.na(rev)) { + expect_equal(bidi, rev, tolerance = 1e-6) + } else { + expect_true(is.na(bidi)) + } + } +}) + test_that("pwm.edit_distance.pos and pwm.max.edit_distance work with direction=below", { remove_all_vtracks() @@ -341,7 +402,7 @@ test_that("pwm.edit_distance direction=below with score.min/score.max filtering" test_interval <- gintervals(1, 200, 240) threshold <- -5.0 - # Default (no score.min) — for direction="below", defaults to score.thresh + # Default (no score.min) — no filtering, same as score.min = -Inf gvtrack.create("edist_below_default", NULL, func = "pwm.edit_distance", pssm = pssm, score.thresh = threshold, @@ -349,20 +410,20 @@ test_that("pwm.edit_distance direction=below with score.min/score.max filtering" bidirect = FALSE, extend = FALSE, prior = 0 ) - # Explicit score.min = score.thresh — should match the default - gvtrack.create("edist_below_explicit_thresh", NULL, + # Explicit score.min = -Inf — should match the default + gvtrack.create("edist_below_no_filter", NULL, func = "pwm.edit_distance", pssm = pssm, score.thresh = threshold, - score.min = threshold, + score.min = -Inf, direction = "below", bidirect = FALSE, extend = FALSE, prior = 0 ) - # With -Inf score.min (disables filtering, old behavior) - gvtrack.create("edist_below_lowfilt", NULL, + # Explicit score.min = score.thresh — filters windows already below threshold + gvtrack.create("edist_below_filtered", NULL, func = "pwm.edit_distance", pssm = pssm, score.thresh = threshold, - score.min = -Inf, + score.min = threshold, direction = "below", bidirect = FALSE, extend = FALSE, prior = 0 ) @@ -377,13 +438,13 @@ test_that("pwm.edit_distance direction=below with score.min/score.max filtering" ) result <- gextract( - c("edist_below_default", "edist_below_explicit_thresh", "edist_below_lowfilt", "edist_below_highfilt"), + c("edist_below_default", "edist_below_no_filter", "edist_below_filtered", "edist_below_highfilt"), test_interval, iterator = test_interval ) - # Default should match explicit score.min = score.thresh - expect_equal(result$edist_below_default[1], result$edist_below_explicit_thresh[1], tolerance = 1e-6) + # Default should match explicit score.min = -Inf (no filter) + expect_equal(result$edist_below_default[1], result$edist_below_no_filter[1], tolerance = 1e-6) # High filter should either be NA or >= default result if (!is.na(result$edist_below_highfilt[1]) && !is.na(result$edist_below_default[1])) { diff --git a/tests/testthat/test-pwm-n-mutations.R b/tests/testthat/test-pwm-n-mutations.R new file mode 100644 index 00000000..f66c2055 --- /dev/null +++ b/tests/testthat/test-pwm-n-mutations.R @@ -0,0 +1,808 @@ +create_isolated_test_db() + +# --- R reference implementation for pwm.n_mutations --- +# For a single motif-length window: count single-base substitutions that +# independently cross the score threshold. +# Returns 0 if threshold already satisfied, NA if no single edit suffices. +manual_pwm_n_mutations <- function(seq, pssm_mat, threshold, direction = "above") { + L <- nrow(pssm_mat) + bases <- c("A", "C", "G", "T") + seq_bases <- strsplit(seq, "")[[1]] + + if (length(seq_bases) < L) { + return(NA_real_) + } + + # Compute current score + score <- 0 + for (i in 1:L) { + b <- seq_bases[i] + bidx <- match(b, bases) + if (is.na(bidx)) { + # N base: use the score from the LUT (index 4 maps to min) + score <- score + min(log(pssm_mat[i, ])) + } else { + score <- score + log(pssm_mat[i, bidx]) + } + } + + # Check if threshold already satisfied + if (direction == "above" && score >= threshold) { + return(0) + } + if (direction == "below" && score <= threshold) { + return(0) + } + + # Compute deficit + deficit <- if (direction == "above") threshold - score else score - threshold + + # Count single-base substitutions that independently cross threshold + count <- 0 + for (i in 1:L) { + current_base <- seq_bases[i] + current_idx <- match(current_base, bases) + if (is.na(current_idx)) next # skip N bases + + current_log <- log(pssm_mat[i, current_idx]) + + for (alt_base in setdiff(bases, current_base)) { + alt_idx <- match(alt_base, bases) + alt_log <- log(pssm_mat[i, alt_idx]) + + delta <- if (direction == "above") { + alt_log - current_log + } else { + current_log - alt_log + } + + # Handle NaN from -Inf - (-Inf) or similar + if (is.na(delta) || !is.finite(delta)) next + if (delta >= deficit - 1e-12) count <- count + 1 + } + } + + if (count == 0) { + return(NA_real_) + } + return(count) +} + +# Scan across all windows in a sequence, returning max n_mutations +manual_pwm_n_mutations_scan <- function(seq, pssm_mat, threshold, direction = "above") { + L <- nrow(pssm_mat) + n <- nchar(seq) + if (n < L) { + return(NA_real_) + } + + best <- NA_real_ + for (start in 1:(n - L + 1)) { + window_seq <- substr(seq, start, start + L - 1) + cand <- manual_pwm_n_mutations(window_seq, pssm_mat, threshold, direction) + if (is.na(best) || (!is.na(cand) && cand > best)) { + best <- cand + } + } + best +} + +# Reverse complement helper +revcomp <- function(seq) { + comp <- c(A = "T", C = "G", G = "C", T = "A", N = "N") + bases <- strsplit(toupper(seq), "")[[1]] + paste0(rev(comp[bases]), collapse = "") +} + +# ============================================================================ +# Basic correctness tests +# ============================================================================ + +test_that("pwm.n_mutations returns 0 when threshold already satisfied (direction=above)", { + remove_all_vtracks() + + pssm <- create_test_pssm() # AC motif + + # Find position with "AC" pattern + full_seq <- toupper(gseq.extract(gintervals(1, 200, 300))) + ac_pos <- gregexpr("AC", full_seq)[[1]][1] + + if (ac_pos > 0) { + abs_pos <- 200 + ac_pos - 1 + test_interval <- gintervals(1, abs_pos, abs_pos + 2) + + # Threshold at 0 — an AC window scores log(1)+log(1) = 0, satisfying >= 0 + threshold <- 0.0 + gvtrack.create("nmut_perfect", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_perfect", test_interval, iterator = test_interval) + expect_equal(result$nmut_perfect[1], 0, tolerance = 1e-6) + } else { + skip("No AC pattern found in test region") + } +}) + +test_that("pwm.n_mutations returns correct count for 1-edit window (direction=above)", { + remove_all_vtracks() + + # Use a 3-position PSSM with clear preferences + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, # Strong A + 0.1, 0.8, 0.05, 0.05, # Strong C + 0.1, 0.05, 0.8, 0.05 # Strong G + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_interval <- gintervals(1, 200, 240) + seq <- toupper(gseq.extract(test_interval)) + + # Choose threshold that should allow single edits for some windows + threshold <- -3.0 + gvtrack.create("nmut_count", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_count", test_interval, iterator = test_interval) + + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_count[1])) + } else { + expect_equal(result$nmut_count[1], expected, tolerance = 1e-6) + } +}) + +test_that("pwm.n_mutations returns NA when no single edit suffices", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_intervals <- gintervals(1, 200, 240) + + # Impossibly high threshold — no single edit can reach it + threshold <- 100.0 + gvtrack.create("nmut_unreachable", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_unreachable", test_intervals, iterator = test_intervals) + expect_true(is.na(result$nmut_unreachable[1])) +}) + +test_that("pwm.n_mutations matches R reference on multiple intervals", { + remove_all_vtracks() + + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, # Strong A + 0.1, 0.8, 0.05, 0.05, # Strong C + 0.1, 0.05, 0.8, 0.05 # Strong G + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_intervals <- gintervals( + chrom = c(1, 1, 1), + start = c(200, 300, 400), + end = c(230, 330, 430) + ) + + threshold <- -3.5 + gvtrack.create("nmut_ref", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_ref", test_intervals, iterator = test_intervals) + + for (i in seq_len(nrow(test_intervals))) { + seq <- toupper(gseq.extract(test_intervals[i, ])) + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_ref[i]), + info = paste("Interval", i, "expected NA") + ) + } else { + expect_equal(result$nmut_ref[i], expected, + tolerance = 1e-6, + info = paste("Interval", i) + ) + } + } +}) + +test_that("pwm.n_mutations with 1bp iterator matches R reference per position", { + remove_all_vtracks() + + # Use a PSSM without zero entries to avoid -Inf complications + # (the C++ score table replaces log-zero with col_max for ABOVE direction) + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.05, 0.8, 0.1, 0.05, + 0.05, 0.05, 0.8, 0.1 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + motif_len <- nrow(pssm) + + test_interval <- gintervals(1, 200, 210) + threshold <- -3.0 + + gvtrack.create("nmut_1bp", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = TRUE, prior = 0 + ) + + result_1bp <- gextract("nmut_1bp", test_interval, iterator = 1) + + expect_true(nrow(result_1bp) > 0) + + for (idx in 1:min(5, nrow(result_1bp))) { + pos <- result_1bp$start[idx] + seq_window <- toupper(gseq.extract(gintervals(1, pos, pos + motif_len))) + expected <- manual_pwm_n_mutations(seq_window, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result_1bp$nmut_1bp[idx]), + info = paste("Position", pos) + ) + } else { + expect_equal(result_1bp$nmut_1bp[idx], expected, + tolerance = 1e-6, + info = paste("Position", pos) + ) + } + } +}) + +# ============================================================================ +# Direction tests +# ============================================================================ + +test_that("pwm.n_mutations direction=below returns 0 when score already below threshold", { + remove_all_vtracks() + + pssm <- create_test_pssm() # AC motif + + test_intervals <- gintervals(1, 200, 240) + + # Very high threshold — all windows score well below it + threshold <- 100.0 + gvtrack.create("nmut_below_already", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_below_already", test_intervals, iterator = test_intervals) + expect_equal(result$nmut_below_already[1], 0, tolerance = 1e-6) +}) + +test_that("pwm.n_mutations direction=below matches R reference", { + remove_all_vtracks() + + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.1, 0.8, 0.05, 0.05, + 0.1, 0.05, 0.8, 0.05 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_intervals <- gintervals( + chrom = c(1, 1, 1), + start = c(200, 300, 400), + end = c(230, 330, 430) + ) + + threshold <- -3.5 + gvtrack.create("nmut_below_ref", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_below_ref", test_intervals, iterator = test_intervals) + + for (i in seq_len(nrow(test_intervals))) { + seq <- toupper(gseq.extract(test_intervals[i, ])) + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "below") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_below_ref[i]), + info = paste("Interval", i, "expected NA") + ) + } else { + expect_equal(result$nmut_below_ref[i], expected, + tolerance = 1e-6, + info = paste("Interval", i) + ) + } + } +}) + +test_that("pwm.n_mutations direction=below returns NA when threshold unreachable", { + remove_all_vtracks() + + # Uniform PSSM: min score per position = log(0.25) ~ -1.386 + pssm <- matrix(c( + 0.25, 0.25, 0.25, 0.25, + 0.25, 0.25, 0.25, 0.25 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_intervals <- gintervals(1, 200, 240) + + # Threshold below the minimum possible score (-Inf can't be reached + # with a uniform PSSM since all entries are log(0.25)) + threshold <- -100.0 + gvtrack.create("nmut_below_impossible", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_below_impossible", test_intervals, iterator = test_intervals) + expect_true(is.na(result$nmut_below_impossible[1])) +}) + +# ============================================================================ +# Bidirectional tests +# ============================================================================ + +test_that("pwm.n_mutations bidirect=TRUE returns max across strands (direction=above)", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 260) + threshold <- -5.0 + + gvtrack.create("nmut_fwd", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, strand = 1, extend = FALSE, prior = 0 + ) + + gvtrack.create("nmut_rev", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, strand = -1, extend = FALSE, prior = 0 + ) + + gvtrack.create("nmut_bidi", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = TRUE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_fwd", "nmut_rev", "nmut_bidi"), + test_interval, + iterator = test_interval + ) + + fwd <- result$nmut_fwd[1] + rev <- result$nmut_rev[1] + bidi <- result$nmut_bidi[1] + + # For direction=above, bidirectional should take max across strands at each + # window position, since either strand crossing the threshold counts. + # The aggregation across windows is still max. + # So bidi >= max(fwd, rev) is expected (since both strands contribute). + if (!is.na(fwd) && !is.na(rev)) { + expect_true(bidi >= max(fwd, rev) - 1e-6, + info = paste("fwd=", fwd, "rev=", rev, "bidi=", bidi) + ) + } +}) + +test_that("pwm.n_mutations bidirect=TRUE with direction=below uses max across strands per position", { + remove_all_vtracks() + + # Use a non-degenerate PSSM so scores are finite and direction=below is meaningful + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.05, 0.8, 0.1, 0.05, + 0.05, 0.05, 0.8, 0.1 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + # PWM max scores in this region are around -3.4 to -9.0 + # Use threshold=-4.0 so some windows score above it (needing edits to go below) + test_interval <- gintervals(1, 200, 260) + threshold <- -4.0 + + gvtrack.create("nmut_below_fwd", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = 1, extend = TRUE, prior = 0 + ) + + gvtrack.create("nmut_below_rev", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, strand = -1, extend = TRUE, prior = 0 + ) + + gvtrack.create("nmut_below_bidi", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = TRUE, extend = TRUE, prior = 0 + ) + + result <- gextract( + c("nmut_below_fwd", "nmut_below_rev", "nmut_below_bidi"), + test_interval, + iterator = 1 + ) + + # Verify we have data to compare + has_both <- !is.na(result$nmut_below_fwd) & !is.na(result$nmut_below_rev) + expect_true(any(has_both), info = "Should have at least some positions with both strands non-NA") + + # For direction=below + bidirect, each position combines strands via max + # (a genomic substitution affects both strands) + for (i in which(has_both)) { + fwd <- result$nmut_below_fwd[i] + rev <- result$nmut_below_rev[i] + bidi <- result$nmut_below_bidi[i] + + # Per-position combine = max of strands + expect_true(bidi >= max(fwd, rev) - 1e-6, + info = paste("pos", i, "fwd=", fwd, "rev=", rev, "bidi=", bidi) + ) + } +}) + +# ============================================================================ +# Score filter tests +# ============================================================================ + +test_that("pwm.n_mutations score.min filters out low-scoring windows", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 240) + threshold <- -3.0 + + # No filter + gvtrack.create("nmut_nofilt", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # With score.min = 0 (very restrictive — only perfect-scoring windows pass) + gvtrack.create("nmut_highfilt", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = 0.0, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_nofilt", "nmut_highfilt"), + test_interval, + iterator = test_interval + ) + + # High filter should either be NA (all windows filtered) or <= the unfiltered + # count (since fewer windows contribute). But since n_mutations is MAX across + # qualifying windows, filtering can only reduce or NA-ify the result. + if (!is.na(result$nmut_highfilt[1]) && !is.na(result$nmut_nofilt[1])) { + expect_true(result$nmut_highfilt[1] <= result$nmut_nofilt[1] + 1e-6) + } +}) + +test_that("pwm.n_mutations score.max filters out high-scoring windows", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 240) + threshold <- -5.0 + + # No filter + gvtrack.create("nmut_nomax", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # With score.max = very low value (filters most windows) + gvtrack.create("nmut_maxfilt", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.max = -100.0, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_nomax", "nmut_maxfilt"), + test_interval, + iterator = test_interval + ) + + # With such a restrictive score.max, most or all windows are filtered out + # Result should be NA or at most equal to the unfiltered result + if (!is.na(result$nmut_maxfilt[1]) && !is.na(result$nmut_nomax[1])) { + expect_true(result$nmut_maxfilt[1] <= result$nmut_nomax[1] + 1e-6) + } +}) + +test_that("pwm.n_mutations direction=below score.min=-Inf matches default for below", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + test_interval <- gintervals(1, 200, 240) + threshold <- -5.0 + + # Default (no score.min) + gvtrack.create("nmut_below_default", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # Explicit score.min = -Inf + gvtrack.create("nmut_below_explicit", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + score.min = -Inf, + direction = "below", + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract(c("nmut_below_default", "nmut_below_explicit"), + test_interval, + iterator = test_interval + ) + + expect_equal(result$nmut_below_default[1], result$nmut_below_explicit[1], tolerance = 1e-6) +}) + +# ============================================================================ +# Edge case: N bases +# ============================================================================ + +test_that("pwm.n_mutations handles windows consistently with non-degenerate PSSM", { + remove_all_vtracks() + + # Use a PSSM without zero entries so R reference and C++ agree + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.05, 0.8, 0.1, 0.05 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + test_interval <- gintervals(1, 200, 240) + seq <- toupper(gseq.extract(test_interval)) + + threshold <- -3.0 + gvtrack.create("nmut_ntest", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_ntest", test_interval, iterator = test_interval) + expected <- manual_pwm_n_mutations_scan(seq, pssm, threshold, "above") + + if (is.na(expected)) { + expect_true(is.na(result$nmut_ntest[1])) + } else { + expect_equal(result$nmut_ntest[1], expected, tolerance = 1e-6) + } +}) + +# ============================================================================ +# Integration: gscreen filtering +# ============================================================================ + +test_that("pwm.n_mutations works in gscreen expression", { + remove_all_vtracks() + + pssm <- create_test_pssm() + threshold <- -5.0 + + gvtrack.create("nmut_screen", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + # Find intervals where n_mutations is not NA + screened <- gscreen("!is.na(nmut_screen)", + gintervals(1, 0, 5000), + iterator = 20 + ) + + # gscreen returns NULL when no intervals match; otherwise a data frame + if (!is.null(screened)) { + expect_s3_class(screened, "data.frame") + expect_true(all(c("chrom", "start", "end") %in% colnames(screened))) + } + + # Also verify gextract works with the same expression + extracted <- gextract("nmut_screen", gintervals(1, 0, 1000), iterator = 20) + expect_s3_class(extracted, "data.frame") + expect_true("nmut_screen" %in% colnames(extracted)) +}) + +test_that("pwm.n_mutations returns 0 for windows where threshold is already met (gextract integration)", { + remove_all_vtracks() + + pssm <- create_test_pssm() # AC motif + motif_len <- nrow(pssm) + + # Use extend=TRUE with 1bp iterator to check individual windows + # Very low threshold that most windows should already satisfy + threshold <- -20.0 + gvtrack.create("nmut_easy", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = TRUE, prior = 0 + ) + + result <- gextract("nmut_easy", gintervals(1, 200, 210), iterator = 1) + expect_true(nrow(result) > 0) + + # With such a low threshold, windows with non-N bases should score above it + # and return 0 + non_na_rows <- !is.na(result$nmut_easy) + if (any(non_na_rows)) { + expect_true(all(result$nmut_easy[non_na_rows] == 0), + info = "All qualifying windows should return 0 for a very low threshold" + ) + } +}) + +# ============================================================================ +# Parameter validation +# ============================================================================ + +test_that("pwm.n_mutations requires pssm parameter", { + remove_all_vtracks() + + expect_error( + gvtrack.create("nmut_nopssm", NULL, + func = "pwm.n_mutations", + score.thresh = -5.0 + ), + "pssm" + ) +}) + +test_that("pwm.n_mutations requires score.thresh parameter", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + expect_error( + gvtrack.create("nmut_nothresh", NULL, + func = "pwm.n_mutations", + pssm = pssm + ), + "score.thresh" + ) +}) + +test_that("pwm.n_mutations rejects invalid direction", { + remove_all_vtracks() + + pssm <- create_test_pssm() + + expect_error( + gvtrack.create("nmut_baddir", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = -5.0, + direction = "sideways" + ), + "direction" + ) +}) + +# ============================================================================ +# Aggregation semantics: MAX across windows +# ============================================================================ + +test_that("pwm.n_mutations aggregates as MAX across qualifying windows", { + remove_all_vtracks() + + pssm <- matrix(c( + 0.8, 0.1, 0.05, 0.05, + 0.1, 0.8, 0.05, 0.05, + 0.1, 0.05, 0.8, 0.05 + ), ncol = 4, byrow = TRUE) + colnames(pssm) <- c("A", "C", "G", "T") + + motif_len <- nrow(pssm) + + # Use a wide interval so there are many windows + test_interval <- gintervals(1, 200, 260) + seq <- toupper(gseq.extract(test_interval)) + + threshold <- -3.5 + gvtrack.create("nmut_agg", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + result <- gextract("nmut_agg", test_interval, iterator = test_interval) + + # Compute per-window n_mutations manually and verify the MAX + n_windows <- nchar(seq) - motif_len + 1 + per_window <- numeric(0) + for (start in 1:n_windows) { + window_seq <- substr(seq, start, start + motif_len - 1) + cand <- manual_pwm_n_mutations(window_seq, pssm, threshold, "above") + per_window <- c(per_window, cand) + } + + # Max across all non-NA values + non_na <- per_window[!is.na(per_window)] + expected_max <- if (length(non_na) == 0) NA_real_ else max(non_na) + + if (is.na(expected_max)) { + expect_true(is.na(result$nmut_agg[1])) + } else { + expect_equal(result$nmut_agg[1], expected_max, tolerance = 1e-6) + } +}) + +# ============================================================================ +# Prior handling +# ============================================================================ + +test_that("pwm.n_mutations with prior=0 vs prior>0 gives different results", { + remove_all_vtracks() + + pssm <- create_test_pssm() # Has zeros in it + + test_interval <- gintervals(1, 200, 240) + threshold <- -3.0 + + gvtrack.create("nmut_prior0", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0 + ) + + gvtrack.create("nmut_prior01", NULL, + func = "pwm.n_mutations", + pssm = pssm, score.thresh = threshold, + bidirect = FALSE, extend = FALSE, prior = 0.01 + ) + + result <- gextract(c("nmut_prior0", "nmut_prior01"), + test_interval, + iterator = test_interval + ) + + # With prior=0 on a PSSM that has zero entries, scores can be -Inf + # vs prior>0 where scores are always finite. Results should generally differ. + # This is a consistency check — at least one should be non-NA. + expect_true(!is.na(result$nmut_prior0[1]) || !is.na(result$nmut_prior01[1])) +})