From f81d5bd267fe34d3eea3dfe253e05c38c4062895 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Mon, 30 Mar 2026 23:42:01 +0300 Subject: [PATCH 1/7] feat: parameterize Markov order k in gsynth functions (k=1..8, default 5) The gsynth system previously hardcoded a Markov order of 5 throughout. This adds a `k` parameter to `gsynth.train()` supporting orders 1-8, enabling users to trade off model complexity vs. context sensitivity. C++: StratifiedMarkovModel uses runtime k with dynamic vector storage instead of fixed NUM_5MERS=1024 arrays. Binary format v2 stores k in header; v1 files load as k=5 for backward compatibility. R: gsynth.train(k=5L) with validation, stored in model object. Propagated through save/load/sample. Old models without k default to 5. Tests: 7 new test blocks (52 expectations) covering k=1/3/5/7, validation, save/load round-trip, backward compat, and stratified models. --- R/synth.R | 111 ++++++++---- man/gsynth.sample.Rd | 4 +- man/gsynth.train.Rd | 38 ++-- src/GenomeSynthSample.cpp | 43 +++-- src/GenomeSynthTrain.cpp | 54 +++--- src/StratifiedMarkovModel.cpp | 194 ++++++++++++--------- src/StratifiedMarkovModel.h | 119 +++++++++---- src/misha-init.cpp | 8 +- tests/testthat/test-gsynth-k.R | 308 +++++++++++++++++++++++++++++++++ 9 files changed, 664 insertions(+), 215 deletions(-) create mode 100644 tests/testthat/test-gsynth-k.R diff --git a/R/synth.R b/R/synth.R index 7db33dfd..059a7104 100644 --- a/R/synth.R +++ b/R/synth.R @@ -309,13 +309,13 @@ gsynth.bin_map <- function(breaks, merge_ranges = NULL) { result } -#' Train a stratified Markov-5 model from genome sequences +#' Train a stratified Markov model from genome sequences #' -#' Computes a 5th-order Markov model optionally stratified by bins of one or more -#' track expressions (e.g., GC content and CG dinucleotide frequency). This model -#' can be used to generate synthetic genomes that preserve the k-mer statistics of -#' the original genome within each stratification bin. When called with no dimension -#' specifications, trains a single unstratified model. +#' Computes a Markov model of order \code{k} (default 5) optionally stratified by +#' bins of one or more track expressions (e.g., GC content and CG dinucleotide +#' frequency). This model can be used to generate synthetic genomes that preserve +#' the k-mer statistics of the original genome within each stratification bin. +#' When called with no dimension specifications, trains a single unstratified model. #' #' @param ... Zero or more dimension specifications. Each specification is a list #' containing: @@ -333,29 +333,35 @@ gsynth.bin_map <- function(breaks, merge_ranges = NULL) { #' track values are computed. #' @param pseudocount Pseudocount added to all k-mer counts to avoid zero probabilities. #' Default is 1. -#' @param min_obs Minimum number of observations (6-mers) required per bin. Bins with -#' fewer observations will be marked as NA (not learned) and a warning will be -#' issued. Default is 0 (no minimum). During sampling, NA bins will fall back +#' @param min_obs Minimum number of observations ((k+1)-mers) required per bin. Bins +#' with fewer observations will be marked as NA (not learned) and a warning will +#' be issued. Default is 0 (no minimum). During sampling, NA bins will fall back #' to uniform sampling unless merged via \code{bin_merge}. +#' @param k Integer Markov order (1--8). Default is 5, which models 6-mer +#' (context of length 5 plus the emitted base) transition probabilities. +#' Higher values capture longer-range sequence dependencies but require +#' exponentially more memory (\eqn{4^k} context states). #' #' @details #' \strong{Strand symmetry:} The training process counts both the forward strand -#' 6-mer and its reverse complement for each position, ensuring strand-symmetric +#' (k+1)-mer and its reverse complement for each position, ensuring strand-symmetric #' transition probabilities. This means the reported total_kmers is approximately #' double the number of genomic positions processed. #' -#' \strong{N bases:} Positions where the 6-mer contains any N (unknown) bases are -#' skipped during training and counted in \code{total_n}. The model only learns +#' \strong{N bases:} Positions where the (k+1)-mer contains any N (unknown) bases +#' are skipped during training and counted in \code{total_n}. The model only learns #' from valid A/C/G/T sequences. #' #' @return A \code{gsynth.model} object containing: #' \describe{ +#' \item{k}{Markov order used for training} +#' \item{num_kmers}{Number of context states (\eqn{4^k})} #' \item{n_dims}{Number of stratification dimensions} #' \item{dim_specs}{List of dimension specifications (expr, breaks, num_bins, bin_map)} #' \item{dim_sizes}{Vector of bin counts per dimension} #' \item{total_bins}{Total number of bins (product of dim_sizes)} -#' \item{total_kmers}{Total number of valid 6-mers counted} -#' \item{per_bin_kmers}{Number of 6-mers counted per bin} +#' \item{total_kmers}{Total number of valid (k+1)-mers counted} +#' \item{per_bin_kmers}{Number of (k+1)-mers counted per bin} #' \item{total_masked}{Number of positions skipped due to mask} #' \item{total_n}{Number of positions skipped due to N bases} #' \item{model_data}{Internal model data (counts and CDFs)} @@ -408,9 +414,17 @@ gsynth.train <- function(..., intervals = NULL, iterator = NULL, pseudocount = 1, - min_obs = 0) { + min_obs = 0, + k = 5L) { .gcheckroot() + # Validate k (Markov order) + k <- as.integer(k) + if (length(k) != 1L || is.na(k) || k < 1L || k > 8L) { + stop("k must be a single integer between 1 and 8", call. = FALSE) + } + num_kmers <- as.integer(4L^k) + # Capture all dimension specs from ... args <- list(...) @@ -587,10 +601,13 @@ gsynth.train <- function(..., bin_map_vec, mask, as.numeric(pseudocount), + as.integer(k), .misha_env() ) # Override C++ result with our multi-dimensional metadata + result$k <- k + result$num_kmers <- num_kmers result$n_dims <- n_dims result$dim_specs <- dim_specs result$dim_sizes <- dim_sizes @@ -642,13 +659,13 @@ gsynth.train <- function(..., if (n_dims == 0) { message(sprintf( - "Trained unstratified model: %s 6-mers (no stratification)", - format(result$total_kmers, big.mark = ",") + "Trained unstratified Markov-%d model: %s %d-mers (no stratification)", + k, format(result$total_kmers, big.mark = ","), k + 1L )) } else { message(sprintf( - "Trained model: %s 6-mers across %d bins (%d dimensions)", - format(result$total_kmers, big.mark = ","), + "Trained Markov-%d model: %s %d-mers across %d bins (%d dimensions)", + k, format(result$total_kmers, big.mark = ","), k + 1L, result$total_bins, result$n_dims )) @@ -671,8 +688,10 @@ gsynth.train <- function(..., #' #' @export print.gsynth.model <- function(x, ...) { - cat("Synthetic Genome Markov-5 Model\n") + k <- if (is.null(x$k)) 5L else x$k + cat(sprintf("Synthetic Genome Markov-%d Model\n", k)) cat("----------------------------\n") + cat(sprintf("Markov order (k): %d [context states: %d]\n", k, 4L^k)) if (!is.null(x$n_dims) && x$n_dims == 0) { # Zero-dimensional (unstratified) model @@ -760,6 +779,8 @@ gsynth.save <- function(model, file, compress = FALSE) { total_bins <- model$total_bins n_dims <- if (is.null(model$n_dims)) 0L else model$n_dims + k <- if (is.null(model$k)) 5L else model$k + num_kmers <- if (is.null(model$num_kmers)) as.integer(4L^k) else model$num_kmers # Build dim_specs for YAML yaml_dim_specs <- list() @@ -785,7 +806,7 @@ gsynth.save <- function(model, file, compress = FALSE) { metadata <- list( format = "gsynth_model", version = 1L, - markov_order = 5L, + markov_order = as.integer(if (is.null(model$k)) 5L else model$k), n_dims = as.integer(n_dims), dim_sizes = if (n_dims > 0) as.integer(model$dim_sizes) else list(), total_bins = as.integer(total_bins), @@ -799,26 +820,26 @@ gsynth.save <- function(model, file, compress = FALSE) { data = list( counts = list( dtype = "float64", - shape = list(as.integer(total_bins), 1024L, 4L), + shape = list(as.integer(total_bins), as.integer(num_kmers), 4L), order = "C", file = "counts.bin" ), cdf = list( dtype = "float64", - shape = list(as.integer(total_bins), 1024L, 4L), + shape = list(as.integer(total_bins), as.integer(num_kmers), 4L), order = "C", file = "cdf.bin" ) ) ) - # Helper to write binary data for a list of 1024x4 matrices in row-major order + # Helper to write binary data for a list of num_kmers x 4 matrices in row-major order .write_bin_matrices <- function(mat_list, filepath) { con <- file(filepath, "wb") on.exit(close(con)) for (i in seq_along(mat_list)) { mat <- mat_list[[i]] - # mat is 1024x4 column-major in R + # mat is num_kmers x 4 column-major in R # Writing t(mat) column-major = writing mat row-major (C order) writeBin(as.double(t(mat)), con, size = 8L, endian = "little") } @@ -960,6 +981,10 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { total_bins <- as.integer(metadata$total_bins) n_dims <- as.integer(metadata$n_dims) + # Read Markov order (default to 5 for backward compatibility) + k <- as.integer(if (is.null(metadata$markov_order)) 5L else metadata$markov_order) + num_kmers <- as.integer(4L^k) + # Read binary data counts_path <- file.path(dir_path, "counts.bin") cdf_path <- file.path(dir_path, "cdf.bin") @@ -968,7 +993,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { stop("Binary data files (counts.bin, cdf.bin) not found", call. = FALSE) } - expected_n <- total_bins * 1024L * 4L + expected_n <- total_bins * num_kmers * 4L counts_flat <- readBin(counts_path, "double", n = expected_n, size = 8L, endian = "little") cdf_flat <- readBin(cdf_path, "double", n = expected_n, size = 8L, endian = "little") @@ -980,20 +1005,20 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { ), call. = FALSE) } - # Reshape: row-major flat -> list of 1024x4 column-major R matrices - chunk <- 1024L * 4L + # Reshape: row-major flat -> list of num_kmers x 4 column-major R matrices + chunk <- num_kmers * 4L counts_list <- vector("list", total_bins) cdf_list <- vector("list", total_bins) for (i in seq_len(total_bins)) { offset <- (i - 1L) * chunk # Data was written as t(mat) in column-major = row-major - # Read as 4x1024 matrix (column-major), then transpose to 1024x4 + # Read as 4 x num_kmers matrix (column-major), then transpose to num_kmers x 4 counts_list[[i]] <- t(matrix(counts_flat[(offset + 1L):(offset + chunk)], - nrow = 4L, ncol = 1024L + nrow = 4L, ncol = num_kmers )) cdf_list[[i]] <- t(matrix(cdf_flat[(offset + 1L):(offset + chunk)], - nrow = 4L, ncol = 1024L + nrow = 4L, ncol = num_kmers )) } @@ -1029,6 +1054,8 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { # Build the model object pseudocount <- if (!is.null(metadata$pseudocount)) as.numeric(metadata$pseudocount) else 1.0 model <- list( + k = k, + num_kmers = num_kmers, num_bins = total_bins, breaks = NULL, total_kmers = as.numeric(metadata$total_kmers), @@ -1052,7 +1079,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { #' Sample a synthetic genome from a trained Markov model #' -#' Generates a synthetic genome by sampling from a trained stratified Markov-5 +#' Generates a synthetic genome by sampling from a trained stratified Markov #' model. The generated genome preserves the k-mer statistics of the original #' genome within each stratification bin. #' @@ -1097,7 +1124,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { #' #' @details #' \strong{N bases during sampling:} When the sampler needs to initialize the -#' first 5-mer context and encounters regions with only N bases, it falls back +#' first k-mer context and encounters regions with only N bases, it falls back #' to uniform random base selection until a valid context is established. #' Similarly, if a bin has no learned statistics (sparse bin with NA CDF), #' uniform sampling is used for that position. @@ -1163,6 +1190,10 @@ gsynth.sample <- function(model, stop("model must be a gsynth.model object", call. = FALSE) } + # Extract Markov order from model (default k=5 for backward compatibility) + k <- if (is.null(model$k)) 5L else model$k + num_kmers <- if (is.null(model$num_kmers)) as.integer(4L^k) else model$num_kmers + output_format <- match.arg(output_format) # Validate output_path requirement @@ -1322,8 +1353,8 @@ gsynth.sample <- function(model, # Replace NA CDFs with uniform distribution for sampling # Uniform CDF: [0.25, 0.5, 0.75, 1.0] uniform_cdf <- matrix( - rep(c(0.25, 0.5, 0.75, 1.0), each = 1024), - nrow = 1024, ncol = 4 + rep(c(0.25, 0.5, 0.75, 1.0), each = num_kmers), + nrow = num_kmers, ncol = 4 ) for (bin_idx in sparse_used) { cdf_list[[bin_idx]] <- uniform_cdf @@ -1363,6 +1394,7 @@ gsynth.sample <- function(model, output_path_str, output_format_int, n_samples, + as.integer(k), .misha_env() ) @@ -1590,9 +1622,13 @@ gsynth.random <- function(intervals = NULL, # Order is A=0, C=1, G=2, T=3 (same as C++ code) cdf_row <- cumsum(as.numeric(nuc_probs)) - # Create CDF matrix: same for all 1024 contexts (no context dependency) + # Use k=5 for context-independent random sampling (all rows identical) + random_k <- 5L + random_num_kmers <- as.integer(4L^random_k) + + # Create CDF matrix: same for all contexts (no context dependency) # Each row is the same CDF - pure random, context-independent sampling - cdf_matrix <- matrix(rep(cdf_row, 1024), nrow = 1024, ncol = 4, byrow = TRUE) + cdf_matrix <- matrix(rep(cdf_row, random_num_kmers), nrow = random_num_kmers, ncol = 4, byrow = TRUE) cdf_list <- list(cdf_matrix) # Output format: 0 = misha, 1 = fasta, 2 = vector @@ -1626,6 +1662,7 @@ gsynth.random <- function(intervals = NULL, output_path_str, output_format_int, n_samples, + as.integer(random_k), .misha_env() ) diff --git a/man/gsynth.sample.Rd b/man/gsynth.sample.Rd index a5ac2c03..4b1e91dc 100644 --- a/man/gsynth.sample.Rd +++ b/man/gsynth.sample.Rd @@ -68,13 +68,13 @@ When output_format is "misha" or "fasta", returns invisible NULL and returns a character vector of sequences (length = n_intervals * n_samples). } \description{ -Generates a synthetic genome by sampling from a trained stratified Markov-5 +Generates a synthetic genome by sampling from a trained stratified Markov model. The generated genome preserves the k-mer statistics of the original genome within each stratification bin. } \details{ \strong{N bases during sampling:} When the sampler needs to initialize the -first 5-mer context and encounters regions with only N bases, it falls back +first k-mer context and encounters regions with only N bases, it falls back to uniform random base selection until a valid context is established. Similarly, if a bin has no learned statistics (sparse bin with NA CDF), uniform sampling is used for that position. diff --git a/man/gsynth.train.Rd b/man/gsynth.train.Rd index c39cd6de..dfe5044b 100644 --- a/man/gsynth.train.Rd +++ b/man/gsynth.train.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/synth.R \name{gsynth.train} \alias{gsynth.train} -\title{Train a stratified Markov-5 model from genome sequences} +\title{Train a stratified Markov model from genome sequences} \usage{ gsynth.train( ..., @@ -10,7 +10,8 @@ gsynth.train( intervals = NULL, iterator = NULL, pseudocount = 1, - min_obs = 0 + min_obs = 0, + k = 5L ) } \arguments{ @@ -35,40 +36,47 @@ track values are computed.} \item{pseudocount}{Pseudocount added to all k-mer counts to avoid zero probabilities. Default is 1.} -\item{min_obs}{Minimum number of observations (6-mers) required per bin. Bins with -fewer observations will be marked as NA (not learned) and a warning will be -issued. Default is 0 (no minimum). During sampling, NA bins will fall back +\item{min_obs}{Minimum number of observations ((k+1)-mers) required per bin. Bins +with fewer observations will be marked as NA (not learned) and a warning will +be issued. Default is 0 (no minimum). During sampling, NA bins will fall back to uniform sampling unless merged via \code{bin_merge}.} + +\item{k}{Integer Markov order (1--8). Default is 5, which models 6-mer +(context of length 5 plus the emitted base) transition probabilities. +Higher values capture longer-range sequence dependencies but require +exponentially more memory (\eqn{4^k} context states).} } \value{ A \code{gsynth.model} object containing: \describe{ + \item{k}{Markov order used for training} + \item{num_kmers}{Number of context states (\eqn{4^k})} \item{n_dims}{Number of stratification dimensions} \item{dim_specs}{List of dimension specifications (expr, breaks, num_bins, bin_map)} \item{dim_sizes}{Vector of bin counts per dimension} \item{total_bins}{Total number of bins (product of dim_sizes)} - \item{total_kmers}{Total number of valid 6-mers counted} - \item{per_bin_kmers}{Number of 6-mers counted per bin} + \item{total_kmers}{Total number of valid (k+1)-mers counted} + \item{per_bin_kmers}{Number of (k+1)-mers counted per bin} \item{total_masked}{Number of positions skipped due to mask} \item{total_n}{Number of positions skipped due to N bases} \item{model_data}{Internal model data (counts and CDFs)} } } \description{ -Computes a 5th-order Markov model optionally stratified by bins of one or more -track expressions (e.g., GC content and CG dinucleotide frequency). This model -can be used to generate synthetic genomes that preserve the k-mer statistics of -the original genome within each stratification bin. When called with no dimension -specifications, trains a single unstratified model. +Computes a Markov model of order \code{k} (default 5) optionally stratified by +bins of one or more track expressions (e.g., GC content and CG dinucleotide +frequency). This model can be used to generate synthetic genomes that preserve +the k-mer statistics of the original genome within each stratification bin. +When called with no dimension specifications, trains a single unstratified model. } \details{ \strong{Strand symmetry:} The training process counts both the forward strand -6-mer and its reverse complement for each position, ensuring strand-symmetric +(k+1)-mer and its reverse complement for each position, ensuring strand-symmetric transition probabilities. This means the reported total_kmers is approximately double the number of genomic positions processed. -\strong{N bases:} Positions where the 6-mer contains any N (unknown) bases are -skipped during training and counted in \code{total_n}. The model only learns +\strong{N bases:} Positions where the (k+1)-mer contains any N (unknown) bases +are skipped during training and counted in \code{total_n}. The model only learns from valid A/C/G/T sequences. } \examples{ diff --git a/src/GenomeSynthSample.cpp b/src/GenomeSynthSample.cpp index 983f4823..680b8559 100644 --- a/src/GenomeSynthSample.cpp +++ b/src/GenomeSynthSample.cpp @@ -2,7 +2,7 @@ * GenomeSynthSample.cpp * * C++ implementation for sampling a synthetic genome from a trained - * stratified Markov-5 model. + * stratified Markov-k model. */ #include @@ -71,7 +71,8 @@ extern "C" { SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices, SEXP _iter_starts, SEXP _iter_chroms, SEXP _intervals, SEXP _mask_copy, SEXP _output_path, - SEXP _output_format, SEXP _n_samples, SEXP _envir) { + SEXP _output_format, SEXP _n_samples, SEXP _k, + SEXP _envir) { try { struct RNGStateGuard { bool active = false; @@ -99,6 +100,14 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices, RNGStateGuard rng_guard; rng_guard.acquire(); + // Parse Markov order k (default 5 for backward compatibility) + int k = Rf_isNull(_k) ? 5 : INTEGER(_k)[0]; + if (k < 1 || k > StratifiedMarkovModel::MAX_K) { + verror("k must be between 1 and %d", StratifiedMarkovModel::MAX_K); + } + int num_kmers = 1; + for (int i = 0; i < k; ++i) num_kmers *= NUM_BASES; // 4^k + // Extract breaks int num_breaks = Rf_length(_breaks); int num_bins = num_breaks - 1; @@ -107,20 +116,16 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices, } double* breaks = REAL(_breaks); vector breaks_vec(breaks, breaks + num_breaks); - - // Build CDF lookup from R list - // cdf_list is a list of matrices, one per bin - // Each matrix is 1024 rows x 4 cols - vector>> cdf_data(num_bins); + vector> cdf_data(num_bins); for (int b = 0; b < num_bins; ++b) { SEXP cdf_mat = VECTOR_ELT(_cdf_list, b); double* cdf_ptr = REAL(cdf_mat); - cdf_data[b].resize(NUM_5MERS); - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { + cdf_data[b].resize(num_kmers * NUM_BASES); + for (int ctx = 0; ctx < num_kmers; ++ctx) { for (int base = 0; base < NUM_BASES; ++base) { // R matrices are column-major: [row + col * nrow] - cdf_data[b][ctx][base] = - static_cast(cdf_ptr[ctx + base * NUM_5MERS]); + cdf_data[b][ctx * NUM_BASES + base] = + static_cast(cdf_ptr[ctx + base * num_kmers]); } } } @@ -291,8 +296,8 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices, ++mask_cursor; } - // Initialize first 5 bases, honoring mask_copy semantics. - int64_t init_len = min(5, interval_len); + // Initialize first k bases, honoring mask_copy semantics. + int64_t init_len = min(k, interval_len); for (int64_t i = 0; i < init_len; ++i) { int64_t pos = interval_start + i; // Check if this position should be copied from original @@ -335,21 +340,21 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices, } } - // Get 5-mer context from already-sampled bases - int context_idx = StratifiedMarkovModel::encode_5mer( - &synth_seq[rel_pos - 5]); + // Get k-mer context from already-sampled bases + int context_idx = StratifiedMarkovModel::encode_kmer( + &synth_seq[rel_pos - k], k); int next_base; if (context_idx < 0 || bin_idx < 0 || bin_idx >= num_bins) { // Invalid context or bin - sample uniformly next_base = static_cast(unif_rand() * NUM_BASES); } else { - // Sample from CDF + // Sample from CDF (flat layout: ctx * NUM_BASES + base) float r = unif_rand(); - const auto& cdf = cdf_data[bin_idx][context_idx]; + int base_offset = context_idx * NUM_BASES; next_base = NUM_BASES - 1; for (int b = 0; b < NUM_BASES; ++b) { - if (r < cdf[b]) { + if (r < cdf_data[bin_idx][base_offset + b]) { next_base = b; break; } diff --git a/src/GenomeSynthTrain.cpp b/src/GenomeSynthTrain.cpp index 39a96d62..4dfa99b0 100644 --- a/src/GenomeSynthTrain.cpp +++ b/src/GenomeSynthTrain.cpp @@ -1,7 +1,7 @@ /* * GenomeSynthTrain.cpp * - * C++ implementation for training a stratified Markov-5 model + * C++ implementation for training a stratified Markov-k model * from genomic sequences. */ @@ -28,7 +28,7 @@ using namespace rdb; extern "C" { /** - * C_gsynth_train: Train a stratified Markov-5 model from genome sequences. + * C_gsynth_train: Train a stratified Markov-k model from genome sequences. * * @param _chrom_ids Integer vector of chromosome IDs to process * @param _chrom_starts Integer vector of start positions for each chromosome @@ -48,11 +48,18 @@ extern "C" { SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, SEXP _bin_indices, SEXP _iter_starts, SEXP _iter_chroms, SEXP _breaks, SEXP _bin_map, SEXP _mask, - SEXP _pseudocount, SEXP _envir) { + SEXP _pseudocount, SEXP _k, SEXP _envir) { try { RdbInitializer rdb_init; IntervUtils iu(_envir); + // Parse Markov order k (default 5 for backward compatibility) + int k = Rf_isNull(_k) ? 5 : INTEGER(_k)[0]; + if (k < 1 || k > StratifiedMarkovModel::MAX_K) { + verror("k must be between 1 and %d", StratifiedMarkovModel::MAX_K); + } + int kmer_len = k + 1; // sliding window length: k context + 1 next base + // Extract chromosome ranges int num_chroms = Rf_length(_chrom_ids); if (num_chroms == 0) { @@ -117,7 +124,7 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, // Initialize the Markov model StratifiedMarkovModel model; - model.init(num_bins, breaks_vec); + model.init(num_bins, breaks_vec, k); int iter_size = 0; if (num_iter_positions > 0) { @@ -173,7 +180,7 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, vector seq; seqfetch.read_interval(chrom_interval, iu.get_chromkey(), seq); - if (seq.size() < 6) continue; // Need at least 6bp for a 6-mer + if ((int)seq.size() < kmer_len) continue; // Need at least (k+1) bp // Get mask intervals for this chromosome const vector& mask_ivs = mask_per_chrom[chromid]; @@ -182,9 +189,9 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, const vector>& bins = chrom_bins[chromid]; size_t bin_cursor = 0; - // Scan with 6-mer sliding window + // Scan with (k+1)-mer sliding window int64_t seq_size = static_cast(seq.size()); - for (int64_t pos = 0; pos <= seq_size - 6; ++pos) { + for (int64_t pos = 0; pos <= seq_size - kmer_len; ++pos) { int64_t genome_pos = start + pos; // Check if masked @@ -193,9 +200,9 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, continue; } - // Check for N's in the 6-mer + // Check for N's in the (k+1)-mer bool has_n = false; - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < kmer_len; ++i) { char base = seq[pos + i]; if (StratifiedMarkovModel::encode_base(base) < 0) { has_n = true; @@ -224,10 +231,10 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, continue; // Position not covered by iterator } - // Encode 5-mer context and next base - int context_idx = StratifiedMarkovModel::encode_5mer(&seq[pos]); + // Encode k-mer context and next base + int context_idx = StratifiedMarkovModel::encode_kmer(&seq[pos], k); int next_base_idx = - StratifiedMarkovModel::encode_base(seq[pos + 5]); + StratifiedMarkovModel::encode_base(seq[pos + k]); if (context_idx >= 0 && next_base_idx >= 0) { // Add forward strand count @@ -236,8 +243,8 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, // Add reverse complement count for strand symmetry // This ensures the model learns symmetric transition probabilities int revcomp_context_idx, revcomp_next_idx; - StratifiedMarkovModel::revcomp_6mer( - context_idx, next_base_idx, + StratifiedMarkovModel::revcomp_kmer( + context_idx, next_base_idx, k, revcomp_context_idx, revcomp_next_idx); model.increment_count(bin_idx, revcomp_context_idx, revcomp_next_idx); @@ -299,31 +306,32 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends, // We'll serialize it to a temporary buffer // For now, we store counts and CDFs as nested lists - // counts: list of matrices (num_bins x (1024 * 4)) + // counts: list of matrices (num_bins x (num_kmers * 4)) SEXP r_counts, r_cdf; rprotect(r_counts = Rf_allocVector(VECSXP, num_bins)); rprotect(r_cdf = Rf_allocVector(VECSXP, num_bins)); const auto& model_counts = model.get_counts(); const auto& model_cdf = model.get_cdf(); + int num_kmers = model.get_num_kmers(); for (int b = 0; b < num_bins; ++b) { - // Counts matrix: 1024 rows x 4 cols (column-major for R) + // Counts matrix: num_kmers rows x 4 cols (column-major for R) SEXP count_mat; - rprotect(count_mat = Rf_allocMatrix(REALSXP, NUM_5MERS, NUM_BASES)); + rprotect(count_mat = Rf_allocMatrix(REALSXP, num_kmers, NUM_BASES)); double* count_data = REAL(count_mat); SEXP cdf_mat; - rprotect(cdf_mat = Rf_allocMatrix(REALSXP, NUM_5MERS, NUM_BASES)); + rprotect(cdf_mat = Rf_allocMatrix(REALSXP, num_kmers, NUM_BASES)); double* cdf_data = REAL(cdf_mat); - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { + for (int ctx = 0; ctx < num_kmers; ++ctx) { for (int base = 0; base < NUM_BASES; ++base) { // R matrices are column-major: [row + col * nrow] - count_data[ctx + base * NUM_5MERS] = - static_cast(model_counts[b][ctx][base]); - cdf_data[ctx + base * NUM_5MERS] = - static_cast(model_cdf[b][ctx][base]); + count_data[ctx + base * num_kmers] = + static_cast(model_counts[b][ctx * NUM_BASES + base]); + cdf_data[ctx + base * num_kmers] = + static_cast(model_cdf[b][ctx * NUM_BASES + base]); } } diff --git a/src/StratifiedMarkovModel.cpp b/src/StratifiedMarkovModel.cpp index eaa9b2c5..a078014f 100644 --- a/src/StratifiedMarkovModel.cpp +++ b/src/StratifiedMarkovModel.cpp @@ -1,7 +1,7 @@ /* * StratifiedMarkovModel.cpp * - * Implementation of the stratified Markov-5 model. + * Implementation of the stratified Markov model with configurable order k. */ #include "StratifiedMarkovModel.h" @@ -18,48 +18,57 @@ // in ofs.write). Harmless but redundant in C++17. constexpr uint32_t StratifiedMarkovModel::FILE_MAGIC; constexpr uint32_t StratifiedMarkovModel::FILE_VERSION; +constexpr uint32_t StratifiedMarkovModel::FILE_VERSION_1; +constexpr int StratifiedMarkovModel::MAX_K; +constexpr int StratifiedMarkovModel::MAX_KMERS; StratifiedMarkovModel::StratifiedMarkovModel() - : m_num_bins(0), m_total_kmers(0) {} + : m_k(0), m_num_kmers(0), m_num_bins(0), m_total_kmers(0) {} -void StratifiedMarkovModel::init(int num_bins, const std::vector& breaks) { +void StratifiedMarkovModel::init(int num_bins, const std::vector& breaks, int k) { if (num_bins <= 0) { throw std::invalid_argument("num_bins must be positive"); } if (breaks.size() != static_cast(num_bins + 1)) { throw std::invalid_argument("breaks must have num_bins + 1 elements"); } + if (k < 1 || k > MAX_K) { + throw std::invalid_argument("Markov order k must be in [1, 8]"); + } + m_k = k; + m_num_kmers = 1 << (2 * k); // 4^k m_num_bins = num_bins; m_breaks = breaks; - // Allocate count and CDF arrays + // Allocate count and CDF arrays as flat vectors m_counts.resize(num_bins); m_cdf.resize(num_bins); m_per_bin_kmers.resize(num_bins, 0); - reset_counts(); + for (int b = 0; b < num_bins; ++b) { + m_counts[b].assign(m_num_kmers * NUM_BASES, 0); + m_cdf[b].assign(m_num_kmers * NUM_BASES, 0.0f); + } + + m_total_kmers = 0; } void StratifiedMarkovModel::reset_counts() { m_total_kmers = 0; for (int b = 0; b < m_num_bins; ++b) { m_per_bin_kmers[b] = 0; - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - for (int base = 0; base < NUM_BASES; ++base) { - m_counts[b][ctx][base] = 0; - m_cdf[b][ctx][base] = 0.0f; - } - } + std::fill(m_counts[b].begin(), m_counts[b].end(), 0); + std::fill(m_cdf[b].begin(), m_cdf[b].end(), 0.0f); } } -void StratifiedMarkovModel::increment_count(int bin_idx, int context_5mer_idx, int next_base_idx) { +void StratifiedMarkovModel::increment_count(int bin_idx, int context_kmer_idx, int next_base_idx) { if (bin_idx < 0 || bin_idx >= m_num_bins) return; - if (context_5mer_idx < 0 || context_5mer_idx >= NUM_5MERS) return; + if (context_kmer_idx < 0 || context_kmer_idx >= m_num_kmers) return; if (next_base_idx < 0 || next_base_idx >= NUM_BASES) return; - m_counts[bin_idx][context_5mer_idx][next_base_idx]++; + m_counts[bin_idx][context_kmer_idx * NUM_BASES + next_base_idx]++; m_per_bin_kmers[bin_idx]++; m_total_kmers++; } @@ -71,16 +80,14 @@ void StratifiedMarkovModel::apply_bin_mapping(const std::vector& bin_map) { } // Create temporary storage for merged counts - std::vector, NUM_5MERS>> new_counts(m_num_bins); + std::vector> new_counts(m_num_bins); std::vector new_per_bin_kmers(m_num_bins, 0); + int flat_size = m_num_kmers * NUM_BASES; + // Initialize new counts to zero for (int b = 0; b < m_num_bins; ++b) { - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - for (int base = 0; base < NUM_BASES; ++base) { - new_counts[b][ctx][base] = 0; - } - } + new_counts[b].assign(flat_size, 0); } // Merge counts according to bin_map @@ -90,10 +97,8 @@ void StratifiedMarkovModel::apply_bin_mapping(const std::vector& bin_map) { tgt_bin = src_bin; // Keep original if invalid target } - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - for (int base = 0; base < NUM_BASES; ++base) { - new_counts[tgt_bin][ctx][base] += m_counts[src_bin][ctx][base]; - } + for (int i = 0; i < flat_size; ++i) { + new_counts[tgt_bin][i] += m_counts[src_bin][i]; } new_per_bin_kmers[tgt_bin] += m_per_bin_kmers[src_bin]; } @@ -105,22 +110,24 @@ void StratifiedMarkovModel::apply_bin_mapping(const std::vector& bin_map) { void StratifiedMarkovModel::normalize_and_build_cdf(double pseudocount) { for (int b = 0; b < m_num_bins; ++b) { - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { + for (int ctx = 0; ctx < m_num_kmers; ++ctx) { + int base_offset = ctx * NUM_BASES; + // Calculate total count for this context (with pseudocounts) double total = 0.0; for (int base = 0; base < NUM_BASES; ++base) { - total += static_cast(m_counts[b][ctx][base]) + pseudocount; + total += static_cast(m_counts[b][base_offset + base]) + pseudocount; } // Build CDF double cumsum = 0.0; for (int base = 0; base < NUM_BASES; ++base) { - double prob = (static_cast(m_counts[b][ctx][base]) + pseudocount) / total; + double prob = (static_cast(m_counts[b][base_offset + base]) + pseudocount) / total; cumsum += prob; - m_cdf[b][ctx][base] = static_cast(cumsum); + m_cdf[b][base_offset + base] = static_cast(cumsum); } // Ensure last element is exactly 1.0 (avoid floating point issues) - m_cdf[b][ctx][NUM_BASES - 1] = 1.0f; + m_cdf[b][base_offset + NUM_BASES - 1] = 1.0f; } } } @@ -149,19 +156,19 @@ int StratifiedMarkovModel::get_bin(double track_value) const { return bin; } -int StratifiedMarkovModel::sample_next_base(int bin_idx, int context_5mer_idx, float random_val) const { +int StratifiedMarkovModel::sample_next_base(int bin_idx, int context_kmer_idx, float random_val) const { if (bin_idx < 0 || bin_idx >= m_num_bins) { // Fall back to uniform sampling return static_cast(random_val * NUM_BASES); } - if (context_5mer_idx < 0 || context_5mer_idx >= NUM_5MERS) { + if (context_kmer_idx < 0 || context_kmer_idx >= m_num_kmers) { return static_cast(random_val * NUM_BASES); } // Use CDF for sampling - const auto& cdf = m_cdf[bin_idx][context_5mer_idx]; + int base_offset = context_kmer_idx * NUM_BASES; for (int base = 0; base < NUM_BASES; ++base) { - if (random_val < cdf[base]) { + if (random_val < m_cdf[bin_idx][base_offset + base]) { return base; } } @@ -184,9 +191,9 @@ char StratifiedMarkovModel::decode_base(int idx) { return bases[idx]; } -int StratifiedMarkovModel::encode_5mer(const char* seq) { +int StratifiedMarkovModel::encode_kmer(const char* seq, int k) { int idx = 0; - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < k; ++i) { int base = encode_base(seq[i]); if (base < 0) return -1; idx = (idx << 2) | base; @@ -194,8 +201,8 @@ int StratifiedMarkovModel::encode_5mer(const char* seq) { return idx; } -void StratifiedMarkovModel::decode_5mer(int idx, char* out) { - for (int i = 4; i >= 0; --i) { +void StratifiedMarkovModel::decode_kmer(int idx, char* out, int k) { + for (int i = k - 1; i >= 0; --i) { out[i] = decode_base(idx & 3); idx >>= 2; } @@ -206,35 +213,40 @@ int StratifiedMarkovModel::complement_base(int base_idx) { return 3 - base_idx; } -void StratifiedMarkovModel::revcomp_6mer(int context_5mer_idx, int next_base_idx, +void StratifiedMarkovModel::revcomp_kmer(int context_kmer_idx, int next_base_idx, int k, int& revcomp_context_idx, int& revcomp_next_idx) { - // Forward 6-mer: B0 B1 B2 B3 B4 B5 - // context = B0*256 + B1*64 + B2*16 + B3*4 + B4 - // next = B5 + // Forward (k+1)-mer: B0 B1 ... B(k-1) B(k) + // context = B0 * 4^(k-1) + B1 * 4^(k-2) + ... + B(k-1) + // next = B(k) // - // Reverse complement: comp(B5) comp(B4) comp(B3) comp(B2) comp(B1) comp(B0) - // revcomp_context = comp(B5)*256 + comp(B4)*64 + comp(B3)*16 + comp(B2)*4 + comp(B1) + // Reverse complement: comp(B(k)) comp(B(k-1)) ... comp(B1) comp(B0) + // revcomp_context = comp(B(k)) * 4^(k-1) + comp(B(k-1)) * 4^(k-2) + ... + comp(B1) // revcomp_next = comp(B0) - // Extract the 5 bases from context (B0 is most significant) - int b4 = context_5mer_idx & 3; - int b3 = (context_5mer_idx >> 2) & 3; - int b2 = (context_5mer_idx >> 4) & 3; - int b1 = (context_5mer_idx >> 6) & 3; - int b0 = (context_5mer_idx >> 8) & 3; - int b5 = next_base_idx; - - // Compute complements - int c0 = complement_base(b0); - int c1 = complement_base(b1); - int c2 = complement_base(b2); - int c3 = complement_base(b3); - int c4 = complement_base(b4); - int c5 = complement_base(b5); - - // Build reverse complement context: c5 c4 c3 c2 c1 - revcomp_context_idx = (c5 << 8) | (c4 << 6) | (c3 << 4) | (c2 << 2) | c1; - revcomp_next_idx = c0; + // Extract the k bases from context and build reverse complement + // B0 is most significant (leftmost) + // We need to extract each base, complement it, and rebuild in reverse order. + + // Extract bases from context_kmer_idx into an array + // bases[0] = B0 (most significant), bases[k-1] = B(k-1) (least significant) + int bases[MAX_K + 1]; + { + int tmp = context_kmer_idx; + for (int i = k - 1; i >= 0; --i) { + bases[i] = tmp & 3; + tmp >>= 2; + } + } + bases[k] = next_base_idx; + + // Build reverse complement: + // revcomp_context = comp(B(k)) comp(B(k-1)) ... comp(B1) + // revcomp_next = comp(B0) + revcomp_context_idx = 0; + for (int i = k; i >= 1; --i) { + revcomp_context_idx = (revcomp_context_idx << 2) | complement_base(bases[i]); + } + revcomp_next_idx = complement_base(bases[0]); } void StratifiedMarkovModel::save(const std::string& path) const { @@ -247,6 +259,10 @@ void StratifiedMarkovModel::save(const std::string& path) const { ofs.write(reinterpret_cast(&FILE_MAGIC), sizeof(FILE_MAGIC)); ofs.write(reinterpret_cast(&FILE_VERSION), sizeof(FILE_VERSION)); + // Write k and num_kmers (new in version 2) + ofs.write(reinterpret_cast(&m_k), sizeof(m_k)); + ofs.write(reinterpret_cast(&m_num_kmers), sizeof(m_num_kmers)); + // Write number of bins ofs.write(reinterpret_cast(&m_num_bins), sizeof(m_num_bins)); @@ -261,20 +277,16 @@ void StratifiedMarkovModel::save(const std::string& path) const { ofs.write(reinterpret_cast(&m_per_bin_kmers[i]), sizeof(m_per_bin_kmers[i])); } - // Write counts + // Write counts (flat: num_kmers * NUM_BASES uint64_t values per bin) for (int b = 0; b < m_num_bins; ++b) { - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - ofs.write(reinterpret_cast(m_counts[b][ctx].data()), - NUM_BASES * sizeof(uint64_t)); - } + ofs.write(reinterpret_cast(m_counts[b].data()), + m_num_kmers * NUM_BASES * sizeof(uint64_t)); } - // Write CDFs + // Write CDFs (flat: num_kmers * NUM_BASES float values per bin) for (int b = 0; b < m_num_bins; ++b) { - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - ofs.write(reinterpret_cast(m_cdf[b][ctx].data()), - NUM_BASES * sizeof(float)); - } + ofs.write(reinterpret_cast(m_cdf[b].data()), + m_num_kmers * NUM_BASES * sizeof(float)); } if (!ofs) { @@ -300,7 +312,24 @@ StratifiedMarkovModel StratifiedMarkovModel::load(const std::string& path) { // Read and verify version uint32_t version; ifs.read(reinterpret_cast(&version), sizeof(version)); - if (version != FILE_VERSION) { + + if (version == FILE_VERSION_1) { + // Legacy version 1: hardcoded k=5, num_kmers=1024 + model.m_k = 5; + model.m_num_kmers = 1024; + } else if (version == FILE_VERSION) { + // Version 2: read k and num_kmers from header + ifs.read(reinterpret_cast(&model.m_k), sizeof(model.m_k)); + ifs.read(reinterpret_cast(&model.m_num_kmers), sizeof(model.m_num_kmers)); + if (model.m_k < 1 || model.m_k > MAX_K) { + TGLError("Invalid Markov order k=%d in file: %s", model.m_k, path.c_str()); + } + int expected_num_kmers = 1 << (2 * model.m_k); + if (model.m_num_kmers != expected_num_kmers) { + TGLError("Inconsistent num_kmers (%d, expected %d for k=%d) in file: %s", + model.m_num_kmers, expected_num_kmers, model.m_k, path.c_str()); + } + } else { TGLError("Unsupported file version %u in: %s", version, path.c_str()); } @@ -320,22 +349,25 @@ StratifiedMarkovModel StratifiedMarkovModel::load(const std::string& path) { ifs.read(reinterpret_cast(&model.m_per_bin_kmers[i]), sizeof(uint64_t)); } + int num_kmers = model.m_num_kmers; + int flat_size = num_kmers * NUM_BASES; + // Read counts + // Both version 1 and 2 use the same contiguous layout: + // [ctx0_base0, ctx0_base1, ..., ctxN_base3] model.m_counts.resize(model.m_num_bins); for (int b = 0; b < model.m_num_bins; ++b) { - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - ifs.read(reinterpret_cast(model.m_counts[b][ctx].data()), - NUM_BASES * sizeof(uint64_t)); - } + model.m_counts[b].resize(flat_size); + ifs.read(reinterpret_cast(model.m_counts[b].data()), + flat_size * sizeof(uint64_t)); } // Read CDFs model.m_cdf.resize(model.m_num_bins); for (int b = 0; b < model.m_num_bins; ++b) { - for (int ctx = 0; ctx < NUM_5MERS; ++ctx) { - ifs.read(reinterpret_cast(model.m_cdf[b][ctx].data()), - NUM_BASES * sizeof(float)); - } + model.m_cdf[b].resize(flat_size); + ifs.read(reinterpret_cast(model.m_cdf[b].data()), + flat_size * sizeof(float)); } if (!ifs) { diff --git a/src/StratifiedMarkovModel.h b/src/StratifiedMarkovModel.h index 4c7cee28..77978d2e 100644 --- a/src/StratifiedMarkovModel.h +++ b/src/StratifiedMarkovModel.h @@ -1,9 +1,10 @@ /* * StratifiedMarkovModel.h * - * Stratified Markov-5 model for synthetic genome generation. - * Stores transition probabilities for 5-mer -> base transitions, + * Stratified Markov model for synthetic genome generation. + * Stores transition probabilities for k-mer -> base transitions, * stratified by bins of a track expression (e.g., GC content). + * Supports configurable Markov order k (1..8). */ #ifndef STRATIFIED_MARKOV_MODEL_H_ @@ -14,30 +15,39 @@ #include #include -// Number of possible 5-mers (4^5 = 1024) +// Kept for backward compatibility with external callers. +// Do NOT rely on NUM_5MERS inside StratifiedMarkovModel internals. constexpr int NUM_5MERS = 1024; // Number of bases (A, C, G, T) constexpr int NUM_BASES = 4; /** - * StratifiedMarkovModel: A 5th-order Markov model stratified by track values. + * StratifiedMarkovModel: A k-th order Markov model stratified by track values. * * For each bin (based on a track expression like GC content), stores: - * - Counts of 6-mers (5-mer context + next base) - * - Probability distributions P(next_base | 5-mer_context) + * - Counts of (k+1)-mers (k-mer context + next base) + * - Probability distributions P(next_base | k-mer_context) * - Cumulative distribution functions for sampling + * + * The Markov order k is configurable at init time (1..8, default 5). */ class StratifiedMarkovModel { public: + // Maximum supported Markov order + static constexpr int MAX_K = 8; + // Maximum number of k-mers (4^MAX_K = 65536) + static constexpr int MAX_KMERS = 65536; + StratifiedMarkovModel(); /** - * Initialize the model with the specified number of bins. + * Initialize the model with the specified number of bins and Markov order. * @param num_bins Number of stratification bins * @param breaks Bin boundaries (size = num_bins + 1) + * @param k Markov order (1..8, default 5 for backward compatibility) */ - void init(int num_bins, const std::vector& breaks); + void init(int num_bins, const std::vector& breaks, int k = 5); /** * Reset all counts to zero. @@ -45,12 +55,12 @@ class StratifiedMarkovModel { void reset_counts(); /** - * Increment the count for a 6-mer in the specified bin. + * Increment the count for a (k+1)-mer in the specified bin. * @param bin_idx Bin index (0 to num_bins-1) - * @param context_5mer_idx Encoded 5-mer context (0 to 1023) + * @param context_kmer_idx Encoded k-mer context (0 to m_num_kmers-1) * @param next_base_idx Next base index (0=A, 1=C, 2=G, 3=T) */ - void increment_count(int bin_idx, int context_5mer_idx, int next_base_idx); + void increment_count(int bin_idx, int context_kmer_idx, int next_base_idx); /** * Apply bin mapping: merge sparse bins into target bins. @@ -75,20 +85,31 @@ class StratifiedMarkovModel { int get_bin(double track_value) const; /** - * Sample the next base given a 5-mer context and bin. + * Sample the next base given a k-mer context and bin. * @param bin_idx Bin index - * @param context_5mer_idx Encoded 5-mer context + * @param context_kmer_idx Encoded k-mer context * @param random_val Random value in [0, 1) * @return Base index (0=A, 1=C, 2=G, 3=T) */ - int sample_next_base(int bin_idx, int context_5mer_idx, float random_val) const; + int sample_next_base(int bin_idx, int context_kmer_idx, float random_val) const; + + /** + * Encode a k-mer sequence to an index. + * @param seq Pointer to k characters (A/C/G/T) + * @param k Number of characters to encode + * @return Index (0 to 4^k - 1), or -1 if any character is not A/C/G/T + */ + static int encode_kmer(const char* seq, int k); /** * Encode a 5-mer sequence to an index (0-1023). + * Backward-compatible wrapper around encode_kmer with k=5. * @param seq Pointer to 5 characters (A/C/G/T) * @return Index (0-1023), or -1 if any character is not A/C/G/T */ - static int encode_5mer(const char* seq); + static inline int encode_5mer(const char* seq) { + return encode_kmer(seq, 5); + } /** * Encode a single base to an index. @@ -104,35 +125,60 @@ class StratifiedMarkovModel { */ static char decode_base(int idx); + /** + * Decode a k-mer index to a string. + * @param idx k-mer index (0 to 4^k - 1) + * @param out Output buffer (must have space for k characters) + * @param k Number of characters to decode + */ + static void decode_kmer(int idx, char* out, int k); + /** * Decode a 5-mer index to a string. + * Backward-compatible wrapper around decode_kmer with k=5. * @param idx 5-mer index (0-1023) * @param out Output buffer (must have space for 5 characters) */ - static void decode_5mer(int idx, char* out); + static inline void decode_5mer(int idx, char* out) { + decode_kmer(idx, out, 5); + } /** * Get the complement of a base index. * @param base_idx Base index (0=A, 1=C, 2=G, 3=T) - * @return Complement base index (A↔T, C↔G) + * @return Complement base index (A<->T, C<->G) */ static int complement_base(int base_idx); /** - * Compute the reverse complement of a 6-mer given as context + next base. - * Given forward 6-mer B0B1B2B3B4B5 (context=B0-B4, next=B5), - * computes reverse complement: comp(B5)comp(B4)comp(B3)comp(B2)comp(B1)comp(B0) - * where the new context is comp(B5)-comp(B1) and new next is comp(B0). + * Compute the reverse complement of a (k+1)-mer given as context + next base. + * Given forward (k+1)-mer B0 B1 ... B(k-1) B(k) (context=B0..B(k-1), next=B(k)), + * computes reverse complement: + * comp(B(k)) comp(B(k-1)) ... comp(B1) comp(B0) + * where the new context is comp(B(k))..comp(B1) and new next is comp(B0). * - * @param context_5mer_idx Encoded 5-mer context (0 to 1023) + * @param context_kmer_idx Encoded k-mer context (0 to m_num_kmers-1) * @param next_base_idx Next base index (0=A, 1=C, 2=G, 3=T) - * @param revcomp_context_idx [out] Reverse complement 5-mer context index + * @param k Markov order (context length) + * @param revcomp_context_idx [out] Reverse complement k-mer context index * @param revcomp_next_idx [out] Reverse complement next base index */ - static void revcomp_6mer(int context_5mer_idx, int next_base_idx, + static void revcomp_kmer(int context_kmer_idx, int next_base_idx, int k, int& revcomp_context_idx, int& revcomp_next_idx); + /** + * Compute reverse complement of a 6-mer (k=5). + * Backward-compatible wrapper around revcomp_kmer with k=5. + */ + static inline void revcomp_6mer(int context_5mer_idx, int next_base_idx, + int& revcomp_context_idx, int& revcomp_next_idx) { + revcomp_kmer(context_5mer_idx, next_base_idx, 5, + revcomp_context_idx, revcomp_next_idx); + } + // Accessors + int get_k() const { return m_k; } + int get_num_kmers() const { return m_num_kmers; } int get_num_bins() const { return m_num_bins; } const std::vector& get_breaks() const { return m_breaks; } uint64_t get_total_kmers() const { return m_total_kmers; } @@ -140,17 +186,17 @@ class StratifiedMarkovModel { // Data accessors for R interface uint64_t get_count(int bin_idx, int context_idx, int base_idx) const { - return m_counts[bin_idx][context_idx][base_idx]; + return m_counts[bin_idx][context_idx * NUM_BASES + base_idx]; } float get_cdf(int bin_idx, int context_idx, int base_idx) const { - return m_cdf[bin_idx][context_idx][base_idx]; + return m_cdf[bin_idx][context_idx * NUM_BASES + base_idx]; } // Direct access for bulk data transfer - const std::vector, NUM_5MERS>>& get_counts() const { + const std::vector>& get_counts() const { return m_counts; } - const std::vector, NUM_5MERS>>& get_cdf() const { + const std::vector>& get_cdf() const { return m_cdf; } @@ -168,15 +214,17 @@ class StratifiedMarkovModel { static StratifiedMarkovModel load(const std::string& path); private: + int m_k; // Markov order (1..8) + int m_num_kmers; // = 4^k, computed at init time int m_num_bins; std::vector m_breaks; - // counts[bin][5mer_idx][base_idx] - std::vector, NUM_5MERS>> m_counts; + // Flat vectors: counts[bin][kmer_idx * NUM_BASES + base_idx] + std::vector> m_counts; - // Cumulative distribution functions for sampling: cdf[bin][5mer_idx][base_idx] - // cdf[bin][ctx][0] = P(A), cdf[bin][ctx][1] = P(A)+P(C), etc. - std::vector, NUM_5MERS>> m_cdf; + // Cumulative distribution functions for sampling: cdf[bin][kmer_idx * NUM_BASES + base_idx] + // For a given context ctx: cdf[bin][ctx*4+0] = P(A), cdf[bin][ctx*4+1] = P(A)+P(C), etc. + std::vector> m_cdf; // Statistics uint64_t m_total_kmers; @@ -184,7 +232,10 @@ class StratifiedMarkovModel { // Magic number for file format static constexpr uint32_t FILE_MAGIC = 0x4D4B5653; // "SVKM" in little-endian - static constexpr uint32_t FILE_VERSION = 1; + // Version 2: stores k and num_kmers in header + static constexpr uint32_t FILE_VERSION = 2; + // Version 1 for backward-compatible loading of old files + static constexpr uint32_t FILE_VERSION_1 = 1; }; #endif // STRATIFIED_MARKOV_MODEL_H_ diff --git a/src/misha-init.cpp b/src/misha-init.cpp index 1c92572b..c42971f2 100644 --- a/src/misha-init.cpp +++ b/src/misha-init.cpp @@ -104,8 +104,8 @@ extern "C" { extern SEXP C_gseq_pwm_multitask(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP C_gseq_pwm_edits(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP C_gseq_kmer(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); - extern SEXP C_gsynth_train(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); - extern SEXP C_gsynth_sample(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); + extern SEXP C_gsynth_train(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); + extern SEXP C_gsynth_sample(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP C_gsynth_replace_kmer(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP C_gseq_kmer_dist(SEXP, SEXP, SEXP, SEXP); } @@ -208,8 +208,8 @@ static const R_CallMethodDef CallEntries[] = { {"C_gseq_pwm_multitask", (DL_FUNC)&C_gseq_pwm_multitask, 17}, {"C_gseq_pwm_edits", (DL_FUNC)&C_gseq_pwm_edits, 14}, {"C_gseq_kmer", (DL_FUNC)&C_gseq_kmer, 9}, - {"C_gsynth_train", (DL_FUNC)&C_gsynth_train, 11}, - {"C_gsynth_sample", (DL_FUNC)&C_gsynth_sample, 11}, + {"C_gsynth_train", (DL_FUNC)&C_gsynth_train, 12}, + {"C_gsynth_sample", (DL_FUNC)&C_gsynth_sample, 12}, {"C_gsynth_replace_kmer", (DL_FUNC)&C_gsynth_replace_kmer, 6}, {"C_gseq_kmer_dist", (DL_FUNC)&C_gseq_kmer_dist, 4}, {NULL, NULL, 0} diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R new file mode 100644 index 00000000..83ed0df4 --- /dev/null +++ b/tests/testthat/test-gsynth-k.R @@ -0,0 +1,308 @@ +# Tests for variable Markov order k parameter in gsynth functions + +test_that("gsynth.train errors on invalid k values", { + gdb.init("/home/aviezerl/hg38") + + test_intervals <- gintervals(1, 0, 100000) + + # k = 0 should error (below minimum of 1) + expect_error( + gsynth.train(intervals = test_intervals, iterator = 200, k = 0), + "k must be a single integer between 1 and 8" + ) + + # k = 9 should error (above maximum of 8) + expect_error( + gsynth.train(intervals = test_intervals, iterator = 200, k = 9), + "k must be a single integer between 1 and 8" + ) + + # k = 3.5 is silently truncated to 3L by as.integer(), which is valid. + # Verify it does NOT error (the code uses as.integer(k) without strict + # integer checking, so non-integer numerics are accepted). + expect_error( + gsynth.train(intervals = test_intervals, iterator = 200, k = 3.5), + NA + ) + + # k = "abc" should error (as.integer("abc") produces NA) + expect_error( + suppressWarnings( + gsynth.train(intervals = test_intervals, iterator = 200, k = "abc") + ), + "k must be a single integer between 1 and 8" + ) +}) + +test_that("gsynth.train with k=3 basic training and sampling", { + gdb.init("/home/aviezerl/hg38") + + test_intervals <- gintervals(1, 0, 100000) + + # Train a 0D model with k=3 + model <- gsynth.train( + intervals = test_intervals, + iterator = 200, + k = 3 + ) + + expect_s3_class(model, "gsynth.model") + expect_equal(model$k, 3L) + expect_equal(model$num_kmers, 64L) # 4^3 + + # CDF matrices should be 64 x 4 + cdf_mat <- model$model_data$cdf[[1]] + expect_equal(dim(cdf_mat), c(64, 4)) + + # Sample and verify valid DNA + output_fasta <- tempfile(fileext = ".fa") + on.exit(unlink(output_fasta), add = TRUE) + + gsynth.sample( + model, + output_fasta, + output_format = "fasta", + intervals = test_intervals, + seed = 60427 + ) + + expect_true(file.exists(output_fasta)) + lines <- readLines(output_fasta) + seq_lines <- lines[!grepl("^>", lines)] + full_seq <- paste(seq_lines, collapse = "") + + # Should only contain A, C, G, T + expect_true(grepl("^[ACGT]+$", full_seq)) + + # Should have correct length (100kb) + expect_equal(nchar(full_seq), 100000) +}) + +test_that("gsynth.train with k=1 minimal model", { + gdb.init("/home/aviezerl/hg38") + + test_intervals <- gintervals(1, 0, 100000) + + model <- gsynth.train( + intervals = test_intervals, + iterator = 200, + k = 1 + ) + + expect_s3_class(model, "gsynth.model") + expect_equal(model$k, 1L) + expect_equal(model$num_kmers, 4L) # 4^1 + + # CDF matrices should be 4 x 4 + cdf_mat <- model$model_data$cdf[[1]] + expect_equal(dim(cdf_mat), c(4, 4)) + + # CDF should be valid + expect_true(all(cdf_mat >= 0)) + expect_true(all(cdf_mat <= 1)) + expect_true(all(abs(cdf_mat[, 4] - 1) < 1e-5)) + + # Sample and verify valid DNA + output_fasta <- tempfile(fileext = ".fa") + on.exit(unlink(output_fasta), add = TRUE) + + gsynth.sample( + model, + output_fasta, + output_format = "fasta", + intervals = gintervals(1, 0, 10000), + seed = 42 + ) + + lines <- readLines(output_fasta) + seq_lines <- lines[!grepl("^>", lines)] + full_seq <- paste(seq_lines, collapse = "") + expect_true(grepl("^[ACGT]+$", full_seq)) + expect_equal(nchar(full_seq), 10000) +}) + +test_that("gsynth.train with k=7 larger model", { + gdb.init("/home/aviezerl/hg38") + + test_intervals <- gintervals(1, 0, 100000) + + model <- gsynth.train( + intervals = test_intervals, + iterator = 200, + k = 7 + ) + + expect_s3_class(model, "gsynth.model") + expect_equal(model$k, 7L) + expect_equal(model$num_kmers, 16384L) # 4^7 + + # CDF matrices should be 16384 x 4 + cdf_mat <- model$model_data$cdf[[1]] + expect_equal(dim(cdf_mat), c(16384, 4)) + + # Sample and verify + output_fasta <- tempfile(fileext = ".fa") + on.exit(unlink(output_fasta), add = TRUE) + + gsynth.sample( + model, + output_fasta, + output_format = "fasta", + intervals = gintervals(1, 0, 10000), + seed = 99 + ) + + lines <- readLines(output_fasta) + seq_lines <- lines[!grepl("^>", lines)] + full_seq <- paste(seq_lines, collapse = "") + expect_true(grepl("^[ACGT]+$", full_seq)) + expect_equal(nchar(full_seq), 10000) +}) + +test_that("save/load round-trip with k != 5", { + gdb.init("/home/aviezerl/hg38") + + test_intervals <- gintervals(1, 0, 100000) + + model <- gsynth.train( + intervals = test_intervals, + iterator = 200, + k = 3 + ) + + # Save as .gsm directory + out_dir <- file.path(tempdir(), "test_gsm_k3") + on.exit(unlink(out_dir, recursive = TRUE), add = TRUE) + + gsynth.save(model, out_dir) + loaded <- gsynth.load(out_dir) + + expect_s3_class(loaded, "gsynth.model") + expect_equal(loaded$k, 3L) + expect_equal(loaded$num_kmers, 64L) + + # Verify CDF dimensions match + expect_equal(dim(loaded$model_data$cdf[[1]]), c(64, 4)) + + # Verify model data matches exactly + expect_equal(loaded$model_data$counts[[1]], model$model_data$counts[[1]], + tolerance = 1e-15 + ) + expect_equal(loaded$model_data$cdf[[1]], model$model_data$cdf[[1]], + tolerance = 1e-15 + ) + + # Verify sampling is identical with same seed. + # Loaded model has iterator=NULL (not stored in .gsm format), + # so restore it from the original model before sampling. + loaded$iterator <- model$iterator + + out1 <- tempfile(fileext = ".fa") + out2 <- tempfile(fileext = ".fa") + on.exit(unlink(c(out1, out2)), add = TRUE) + + sample_intervals <- gintervals(1, 0, 10000) + + gsynth.sample(model, out1, + output_format = "fasta", + intervals = sample_intervals, seed = 60427 + ) + gsynth.sample(loaded, out2, + output_format = "fasta", + intervals = sample_intervals, seed = 60427 + ) + + expect_identical(readLines(out1), readLines(out2)) +}) + +test_that("k=5 produces identical results to default", { + gdb.init("/home/aviezerl/hg38") + + test_intervals <- gintervals(1, 0, 100000) + + # Train with explicit k=5 + model_k5 <- gsynth.train( + intervals = test_intervals, + iterator = 200, + k = 5 + ) + + # Train with default (no k specified) + model_default <- gsynth.train( + intervals = test_intervals, + iterator = 200 + ) + + expect_equal(model_k5$k, 5L) + expect_equal(model_default$k, 5L) + expect_equal(model_k5$num_kmers, 1024L) + expect_equal(model_default$num_kmers, 1024L) + + # CDF matrices should be identical + expect_equal( + model_k5$model_data$cdf[[1]], + model_default$model_data$cdf[[1]], + tolerance = 1e-15 + ) + + # Counts should be identical + expect_equal( + model_k5$model_data$counts[[1]], + model_default$model_data$counts[[1]], + tolerance = 1e-15 + ) +}) + +test_that("k with 1D stratification", { + gdb.init("/home/aviezerl/hg38") + + for (vt in c("g_frac_k", "c_frac_k")) { + if (vt %in% gvtrack.ls()) gvtrack.rm(vt) + } + gvtrack.create("g_frac_k", NULL, "kmer.frac", kmer = "G") + gvtrack.create("c_frac_k", NULL, "kmer.frac", kmer = "C") + on.exit({ + if ("g_frac_k" %in% gvtrack.ls()) gvtrack.rm("g_frac_k") + if ("c_frac_k" %in% gvtrack.ls()) gvtrack.rm("c_frac_k") + }, add = TRUE) + + test_intervals <- gintervals(1, 0, 100000) + + # Train 1D model with k=3 + model <- gsynth.train( + list(expr = "g_frac_k + c_frac_k", breaks = seq(0, 1, 0.2)), # 5 bins + intervals = test_intervals, + iterator = 200, + k = 3 + ) + + expect_s3_class(model, "gsynth.model") + expect_equal(model$k, 3L) + expect_equal(model$num_kmers, 64L) + expect_equal(model$n_dims, 1) + expect_equal(model$dim_sizes, 5) + expect_equal(model$total_bins, 5) + + # All CDF matrices should be 64 x 4 + for (bin in seq_len(model$total_bins)) { + expect_equal(dim(model$model_data$cdf[[bin]]), c(64, 4)) + } + + # Sample and verify + output_fasta <- tempfile(fileext = ".fa") + on.exit(unlink(output_fasta), add = TRUE) + + gsynth.sample( + model, + output_fasta, + output_format = "fasta", + intervals = gintervals(1, 0, 10000), + seed = 60427 + ) + + lines <- readLines(output_fasta) + seq_lines <- lines[!grepl("^>", lines)] + full_seq <- paste(seq_lines, collapse = "") + expect_true(grepl("^[ACGT]+$", full_seq)) + expect_equal(nchar(full_seq), 10000) +}) From 188879232704805759b11691c2192dd5fbfa3bbd Mon Sep 17 00:00:00 2001 From: aviezerl Date: Mon, 30 Mar 2026 23:48:46 +0300 Subject: [PATCH 2/7] fix: avoid integer overflow when loading large high-order .gsm models total_bins * num_kmers * 4L overflows R's 32-bit integer for k=8 models with >= 8192 bins. Use double arithmetic for expected_n instead. --- R/synth.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/synth.R b/R/synth.R index 059a7104..d16c7bc7 100644 --- a/R/synth.R +++ b/R/synth.R @@ -993,7 +993,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) { stop("Binary data files (counts.bin, cdf.bin) not found", call. = FALSE) } - expected_n <- total_bins * num_kmers * 4L + expected_n <- as.double(total_bins) * as.double(num_kmers) * 4 counts_flat <- readBin(counts_path, "double", n = expected_n, size = 8L, endian = "little") cdf_flat <- readBin(cdf_path, "double", n = expected_n, size = 8L, endian = "little") From 7f4cd5e3b15c8312756a8ed32417c0450ceecb64 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Mon, 30 Mar 2026 23:57:52 +0300 Subject: [PATCH 3/7] fix: reject non-integer k values instead of silently truncating k=3.5 was silently converted to 3 via as.integer(). Now validate that k == as.integer(k) before coercion, erroring on non-integer numerics. --- R/synth.R | 4 ++-- tests/testthat/test-gsynth-k.R | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/R/synth.R b/R/synth.R index d16c7bc7..d81c5256 100644 --- a/R/synth.R +++ b/R/synth.R @@ -419,10 +419,10 @@ gsynth.train <- function(..., .gcheckroot() # Validate k (Markov order) - k <- as.integer(k) - if (length(k) != 1L || is.na(k) || k < 1L || k > 8L) { + if (!is.numeric(k) || length(k) != 1L || is.na(k) || k != as.integer(k) || k < 1L || k > 8L) { stop("k must be a single integer between 1 and 8", call. = FALSE) } + k <- as.integer(k) num_kmers <- as.integer(4L^k) # Capture all dimension specs from ... diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R index 83ed0df4..45f768da 100644 --- a/tests/testthat/test-gsynth-k.R +++ b/tests/testthat/test-gsynth-k.R @@ -17,12 +17,10 @@ test_that("gsynth.train errors on invalid k values", { "k must be a single integer between 1 and 8" ) - # k = 3.5 is silently truncated to 3L by as.integer(), which is valid. - # Verify it does NOT error (the code uses as.integer(k) without strict - # integer checking, so non-integer numerics are accepted). + # k = 3.5 should error (non-integer numeric) expect_error( gsynth.train(intervals = test_intervals, iterator = 200, k = 3.5), - NA + "k must be a single integer between 1 and 8" ) # k = "abc" should error (as.integer("abc") produces NA) From 86224f9fbee53b5a6fb55110783a30b376c5c321 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Tue, 31 Mar 2026 00:02:15 +0300 Subject: [PATCH 4/7] fix: bump .gsm metadata version to 2 for k != 5 models v1 implied fixed 1024x4 context layout. Models with k != 5 now emit version: 2 so external .gsm readers can distinguish variable-order files. k=5 models continue to emit version: 1 for backward compatibility. --- R/synth.R | 2 +- tests/testthat/test-gsynth-k.R | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/R/synth.R b/R/synth.R index d81c5256..973be337 100644 --- a/R/synth.R +++ b/R/synth.R @@ -805,7 +805,7 @@ gsynth.save <- function(model, file, compress = FALSE) { # Build metadata metadata <- list( format = "gsynth_model", - version = 1L, + version = if (is.null(model$k) || model$k == 5L) 1L else 2L, markov_order = as.integer(if (is.null(model$k)) 5L else model$k), n_dims = as.integer(n_dims), dim_sizes = if (n_dims > 0) as.integer(model$dim_sizes) else list(), diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R index 45f768da..38574332 100644 --- a/tests/testthat/test-gsynth-k.R +++ b/tests/testthat/test-gsynth-k.R @@ -173,6 +173,12 @@ test_that("save/load round-trip with k != 5", { on.exit(unlink(out_dir, recursive = TRUE), add = TRUE) gsynth.save(model, out_dir) + + # Verify .gsm metadata has version 2 for k != 5 + meta <- yaml::read_yaml(file.path(out_dir, "metadata.yaml")) + expect_equal(meta$version, 2L) + expect_equal(meta$markov_order, 3L) + loaded <- gsynth.load(out_dir) expect_s3_class(loaded, "gsynth.model") From 0e52f4b10046daf1f58c9b3736a5fcb5a1c6deab Mon Sep 17 00:00:00 2001 From: aviezerl Date: Mon, 30 Mar 2026 21:04:32 +0000 Subject: [PATCH 5/7] Style code (GHA) --- tests/testthat/test-gsynth-k.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R index 38574332..9979faff 100644 --- a/tests/testthat/test-gsynth-k.R +++ b/tests/testthat/test-gsynth-k.R @@ -265,10 +265,13 @@ test_that("k with 1D stratification", { } gvtrack.create("g_frac_k", NULL, "kmer.frac", kmer = "G") gvtrack.create("c_frac_k", NULL, "kmer.frac", kmer = "C") - on.exit({ - if ("g_frac_k" %in% gvtrack.ls()) gvtrack.rm("g_frac_k") - if ("c_frac_k" %in% gvtrack.ls()) gvtrack.rm("c_frac_k") - }, add = TRUE) + on.exit( + { + if ("g_frac_k" %in% gvtrack.ls()) gvtrack.rm("g_frac_k") + if ("c_frac_k" %in% gvtrack.ls()) gvtrack.rm("c_frac_k") + }, + add = TRUE + ) test_intervals <- gintervals(1, 0, 100000) From 2793ff559dd5b72654619c3c8c6e94139711c22b Mon Sep 17 00:00:00 2001 From: aviezerl Date: Tue, 31 Mar 2026 00:06:44 +0300 Subject: [PATCH 6/7] chore: bump version to 5.6.9 --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9704a3c..d5f3611d 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: misha Title: Toolkit for Analysis of Genomic Data -Version: 5.6.8 +Version: 5.6.9 Authors@R: c( person("Misha", "Hoichman", , "misha@hoichman.com", role = "aut"), person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index d917b1dd..f48ef0ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# misha 5.6.9 + +* Added `k` parameter to `gsynth.train()` to configure the Markov order (1-8, default 5). + # misha 5.6.8 * Added `direction` parameter ("above"/"below") to PWM edit distance functions for computing minimum edits to bring score below a threshold (motif disruption). From 34b475aae02dcec506f7f7468e6bcf889e60b6b5 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Wed, 1 Apr 2026 11:23:33 +0300 Subject: [PATCH 7/7] feat: raise max Markov order from k=8 to k=10 --- R/synth.R | 6 +++--- man/gsynth.train.Rd | 2 +- src/StratifiedMarkovModel.h | 6 +++--- tests/testthat/test-gsynth-k.R | 12 ++++++------ 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/R/synth.R b/R/synth.R index 973be337..85003235 100644 --- a/R/synth.R +++ b/R/synth.R @@ -337,7 +337,7 @@ gsynth.bin_map <- function(breaks, merge_ranges = NULL) { #' with fewer observations will be marked as NA (not learned) and a warning will #' be issued. Default is 0 (no minimum). During sampling, NA bins will fall back #' to uniform sampling unless merged via \code{bin_merge}. -#' @param k Integer Markov order (1--8). Default is 5, which models 6-mer +#' @param k Integer Markov order (1--10). Default is 5, which models 6-mer #' (context of length 5 plus the emitted base) transition probabilities. #' Higher values capture longer-range sequence dependencies but require #' exponentially more memory (\eqn{4^k} context states). @@ -419,8 +419,8 @@ gsynth.train <- function(..., .gcheckroot() # Validate k (Markov order) - if (!is.numeric(k) || length(k) != 1L || is.na(k) || k != as.integer(k) || k < 1L || k > 8L) { - stop("k must be a single integer between 1 and 8", call. = FALSE) + if (!is.numeric(k) || length(k) != 1L || is.na(k) || k != as.integer(k) || k < 1L || k > 10L) { + stop("k must be a single integer between 1 and 10", call. = FALSE) } k <- as.integer(k) num_kmers <- as.integer(4L^k) diff --git a/man/gsynth.train.Rd b/man/gsynth.train.Rd index dfe5044b..3492dbfb 100644 --- a/man/gsynth.train.Rd +++ b/man/gsynth.train.Rd @@ -41,7 +41,7 @@ with fewer observations will be marked as NA (not learned) and a warning will be issued. Default is 0 (no minimum). During sampling, NA bins will fall back to uniform sampling unless merged via \code{bin_merge}.} -\item{k}{Integer Markov order (1--8). Default is 5, which models 6-mer +\item{k}{Integer Markov order (1--10). Default is 5, which models 6-mer (context of length 5 plus the emitted base) transition probabilities. Higher values capture longer-range sequence dependencies but require exponentially more memory (\eqn{4^k} context states).} diff --git a/src/StratifiedMarkovModel.h b/src/StratifiedMarkovModel.h index 77978d2e..558f6f07 100644 --- a/src/StratifiedMarkovModel.h +++ b/src/StratifiedMarkovModel.h @@ -35,9 +35,9 @@ constexpr int NUM_BASES = 4; class StratifiedMarkovModel { public: // Maximum supported Markov order - static constexpr int MAX_K = 8; - // Maximum number of k-mers (4^MAX_K = 65536) - static constexpr int MAX_KMERS = 65536; + static constexpr int MAX_K = 10; + // Maximum number of k-mers (4^MAX_K = 1048576) + static constexpr int MAX_KMERS = 1048576; StratifiedMarkovModel(); diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R index 9979faff..10d5f846 100644 --- a/tests/testthat/test-gsynth-k.R +++ b/tests/testthat/test-gsynth-k.R @@ -8,19 +8,19 @@ test_that("gsynth.train errors on invalid k values", { # k = 0 should error (below minimum of 1) expect_error( gsynth.train(intervals = test_intervals, iterator = 200, k = 0), - "k must be a single integer between 1 and 8" + "k must be a single integer between 1 and 10" ) - # k = 9 should error (above maximum of 8) + # k = 11 should error (above maximum of 10) expect_error( - gsynth.train(intervals = test_intervals, iterator = 200, k = 9), - "k must be a single integer between 1 and 8" + gsynth.train(intervals = test_intervals, iterator = 200, k = 11), + "k must be a single integer between 1 and 10" ) # k = 3.5 should error (non-integer numeric) expect_error( gsynth.train(intervals = test_intervals, iterator = 200, k = 3.5), - "k must be a single integer between 1 and 8" + "k must be a single integer between 1 and 10" ) # k = "abc" should error (as.integer("abc") produces NA) @@ -28,7 +28,7 @@ test_that("gsynth.train errors on invalid k values", { suppressWarnings( gsynth.train(intervals = test_intervals, iterator = 200, k = "abc") ), - "k must be a single integer between 1 and 8" + "k must be a single integer between 1 and 10" ) })