From f81d5bd267fe34d3eea3dfe253e05c38c4062895 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Mon, 30 Mar 2026 23:42:01 +0300
Subject: [PATCH 1/7] feat: parameterize Markov order k in gsynth functions
 (k=1..8, default 5)

The gsynth system previously hardcoded a Markov order of 5 throughout.
This adds a `k` parameter to `gsynth.train()` supporting orders 1-8,
enabling users to trade off model complexity vs. context sensitivity.

C++: StratifiedMarkovModel uses runtime k with dynamic vector storage
instead of fixed NUM_5MERS=1024 arrays. Binary format v2 stores k in
header; v1 files load as k=5 for backward compatibility.

R: gsynth.train(k=5L) with validation, stored in model object. Propagated
through save/load/sample. Old models without k default to 5.

Tests: 7 new test blocks (52 expectations) covering k=1/3/5/7, validation,
save/load round-trip, backward compat, and stratified models.
---
 R/synth.R                      | 111 ++++++++----
 man/gsynth.sample.Rd           |   4 +-
 man/gsynth.train.Rd            |  38 ++--
 src/GenomeSynthSample.cpp      |  43 +++--
 src/GenomeSynthTrain.cpp       |  54 +++---
 src/StratifiedMarkovModel.cpp  | 194 ++++++++++++---------
 src/StratifiedMarkovModel.h    | 119 +++++++++----
 src/misha-init.cpp             |   8 +-
 tests/testthat/test-gsynth-k.R | 308 +++++++++++++++++++++++++++++++++
 9 files changed, 664 insertions(+), 215 deletions(-)
 create mode 100644 tests/testthat/test-gsynth-k.R

diff --git a/R/synth.R b/R/synth.R
index 7db33dfd..059a7104 100644
--- a/R/synth.R
+++ b/R/synth.R
@@ -309,13 +309,13 @@ gsynth.bin_map <- function(breaks, merge_ranges = NULL) {
     result
 }
 
-#' Train a stratified Markov-5 model from genome sequences
+#' Train a stratified Markov model from genome sequences
 #'
-#' Computes a 5th-order Markov model optionally stratified by bins of one or more
-#' track expressions (e.g., GC content and CG dinucleotide frequency). This model
-#' can be used to generate synthetic genomes that preserve the k-mer statistics of
-#' the original genome within each stratification bin. When called with no dimension
-#' specifications, trains a single unstratified model.
+#' Computes a Markov model of order \code{k} (default 5) optionally stratified by
+#' bins of one or more track expressions (e.g., GC content and CG dinucleotide
+#' frequency). This model can be used to generate synthetic genomes that preserve
+#' the k-mer statistics of the original genome within each stratification bin.
+#' When called with no dimension specifications, trains a single unstratified model.
 #'
 #' @param ... Zero or more dimension specifications. Each specification is a list
 #'        containing:
@@ -333,29 +333,35 @@ gsynth.bin_map <- function(breaks, merge_ranges = NULL) {
 #'        track values are computed.
 #' @param pseudocount Pseudocount added to all k-mer counts to avoid zero probabilities.
 #'        Default is 1.
-#' @param min_obs Minimum number of observations (6-mers) required per bin. Bins with
-#'        fewer observations will be marked as NA (not learned) and a warning will be
-#'        issued. Default is 0 (no minimum). During sampling, NA bins will fall back
+#' @param min_obs Minimum number of observations ((k+1)-mers) required per bin. Bins
+#'        with fewer observations will be marked as NA (not learned) and a warning will
+#'        be issued. Default is 0 (no minimum). During sampling, NA bins will fall back
 #'        to uniform sampling unless merged via \code{bin_merge}.
+#' @param k Integer Markov order (1--8). Default is 5, which models 6-mer
+#'        (context of length 5 plus the emitted base) transition probabilities.
+#'        Higher values capture longer-range sequence dependencies but require
+#'        exponentially more memory (\eqn{4^k} context states).
 #'
 #' @details
 #' \strong{Strand symmetry:} The training process counts both the forward strand
-#' 6-mer and its reverse complement for each position, ensuring strand-symmetric
+#' (k+1)-mer and its reverse complement for each position, ensuring strand-symmetric
 #' transition probabilities. This means the reported total_kmers is approximately
 #' double the number of genomic positions processed.
 #'
-#' \strong{N bases:} Positions where the 6-mer contains any N (unknown) bases are
-#' skipped during training and counted in \code{total_n}. The model only learns
+#' \strong{N bases:} Positions where the (k+1)-mer contains any N (unknown) bases
+#' are skipped during training and counted in \code{total_n}. The model only learns
 #' from valid A/C/G/T sequences.
 #'
 #' @return A \code{gsynth.model} object containing:
 #'   \describe{
+#'     \item{k}{Markov order used for training}
+#'     \item{num_kmers}{Number of context states (\eqn{4^k})}
 #'     \item{n_dims}{Number of stratification dimensions}
 #'     \item{dim_specs}{List of dimension specifications (expr, breaks, num_bins, bin_map)}
 #'     \item{dim_sizes}{Vector of bin counts per dimension}
 #'     \item{total_bins}{Total number of bins (product of dim_sizes)}
-#'     \item{total_kmers}{Total number of valid 6-mers counted}
-#'     \item{per_bin_kmers}{Number of 6-mers counted per bin}
+#'     \item{total_kmers}{Total number of valid (k+1)-mers counted}
+#'     \item{per_bin_kmers}{Number of (k+1)-mers counted per bin}
 #'     \item{total_masked}{Number of positions skipped due to mask}
 #'     \item{total_n}{Number of positions skipped due to N bases}
 #'     \item{model_data}{Internal model data (counts and CDFs)}
@@ -408,9 +414,17 @@ gsynth.train <- function(...,
                          intervals = NULL,
                          iterator = NULL,
                          pseudocount = 1,
-                         min_obs = 0) {
+                         min_obs = 0,
+                         k = 5L) {
     .gcheckroot()
 
+    # Validate k (Markov order)
+    k <- as.integer(k)
+    if (length(k) != 1L || is.na(k) || k < 1L || k > 8L) {
+        stop("k must be a single integer between 1 and 8", call. = FALSE)
+    }
+    num_kmers <- as.integer(4L^k)
+
     # Capture all dimension specs from ...
     args <- list(...)
 
@@ -587,10 +601,13 @@ gsynth.train <- function(...,
         bin_map_vec,
         mask,
         as.numeric(pseudocount),
+        as.integer(k),
         .misha_env()
     )
 
     # Override C++ result with our multi-dimensional metadata
+    result$k <- k
+    result$num_kmers <- num_kmers
     result$n_dims <- n_dims
     result$dim_specs <- dim_specs
     result$dim_sizes <- dim_sizes
@@ -642,13 +659,13 @@ gsynth.train <- function(...,
 
     if (n_dims == 0) {
         message(sprintf(
-            "Trained unstratified model: %s 6-mers (no stratification)",
-            format(result$total_kmers, big.mark = ",")
+            "Trained unstratified Markov-%d model: %s %d-mers (no stratification)",
+            k, format(result$total_kmers, big.mark = ","), k + 1L
         ))
     } else {
         message(sprintf(
-            "Trained model: %s 6-mers across %d bins (%d dimensions)",
-            format(result$total_kmers, big.mark = ","),
+            "Trained Markov-%d model: %s %d-mers across %d bins (%d dimensions)",
+            k, format(result$total_kmers, big.mark = ","), k + 1L,
             result$total_bins,
             result$n_dims
         ))
@@ -671,8 +688,10 @@ gsynth.train <- function(...,
 #'
 #' @export
 print.gsynth.model <- function(x, ...) {
-    cat("Synthetic Genome Markov-5 Model\n")
+    k <- if (is.null(x$k)) 5L else x$k
+    cat(sprintf("Synthetic Genome Markov-%d Model\n", k))
     cat("----------------------------\n")
+    cat(sprintf("Markov order (k): %d  [context states: %d]\n", k, 4L^k))
 
     if (!is.null(x$n_dims) && x$n_dims == 0) {
         # Zero-dimensional (unstratified) model
@@ -760,6 +779,8 @@ gsynth.save <- function(model, file, compress = FALSE) {
 
     total_bins <- model$total_bins
     n_dims <- if (is.null(model$n_dims)) 0L else model$n_dims
+    k <- if (is.null(model$k)) 5L else model$k
+    num_kmers <- if (is.null(model$num_kmers)) as.integer(4L^k) else model$num_kmers
 
     # Build dim_specs for YAML
     yaml_dim_specs <- list()
@@ -785,7 +806,7 @@ gsynth.save <- function(model, file, compress = FALSE) {
     metadata <- list(
         format = "gsynth_model",
         version = 1L,
-        markov_order = 5L,
+        markov_order = as.integer(if (is.null(model$k)) 5L else model$k),
         n_dims = as.integer(n_dims),
         dim_sizes = if (n_dims > 0) as.integer(model$dim_sizes) else list(),
         total_bins = as.integer(total_bins),
@@ -799,26 +820,26 @@ gsynth.save <- function(model, file, compress = FALSE) {
         data = list(
             counts = list(
                 dtype = "float64",
-                shape = list(as.integer(total_bins), 1024L, 4L),
+                shape = list(as.integer(total_bins), as.integer(num_kmers), 4L),
                 order = "C",
                 file = "counts.bin"
             ),
             cdf = list(
                 dtype = "float64",
-                shape = list(as.integer(total_bins), 1024L, 4L),
+                shape = list(as.integer(total_bins), as.integer(num_kmers), 4L),
                 order = "C",
                 file = "cdf.bin"
             )
         )
     )
 
-    # Helper to write binary data for a list of 1024x4 matrices in row-major order
+    # Helper to write binary data for a list of num_kmers x 4 matrices in row-major order
     .write_bin_matrices <- function(mat_list, filepath) {
         con <- file(filepath, "wb")
         on.exit(close(con))
         for (i in seq_along(mat_list)) {
             mat <- mat_list[[i]]
-            # mat is 1024x4 column-major in R
+            # mat is num_kmers x 4 column-major in R
             # Writing t(mat) column-major = writing mat row-major (C order)
             writeBin(as.double(t(mat)), con, size = 8L, endian = "little")
         }
@@ -960,6 +981,10 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
     total_bins <- as.integer(metadata$total_bins)
     n_dims <- as.integer(metadata$n_dims)
 
+    # Read Markov order (default to 5 for backward compatibility)
+    k <- as.integer(if (is.null(metadata$markov_order)) 5L else metadata$markov_order)
+    num_kmers <- as.integer(4L^k)
+
     # Read binary data
     counts_path <- file.path(dir_path, "counts.bin")
     cdf_path <- file.path(dir_path, "cdf.bin")
@@ -968,7 +993,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
         stop("Binary data files (counts.bin, cdf.bin) not found", call. = FALSE)
     }
 
-    expected_n <- total_bins * 1024L * 4L
+    expected_n <- total_bins * num_kmers * 4L
 
     counts_flat <- readBin(counts_path, "double", n = expected_n, size = 8L, endian = "little")
     cdf_flat <- readBin(cdf_path, "double", n = expected_n, size = 8L, endian = "little")
@@ -980,20 +1005,20 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
         ), call. = FALSE)
     }
 
-    # Reshape: row-major flat -> list of 1024x4 column-major R matrices
-    chunk <- 1024L * 4L
+    # Reshape: row-major flat -> list of num_kmers x 4 column-major R matrices
+    chunk <- num_kmers * 4L
     counts_list <- vector("list", total_bins)
     cdf_list <- vector("list", total_bins)
 
     for (i in seq_len(total_bins)) {
         offset <- (i - 1L) * chunk
         # Data was written as t(mat) in column-major = row-major
-        # Read as 4x1024 matrix (column-major), then transpose to 1024x4
+        # Read as 4 x num_kmers matrix (column-major), then transpose to num_kmers x 4
         counts_list[[i]] <- t(matrix(counts_flat[(offset + 1L):(offset + chunk)],
-            nrow = 4L, ncol = 1024L
+            nrow = 4L, ncol = num_kmers
         ))
         cdf_list[[i]] <- t(matrix(cdf_flat[(offset + 1L):(offset + chunk)],
-            nrow = 4L, ncol = 1024L
+            nrow = 4L, ncol = num_kmers
         ))
     }
 
@@ -1029,6 +1054,8 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
     # Build the model object
     pseudocount <- if (!is.null(metadata$pseudocount)) as.numeric(metadata$pseudocount) else 1.0
     model <- list(
+        k = k,
+        num_kmers = num_kmers,
         num_bins = total_bins,
         breaks = NULL,
         total_kmers = as.numeric(metadata$total_kmers),
@@ -1052,7 +1079,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
 
 #' Sample a synthetic genome from a trained Markov model
 #'
-#' Generates a synthetic genome by sampling from a trained stratified Markov-5
+#' Generates a synthetic genome by sampling from a trained stratified Markov
 #' model. The generated genome preserves the k-mer statistics of the original
 #' genome within each stratification bin.
 #'
@@ -1097,7 +1124,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
 #'
 #' @details
 #' \strong{N bases during sampling:} When the sampler needs to initialize the
-#' first 5-mer context and encounters regions with only N bases, it falls back
+#' first k-mer context and encounters regions with only N bases, it falls back
 #' to uniform random base selection until a valid context is established.
 #' Similarly, if a bin has no learned statistics (sparse bin with NA CDF),
 #' uniform sampling is used for that position.
@@ -1163,6 +1190,10 @@ gsynth.sample <- function(model,
         stop("model must be a gsynth.model object", call. = FALSE)
     }
 
+    # Extract Markov order from model (default k=5 for backward compatibility)
+    k <- if (is.null(model$k)) 5L else model$k
+    num_kmers <- if (is.null(model$num_kmers)) as.integer(4L^k) else model$num_kmers
+
     output_format <- match.arg(output_format)
 
     # Validate output_path requirement
@@ -1322,8 +1353,8 @@ gsynth.sample <- function(model,
             # Replace NA CDFs with uniform distribution for sampling
             # Uniform CDF: [0.25, 0.5, 0.75, 1.0]
             uniform_cdf <- matrix(
-                rep(c(0.25, 0.5, 0.75, 1.0), each = 1024),
-                nrow = 1024, ncol = 4
+                rep(c(0.25, 0.5, 0.75, 1.0), each = num_kmers),
+                nrow = num_kmers, ncol = 4
             )
             for (bin_idx in sparse_used) {
                 cdf_list[[bin_idx]] <- uniform_cdf
@@ -1363,6 +1394,7 @@ gsynth.sample <- function(model,
         output_path_str,
         output_format_int,
         n_samples,
+        as.integer(k),
         .misha_env()
     )
 
@@ -1590,9 +1622,13 @@ gsynth.random <- function(intervals = NULL,
     # Order is A=0, C=1, G=2, T=3 (same as C++ code)
     cdf_row <- cumsum(as.numeric(nuc_probs))
 
-    # Create CDF matrix: same for all 1024 contexts (no context dependency)
+    # Use k=5 for context-independent random sampling (all rows identical)
+    random_k <- 5L
+    random_num_kmers <- as.integer(4L^random_k)
+
+    # Create CDF matrix: same for all contexts (no context dependency)
     # Each row is the same CDF - pure random, context-independent sampling
-    cdf_matrix <- matrix(rep(cdf_row, 1024), nrow = 1024, ncol = 4, byrow = TRUE)
+    cdf_matrix <- matrix(rep(cdf_row, random_num_kmers), nrow = random_num_kmers, ncol = 4, byrow = TRUE)
     cdf_list <- list(cdf_matrix)
 
     # Output format: 0 = misha, 1 = fasta, 2 = vector
@@ -1626,6 +1662,7 @@ gsynth.random <- function(intervals = NULL,
         output_path_str,
         output_format_int,
         n_samples,
+        as.integer(random_k),
         .misha_env()
     )
 
diff --git a/man/gsynth.sample.Rd b/man/gsynth.sample.Rd
index a5ac2c03..4b1e91dc 100644
--- a/man/gsynth.sample.Rd
+++ b/man/gsynth.sample.Rd
@@ -68,13 +68,13 @@ When output_format is "misha" or "fasta", returns invisible NULL and
         returns a character vector of sequences (length = n_intervals * n_samples).
 }
 \description{
-Generates a synthetic genome by sampling from a trained stratified Markov-5
+Generates a synthetic genome by sampling from a trained stratified Markov
 model. The generated genome preserves the k-mer statistics of the original
 genome within each stratification bin.
 }
 \details{
 \strong{N bases during sampling:} When the sampler needs to initialize the
-first 5-mer context and encounters regions with only N bases, it falls back
+first k-mer context and encounters regions with only N bases, it falls back
 to uniform random base selection until a valid context is established.
 Similarly, if a bin has no learned statistics (sparse bin with NA CDF),
 uniform sampling is used for that position.
diff --git a/man/gsynth.train.Rd b/man/gsynth.train.Rd
index c39cd6de..dfe5044b 100644
--- a/man/gsynth.train.Rd
+++ b/man/gsynth.train.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/synth.R
 \name{gsynth.train}
 \alias{gsynth.train}
-\title{Train a stratified Markov-5 model from genome sequences}
+\title{Train a stratified Markov model from genome sequences}
 \usage{
 gsynth.train(
   ...,
@@ -10,7 +10,8 @@ gsynth.train(
   intervals = NULL,
   iterator = NULL,
   pseudocount = 1,
-  min_obs = 0
+  min_obs = 0,
+  k = 5L
 )
 }
 \arguments{
@@ -35,40 +36,47 @@ track values are computed.}
 \item{pseudocount}{Pseudocount added to all k-mer counts to avoid zero probabilities.
 Default is 1.}
 
-\item{min_obs}{Minimum number of observations (6-mers) required per bin. Bins with
-fewer observations will be marked as NA (not learned) and a warning will be
-issued. Default is 0 (no minimum). During sampling, NA bins will fall back
+\item{min_obs}{Minimum number of observations ((k+1)-mers) required per bin. Bins
+with fewer observations will be marked as NA (not learned) and a warning will
+be issued. Default is 0 (no minimum). During sampling, NA bins will fall back
 to uniform sampling unless merged via \code{bin_merge}.}
+
+\item{k}{Integer Markov order (1--8). Default is 5, which models 6-mer
+(context of length 5 plus the emitted base) transition probabilities.
+Higher values capture longer-range sequence dependencies but require
+exponentially more memory (\eqn{4^k} context states).}
 }
 \value{
 A \code{gsynth.model} object containing:
   \describe{
+    \item{k}{Markov order used for training}
+    \item{num_kmers}{Number of context states (\eqn{4^k})}
     \item{n_dims}{Number of stratification dimensions}
     \item{dim_specs}{List of dimension specifications (expr, breaks, num_bins, bin_map)}
     \item{dim_sizes}{Vector of bin counts per dimension}
     \item{total_bins}{Total number of bins (product of dim_sizes)}
-    \item{total_kmers}{Total number of valid 6-mers counted}
-    \item{per_bin_kmers}{Number of 6-mers counted per bin}
+    \item{total_kmers}{Total number of valid (k+1)-mers counted}
+    \item{per_bin_kmers}{Number of (k+1)-mers counted per bin}
     \item{total_masked}{Number of positions skipped due to mask}
     \item{total_n}{Number of positions skipped due to N bases}
     \item{model_data}{Internal model data (counts and CDFs)}
   }
 }
 \description{
-Computes a 5th-order Markov model optionally stratified by bins of one or more
-track expressions (e.g., GC content and CG dinucleotide frequency). This model
-can be used to generate synthetic genomes that preserve the k-mer statistics of
-the original genome within each stratification bin. When called with no dimension
-specifications, trains a single unstratified model.
+Computes a Markov model of order \code{k} (default 5) optionally stratified by
+bins of one or more track expressions (e.g., GC content and CG dinucleotide
+frequency). This model can be used to generate synthetic genomes that preserve
+the k-mer statistics of the original genome within each stratification bin.
+When called with no dimension specifications, trains a single unstratified model.
 }
 \details{
 \strong{Strand symmetry:} The training process counts both the forward strand
-6-mer and its reverse complement for each position, ensuring strand-symmetric
+(k+1)-mer and its reverse complement for each position, ensuring strand-symmetric
 transition probabilities. This means the reported total_kmers is approximately
 double the number of genomic positions processed.
 
-\strong{N bases:} Positions where the 6-mer contains any N (unknown) bases are
-skipped during training and counted in \code{total_n}. The model only learns
+\strong{N bases:} Positions where the (k+1)-mer contains any N (unknown) bases
+are skipped during training and counted in \code{total_n}. The model only learns
 from valid A/C/G/T sequences.
 }
 \examples{
diff --git a/src/GenomeSynthSample.cpp b/src/GenomeSynthSample.cpp
index 983f4823..680b8559 100644
--- a/src/GenomeSynthSample.cpp
+++ b/src/GenomeSynthSample.cpp
@@ -2,7 +2,7 @@
  * GenomeSynthSample.cpp
  *
  * C++ implementation for sampling a synthetic genome from a trained
- * stratified Markov-5 model.
+ * stratified Markov-k model.
  */
 
 #include <algorithm>
@@ -71,7 +71,8 @@ extern "C" {
 SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices,
                       SEXP _iter_starts, SEXP _iter_chroms, SEXP _intervals,
                       SEXP _mask_copy, SEXP _output_path,
-                      SEXP _output_format, SEXP _n_samples, SEXP _envir) {
+                      SEXP _output_format, SEXP _n_samples, SEXP _k,
+                      SEXP _envir) {
     try {
         struct RNGStateGuard {
             bool active = false;
@@ -99,6 +100,14 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices,
         RNGStateGuard rng_guard;
         rng_guard.acquire();
 
+        // Parse Markov order k (default 5 for backward compatibility)
+        int k = Rf_isNull(_k) ? 5 : INTEGER(_k)[0];
+        if (k < 1 || k > StratifiedMarkovModel::MAX_K) {
+            verror("k must be between 1 and %d", StratifiedMarkovModel::MAX_K);
+        }
+        int num_kmers = 1;
+        for (int i = 0; i < k; ++i) num_kmers *= NUM_BASES;  // 4^k
+
         // Extract breaks
         int num_breaks = Rf_length(_breaks);
         int num_bins = num_breaks - 1;
@@ -107,20 +116,16 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices,
         }
         double* breaks = REAL(_breaks);
         vector<double> breaks_vec(breaks, breaks + num_breaks);
-
-        // Build CDF lookup from R list
-        // cdf_list is a list of matrices, one per bin
-        // Each matrix is 1024 rows x 4 cols
-        vector<vector<array<float, NUM_BASES>>> cdf_data(num_bins);
+        vector<vector<float>> cdf_data(num_bins);
         for (int b = 0; b < num_bins; ++b) {
             SEXP cdf_mat = VECTOR_ELT(_cdf_list, b);
             double* cdf_ptr = REAL(cdf_mat);
-            cdf_data[b].resize(NUM_5MERS);
-            for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
+            cdf_data[b].resize(num_kmers * NUM_BASES);
+            for (int ctx = 0; ctx < num_kmers; ++ctx) {
                 for (int base = 0; base < NUM_BASES; ++base) {
                     // R matrices are column-major: [row + col * nrow]
-                    cdf_data[b][ctx][base] =
-                        static_cast<float>(cdf_ptr[ctx + base * NUM_5MERS]);
+                    cdf_data[b][ctx * NUM_BASES + base] =
+                        static_cast<float>(cdf_ptr[ctx + base * num_kmers]);
                 }
             }
         }
@@ -291,8 +296,8 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices,
                         ++mask_cursor;
                     }
 
-                    // Initialize first 5 bases, honoring mask_copy semantics.
-                    int64_t init_len = min<int64_t>(5, interval_len);
+                    // Initialize first k bases, honoring mask_copy semantics.
+                    int64_t init_len = min<int64_t>(k, interval_len);
                     for (int64_t i = 0; i < init_len; ++i) {
                         int64_t pos = interval_start + i;
                         // Check if this position should be copied from original
@@ -335,21 +340,21 @@ SEXP C_gsynth_sample(SEXP _cdf_list, SEXP _breaks, SEXP _bin_indices,
                             }
                         }
 
-                        // Get 5-mer context from already-sampled bases
-                        int context_idx = StratifiedMarkovModel::encode_5mer(
-                            &synth_seq[rel_pos - 5]);
+                        // Get k-mer context from already-sampled bases
+                        int context_idx = StratifiedMarkovModel::encode_kmer(
+                            &synth_seq[rel_pos - k], k);
 
                         int next_base;
                         if (context_idx < 0 || bin_idx < 0 || bin_idx >= num_bins) {
                             // Invalid context or bin - sample uniformly
                             next_base = static_cast<int>(unif_rand() * NUM_BASES);
                         } else {
-                            // Sample from CDF
+                            // Sample from CDF (flat layout: ctx * NUM_BASES + base)
                             float r = unif_rand();
-                            const auto& cdf = cdf_data[bin_idx][context_idx];
+                            int base_offset = context_idx * NUM_BASES;
                             next_base = NUM_BASES - 1;
                             for (int b = 0; b < NUM_BASES; ++b) {
-                                if (r < cdf[b]) {
+                                if (r < cdf_data[bin_idx][base_offset + b]) {
                                     next_base = b;
                                     break;
                                 }
diff --git a/src/GenomeSynthTrain.cpp b/src/GenomeSynthTrain.cpp
index 39a96d62..4dfa99b0 100644
--- a/src/GenomeSynthTrain.cpp
+++ b/src/GenomeSynthTrain.cpp
@@ -1,7 +1,7 @@
 /*
  * GenomeSynthTrain.cpp
  *
- * C++ implementation for training a stratified Markov-5 model
+ * C++ implementation for training a stratified Markov-k model
  * from genomic sequences.
  */
 
@@ -28,7 +28,7 @@ using namespace rdb;
 extern "C" {
 
 /**
- * C_gsynth_train: Train a stratified Markov-5 model from genome sequences.
+ * C_gsynth_train: Train a stratified Markov-k model from genome sequences.
  *
  * @param _chrom_ids Integer vector of chromosome IDs to process
  * @param _chrom_starts Integer vector of start positions for each chromosome
@@ -48,11 +48,18 @@ extern "C" {
 SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
                      SEXP _bin_indices, SEXP _iter_starts, SEXP _iter_chroms,
                      SEXP _breaks, SEXP _bin_map, SEXP _mask,
-                     SEXP _pseudocount, SEXP _envir) {
+                     SEXP _pseudocount, SEXP _k, SEXP _envir) {
     try {
         RdbInitializer rdb_init;
         IntervUtils iu(_envir);
 
+        // Parse Markov order k (default 5 for backward compatibility)
+        int k = Rf_isNull(_k) ? 5 : INTEGER(_k)[0];
+        if (k < 1 || k > StratifiedMarkovModel::MAX_K) {
+            verror("k must be between 1 and %d", StratifiedMarkovModel::MAX_K);
+        }
+        int kmer_len = k + 1;  // sliding window length: k context + 1 next base
+
         // Extract chromosome ranges
         int num_chroms = Rf_length(_chrom_ids);
         if (num_chroms == 0) {
@@ -117,7 +124,7 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
 
         // Initialize the Markov model
         StratifiedMarkovModel model;
-        model.init(num_bins, breaks_vec);
+        model.init(num_bins, breaks_vec, k);
 
         int iter_size = 0;
         if (num_iter_positions > 0) {
@@ -173,7 +180,7 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
             vector<char> seq;
             seqfetch.read_interval(chrom_interval, iu.get_chromkey(), seq);
 
-            if (seq.size() < 6) continue;  // Need at least 6bp for a 6-mer
+            if ((int)seq.size() < kmer_len) continue;  // Need at least (k+1) bp
 
             // Get mask intervals for this chromosome
             const vector<GInterval>& mask_ivs = mask_per_chrom[chromid];
@@ -182,9 +189,9 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
             const vector<pair<int64_t, int>>& bins = chrom_bins[chromid];
             size_t bin_cursor = 0;
 
-            // Scan with 6-mer sliding window
+            // Scan with (k+1)-mer sliding window
             int64_t seq_size = static_cast<int64_t>(seq.size());
-            for (int64_t pos = 0; pos <= seq_size - 6; ++pos) {
+            for (int64_t pos = 0; pos <= seq_size - kmer_len; ++pos) {
                 int64_t genome_pos = start + pos;
 
                 // Check if masked
@@ -193,9 +200,9 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
                     continue;
                 }
 
-                // Check for N's in the 6-mer
+                // Check for N's in the (k+1)-mer
                 bool has_n = false;
-                for (int i = 0; i < 6; ++i) {
+                for (int i = 0; i < kmer_len; ++i) {
                     char base = seq[pos + i];
                     if (StratifiedMarkovModel::encode_base(base) < 0) {
                         has_n = true;
@@ -224,10 +231,10 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
                     continue;  // Position not covered by iterator
                 }
 
-                // Encode 5-mer context and next base
-                int context_idx = StratifiedMarkovModel::encode_5mer(&seq[pos]);
+                // Encode k-mer context and next base
+                int context_idx = StratifiedMarkovModel::encode_kmer(&seq[pos], k);
                 int next_base_idx =
-                    StratifiedMarkovModel::encode_base(seq[pos + 5]);
+                    StratifiedMarkovModel::encode_base(seq[pos + k]);
 
                 if (context_idx >= 0 && next_base_idx >= 0) {
                     // Add forward strand count
@@ -236,8 +243,8 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
                     // Add reverse complement count for strand symmetry
                     // This ensures the model learns symmetric transition probabilities
                     int revcomp_context_idx, revcomp_next_idx;
-                    StratifiedMarkovModel::revcomp_6mer(
-                        context_idx, next_base_idx,
+                    StratifiedMarkovModel::revcomp_kmer(
+                        context_idx, next_base_idx, k,
                         revcomp_context_idx, revcomp_next_idx);
                     model.increment_count(bin_idx, revcomp_context_idx, revcomp_next_idx);
 
@@ -299,31 +306,32 @@ SEXP C_gsynth_train(SEXP _chrom_ids, SEXP _chrom_starts, SEXP _chrom_ends,
         // We'll serialize it to a temporary buffer
         // For now, we store counts and CDFs as nested lists
 
-        // counts: list of matrices (num_bins x (1024 * 4))
+        // counts: list of matrices (num_bins x (num_kmers * 4))
         SEXP r_counts, r_cdf;
         rprotect(r_counts = Rf_allocVector(VECSXP, num_bins));
         rprotect(r_cdf = Rf_allocVector(VECSXP, num_bins));
 
         const auto& model_counts = model.get_counts();
         const auto& model_cdf = model.get_cdf();
+        int num_kmers = model.get_num_kmers();
 
         for (int b = 0; b < num_bins; ++b) {
-            // Counts matrix: 1024 rows x 4 cols (column-major for R)
+            // Counts matrix: num_kmers rows x 4 cols (column-major for R)
             SEXP count_mat;
-            rprotect(count_mat = Rf_allocMatrix(REALSXP, NUM_5MERS, NUM_BASES));
+            rprotect(count_mat = Rf_allocMatrix(REALSXP, num_kmers, NUM_BASES));
             double* count_data = REAL(count_mat);
 
             SEXP cdf_mat;
-            rprotect(cdf_mat = Rf_allocMatrix(REALSXP, NUM_5MERS, NUM_BASES));
+            rprotect(cdf_mat = Rf_allocMatrix(REALSXP, num_kmers, NUM_BASES));
             double* cdf_data = REAL(cdf_mat);
 
-            for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
+            for (int ctx = 0; ctx < num_kmers; ++ctx) {
                 for (int base = 0; base < NUM_BASES; ++base) {
                     // R matrices are column-major: [row + col * nrow]
-                    count_data[ctx + base * NUM_5MERS] =
-                        static_cast<double>(model_counts[b][ctx][base]);
-                    cdf_data[ctx + base * NUM_5MERS] =
-                        static_cast<double>(model_cdf[b][ctx][base]);
+                    count_data[ctx + base * num_kmers] =
+                        static_cast<double>(model_counts[b][ctx * NUM_BASES + base]);
+                    cdf_data[ctx + base * num_kmers] =
+                        static_cast<double>(model_cdf[b][ctx * NUM_BASES + base]);
                 }
             }
 
diff --git a/src/StratifiedMarkovModel.cpp b/src/StratifiedMarkovModel.cpp
index eaa9b2c5..a078014f 100644
--- a/src/StratifiedMarkovModel.cpp
+++ b/src/StratifiedMarkovModel.cpp
@@ -1,7 +1,7 @@
 /*
  * StratifiedMarkovModel.cpp
  *
- * Implementation of the stratified Markov-5 model.
+ * Implementation of the stratified Markov model with configurable order k.
  */
 
 #include "StratifiedMarkovModel.h"
@@ -18,48 +18,57 @@
 // in ofs.write).  Harmless but redundant in C++17.
 constexpr uint32_t StratifiedMarkovModel::FILE_MAGIC;
 constexpr uint32_t StratifiedMarkovModel::FILE_VERSION;
+constexpr uint32_t StratifiedMarkovModel::FILE_VERSION_1;
+constexpr int StratifiedMarkovModel::MAX_K;
+constexpr int StratifiedMarkovModel::MAX_KMERS;
 
 StratifiedMarkovModel::StratifiedMarkovModel()
-    : m_num_bins(0), m_total_kmers(0) {}
+    : m_k(0), m_num_kmers(0), m_num_bins(0), m_total_kmers(0) {}
 
-void StratifiedMarkovModel::init(int num_bins, const std::vector<double>& breaks) {
+void StratifiedMarkovModel::init(int num_bins, const std::vector<double>& breaks, int k) {
     if (num_bins <= 0) {
         throw std::invalid_argument("num_bins must be positive");
     }
     if (breaks.size() != static_cast<size_t>(num_bins + 1)) {
         throw std::invalid_argument("breaks must have num_bins + 1 elements");
     }
+    if (k < 1 || k > MAX_K) {
+        throw std::invalid_argument("Markov order k must be in [1, 8]");
+    }
 
+    m_k = k;
+    m_num_kmers = 1 << (2 * k);  // 4^k
     m_num_bins = num_bins;
     m_breaks = breaks;
 
-    // Allocate count and CDF arrays
+    // Allocate count and CDF arrays as flat vectors
     m_counts.resize(num_bins);
     m_cdf.resize(num_bins);
     m_per_bin_kmers.resize(num_bins, 0);
 
-    reset_counts();
+    for (int b = 0; b < num_bins; ++b) {
+        m_counts[b].assign(m_num_kmers * NUM_BASES, 0);
+        m_cdf[b].assign(m_num_kmers * NUM_BASES, 0.0f);
+    }
+
+    m_total_kmers = 0;
 }
 
 void StratifiedMarkovModel::reset_counts() {
     m_total_kmers = 0;
     for (int b = 0; b < m_num_bins; ++b) {
         m_per_bin_kmers[b] = 0;
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            for (int base = 0; base < NUM_BASES; ++base) {
-                m_counts[b][ctx][base] = 0;
-                m_cdf[b][ctx][base] = 0.0f;
-            }
-        }
+        std::fill(m_counts[b].begin(), m_counts[b].end(), 0);
+        std::fill(m_cdf[b].begin(), m_cdf[b].end(), 0.0f);
     }
 }
 
-void StratifiedMarkovModel::increment_count(int bin_idx, int context_5mer_idx, int next_base_idx) {
+void StratifiedMarkovModel::increment_count(int bin_idx, int context_kmer_idx, int next_base_idx) {
     if (bin_idx < 0 || bin_idx >= m_num_bins) return;
-    if (context_5mer_idx < 0 || context_5mer_idx >= NUM_5MERS) return;
+    if (context_kmer_idx < 0 || context_kmer_idx >= m_num_kmers) return;
     if (next_base_idx < 0 || next_base_idx >= NUM_BASES) return;
 
-    m_counts[bin_idx][context_5mer_idx][next_base_idx]++;
+    m_counts[bin_idx][context_kmer_idx * NUM_BASES + next_base_idx]++;
     m_per_bin_kmers[bin_idx]++;
     m_total_kmers++;
 }
@@ -71,16 +80,14 @@ void StratifiedMarkovModel::apply_bin_mapping(const std::vector<int>& bin_map) {
     }
 
     // Create temporary storage for merged counts
-    std::vector<std::array<std::array<uint64_t, NUM_BASES>, NUM_5MERS>> new_counts(m_num_bins);
+    std::vector<std::vector<uint64_t>> new_counts(m_num_bins);
     std::vector<uint64_t> new_per_bin_kmers(m_num_bins, 0);
 
+    int flat_size = m_num_kmers * NUM_BASES;
+
     // Initialize new counts to zero
     for (int b = 0; b < m_num_bins; ++b) {
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            for (int base = 0; base < NUM_BASES; ++base) {
-                new_counts[b][ctx][base] = 0;
-            }
-        }
+        new_counts[b].assign(flat_size, 0);
     }
 
     // Merge counts according to bin_map
@@ -90,10 +97,8 @@ void StratifiedMarkovModel::apply_bin_mapping(const std::vector<int>& bin_map) {
             tgt_bin = src_bin; // Keep original if invalid target
         }
 
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            for (int base = 0; base < NUM_BASES; ++base) {
-                new_counts[tgt_bin][ctx][base] += m_counts[src_bin][ctx][base];
-            }
+        for (int i = 0; i < flat_size; ++i) {
+            new_counts[tgt_bin][i] += m_counts[src_bin][i];
         }
         new_per_bin_kmers[tgt_bin] += m_per_bin_kmers[src_bin];
     }
@@ -105,22 +110,24 @@ void StratifiedMarkovModel::apply_bin_mapping(const std::vector<int>& bin_map) {
 
 void StratifiedMarkovModel::normalize_and_build_cdf(double pseudocount) {
     for (int b = 0; b < m_num_bins; ++b) {
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
+        for (int ctx = 0; ctx < m_num_kmers; ++ctx) {
+            int base_offset = ctx * NUM_BASES;
+
             // Calculate total count for this context (with pseudocounts)
             double total = 0.0;
             for (int base = 0; base < NUM_BASES; ++base) {
-                total += static_cast<double>(m_counts[b][ctx][base]) + pseudocount;
+                total += static_cast<double>(m_counts[b][base_offset + base]) + pseudocount;
             }
 
             // Build CDF
             double cumsum = 0.0;
             for (int base = 0; base < NUM_BASES; ++base) {
-                double prob = (static_cast<double>(m_counts[b][ctx][base]) + pseudocount) / total;
+                double prob = (static_cast<double>(m_counts[b][base_offset + base]) + pseudocount) / total;
                 cumsum += prob;
-                m_cdf[b][ctx][base] = static_cast<float>(cumsum);
+                m_cdf[b][base_offset + base] = static_cast<float>(cumsum);
             }
             // Ensure last element is exactly 1.0 (avoid floating point issues)
-            m_cdf[b][ctx][NUM_BASES - 1] = 1.0f;
+            m_cdf[b][base_offset + NUM_BASES - 1] = 1.0f;
         }
     }
 }
@@ -149,19 +156,19 @@ int StratifiedMarkovModel::get_bin(double track_value) const {
     return bin;
 }
 
-int StratifiedMarkovModel::sample_next_base(int bin_idx, int context_5mer_idx, float random_val) const {
+int StratifiedMarkovModel::sample_next_base(int bin_idx, int context_kmer_idx, float random_val) const {
     if (bin_idx < 0 || bin_idx >= m_num_bins) {
         // Fall back to uniform sampling
         return static_cast<int>(random_val * NUM_BASES);
     }
-    if (context_5mer_idx < 0 || context_5mer_idx >= NUM_5MERS) {
+    if (context_kmer_idx < 0 || context_kmer_idx >= m_num_kmers) {
         return static_cast<int>(random_val * NUM_BASES);
     }
 
     // Use CDF for sampling
-    const auto& cdf = m_cdf[bin_idx][context_5mer_idx];
+    int base_offset = context_kmer_idx * NUM_BASES;
     for (int base = 0; base < NUM_BASES; ++base) {
-        if (random_val < cdf[base]) {
+        if (random_val < m_cdf[bin_idx][base_offset + base]) {
             return base;
         }
     }
@@ -184,9 +191,9 @@ char StratifiedMarkovModel::decode_base(int idx) {
     return bases[idx];
 }
 
-int StratifiedMarkovModel::encode_5mer(const char* seq) {
+int StratifiedMarkovModel::encode_kmer(const char* seq, int k) {
     int idx = 0;
-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < k; ++i) {
         int base = encode_base(seq[i]);
         if (base < 0) return -1;
         idx = (idx << 2) | base;
@@ -194,8 +201,8 @@ int StratifiedMarkovModel::encode_5mer(const char* seq) {
     return idx;
 }
 
-void StratifiedMarkovModel::decode_5mer(int idx, char* out) {
-    for (int i = 4; i >= 0; --i) {
+void StratifiedMarkovModel::decode_kmer(int idx, char* out, int k) {
+    for (int i = k - 1; i >= 0; --i) {
         out[i] = decode_base(idx & 3);
         idx >>= 2;
     }
@@ -206,35 +213,40 @@ int StratifiedMarkovModel::complement_base(int base_idx) {
     return 3 - base_idx;
 }
 
-void StratifiedMarkovModel::revcomp_6mer(int context_5mer_idx, int next_base_idx,
+void StratifiedMarkovModel::revcomp_kmer(int context_kmer_idx, int next_base_idx, int k,
                                           int& revcomp_context_idx, int& revcomp_next_idx) {
-    // Forward 6-mer: B0 B1 B2 B3 B4 B5
-    //   context = B0*256 + B1*64 + B2*16 + B3*4 + B4
-    //   next = B5
+    // Forward (k+1)-mer: B0 B1 ... B(k-1) B(k)
+    //   context = B0 * 4^(k-1) + B1 * 4^(k-2) + ... + B(k-1)
+    //   next = B(k)
     //
-    // Reverse complement: comp(B5) comp(B4) comp(B3) comp(B2) comp(B1) comp(B0)
-    //   revcomp_context = comp(B5)*256 + comp(B4)*64 + comp(B3)*16 + comp(B2)*4 + comp(B1)
+    // Reverse complement: comp(B(k)) comp(B(k-1)) ... comp(B1) comp(B0)
+    //   revcomp_context = comp(B(k)) * 4^(k-1) + comp(B(k-1)) * 4^(k-2) + ... + comp(B1)
     //   revcomp_next = comp(B0)
 
-    // Extract the 5 bases from context (B0 is most significant)
-    int b4 = context_5mer_idx & 3;
-    int b3 = (context_5mer_idx >> 2) & 3;
-    int b2 = (context_5mer_idx >> 4) & 3;
-    int b1 = (context_5mer_idx >> 6) & 3;
-    int b0 = (context_5mer_idx >> 8) & 3;
-    int b5 = next_base_idx;
-
-    // Compute complements
-    int c0 = complement_base(b0);
-    int c1 = complement_base(b1);
-    int c2 = complement_base(b2);
-    int c3 = complement_base(b3);
-    int c4 = complement_base(b4);
-    int c5 = complement_base(b5);
-
-    // Build reverse complement context: c5 c4 c3 c2 c1
-    revcomp_context_idx = (c5 << 8) | (c4 << 6) | (c3 << 4) | (c2 << 2) | c1;
-    revcomp_next_idx = c0;
+    // Extract the k bases from context and build reverse complement
+    // B0 is most significant (leftmost)
+    // We need to extract each base, complement it, and rebuild in reverse order.
+
+    // Extract bases from context_kmer_idx into an array
+    // bases[0] = B0 (most significant), bases[k-1] = B(k-1) (least significant)
+    int bases[MAX_K + 1];
+    {
+        int tmp = context_kmer_idx;
+        for (int i = k - 1; i >= 0; --i) {
+            bases[i] = tmp & 3;
+            tmp >>= 2;
+        }
+    }
+    bases[k] = next_base_idx;
+
+    // Build reverse complement:
+    //   revcomp_context = comp(B(k)) comp(B(k-1)) ... comp(B1)
+    //   revcomp_next = comp(B0)
+    revcomp_context_idx = 0;
+    for (int i = k; i >= 1; --i) {
+        revcomp_context_idx = (revcomp_context_idx << 2) | complement_base(bases[i]);
+    }
+    revcomp_next_idx = complement_base(bases[0]);
 }
 
 void StratifiedMarkovModel::save(const std::string& path) const {
@@ -247,6 +259,10 @@ void StratifiedMarkovModel::save(const std::string& path) const {
     ofs.write(reinterpret_cast<const char*>(&FILE_MAGIC), sizeof(FILE_MAGIC));
     ofs.write(reinterpret_cast<const char*>(&FILE_VERSION), sizeof(FILE_VERSION));
 
+    // Write k and num_kmers (new in version 2)
+    ofs.write(reinterpret_cast<const char*>(&m_k), sizeof(m_k));
+    ofs.write(reinterpret_cast<const char*>(&m_num_kmers), sizeof(m_num_kmers));
+
     // Write number of bins
     ofs.write(reinterpret_cast<const char*>(&m_num_bins), sizeof(m_num_bins));
 
@@ -261,20 +277,16 @@ void StratifiedMarkovModel::save(const std::string& path) const {
         ofs.write(reinterpret_cast<const char*>(&m_per_bin_kmers[i]), sizeof(m_per_bin_kmers[i]));
     }
 
-    // Write counts
+    // Write counts (flat: num_kmers * NUM_BASES uint64_t values per bin)
     for (int b = 0; b < m_num_bins; ++b) {
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            ofs.write(reinterpret_cast<const char*>(m_counts[b][ctx].data()),
-                      NUM_BASES * sizeof(uint64_t));
-        }
+        ofs.write(reinterpret_cast<const char*>(m_counts[b].data()),
+                  m_num_kmers * NUM_BASES * sizeof(uint64_t));
     }
 
-    // Write CDFs
+    // Write CDFs (flat: num_kmers * NUM_BASES float values per bin)
     for (int b = 0; b < m_num_bins; ++b) {
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            ofs.write(reinterpret_cast<const char*>(m_cdf[b][ctx].data()),
-                      NUM_BASES * sizeof(float));
-        }
+        ofs.write(reinterpret_cast<const char*>(m_cdf[b].data()),
+                  m_num_kmers * NUM_BASES * sizeof(float));
     }
 
     if (!ofs) {
@@ -300,7 +312,24 @@ StratifiedMarkovModel StratifiedMarkovModel::load(const std::string& path) {
     // Read and verify version
     uint32_t version;
     ifs.read(reinterpret_cast<char*>(&version), sizeof(version));
-    if (version != FILE_VERSION) {
+
+    if (version == FILE_VERSION_1) {
+        // Legacy version 1: hardcoded k=5, num_kmers=1024
+        model.m_k = 5;
+        model.m_num_kmers = 1024;
+    } else if (version == FILE_VERSION) {
+        // Version 2: read k and num_kmers from header
+        ifs.read(reinterpret_cast<char*>(&model.m_k), sizeof(model.m_k));
+        ifs.read(reinterpret_cast<char*>(&model.m_num_kmers), sizeof(model.m_num_kmers));
+        if (model.m_k < 1 || model.m_k > MAX_K) {
+            TGLError("Invalid Markov order k=%d in file: %s", model.m_k, path.c_str());
+        }
+        int expected_num_kmers = 1 << (2 * model.m_k);
+        if (model.m_num_kmers != expected_num_kmers) {
+            TGLError("Inconsistent num_kmers (%d, expected %d for k=%d) in file: %s",
+                     model.m_num_kmers, expected_num_kmers, model.m_k, path.c_str());
+        }
+    } else {
         TGLError("Unsupported file version %u in: %s", version, path.c_str());
     }
 
@@ -320,22 +349,25 @@ StratifiedMarkovModel StratifiedMarkovModel::load(const std::string& path) {
         ifs.read(reinterpret_cast<char*>(&model.m_per_bin_kmers[i]), sizeof(uint64_t));
     }
 
+    int num_kmers = model.m_num_kmers;
+    int flat_size = num_kmers * NUM_BASES;
+
     // Read counts
+    // Both version 1 and 2 use the same contiguous layout:
+    // [ctx0_base0, ctx0_base1, ..., ctxN_base3]
     model.m_counts.resize(model.m_num_bins);
     for (int b = 0; b < model.m_num_bins; ++b) {
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            ifs.read(reinterpret_cast<char*>(model.m_counts[b][ctx].data()),
-                     NUM_BASES * sizeof(uint64_t));
-        }
+        model.m_counts[b].resize(flat_size);
+        ifs.read(reinterpret_cast<char*>(model.m_counts[b].data()),
+                 flat_size * sizeof(uint64_t));
     }
 
     // Read CDFs
     model.m_cdf.resize(model.m_num_bins);
     for (int b = 0; b < model.m_num_bins; ++b) {
-        for (int ctx = 0; ctx < NUM_5MERS; ++ctx) {
-            ifs.read(reinterpret_cast<char*>(model.m_cdf[b][ctx].data()),
-                     NUM_BASES * sizeof(float));
-        }
+        model.m_cdf[b].resize(flat_size);
+        ifs.read(reinterpret_cast<char*>(model.m_cdf[b].data()),
+                 flat_size * sizeof(float));
     }
 
     if (!ifs) {
diff --git a/src/StratifiedMarkovModel.h b/src/StratifiedMarkovModel.h
index 4c7cee28..77978d2e 100644
--- a/src/StratifiedMarkovModel.h
+++ b/src/StratifiedMarkovModel.h
@@ -1,9 +1,10 @@
 /*
  * StratifiedMarkovModel.h
  *
- * Stratified Markov-5 model for synthetic genome generation.
- * Stores transition probabilities for 5-mer -> base transitions,
+ * Stratified Markov model for synthetic genome generation.
+ * Stores transition probabilities for k-mer -> base transitions,
  * stratified by bins of a track expression (e.g., GC content).
+ * Supports configurable Markov order k (1..8).
  */
 
 #ifndef STRATIFIED_MARKOV_MODEL_H_
@@ -14,30 +15,39 @@
 #include <string>
 #include <vector>
 
-// Number of possible 5-mers (4^5 = 1024)
+// Kept for backward compatibility with external callers.
+// Do NOT rely on NUM_5MERS inside StratifiedMarkovModel internals.
 constexpr int NUM_5MERS = 1024;
 
 // Number of bases (A, C, G, T)
 constexpr int NUM_BASES = 4;
 
 /**
- * StratifiedMarkovModel: A 5th-order Markov model stratified by track values.
+ * StratifiedMarkovModel: A k-th order Markov model stratified by track values.
  *
  * For each bin (based on a track expression like GC content), stores:
- * - Counts of 6-mers (5-mer context + next base)
- * - Probability distributions P(next_base | 5-mer_context)
+ * - Counts of (k+1)-mers (k-mer context + next base)
+ * - Probability distributions P(next_base | k-mer_context)
  * - Cumulative distribution functions for sampling
+ *
+ * The Markov order k is configurable at init time (1..8, default 5).
  */
 class StratifiedMarkovModel {
 public:
+    // Maximum supported Markov order
+    static constexpr int MAX_K = 8;
+    // Maximum number of k-mers (4^MAX_K = 65536)
+    static constexpr int MAX_KMERS = 65536;
+
     StratifiedMarkovModel();
 
     /**
-     * Initialize the model with the specified number of bins.
+     * Initialize the model with the specified number of bins and Markov order.
      * @param num_bins Number of stratification bins
      * @param breaks Bin boundaries (size = num_bins + 1)
+     * @param k Markov order (1..8, default 5 for backward compatibility)
      */
-    void init(int num_bins, const std::vector<double>& breaks);
+    void init(int num_bins, const std::vector<double>& breaks, int k = 5);
 
     /**
      * Reset all counts to zero.
@@ -45,12 +55,12 @@ class StratifiedMarkovModel {
     void reset_counts();
 
     /**
-     * Increment the count for a 6-mer in the specified bin.
+     * Increment the count for a (k+1)-mer in the specified bin.
      * @param bin_idx Bin index (0 to num_bins-1)
-     * @param context_5mer_idx Encoded 5-mer context (0 to 1023)
+     * @param context_kmer_idx Encoded k-mer context (0 to m_num_kmers-1)
      * @param next_base_idx Next base index (0=A, 1=C, 2=G, 3=T)
      */
-    void increment_count(int bin_idx, int context_5mer_idx, int next_base_idx);
+    void increment_count(int bin_idx, int context_kmer_idx, int next_base_idx);
 
     /**
      * Apply bin mapping: merge sparse bins into target bins.
@@ -75,20 +85,31 @@ class StratifiedMarkovModel {
     int get_bin(double track_value) const;
 
     /**
-     * Sample the next base given a 5-mer context and bin.
+     * Sample the next base given a k-mer context and bin.
      * @param bin_idx Bin index
-     * @param context_5mer_idx Encoded 5-mer context
+     * @param context_kmer_idx Encoded k-mer context
      * @param random_val Random value in [0, 1)
      * @return Base index (0=A, 1=C, 2=G, 3=T)
      */
-    int sample_next_base(int bin_idx, int context_5mer_idx, float random_val) const;
+    int sample_next_base(int bin_idx, int context_kmer_idx, float random_val) const;
+
+    /**
+     * Encode a k-mer sequence to an index.
+     * @param seq Pointer to k characters (A/C/G/T)
+     * @param k Number of characters to encode
+     * @return Index (0 to 4^k - 1), or -1 if any character is not A/C/G/T
+     */
+    static int encode_kmer(const char* seq, int k);
 
     /**
      * Encode a 5-mer sequence to an index (0-1023).
+     * Backward-compatible wrapper around encode_kmer with k=5.
      * @param seq Pointer to 5 characters (A/C/G/T)
      * @return Index (0-1023), or -1 if any character is not A/C/G/T
      */
-    static int encode_5mer(const char* seq);
+    static inline int encode_5mer(const char* seq) {
+        return encode_kmer(seq, 5);
+    }
 
     /**
      * Encode a single base to an index.
@@ -104,35 +125,60 @@ class StratifiedMarkovModel {
      */
     static char decode_base(int idx);
 
+    /**
+     * Decode a k-mer index to a string.
+     * @param idx k-mer index (0 to 4^k - 1)
+     * @param out Output buffer (must have space for k characters)
+     * @param k Number of characters to decode
+     */
+    static void decode_kmer(int idx, char* out, int k);
+
     /**
      * Decode a 5-mer index to a string.
+     * Backward-compatible wrapper around decode_kmer with k=5.
      * @param idx 5-mer index (0-1023)
      * @param out Output buffer (must have space for 5 characters)
      */
-    static void decode_5mer(int idx, char* out);
+    static inline void decode_5mer(int idx, char* out) {
+        decode_kmer(idx, out, 5);
+    }
 
     /**
      * Get the complement of a base index.
      * @param base_idx Base index (0=A, 1=C, 2=G, 3=T)
-     * @return Complement base index (A↔T, C↔G)
+     * @return Complement base index (A<->T, C<->G)
      */
     static int complement_base(int base_idx);
 
     /**
-     * Compute the reverse complement of a 6-mer given as context + next base.
-     * Given forward 6-mer B0B1B2B3B4B5 (context=B0-B4, next=B5),
-     * computes reverse complement: comp(B5)comp(B4)comp(B3)comp(B2)comp(B1)comp(B0)
-     * where the new context is comp(B5)-comp(B1) and new next is comp(B0).
+     * Compute the reverse complement of a (k+1)-mer given as context + next base.
+     * Given forward (k+1)-mer B0 B1 ... B(k-1) B(k) (context=B0..B(k-1), next=B(k)),
+     * computes reverse complement:
+     *   comp(B(k)) comp(B(k-1)) ... comp(B1) comp(B0)
+     * where the new context is comp(B(k))..comp(B1) and new next is comp(B0).
      *
-     * @param context_5mer_idx Encoded 5-mer context (0 to 1023)
+     * @param context_kmer_idx Encoded k-mer context (0 to m_num_kmers-1)
      * @param next_base_idx Next base index (0=A, 1=C, 2=G, 3=T)
-     * @param revcomp_context_idx [out] Reverse complement 5-mer context index
+     * @param k Markov order (context length)
+     * @param revcomp_context_idx [out] Reverse complement k-mer context index
      * @param revcomp_next_idx [out] Reverse complement next base index
      */
-    static void revcomp_6mer(int context_5mer_idx, int next_base_idx,
+    static void revcomp_kmer(int context_kmer_idx, int next_base_idx, int k,
                              int& revcomp_context_idx, int& revcomp_next_idx);
 
+    /**
+     * Compute reverse complement of a 6-mer (k=5).
+     * Backward-compatible wrapper around revcomp_kmer with k=5.
+     */
+    static inline void revcomp_6mer(int context_5mer_idx, int next_base_idx,
+                                    int& revcomp_context_idx, int& revcomp_next_idx) {
+        revcomp_kmer(context_5mer_idx, next_base_idx, 5,
+                     revcomp_context_idx, revcomp_next_idx);
+    }
+
     // Accessors
+    int get_k() const { return m_k; }
+    int get_num_kmers() const { return m_num_kmers; }
     int get_num_bins() const { return m_num_bins; }
     const std::vector<double>& get_breaks() const { return m_breaks; }
     uint64_t get_total_kmers() const { return m_total_kmers; }
@@ -140,17 +186,17 @@ class StratifiedMarkovModel {
 
     // Data accessors for R interface
     uint64_t get_count(int bin_idx, int context_idx, int base_idx) const {
-        return m_counts[bin_idx][context_idx][base_idx];
+        return m_counts[bin_idx][context_idx * NUM_BASES + base_idx];
     }
     float get_cdf(int bin_idx, int context_idx, int base_idx) const {
-        return m_cdf[bin_idx][context_idx][base_idx];
+        return m_cdf[bin_idx][context_idx * NUM_BASES + base_idx];
     }
 
     // Direct access for bulk data transfer
-    const std::vector<std::array<std::array<uint64_t, NUM_BASES>, NUM_5MERS>>& get_counts() const {
+    const std::vector<std::vector<uint64_t>>& get_counts() const {
         return m_counts;
     }
-    const std::vector<std::array<std::array<float, NUM_BASES>, NUM_5MERS>>& get_cdf() const {
+    const std::vector<std::vector<float>>& get_cdf() const {
         return m_cdf;
     }
 
@@ -168,15 +214,17 @@ class StratifiedMarkovModel {
     static StratifiedMarkovModel load(const std::string& path);
 
 private:
+    int m_k;          // Markov order (1..8)
+    int m_num_kmers;  // = 4^k, computed at init time
     int m_num_bins;
     std::vector<double> m_breaks;
 
-    // counts[bin][5mer_idx][base_idx]
-    std::vector<std::array<std::array<uint64_t, NUM_BASES>, NUM_5MERS>> m_counts;
+    // Flat vectors: counts[bin][kmer_idx * NUM_BASES + base_idx]
+    std::vector<std::vector<uint64_t>> m_counts;
 
-    // Cumulative distribution functions for sampling: cdf[bin][5mer_idx][base_idx]
-    // cdf[bin][ctx][0] = P(A), cdf[bin][ctx][1] = P(A)+P(C), etc.
-    std::vector<std::array<std::array<float, NUM_BASES>, NUM_5MERS>> m_cdf;
+    // Cumulative distribution functions for sampling: cdf[bin][kmer_idx * NUM_BASES + base_idx]
+    // For a given context ctx: cdf[bin][ctx*4+0] = P(A), cdf[bin][ctx*4+1] = P(A)+P(C), etc.
+    std::vector<std::vector<float>> m_cdf;
 
     // Statistics
     uint64_t m_total_kmers;
@@ -184,7 +232,10 @@ class StratifiedMarkovModel {
 
     // Magic number for file format
     static constexpr uint32_t FILE_MAGIC = 0x4D4B5653; // "SVKM" in little-endian
-    static constexpr uint32_t FILE_VERSION = 1;
+    // Version 2: stores k and num_kmers in header
+    static constexpr uint32_t FILE_VERSION = 2;
+    // Version 1 for backward-compatible loading of old files
+    static constexpr uint32_t FILE_VERSION_1 = 1;
 };
 
 #endif // STRATIFIED_MARKOV_MODEL_H_
diff --git a/src/misha-init.cpp b/src/misha-init.cpp
index 1c92572b..c42971f2 100644
--- a/src/misha-init.cpp
+++ b/src/misha-init.cpp
@@ -104,8 +104,8 @@ extern "C" {
     extern SEXP C_gseq_pwm_multitask(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
     extern SEXP C_gseq_pwm_edits(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
     extern SEXP C_gseq_kmer(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-    extern SEXP C_gsynth_train(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-    extern SEXP C_gsynth_sample(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+    extern SEXP C_gsynth_train(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+    extern SEXP C_gsynth_sample(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
     extern SEXP C_gsynth_replace_kmer(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
     extern SEXP C_gseq_kmer_dist(SEXP, SEXP, SEXP, SEXP);
 }
@@ -208,8 +208,8 @@ static const R_CallMethodDef CallEntries[] = {
     {"C_gseq_pwm_multitask", (DL_FUNC)&C_gseq_pwm_multitask, 17},
     {"C_gseq_pwm_edits", (DL_FUNC)&C_gseq_pwm_edits, 14},
     {"C_gseq_kmer", (DL_FUNC)&C_gseq_kmer, 9},
-    {"C_gsynth_train", (DL_FUNC)&C_gsynth_train, 11},
-    {"C_gsynth_sample", (DL_FUNC)&C_gsynth_sample, 11},
+    {"C_gsynth_train", (DL_FUNC)&C_gsynth_train, 12},
+    {"C_gsynth_sample", (DL_FUNC)&C_gsynth_sample, 12},
     {"C_gsynth_replace_kmer", (DL_FUNC)&C_gsynth_replace_kmer, 6},
     {"C_gseq_kmer_dist", (DL_FUNC)&C_gseq_kmer_dist, 4},
     {NULL, NULL, 0}
diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R
new file mode 100644
index 00000000..83ed0df4
--- /dev/null
+++ b/tests/testthat/test-gsynth-k.R
@@ -0,0 +1,308 @@
+# Tests for variable Markov order k parameter in gsynth functions
+
+test_that("gsynth.train errors on invalid k values", {
+    gdb.init("/home/aviezerl/hg38")
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    # k = 0 should error (below minimum of 1)
+    expect_error(
+        gsynth.train(intervals = test_intervals, iterator = 200, k = 0),
+        "k must be a single integer between 1 and 8"
+    )
+
+    # k = 9 should error (above maximum of 8)
+    expect_error(
+        gsynth.train(intervals = test_intervals, iterator = 200, k = 9),
+        "k must be a single integer between 1 and 8"
+    )
+
+    # k = 3.5 is silently truncated to 3L by as.integer(), which is valid.
+    # Verify it does NOT error (the code uses as.integer(k) without strict
+    # integer checking, so non-integer numerics are accepted).
+    expect_error(
+        gsynth.train(intervals = test_intervals, iterator = 200, k = 3.5),
+        NA
+    )
+
+    # k = "abc" should error (as.integer("abc") produces NA)
+    expect_error(
+        suppressWarnings(
+            gsynth.train(intervals = test_intervals, iterator = 200, k = "abc")
+        ),
+        "k must be a single integer between 1 and 8"
+    )
+})
+
+test_that("gsynth.train with k=3 basic training and sampling", {
+    gdb.init("/home/aviezerl/hg38")
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    # Train a 0D model with k=3
+    model <- gsynth.train(
+        intervals = test_intervals,
+        iterator = 200,
+        k = 3
+    )
+
+    expect_s3_class(model, "gsynth.model")
+    expect_equal(model$k, 3L)
+    expect_equal(model$num_kmers, 64L) # 4^3
+
+    # CDF matrices should be 64 x 4
+    cdf_mat <- model$model_data$cdf[[1]]
+    expect_equal(dim(cdf_mat), c(64, 4))
+
+    # Sample and verify valid DNA
+    output_fasta <- tempfile(fileext = ".fa")
+    on.exit(unlink(output_fasta), add = TRUE)
+
+    gsynth.sample(
+        model,
+        output_fasta,
+        output_format = "fasta",
+        intervals = test_intervals,
+        seed = 60427
+    )
+
+    expect_true(file.exists(output_fasta))
+    lines <- readLines(output_fasta)
+    seq_lines <- lines[!grepl("^>", lines)]
+    full_seq <- paste(seq_lines, collapse = "")
+
+    # Should only contain A, C, G, T
+    expect_true(grepl("^[ACGT]+$", full_seq))
+
+    # Should have correct length (100kb)
+    expect_equal(nchar(full_seq), 100000)
+})
+
+test_that("gsynth.train with k=1 minimal model", {
+    gdb.init("/home/aviezerl/hg38")
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    model <- gsynth.train(
+        intervals = test_intervals,
+        iterator = 200,
+        k = 1
+    )
+
+    expect_s3_class(model, "gsynth.model")
+    expect_equal(model$k, 1L)
+    expect_equal(model$num_kmers, 4L) # 4^1
+
+    # CDF matrices should be 4 x 4
+    cdf_mat <- model$model_data$cdf[[1]]
+    expect_equal(dim(cdf_mat), c(4, 4))
+
+    # CDF should be valid
+    expect_true(all(cdf_mat >= 0))
+    expect_true(all(cdf_mat <= 1))
+    expect_true(all(abs(cdf_mat[, 4] - 1) < 1e-5))
+
+    # Sample and verify valid DNA
+    output_fasta <- tempfile(fileext = ".fa")
+    on.exit(unlink(output_fasta), add = TRUE)
+
+    gsynth.sample(
+        model,
+        output_fasta,
+        output_format = "fasta",
+        intervals = gintervals(1, 0, 10000),
+        seed = 42
+    )
+
+    lines <- readLines(output_fasta)
+    seq_lines <- lines[!grepl("^>", lines)]
+    full_seq <- paste(seq_lines, collapse = "")
+    expect_true(grepl("^[ACGT]+$", full_seq))
+    expect_equal(nchar(full_seq), 10000)
+})
+
+test_that("gsynth.train with k=7 larger model", {
+    gdb.init("/home/aviezerl/hg38")
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    model <- gsynth.train(
+        intervals = test_intervals,
+        iterator = 200,
+        k = 7
+    )
+
+    expect_s3_class(model, "gsynth.model")
+    expect_equal(model$k, 7L)
+    expect_equal(model$num_kmers, 16384L) # 4^7
+
+    # CDF matrices should be 16384 x 4
+    cdf_mat <- model$model_data$cdf[[1]]
+    expect_equal(dim(cdf_mat), c(16384, 4))
+
+    # Sample and verify
+    output_fasta <- tempfile(fileext = ".fa")
+    on.exit(unlink(output_fasta), add = TRUE)
+
+    gsynth.sample(
+        model,
+        output_fasta,
+        output_format = "fasta",
+        intervals = gintervals(1, 0, 10000),
+        seed = 99
+    )
+
+    lines <- readLines(output_fasta)
+    seq_lines <- lines[!grepl("^>", lines)]
+    full_seq <- paste(seq_lines, collapse = "")
+    expect_true(grepl("^[ACGT]+$", full_seq))
+    expect_equal(nchar(full_seq), 10000)
+})
+
+test_that("save/load round-trip with k != 5", {
+    gdb.init("/home/aviezerl/hg38")
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    model <- gsynth.train(
+        intervals = test_intervals,
+        iterator = 200,
+        k = 3
+    )
+
+    # Save as .gsm directory
+    out_dir <- file.path(tempdir(), "test_gsm_k3")
+    on.exit(unlink(out_dir, recursive = TRUE), add = TRUE)
+
+    gsynth.save(model, out_dir)
+    loaded <- gsynth.load(out_dir)
+
+    expect_s3_class(loaded, "gsynth.model")
+    expect_equal(loaded$k, 3L)
+    expect_equal(loaded$num_kmers, 64L)
+
+    # Verify CDF dimensions match
+    expect_equal(dim(loaded$model_data$cdf[[1]]), c(64, 4))
+
+    # Verify model data matches exactly
+    expect_equal(loaded$model_data$counts[[1]], model$model_data$counts[[1]],
+        tolerance = 1e-15
+    )
+    expect_equal(loaded$model_data$cdf[[1]], model$model_data$cdf[[1]],
+        tolerance = 1e-15
+    )
+
+    # Verify sampling is identical with same seed.
+    # Loaded model has iterator=NULL (not stored in .gsm format),
+    # so restore it from the original model before sampling.
+    loaded$iterator <- model$iterator
+
+    out1 <- tempfile(fileext = ".fa")
+    out2 <- tempfile(fileext = ".fa")
+    on.exit(unlink(c(out1, out2)), add = TRUE)
+
+    sample_intervals <- gintervals(1, 0, 10000)
+
+    gsynth.sample(model, out1,
+        output_format = "fasta",
+        intervals = sample_intervals, seed = 60427
+    )
+    gsynth.sample(loaded, out2,
+        output_format = "fasta",
+        intervals = sample_intervals, seed = 60427
+    )
+
+    expect_identical(readLines(out1), readLines(out2))
+})
+
+test_that("k=5 produces identical results to default", {
+    gdb.init("/home/aviezerl/hg38")
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    # Train with explicit k=5
+    model_k5 <- gsynth.train(
+        intervals = test_intervals,
+        iterator = 200,
+        k = 5
+    )
+
+    # Train with default (no k specified)
+    model_default <- gsynth.train(
+        intervals = test_intervals,
+        iterator = 200
+    )
+
+    expect_equal(model_k5$k, 5L)
+    expect_equal(model_default$k, 5L)
+    expect_equal(model_k5$num_kmers, 1024L)
+    expect_equal(model_default$num_kmers, 1024L)
+
+    # CDF matrices should be identical
+    expect_equal(
+        model_k5$model_data$cdf[[1]],
+        model_default$model_data$cdf[[1]],
+        tolerance = 1e-15
+    )
+
+    # Counts should be identical
+    expect_equal(
+        model_k5$model_data$counts[[1]],
+        model_default$model_data$counts[[1]],
+        tolerance = 1e-15
+    )
+})
+
+test_that("k with 1D stratification", {
+    gdb.init("/home/aviezerl/hg38")
+
+    for (vt in c("g_frac_k", "c_frac_k")) {
+        if (vt %in% gvtrack.ls()) gvtrack.rm(vt)
+    }
+    gvtrack.create("g_frac_k", NULL, "kmer.frac", kmer = "G")
+    gvtrack.create("c_frac_k", NULL, "kmer.frac", kmer = "C")
+    on.exit({
+        if ("g_frac_k" %in% gvtrack.ls()) gvtrack.rm("g_frac_k")
+        if ("c_frac_k" %in% gvtrack.ls()) gvtrack.rm("c_frac_k")
+    }, add = TRUE)
+
+    test_intervals <- gintervals(1, 0, 100000)
+
+    # Train 1D model with k=3
+    model <- gsynth.train(
+        list(expr = "g_frac_k + c_frac_k", breaks = seq(0, 1, 0.2)), # 5 bins
+        intervals = test_intervals,
+        iterator = 200,
+        k = 3
+    )
+
+    expect_s3_class(model, "gsynth.model")
+    expect_equal(model$k, 3L)
+    expect_equal(model$num_kmers, 64L)
+    expect_equal(model$n_dims, 1)
+    expect_equal(model$dim_sizes, 5)
+    expect_equal(model$total_bins, 5)
+
+    # All CDF matrices should be 64 x 4
+    for (bin in seq_len(model$total_bins)) {
+        expect_equal(dim(model$model_data$cdf[[bin]]), c(64, 4))
+    }
+
+    # Sample and verify
+    output_fasta <- tempfile(fileext = ".fa")
+    on.exit(unlink(output_fasta), add = TRUE)
+
+    gsynth.sample(
+        model,
+        output_fasta,
+        output_format = "fasta",
+        intervals = gintervals(1, 0, 10000),
+        seed = 60427
+    )
+
+    lines <- readLines(output_fasta)
+    seq_lines <- lines[!grepl("^>", lines)]
+    full_seq <- paste(seq_lines, collapse = "")
+    expect_true(grepl("^[ACGT]+$", full_seq))
+    expect_equal(nchar(full_seq), 10000)
+})

From 188879232704805759b11691c2192dd5fbfa3bbd Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Mon, 30 Mar 2026 23:48:46 +0300
Subject: [PATCH 2/7] fix: avoid integer overflow when loading large high-order
 .gsm models

total_bins * num_kmers * 4L overflows R's 32-bit integer for k=8 models
with >= 8192 bins. Use double arithmetic for expected_n instead.
---
 R/synth.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/synth.R b/R/synth.R
index 059a7104..d16c7bc7 100644
--- a/R/synth.R
+++ b/R/synth.R
@@ -993,7 +993,7 @@ gsynth.convert <- function(input_file, output_file, compress = FALSE) {
         stop("Binary data files (counts.bin, cdf.bin) not found", call. = FALSE)
     }
 
-    expected_n <- total_bins * num_kmers * 4L
+    expected_n <- as.double(total_bins) * as.double(num_kmers) * 4
 
     counts_flat <- readBin(counts_path, "double", n = expected_n, size = 8L, endian = "little")
     cdf_flat <- readBin(cdf_path, "double", n = expected_n, size = 8L, endian = "little")

From 7f4cd5e3b15c8312756a8ed32417c0450ceecb64 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Mon, 30 Mar 2026 23:57:52 +0300
Subject: [PATCH 3/7] fix: reject non-integer k values instead of silently
 truncating

k=3.5 was silently converted to 3 via as.integer(). Now validate that
k == as.integer(k) before coercion, erroring on non-integer numerics.
---
 R/synth.R                      | 4 ++--
 tests/testthat/test-gsynth-k.R | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/R/synth.R b/R/synth.R
index d16c7bc7..d81c5256 100644
--- a/R/synth.R
+++ b/R/synth.R
@@ -419,10 +419,10 @@ gsynth.train <- function(...,
     .gcheckroot()
 
     # Validate k (Markov order)
-    k <- as.integer(k)
-    if (length(k) != 1L || is.na(k) || k < 1L || k > 8L) {
+    if (!is.numeric(k) || length(k) != 1L || is.na(k) || k != as.integer(k) || k < 1L || k > 8L) {
         stop("k must be a single integer between 1 and 8", call. = FALSE)
     }
+    k <- as.integer(k)
     num_kmers <- as.integer(4L^k)
 
     # Capture all dimension specs from ...
diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R
index 83ed0df4..45f768da 100644
--- a/tests/testthat/test-gsynth-k.R
+++ b/tests/testthat/test-gsynth-k.R
@@ -17,12 +17,10 @@ test_that("gsynth.train errors on invalid k values", {
         "k must be a single integer between 1 and 8"
     )
 
-    # k = 3.5 is silently truncated to 3L by as.integer(), which is valid.
-    # Verify it does NOT error (the code uses as.integer(k) without strict
-    # integer checking, so non-integer numerics are accepted).
+    # k = 3.5 should error (non-integer numeric)
     expect_error(
         gsynth.train(intervals = test_intervals, iterator = 200, k = 3.5),
-        NA
+        "k must be a single integer between 1 and 8"
     )
 
     # k = "abc" should error (as.integer("abc") produces NA)

From 86224f9fbee53b5a6fb55110783a30b376c5c321 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Tue, 31 Mar 2026 00:02:15 +0300
Subject: [PATCH 4/7] fix: bump .gsm metadata version to 2 for k != 5 models

v1 implied fixed 1024x4 context layout. Models with k != 5 now emit
version: 2 so external .gsm readers can distinguish variable-order files.
k=5 models continue to emit version: 1 for backward compatibility.
---
 R/synth.R                      | 2 +-
 tests/testthat/test-gsynth-k.R | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/R/synth.R b/R/synth.R
index d81c5256..973be337 100644
--- a/R/synth.R
+++ b/R/synth.R
@@ -805,7 +805,7 @@ gsynth.save <- function(model, file, compress = FALSE) {
     # Build metadata
     metadata <- list(
         format = "gsynth_model",
-        version = 1L,
+        version = if (is.null(model$k) || model$k == 5L) 1L else 2L,
         markov_order = as.integer(if (is.null(model$k)) 5L else model$k),
         n_dims = as.integer(n_dims),
         dim_sizes = if (n_dims > 0) as.integer(model$dim_sizes) else list(),
diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R
index 45f768da..38574332 100644
--- a/tests/testthat/test-gsynth-k.R
+++ b/tests/testthat/test-gsynth-k.R
@@ -173,6 +173,12 @@ test_that("save/load round-trip with k != 5", {
     on.exit(unlink(out_dir, recursive = TRUE), add = TRUE)
 
     gsynth.save(model, out_dir)
+
+    # Verify .gsm metadata has version 2 for k != 5
+    meta <- yaml::read_yaml(file.path(out_dir, "metadata.yaml"))
+    expect_equal(meta$version, 2L)
+    expect_equal(meta$markov_order, 3L)
+
     loaded <- gsynth.load(out_dir)
 
     expect_s3_class(loaded, "gsynth.model")

From 0e52f4b10046daf1f58c9b3736a5fcb5a1c6deab Mon Sep 17 00:00:00 2001
From: aviezerl <aviezerl@users.noreply.github.com>
Date: Mon, 30 Mar 2026 21:04:32 +0000
Subject: [PATCH 5/7] Style code (GHA)

---
 tests/testthat/test-gsynth-k.R | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R
index 38574332..9979faff 100644
--- a/tests/testthat/test-gsynth-k.R
+++ b/tests/testthat/test-gsynth-k.R
@@ -265,10 +265,13 @@ test_that("k with 1D stratification", {
     }
     gvtrack.create("g_frac_k", NULL, "kmer.frac", kmer = "G")
     gvtrack.create("c_frac_k", NULL, "kmer.frac", kmer = "C")
-    on.exit({
-        if ("g_frac_k" %in% gvtrack.ls()) gvtrack.rm("g_frac_k")
-        if ("c_frac_k" %in% gvtrack.ls()) gvtrack.rm("c_frac_k")
-    }, add = TRUE)
+    on.exit(
+        {
+            if ("g_frac_k" %in% gvtrack.ls()) gvtrack.rm("g_frac_k")
+            if ("c_frac_k" %in% gvtrack.ls()) gvtrack.rm("c_frac_k")
+        },
+        add = TRUE
+    )
 
     test_intervals <- gintervals(1, 0, 100000)
 

From 2793ff559dd5b72654619c3c8c6e94139711c22b Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Tue, 31 Mar 2026 00:06:44 +0300
Subject: [PATCH 6/7] chore: bump version to 5.6.9

---
 DESCRIPTION | 2 +-
 NEWS.md     | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d9704a3c..d5f3611d 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: misha
 Title: Toolkit for Analysis of Genomic Data
-Version: 5.6.8
+Version: 5.6.9
 Authors@R: c(
     person("Misha", "Hoichman", , "misha@hoichman.com", role = "aut"),
     person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")),
diff --git a/NEWS.md b/NEWS.md
index d917b1dd..f48ef0ca 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# misha 5.6.9
+
+* Added `k` parameter to `gsynth.train()` to configure the Markov order (1-8, default 5).
+
 # misha 5.6.8
 
 * Added `direction` parameter ("above"/"below") to PWM edit distance functions for computing minimum edits to bring score below a threshold (motif disruption).

From 34b475aae02dcec506f7f7468e6bcf889e60b6b5 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Wed, 1 Apr 2026 11:23:33 +0300
Subject: [PATCH 7/7] feat: raise max Markov order from k=8 to k=10

---
 R/synth.R                      |  6 +++---
 man/gsynth.train.Rd            |  2 +-
 src/StratifiedMarkovModel.h    |  6 +++---
 tests/testthat/test-gsynth-k.R | 12 ++++++------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/R/synth.R b/R/synth.R
index 973be337..85003235 100644
--- a/R/synth.R
+++ b/R/synth.R
@@ -337,7 +337,7 @@ gsynth.bin_map <- function(breaks, merge_ranges = NULL) {
 #'        with fewer observations will be marked as NA (not learned) and a warning will
 #'        be issued. Default is 0 (no minimum). During sampling, NA bins will fall back
 #'        to uniform sampling unless merged via \code{bin_merge}.
-#' @param k Integer Markov order (1--8). Default is 5, which models 6-mer
+#' @param k Integer Markov order (1--10). Default is 5, which models 6-mer
 #'        (context of length 5 plus the emitted base) transition probabilities.
 #'        Higher values capture longer-range sequence dependencies but require
 #'        exponentially more memory (\eqn{4^k} context states).
@@ -419,8 +419,8 @@ gsynth.train <- function(...,
     .gcheckroot()
 
     # Validate k (Markov order)
-    if (!is.numeric(k) || length(k) != 1L || is.na(k) || k != as.integer(k) || k < 1L || k > 8L) {
-        stop("k must be a single integer between 1 and 8", call. = FALSE)
+    if (!is.numeric(k) || length(k) != 1L || is.na(k) || k != as.integer(k) || k < 1L || k > 10L) {
+        stop("k must be a single integer between 1 and 10", call. = FALSE)
     }
     k <- as.integer(k)
     num_kmers <- as.integer(4L^k)
diff --git a/man/gsynth.train.Rd b/man/gsynth.train.Rd
index dfe5044b..3492dbfb 100644
--- a/man/gsynth.train.Rd
+++ b/man/gsynth.train.Rd
@@ -41,7 +41,7 @@ with fewer observations will be marked as NA (not learned) and a warning will
 be issued. Default is 0 (no minimum). During sampling, NA bins will fall back
 to uniform sampling unless merged via \code{bin_merge}.}
 
-\item{k}{Integer Markov order (1--8). Default is 5, which models 6-mer
+\item{k}{Integer Markov order (1--10). Default is 5, which models 6-mer
 (context of length 5 plus the emitted base) transition probabilities.
 Higher values capture longer-range sequence dependencies but require
 exponentially more memory (\eqn{4^k} context states).}
diff --git a/src/StratifiedMarkovModel.h b/src/StratifiedMarkovModel.h
index 77978d2e..558f6f07 100644
--- a/src/StratifiedMarkovModel.h
+++ b/src/StratifiedMarkovModel.h
@@ -35,9 +35,9 @@ constexpr int NUM_BASES = 4;
 class StratifiedMarkovModel {
 public:
     // Maximum supported Markov order
-    static constexpr int MAX_K = 8;
-    // Maximum number of k-mers (4^MAX_K = 65536)
-    static constexpr int MAX_KMERS = 65536;
+    static constexpr int MAX_K = 10;
+    // Maximum number of k-mers (4^MAX_K = 1048576)
+    static constexpr int MAX_KMERS = 1048576;
 
     StratifiedMarkovModel();
 
diff --git a/tests/testthat/test-gsynth-k.R b/tests/testthat/test-gsynth-k.R
index 9979faff..10d5f846 100644
--- a/tests/testthat/test-gsynth-k.R
+++ b/tests/testthat/test-gsynth-k.R
@@ -8,19 +8,19 @@ test_that("gsynth.train errors on invalid k values", {
     # k = 0 should error (below minimum of 1)
     expect_error(
         gsynth.train(intervals = test_intervals, iterator = 200, k = 0),
-        "k must be a single integer between 1 and 8"
+        "k must be a single integer between 1 and 10"
     )
 
-    # k = 9 should error (above maximum of 8)
+    # k = 11 should error (above maximum of 10)
     expect_error(
-        gsynth.train(intervals = test_intervals, iterator = 200, k = 9),
-        "k must be a single integer between 1 and 8"
+        gsynth.train(intervals = test_intervals, iterator = 200, k = 11),
+        "k must be a single integer between 1 and 10"
     )
 
     # k = 3.5 should error (non-integer numeric)
     expect_error(
         gsynth.train(intervals = test_intervals, iterator = 200, k = 3.5),
-        "k must be a single integer between 1 and 8"
+        "k must be a single integer between 1 and 10"
     )
 
     # k = "abc" should error (as.integer("abc") produces NA)
@@ -28,7 +28,7 @@ test_that("gsynth.train errors on invalid k values", {
         suppressWarnings(
             gsynth.train(intervals = test_intervals, iterator = 200, k = "abc")
         ),
-        "k must be a single integer between 1 and 8"
+        "k must be a single integer between 1 and 10"
     )
 })