Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
fdb8d2c
More error model and mutation model tests.
keshav-gandhi Nov 29, 2025
069272c
Readability updates to README.
keshav-gandhi Nov 29, 2025
9d43fa3
Full Markov integration set of scripts.
keshav-gandhi Dec 5, 2025
b539df2
Merge branch 'main' into markov-integration
keshav-gandhi Dec 5, 2025
313e1e8
Merge branch 'main' into markov-integration
keshav-gandhi Jan 29, 2026
a666171
Merge pull request #224 from ncsa/markov-integration
keshav-gandhi Jan 29, 2026
7d6808f
Renaming to parallel_block_size and parallel_mode; fixing tests.
keshav-gandhi Jan 30, 2026
55541ff
Installation instructions.
keshav-gandhi Feb 2, 2026
2e8e30c
Merge pull request #240 from ncsa/quality-of-life
keshav-gandhi Feb 18, 2026
5a4d347
Markov quality score model changes copied in here.
keshav-gandhi Feb 23, 2026
b0ad46a
README updates including Markov stuff.
keshav-gandhi Feb 27, 2026
af8ba09
Removing accidental log file.
keshav-gandhi Feb 28, 2026
e94e785
Transition matrices better accounted for.
keshav-gandhi Mar 5, 2026
cfb8323
Bug fixing and cleaning up.
keshav-gandhi Mar 5, 2026
96085c0
Type bug and cleaning up.
keshav-gandhi Mar 5, 2026
015c01f
Merge branch 'markov-integration' into markov-separate
keshav-gandhi Mar 5, 2026
a026d33
Merge pull request #251 from ncsa/markov-separate
keshav-gandhi Mar 5, 2026
c1cbe3e
Merge branch 'develop' into markov-integration
keshav-gandhi Mar 5, 2026
ff13ea9
Formatting in template_neat_config.yml
keshav-gandhi Mar 5, 2026
0b69f8d
Delete version.py (we already have a 4.3.6 and version tracker now)
keshav-gandhi Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions config_template/template_neat_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ rng_seed: .
# type: int | required = no
min_mutations: .

# Overwrite the output files, if they are named the same as the current run.
# Default is to quit if files already exist to avoid data destruction
# Overwrite output files if they already exist
# type: bool | required = no | default = false
overwrite_output: .

Expand All @@ -153,12 +152,12 @@ parallel_block_size: .
threads: .

# Delete the 'splits' directory after stitching completes
# Note if threads == 1, this option has no effect.
# Note: If threads == 1, this option has no effect.
# type = bool | required: no | default = true
cleanup_splits: .

# Reuse existing files in '<out_dir>/splits' and skip the split step.
# The directory must contain NEAT-generated files and must be in the output directory within "splits"
# Note if threads == 1, this option has no effect.
# Note: If threads == 1, this option has no effect.
# type = bool | required: no | default = False
reuse_splits: .
reuse_splits: .
3 changes: 2 additions & 1 deletion neat/cli/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
"""Modules related to the subcommands' interfaces"""
"""Modules related to the subcommands"""

from .base import *
87 changes: 87 additions & 0 deletions neat/cli/commands/model_qual_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import argparse

from ...model_quality_score import model_qual_score_runner
from .base import BaseCommand
from .options import output_group


class Command(BaseCommand):
    """
    CLI command that builds a quality score model (traditional or Markov
    chain) from one or two FASTQ inputs.
    """

    name = "model-qual-score"
    description = "Generate quality score model from FASTQ (optional Markov chain)."

    def add_arguments(self, parser: argparse.ArgumentParser):
        """Register every command-line option for this subcommand."""
        # One (flags, keyword-options) pair per CLI option; registered in a
        # single pass below so the option table reads like documentation.
        argument_specs = (
            (("-i",), {"dest": "input_files", "metavar": "FILE", "nargs": "+",
                       "required": True,
                       "help": "Input FASTQ file(s) (gzipped or plain)."}),
            (("-q",), {"dest": "quality_offset", "type": int, "default": 33,
                       "help": "Quality score offset [33]."}),
            (("-Q",), {"dest": "quality_scores", "type": int, "nargs": "+",
                       "default": [42],
                       "help": "Max quality or explicit list of quality scores [42]."}),
            (("-m",), {"dest": "max_num", "type": int, "default": -1,
                       "help": "Max number of reads to process [-1 = all]."}),
            (("--markov",), {"dest": "use_markov", "action": "store_true",
                             "default": False,
                             "help": "Use Markov quality model instead of the traditional model."}),
            (("--overwrite",), {"dest": "overwrite", "action": "store_true",
                                "default": False,
                                "help": "Overwrite existing output file if present."}),
        )

        for flags, options in argument_specs:
            parser.add_argument(*flags, **options)

        output_group.add_to_parser(parser)

    def execute(self, arguments: argparse.Namespace):
        """Forward the parsed CLI options to the model runner."""
        scores = arguments.quality_scores
        # A single -Q value means "max quality"; several mean explicit bins.
        qual_scores: int | list[int] = scores[0] if len(scores) == 1 else scores

        model_qual_score_runner(
            files=arguments.input_files,
            offset=arguments.quality_offset,
            qual_scores=qual_scores,
            max_reads=arguments.max_num,
            overwrite=arguments.overwrite,
            output_dir=arguments.output_dir,
            output_prefix=arguments.prefix,
            use_markov=arguments.use_markov,
        )
9 changes: 9 additions & 0 deletions neat/model_quality_score/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Submodule to build quality score models for NEAT that wraps NEAT’s
existing sequencing error model and traditional quality model, with the
option to construct a Markov chain–based quality model instead
"""

__all__ = ["model_qual_score_runner"]

from .runner import model_qual_score_runner
197 changes: 197 additions & 0 deletions neat/model_quality_score/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""
Runner for creating quality score models.

This module implements the core logic for generating quality score models
from input FASTQ files.
"""

import gzip
import pickle
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

import numpy as np

from ..common import validate_input_path, validate_output_path
from ..model_sequencing_error.utils import parse_file
from ..models import SequencingErrorModel, TraditionalQualityModel
from ..models.markov_quality_model import MarkovQualityModel
from ..quality_score_modeling.markov_utils import build_markov_model

__all__ = ["model_qual_score_runner"]

_LOG = logging.getLogger(__name__)


def _prepare_quality_scores_argument(
qual_scores: int | Iterable[int | float],
) -> Tuple[List[int], Optional[List[int]]]:
"""
Returns:
- full_range_scores: required by parse_file indexing
- allowed_bins: None (no binning) or sorted unique user bins (for Markov binning)
"""

if isinstance(qual_scores, int):
max_q = int(qual_scores)
return list(range(0, max_q + 1)), None

bins = sorted({int(x) for x in qual_scores})

if not bins:
raise ValueError("quality_scores list must not be empty.")

max_q = max(bins)

# parse_file requires indices up to max_q (it indexes by raw Q value)
return list(range(0, max_q + 1)), bins


def model_qual_score_runner(
    files: List[str],
    offset: int,
    qual_scores: int | Iterable[int | float],
    max_reads: int,
    use_markov: bool,
    overwrite: bool,
    output_dir: str,
    output_prefix: str,
) -> None:
    """Create and save a quality score model from FASTQ data.

    For each input file (at most two) a SequencingErrorModel and a
    TraditionalQualityModel are always built; when ``use_markov`` is True a
    MarkovQualityModel is built per file as well and stored in the output in
    place of the traditional model. All models are pickled into a single
    gzipped file named ``<output_prefix>.p.gz`` in ``output_dir``.

    Args:
        files: Input FASTQ paths (gzipped or plain); only the first two are used.
        offset: ASCII quality score offset (e.g. 33).
        qual_scores: Maximum quality score (int) or explicit list of score bins.
        max_reads: Maximum number of reads to process; -1 (or None) means all.
        use_markov: Save a Markov chain quality model instead of the
            traditional one.
        overwrite: Allow overwriting an existing output file.
        output_dir: Directory the model file is written into.
        output_prefix: Basename (without extension) of the output file.

    Raises:
        RuntimeError: if no quality parameters were computed, or more than two
            models were constructed.
    """

    # NEAT reads are at most paired-end, so only two files are meaningful.
    if len(files) > 2:
        _LOG.info("Only processing the first two input files")
        files = files[:2]

    # Validate input paths
    for file in files:
        validate_input_path(file)

    _LOG.debug("Input files: %s", ", ".join(str(x) for x in files))
    _LOG.debug("Quality offset: %d", offset)

    final_quality_scores, allowed_bins = _prepare_quality_scores_argument(qual_scores)
    _LOG.debug("Quality scores range: %s", final_quality_scores)

    if allowed_bins is not None:
        _LOG.debug("Markov binning enabled with bins: %s", allowed_bins)

    # -1 (and None) mean "process everything"; inf compares greater than any
    # record counter inside parse_file.
    if max_reads in (-1, None):
        num_records_to_process = float("inf")

    else:
        num_records_to_process = max_reads

    # Validate output directory and file
    validate_output_path(output_dir, is_file=False)
    output_path = Path(output_dir)
    output_file = output_path / f"{output_prefix}.p.gz"
    validate_output_path(output_file, overwrite=overwrite)

    _LOG.info("Writing output to: %s", output_file)

    # Containers for per-file quality model parameters
    read_parameters: List[np.ndarray] = []
    average_errors: List[float] = []
    # read_length is both passed into and returned by parse_file; presumably
    # it carries the detected length across files so both agree — TODO confirm.
    read_length = 0

    # Traditional model parameters (existing NEAT utility)
    for idx_file, file in enumerate(files, start=1):

        _LOG.info("Reading file %d of %d", idx_file, len(files))

        params_by_position, file_avg_error, read_length = parse_file(
            file,
            final_quality_scores,
            num_records_to_process,
            offset,
            read_length,
        )

        read_parameters.append(params_by_position)
        average_errors.append(file_avg_error)

        _LOG.info("Finished reading file %d", idx_file)

    if not read_parameters:
        raise RuntimeError("No quality score parameters were computed. Check input FASTQ files.")

    # Per-model error rate is the mean across all input files, not per-file.
    average_error = float(np.average(average_errors)) if average_errors else 0.0
    _LOG.info("Average sequencing error across files: %f", average_error)

    # Prepare models for each input file
    models: List[Tuple[SequencingErrorModel, TraditionalQualityModel, Optional[MarkovQualityModel]]] = []

    for idx in range(len(read_parameters)):

        # Sequencing error model (always produced)
        seq_err_model = SequencingErrorModel(avg_seq_error=average_error, read_length=read_length)

        # Traditional quality model (always produced)
        trad_model = TraditionalQualityModel(
            average_error=average_error,
            quality_scores=np.array(final_quality_scores),
            qual_score_probs=read_parameters[idx],
        )

        markov_model: Optional[MarkovQualityModel] = None

        # Optionally build Markov quality model

        if use_markov:

            # Position-specific transition matrices
            # NOTE(review): this re-reads the FASTQ file (second pass) to
            # train the Markov chain — confirm that is intended.
            init_dist, pos_dists, trans_dists, max_quality, train_read_length = build_markov_model(
                [files[idx]],
                num_records_to_process,
                offset,
                allowed_quality_scores=allowed_bins,
            )

            markov_model = MarkovQualityModel(
                initial_distribution=init_dist,
                position_distributions=pos_dists,
                max_quality=max_quality,
                read_length=train_read_length,
                transition_distributions=trans_dists,
            )

        models.append((seq_err_model, trad_model, markov_model))

    # Write out the models
    # The dict layout below (error_model1/2, qual_score_model1/2) is the
    # on-disk format the read simulator loads; keys are always present, with
    # None filling the second slot for single-ended input.

    with gzip.open(output_file, "wb") as out_model:

        if len(models) == 1:

            seq_err1, trad1, markov1 = models[0]
            pickle.dump(
                {
                    "error_model1": seq_err1,
                    "error_model2": None,
                    "qual_score_model1": markov1 if use_markov else trad1,
                    "qual_score_model2": None,
                },
                out_model,
            )

        elif len(models) == 2:

            (seq_err1, trad1, markov1), (seq_err2, trad2, markov2) = models
            pickle.dump(
                {
                    "error_model1": seq_err1,
                    "error_model2": seq_err2,
                    "qual_score_model1": markov1 if use_markov else trad1,
                    "qual_score_model2": markov2 if use_markov else trad2,
                },
                out_model,
            )

        else:

            # NEAT's read simulator only understands one or two models
            # (defensive: unreachable because files was truncated to two above)
            raise RuntimeError(f"Expected at most two quality models, but constructed {len(models)}.")

    _LOG.info("Quality score model saved to %s", output_file)
1 change: 1 addition & 0 deletions neat/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .error_models import *
from .mutation_model import *
from .fragment_model import *
from .markov_quality_model import *
Loading
Loading