Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
fdb8d2c
More error model and mutation model tests.
keshav-gandhi Nov 29, 2025
069272c
Readability updates to README.
keshav-gandhi Nov 29, 2025
9d43fa3
Full Markov integration set of scripts.
keshav-gandhi Dec 5, 2025
b539df2
Merge branch 'main' into markov-integration
keshav-gandhi Dec 5, 2025
313e1e8
Merge branch 'main' into markov-integration
keshav-gandhi Jan 29, 2026
a666171
Merge pull request #224 from ncsa/markov-integration
keshav-gandhi Jan 29, 2026
7d6808f
Renaming to parallel_block_size and parallel_mode; fixing tests.
keshav-gandhi Jan 30, 2026
55541ff
Installation instructions.
keshav-gandhi Feb 2, 2026
2e8e30c
Merge pull request #240 from ncsa/quality-of-life
keshav-gandhi Feb 18, 2026
5a4d347
Markov quality score model changes copied in here.
keshav-gandhi Feb 23, 2026
b0ad46a
README updates including Markov stuff.
keshav-gandhi Feb 27, 2026
af8ba09
Removing accidental log file.
keshav-gandhi Feb 28, 2026
e94e785
Transition matrices better accounted for.
keshav-gandhi Mar 5, 2026
cfb8323
Bug fixing and cleaning up.
keshav-gandhi Mar 5, 2026
96085c0
Type bug and cleaning up.
keshav-gandhi Mar 5, 2026
015c01f
Merge branch 'markov-integration' into markov-separate
keshav-gandhi Mar 5, 2026
a026d33
Merge pull request #251 from ncsa/markov-separate
keshav-gandhi Mar 5, 2026
c1cbe3e
Merge branch 'develop' into markov-integration
keshav-gandhi Mar 5, 2026
ff13ea9
Formatting in template_neat_config.yml
keshav-gandhi Mar 5, 2026
0b69f8d
Delete version.py (we already have a 4.3.6 and version tracker now)
keshav-gandhi Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions config_template/template_neat_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ rng_seed: .
# type: int | required = no
min_mutations: .

# Overwrite the output files, if they are named the same as the current run.
# Default is to quit if files already exist to avoid data destruction
# Overwrite output files if they already exist
# type: bool | required = no | default = false
overwrite_output: .

Expand All @@ -153,12 +152,12 @@ parallel_block_size: .
threads: .

# Delete the 'splits' directory after stitching completes
# Note if threads == 1, this option has no effect.
# Note: If threads == 1, this option has no effect.
# type = bool | required: no | default = true
cleanup_splits: .

# Reuse existing files in '<out_dir>/splits' and skip the split step.
# The directory must contain NEAT-generated files and must be in the output directory within "splits"
# Note if threads == 1, this option has no effect.
# Note: If threads == 1, this option has no effect.
# type = bool | required: no | default = False
reuse_splits: .
reuse_splits: .
3 changes: 2 additions & 1 deletion neat/cli/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
"""Modules related to the subcommands' interfaces"""
"""Modules related to the subcommands"""

from .base import *
87 changes: 87 additions & 0 deletions neat/cli/commands/model_qual_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import argparse

from ...model_quality_score import model_qual_score_runner
from .base import BaseCommand
from .options import output_group


class Command(BaseCommand):
    """
    CLI command that builds a quality score model (traditional or Markov
    chain) from one or two FASTQ inputs.
    """

    name = "model-qual-score"
    description = "Generate quality score model from FASTQ (optional Markov chain)."

    def add_arguments(self, parser: argparse.ArgumentParser):
        """Register every command-line option for this subcommand."""
        # One (flags, keyword-options) pair per CLI option; registered in a
        # single pass below so the option table reads like documentation.
        argument_specs = (
            (("-i",), {"dest": "input_files", "metavar": "FILE", "nargs": "+",
                       "required": True,
                       "help": "Input FASTQ file(s) (gzipped or plain)."}),
            (("-q",), {"dest": "quality_offset", "type": int, "default": 33,
                       "help": "Quality score offset [33]."}),
            (("-Q",), {"dest": "quality_scores", "type": int, "nargs": "+",
                       "default": [42],
                       "help": "Max quality or explicit list of quality scores [42]."}),
            (("-m",), {"dest": "max_num", "type": int, "default": -1,
                       "help": "Max number of reads to process [-1 = all]."}),
            (("--markov",), {"dest": "use_markov", "action": "store_true",
                             "default": False,
                             "help": "Use Markov quality model instead of the traditional model."}),
            (("--overwrite",), {"dest": "overwrite", "action": "store_true",
                                "default": False,
                                "help": "Overwrite existing output file if present."}),
        )

        for flags, options in argument_specs:
            parser.add_argument(*flags, **options)

        output_group.add_to_parser(parser)

    def execute(self, arguments: argparse.Namespace):
        """Forward the parsed CLI options to the model runner."""
        scores = arguments.quality_scores
        # A single -Q value means "max quality"; several mean explicit bins.
        qual_scores: int | list[int] = scores[0] if len(scores) == 1 else scores

        model_qual_score_runner(
            files=arguments.input_files,
            offset=arguments.quality_offset,
            qual_scores=qual_scores,
            max_reads=arguments.max_num,
            overwrite=arguments.overwrite,
            output_dir=arguments.output_dir,
            output_prefix=arguments.prefix,
            use_markov=arguments.use_markov,
        )
9 changes: 9 additions & 0 deletions neat/model_quality_score/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Submodule to build quality score models for NEAT that wraps NEAT’s
existing sequencing error model and traditional quality model, with the
option to construct a Markov chain–based quality model instead
"""

__all__ = ["model_qual_score_runner"]

from .runner import model_qual_score_runner
197 changes: 197 additions & 0 deletions neat/model_quality_score/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""
Runner for creating quality score models.

This module implements the core logic for generating quality score models
from input FASTQ files.
"""

import gzip
import pickle
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

import numpy as np

from ..common import validate_input_path, validate_output_path
from ..model_sequencing_error.utils import parse_file
from ..models import SequencingErrorModel, TraditionalQualityModel
from ..models.markov_quality_model import MarkovQualityModel
from ..quality_score_modeling.markov_utils import build_markov_model

__all__ = ["model_qual_score_runner"]

_LOG = logging.getLogger(__name__)


def _prepare_quality_scores_argument(
qual_scores: int | Iterable[int | float],
) -> Tuple[List[int], Optional[List[int]]]:
"""
Returns:
- full_range_scores: required by parse_file indexing
- allowed_bins: None (no binning) or sorted unique user bins (for Markov binning)
"""

if isinstance(qual_scores, int):
max_q = int(qual_scores)
return list(range(0, max_q + 1)), None

bins = sorted({int(x) for x in qual_scores})

if not bins:
raise ValueError("quality_scores list must not be empty.")

max_q = max(bins)

# parse_file requires indices up to max_q (it indexes by raw Q value)
return list(range(0, max_q + 1)), bins


def model_qual_score_runner(
    files: List[str],
    offset: int,
    qual_scores: int | Iterable[int | float],
    max_reads: int,
    use_markov: bool,
    overwrite: bool,
    output_dir: str,
    output_prefix: str,
) -> None:
    """Create and save a quality score model from FASTQ data.

    For each input file (at most two) a SequencingErrorModel and a
    TraditionalQualityModel are always built; when ``use_markov`` is True a
    MarkovQualityModel is built per file as well and stored in the output in
    place of the traditional model. All models are pickled into a single
    gzipped file named ``<output_prefix>.p.gz`` in ``output_dir``.

    Args:
        files: Input FASTQ paths (gzipped or plain); only the first two are used.
        offset: ASCII quality score offset (e.g. 33).
        qual_scores: Maximum quality score (int) or explicit list of score bins.
        max_reads: Maximum number of reads to process; -1 (or None) means all.
        use_markov: Save a Markov chain quality model instead of the
            traditional one.
        overwrite: Allow overwriting an existing output file.
        output_dir: Directory the model file is written into.
        output_prefix: Basename (without extension) of the output file.

    Raises:
        RuntimeError: if no quality parameters were computed, or more than two
            models were constructed.
    """

    # NEAT reads are at most paired-end, so only two files are meaningful.
    if len(files) > 2:
        _LOG.info("Only processing the first two input files")
        files = files[:2]

    # Validate input paths
    for file in files:
        validate_input_path(file)

    _LOG.debug("Input files: %s", ", ".join(str(x) for x in files))
    _LOG.debug("Quality offset: %d", offset)

    final_quality_scores, allowed_bins = _prepare_quality_scores_argument(qual_scores)
    _LOG.debug("Quality scores range: %s", final_quality_scores)

    if allowed_bins is not None:
        _LOG.debug("Markov binning enabled with bins: %s", allowed_bins)

    # -1 (and None) mean "process everything"; inf compares greater than any
    # record counter inside parse_file.
    if max_reads in (-1, None):
        num_records_to_process = float("inf")

    else:
        num_records_to_process = max_reads

    # Validate output directory and file
    validate_output_path(output_dir, is_file=False)
    output_path = Path(output_dir)
    output_file = output_path / f"{output_prefix}.p.gz"
    validate_output_path(output_file, overwrite=overwrite)

    _LOG.info("Writing output to: %s", output_file)

    # Containers for per-file quality model parameters
    read_parameters: List[np.ndarray] = []
    average_errors: List[float] = []
    # read_length is both passed into and returned by parse_file; presumably
    # it carries the detected length across files so both agree — TODO confirm.
    read_length = 0

    # Traditional model parameters (existing NEAT utility)
    for idx_file, file in enumerate(files, start=1):

        _LOG.info("Reading file %d of %d", idx_file, len(files))

        params_by_position, file_avg_error, read_length = parse_file(
            file,
            final_quality_scores,
            num_records_to_process,
            offset,
            read_length,
        )

        read_parameters.append(params_by_position)
        average_errors.append(file_avg_error)

        _LOG.info("Finished reading file %d", idx_file)

    if not read_parameters:
        raise RuntimeError("No quality score parameters were computed. Check input FASTQ files.")

    # Per-model error rate is the mean across all input files, not per-file.
    average_error = float(np.average(average_errors)) if average_errors else 0.0
    _LOG.info("Average sequencing error across files: %f", average_error)

    # Prepare models for each input file
    models: List[Tuple[SequencingErrorModel, TraditionalQualityModel, Optional[MarkovQualityModel]]] = []

    for idx in range(len(read_parameters)):

        # Sequencing error model (always produced)
        seq_err_model = SequencingErrorModel(avg_seq_error=average_error, read_length=read_length)

        # Traditional quality model (always produced)
        trad_model = TraditionalQualityModel(
            average_error=average_error,
            quality_scores=np.array(final_quality_scores),
            qual_score_probs=read_parameters[idx],
        )

        markov_model: Optional[MarkovQualityModel] = None

        # Optionally build Markov quality model

        if use_markov:

            # Position-specific transition matrices
            # NOTE(review): this re-reads the FASTQ file (second pass) to
            # train the Markov chain — confirm that is intended.
            init_dist, pos_dists, trans_dists, max_quality, train_read_length = build_markov_model(
                [files[idx]],
                num_records_to_process,
                offset,
                allowed_quality_scores=allowed_bins,
            )

            markov_model = MarkovQualityModel(
                initial_distribution=init_dist,
                position_distributions=pos_dists,
                max_quality=max_quality,
                read_length=train_read_length,
                transition_distributions=trans_dists,
            )

        models.append((seq_err_model, trad_model, markov_model))

    # Write out the models
    # The dict layout below (error_model1/2, qual_score_model1/2) is the
    # on-disk format the read simulator loads; keys are always present, with
    # None filling the second slot for single-ended input.

    with gzip.open(output_file, "wb") as out_model:

        if len(models) == 1:

            seq_err1, trad1, markov1 = models[0]
            pickle.dump(
                {
                    "error_model1": seq_err1,
                    "error_model2": None,
                    "qual_score_model1": markov1 if use_markov else trad1,
                    "qual_score_model2": None,
                },
                out_model,
            )

        elif len(models) == 2:

            (seq_err1, trad1, markov1), (seq_err2, trad2, markov2) = models
            pickle.dump(
                {
                    "error_model1": seq_err1,
                    "error_model2": seq_err2,
                    "qual_score_model1": markov1 if use_markov else trad1,
                    "qual_score_model2": markov2 if use_markov else trad2,
                },
                out_model,
            )

        else:

            # NEAT's read simulator only understands one or two models
            # (defensive: unreachable because files was truncated to two above)
            raise RuntimeError(f"Expected at most two quality models, but constructed {len(models)}.")

    _LOG.info("Quality score model saved to %s", output_file)
1 change: 1 addition & 0 deletions neat/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .error_models import *
from .mutation_model import *
from .fragment_model import *
from .markov_quality_model import *
Loading
Loading