From a93257ea16ec2e281488d0ca7b436821bfde4c5b Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 24 Jan 2026 09:52:04 -0600 Subject: [PATCH 01/13] Adding version flag --- neat/cli/cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index d7abd353..f5d12d4d 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -1,6 +1,9 @@ """Implements command line interface used by the package.""" __all__ = ['Cli', 'main', 'run'] +__version__ = "4.3.5" +__author__ = "Joshua Allen" +__email__ = "jallen17@illinois.edu" import argparse import importlib @@ -41,6 +44,11 @@ def __init__(self): self.parser = argparse.ArgumentParser( prog="neat", description="Run NEAT components" ) + self.parser.add_argument( + "--version", + action="version", + version="%(prog)s {version}".format(version=__version__), + ) self.parser.add_argument( "--no-log", default=False, From 750ffb2c72cf7021a8f12829e2093dff1e4ae606 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 24 Jan 2026 10:04:42 -0600 Subject: [PATCH 02/13] Trying shutil.move instead of os.rename due to filesystem conflicts --- neat/cli/cli.py | 2 +- neat/read_simulator/runner.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index f5d12d4d..b049e4f2 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -45,7 +45,7 @@ def __init__(self): prog="neat", description="Run NEAT components" ) self.parser.add_argument( - "--version", + "-v", "--version", action="version", version="%(prog)s {version}".format(version=__version__), ) diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index a756c7bb..424b6ed2 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -2,7 +2,7 @@ Runner for generate_reads task """ import logging -import os +import shutil import subprocess import time import multiprocessing as mp @@ -239,7 +239,7 @@ def read_simulator_runner(config: str, output_dir: str, 
file_prefix: str): temp_file = str(options.temp_dir_path / "temp.sorted.vcf.gz") subprocess.run(["bcftools", "sort", "-o", temp_file, "-Ob9", str(file)]) Path(temp_file).is_file() - os.rename(temp_file, str(file)) + shutil.move(temp_file, str(file)) _LOG.info("Indexing vcf") pysam.tabix_index(str(file), preset="vcf", force=force) From 9897d811d78299128ad438086ea7f4ca1088f4fe Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 24 Jan 2026 14:33:25 -0600 Subject: [PATCH 03/13] Made the naming for parallel_mode and mode consistent --- config_template/simple_template.yml | 4 ++-- neat/cli/commands/options.py | 1 + neat/cli/commands/read_simulator.py | 2 +- neat/read_simulator/utils/options.py | 5 +++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/config_template/simple_template.yml b/config_template/simple_template.yml index 21769af9..2520a09f 100644 --- a/config_template/simple_template.yml +++ b/config_template/simple_template.yml @@ -26,8 +26,8 @@ rng_seed: . min_mutations: . overwrite_output: . -mode: . -size: . +parallel_mode: . +parallel_block_size: . threads: . cleanup_splits: . reuse_splits: . diff --git a/neat/cli/commands/options.py b/neat/cli/commands/options.py index 274a973a..765fb215 100644 --- a/neat/cli/commands/options.py +++ b/neat/cli/commands/options.py @@ -14,6 +14,7 @@ "--output_dir", dest="output_dir", type=str, + required=True, help="Path to the output directory. Will create if not present.", default=os.getcwd() ) diff --git a/neat/cli/commands/read_simulator.py b/neat/cli/commands/read_simulator.py index 6438e34b..e1c6e217 100644 --- a/neat/cli/commands/read_simulator.py +++ b/neat/cli/commands/read_simulator.py @@ -33,7 +33,7 @@ def add_arguments(self, parser: argparse.ArgumentParser): "-c", "--config", metavar="config", type=str, - required=False, + required=True, help="Path (including filename) to the configuration file for this run." 
) diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index 16290908..a45f8664 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -237,8 +237,8 @@ def from_cli(output_dir: Path, 'rng_seed': (int, None, None, None), 'min_mutations': (int, 0, None, None), 'overwrite_output': (bool, False, None, None), - 'mode': (str, 'size', 'choice', ['size', 'contig']), - 'size': (int, 500000, None, None), + 'parallel_mode': (str, 'size', 'choice', ['size', 'contig']), + 'parallel_block_size': (int, 500000, None, None), 'threads': (int, 1, 1, 1000), 'cleanup_splits': (bool, True, None, None), 'reuse_splits': (bool, False, None, None) @@ -377,6 +377,7 @@ def check_options(self): """ Some sanity checks and corrections to the options. """ + if not (self.produce_bam or self.produce_vcf or self.produce_fastq): _LOG.error('No files would be produced, as all file types are set to false') sys.exit(1) From e606d671ce21e1cc49ac4b6231627114df1d480e Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 24 Jan 2026 14:50:36 -0600 Subject: [PATCH 04/13] Updating version number for new release --- neat/cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index b049e4f2..f97c2599 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -1,7 +1,7 @@ """Implements command line interface used by the package.""" __all__ = ['Cli', 'main', 'run'] -__version__ = "4.3.5" +__version__ = "4.3.6" __author__ = "Joshua Allen" __email__ = "jallen17@illinois.edu" From b2324dc0c9b02d01bbd6364a721c9e6535e36a88 Mon Sep 17 00:00:00 2001 From: Keshav Date: Fri, 20 Feb 2026 15:15:14 +0100 Subject: [PATCH 05/13] Merged template changes and put into new branch for unique PR. 
--- config_template/template_neat_config.yml | 96 ++++++++++++------------ 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/config_template/template_neat_config.yml b/config_template/template_neat_config.yml index 84d2c3af..4bdc8fb1 100644 --- a/config_template/template_neat_config.yml +++ b/config_template/template_neat_config.yml @@ -1,25 +1,27 @@ -## Template for gen_reads parallel +## Template for NEAT's read-simulator (as of version 4.3.5, parallelization-friendly) ## Any parameter that is not required but has a default value will use the ## default value even if the variable is not included in the config. For -## required items, they must be included in the config and the must be given a value. +## required items, they must be included in the config and they must be given a value. ## All other items can be present or not. If present and the value is set to a single ## period, the variable will be treated as though it had been omitted. Please do -## not modify this template, but instead make a copy in your working directory. Done this -## way, you can run without even needing to declare -c. +## not modify this template, but instead make a copy in your working directory. +## Run with this command: neat read-simulator -c <config_file> -o <output_dir> [-p <prefix>] -# Absolute path to input reference fasta file +# Absolute path to input reference FASTA file # type = string | required: yes reference: REQUIRED -# Read length of the reads in the fastq output. Only required if @produce_fastq is set to true -# type = int | required: no | default = 101 +# Read length of the reads in the FASTQ output. Only required if @produce_fastq is set to true +# type = int | required: no | default = 151 read_len: . -# Average Coverage for the entire genome. +# Average coverage for the entire genome # type = float | required: no | default = 10.0 coverage: . -# Absolute path to file with sequencing error model +# Absolute path to file with sequencing error model or quality-score model. 
+# Error models are typically produced by neat model-seq-err (from FASTQ/BAM-like inputs), while +# quality-score models can be produced by neat model-qual-score (optionally fit with --markov) # type = string | required: no | default: /neat/models/defaults/default_error_model.pickle.gz error_model: . @@ -27,13 +29,12 @@ error_model: . # type = float | required = no | must be between 0.0 and 0.3 avg_seq_error: . -# This scales the quality scores to match the desired average sequencing error rate -# specified by avg_seq_error. +# This scales the quality scores to match the desired average sequencing error rate specified by avg_seq_error # type: boolean | required = no | default = false rescale_qualities: . -# This is the factor to add to the quality scores to get the ascii text version of the -# score. The default follows the sanger quality offset +# This is the factor to add to the quality scores to get the ASCII text version of the +# score. The default follows the Sanger quality offset # type: int | required = no | default = 33 quality_offset: . @@ -41,97 +42,94 @@ quality_offset: . # type = int | required = no | default = 2 ploidy: . -# Absolute path to vcf file containing variants that will always be included, regardless -# of genotype and filter. You can pre-filter your vcf for these fields before inputting it -# if this is not the desired behavior. +# Absolute path to VCF file containing variants that will always be included, regardless +# of genotype and filter. You can pre-filter your VCF for these fields before inputting it +# if this is not the desired behavior # type: string | required = no include_vcf: . -# Absolute path to bed file containing reference regions that the simulation -# should target. +# Absolute path to BED file containing reference regions that the simulation should target # type = string | required = no target_bed: . -# Scalar value for coverage in regions outside the targeted bed. 
Example 0.5 +# Scalar value for coverage in regions outside the targeted BED. Example: 0.5 # would get you roughly half the coverage as the on target areas. Default is -# 0 coverage in off-target regions. Number should be a float in decimal. +# 0 coverage in off-target regions. Number should be a float in decimal # type: float | required = no | default = 0.00 off_target_scalar: . -# Absolute path to bed file containing reference regions that the simulation -# should discard. +# Absolute path to BED file containing reference regions that the simulation should discard # type = string | required = no discard_bed: . # Absolute path to the mutation model pickle file. Omitting this value will cause -# NEAT to use the default model, with some standard parameters, and generally uniform biases. +# NEAT to use the default model, with some standard parameters, and generally uniform biases # type: string | required = no mutation_model: . -# Average mutation rate per base pair. Overall average is 0.001, or model default -# Use either this value to override the mutation rate for the default or input model. +# Average mutation rate per base pair. Overall average is 0.001, or model default. +# Use either this value to override the mutation rate for the default or input model # type: float | required = no | must be between 0.0 and 0.3 mutation_rate: . -# Absolute path to a bed file with mutation rates by region. -# Rates must be in the fourth column and be of the form "mut_rate=x.xx" -# Rates must be between 0.00 and 0.03 +# Absolute path to a BED file with mutation rates by region. +# Rates must be in the third column and be of the form "mut_rate=x.xx" +# Rates must be between 0.0 and 0.3 # type: string | required = no mutation_bed: . -# Whether the output should be paired ended. For certain conditions (i.e., vcf only or -# fasta only), this will be ignored. If this is true, then there must be an included fragment +# Whether the output should be paired ended. 
For certain conditions (i.e., VCF only or +# FASTA only), this will be ignored. If this is true, then there must be an included fragment # length model output from runner.py or a mean and standard deviation -# by declaring values for @fragment_mean and @fragment_std_dev. +# by declaring values for @fragment_mean and @fragment_std_dev # type: boolean | required = no | default = false paired_ended: . -# Absolute path to a pickle file containing the fragment length model output -# from runner.py. +# Absolute path to a pickle file containing the fragment length model. +# Typically produced by neat model-fraglen (learned from BAM alignments) # type: string | required = no | default: /neat/models/defaults/default_fraglen_model.pickle.gz fragment_model: . -# Mean for the paired end fragment length. This only applies if paired-ended is set to true. +# Mean for the paired-end fragment length. This only applies if paired-ended is set to true. # This number will form the mean for the sample distribution of the fragment lengths in the simulation # Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used. # type: float | required: no (unless paired-ended) fragment_mean: . -# Standard deviation for the paired end fragment length. This only applies if paired-ended is set to true. +# Standard deviation for the paired-end fragment length. This only applies if paired-ended is set to true. # This number will form the standard deviation about the mean specified above for the sample distribution -# of the fragment lengths in the simulation. +# of the fragment lengths in the simulation # Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used. # type: float | required: no (unless paired-ended) fragment_st_dev: . -# Whether to produce the golden bam file. This file will contain the reads +# Whether to produce the golden BAM file. 
This file will contain the reads # aligned with the exact region of the genome # type: boolean | required = no | default = false produce_bam: . -# Whether to produce a vcf file containing all the mutation errors added -# by NEAT. +# Whether to produce a VCF file containing all the mutation errors added by NEAT # type: boolean | required = no | default = false produce_vcf: . -# Whether to output the fastq(s) of the reads. This is the default output. NEAT -# will produce 1 fastq for single ended reads or 2 fastqs for paired ended. +# Whether to output the FASTQ(s) of the reads. This is the default output. NEAT +# will produce 1 FASTQ for single-ended reads or 2 FASTQs for paired-ended reads # type: boolean | required = no | default = true produce_fastq: . # If set to true, this will ignore statistical models and force coverage to be -# constant across the genome. This is considered a debugging feature. +# constant across the genome. This is considered a debugging feature # type: boolean | required = no | default = false no_coverage_bias: . # Set an RNG seed value. Runs using identical RNG values should produce identical results # so things like read locations, variant positions, error positions, etc. should be the same. -# Useful for debugging. +# Useful for debugging # type: int | required = no rng_seed: . # Set an absolute minimum number of mutations. The program always adds at least 1 mutation. -# Useful for very small datasets. +# Useful for very small datasets # type: int | required = no min_mutations: . @@ -141,17 +139,17 @@ min_mutations: . overwrite_output: . # How to split the input reference for parallelization -# Note if threads == 1, this option has no effect. +# Note: If threads == 1, this option has no effect. # type = string | required: no | default = contig | values: contig, size parallel_mode: . # Target block size if by = size (overlap = read_len * 2). -# Default is 500000 when by = size. Not used for by = contig. 
+# Default is 500000 when by = size. Not used for by = contig # type = int | required: no | default = 500000 (when by=size) parallel_block_size: . -# Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run. -# type = int | required: no | default = all available. +# Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run +# type = int | required: no | default = all available threads: . # Delete the 'splits' directory after stitching completes @@ -160,7 +158,7 @@ threads: . cleanup_splits: . # Reuse existing files in '/splits' and skip the split step. -# The directory must contain neat-generated files and must be in the output dir within "splits" +# The directory must contain NEAT-generated files and must be in the output directory within "splits" # Note if threads == 1, this option has no effect. # type = bool | required: no | default = False reuse_splits: . \ No newline at end of file From b8a7e95bf3a77f04fb79f96f7e97c805c19e0e22 Mon Sep 17 00:00:00 2001 From: Keshav Date: Fri, 20 Feb 2026 15:49:49 +0100 Subject: [PATCH 06/13] Parallelization renaming in utils scripts. 
--- neat/read_simulator/utils/options.py | 2 +- neat/read_simulator/utils/split_inputs.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index a45f8664..fd05b6de 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -439,7 +439,7 @@ def log_configuration(self): if self.parallel_mode == 'size': _LOG.info(f'Splitting reference into chunks.') - _LOG.info(f' - splitting input into size {self.size}') + _LOG.info(f' - splitting input into size {self.parallel_block_size}') elif self.parallel_mode == 'contig': _LOG.info(f'Splitting input by contig.') if not self.cleanup_splits or self.reuse_splits: diff --git a/neat/read_simulator/utils/split_inputs.py b/neat/read_simulator/utils/split_inputs.py index 82f63cef..ad3a42f9 100644 --- a/neat/read_simulator/utils/split_inputs.py +++ b/neat/read_simulator/utils/split_inputs.py @@ -67,7 +67,7 @@ def main(options: Options, reference_index: dict) -> tuple[dict, int]: # We'll keep track of chunks by contig, to help us out later split_fasta_dict: dict[str, dict[tuple[int, int], Path]] = {key: {} for key in reference_index.keys()} for contig, seq_record in reference_index.items(): - if options.mode == "contig": + if options.parallel_mode == "contig": stem = f"{idx:0{pad}d}__{contig}" fa = options.splits_dir / f"{stem}.fa.gz" write_fasta(contig, seq_record.seq.upper(), fa) @@ -75,7 +75,7 @@ def main(options: Options, reference_index: dict) -> tuple[dict, int]: idx += 1 written += 1 else: - for start, subseq in chunk_record(seq_record.seq.upper(), options.size, overlap): + for start, subseq in chunk_record(seq_record.seq.upper(), options.parallel_block_size, overlap): stem = f"{idx:0{pad}d}__{contig}" fa = options.splits_dir / f"{stem}.fa.gz" write_fasta(contig, subseq, fa) From 123429eac20532c48392511edbf65081070defca Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 22 Feb 2026 16:56:44 +0100 
Subject: [PATCH 07/13] Testing CLI and new tests for error and mutation models. --- tests/test_cli/test_cli.py | 37 ++++-- .../test_models/test_error_and_mut_models.py | 105 +++++++++++++++++- tests/test_read_simulator/test_options.py | 57 +++++++--- 3 files changed, 177 insertions(+), 22 deletions(-) diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py index 07805497..d5e3247a 100644 --- a/tests/test_cli/test_cli.py +++ b/tests/test_cli/test_cli.py @@ -9,6 +9,21 @@ from neat.cli.cli import Cli, main +def _write_min_cfg(tmp_path: Path) -> Path: + """ + Write a minimal config file for read-simulator. + + These CLI tests validate argument handling, logging, and return codes, + not the full simulation behavior, but read-simulator still requires -c. + """ + ref = tmp_path / "ref.fa" + ref.write_text(">chr1\nACGT\n", encoding="utf-8") + + cfg = tmp_path / "conf.yml" + cfg.write_text(f"reference: {ref}\nproduce_fastq: true\n", encoding="utf-8") + return cfg + + def test_cli_registers_read_simulator_subcommand(): cli = Cli() # Argparse stores subparsers in a private map; ensure our command is registered @@ -50,10 +65,15 @@ def test_logging_creates_named_log_file_and_announces(monkeypatch, tmp_path: Pat lambda *args, **kwargs: None, ) + cfg = _write_min_cfg(tmp_path) + rc = main(cli.parser, [ "--log-name", str(logname), # Supply a benign subcommand with minimal required args - "read-simulator", "-o", str(tmp_path), "-p", "pref" + "read-simulator", + "-c", str(cfg), + "-o", str(tmp_path), + "-p", "pref", ]) out = capsys.readouterr().out # main should create/log the file path and return 0 (success) @@ -68,13 +88,12 @@ def test_read_simulator_success_invokes_runner(monkeypatch, tmp_path: Path): called = {} def fake_runner(cfg, outdir, prefix): - called['args'] = (cfg, outdir, prefix) + called["args"] = (cfg, outdir, prefix) # Patch runner used by command monkeypatch.setattr("neat.cli.commands.read_simulator.read_simulator_runner", fake_runner) - cfg = tmp_path 
/ "conf.yml" - cfg.write_text("reference: ''\n", encoding="utf-8") # minimal content; not validated here + cfg = _write_min_cfg(tmp_path) rc = main(cli.parser, [ "--no-log", @@ -85,7 +104,7 @@ def fake_runner(cfg, outdir, prefix): ]) assert rc == 0 - assert called['args'] == (str(cfg), str(tmp_path), "myprefix") + assert called["args"] == (str(cfg), str(tmp_path), "myprefix") def test_read_simulator_failure_returns_1_and_prints_error(monkeypatch, tmp_path: Path, capsys): @@ -94,11 +113,15 @@ def test_read_simulator_failure_returns_1_and_prints_error(monkeypatch, tmp_path def boom(*args, **kwargs): raise RuntimeError("kaboom") - monkeypatch.setattr("neat.read_simulator.read_simulator_runner", boom) + # Patch the runner symbol used by the read-simulator command + monkeypatch.setattr("neat.cli.commands.read_simulator.read_simulator_runner", boom) + + cfg = _write_min_cfg(tmp_path) rc = main(cli.parser, [ "--no-log", "read-simulator", + "-c", str(cfg), "-o", str(tmp_path), "-p", "x", ]) @@ -106,4 +129,4 @@ def boom(*args, **kwargs): out = capsys.readouterr().out assert rc == 1 # Error path prints a line starting with 'ERROR:' - assert "ERROR:" in out + assert "ERROR:" in out \ No newline at end of file diff --git a/tests/test_models/test_error_and_mut_models.py b/tests/test_models/test_error_and_mut_models.py index cc8caec3..a255ac96 100644 --- a/tests/test_models/test_error_and_mut_models.py +++ b/tests/test_models/test_error_and_mut_models.py @@ -43,8 +43,7 @@ def test_mutation_model_generate_snv_trinuc(): def test_sequencing_error_model_zero_error_returns_none_or_empty(): """ - avg_seq_error == 0 should yield no errors. Some versions return just the list, - others return a (list, padding) tuple — accept both. + avg_seq_error == 0 should yield no errors. 
""" rng = default_rng(4) sem = SequencingErrorModel(avg_seq_error=0.0) @@ -89,3 +88,105 @@ def test_sequencing_error_model_basic_snvs_only(): assert hasattr(e, "location") assert hasattr(e, "ref") assert hasattr(e, "alt") + +def test_mutation_model_insertion_reproducible_with_seed(): + """Same seed and inputs should give the same insertion (length and alt).""" + rng1 = default_rng(123) + rng2 = default_rng(123) + m = MutationModel() + ref = Seq("ACGT") + + ins1 = m.generate_insertion(location=10, ref=ref, rng=rng1) + ins2 = m.generate_insertion(location=10, ref=ref, rng=rng2) + + assert isinstance(ins1, Insertion) + assert isinstance(ins2, Insertion) + assert ins1.length == ins2.length + assert str(ins1.alt) == str(ins2.alt) + + +def test_mutation_model_deletion_reproducible_with_seed(): + """Same seed and inputs should give the same deletion object shape.""" + rng1 = default_rng(456) + rng2 = default_rng(456) + m = MutationModel() + + del1 = m.generate_deletion(location=25, rng=rng1) + del2 = m.generate_deletion(location=25, rng=rng2) + + assert isinstance(del1, Deletion) + assert isinstance(del2, Deletion) + assert del1.length == del2.length + assert del1.position1 == del2.position1 + + +def test_mutation_model_snv_does_not_keep_reference_base(): + """ + For a given trinucleotide, the generated SNV should change the central base. 
+ """ + rng = default_rng(7) + m = MutationModel() + trinuc = Seq("ACA") + central = str(trinuc[1]) + + snv = m.generate_snv(trinucleotide=trinuc, reference_location=100, rng=rng) + assert isinstance(snv, SingleNucleotideVariant) + assert snv.alt in ["A", "C", "G", "T"] + assert snv.alt != central + + +def test_traditional_quality_model_reproducible_with_seed(): + """Quality model should be deterministic given the same RNG state.""" + rng1 = default_rng(8) + rng2 = default_rng(8) + qm = TraditionalQualityModel(average_error=0.01) + + qs1 = qm.get_quality_scores(model_read_length=151, length=100, rng=rng1) + qs2 = qm.get_quality_scores(model_read_length=151, length=100, rng=rng2) + + assert np.array_equal(qs1, qs2) + + +def test_sequencing_error_model_reproducible_with_seed(): + """Error placement should be deterministic given the same RNG state.""" + sem = SequencingErrorModel(avg_seq_error=0.05) + ref = SeqRecord(Seq("ACGT" * 30), id="chr1") + quals = np.array([30] * len(ref), dtype=int) + + rng1 = default_rng(9) + rng2 = default_rng(9) + + introduced1, pad1 = sem.get_sequencing_errors( + padding=20, reference_segment=ref, quality_scores=quals, rng=rng1 + ) + introduced2, pad2 = sem.get_sequencing_errors( + padding=20, reference_segment=ref, quality_scores=quals, rng=rng2 + ) + + assert pad1 == pad2 + proj1 = [(e.error_type, e.location, e.ref, e.alt) for e in introduced1] + proj2 = [(e.error_type, e.location, e.ref, e.alt) for e in introduced2] + assert proj1 == proj2 + + +def test_sequencing_error_model_nonzero_error_introduces_in_bounds_errors(): + """ + With a non-zero average error rate, we expect at least some errors and their + locations must be within the reference segment. 
+ """ + rng = default_rng(10) + sem = SequencingErrorModel(avg_seq_error=0.2) + ref = SeqRecord(Seq("ACGT" * 50), id="chr1") + quals = np.array([10] * len(ref), dtype=int) + + introduced, pad = sem.get_sequencing_errors( + padding=20, reference_segment=ref, quality_scores=quals, rng=rng + ) + + # At least one error is expected for these settings. + assert len(introduced) > 0 + # All error locations should be within the sequence. + for e in introduced: + assert 0 <= e.location < len(ref) + assert e.ref in ["A", "C", "G", "T"] + assert e.alt in ["A", "C", "G", "T"] \ No newline at end of file diff --git a/tests/test_read_simulator/test_options.py b/tests/test_read_simulator/test_options.py index a5528ab4..dad1cbfe 100644 --- a/tests/test_read_simulator/test_options.py +++ b/tests/test_read_simulator/test_options.py @@ -1,6 +1,7 @@ from neat.read_simulator.utils.options import Options from pathlib import Path as _PathAlias +import logging as _logging import numpy as _np import textwrap as _textwrap import pytest as _pytest @@ -10,8 +11,42 @@ def _project_root() -> _PathAlias: return _PathAlias(__file__).resolve().parents[2] -# Redefine the function name used above to override the brittle test -# so pytest only sees this correct version. +@_pytest.fixture(autouse=True) +def _isolate_neat_logging(): + """ + Prevent flaky 'ValueError: I/O operation on closed file' logging errors under pytest. 
+ """ + # Clear handlers on NEAT and all child loggers + for name, logger in list(_logging.Logger.manager.loggerDict.items()): + if name == "neat" or name.startswith("neat."): + if isinstance(logger, _logging.Logger): + for h in list(logger.handlers): + logger.removeHandler(h) + try: + h.close() + except Exception: + pass + logger.handlers.clear() + logger.propagate = True # child loggers will propagate to 'neat' + + neat_logger = _logging.getLogger("neat") + neat_logger.handlers.clear() + neat_logger.addHandler(_logging.NullHandler()) + neat_logger.propagate = False # stop at 'neat' (do not reach root) + + yield + + # Remove NullHandler + for h in list(neat_logger.handlers): + neat_logger.removeHandler(h) + try: + h.close() + except Exception: + pass + neat_logger.handlers.clear() + neat_logger.propagate = True + + def test_basic_options(): reference = _project_root() / "data" / "H1N1.fa" base_options = Options(reference) @@ -57,7 +92,6 @@ def test_rng_seed_reproducible(): def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): - # Build a minimal YAML config using repository-relative paths cfg = _textwrap.dedent( f""" reference: {(_project_root() / 'data' / 'H1N1.fa').as_posix()} @@ -76,8 +110,8 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): rng_seed: 42 overwrite_output: true - mode: contig - size: 500000 + parallel_mode: size + parallel_block_size: 500000 threads: 2 cleanup_splits: false reuse_splits: false @@ -92,14 +126,12 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): opts = Options.from_cli(outdir, "fromcli", yml_path) - # Basics propagated assert opts.reference == _project_root() / "data" / "H1N1.fa" assert opts.read_len == 75 assert opts.coverage == 5 assert opts.ploidy == 2 assert opts.rng_seed == 42 - # Output construction via log_configuration() inside from_cli assert opts.output_dir == outdir assert opts.output_prefix == "fromcli" assert opts.fq1 == outdir / 
"fromcli.fastq.gz" @@ -107,9 +139,7 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): assert opts.bam is None assert opts.vcf is None - # Parallel-related settings assert opts.threads == 2 - # cleanup_splits: false -> splits dir under output_dir assert opts.splits_dir == outdir / "splits" assert opts.splits_dir.is_dir() @@ -132,7 +162,7 @@ def test_from_cli_paired_end_fragments(tmp_path: _PathAlias): rng_seed: 7 overwrite_output: true - mode: contig + parallel_mode: contig threads: 1 cleanup_splits: true reuse_splits: false @@ -162,6 +192,8 @@ def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias): produce_bam: false produce_vcf: false threads: 4 + parallel_mode: size + parallel_block_size: 500000 cleanup_splits: true reuse_splits: true overwrite_output: true @@ -174,6 +206,5 @@ def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias): outdir = tmp_path / "out" outdir.mkdir(parents=True, exist_ok=True) - options = Options.from_cli(outdir, "reuse", yml_path) - # should issue a warning but continue in this case - assert options.reuse_splits == True + with _pytest.raises(FileNotFoundError, match=r"reuse_splits=True"): + Options.from_cli(outdir, "reuse", yml_path) \ No newline at end of file From d547eaccd83ffe5949734233f86aeb0905453433 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 22 Feb 2026 19:19:43 +0100 Subject: [PATCH 08/13] Adding README changes now that bioconda has been confirmed to work correctly. --- README.md | 139 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 1bcf291a..28e9d27a 100755 --- a/README.md +++ b/README.md @@ -3,17 +3,18 @@ Welcome to the NEAT project, the NExt-generation sequencing Analysis Toolkit, version 4.3.5. This release of NEAT 4.3.5 includes several fixes and a little bit of restructuring, including a parallel process for running `neat read-simulator`. 
Our tests show much improved performance. If the logs seem excessive, you might try using the `--log-level ERROR` to reduce the output from the logs. See the [ChangeLog](ChangeLog.md) for notes. NEAT 4.3.5 is the official release of NEAT 4.0. It represents a lot of hard work from several contributors at NCSA and beyond. With the addition of parallel processing, we feel that the code is ready for production, and future releases will focus on compatibility, bug fixes, and testing. Future releases for the time being will be enumerations of 4.3.X. ## NEAT v4.3.5 -Neat 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to you run, simply add the "threads" parameter in your configuration and run read-simulator as normal. NEAT will take care of the rest. You can customize the parameters in you configuration file, as needed. + +NEAT 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to your run, simply add the `threads` parameter in your configuration file and run `read-simulator` as normal. NEAT will take care of the rest. You can customize the parameters in your configuration file, as needed. We have completed major revisions on NEAT since 3.4 and consider NEAT 4.3.5 to be a stable release, in that we will continue to update and provide bug fixes and support. We will consider new features and pull requests. Please include justification for major changes. See [contribute](CONTRIBUTING.md) for more information. If you'd like to use some of our code in your own, no problem! Just review the [license](LICENSE.md), first. -We've deprecated NEAT's command-line interface options for the most part, opting to simplify things with configuration files. If you require the CLI for legacy purposes, NEAT 3.4 was our last release to be fully command-line interface. Please convert your CLI commands to the corresponding yaml configuration for future runs. 
+We've deprecated NEAT's command-line interface options for the most part, opting to simplify things with configuration files. If you require the CLI for legacy purposes, NEAT 3.4 was our last release to be fully supported via command-line interface. Please convert your CLI commands to the corresponding configuration file for future runs. ### Statement of Need Developing and validating bioinformatics pipelines depends on access to genomic data with known ground truth. As a result, many research groups rely on simulated reads, and it can be useful to vary the parameters of the sequencing process itself. NEAT addresses this need as an open-source Python package that can integrate seamlessly with existing bioinformatics workflows—its simulations account for a wide range of sequencing parameters (e.g., coverage, fragment length, sequencing error models, mutational frequencies, ploidy, etc.) and allow users to customize their sequencing data. -NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads, but it handles long-read simulation as well and is adaptable to any machine, with custom error models and the capability to handle single-base substitutions and indel errors. Unlike many simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. +NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. 
It was originally designed to simulate short reads and is adaptable to different machines, with custom error models and the capability to handle single-base substitutions, indel errors, and other types of mutations. Unlike simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. To cite this work, please use: @@ -40,7 +41,10 @@ To cite this work, please use: * [`neat model-fraglen`](#neat-model-fraglen) * [`neat gen-mut-model`](#neat-gen-mut-model) * [`neat model-seq-err`](#neat-model-seq-err) + * [`neat model-qual-score`](#neat-model-qual-score) * [`neat vcf_compare`](#neat-vcf_compare) + * [Tests](#tests) + * [Guide to run locally](#guide-to-run-locally) * [Note on Sensitive Patient Data](#note-on-sensitive-patient-data) ## Prerequisites @@ -77,32 +81,30 @@ $ git clone git@github.com:ncsa/NEAT.git $ cd NEAT ``` -A quick form of installation uses `bioconda`. Once `conda` is installed, the following command can be run for easy setup. -In the NEAT repo, at the base level is the environment.yml file you will need. Change directories into the neat repository -and run: +A quick form of installation uses `bioconda`. You must run these commands inside the NEAT project directory. ```bash -(base) $ conda env create -f environment.yml +(base) $ conda create -n neat -c conda-forge -c bioconda neat (base) $ conda activate neat -(neat) $ poetry install (neat) $ neat --help # tests that NEAT has installed correctly ``` Alternatively, instead of the `bioconda` method, you can use the `poetry` module in build a wheel file, which can then be `pip` installed. 
-You will need to run these commands from within the NEAT directory: +Once `conda` is installed, the following command can be run for easy setup. +In the NEAT repository, at the base level is the `environment.yml` file you will need. Change directories into the NEAT repository +and run: ```bash (base) $ conda env create -f environment.yml (base) $ conda activate neat -(neat) $ poetry build -(neat) $ pip install dist/neat*whl +(neat) $ poetry install (neat) $ neat --help # tests that NEAT has installed correctly ``` Assuming you have installed `conda`, run `source activate` or `conda activate`. -Please note that these installation instructions support MacOS, Windows, and Linux. However, if you are on MacOS, you need to remove the line `libgcc=14` from `environment.yml`. A solution for some non-Linux users is simple to remove the version specification (e.g., `libgcc`). +Please note that these installation instructions support MacOS, Windows, and Linux. Alternatively, if you wish to work with NEAT in the development-only environment, you can use `poetry install` within the NEAT repo, after creating the `conda` environment: @@ -156,42 +158,50 @@ description of the potential inputs in the config file. See `NEAT/config_templat To run the simulator in multithreaded mode, set the `threads` value in the config to something greater than 1. -`reference`: full path to a fasta file to generate reads from. -`read_len`: The length of the reads for the fastq (if using). _Integer value, default 101._ -`coverage`: desired coverage value. _Float or integer, default = 10._ -`ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ -`paired_ended`: If paired-ended reads are desired, set this to True. Setting this to true requires either entering values for fragment_mean and fragment_st_dev or entering the path to a valid fragment_model. 
-`fragment_mean`: Use with paired-ended reads, set a fragment length mean manually -`fragment_st_dev`: Use with paired-ended reads, set the standard deviation of the fragment length dataset +`reference`: Full path to a FASTA file to generate reads from. + +`read_len`: The length of the reads for the FASTQ (if using). _Integer value, default 101._ + +`coverage`: Desired coverage value. _Float or integer, default = 10._ + +`ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ + +`paired_ended`: If paired-ended reads are desired, set this to `True`. Setting this to `True` requires either entering values for `fragment_mean` and `fragment_st_dev` or entering the path to a valid `fragment_model`. + +`fragment_mean`: Use with paired-ended reads, setting a fragment length mean manually. + +`fragment_st_dev`: Use with paired-ended reads, setting the standard deviation of the fragment length dataset. + +The following values can be set to `True` or omitted to use defaults. If `True`, NEAT will produce the file type. -The following values can be set to true or omitted to use defaults. If True, NEAT will produce the file type. The default is given: -`produce_bam`: False -`produce_vcf`: False -`produce_fastq`: True - - -| Parameter | Description | -|---------------------|-------------| -| `error_model` | Full path to an error model generated by NEAT. Leave empty to use default model (default model based on human, sequenced by Illumina). | -| `mutation_model` | Full path to a mutation model generated by NEAT. Leave empty to use a default model (default model based on human data sequenced by Illumina). | -| `fragment_model` | Full path to fragment length model generated by NEAT. Leave empty to use default model (default model based on human data sequenced by Illumina). | -| `threads` | The number of threads for NEAT to use. 
Increasing the number will speed up read generation. | -| `avg_seq_error` | Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. Float between 0 and 0.3. Default is set by the error model. | -| `rescale_qualities` | Rescale the quality scores to reflect the avg_seq_error rate above. Set True to activate if you notice issues with the sequencing error rates in your dataset. | -| `include_vcf` | Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding fastq and bam files, if requested. | -| `target_bed` | Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. | -| `discard_bed` | Full path to a list of regions to discard, in BED format. | -| `mutation_rate` | Desired rate of mutation for the dataset. Float between 0.0 and 0.3 (default is determined by the mutation model). | -| `mutation_bed` | Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column as, e.g., mut_rate=0.00. | -| `rng_seed` | Manually enter a seed for the random number generator. Used for repeating runs. Must be an integer. | -| `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. | -| `threads` | Number of threads to use. More than 1 will use multithreading parallelism to speed up processing. | -| `mode` | 'size' or 'contig' whether to divide the contigs into blocks or just by contig. By contig is the default, try by size. Varying the size parameter may help if default values are not sufficient. | -| `size` | Default value of 500,000. 
| -| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to False, and splits files will be deleted at the end of the run. | -| `reuse_splits` | If an existing splits file exists in the output folder, it will use those splits, if this value is set to True. | +`produce_bam`: `False` +`produce_vcf`: `False` +`produce_fastq`: `True` + +More parameters are below: + +| Parameter | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `error_model` | Full path to an error model or quality score model generated by NEAT. Leave empty to use default model (default model based on human, sequenced by Illumina). | +| `mutation_model` | Full path to a mutation model generated by NEAT. Leave empty to use a default model (default model based on human data sequenced by Illumina). | +| `fragment_model` | Full path to fragment length model generated by NEAT. Leave empty to use default model (default model based on human data sequenced by Illumina). | +| `threads` | The number of threads for NEAT to use. Increasing the number will speed up read generation. | +| `avg_seq_error` | Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. Float between 0 and 0.3. Default is set by the error model. | +| `rescale_qualities` | Rescale the quality scores to reflect the `avg_seq_error` rate above. Set `True` to activate if you notice issues with the sequencing error rates in your dataset. | +| `include_vcf` | Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding FASTQ and BAM files, if requested. 
| +| `target_bed` | Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. | +| `discard_bed` | Full path to a list of regions to discard, in BED format. | +| `mutation_rate` | Desired rate of mutation for the dataset. Float between 0.0 and 0.3 (default is determined by the mutation model). | +| `mutation_bed` | Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column as, e.g., `mut_rate`=0.00. | +| `rng_seed` | Manually enter a seed for the random number generator. Used for repeating runs. Must be an integer. | +| `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. | +| `threads` | Number of threads to use. More than 1 will use multi-threading to speed up processing. | +| `parallel_mode` | Either `size` or `contig`: whether to divide the contigs into fixed-size blocks or to split by contig. `contig` is the default, but division by `size` may speed up your run. | +| `parallel_block_size` | Size of each block when `parallel_mode` is `size`. Default value of 500,000. | +| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. | +| `reuse_splits` | If set to `True`, NEAT reuses the existing splits directory in the output folder; the run fails with an error if that directory is not found. | The command line options for NEAT are as follows: @@ -205,7 +215,7 @@ Universal options can be applied to any subfunction.
The commands should come be | --log-detail VALUE | VALUE must be one of [LOW, MEDIUM, HIGH] - how much info to write for each log record | | --silent-mode | Writes logs, but suppresses stdout messages | -read-simulator command line options +`read-simulator` command line options | Option | Description | |---------------------|-------------------------------------| | -c VALUE, --config VALUE | The VALUE should be the name of the config file to use for this run | @@ -229,7 +239,7 @@ Features: - Can accurately simulate large, single-end reads with high indel error rates (PacBio-like) given a model - Specify simple fragment length model with mean and standard deviation or an empirically learned fragment distribution - Simulates quality scores using either the default model or empirically learned quality scores using `neat gen_mut_model` -- Introduces sequencing substitution errors using either the default model or empirically learned from utilities/ +- Introduces sequencing substitution errors using either the default model or empirically learned in `utilities` - Output a VCF file with the 'golden' set of true positive variants. These can be compared to bioinformatics workflow output (includes coverage and allele balance information) - Output a BAM file with the 'golden' set of aligned reads. These indicate where each read originated and how it should be aligned with the reference - Create paired tumour/normal datasets using characteristics learned from real tumour data @@ -282,7 +292,7 @@ Here we enabled NEAT’s parallelized mode (“small filtering”), which splits | *S. cerevisiae* | 12,310,392 | 139,148 | 2.3191 | | Honeybee | 228,091,137 | 3,040,336 | 50.6723 | | Rice | 394,543,607 | 4,335,126 | 72.2521 | -| *Miscanthus* | 2,718,242,062 | 24876744 | 414.6 | +| *Miscanthus* | 2,718,242,062 | 24,876,744 | 414.6 | For mid-sized genomes (e.g., *E. coli* and *S. 
cerevisiae*), enabling parallelization reduced runtimes by roughly a factor of two to three compared to the base configuration. For larger genomes (honeybee and rice), the parallel configuration may make multi-hour simulations feasible. @@ -409,7 +419,7 @@ neat read-simulator \ Several scripts are distributed with `gen_reads` that are used to generate the models used for simulation. -## `neat model-fraglen` +### `neat model-fraglen` Computes empirical fragment length distribution from sample paired-end data. NEAT uses the template length (tlen) attribute calculated from paired-ended alignments to generate summary statistics for fragment lengths, which can be input into NEAT. @@ -421,7 +431,7 @@ Computes empirical fragment length distribution from sample paired-end data. NEA and creates `fraglen.pickle.gz` model in working directory. -## `neat gen-mut-model` +### `neat gen-mut-model` Takes reference genome and VCF file to generate mutation models: @@ -440,7 +450,7 @@ Trinucleotides are identified in the reference genome and the variant file. The | --human-sample | Use to skip unnumbered scaffolds in human references | | --skip-common | Do not save common snps or high mutation areas | -## `neat model-seq-err` +### `neat model-seq-err` Generates sequencing error model for NEAT. @@ -477,7 +487,28 @@ neat model-seq-err \ Please note that `-i2` can be used in place of `-i` to produce paired data. -## `neat vcf_compare` +### `neat model-qual-score` + +Typical usage: + +```bash +neat model-qual-score \ + -i input_reads.fastq(.gz) \ + -q 33 \ + -Q 42 \ + -m 1000000 \ + --markov \ + -o /path/to/models \ + -p my_qual_model +``` + +Similarly, use `-i2` to produce a model for paired-ended data. `-q` denotes the quality score offset, while `-Q` is the maximum quality score. + +`-m` denotes the maximum number of reads to process. Use a large number or input -1 to use all reads. 
`--markov` fits a quality model from the input data using a Markov chain process instead of the baseline quality score model (optional). + +Finally, `-o` is the output directory for the model file and `-p` is the prefix for the output model, such that the file will be written as `<prefix>.p.gz` inside the output folder. + +### `neat vcf_compare` Tool for comparing VCF files (Not yet implemented in NEAT 4.3.5). @@ -504,7 +535,7 @@ neat vcf_compare We provide unit tests (e.g., mutation and sequencing error models) and basic integration tests for the CLI. -### Run locally +### Guide to run locally ```bash conda env create -f environment.yml conda activate neat @@ -515,4 +546,4 @@ pytest -q tests Please see `CONTRIBUTING.md` for more information and further instructions. ### Note on Sensitive Patient Data -ICGC's "Access Controlled Data" documentation can be found at https://docs.icgc.org/portal/access/. To have access to controlled germline data, a DACO must be submitted. Open tier data can be obtained without a DACO, but germline alleles that do not match the reference genome are masked and replaced with the reference allele. Controlled data includes unmasked germline alleles. +ICGC's "Access Controlled Data" documentation can be found at https://docs.icgc.org/portal/access/. To have access to controlled germline data, a DACO must be submitted. Open tier data can be obtained without a DACO, but germline alleles that do not match the reference genome are masked and replaced with the reference allele. Controlled data includes unmasked germline alleles. \ No newline at end of file From 137cb25c9b90e2d0e9e94c4bd682f288e49eda9e Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 22 Feb 2026 19:26:34 +0100 Subject: [PATCH 09/13] One-line change to Options class.
--- neat/read_simulator/utils/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index fd05b6de..2104dd32 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -155,7 +155,7 @@ def __init__(self, self.discard_bed: Path | None = discard_bed self.mutation_model: Path | None = mutation_model self.mutation_rate: float | None = mutation_rate - self.mutation_bed: str | None = mutation_bed + self.mutation_bed: Path | None = mutation_bed self.quality_offset: int = quality_offset self.paired_ended: bool = paired_ended From f3ade95a00f6341abbb4f43a02d78fdac567b946 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 22 Feb 2026 19:37:26 +0100 Subject: [PATCH 10/13] An RNG-related bug covered by tests was solved with this change. --- neat/read_simulator/utils/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index fd05b6de..cf0abe88 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -266,7 +266,7 @@ def from_cli(output_dir: Path, # Update items to config or default values base_options.__dict__.update(final_args) - base_options.set_random_seed() + base_options.rng = base_options.set_random_seed() # Some options checking to clean up the args dict base_options.check_options() From 8c58e6309dbbec821c0ead03b8a8680e9afa1621 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 22 Feb 2026 19:45:05 +0100 Subject: [PATCH 11/13] Fixing bug discovered in testing. 
--- neat/read_simulator/utils/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index fd05b6de..05e01320 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -288,7 +288,7 @@ def check_and_log_error(keyname, value_to_check, crit1, crit2): if value_to_check not in crit2: _LOG.error(f"Must choose one of {crit2}") sys.exit(1) - elif isinstance(crit1, int) and isinstance(crit2, int): + elif isinstance(crit1, (int, float)) and isinstance(crit2, (int, float)): if not (crit1 <= value_to_check <= crit2): _LOG.error(f'`{keyname}` must be between {crit1} and {crit2} (input: {value_to_check}).') sys.exit(1) From 67c901079893352b91828cb85053b7314f59c3c0 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 22 Feb 2026 19:51:58 +0100 Subject: [PATCH 12/13] Minor change to splits handling. --- neat/read_simulator/utils/options.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index fd05b6de..c932c26d 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -442,13 +442,15 @@ def log_configuration(self): _LOG.info(f' - splitting input into size {self.parallel_block_size}') elif self.parallel_mode == 'contig': _LOG.info(f'Splitting input by contig.') - if not self.cleanup_splits or self.reuse_splits: + if self.reuse_splits: splits_dir = Path(f'{self.output_dir}/splits/') - if splits_dir.is_dir(): + if not splits_dir.is_dir(): + raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}") _LOG.info(f'Reusing existing splits {splits_dir}.') - else: - _LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. 
Creating new splits') - _LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.') + _LOG.info(f'Preserving splits for next run in directory {splits_dir}.') + elif not self.cleanup_splits: + splits_dir = Path(f'{self.output_dir}/splits/') + _LOG.info(f'Preserving splits for next run in directory {splits_dir}.') else: splits_dir = self.temp_dir_path / "splits" From d3542acc3bb5a9f627b64663b66b16320d4f6bf3 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 26 Feb 2026 16:10:26 -0600 Subject: [PATCH 13/13] Adjusting coverage calculations --- neat/read_simulator/utils/generate_reads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 74dab43c..2ae38897 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -49,9 +49,9 @@ def cover_dataset( number_reads_per_layer = ceil(span_length / fragment_model.fragment_mean) if options.paired_ended: # TODO use gc bias to skew this number. Calculate at the runner level. - number_reads = number_reads_per_layer * (options.coverage//2) + number_reads = ceil(number_reads_per_layer * (options.coverage/2)) else: - number_reads = number_reads_per_layer * options.coverage + number_reads = ceil(number_reads_per_layer * options.coverage) # step 1: Divide the span up into segments drawn from the fragment pool. Assign reads based on that. # step 2: repeat above until number of reads exceeds number_reads