Merged
22 commits
da1ad2c
Merge pull request #234 from ncsa/joshfactorial-patch-1
joshfactorial Jan 24, 2026
a93257e
Adding version flag
joshfactorial Jan 24, 2026
750ffb2
Trying shutil.move instead of os.rename due to filesystem conflicts
joshfactorial Jan 24, 2026
9897d81
Made the naming for parallel_mode and mode consistent
joshfactorial Jan 24, 2026
e606d67
Updating version number for new release
joshfactorial Jan 24, 2026
b2324dc
Merged template changes and put into new branch for unique PR.
keshav-gandhi Feb 20, 2026
b8a7e95
Parallelization renaming in utils scripts.
keshav-gandhi Feb 20, 2026
b5ad520
Merge pull request #244 from ncsa/parallelization-names
keshav-gandhi Feb 20, 2026
355ec02
Merge pull request #243 from ncsa/template-config
keshav-gandhi Feb 20, 2026
123429e
Testing CLI and new tests for error and mutation models.
keshav-gandhi Feb 22, 2026
d547eac
Adding README changes now that bioconda has been confirmed to work co…
keshav-gandhi Feb 22, 2026
137cb25
One-line change to Options class.
keshav-gandhi Feb 22, 2026
f3ade95
An RNG-related bug covered by tests was solved with this change.
keshav-gandhi Feb 22, 2026
8c58e63
Fixing bug discovered in testing.
keshav-gandhi Feb 22, 2026
67c9010
Minor change to splits handling.
keshav-gandhi Feb 22, 2026
a914ceb
Merge pull request #245 from ncsa/testing-error-mut-models
joshfactorial Feb 26, 2026
b3c7479
Merge pull request #247 from ncsa/misc-utils-path-bug
joshfactorial Feb 26, 2026
d3542ac
Adjusting coverage calculations
joshfactorial Feb 26, 2026
ad2f81f
Merge pull request #249 from ncsa/misc-utils-tuple-bug
keshav-gandhi Feb 27, 2026
8178345
Merge pull request #248 from ncsa/misc-utils-rng-bug
keshav-gandhi Feb 27, 2026
a7a2fbb
Merge pull request #250 from ncsa/misc-utils-splits-logic
keshav-gandhi Feb 27, 2026
8325067
Merge pull request #246 from ncsa/readme-edits
keshav-gandhi Feb 27, 2026
139 changes: 85 additions & 54 deletions README.md
Collaborator comment:

I included the section on neat model-qual-score because we're about to merge in the Markov code right after this.


4 changes: 2 additions & 2 deletions config_template/simple_template.yml
Original file line number Diff line number Diff line change
@@ -26,8 +26,8 @@ rng_seed: .
min_mutations: .
overwrite_output: .

mode: .
size: .
parallel_mode: .
parallel_block_size: .
threads: .
cleanup_splits: .
reuse_splits: .
96 changes: 47 additions & 49 deletions config_template/template_neat_config.yml
@@ -1,137 +1,135 @@
## Template for gen_reads parallel
## Template for NEAT's read-simulator (as of version 4.3.5, parallelization-friendly)
## Any parameter that is not required but has a default value will use the
## default value even if the variable is not included in the config. For
## required items, they must be included in the config and the must be given a value.
## required items, they must be included in the config and they must be given a value.
## All other items can be present or not. If present and the value is set to a single
## period, the variable will be treated as though it had been omitted. Please do
## not modify this template, but instead make a copy in your working directory. Done this
## way, you can run without even needing to declare -c.
## not modify this template, but instead make a copy in your working directory.
## Run with this command: neat read-simulator -c <config.yml> -o <out_dir> [-p <prefix>]

# Absolute path to input reference fasta file
# Absolute path to input reference FASTA file
# type = string | required: yes
reference: REQUIRED

# Read length of the reads in the fastq output. Only required if @produce_fastq is set to true
# type = int | required: no | default = 101
# Read length of the reads in the FASTQ output. Only required if @produce_fastq is set to true
# type = int | required: no | default = 151
read_len: .

# Average Coverage for the entire genome.
# Average coverage for the entire genome
# type = float | required: no | default = 10.0
coverage: .

# Absolute path to file with sequencing error model
# Absolute path to file with sequencing error model or quality-score model.
# Error models are typically produced by neat model-seq-err (from FASTQ/BAM-like inputs), while
# quality-score models can be produced by neat model-qual-score (optionally fit with --markov)
# type = string | required: no | default: <NEAT_DIR>/neat/models/defaults/default_error_model.pickle.gz
error_model: .

# Average sequencing error rate for the sequencing machine
# type = float | required = no | must be between 0.0 and 0.3
avg_seq_error: .

# This scales the quality scores to match the desired average sequencing error rate
# specified by avg_seq_error.
# This scales the quality scores to match the desired average sequencing error rate specified by avg_seq_error
# type: boolean | required = no | default = false
rescale_qualities: .

# This is the factor to add to the quality scores to get the ascii text version of the
# score. The default follows the sanger quality offset
# This is the factor to add to the quality scores to get the ASCII text version of the
# score. The default follows the Sanger quality offset
# type: int | required = no | default = 33
quality_offset: .

# Desired ploidy
# type = int | required = no | default = 2
ploidy: .

# Absolute path to vcf file containing variants that will always be included, regardless
# of genotype and filter. You can pre-filter your vcf for these fields before inputting it
# if this is not the desired behavior.
# Absolute path to VCF file containing variants that will always be included, regardless
# of genotype and filter. You can pre-filter your VCF for these fields before inputting it
# if this is not the desired behavior
# type: string | required = no
include_vcf: .

# Absolute path to bed file containing reference regions that the simulation
# should target.
# Absolute path to BED file containing reference regions that the simulation should target
# type = string | required = no
target_bed: .

# Scalar value for coverage in regions outside the targeted bed. Example 0.5
# Scalar value for coverage in regions outside the targeted BED. Example: 0.5
# would get you roughly half the coverage as the on target areas. Default is
# 0 coverage in off-target regions. Number should be a float in decimal.
# 0 coverage in off-target regions. Number should be a float in decimal
# type: float | required = no | default = 0.00
off_target_scalar: .

# Absolute path to bed file containing reference regions that the simulation
# should discard.
# Absolute path to BED file containing reference regions that the simulation should discard
# type = string | required = no
discard_bed: .

# Absolute path to the mutation model pickle file. Omitting this value will cause
# NEAT to use the default model, with some standard parameters, and generally uniform biases.
# NEAT to use the default model, with some standard parameters, and generally uniform biases
# type: string | required = no
mutation_model: .

# Average mutation rate per base pair. Overall average is 0.001, or model default
# Use either this value to override the mutation rate for the default or input model.
# Average mutation rate per base pair. Overall average is 0.001, or model default.
# Use either this value to override the mutation rate for the default or input model
# type: float | required = no | must be between 0.0 and 0.3
mutation_rate: .

# Absolute path to a bed file with mutation rates by region.
# Rates must be in the fourth column and be of the form "mut_rate=x.xx"
# Rates must be between 0.00 and 0.03
# Absolute path to a BED file with mutation rates by region.
# Rates must be in the third column and be of the form "mut_rate=x.xx"
# Rates must be between 0.0 and 0.3
# type: string | required = no
mutation_bed: .

# Whether the output should be paired ended. For certain conditions (i.e., vcf only or
# fasta only), this will be ignored. If this is true, then there must be an included fragment
# Whether the output should be paired ended. For certain conditions (i.e., VCF only or
# FASTA only), this will be ignored. If this is true, then there must be an included fragment
# length model output from runner.py or a mean and standard deviation
# by declaring values for @fragment_mean and @fragment_std_dev.
# by declaring values for @fragment_mean and @fragment_std_dev
# type: boolean | required = no | default = false
paired_ended: .

# Absolute path to a pickle file containing the fragment length model output
# from runner.py.
# Absolute path to a pickle file containing the fragment length model.
# Typically produced by neat model-fraglen (learned from BAM alignments)
# type: string | required = no | default: <NEAT_DIR>/neat/models/defaults/default_fraglen_model.pickle.gz
fragment_model: .

# Mean for the paired end fragment length. This only applies if paired-ended is set to true.
# Mean for the paired-end fragment length. This only applies if paired-ended is set to true.
# This number will form the mean for the sample distribution of the fragment lengths in the simulation
# Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used.
# type: float | required: no (unless paired-ended)
fragment_mean: .

# Standard deviation for the paired end fragment length. This only applies if paired-ended is set to true.
# Standard deviation for the paired-end fragment length. This only applies if paired-ended is set to true.
# This number will form the standard deviation about the mean specified above for the sample distribution
# of the fragment lengths in the simulation.
# of the fragment lengths in the simulation
# Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used.
# type: float | required: no (unless paired-ended)
fragment_st_dev: .

# Whether to produce the golden bam file. This file will contain the reads
# Whether to produce the golden BAM file. This file will contain the reads
# aligned with the exact region of the genome
# type: boolean | required = no | default = false
produce_bam: .

# Whether to produce a vcf file containing all the mutation errors added
# by NEAT.
# Whether to produce a VCF file containing all the mutation errors added by NEAT
# type: boolean | required = no | default = false
produce_vcf: .

# Whether to output the fastq(s) of the reads. This is the default output. NEAT
# will produce 1 fastq for single ended reads or 2 fastqs for paired ended.
# Whether to output the FASTQ(s) of the reads. This is the default output. NEAT
# will produce 1 FASTQ for single-ended reads or 2 FASTQs for paired-ended reads
# type: boolean | required = no | default = true
produce_fastq: .

# If set to true, this will ignore statistical models and force coverage to be
# constant across the genome. This is considered a debugging feature.
# constant across the genome. This is considered a debugging feature
# type: boolean | required = no | default = false
no_coverage_bias: .

# Set an RNG seed value. Runs using identical RNG values should produce identical results
# so things like read locations, variant positions, error positions, etc. should be the same.
# Useful for debugging.
# Useful for debugging
# type: int | required = no
rng_seed: .

# Set an absolute minimum number of mutations. The program always adds at least 1 mutation.
# Useful for very small datasets.
# Useful for very small datasets
# type: int | required = no
min_mutations: .

@@ -141,17 +139,17 @@ min_mutations: .
overwrite_output: .

# How to split the input reference for parallelization
# Note if threads == 1, this option has no effect.
# Note: If threads == 1, this option has no effect.
# type = string | required: no | default = contig | values: contig, size
parallel_mode: .

# Target block size if by = size (overlap = read_len * 2).
# Default is 500000 when by = size. Not used for by = contig.
# Default is 500000 when by = size. Not used for by = contig
# type = int | required: no | default = 500000 (when by=size)
parallel_block_size: .

# Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run.
# type = int | required: no | default = all available.
# Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run
# type = int | required: no | default = all available
threads: .

# Delete the 'splits' directory after stitching completes
@@ -160,7 +158,7 @@ threads: .
cleanup_splits: .

# Reuse existing files in '<out_dir>/splits' and skip the split step.
# The directory must contain neat-generated files and must be in the output dir within "splits"
# The directory must contain NEAT-generated files and must be in the output directory within "splits"
# Note if threads == 1, this option has no effect.
# type = bool | required: no | default = False
reuse_splits: .
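Pulling the renamed parallelization keys together, a minimal config based on this template might look like the following sketch (paths and values are illustrative placeholders, not defaults taken from the PR):

```yaml
# Illustrative config sketch based on the template above.
reference: /data/ref/GRCh38.fa   # placeholder path
read_len: 151
coverage: 10.0
paired_ended: true
fragment_mean: 350
fragment_st_dev: 30

# Parallelization settings renamed in this PR:
parallel_mode: size          # 'contig' or 'size'
parallel_block_size: 500000  # only used when parallel_mode = size
threads: 8
cleanup_splits: true
reuse_splits: false
```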
8 changes: 8 additions & 0 deletions neat/cli/cli.py
@@ -1,6 +1,9 @@
"""Implements command line interface used by the package."""

__all__ = ['Cli', 'main', 'run']
__version__ = "4.3.6"
__author__ = "Joshua Allen"
__email__ = "jallen17@illinois.edu"

import argparse
import importlib
@@ -41,6 +44,11 @@ def __init__(self):
self.parser = argparse.ArgumentParser(
prog="neat", description="Run NEAT components"
)
self.parser.add_argument(
"-v", "--version",
action="version",
version="%(prog)s {version}".format(version=__version__),
)
self.parser.add_argument(
"--no-log",
default=False,
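The version flag added in cli.py can be exercised in isolation. The sketch below rebuilds just that parser fragment — the prog name, version string, and argument wiring come from the diff above; everything else is assumed:

```python
import argparse

__version__ = "4.3.6"

# Minimal reconstruction of the parser fragment from the diff.
parser = argparse.ArgumentParser(prog="neat", description="Run NEAT components")
parser.add_argument(
    "-v", "--version",
    action="version",
    version="%(prog)s {version}".format(version=__version__),
)

# argparse's built-in "version" action prints the string and exits with code 0.
try:
    parser.parse_args(["--version"])  # prints "neat 4.3.6"
except SystemExit as exc:
    assert exc.code == 0
```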
1 change: 1 addition & 0 deletions neat/cli/commands/options.py
@@ -14,6 +14,7 @@
"--output_dir",
dest="output_dir",
type=str,
required=True,
help="Path to the output directory. Will create if not present.",
default=os.getcwd()
)
2 changes: 1 addition & 1 deletion neat/cli/commands/read_simulator.py
@@ -33,7 +33,7 @@ def add_arguments(self, parser: argparse.ArgumentParser):
"-c", "--config",
metavar="config",
type=str,
required=False,
required=True,
help="Path (including filename) to the configuration file for this run."
)

4 changes: 2 additions & 2 deletions neat/read_simulator/runner.py
@@ -2,7 +2,7 @@
Runner for generate_reads task
"""
import logging
import os
import shutil
import subprocess
import time
import multiprocessing as mp
@@ -239,7 +239,7 @@ def read_simulator_runner(config: str, output_dir: str, file_prefix: str):
temp_file = str(options.temp_dir_path / "temp.sorted.vcf.gz")
subprocess.run(["bcftools", "sort", "-o", temp_file, "-Ob9", str(file)])
Path(temp_file).is_file()
os.rename(temp_file, str(file))
shutil.move(temp_file, str(file))
_LOG.info("Indexing vcf")
pysam.tabix_index(str(file), preset="vcf", force=force)

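The os.rename → shutil.move swap in runner.py (matching the "filesystem conflicts" commit) addresses the case where the temp dir and the final output live on different filesystems: os.rename raises OSError (EXDEV) there, while shutil.move falls back to copy-then-delete. A self-contained sketch with a placeholder payload — same filesystem here, so both calls would succeed, but the API usage is the same:

```python
import shutil
import tempfile
from pathlib import Path

# os.rename fails across filesystem boundaries (e.g. a tmpfs temp dir vs.
# the output volume); shutil.move copies then deletes in that case.
with tempfile.TemporaryDirectory() as tmp:
    src = Path(tmp) / "temp.sorted.vcf.gz"
    dst = Path(tmp) / "output.vcf.gz"
    src.write_bytes(b"placeholder vcf payload")

    shutil.move(str(src), str(dst))  # safe within or across filesystems

    print(dst.is_file(), src.exists())  # → True False
```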
4 changes: 2 additions & 2 deletions neat/read_simulator/utils/generate_reads.py
@@ -49,9 +49,9 @@ def cover_dataset(
number_reads_per_layer = ceil(span_length / fragment_model.fragment_mean)
if options.paired_ended:
# TODO use gc bias to skew this number. Calculate at the runner level.
number_reads = number_reads_per_layer * (options.coverage//2)
number_reads = ceil(number_reads_per_layer * (options.coverage/2))
else:
number_reads = number_reads_per_layer * options.coverage
number_reads = ceil(number_reads_per_layer * options.coverage)

# step 1: Divide the span up into segments drawn from the fragment pool. Assign reads based on that.
# step 2: repeat above until number of reads exceeds number_reads
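The cover_dataset change replaces floor division with ceil so that odd coverage values in paired-ended mode no longer silently drop half a layer of reads. With illustrative numbers (not taken from the PR):

```python
from math import ceil

# Illustrative values, not NEAT defaults.
span_length = 10_000
fragment_mean = 350.0
coverage = 15  # paired-ended, odd coverage triggers the bug

number_reads_per_layer = ceil(span_length / fragment_mean)  # 29

old = number_reads_per_layer * (coverage // 2)       # floor: 29 * 7   = 203
new = ceil(number_reads_per_layer * (coverage / 2))  # ceil:  29 * 7.5 -> 218

print(old, new)  # → 203 218
```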
25 changes: 14 additions & 11 deletions neat/read_simulator/utils/options.py
@@ -155,7 +155,7 @@ def __init__(self,
self.discard_bed: Path | None = discard_bed
self.mutation_model: Path | None = mutation_model
self.mutation_rate: float | None = mutation_rate
self.mutation_bed: str | None = mutation_bed
self.mutation_bed: Path | None = mutation_bed
self.quality_offset: int = quality_offset

self.paired_ended: bool = paired_ended
@@ -237,8 +237,8 @@ def from_cli(output_dir: Path,
'rng_seed': (int, None, None, None),
'min_mutations': (int, 0, None, None),
'overwrite_output': (bool, False, None, None),
'mode': (str, 'size', 'choice', ['size', 'contig']),
'size': (int, 500000, None, None),
'parallel_mode': (str, 'size', 'choice', ['size', 'contig']),
'parallel_block_size': (int, 500000, None, None),
'threads': (int, 1, 1, 1000),
'cleanup_splits': (bool, True, None, None),
'reuse_splits': (bool, False, None, None)
@@ -266,7 +266,7 @@ def from_cli(output_dir: Path,

# Update items to config or default values
base_options.__dict__.update(final_args)
base_options.set_random_seed()
base_options.rng = base_options.set_random_seed()

# Some options checking to clean up the args dict
base_options.check_options()
@@ -288,7 +288,7 @@ def check_and_log_error(keyname, value_to_check, crit1, crit2):
if value_to_check not in crit2:
_LOG.error(f"Must choose one of {crit2}")
sys.exit(1)
elif isinstance(crit1, int) and isinstance(crit2, int):
elif isinstance(crit1, (int, float)) and isinstance(crit2, (int, float)):
if not (crit1 <= value_to_check <= crit2):
_LOG.error(f'`{keyname}` must be between {crit1} and {crit2} (input: {value_to_check}).')
sys.exit(1)
@@ -377,6 +377,7 @@ def check_options(self):
"""
Some sanity checks and corrections to the options.
"""

if not (self.produce_bam or self.produce_vcf or self.produce_fastq):
_LOG.error('No files would be produced, as all file types are set to false')
sys.exit(1)
@@ -438,16 +439,18 @@ def log_configuration(self):

if self.parallel_mode == 'size':
_LOG.info(f'Splitting reference into chunks.')
_LOG.info(f' - splitting input into size {self.size}')
_LOG.info(f' - splitting input into size {self.parallel_block_size}')
elif self.parallel_mode == 'contig':
_LOG.info(f'Splitting input by contig.')
if not self.cleanup_splits or self.reuse_splits:
if self.reuse_splits:
splits_dir = Path(f'{self.output_dir}/splits/')
if splits_dir.is_dir():
if not splits_dir.is_dir():
raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}")
_LOG.info(f'Reusing existing splits {splits_dir}.')
else:
_LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits')
_LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
_LOG.info(f'Preserving splits for next run in directory {splits_dir}.')
elif not self.cleanup_splits:
splits_dir = Path(f'{self.output_dir}/splits/')
_LOG.info(f'Preserving splits for next run in directory {splits_dir}.')
else:
splits_dir = self.temp_dir_path / "splits"

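The widened isinstance check in check_and_log_error lets float bounds (e.g. avg_seq_error between 0.0 and 0.3) pass through the same range validation as int bounds (e.g. threads between 1 and 1000), where the old int-only check silently skipped float ranges. A hedged sketch of just that branch, with the logging and exit replaced by a boolean return for illustration:

```python
# Sketch of the range-check branch only; names are illustrative.
def in_range(value, lo, hi) -> bool:
    # The fixed check accepts int *or* float bounds.
    if isinstance(lo, (int, float)) and isinstance(hi, (int, float)):
        return lo <= value <= hi
    raise TypeError("bounds must be numeric")

print(in_range(0.1, 0.0, 0.3))   # → True  (float bounds now validated)
print(in_range(1001, 1, 1000))   # → False (int bounds as before)
```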
4 changes: 2 additions & 2 deletions neat/read_simulator/utils/split_inputs.py
@@ -67,15 +67,15 @@ def main(options: Options, reference_index: dict) -> tuple[dict, int]:
# We'll keep track of chunks by contig, to help us out later
split_fasta_dict: dict[str, dict[tuple[int, int], Path]] = {key: {} for key in reference_index.keys()}
for contig, seq_record in reference_index.items():
if options.mode == "contig":
if options.parallel_mode == "contig":
stem = f"{idx:0{pad}d}__{contig}"
fa = options.splits_dir / f"{stem}.fa.gz"
write_fasta(contig, seq_record.seq.upper(), fa)
split_fasta_dict[contig][(0, len(seq_record))] = fa
idx += 1
written += 1
else:
for start, subseq in chunk_record(seq_record.seq.upper(), options.size, overlap):
for start, subseq in chunk_record(seq_record.seq.upper(), options.parallel_block_size, overlap):
stem = f"{idx:0{pad}d}__{contig}"
fa = options.splits_dir / f"{stem}.fa.gz"
write_fasta(contig, subseq, fa)
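chunk_record's implementation isn't shown in this diff; a hypothetical sketch of size-based splitting with overlap (per the template's parallel_block_size and overlap = read_len * 2 notes) might look like the following — the signature and behavior are assumptions, not the actual NEAT code:

```python
# Hypothetical sketch: yield (start, subsequence) pairs of roughly
# block_size bases, each extended by `overlap` bases so fragments
# spanning a chunk boundary can still be simulated.
def chunk_record(seq: str, block_size: int, overlap: int):
    start = 0
    while start < len(seq):
        end = min(len(seq), start + block_size + overlap)
        yield start, seq[start:end]
        start += block_size

chunks = list(chunk_record("A" * 25, block_size=10, overlap=4))
print([(s, len(sub)) for s, sub in chunks])  # → [(0, 14), (10, 14), (20, 5)]
```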