From fdb8d2c541d9973481fb4cf0735bde3b2c1da086 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sat, 29 Nov 2025 15:20:18 +0100 Subject: [PATCH 01/13] More error model and mutation model tests. --- .../test_models/test_error_and_mut_models.py | 105 +++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/tests/test_models/test_error_and_mut_models.py b/tests/test_models/test_error_and_mut_models.py index cc8caec3..5c3d4c80 100644 --- a/tests/test_models/test_error_and_mut_models.py +++ b/tests/test_models/test_error_and_mut_models.py @@ -43,8 +43,7 @@ def test_mutation_model_generate_snv_trinuc(): def test_sequencing_error_model_zero_error_returns_none_or_empty(): """ - avg_seq_error == 0 should yield no errors. Some versions return just the list, - others return a (list, padding) tuple — accept both. + avg_seq_error == 0 should yield no errors. """ rng = default_rng(4) sem = SequencingErrorModel(avg_seq_error=0.0) @@ -89,3 +88,105 @@ def test_sequencing_error_model_basic_snvs_only(): assert hasattr(e, "location") assert hasattr(e, "ref") assert hasattr(e, "alt") + +def test_mutation_model_insertion_reproducible_with_seed(): + """Same seed and inputs should give the same insertion (length and alt).""" + rng1 = default_rng(123) + rng2 = default_rng(123) + m = MutationModel() + ref = Seq("ACGT") + + ins1 = m.generate_insertion(location=10, ref=ref, rng=rng1) + ins2 = m.generate_insertion(location=10, ref=ref, rng=rng2) + + assert isinstance(ins1, Insertion) + assert isinstance(ins2, Insertion) + assert ins1.length == ins2.length + assert str(ins1.alt) == str(ins2.alt) + + +def test_mutation_model_deletion_reproducible_with_seed(): + """Same seed and inputs should give the same deletion object shape.""" + rng1 = default_rng(456) + rng2 = default_rng(456) + m = MutationModel() + + del1 = m.generate_deletion(location=25, rng=rng1) + del2 = m.generate_deletion(location=25, rng=rng2) + + assert isinstance(del1, Deletion) + assert isinstance(del2, Deletion) + assert del1.length == del2.length + assert del1.position1 == del2.position1 + + +def test_mutation_model_snv_does_not_keep_reference_base(): + """ + For a given trinucleotide, the generated SNV should change the central base. + """ + rng = default_rng(7) + m = MutationModel() + trinuc = Seq("ACA") + central = str(trinuc[1]) + + snv = m.generate_snv(trinucleotide=trinuc, reference_location=100, rng=rng) + assert isinstance(snv, SingleNucleotideVariant) + assert snv.alt in ["A", "C", "G", "T"] + assert snv.alt != central + + +def test_traditional_quality_model_reproducible_with_seed(): + """Quality model should be deterministic given the same RNG state.""" + rng1 = default_rng(8) + rng2 = default_rng(8) + qm = TraditionalQualityModel(average_error=0.01) + + qs1 = qm.get_quality_scores(model_read_length=151, length=100, rng=rng1) + qs2 = qm.get_quality_scores(model_read_length=151, length=100, rng=rng2) + + assert np.array_equal(qs1, qs2) + + +def test_sequencing_error_model_reproducible_with_seed(): + """Error placement should be deterministic given the same RNG state.""" + sem = SequencingErrorModel(avg_seq_error=0.05) + ref = SeqRecord(Seq("ACGT" * 30), id="chr1") + quals = np.array([30] * len(ref), dtype=int) + + rng1 = default_rng(9) + rng2 = default_rng(9) + + introduced1, pad1 = sem.get_sequencing_errors( + padding=20, reference_segment=ref, quality_scores=quals, rng=rng1 + ) + introduced2, pad2 = sem.get_sequencing_errors( + padding=20, reference_segment=ref, quality_scores=quals, rng=rng2 + ) + + assert pad1 == pad2 + proj1 = [(e.error_type, e.location, e.ref, e.alt) for e in introduced1] + proj2 = [(e.error_type, e.location, e.ref, e.alt) for e in introduced2] + assert proj1 == proj2 + + +def test_sequencing_error_model_nonzero_error_introduces_in_bounds_errors(): + """ + With a non-zero average error rate, we expect at least some errors and their + locations must be within the reference segment. + """ + rng = default_rng(10) + sem = SequencingErrorModel(avg_seq_error=0.2) + ref = SeqRecord(Seq("ACGT" * 50), id="chr1") + quals = np.array([10] * len(ref), dtype=int) + + introduced, pad = sem.get_sequencing_errors( + padding=20, reference_segment=ref, quality_scores=quals, rng=rng + ) + + # At least one error is expected for these settings. + assert len(introduced) > 0 + # All error locations should be within the sequence. + for e in introduced: + assert 0 <= e.location < len(ref) + assert e.ref in ["A", "C", "G", "T"] + assert e.alt in ["A", "C", "G", "T"] From 069272ce9375381a733ad3b462d9ee11d48f3da8 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sat, 29 Nov 2025 16:21:35 +0100 Subject: [PATCH 02/13] Readability updates to README. --- README.md | 110 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 9e200ce4..082ff4c2 100755 --- a/README.md +++ b/README.md @@ -3,17 +3,17 @@ Welcome to the NEAT project, the NExt-generation sequencing Analysis Toolkit, version 4.3.5. This release of NEAT 4.3.5 includes several fixes and a little bit of restructuring, including a parallel process for running `neat read-simulator`. Our tests show much improved performance. If the logs seem excessive, you might try using the `--log-level ERROR` to reduce the output from the logs. See the [ChangeLog](ChangeLog.md) for notes. NEAT 4.3.5 is the official release of NEAT 4.0. It represents a lot of hard work from several contributors at NCSA and beyond. With the addition of parallel processing, we feel that the code is ready for production, and future releases will focus on compatibility, bug fixes, and testing. Future releases for the time being will be enumerations of 4.3.X. ## NEAT v4.3.5 -Neat 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to you run, simply add the "threads" parameter in your configuration and run read-simulator as normal. NEAT will take care of the rest. You can customize the parameters in you configuration file, as needed. +Neat 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to your run, simply add the "threads" parameter in your configuration file and run `read-simulator` as normal. NEAT will take care of the rest. You can customize the parameters in you configuration file, as needed. We have completed major revisions on NEAT since 3.4 and consider NEAT 4.3.5 to be a stable release, in that we will continue to update and provide bug fixes and support. We will consider new features and pull requests. Please include justification for major changes. See [contribute](CONTRIBUTING.md) for more information. If you'd like to use some of our code in your own, no problem! Just review the [license](LICENSE.md), first. -We've deprecated NEAT's command-line interface options for the most part, opting to simplify things with configuration files. If you require the CLI for legacy purposes, NEAT 3.4 was our last release to be fully command-line interface. Please convert your CLI commands to the corresponding yaml configuration for future runs. +We've deprecated NEAT's command-line interface options for the most part, opting to simplify things with configuration files. If you require the CLI for legacy purposes, NEAT 3.4 was our last release to be fully supported via command-line interface. Please convert your CLI commands to the corresponding configuration file for future runs. ### Statement of Need Developing and validating bioinformatics pipelines depends on access to genomic data with known ground truth. As a result, many research groups rely on simulated reads, and it can be useful to vary the parameters of the sequencing process itself. NEAT addresses this need as an open-source Python package that can integrate seamlessly with existing bioinformatics workflows—its simulations account for a wide range of sequencing parameters (e.g., coverage, fragment length, sequencing error models, mutational frequencies, ploidy, etc.) and allow users to customize their sequencing data. -NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads, but it handles long-read simulation as well and is adaptable to any machine, with custom error models and the capability to handle single-base substitutions and indel errors. Unlike many simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. +NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads and is adaptable to any machine, with custom error models and the capability to handle single-base substitutions, indel errors, and other types of mutations. Unlike simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. To cite this work, please use: @@ -41,6 +41,8 @@ To cite this work, please use: * [`neat gen-mut-model`](#neat-gen-mut-model) * [`neat model-seq-err`](#neat-model-seq-err) * [`neat vcf_compare`](#neat-vcf_compare) + * [Tests](#tests) + * [Guide to run locally](#guide-to-run-locally) * [Note on Sensitive Patient Data](#note-on-sensitive-patient-data) ## Prerequisites @@ -99,7 +101,7 @@ You will need to run these commands from within the NEAT directory: Assuming you have installed `conda`, run `source activate` or `conda activate`. -Please note that these installation instructions support MacOS, Windows, and Linux. However, if you are on MacOS, you need to remove the line `libgcc=14` from `environment.yml`. A solution for some non-Linux users is simple to remove the version specification (e.g., `libgcc`). +Please note that these installation instructions support MacOS, Windows, and Linux. Alternatively, if you wish to work with NEAT in the development-only environment, you can use `poetry install` within the NEAT repo, after creating the `conda` environment: @@ -153,42 +155,68 @@ description of the potential inputs in the config file. See `NEAT/config_templat To run the simulator in multithreaded mode, set the `threads` value in the config to something greater than 1. -`reference`: full path to a fasta file to generate reads from. -`read_len`: The length of the reads for the fastq (if using). _Integer value, default 101._ -`coverage`: desired coverage value. _Float or integer, default = 10._ +`reference`: Full path to a FASTA file to generate reads from. + +`read_len`: The length of the reads for the FASTQ (if using). _Integer value, default 101._ + +`coverage`: Desired coverage value. _Float or integer, default = 10._ + `ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ -`paired_ended`: If paired-ended reads are desired, set this to True. Setting this to true requires either entering values for fragment_mean and fragment_st_dev or entering the path to a valid fragment_model. -`fragment_mean`: Use with paired-ended reads, set a fragment length mean manually -`fragment_st_dev`: Use with paired-ended reads, set the standard deviation of the fragment length dataset -The following values can be set to true or omitted to use defaults. If True, NEAT will produce the file type. +`paired_ended`: If paired-ended reads are desired, set this to `True`. Setting this to `True` requires either entering values for `fragment_mean` and `fragment_st_dev` or entering the path to a valid `fragment_model`. + +`fragment_mean`: Use with paired-ended reads, setting a fragment length mean manually. + +`fragment_st_dev`: Use with paired-ended reads, setting the standard deviation of the fragment length dataset. + +The following values can be set to `True` or omitted to use defaults. If `True`, NEAT will produce the file type. The default is given: -`produce_bam`: False -`produce_vcf`: False -`produce_fastq`: True - -`error_model`: full path to an error model generated by NEAT. Leave empty to use default model _(default model based on human, sequenced by Illumina)._ -`mutation_model`: full path to a mutation model generated by NEAT. Leave empty to use a default model _(default model based on human data sequenced by Illumina)._ -`fragment_model`: full path to fragment length model generate by NEAT. Leave empty to use default model _(default model based on human data sequenced by Illumina)._ - -`threads`: The number of threads for NEAT to use. _Increasing the number will speed up read generation._ -`avg_seq_error`: average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. _Float between 0 and 0.3. Default is set by the error model._ -`rescale_qualities`: rescale the quality scores to reflect the avg_seq_error rate above. Set True to activate if you notice issues with the sequencing error rates in your datatset. -`include_vcf`: full path to list of variants in vcf format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding fastq and bam files, if requested. -`target_bed`: full path to list of regions in bed format to target. All areas outside these regions will have coverage of 0. -`discard_bed`: full path to a list of regions to discard, in BED format. -`mutation_rate`: Desired rate of mutation for the dataset. _Float between 0.0 and 0.3 (default is determined by the mutation model)._ -`mutation_bed`: full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column as, e.g., mut_rate=0.00. -`rng_seed`: Manually enter a seed for the random number generator. Used for repeating runs. _Must be an integer._ +`produce_bam`: `False` + +`produce_vcf`: `False` + +`produce_fastq`: `True` + +More parameters are below: + +`error_model`: Full path to an error model generated by NEAT. Leave empty to use default model _(default model based on human, sequenced by Illumina)_. + +`mutation_model`: Full path to a mutation model generated by NEAT. Leave empty to use a default model _(default model based on human data sequenced by Illumina)_. + +`fragment_model`: Full path to fragment length model generate by NEAT. Leave empty to use default model _(default model based on human data sequenced by Illumina)_. + +`threads`: The number of threads for NEAT to use. _Increasing the number will speed up read generation_. + +`avg_seq_error`: Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. _Float between 0 and 0.3. Default is set by the error model_. + +`rescale_qualities`: Rescale the quality scores to reflect the `avg_seq_error` rate above. Set `True` to activate if you notice issues with the sequencing error rates in your dataset. + +`include_vcf`: Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding FASTQ and BAM files, if requested. + +`target_bed`: Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. + +`discard_bed`: Full path to a list of regions to discard, in BED format. + +`mutation_rate`: Desired rate of mutation for the dataset. _Float between 0.0 and 0.3 (default is determined by the mutation model)_. + +`mutation_bed`: Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column like so (e.g., `mut_rate`=0.00). + +`rng_seed`: Manually enter a seed for the random number generator. Used for repeating runs. _Must be an integer_. + `min_mutations`: Set the minimum number of mutations that NEAT should add, per contig. _Default is 0._ We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. -`threads`: Number of threads to use. More than 1 will use multithreading parallelism to speed up processing. -`mode`: 'size' or 'contig' whether to divide the contigs into blocks or just by contig. By contig is the default, try by size. Varying the size parameter may help if default values are not sufficient. + +`threads`: Number of threads to use. More than 1 will use multi-threading to speed up processing. + +`mode`: `size` or `contig` whether to divide the contigs into blocks or just by contig. By `contig` is the default, but division by `size` may speed up your run. Varying the `size` parameter may help if default values do not sufficiently improve runtimes. + `size`: Default value of 500,000. -`cleanup_splits`: If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to False, and splits files will be deleted at the end of the run. -`reuse_splits`: If an existing splits file exists in the output folder, it will use those splits, if this value is set to True. -The command line options for NEAT are as follows: +`cleanup_splits`: If running more than one simulation on the same input FASTA, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. + +`reuse_splits`: If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`. + +The command-line options for NEAT are as follows: Universal options can be applied to any subfunction. The commands should come before the function name (e.g., neat --log-level DEBUG read-simulator ...), except -h or --help, which can appear anywhere in the command. | Universal Options | Description | @@ -200,7 +228,7 @@ Universal options can be applied to any subfunction. The commands should come be | --log-detail VALUE | VALUE must be one of [LOW, MEDIUM, HIGH] - how much info to write for each log record | | --silent-mode | Writes logs, but suppresses stdout messages | -read-simulator command line options +`read-simulator` command line options | Option | Description | |---------------------|-------------------------------------| | -c VALUE, --config VALUE | The VALUE should be the name of the config file to use for this run | @@ -224,10 +252,10 @@ Features: - Can accurately simulate large, single-end reads with high indel error rates (PacBio-like) given a model - Specify simple fragment length model with mean and standard deviation or an empirically learned fragment distribution - Simulates quality scores using either the default model or empirically learned quality scores using `neat gen_mut_model` -- Introduces sequencing substitution errors using either the default model or empirically learned from utilities/ +- Introduces sequencing substitution errors using either the default model or empirically learned in `utilities` - Output a VCF file with the 'golden' set of true positive variants. These can be compared to bioinformatics workflow output (includes coverage and allele balance information) - Output a BAM file with the 'golden' set of aligned reads. These indicate where each read originated and how it should be aligned with the reference -- Create paired tumour/normal datasets using characteristics learned from real tumour data +- Create paired tumor/normal datasets using characteristics learned from real tumour data ### Estimated runtimes @@ -404,7 +432,7 @@ neat read-simulator \ Several scripts are distributed with `gen_reads` that are used to generate the models used for simulation. -## `neat model-fraglen` +### `neat model-fraglen` Computes empirical fragment length distribution from sample paired-end data. NEAT uses the template length (tlen) attribute calculated from paired-ended alignments to generate summary statistics for fragment lengths, which can be input into NEAT. @@ -416,7 +444,7 @@ Computes empirical fragment length distribution from sample paired-end data. NEA and creates `fraglen.pickle.gz` model in working directory. -## `neat gen-mut-model` +### `neat gen-mut-model` Takes reference genome and VCF file to generate mutation models: @@ -435,7 +463,7 @@ Trinucleotides are identified in the reference genome and the variant file. The | --human-sample | Use to skip unnumbered scaffolds in human references | | --skip-common | Do not save common snps or high mutation areas | -## `neat model-seq-err` +### `neat model-seq-err` Generates sequencing error model for NEAT. @@ -472,7 +500,7 @@ neat model-seq-err \ Please note that `-i2` can be used in place of `-i` to produce paired data. -## `neat vcf_compare` +### `neat vcf_compare` Tool for comparing VCF files (Not yet implemented in NEAT 4.3.5). @@ -499,7 +527,7 @@ neat vcf_compare We provide unit tests (e.g., mutation and sequencing error models) and basic integration tests for the CLI. -### Run locally +### Guide to run locally ```bash conda env create -f environment.yml conda activate neat From 9d43fa3d8b51197d60f0b2518e75535a8e7b48f1 Mon Sep 17 00:00:00 2001 From: Keshav Date: Fri, 5 Dec 2025 21:17:36 +0100 Subject: [PATCH 03/13] Full Markov integration set of scripts. --- README.md | 113 +++--- neat/cli/commands/__init__.py | 3 +- neat/cli/commands/model_qual_score.py | 87 ++++ neat/model_quality_score/__init__.py | 9 + neat/model_quality_score/runner.py | 236 +++++++++++ neat/models/__init__.py | 1 + neat/models/markov_quality_model.py | 168 ++++++++ neat/models/quality_score_model.py | 426 -------------------- neat/quality_score_modeling/__init__.py | 7 + neat/quality_score_modeling/markov_utils.py | 226 +++++++++++ tests/test_models/test_qual_score_models.py | 80 ++++ 11 files changed, 876 insertions(+), 480 deletions(-) create mode 100644 neat/cli/commands/model_qual_score.py create mode 100644 neat/model_quality_score/__init__.py create mode 100644 neat/model_quality_score/runner.py create mode 100644 neat/models/markov_quality_model.py delete mode 100644 neat/models/quality_score_model.py create mode 100644 neat/quality_score_modeling/__init__.py create mode 100644 neat/quality_score_modeling/markov_utils.py create mode 100644 tests/test_models/test_qual_score_models.py diff --git a/README.md b/README.md index 082ff4c2..1987dd4d 100755 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ Welcome to the NEAT project, the NExt-generation sequencing Analysis Toolkit, version 4.3.5. This release of NEAT 4.3.5 includes several fixes and a little bit of restructuring, including a parallel process for running `neat read-simulator`. Our tests show much improved performance. If the logs seem excessive, you might try using the `--log-level ERROR` to reduce the output from the logs. See the [ChangeLog](ChangeLog.md) for notes. NEAT 4.3.5 is the official release of NEAT 4.0. It represents a lot of hard work from several contributors at NCSA and beyond. With the addition of parallel processing, we feel that the code is ready for production, and future releases will focus on compatibility, bug fixes, and testing. Future releases for the time being will be enumerations of 4.3.X. ## NEAT v4.3.5 -Neat 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to your run, simply add the "threads" parameter in your configuration file and run `read-simulator` as normal. NEAT will take care of the rest. You can customize the parameters in you configuration file, as needed. + +NEAT 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to your run, simply add the `threads` parameter in your configuration file and run `read-simulator` as normal. NEAT will take care of the rest. You can customize the parameters in your configuration file, as needed. We have completed major revisions on NEAT since 3.4 and consider NEAT 4.3.5 to be a stable release, in that we will continue to update and provide bug fixes and support. We will consider new features and pull requests. Please include justification for major changes. See [contribute](CONTRIBUTING.md) for more information. If you'd like to use some of our code in your own, no problem! Just review the [license](LICENSE.md), first. @@ -13,7 +14,7 @@ We've deprecated NEAT's command-line interface options for the most part, opting Developing and validating bioinformatics pipelines depends on access to genomic data with known ground truth. As a result, many research groups rely on simulated reads, and it can be useful to vary the parameters of the sequencing process itself. NEAT addresses this need as an open-source Python package that can integrate seamlessly with existing bioinformatics workflows—its simulations account for a wide range of sequencing parameters (e.g., coverage, fragment length, sequencing error models, mutational frequencies, ploidy, etc.) and allow users to customize their sequencing data. -NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads and is adaptable to any machine, with custom error models and the capability to handle single-base substitutions, indel errors, and other types of mutations. Unlike simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. +NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads and is adaptable to different machines, with custom error models and the capability to handle single-base substitutions, indel errors, and other types of mutations. Unlike simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. To cite this work, please use: @@ -40,6 +41,7 @@ To cite this work, please use: * [`neat model-fraglen`](#neat-model-fraglen) * [`neat gen-mut-model`](#neat-gen-mut-model) * [`neat model-seq-err`](#neat-model-seq-err) + * [`neat model-qual-score`](#neat-model-qual-score) * [`neat vcf_compare`](#neat-vcf_compare) * [Tests](#tests) * [Guide to run locally](#guide-to-run-locally) @@ -157,11 +159,11 @@ To run the simulator in multithreaded mode, set the `threads` value in the confi `reference`: Full path to a FASTA file to generate reads from. -`read_len`: The length of the reads for the FASTQ (if using). _Integer value, default 101._ +`read_len`: The length of the reads for the FASTQ (if using). _Integer value, default 101._ -`coverage`: Desired coverage value. _Float or integer, default = 10._ +`coverage`: Desired coverage value. _Float or integer, default = 10._ -`ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ +`ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ `paired_ended`: If paired-ended reads are desired, set this to `True`. Setting this to `True` requires either entering values for `fragment_mean` and `fragment_st_dev` or entering the path to a valid `fragment_model`. @@ -170,53 +172,37 @@ To run the simulator in multithreaded mode, set the `threads` value in the confi `fragment_st_dev`: Use with paired-ended reads, setting the standard deviation of the fragment length dataset. The following values can be set to `True` or omitted to use defaults. If `True`, NEAT will produce the file type. + The default is given: `produce_bam`: `False` - -`produce_vcf`: `False` - +`produce_vcf`: `False` `produce_fastq`: `True` More parameters are below: -`error_model`: Full path to an error model generated by NEAT. Leave empty to use default model _(default model based on human, sequenced by Illumina)_. - -`mutation_model`: Full path to a mutation model generated by NEAT. Leave empty to use a default model _(default model based on human data sequenced by Illumina)_. - -`fragment_model`: Full path to fragment length model generate by NEAT. Leave empty to use default model _(default model based on human data sequenced by Illumina)_. - -`threads`: The number of threads for NEAT to use. _Increasing the number will speed up read generation_. - -`avg_seq_error`: Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. _Float between 0 and 0.3. Default is set by the error model_. - -`rescale_qualities`: Rescale the quality scores to reflect the `avg_seq_error` rate above. Set `True` to activate if you notice issues with the sequencing error rates in your dataset. - -`include_vcf`: Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding FASTQ and BAM files, if requested. - -`target_bed`: Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. - -`discard_bed`: Full path to a list of regions to discard, in BED format. - -`mutation_rate`: Desired rate of mutation for the dataset. _Float between 0.0 and 0.3 (default is determined by the mutation model)_. - -`mutation_bed`: Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column like so (e.g., `mut_rate`=0.00). - -`rng_seed`: Manually enter a seed for the random number generator. Used for repeating runs. _Must be an integer_. - -`min_mutations`: Set the minimum number of mutations that NEAT should add, per contig. _Default is 0._ We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. - -`threads`: Number of threads to use. More than 1 will use multi-threading to speed up processing. - -`mode`: `size` or `contig` whether to divide the contigs into blocks or just by contig. By `contig` is the default, but division by `size` may speed up your run. Varying the `size` parameter may help if default values do not sufficiently improve runtimes. - -`size`: Default value of 500,000. - -`cleanup_splits`: If running more than one simulation on the same input FASTA, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. - -`reuse_splits`: If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`. - -The command-line options for NEAT are as follows: +| Parameter | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `error_model` | Full path to an error model or quality score model generated by NEAT. Leave empty to use default model (default model based on human, sequenced by Illumina). | +| `mutation_model` | Full path to a mutation model generated by NEAT. Leave empty to use a default model (default model based on human data sequenced by Illumina). | +| `fragment_model` | Full path to fragment length model generated by NEAT. Leave empty to use default model (default model based on human data sequenced by Illumina). | +| `threads` | The number of threads for NEAT to use. Increasing the number will speed up read generation. | +| `avg_seq_error` | Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. Float between 0 and 0.3. Default is set by the error model. | +| `rescale_qualities` | Rescale the quality scores to reflect the `avg_seq_error` rate above. Set `True` to activate if you notice issues with the sequencing error rates in your dataset. | +| `include_vcf` | Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding FASTQ and BAM files, if requested. | +| `target_bed` | Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. | +| `discard_bed` | Full path to a list of regions to discard, in BED format. | +| `mutation_rate` | Desired rate of mutation for the dataset. Float between 0.0 and 0.3 (default is determined by the mutation model). | +| `mutation_bed` | Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column as, e.g., `mut_rate`=0.00. | +| `rng_seed` | Manually enter a seed for the random number generator. Used for repeating runs. Must be an integer. | +| `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. | +| `threads` | Number of threads to use. More than 1 will use multi-threading to speed up processing. | +| `mode` | `size` or `contig` whether to divide the contigs into blocks or just by contig. By `contig` is the default, but division by `size` may speed up your run. | +| `size` | Default value of 500,000. | +| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. | +| `reuse_splits` | If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`. | + +The command line options for NEAT are as follows: Universal options can be applied to any subfunction. The commands should come before the function name (e.g., neat --log-level DEBUG read-simulator ...), except -h or --help, which can appear anywhere in the command. | Universal Options | Description | @@ -255,7 +241,7 @@ Features: - Introduces sequencing substitution errors using either the default model or empirically learned in `utilities` - Output a VCF file with the 'golden' set of true positive variants. These can be compared to bioinformatics workflow output (includes coverage and allele balance information) - Output a BAM file with the 'golden' set of aligned reads. These indicate where each read originated and how it should be aligned with the reference -- Create paired tumor/normal datasets using characteristics learned from real tumour data +- Create paired tumour/normal datasets using characteristics learned from real tumour data ### Estimated runtimes @@ -305,7 +291,7 @@ Here we enabled NEAT’s parallelized mode (“small filtering”), which splits | *S. cerevisiae* | 12,310,392 | 139,148 | 2.3191 | | Honeybee | 228,091,137 | 3,040,336 | 50.6723 | | Rice | 394,543,607 | 4,335,126 | 72.2521 | -| *Miscanthus* | 2,718,242,062 | 24876744 | 414.6 | +| *Miscanthus* | 2,718,242,062 | 24,876,744 | 414.6 | For mid-sized genomes (e.g., *E. coli* and *S. cerevisiae*), enabling parallelization reduced runtimes by roughly a factor of two to three compared to the base configuration. For larger genomes (honeybee and rice), the parallel configuration may make multi-hour simulations feasible. @@ -318,7 +304,7 @@ The following commands are examples for common types of data to be generated. Th ### Whole genome simulation Simulate whole genome dataset with random variants inserted according to the default model: -``` +```yml [contents of neat_config.yml] reference: hg19.fa read_len: 126 @@ -336,7 +322,7 @@ neat read-simulator \ ### Targeted region simulation Simulate a targeted region of a genome (e.g., an exome) with a targeted bed: -``` +```yml [contents of neat_config.yml] reference: hg19.fa read_len: 126 @@ -358,7 +344,7 @@ In this case, you would want to split the contig into blocks, rather than readin Also, we demonstrate the situation where you do not want any logs produced: `neat_config.yml`: -``` +```yml reference: giant_bacterium.fa read_len: 150 produce_bam: True @@ -380,7 +366,7 @@ neat read-simulator \ ### Insert specific variants Simulate a whole genome dataset with only the variants in the provided VCF file using `-v` and setting mutation rate to 0 with `-M`. -``` +```yml [contents of neat_config.yml] reference: hg19.fa read_len: 126 @@ -400,7 +386,7 @@ neat read-simulator \ ### Single-end reads Simulate single-end reads by omitting paired-ended options: -``` +```yml [contents of neat_config.yml] reference: hg18.fa read_len: 126 @@ -416,7 +402,7 @@ neat read-simulator \ ### Large single-end reads Simulate PacBio-like reads by providing an error model (note that this is not yet implemented in NEAT 4.0): -``` +```yml [contents of neat-config.yml] reference: hg19.fa read_len: 5000 @@ -500,6 +486,27 @@ neat model-seq-err \ Please note that `-i2` can be used in place of `-i` to produce paired data. +### `neat model-qual-score` + +Typical usage: + +```bash +neat model-qual-score \ + -i input_reads.fastq(.gz) \ + -q 33 \ + -Q 42 \ + -m 1000000 \ + --markov \ + -o /path/to/models \ + -p my_qual_model +``` + +Similarly, use `-i2` to produce a model for paired-ended data. `-q` denotes the quality score offset, while `-Q` is the maximum quality score. + +`-m` denotes the maximum number of reads to process. Use a large number or input -1 to use all reads. `--markov` fits a quality model from the input data using a Markov chain process instead of the baseline quality score model (optional). + +Finally, `-o` is the output directory for the model file and `-p` is the prefix for the output model, such that the file will be written as `.p.gz` inside the output folder. + ### `neat vcf_compare` Tool for comparing VCF files (Not yet implemented in NEAT 4.3.5). diff --git a/neat/cli/commands/__init__.py b/neat/cli/commands/__init__.py index f55b4b86..a3086440 100644 --- a/neat/cli/commands/__init__.py +++ b/neat/cli/commands/__init__.py @@ -1,2 +1,3 @@ -"""Modules related to the subcommands' interfaces""" +"""Modules related to the subcommands""" + from .base import * \ No newline at end of file diff --git a/neat/cli/commands/model_qual_score.py b/neat/cli/commands/model_qual_score.py new file mode 100644 index 00000000..aea15edf --- /dev/null +++ b/neat/cli/commands/model_qual_score.py @@ -0,0 +1,87 @@ +import argparse + +from ...model_quality_score import model_qual_score_runner +from .base import BaseCommand +from .options import output_group + + +class Command(BaseCommand): + """ + Generate a quality score model (traditional or Markov) from FASTQ. + """ + + name = "model-qual-score" + description = "Generate quality score model from FASTQ (optional Markov chain)." + + def add_arguments(self, parser: argparse.ArgumentParser): + + parser.add_argument( + "-i", + dest="input_files", + metavar="FILE", + nargs="+", + required=True, + help="Input FASTQ file(s) (gzipped or plain).", + ) + + parser.add_argument( + "-q", + dest="quality_offset", + type=int, + default=33, + help="Quality score offset [33].", + ) + + parser.add_argument( + "-Q", + dest="quality_scores", + type=int, + nargs="+", + default=[42], + help="Max quality or explicit list of quality scores [42].", + ) + + parser.add_argument( + "-m", + dest="max_num", + type=int, + default=-1, + help="Max number of reads to process [-1 = all].", + ) + + parser.add_argument( + "--markov", + dest="use_markov", + action="store_true", + default=False, + help="Use Markov quality model instead of the traditional model.", + ) + + parser.add_argument( + "--overwrite", + dest="overwrite", + action="store_true", + default=False, + help="Overwrite existing output file if present.", + ) + + output_group.add_to_parser(parser) + + def execute(self, arguments: argparse.Namespace): + + if len(arguments.quality_scores) == 1: + qual_scores: int | list[int] = arguments.quality_scores[0] + + else: + qual_scores = arguments.quality_scores + + model_qual_score_runner( + files=arguments.input_files, + offset=arguments.quality_offset, + qual_scores=qual_scores, + max_reads=arguments.max_num, + overwrite=arguments.overwrite, + output_dir=arguments.output_dir, + output_prefix=arguments.prefix, + use_markov=arguments.use_markov, + ) diff --git a/neat/model_quality_score/__init__.py b/neat/model_quality_score/__init__.py new file mode 100644 index 00000000..a6b92eb9 --- /dev/null +++ b/neat/model_quality_score/__init__.py @@ -0,0 +1,9 @@ +""" +Submodule to build quality score models for NEAT that wraps NEAT’s +existing sequencing error model and traditional quality model, with the +option to construct a Markov chain–based quality model instead +""" + +__all__ = ["model_qual_score_runner"] + +from .runner import model_qual_score_runner \ No newline at end of file diff --git a/neat/model_quality_score/runner.py b/neat/model_quality_score/runner.py new file mode 100644 index 00000000..2589b112 --- /dev/null +++ b/neat/model_quality_score/runner.py @@ -0,0 +1,236 @@ +""" +Runner for creating quality score models. + +This module implements the core logic for generating quality score models +from input FASTQ files. + +The resulting models are saved as a gzip-compressed pickle file. +""" + +import gzip +import pickle +import logging +from pathlib import Path +from typing import Iterable, List, Optional, Tuple + +import numpy as np + +from ..common import validate_input_path, validate_output_path +from ..models import SequencingErrorModel, TraditionalQualityModel +from ..model_sequencing_error.utils import parse_file +from ..models.markov_quality_model import MarkovQualityModel +from ..quality_score_modeling.markov_utils import build_markov_model + +__all__ = ["model_qual_score_runner"] + +_LOG = logging.getLogger(__name__) + + +def _prepare_quality_scores_argument( + qual_scores: Iterable[int | float], +) -> Tuple[List[int], bool]: + """Normalize the quality scores argument to a list of ints.""" + + if isinstance(qual_scores, int): + max_q = int(qual_scores) + return list(range(0, max_q + 1)), False + + sorted_scores = sorted(int(x) for x in qual_scores) + max_q = max(sorted_scores) + + return list(range(0, max_q + 1)), True + + +def model_qual_score_runner( + files: List[str], + offset: int, + qual_scores: Iterable[int | float], + max_reads: int, + use_markov: bool, + overwrite: bool, + output_dir: str, + output_prefix: str, +) -> None: + """Create and save a quality score model from FASTQ data. + + Parameters + ---------- + files: + A list of FASTQ file names. + offset: + Quality score offset (e.g. 33 for Sanger encoding). + qual_scores: + Either a single integer specifying the maximum quality score or an + iterable of integers specifying the allowed quality scores (binned + model). + max_reads: + Maximum number of reads to process from each input file. A value of + ``-1`` or ``float('inf')`` processes all reads. + use_markov: + If ``True``, construct a Markov chain–based quality model. If + ``False``, only the traditional quality model is produced. + overwrite: + If ``True``, allow an existing output file to be overwritten. + output_dir: + Directory to write the output model file. + output_prefix: + Prefix to use for the output file name. The file will be named + ``{output_prefix}.p.gz`` in the ``output_dir``. + """ + + if len(files) > 2: + _LOG.info("Only processing the first two input files") + files = files[:2] + + # Validate input paths + for file in files: + validate_input_path(file) + + _LOG.debug("Input files: %s", ", ".join(str(x) for x in files)) + _LOG.debug("Quality offset: %d", offset) + + final_quality_scores, binned = _prepare_quality_scores_argument(qual_scores) + _LOG.debug("Quality scores: %s", final_quality_scores) + + # Determine maximum number of reads to process + if max_reads in (-1, None): + num_records_to_process = float("inf") + + else: + num_records_to_process = max_reads + _LOG.debug( + "Maximum number of records to process: %s", + "all" if num_records_to_process == float("inf") else num_records_to_process, + ) + + # Validate output directory and file + validate_output_path(output_dir, is_file=False) + output_path = Path(output_dir) + output_file = output_path / f"{output_prefix}.p.gz" + validate_output_path(output_file, overwrite=overwrite) + _LOG.info("Writing output to: %s", output_file) + + # Containers for per-file quality model parameters + read_parameters: List[np.ndarray] = [] + average_errors: List[float] = [] + read_length = 0 + + # Collect traditional model parameters using existing NEAT utilities + file_num = 0 + + for file in files: + + file_num += 1 + _LOG.info("Reading file %d of %d", file_num, len(files)) + + params_by_position, file_avg_error, read_length = parse_file( + file, + final_quality_scores, + num_records_to_process, + offset, + read_length, + ) + + read_parameters.append(params_by_position) + average_errors.append(file_avg_error) + _LOG.info("Finished reading file %d", file_num) + + if not read_parameters: + raise RuntimeError( + "No quality score parameters were computed; check input FASTQ files." + ) + + average_error = float(np.average(average_errors)) if average_errors else 0.0 + _LOG.info("Average sequencing error across files: %f", average_error) + + # Prepare models for each input file + models: List[Tuple[SequencingErrorModel, TraditionalQualityModel, Optional[MarkovQualityModel]]] = [] + + for idx in range(len(read_parameters)): + + # Sequencing error model (always produced) + seq_err_model = SequencingErrorModel( + avg_seq_error=average_error, + read_length=read_length, + ) + + # Traditional quality model (always produced) + trad_model = TraditionalQualityModel( + average_error=average_error, + quality_scores=np.array(final_quality_scores), + qual_score_probs=read_parameters[idx], + ) + + # Optionally build Markov quality model + markov_model: Optional[MarkovQualityModel] = None + + if use_markov: + try: + ( + init_dist, + pos_dists, + max_quality, + train_read_length, + ) = build_markov_model( + [files[idx]], + num_records_to_process, + offset, + ) + + markov_model = MarkovQualityModel( + initial_distribution=init_dist, + position_distributions=pos_dists, + max_quality=max_quality, + read_length=train_read_length, + ) + + except Exception as exc: + _LOG.error( + "Failed to construct Markov model for %s: %s", + files[idx], + exc, + ) + raise + + models.append((seq_err_model, trad_model, markov_model)) + + _LOG.debug("Constructed %d model(s)", len(models)) + + # Write out the models + with gzip.open(output_file, "wb") as out_model: + if not models: + raise RuntimeError( + "Internal error: no models constructed, so there is nothing to write." + ) + + if len(models) == 1: + seq_err1, trad1, markov1 = models[0] + pickle.dump( + { + "error_model1": seq_err1, + "error_model2": None, + "qual_score_model1": markov1 if use_markov else trad1, + "qual_score_model2": None, + }, + out_model, + ) + + elif len(models) == 2: + (seq_err1, trad1, markov1), (seq_err2, trad2, markov2) = models + pickle.dump( + { + "error_model1": seq_err1, + "error_model2": seq_err2, + "qual_score_model1": markov1 if use_markov else trad1, + "qual_score_model2": markov2 if use_markov else trad2, + }, + out_model, + ) + + else: + # NEAT's read simulator only understands one or two models + raise RuntimeError( + f"Expected at most two quality models, but constructed {len(models)}." + ) + + _LOG.info("Quality score model saved to %s", output_file) diff --git a/neat/models/__init__.py b/neat/models/__init__.py index 5a63fe16..7fb80f1b 100644 --- a/neat/models/__init__.py +++ b/neat/models/__init__.py @@ -1,3 +1,4 @@ from .error_models import * from .mutation_model import * from .fragment_model import * +from .markov_quality_model import * \ No newline at end of file diff --git a/neat/models/markov_quality_model.py b/neat/models/markov_quality_model.py new file mode 100644 index 00000000..141e2dcb --- /dev/null +++ b/neat/models/markov_quality_model.py @@ -0,0 +1,168 @@ +""" +Empirical position-specific quality score model for NEAT. + +The model consists of an initial distribution giving the probability of observing each +quality score at the first position of a read and a sequence of per-position empirical +distributions giving the probability of observing each quality score at each subsequent +position. + +At generation time, the model samples a quality score independently at +each position from the corresponding empirical distribution. + +Although this model exists outside of ``neat.models.error_models``, it +parallels the ``TraditionalQualityModel`` class from NEAT’s core and can +be used interchangeably when constructing quality models. +""" + +from dataclasses import dataclass +from typing import Dict, List + +import numpy as np + +__all__ = ["MarkovQualityModel"] + +@dataclass +class MarkovQualityModel: + """Empirical, position-specific quality score model. + + Parameters + ---------- + initial_distribution: + A mapping from integer quality scores to their observed + probabilities at position 0. + position_distributions: + A list of mappings from integer quality scores to their observed + probabilities at each position. Element ``i`` in the list + corresponds to position ``i`` in the read. + max_quality: + The maximum quality observed in the training data. Generated + qualities will be clipped to not exceed the maximum quality. + read_length: + The length of reads used during training determines how many + position-specific distributions exist. + """ + + initial_distribution: Dict[int, float] + position_distributions: List[Dict[int, float]] + max_quality: int + read_length: int + + def __post_init__(self) -> None: + + # Normalize the initial distribution + total_init = float(sum(self.initial_distribution.values())) + + if total_init <= 0: + raise ValueError( + "Initial distribution must have positive probability mass." + ) + + self.initial_distribution = { + int(k): float(v) / total_init + for k, v in self.initial_distribution.items() + } + + # Normalize each position-specific distribution + norm_positions: List[Dict[int, float]] = [] + + for dist in self.position_distributions: + total = float(sum(dist.values())) + + if total <= 0: + # Fall back to a single mass at quality 0 + norm_positions.append({0: 1.0}) + continue + + norm_positions.append( + {int(k): float(v) / total for k, v in dist.items()} + ) + + self.position_distributions = norm_positions + + # Ensure max_quality and read_length are integers + self.max_quality = int(self.max_quality) + self.read_length = int(self.read_length) + + # Precompute numpy arrays for fast sampling + init_keys = list(self.initial_distribution.keys()) + init_vals = [self.initial_distribution[k] for k in init_keys] + self._init_scores = np.asarray(init_keys, dtype=int) + self._init_probs = np.asarray(init_vals, dtype=float) + + self._values_by_pos: List[np.ndarray] = [] + self._probs_by_pos: List[np.ndarray] = [] + + for dist in self.position_distributions: + keys = list(dist.keys()) + vals = [dist[k] for k in keys] + self._values_by_pos.append(np.asarray(keys, dtype=int)) + self._probs_by_pos.append(np.asarray(vals, dtype=float)) + + @property + def quality_scores(self) -> List[int]: + """List of supported quality scores.""" + + return list(range(0, self.max_quality + 1)) + + def _position_index_for_length(self, pos: int, length: int) -> int: + """Map a position in a generated read onto a training position.""" + + if length <= 1 or self.read_length <= 1: + return 0 + + idx = int(round((pos / max(1, length - 1)) * (self.read_length - 1))) + + if idx < 0: + idx = 0 + + elif idx > self.read_length - 1: + idx = self.read_length - 1 + + return idx + + def get_quality_scores( + self, + model_read_length: int, + length: int, + rng: np.random.Generator, + ) -> np.ndarray: + """Generate a synthetic quality score array using this empirical model. + + Parameters + ---------- + model_read_length: + The length of reads used during training. + length: + Desired length of the returned quality array. + rng: + A :class:`numpy.random.Generator` instance used for sampling. + + Returns + ------- + numpy.ndarray + An array of integers representing quality scores. + """ + if length <= 0: + return np.zeros(0, dtype=int) + + qualities = np.zeros(length, dtype=int) + + # Sample initial quality from the starting distribution + qualities[0] = rng.choice(self._init_scores, p=self._init_probs) + + # Sample each subsequent position independently from its empirical per-position distribution + for i in range(1, length): + p_idx = self._position_index_for_length(i, length) + vals = self._values_by_pos[p_idx] + probs = self._probs_by_pos[p_idx] + q = int(rng.choice(vals, p=probs)) + + # Clip to valid range + if q < 0: + q = 0 + elif q > self.max_quality: + q = self.max_quality + + qualities[i] = q + + return qualities diff --git a/neat/models/quality_score_model.py b/neat/models/quality_score_model.py deleted file mode 100644 index 2c52987b..00000000 --- a/neat/models/quality_score_model.py +++ /dev/null @@ -1,426 +0,0 @@ -import pysam -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from matplotlib.patches import Patch -import seaborn as sns -from scipy.stats import ttest_ind, ttest_rel, f_oneway, norm, levene, shapiro -from sklearn.utils import resample -import pathlib -import pickle - - -def make_qual_score_list(bam_file): - """Takes an input BAM file and creates lists of quality scores. This becomes a data frame, which will be - pre-processed for Markov chain analysis.""" - - index = f"{bam_file}.bai" - - if not pathlib.Path(index).exists(): - print("No index found, creating one.") - - pysam.index(bam_file) - - file_to_parse = pysam.AlignmentFile(bam_file, "rb", check_sq=False) - num_recs = file_to_parse.count() - print(f"{num_recs} records to parse") - - modulo = round(num_recs / 9) - - qual_list = [] - i = 0 - j = 0 - - def print_update(number, factor, percent): - - if number % factor == 0: - percent += 10 - print(f"{percent}% complete", end="\r") - - return percent - - print("Parsing file") - - for item in file_to_parse.fetch(): - - if item.is_unmapped or "S" in item.cigarstring: - i += 1 - j = print_update(i, modulo, j) - - continue - - # mapping quality scores - - align_qual = item.query_alignment_qualities - - # append to master lists - - qual_list.append(align_qual) - i += 1 - j = print_update(i, modulo, j) - - print(f"100% complete") - file_to_parse.close() - - quality_df = pd.DataFrame(qual_list) # turn list of lists into a dataframe - quality_df = quality_df.fillna(0) # pre-processing - fill in missing data - - # filters to process outliers - - quality_df[quality_df > 40] = 40 - quality_df[quality_df < 0] = 0 - - return quality_df - - -def estimate_transition_probabilities(std_dev): - """Takes a standard deviation as an input and generates the transition probabilities with a normal - distribution that can be used to represent a Markov process.""" - - # define the probabilities for transition states based on a normal distribution - - transition_probs = { - -3: norm.pdf(-3, 0, std_dev), - -2: norm.pdf(-2, 0, std_dev), - -1: norm.pdf(-1, 0, std_dev), - 0: norm.pdf(0, 0, std_dev), - 1: norm.pdf(1, 0, std_dev), - 2: norm.pdf(2, 0, std_dev), - 3: norm.pdf(3, 0, std_dev), - } - - # normalize the probabilities to sum to 1 - - total_prob = sum(transition_probs.values()) - - for k in transition_probs: - transition_probs[k] /= total_prob - - return transition_probs - - -def apply_markov_chain(quality_df, noise_level=10, std_dev=2): - """Takes a data frame representing quality scores by position along a read and parameters to increase - variability in the Markov process as inputs and generates predictions based on an ergodic Markov chain. - Generates a data frame with simulated reads.""" - - transition_probs = estimate_transition_probabilities(std_dev) - num_rows, num_cols = quality_df.shape - - count = 0 - markov_preds = [] - - for row in quality_df.iterrows(): - - qualities = row[1].values - pred_qualities = np.zeros_like(qualities) - pred_qualities[0] = qualities[0] # initial state - - print(count, ":", qualities) - row_mean = np.mean(qualities) - row_median = np.median(qualities) - row_std = np.std(qualities) - - for i in range(1, len(quality_df.columns)): - prev_quality = pred_qualities[i - 1] - transitions = list(transition_probs.keys()) - probabilities = list(transition_probs.values()) - next_quality = np.random.choice(transitions, p=probabilities) - - pred_qualities[i] = max(0, prev_quality + next_quality) # ensuring no negative qualities - - # the noise parameter prevents long stretches of the predicted quality scores being very similar - - pred_qualities[i] += np.random.normal(0, noise_level) # add some noise to the predictions - pred_qualities[i] = min(max(pred_qualities[i], 0), 40) # finalize range - - print(count, "mean:", row_mean, "median:", row_median, "st dev:", row_std) - count += 1 - - for i in range(1, len(quality_df.columns)): - - if pred_qualities[i] < row_mean - 2 * row_std: - - if np.random.rand() < 0.95: # 95% chance to substitute abnormally low quality scores - - # uses median and standard deviation from read (not the mean because of outliers) - - new_quality = np.random.normal(row_median, row_std) - pred_qualities[i] = min(max(new_quality, 0), 40) - - # the maximum predicted quality score should be derived from the data - - max_quality = np.max(qualities) - pred_qualities = np.clip(pred_qualities, 0, max_quality) - - markov_preds.append(pred_qualities) - - # randomly sample 30% of the total quality scores for a given read to have the maximum value - - num_samples = int(0.3 * len(quality_df.columns)) # TO DO: make this a parameter - sample_indices = np.random.choice(len(quality_df.columns), num_samples, replace=False) - pred_qualities[sample_indices] = max_quality - - markov_preds_df = pd.DataFrame(markov_preds) - - # apply final linear transformations - - edge_len = int(len(quality_df.columns) * 0.01) - # mid_start = int(len(quality_df.columns) * 0.40) - # mid_end = int(len(quality_df.columns) * 0.60) - - markov_preds_df.iloc[:, :edge_len] -= 5 - markov_preds_df.iloc[:, :edge_len] = markov_preds_df.iloc[:, :edge_len].clip(lower=0) - - markov_preds_df.iloc[:, -edge_len:] -= 5 - markov_preds_df.iloc[:, -edge_len:] = markov_preds_df.iloc[:, -edge_len:].clip(lower=0) - - # markov_preds_df.iloc[:, mid_start:mid_end] += 2 - - return markov_preds_df - - -def plot_heatmap(y_preds_df, file_path): - """Takes a dataframe of predicted quality scores and plots a seaborn heatmap to visualize them.""" - - sns.heatmap(y_preds_df, vmin=0, vmax=max(markov_preds_df.max()), cmap="viridis") - plt.savefig(file_path) - print("Heatmap plotted") - - -def save_file(df, csv_file_path, pickle_file_path): - """Saves the dataframe to a CSV file and a pickle file.""" - - df.to_csv(csv_file_path, index=False) - - with open(pickle_file_path, "wb") as f: - pickle.dump(df, f) - - print(f"Data saved to {csv_file_path} and {pickle_file_path}") - - -def compare_quality_scores(quality_df, markov_preds_df): - """Compares the means and variances of quality scores within each row of the original and predicted data frames.""" - - mean_p_values = [] - variance_p_values = [] - - for i in range(len(quality_df)): - original = quality_df.iloc[i].values - predicted = markov_preds_df.iloc[i].values - - original_mean = np.mean(original) - predicted_mean = np.mean(predicted) - - # print the pairs of means - - print(f'Row {i}: Original Mean = {original_mean}, Predicted Mean = {predicted_mean}') - - # test for equality of means - - t_stat, mean_p_value = ttest_ind(original, predicted, equal_var=False) - mean_p_values.append(mean_p_value) - - # test for equality of variances - - stat, variance_p_value = levene(original, predicted) - variance_p_values.append(variance_p_value) - - return mean_p_values, variance_p_values - - -def plot_comparison_results(mean_p_values, variance_p_values): - """Plots the comparison results for means and variances of quality scores.""" - - rows = range(len(mean_p_values)) - - fig, ax = plt.subplots(1, 2, figsize=(12, 6)) - - ax[0].scatter(rows, mean_p_values, marker='o') - ax[0].set_title('P-values for Equality of Means') - ax[0].set_xlabel('Row Index') - ax[0].set_ylabel('P-value') - - ax[1].scatter(rows, variance_p_values, marker='o') - ax[1].set_title('P-values for Equality of Variances') - ax[1].set_xlabel('Row Index') - ax[1].set_ylabel('P-value') - - plt.tight_layout() - plt.show() - - -def calculate_row_means(df): - return df.mean(axis=1) - - -def calculate_row_variances(df): - return df.var(axis=1) - - -def bootstrap_p_values(list1, list2, n_bootstrap=1000): - bootstrapped_p_values = [] - - for _ in range(n_bootstrap): - sample1 = resample(list1, replace=True) - sample2 = resample(list2, replace=True) - - t_stat, p_value = ttest_ind(sample1, sample2, equal_var=False) - bootstrapped_p_values.append(p_value) - - return bootstrapped_p_values - - -def permutation_test(list1, list2, n_permutations=10000): - observed_diff = abs(np.mean(list1) - np.mean(list2)) - combined = np.concatenate([list1, list2]) - perm_diffs = np.zeros(n_permutations) - - for i in range(n_permutations): - np.random.shuffle(combined) - perm_list1 = combined[:len(list1)] - perm_list2 = combined[len(list1):] - perm_diffs[i] = abs(np.mean(perm_list1) - np.mean(perm_list2)) - - p_value = np.sum(perm_diffs >= observed_diff) / n_permutations - return p_value - - -def test_normality(quality_df, markov_preds_df): - """Tests normality of quality scores within each row using the Shapiro-Wilk test.""" - - quality_df_normality = [] - markov_preds_df_normality = [] - - for i in range(len(quality_df)): - original = quality_df.iloc[i].values - predicted = markov_preds_df.iloc[i].values - - # test normality for the original data - - stat, p_value = shapiro(original) - quality_df_normality.append(p_value > 0.05) - - # test normality for the predicted data - - stat, p_value = shapiro(predicted) - markov_preds_df_normality.append(p_value > 0.05) - - return quality_df_normality, markov_preds_df_normality - - -def plot_normality_results(quality_df_normality, markov_preds_df_normality): - """Plots the normality results for quality scores in side by side heatmaps.""" - - fig, ax = plt.subplots(1, 2, figsize=(12, 6)) - - quality_df_normality = np.array(quality_df_normality).reshape(-1, 1) - markov_preds_df_normality = np.array(markov_preds_df_normality).reshape(-1, 1) - - # create a color map for the heatmap - - cmap = sns.color_palette(["orange", "blue"]) - - sns.heatmap(quality_df_normality, ax=ax[0], cbar=False, cmap=cmap) - ax[0].set_title('Normality of Original Quality Scores') - ax[0].set_xlabel('Quality Scores') - ax[0].set_ylabel('Row Index') - - sns.heatmap(markov_preds_df_normality, ax=ax[1], cbar=False, cmap=cmap) - ax[1].set_title('Normality of Predicted Quality Scores') - ax[1].set_xlabel('Quality Scores') - - legend_elements = [Patch(facecolor='orange', edgecolor='black', label='Non-normal'), - Patch(facecolor='blue', edgecolor='black', label='Normal')] - fig.legend(handles=legend_elements, loc='lower right', title='Distribution') - - plt.tight_layout() - plt.show() - - -# example usage - -# bam_file = "/Users/keshavgandhi/Downloads/H1N1.bam" - -bam_file = "/Users/keshavgandhi/Downloads/subsample_3.125.bam" -quality_df = make_qual_score_list(bam_file) - -markov_preds_df = apply_markov_chain(quality_df) - -# plot_heatmap(markov_preds_df, 'markov_chain_heatmap.svg') -# save_to_csv_and_pickle(markov_preds_df, 'markov_preds.csv', 'markov_preds.pickle') - -sns.heatmap(quality_df, vmin=0, vmax=max(quality_df.max()), cmap='viridis') -sns.heatmap(markov_preds_df, vmin=0, vmax=max(markov_preds_df.max()), cmap='viridis') - -# markov_preds_df - -# for i in range (1, max(markov_preds_df)): -# print(max(markov_preds_df[i])) - -# quality_df.iloc[100][25:75] -# markov_preds_df.iloc[100][25:75] - -bam_file = "/Users/keshavgandhi/Downloads/H1N1.bam" -test_df = make_qual_score_list(bam_file) -markov_preds_df = apply_markov_chain(test_df) - -# compare quality scores - -mean_p_values, variance_p_values = compare_quality_scores(test_df, markov_preds_df) - -markov_means = calculate_row_means(markov_preds_df).tolist() -quality_means = calculate_row_means(test_df).tolist() - -markov_variances = calculate_row_variances(markov_preds_df).tolist() -quality_variances = calculate_row_variances(test_df).tolist() - -# perform permutation test - -permutation_p_value_means = permutation_test(markov_means, quality_means) - -# perform two-sample t-test - -t_stat_means, ttest_p_value_means = ttest_ind(markov_means, quality_means, equal_var=False) - -# bootstrap analysis for means - -bootstrapped_p_values_means = bootstrap_p_values(markov_means, quality_means) -mean_bootstrap_p_value_means = np.mean(bootstrapped_p_values_means) -std_bootstrap_p_value_means = np.std(bootstrapped_p_values_means) - -print(f'Permutation test p-value (means): {permutation_p_value_means}') -print(f'Two-sample t-test p-value (means): {ttest_p_value_means}') -print(f'Bootstrap mean p-value (means): {mean_bootstrap_p_value_means}') -print(f'Bootstrap p-value standard deviation (means): {std_bootstrap_p_value_means}') - -# perform permutation test for variances - -permutation_p_value_variances = permutation_test(markov_variances, quality_variances) - -# perform two-sample t-test for variances - -t_stat_variances, ttest_p_value_variances = ttest_ind(markov_variances, quality_variances, equal_var=False) - -# perform bootstrap analysis for variances - -bootstrapped_p_values_variances = bootstrap_p_values(markov_variances, quality_variances) -mean_bootstrap_p_value_variances = np.mean(bootstrapped_p_values_variances) -std_bootstrap_p_value_variances = np.std(bootstrapped_p_values_variances) - -print(f'Permutation test p-value (variances): {permutation_p_value_variances}') -print(f'Two-sample t-test p-value (variances): {ttest_p_value_variances}') -print(f'Bootstrap mean p-value (variances): {mean_bootstrap_p_value_variances}') -print(f'Bootstrap p-value standard deviation (variances): {std_bootstrap_p_value_variances}') - -# plot comparison results - -plot_comparison_results(mean_p_values, variance_p_values) - -# test normality - -quality_df_normality_test, markov_preds_df_normality_test = test_normality(test_df, markov_preds_df) - -# plot normality results - -plot_normality_results(quality_df_normality_test, markov_preds_df_normality_test) diff --git a/neat/quality_score_modeling/__init__.py b/neat/quality_score_modeling/__init__.py new file mode 100644 index 00000000..3e110be4 --- /dev/null +++ b/neat/quality_score_modeling/__init__.py @@ -0,0 +1,7 @@ +""" +Load quality score modeling utilities for NEAT +""" + +__all__ = ["build_markov_model"] + +from .markov_utils import build_markov_model \ No newline at end of file diff --git a/neat/quality_score_modeling/markov_utils.py b/neat/quality_score_modeling/markov_utils.py new file mode 100644 index 00000000..5ea3f5cb --- /dev/null +++ b/neat/quality_score_modeling/markov_utils.py @@ -0,0 +1,226 @@ +""" +Utility functions for building an empirical quality score model. + +The functions in this module extract per-read quality scores from FASTQ files +and compute the statistics required by the ``MarkovQualityModel``. +""" + +import logging +from collections import defaultdict +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +from ..model_sequencing_error.utils import convert_quality_string +from ..common import open_input + +_LOG = logging.getLogger(__name__) + +__all__ = [ + "read_quality_lists", + "compute_initial_distribution", + "compute_position_distributions", + "build_markov_model", +] + + +def read_quality_lists( + files: Iterable[str], + max_reads: int, + offset: int, +) -> Tuple[List[List[int]], int]: + """Read per-read quality scores from one or more FASTQ files. + + Parameters + ---------- + files: + An iterable of FASTQ filenames. Only the first ``max_reads`` reads + will be consumed from each file. If a file contains fewer than + ``max_reads`` reads, the entire file is processed. + max_reads: + Maximum number of reads per file to consume. A value of ``-1`` + or ``float('inf')`` indicates that all reads in the file should be + processed. + offset: + Numeric quality score offset (usually 33 for Sanger encoding). + + Returns + ------- + Tuple[List[List[int]], int] + A tuple containing the list of quality score lists and the read + length inferred from the first record. If no reads are found this + function returns an empty list and zero. + """ + + qualities: List[List[int]] = [] + read_length = 0 + + for file in files: + + path = Path(file) + if not path.exists(): + raise FileNotFoundError(f"Input FASTQ file not found: {file}") + + reads_to_parse = max_reads + + if reads_to_parse in (None, -1): + reads_to_parse = float("inf") + + reads_read = 0 + + with open_input(file) as fq_in: + + while reads_read < reads_to_parse: + + # FASTQ format: four lines per record (header, sequence, plus, quality) + header = fq_in.readline() + + if not header: + break # end of file + + seq = fq_in.readline() + plus = fq_in.readline() + qual = fq_in.readline() + + if not qual: + break # truncated file + + quality_scores = convert_quality_string(qual.strip(), offset) + + if not read_length: + read_length = len(quality_scores) + + # Skip reads that do not match the inferred read length + if len(quality_scores) != read_length: + _LOG.debug( + "Skipping record of length %d (expected %d)", + len(quality_scores), + read_length, + ) + continue + + qualities.append(quality_scores) + reads_read += 1 + + return qualities, read_length + + +def compute_initial_distribution(qualities: Iterable[List[int]]) -> Dict[int, float]: + """Compute the empirical distribution of first quality scores. + + Parameters + ---------- + qualities: + An iterable of quality score lists. The first element of each list + will be used to populate the distribution. If a list is empty, it + contributes nothing. + + Returns + ------- + Dict[int, float] + A dictionary mapping quality scores to counts. + """ + + counts: Dict[int, float] = defaultdict(float) + + for qlist in qualities: + if not qlist: + continue + + counts[qlist[0]] += 1.0 + + return dict(counts) + + +def compute_position_distributions( + qualities: Iterable[List[int]], + read_length: int, +) -> List[Dict[int, float]]: + """Compute per-position empirical distributions of quality scores. + + Parameters + ---------- + qualities: + An iterable of quality score lists. All lists are expected to have + length ``read_length``. Any that do not are ignored. + read_length: + Length of reads used for training. + + Returns + ------- + List[Dict[int, float]] + A list of length ``read_length``. Element ``i`` is a dictionary + mapping quality scores to counts at position ``i``. + """ + + if read_length <= 0: + return [] + + # One histogram per position + histograms: List[Dict[int, float]] = [ + defaultdict(float) for _ in range(read_length) + ] + + for qlist in qualities: + + if len(qlist) != read_length: + continue + + for i, q in enumerate(qlist): + histograms[i][int(q)] += 1.0 + + # Convert default dicts to plain dicts + return [dict(h) for h in histograms] + + +def build_markov_model( + files: Iterable[str], + max_reads: int, + offset: int, +) -> Tuple[Dict[int, float], List[Dict[int, float]], int, int]: + """Build distributions and max quality for the empirical quality model. + + This is a convenience wrapper around :func:`read_quality_lists`, + :func:`compute_initial_distribution`, and + :func:`compute_position_distributions`. It returns the empirical + distributions and the maximum observed quality, along with the read + length used for training. + + Parameters + ---------- + files: + An iterable of FASTQ filenames. + max_reads: + Maximum number of reads per file to consume. ``-1`` or + ``float('inf')`` indicates that all reads should be processed. + offset: + Numeric quality score offset (usually 33 for Sanger encoding). + + Returns + ------- + Tuple[Dict[int, float], List[Dict[int, float]], int, int] + A tuple ``(initial_distribution, position_distributions, + max_quality, read_length)``. Distributions are returned as raw + counts; the caller is responsible for normalisation. + """ + + qualities, read_length = read_quality_lists(files, max_reads, offset) + + if not qualities: + raise ValueError("No quality scores could be read from the input files.") + + init_dist = compute_initial_distribution(qualities) + pos_dists = compute_position_distributions(qualities, read_length) + + # Determine maximum observed quality + max_quality = 0 + + for qlist in qualities: + if not qlist: + continue + + mq = max(qlist) + + if mq > max_quality: + max_quality = mq + + return init_dist, pos_dists, max_quality, read_length diff --git a/tests/test_models/test_qual_score_models.py b/tests/test_models/test_qual_score_models.py new file mode 100644 index 00000000..9a8485eb --- /dev/null +++ b/tests/test_models/test_qual_score_models.py @@ -0,0 +1,80 @@ +""" +Unit tests for MarkovQualityModel +""" + +import pytest +import numpy as np +from numpy.random import default_rng + +from neat.models.markov_quality_model import MarkovQualityModel + + +def test_markov_quality_model_shapes_and_range(): + """ + Basic sanity check for MarkovQualityModel: output array shape and bounds. + """ + rng = default_rng(11) + # Simple symmetric distributions around a high-quality region + init_dist = {30: 1.0, 31: 1.0, 32: 1.0} + single_pos_dist = {-1: 1.0, 0: 2.0, 1: 1.0} + max_q = 42 + read_length = 151 + # Position-specific distributions that reuse the same shape at every position + position_distributions = [single_pos_dist] * read_length + qm = MarkovQualityModel(init_dist, position_distributions, max_q, read_length) + qs = qm.get_quality_scores(model_read_length=read_length, length=75, rng=rng) + assert isinstance(qs, np.ndarray) + assert len(qs) == 75 + # Ensure we stay within the valid range + assert qs.min() >= 0 + assert qs.max() <= max_q + + +def test_markov_quality_model_quality_scores_property_matches_range(): + """ + quality_scores should expose the full discrete range [0, max_quality]. + """ + init_dist = {35: 1.0} + single_pos_dist = {0: 1.0} + max_q = 40 + read_length = 151 + position_distributions = [single_pos_dist] * read_length + qm = MarkovQualityModel(init_dist, position_distributions, max_q, read_length) + scores = qm.quality_scores + assert isinstance(scores, list) + assert scores[0] == 0 + assert scores[-1] == max_q + # The range should be contiguous + assert scores == list(range(0, max_q + 1)) + + +def test_markov_quality_model_reproducible_with_seed(): + """Markov quality model should be deterministic for a fixed RNG state.""" + init_dist = {30: 1.0, 31: 1.0} + single_pos_dist = {-1: 1.0, 0: 2.0, 1: 1.0} + max_q = 42 + read_length = 151 + position_distributions = [single_pos_dist] * read_length + qm = MarkovQualityModel(init_dist, position_distributions, max_q, read_length) + rng1 = default_rng(12) + rng2 = default_rng(12) + qs1 = qm.get_quality_scores(model_read_length=read_length, length=100, rng=rng1) + qs2 = qm.get_quality_scores(model_read_length=read_length, length=100, rng=rng2) + assert isinstance(qs1, np.ndarray) + assert isinstance(qs2, np.ndarray) + assert np.array_equal(qs1, qs2) + + +def test_markov_quality_model_invalid_initial_distribution_raises(): + """ + The model should reject empty or zero-mass initial distributions. + """ + read_length = 151 + single_pos_dist = {0: 1.0} + position_distributions = [single_pos_dist] * read_length + # Empty initial distribution + with pytest.raises(ValueError): + MarkovQualityModel({}, position_distributions, 40, read_length) + # Zero total mass in initial distribution + with pytest.raises(ValueError): + MarkovQualityModel({30: 0.0}, position_distributions, 40, read_length) From 7d6808f73e641086e1b33d662da528e71fc5e82a Mon Sep 17 00:00:00 2001 From: Keshav Date: Fri, 30 Jan 2026 19:20:26 +0100 Subject: [PATCH 04/13] Renaming to parallel_block_size and parallel_mode; fixing tests. --- config_template/template_neat_config.yml | 129 +++++++++------------- neat/read_simulator/utils/options.py | 20 ++-- neat/read_simulator/utils/split_inputs.py | 4 +- tests/test_cli/test_cli.py | 35 +++++- tests/test_read_simulator/test_options.py | 57 +++++++--- version.py | 10 ++ 6 files changed, 147 insertions(+), 108 deletions(-) create mode 100644 version.py diff --git a/config_template/template_neat_config.yml b/config_template/template_neat_config.yml index 84d2c3af..2471c627 100644 --- a/config_template/template_neat_config.yml +++ b/config_template/template_neat_config.yml @@ -1,25 +1,27 @@ -## Template for gen_reads parallel +## Template for NEAT's read-simulator (as of version 4.3.5, parallelization-friendly) ## Any parameter that is not required but has a default value will use the ## default value even if the variable is not included in the config. For -## required items, they must be included in the config and the must be given a value. +## required items, they must be included in the config and they must be given a value. ## All other items can be present or not. If present and the value is set to a single ## period, the variable will be treated as though it had been omitted. Please do -## not modify this template, but instead make a copy in your working directory. Done this -## way, you can run without even needing to declare -c. +## not modify this template, but instead make a copy in your working directory. +## Run with: neat read-simulator -c -o [-p ] -# Absolute path to input reference fasta file +# Absolute path to input reference FASTA file # type = string | required: yes reference: REQUIRED -# Read length of the reads in the fastq output. Only required if @produce_fastq is set to true -# type = int | required: no | default = 101 +# Read length of the reads in the FASTQ output. Only used if produce_fastq = true +# type = int | required: no | default = 151 read_len: . -# Average Coverage for the entire genome. -# type = float | required: no | default = 10.0 +# Average coverage for the entire genome +# type = int | required: no | default = 10 coverage: . -# Absolute path to file with sequencing error model +# Absolute path to file with sequencing error model or quality-score model +# Error models are typically produced by neat model-seq-err (from FASTQ/BAM-like inputs) +# Quality-score models can be produced by neat model-qual-score (optionally fit with --markov) # type = string | required: no | default: /neat/models/defaults/default_error_model.pickle.gz error_model: . @@ -27,13 +29,11 @@ error_model: . # type = float | required = no | must be between 0.0 and 0.3 avg_seq_error: . -# This scales the quality scores to match the desired average sequencing error rate -# specified by avg_seq_error. +# Scale quality scores to match avg_seq_error # type: boolean | required = no | default = false rescale_qualities: . -# This is the factor to add to the quality scores to get the ascii text version of the -# score. The default follows the sanger quality offset +# PHRED quality offset (e.g., 33 for Sanger) # type: int | required = no | default = 33 quality_offset: . @@ -41,126 +41,99 @@ quality_offset: . # type = int | required = no | default = 2 ploidy: . -# Absolute path to vcf file containing variants that will always be included, regardless -# of genotype and filter. You can pre-filter your vcf for these fields before inputting it -# if this is not the desired behavior. +# Absolute path to VCF file containing variants that will always be included # type: string | required = no include_vcf: . -# Absolute path to bed file containing reference regions that the simulation -# should target. +# Absolute path to BED file containing reference regions that the simulation should target # type = string | required = no target_bed: . -# Scalar value for coverage in regions outside the targeted bed. Example 0.5 -# would get you roughly half the coverage as the on target areas. Default is -# 0 coverage in off-target regions. Number should be a float in decimal. -# type: float | required = no | default = 0.00 -off_target_scalar: . - -# Absolute path to bed file containing reference regions that the simulation -# should discard. +# Absolute path to BED file containing reference regions that the simulation should discard # type = string | required = no discard_bed: . -# Absolute path to the mutation model pickle file. Omitting this value will cause -# NEAT to use the default model, with some standard parameters, and generally uniform biases. +# Absolute path to the mutation model pickle file +# Typically produced by neat gen-mut-model using a reference FASTA and variants VCF +# Uses trinucleotide context and a trinucleotide transition-matrix to select sites/alleles # type: string | required = no mutation_model: . -# Average mutation rate per base pair. Overall average is 0.001, or model default -# Use either this value to override the mutation rate for the default or input model. +# Average mutation rate per base pair (overrides model mutation rate, if set) # type: float | required = no | must be between 0.0 and 0.3 mutation_rate: . -# Absolute path to a bed file with mutation rates by region. -# Rates must be in the fourth column and be of the form "mut_rate=x.xx" -# Rates must be between 0.00 and 0.03 +# Absolute path to a BED file with mutation rates by region +# Rates must be in the third column and be of the form "mut_rate=x.xx" +# Rates must be between 0.00 and 0.30 # type: string | required = no mutation_bed: . -# Whether the output should be paired ended. For certain conditions (i.e., vcf only or -# fasta only), this will be ignored. If this is true, then there must be an included fragment -# length model output from runner.py or a mean and standard deviation -# by declaring values for @fragment_mean and @fragment_std_dev. +# Paired-end output mode +# If true, you must provide either fragment_model OR (fragment_mean + fragment_st_dev) # type: boolean | required = no | default = false paired_ended: . -# Absolute path to a pickle file containing the fragment length model output -# from runner.py. +# Absolute path to a pickle file containing the fragment length model +# Typically produced by neat model-fraglen (learned from BAM alignments) # type: string | required = no | default: /neat/models/defaults/default_fraglen_model.pickle.gz fragment_model: . -# Mean for the paired end fragment length. This only applies if paired-ended is set to true. -# This number will form the mean for the sample distribution of the fragment lengths in the simulation -# Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used. -# type: float | required: no (unless paired-ended) +# Mean paired-end fragment length (used only if paired_ended = true and fragment_model is not set) +# type: float | required: no (unless paired_ended and no fragment_model) fragment_mean: . -# Standard deviation for the paired end fragment length. This only applies if paired-ended is set to true. -# This number will form the standard deviation about the mean specified above for the sample distribution -# of the fragment lengths in the simulation. -# Note: This number is REQUIRED if paired_ended is set to true, unless a fragment length model is used. -# type: float | required: no (unless paired-ended) +# Standard deviation of paired-end fragment length distribution +# type: float | required: no (unless paired_ended and no fragment_model) fragment_st_dev: . -# Whether to produce the golden bam file. This file will contain the reads -# aligned with the exact region of the genome +# Produce golden BAM file (aligned reads) # type: boolean | required = no | default = false produce_bam: . -# Whether to produce a vcf file containing all the mutation errors added -# by NEAT. +# Produce golden VCF file (all NEAT-added variants) # type: boolean | required = no | default = false produce_vcf: . -# Whether to output the fastq(s) of the reads. This is the default output. NEAT -# will produce 1 fastq for single ended reads or 2 fastqs for paired ended. +# Produce FASTQ output (default output) # type: boolean | required = no | default = true produce_fastq: . -# If set to true, this will ignore statistical models and force coverage to be -# constant across the genome. This is considered a debugging feature. +# Ignore coverage bias models and force constant coverage (debugging) # type: boolean | required = no | default = false no_coverage_bias: . -# Set an RNG seed value. Runs using identical RNG values should produce identical results -# so things like read locations, variant positions, error positions, etc. should be the same. -# Useful for debugging. +# RNG seed for reproducibility # type: int | required = no rng_seed: . -# Set an absolute minimum number of mutations. The program always adds at least 1 mutation. -# Useful for very small datasets. -# type: int | required = no +# Absolute minimum number of mutations (NEAT always adds at least 1) +# type: int | required = no | default = 0 min_mutations: . -# Overwrite the output files, if they are named the same as the current run. -# Default is to quit if files already exist to avoid data destruction +# Overwrite output files if they already exist # type: bool | required = no | default = false overwrite_output: . # How to split the input reference for parallelization -# Note if threads == 1, this option has no effect. +# Note: if threads == 1, this option has no effect (treated as contig) # type = string | required: no | default = contig | values: contig, size parallel_mode: . -# Target block size if by = size (overlap = read_len * 2). -# Default is 500000 when by = size. Not used for by = contig. -# type = int | required: no | default = 500000 (when by=size) +# Target block size if parallel_mode = size (overlap = read_len * 2) +# type = int | required: no | default = 500000 (when parallel_mode=size) parallel_block_size: . -# Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run. -# type = int | required: no | default = all available. +# Number of worker processes/threads to run +# type = int | required: no | default = 1 threads: . -# Delete the 'splits' directory after stitching completes -# Note if threads == 1, this option has no effect. +# If true, delete splits after stitching completes +# Set false to preserve splits in /splits for reuse # type = bool | required: no | default = true cleanup_splits: . -# Reuse existing files in '/splits' and skip the split step. -# The directory must contain neat-generated files and must be in the output dir within "splits" -# Note if threads == 1, this option has no effect. -# type = bool | required: no | default = False -reuse_splits: . \ No newline at end of file +# Reuse existing files in '/splits' and skip the split step +# Directory must exist and contain NEAT-generated split files +# type = bool | required: no | default = false +reuse_splits: . diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index a45f8664..cf897761 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -155,7 +155,7 @@ def __init__(self, self.discard_bed: Path | None = discard_bed self.mutation_model: Path | None = mutation_model self.mutation_rate: float | None = mutation_rate - self.mutation_bed: str | None = mutation_bed + self.mutation_bed: Path | None = mutation_bed self.quality_offset: int = quality_offset self.paired_ended: bool = paired_ended @@ -266,7 +266,7 @@ def from_cli(output_dir: Path, # Update items to config or default values base_options.__dict__.update(final_args) - base_options.set_random_seed() + base_options.rng = base_options.set_random_seed() # Some options checking to clean up the args dict base_options.check_options() @@ -288,7 +288,7 @@ def check_and_log_error(keyname, value_to_check, crit1, crit2): if value_to_check not in crit2: _LOG.error(f"Must choose one of {crit2}") sys.exit(1) - elif isinstance(crit1, int) and isinstance(crit2, int): + elif isinstance(crit1, (int, float)) and isinstance(crit2, (int, float)): if not (crit1 <= value_to_check <= crit2): _LOG.error(f'`{keyname}` must be between {crit1} and {crit2} (input: {value_to_check}).') sys.exit(1) @@ -439,16 +439,18 @@ def log_configuration(self): if self.parallel_mode == 'size': _LOG.info(f'Splitting reference into chunks.') - _LOG.info(f' - splitting input into size {self.size}') + _LOG.info(f' - splitting input into size {self.parallel_block_size}') elif self.parallel_mode == 'contig': _LOG.info(f'Splitting input by contig.') - if not self.cleanup_splits or self.reuse_splits: + if self.reuse_splits: splits_dir = Path(f'{self.output_dir}/splits/') - if splits_dir.is_dir(): + if not splits_dir.is_dir(): + raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}") _LOG.info(f'Reusing existing splits {splits_dir}.') - else: - _LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits') - _LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.') + _LOG.info(f'Preserving splits for next run in directory {splits_dir}.') + elif not self.cleanup_splits: + splits_dir = Path(f'{self.output_dir}/splits/') + _LOG.info(f'Preserving splits for next run in directory {splits_dir}.') else: splits_dir = self.temp_dir_path / "splits" diff --git a/neat/read_simulator/utils/split_inputs.py b/neat/read_simulator/utils/split_inputs.py index 82f63cef..ad3a42f9 100644 --- a/neat/read_simulator/utils/split_inputs.py +++ b/neat/read_simulator/utils/split_inputs.py @@ -67,7 +67,7 @@ def main(options: Options, reference_index: dict) -> tuple[dict, int]: # We'll keep track of chunks by contig, to help us out later split_fasta_dict: dict[str, dict[tuple[int, int], Path]] = {key: {} for key in reference_index.keys()} for contig, seq_record in reference_index.items(): - if options.mode == "contig": + if options.parallel_mode == "contig": stem = f"{idx:0{pad}d}__{contig}" fa = options.splits_dir / f"{stem}.fa.gz" write_fasta(contig, seq_record.seq.upper(), fa) @@ -75,7 +75,7 @@ def main(options: Options, reference_index: dict) -> tuple[dict, int]: idx += 1 written += 1 else: - for start, subseq in chunk_record(seq_record.seq.upper(), options.size, overlap): + for start, subseq in chunk_record(seq_record.seq.upper(), options.parallel_block_size, overlap): stem = f"{idx:0{pad}d}__{contig}" fa = options.splits_dir / f"{stem}.fa.gz" write_fasta(contig, subseq, fa) diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py index 07805497..9dce056a 100644 --- a/tests/test_cli/test_cli.py +++ b/tests/test_cli/test_cli.py @@ -9,6 +9,21 @@ from neat.cli.cli import Cli, main +def _write_min_cfg(tmp_path: Path) -> Path: + """ + Write a minimal config file for read-simulator. + + These CLI tests validate argument handling, logging, and return codes, + not the full simulation behavior, but read-simulator still requires -c. + """ + ref = tmp_path / "ref.fa" + ref.write_text(">chr1\nACGT\n", encoding="utf-8") + + cfg = tmp_path / "conf.yml" + cfg.write_text(f"reference: {ref}\nproduce_fastq: true\n", encoding="utf-8") + return cfg + + def test_cli_registers_read_simulator_subcommand(): cli = Cli() # Argparse stores subparsers in a private map; ensure our command is registered @@ -50,10 +65,15 @@ def test_logging_creates_named_log_file_and_announces(monkeypatch, tmp_path: Pat lambda *args, **kwargs: None, ) + cfg = _write_min_cfg(tmp_path) + rc = main(cli.parser, [ "--log-name", str(logname), # Supply a benign subcommand with minimal required args - "read-simulator", "-o", str(tmp_path), "-p", "pref" + "read-simulator", + "-c", str(cfg), + "-o", str(tmp_path), + "-p", "pref", ]) out = capsys.readouterr().out # main should create/log the file path and return 0 (success) @@ -68,13 +88,12 @@ def test_read_simulator_success_invokes_runner(monkeypatch, tmp_path: Path): called = {} def fake_runner(cfg, outdir, prefix): - called['args'] = (cfg, outdir, prefix) + called["args"] = (cfg, outdir, prefix) # Patch runner used by command monkeypatch.setattr("neat.cli.commands.read_simulator.read_simulator_runner", fake_runner) - cfg = tmp_path / "conf.yml" - cfg.write_text("reference: ''\n", encoding="utf-8") # minimal content; not validated here + cfg = _write_min_cfg(tmp_path) rc = main(cli.parser, [ "--no-log", @@ -85,7 +104,7 @@ def fake_runner(cfg, outdir, prefix): ]) assert rc == 0 - assert called['args'] == (str(cfg), str(tmp_path), "myprefix") + assert called["args"] == (str(cfg), str(tmp_path), "myprefix") def test_read_simulator_failure_returns_1_and_prints_error(monkeypatch, tmp_path: Path, capsys): @@ -94,11 +113,15 @@ def test_read_simulator_failure_returns_1_and_prints_error(monkeypatch, tmp_path def boom(*args, **kwargs): raise RuntimeError("kaboom") - monkeypatch.setattr("neat.read_simulator.read_simulator_runner", boom) + # Patch the runner symbol used by the read-simulator command + monkeypatch.setattr("neat.cli.commands.read_simulator.read_simulator_runner", boom) + + cfg = _write_min_cfg(tmp_path) rc = main(cli.parser, [ "--no-log", "read-simulator", + "-c", str(cfg), "-o", str(tmp_path), "-p", "x", ]) diff --git a/tests/test_read_simulator/test_options.py b/tests/test_read_simulator/test_options.py index a5528ab4..8f9c1e8a 100644 --- a/tests/test_read_simulator/test_options.py +++ b/tests/test_read_simulator/test_options.py @@ -1,6 +1,7 @@ from neat.read_simulator.utils.options import Options from pathlib import Path as _PathAlias +import logging as _logging import numpy as _np import textwrap as _textwrap import pytest as _pytest @@ -10,8 +11,42 @@ def _project_root() -> _PathAlias: return _PathAlias(__file__).resolve().parents[2] -# Redefine the function name used above to override the brittle test -# so pytest only sees this correct version. +@_pytest.fixture(autouse=True) +def _isolate_neat_logging(): + """ + Prevent flaky 'ValueError: I/O operation on closed file' logging errors under pytest. + """ + # Clear handlers on NEAT and all child loggers + for name, logger in list(_logging.Logger.manager.loggerDict.items()): + if name == "neat" or name.startswith("neat."): + if isinstance(logger, _logging.Logger): + for h in list(logger.handlers): + logger.removeHandler(h) + try: + h.close() + except Exception: + pass + logger.handlers.clear() + logger.propagate = True # child loggers will propagate to 'neat' + + neat_logger = _logging.getLogger("neat") + neat_logger.handlers.clear() + neat_logger.addHandler(_logging.NullHandler()) + neat_logger.propagate = False # stop at 'neat' (do not reach root) + + yield + + # Rremove NullHandler + for h in list(neat_logger.handlers): + neat_logger.removeHandler(h) + try: + h.close() + except Exception: + pass + neat_logger.handlers.clear() + neat_logger.propagate = True + + def test_basic_options(): reference = _project_root() / "data" / "H1N1.fa" base_options = Options(reference) @@ -57,7 +92,6 @@ def test_rng_seed_reproducible(): def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): - # Build a minimal YAML config using repository-relative paths cfg = _textwrap.dedent( f""" reference: {(_project_root() / 'data' / 'H1N1.fa').as_posix()} @@ -76,8 +110,8 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): rng_seed: 42 overwrite_output: true - mode: contig - size: 500000 + parallel_mode: size + parallel_block_size: 500000 threads: 2 cleanup_splits: false reuse_splits: false @@ -92,14 +126,12 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): opts = Options.from_cli(outdir, "fromcli", yml_path) - # Basics propagated assert opts.reference == _project_root() / "data" / "H1N1.fa" assert opts.read_len == 75 assert opts.coverage == 5 assert opts.ploidy == 2 assert opts.rng_seed == 42 - # Output construction via log_configuration() inside from_cli assert opts.output_dir == outdir assert opts.output_prefix == "fromcli" assert opts.fq1 == outdir / "fromcli.fastq.gz" @@ -107,9 +139,7 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): assert opts.bam is None assert opts.vcf is None - # Parallel-related settings assert opts.threads == 2 - # cleanup_splits: false -> splits dir under output_dir assert opts.splits_dir == outdir / "splits" assert opts.splits_dir.is_dir() @@ -132,7 +162,7 @@ def test_from_cli_paired_end_fragments(tmp_path: _PathAlias): rng_seed: 7 overwrite_output: true - mode: contig + parallel_mode: contig threads: 1 cleanup_splits: true reuse_splits: false @@ -162,6 +192,8 @@ def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias): produce_bam: false produce_vcf: false threads: 4 + parallel_mode: size + parallel_block_size: 500000 cleanup_splits: true reuse_splits: true overwrite_output: true @@ -174,6 +206,5 @@ def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias): outdir = tmp_path / "out" outdir.mkdir(parents=True, exist_ok=True) - options = Options.from_cli(outdir, "reuse", yml_path) - # should issue a warning but continue in this case - assert options.reuse_splits == True + with _pytest.raises(FileNotFoundError, match=r"reuse_splits=True"): + Options.from_cli(outdir, "reuse", yml_path) diff --git a/version.py b/version.py new file mode 100644 index 00000000..4b48e9d3 --- /dev/null +++ b/version.py @@ -0,0 +1,10 @@ +from importlib.metadata import PackageNotFoundError, version as _version + +def neat_version() -> str: + """ + Return NEAT's package version. + """ + try: + return _version("neat") + except PackageNotFoundError: + return "unknown" From 55541ffa1e1e2282c450e6acd2d62a81eb704e4f Mon Sep 17 00:00:00 2001 From: Keshav Date: Mon, 2 Feb 2026 16:40:50 +0100 Subject: [PATCH 05/13] Installation instructions. --- README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c9dc286c..3193c9e4 100755 --- a/README.md +++ b/README.md @@ -81,26 +81,24 @@ $ git clone git@github.com:ncsa/NEAT.git $ cd NEAT ``` -A quick form of installation uses `bioconda`. Once `conda` is installed, the following command can be run for easy setup. -In the NEAT repo, at the base level is the environment.yml file you will need. Change directories into the neat repository -and run: +A quick form of installation uses `bioconda`. You must run these commands inside the NEAT project directory. ```bash -(base) $ conda env create -f environment.yml +(base) $ conda create -n neat -c conda-forge -c bioconda neat (base) $ conda activate neat -(neat) $ poetry install (neat) $ neat --help # tests that NEAT has installed correctly ``` Alternatively, instead of the `bioconda` method, you can use the `poetry` module in build a wheel file, which can then be `pip` installed. -You will need to run these commands from within the NEAT directory: +Once `conda` is installed, the following command can be run for easy setup. +In the NEAT repository, at the base level is the `environment.yml` file you will need. Change directories into the NEAT repository +and run: ```bash (base) $ conda env create -f environment.yml (base) $ conda activate neat -(neat) $ poetry build -(neat) $ pip install dist/neat*whl +(neat) $ poetry install (neat) $ neat --help # tests that NEAT has installed correctly ``` From 5a4d347cbef4a6e4e1aeed209a6e4d111ba180b0 Mon Sep 17 00:00:00 2001 From: Keshav Date: Mon, 23 Feb 2026 20:15:28 +0100 Subject: [PATCH 06/13] Markov quality score model changes copied in here. --- neat/cli/commands/model_qual_score.py | 87 ++++ .../1764714781.403029_NEAT.log | 2 + neat/model_quality_score/__init__.py | 9 + neat/model_quality_score/runner.py | 236 ++++++++++ neat/models/__init__.py | 1 + neat/models/markov_quality_model.py | 168 +++++++ neat/models/quality_score_model.py | 426 ------------------ neat/quality_score_modeling/__init__.py | 7 + neat/quality_score_modeling/markov_utils.py | 226 ++++++++++ tests/test_models/test_qual_score_models.py | 80 ++++ 10 files changed, 816 insertions(+), 426 deletions(-) create mode 100644 neat/cli/commands/model_qual_score.py create mode 100644 neat/model_quality_score/1764714781.403029_NEAT.log create mode 100644 neat/model_quality_score/__init__.py create mode 100644 neat/model_quality_score/runner.py create mode 100644 neat/models/markov_quality_model.py delete mode 100644 neat/models/quality_score_model.py create mode 100644 neat/quality_score_modeling/__init__.py create mode 100644 neat/quality_score_modeling/markov_utils.py create mode 100644 tests/test_models/test_qual_score_models.py diff --git a/neat/cli/commands/model_qual_score.py b/neat/cli/commands/model_qual_score.py new file mode 100644 index 00000000..c94152f0 --- /dev/null +++ b/neat/cli/commands/model_qual_score.py @@ -0,0 +1,87 @@ +import argparse + +from ...model_quality_score import model_qual_score_runner +from .base import BaseCommand +from .options import output_group + + +class Command(BaseCommand): + """ + Generate a quality score model (traditional or Markov) from FASTQ. + """ + + name = "model-qual-score" + description = "Generate quality score model from FASTQ (optional Markov chain)." + + def add_arguments(self, parser: argparse.ArgumentParser): + + parser.add_argument( + "-i", + dest="input_files", + metavar="FILE", + nargs="+", + required=True, + help="Input FASTQ file(s) (gzipped or plain).", + ) + + parser.add_argument( + "-q", + dest="quality_offset", + type=int, + default=33, + help="Quality score offset [33].", + ) + + parser.add_argument( + "-Q", + dest="quality_scores", + type=int, + nargs="+", + default=[42], + help="Max quality or explicit list of quality scores [42].", + ) + + parser.add_argument( + "-m", + dest="max_num", + type=int, + default=-1, + help="Max number of reads to process [-1 = all].", + ) + + parser.add_argument( + "--markov", + dest="use_markov", + action="store_true", + default=False, + help="Use Markov quality model instead of the traditional model.", + ) + + parser.add_argument( + "--overwrite", + dest="overwrite", + action="store_true", + default=False, + help="Overwrite existing output file if present.", + ) + + output_group.add_to_parser(parser) + + def execute(self, arguments: argparse.Namespace): + + if len(arguments.quality_scores) == 1: + qual_scores: int | list[int] = arguments.quality_scores[0] + + else: + qual_scores = arguments.quality_scores + + model_qual_score_runner( + files=arguments.input_files, + offset=arguments.quality_offset, + qual_scores=qual_scores, + max_reads=arguments.max_num, + overwrite=arguments.overwrite, + output_dir=arguments.output_dir, + output_prefix=arguments.prefix, + use_markov=arguments.use_markov, + ) \ No newline at end of file diff --git a/neat/model_quality_score/1764714781.403029_NEAT.log b/neat/model_quality_score/1764714781.403029_NEAT.log new file mode 100644 index 00000000..f9b4ac75 --- /dev/null +++ b/neat/model_quality_score/1764714781.403029_NEAT.log @@ -0,0 +1,2 @@ +2025-12-02 23:33:02,707:INFO:neat.common.logging:writing log to: /Users/keshavgandhi/PycharmProjects/NEAT/neat/model_quality_score/1764714781.403029_NEAT.log +2025-12-02 23:33:02,707:ERROR:neat.common.io:Path 'data/baby.fastq' does not exist or not a file diff --git a/neat/model_quality_score/__init__.py b/neat/model_quality_score/__init__.py new file mode 100644 index 00000000..a6b92eb9 --- /dev/null +++ b/neat/model_quality_score/__init__.py @@ -0,0 +1,9 @@ +""" +Submodule to build quality score models for NEAT that wraps NEAT’s +existing sequencing error model and traditional quality model, with the +option to construct a Markov chain–based quality model instead +""" + +__all__ = ["model_qual_score_runner"] + +from .runner import model_qual_score_runner \ No newline at end of file diff --git a/neat/model_quality_score/runner.py b/neat/model_quality_score/runner.py new file mode 100644 index 00000000..c30dbbd9 --- /dev/null +++ b/neat/model_quality_score/runner.py @@ -0,0 +1,236 @@ +""" +Runner for creating quality score models. + +This module implements the core logic for generating quality score models +from input FASTQ files. + +The resulting models are saved as a gzip-compressed pickle file. +""" + +import gzip +import pickle +import logging +from pathlib import Path +from typing import Iterable, List, Optional, Tuple + +import numpy as np + +from ..common import validate_input_path, validate_output_path +from ..models import SequencingErrorModel, TraditionalQualityModel +from ..model_sequencing_error.utils import parse_file +from ..models.markov_quality_model import MarkovQualityModel +from ..quality_score_modeling.markov_utils import build_markov_model + +__all__ = ["model_qual_score_runner"] + +_LOG = logging.getLogger(__name__) + + +def _prepare_quality_scores_argument( + qual_scores: Iterable[int | float], +) -> Tuple[List[int], bool]: + """Normalize the quality scores argument to a list of ints.""" + + if isinstance(qual_scores, int): + max_q = int(qual_scores) + return list(range(0, max_q + 1)), False + + sorted_scores = sorted(int(x) for x in qual_scores) + max_q = max(sorted_scores) + + return list(range(0, max_q + 1)), True + + +def model_qual_score_runner( + files: List[str], + offset: int, + qual_scores: Iterable[int | float], + max_reads: int, + use_markov: bool, + overwrite: bool, + output_dir: str, + output_prefix: str, +) -> None: + """Create and save a quality score model from FASTQ data. + + Parameters + ---------- + files: + A list of FASTQ file names. + offset: + Quality score offset (e.g. 33 for Sanger encoding). + qual_scores: + Either a single integer specifying the maximum quality score or an + iterable of integers specifying the allowed quality scores (binned + model). + max_reads: + Maximum number of reads to process from each input file. A value of + ``-1`` or ``float('inf')`` processes all reads. + use_markov: + If ``True``, construct a Markov chain–based quality model. If + ``False``, only the traditional quality model is produced. + overwrite: + If ``True``, allow an existing output file to be overwritten. + output_dir: + Directory to write the output model file. + output_prefix: + Prefix to use for the output file name. The file will be named + ``{output_prefix}.p.gz`` in the ``output_dir``. + """ + + if len(files) > 2: + _LOG.info("Only processing the first two input files") + files = files[:2] + + # Validate input paths + for file in files: + validate_input_path(file) + + _LOG.debug("Input files: %s", ", ".join(str(x) for x in files)) + _LOG.debug("Quality offset: %d", offset) + + final_quality_scores, binned = _prepare_quality_scores_argument(qual_scores) + _LOG.debug("Quality scores: %s", final_quality_scores) + + # Determine maximum number of reads to process + if max_reads in (-1, None): + num_records_to_process = float("inf") + + else: + num_records_to_process = max_reads + _LOG.debug( + "Maximum number of records to process: %s", + "all" if num_records_to_process == float("inf") else num_records_to_process, + ) + + # Validate output directory and file + validate_output_path(output_dir, is_file=False) + output_path = Path(output_dir) + output_file = output_path / f"{output_prefix}.p.gz" + validate_output_path(output_file, overwrite=overwrite) + _LOG.info("Writing output to: %s", output_file) + + # Containers for per-file quality model parameters + read_parameters: List[np.ndarray] = [] + average_errors: List[float] = [] + read_length = 0 + + # Collect traditional model parameters using existing NEAT utilities + file_num = 0 + + for file in files: + + file_num += 1 + _LOG.info("Reading file %d of %d", file_num, len(files)) + + params_by_position, file_avg_error, read_length = parse_file( + file, + final_quality_scores, + num_records_to_process, + offset, + read_length, + ) + + read_parameters.append(params_by_position) + average_errors.append(file_avg_error) + _LOG.info("Finished reading file %d", file_num) + + if not read_parameters: + raise RuntimeError( + "No quality score parameters were computed; check input FASTQ files." + ) + + average_error = float(np.average(average_errors)) if average_errors else 0.0 + _LOG.info("Average sequencing error across files: %f", average_error) + + # Prepare models for each input file + models: List[Tuple[SequencingErrorModel, TraditionalQualityModel, Optional[MarkovQualityModel]]] = [] + + for idx in range(len(read_parameters)): + + # Sequencing error model (always produced) + seq_err_model = SequencingErrorModel( + avg_seq_error=average_error, + read_length=read_length, + ) + + # Traditional quality model (always produced) + trad_model = TraditionalQualityModel( + average_error=average_error, + quality_scores=np.array(final_quality_scores), + qual_score_probs=read_parameters[idx], + ) + + # Optionally build Markov quality model + markov_model: Optional[MarkovQualityModel] = None + + if use_markov: + try: + ( + init_dist, + pos_dists, + max_quality, + train_read_length, + ) = build_markov_model( + [files[idx]], + num_records_to_process, + offset, + ) + + markov_model = MarkovQualityModel( + initial_distribution=init_dist, + position_distributions=pos_dists, + max_quality=max_quality, + read_length=train_read_length, + ) + + except Exception as exc: + _LOG.error( + "Failed to construct Markov model for %s: %s", + files[idx], + exc, + ) + raise + + models.append((seq_err_model, trad_model, markov_model)) + + _LOG.debug("Constructed %d model(s)", len(models)) + + # Write out the models + with gzip.open(output_file, "wb") as out_model: + if not models: + raise RuntimeError( + "Internal error: no models constructed, so there is nothing to write." + ) + + if len(models) == 1: + seq_err1, trad1, markov1 = models[0] + pickle.dump( + { + "error_model1": seq_err1, + "error_model2": None, + "qual_score_model1": markov1 if use_markov else trad1, + "qual_score_model2": None, + }, + out_model, + ) + + elif len(models) == 2: + (seq_err1, trad1, markov1), (seq_err2, trad2, markov2) = models + pickle.dump( + { + "error_model1": seq_err1, + "error_model2": seq_err2, + "qual_score_model1": markov1 if use_markov else trad1, + "qual_score_model2": markov2 if use_markov else trad2, + }, + out_model, + ) + + else: + # NEAT's read simulator only understands one or two models + raise RuntimeError( + f"Expected at most two quality models, but constructed {len(models)}." + ) + + _LOG.info("Quality score model saved to %s", output_file) \ No newline at end of file diff --git a/neat/models/__init__.py b/neat/models/__init__.py index 5a63fe16..7fb80f1b 100644 --- a/neat/models/__init__.py +++ b/neat/models/__init__.py @@ -1,3 +1,4 @@ from .error_models import * from .mutation_model import * from .fragment_model import * +from .markov_quality_model import * \ No newline at end of file diff --git a/neat/models/markov_quality_model.py b/neat/models/markov_quality_model.py new file mode 100644 index 00000000..ba8050e7 --- /dev/null +++ b/neat/models/markov_quality_model.py @@ -0,0 +1,168 @@ +""" +Empirical position-specific quality score model for NEAT. + +The model consists of an initial distribution giving the probability of observing each +quality score at the first position of a read and a sequence of per-position empirical +distributions giving the probability of observing each quality score at each subsequent +position. + +At generation time, the model samples a quality score independently at +each position from the corresponding empirical distribution. + +Although this model exists outside of ``neat.models.error_models``, it +parallels the ``TraditionalQualityModel`` class from NEAT’s core and can +be used interchangeably when constructing quality models. +""" + +from dataclasses import dataclass +from typing import Dict, List + +import numpy as np + +__all__ = ["MarkovQualityModel"] + +@dataclass +class MarkovQualityModel: + """Empirical, position-specific quality score model. + + Parameters + ---------- + initial_distribution: + A mapping from integer quality scores to their observed + probabilities at position 0. + position_distributions: + A list of mappings from integer quality scores to their observed + probabilities at each position. Element ``i`` in the list + corresponds to position ``i`` in the read. + max_quality: + The maximum quality observed in the training data. Generated + qualities will be clipped to not exceed the maximum quality. + read_length: + The length of reads used during training determines how many + position-specific distributions exist. + """ + + initial_distribution: Dict[int, float] + position_distributions: List[Dict[int, float]] + max_quality: int + read_length: int + + def __post_init__(self) -> None: + + # Normalize the initial distribution + total_init = float(sum(self.initial_distribution.values())) + + if total_init <= 0: + raise ValueError( + "Initial distribution must have positive probability mass." + ) + + self.initial_distribution = { + int(k): float(v) / total_init + for k, v in self.initial_distribution.items() + } + + # Normalize each position-specific distribution + norm_positions: List[Dict[int, float]] = [] + + for dist in self.position_distributions: + total = float(sum(dist.values())) + + if total <= 0: + # Fall back to a single mass at quality 0 + norm_positions.append({0: 1.0}) + continue + + norm_positions.append( + {int(k): float(v) / total for k, v in dist.items()} + ) + + self.position_distributions = norm_positions + + # Ensure max_quality and read_length are integers + self.max_quality = int(self.max_quality) + self.read_length = int(self.read_length) + + # Precompute numpy arrays for fast sampling + init_keys = list(self.initial_distribution.keys()) + init_vals = [self.initial_distribution[k] for k in init_keys] + self._init_scores = np.asarray(init_keys, dtype=int) + self._init_probs = np.asarray(init_vals, dtype=float) + + self._values_by_pos: List[np.ndarray] = [] + self._probs_by_pos: List[np.ndarray] = [] + + for dist in self.position_distributions: + keys = list(dist.keys()) + vals = [dist[k] for k in keys] + self._values_by_pos.append(np.asarray(keys, dtype=int)) + self._probs_by_pos.append(np.asarray(vals, dtype=float)) + + @property + def quality_scores(self) -> List[int]: + """List of supported quality scores.""" + + return list(range(0, self.max_quality + 1)) + + def _position_index_for_length(self, pos: int, length: int) -> int: + """Map a position in a generated read onto a training position.""" + + if length <= 1 or self.read_length <= 1: + return 0 + + idx = int(round((pos / max(1, length - 1)) * (self.read_length - 1))) + + if idx < 0: + idx = 0 + + elif idx > self.read_length - 1: + idx = self.read_length - 1 + + return idx + + def get_quality_scores( + self, + model_read_length: int, + length: int, + rng: np.random.Generator, + ) -> np.ndarray: + """Generate a synthetic quality score array using this empirical model. + + Parameters + ---------- + model_read_length: + The length of reads used during training. + length: + Desired length of the returned quality array. + rng: + A :class:`numpy.random.Generator` instance used for sampling. + + Returns + ------- + numpy.ndarray + An array of integers representing quality scores. + """ + if length <= 0: + return np.zeros(0, dtype=int) + + qualities = np.zeros(length, dtype=int) + + # Sample initial quality from the starting distribution + qualities[0] = rng.choice(self._init_scores, p=self._init_probs) + + # Sample each subsequent position independently from its empirical per-position distribution + for i in range(1, length): + p_idx = self._position_index_for_length(i, length) + vals = self._values_by_pos[p_idx] + probs = self._probs_by_pos[p_idx] + q = int(rng.choice(vals, p=probs)) + + # Clip to valid range + if q < 0: + q = 0 + elif q > self.max_quality: + q = self.max_quality + + qualities[i] = q + + return qualities \ No newline at end of file diff --git a/neat/models/quality_score_model.py b/neat/models/quality_score_model.py deleted file mode 100644 index 2c52987b..00000000 --- a/neat/models/quality_score_model.py +++ /dev/null @@ -1,426 +0,0 @@ -import pysam -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from matplotlib.patches import Patch -import seaborn as sns -from scipy.stats import ttest_ind, ttest_rel, f_oneway, norm, levene, shapiro -from sklearn.utils import resample -import pathlib -import pickle - - -def make_qual_score_list(bam_file): - """Takes an input BAM file and creates lists of quality scores. This becomes a data frame, which will be - pre-processed for Markov chain analysis.""" - - index = f"{bam_file}.bai" - - if not pathlib.Path(index).exists(): - print("No index found, creating one.") - - pysam.index(bam_file) - - file_to_parse = pysam.AlignmentFile(bam_file, "rb", check_sq=False) - num_recs = file_to_parse.count() - print(f"{num_recs} records to parse") - - modulo = round(num_recs / 9) - - qual_list = [] - i = 0 - j = 0 - - def print_update(number, factor, percent): - - if number % factor == 0: - percent += 10 - print(f"{percent}% complete", end="\r") - - return percent - - print("Parsing file") - - for item in file_to_parse.fetch(): - - if item.is_unmapped or "S" in item.cigarstring: - i += 1 - j = print_update(i, modulo, j) - - continue - - # mapping quality scores - - align_qual = item.query_alignment_qualities - - # append to master lists - - qual_list.append(align_qual) - i += 1 - j = print_update(i, modulo, j) - - print(f"100% complete") - file_to_parse.close() - - quality_df = pd.DataFrame(qual_list) # turn list of lists into a dataframe - quality_df = quality_df.fillna(0) # pre-processing - fill in missing data - - # filters to process outliers - - quality_df[quality_df > 40] = 40 - quality_df[quality_df < 0] = 0 - - return quality_df - - -def estimate_transition_probabilities(std_dev): - """Takes a standard deviation as an input and generates the transition probabilities with a normal - distribution that can be used to represent a Markov process.""" - - # define the probabilities for transition states based on a normal distribution - - transition_probs = { - -3: norm.pdf(-3, 0, std_dev), - -2: norm.pdf(-2, 0, std_dev), - -1: norm.pdf(-1, 0, std_dev), - 0: norm.pdf(0, 0, std_dev), - 1: norm.pdf(1, 0, std_dev), - 2: norm.pdf(2, 0, std_dev), - 3: norm.pdf(3, 0, std_dev), - } - - # normalize the probabilities to sum to 1 - - total_prob = sum(transition_probs.values()) - - for k in transition_probs: - transition_probs[k] /= total_prob - - return transition_probs - - -def apply_markov_chain(quality_df, noise_level=10, std_dev=2): - """Takes a data frame representing quality scores by position along a read and parameters to increase - variability in the Markov process as inputs and generates predictions based on an ergodic Markov chain. - Generates a data frame with simulated reads.""" - - transition_probs = estimate_transition_probabilities(std_dev) - num_rows, num_cols = quality_df.shape - - count = 0 - markov_preds = [] - - for row in quality_df.iterrows(): - - qualities = row[1].values - pred_qualities = np.zeros_like(qualities) - pred_qualities[0] = qualities[0] # initial state - - print(count, ":", qualities) - row_mean = np.mean(qualities) - row_median = np.median(qualities) - row_std = np.std(qualities) - - for i in range(1, len(quality_df.columns)): - prev_quality = pred_qualities[i - 1] - transitions = list(transition_probs.keys()) - probabilities = list(transition_probs.values()) - next_quality = np.random.choice(transitions, p=probabilities) - - pred_qualities[i] = max(0, prev_quality + next_quality) # ensuring no negative qualities - - # the noise parameter prevents long stretches of the predicted quality scores being very similar - - pred_qualities[i] += np.random.normal(0, noise_level) # add some noise to the predictions - pred_qualities[i] = min(max(pred_qualities[i], 0), 40) # finalize range - - print(count, "mean:", row_mean, "median:", row_median, "st dev:", row_std) - count += 1 - - for i in range(1, len(quality_df.columns)): - - if pred_qualities[i] < row_mean - 2 * row_std: - - if np.random.rand() < 0.95: # 95% chance to substitute abnormally low quality scores - - # uses median and standard deviation from read (not the mean because of outliers) - - new_quality = np.random.normal(row_median, row_std) - pred_qualities[i] = min(max(new_quality, 0), 40) - - # the maximum predicted quality score should be derived from the data - - max_quality = np.max(qualities) - pred_qualities = np.clip(pred_qualities, 0, max_quality) - - markov_preds.append(pred_qualities) - - # randomly sample 30% of the total quality scores for a given read to have the maximum value - - num_samples = int(0.3 * len(quality_df.columns)) # TO DO: make this a parameter - sample_indices = np.random.choice(len(quality_df.columns), num_samples, replace=False) - pred_qualities[sample_indices] = max_quality - - markov_preds_df = pd.DataFrame(markov_preds) - - # apply final linear transformations - - edge_len = int(len(quality_df.columns) * 0.01) - # mid_start = int(len(quality_df.columns) * 0.40) - # mid_end = int(len(quality_df.columns) * 0.60) - - markov_preds_df.iloc[:, :edge_len] -= 5 - markov_preds_df.iloc[:, :edge_len] = markov_preds_df.iloc[:, :edge_len].clip(lower=0) - - markov_preds_df.iloc[:, -edge_len:] -= 5 - markov_preds_df.iloc[:, -edge_len:] = markov_preds_df.iloc[:, -edge_len:].clip(lower=0) - - # markov_preds_df.iloc[:, mid_start:mid_end] += 2 - - return markov_preds_df - - -def plot_heatmap(y_preds_df, file_path): - """Takes a dataframe of predicted quality scores and plots a seaborn heatmap to visualize them.""" - - sns.heatmap(y_preds_df, vmin=0, vmax=max(markov_preds_df.max()), cmap="viridis") - plt.savefig(file_path) - print("Heatmap plotted") - - -def save_file(df, csv_file_path, pickle_file_path): - """Saves the dataframe to a CSV file and a pickle file.""" - - df.to_csv(csv_file_path, index=False) - - with open(pickle_file_path, "wb") as f: - pickle.dump(df, f) - - print(f"Data saved to {csv_file_path} and {pickle_file_path}") - - -def compare_quality_scores(quality_df, markov_preds_df): - """Compares the means and variances of quality scores within each row of the original and predicted data frames.""" - - mean_p_values = [] - variance_p_values = [] - - for i in range(len(quality_df)): - original = quality_df.iloc[i].values - predicted = markov_preds_df.iloc[i].values - - original_mean = np.mean(original) - predicted_mean = np.mean(predicted) - - # print the pairs of means - - print(f'Row {i}: Original Mean = {original_mean}, Predicted Mean = {predicted_mean}') - - # test for equality of means - - t_stat, mean_p_value = ttest_ind(original, predicted, equal_var=False) - mean_p_values.append(mean_p_value) - - # test for equality of variances - - stat, variance_p_value = levene(original, predicted) - variance_p_values.append(variance_p_value) - - return mean_p_values, variance_p_values - - -def plot_comparison_results(mean_p_values, variance_p_values): - """Plots the comparison results for means and variances of quality scores.""" - - rows = range(len(mean_p_values)) - - fig, ax = plt.subplots(1, 2, figsize=(12, 6)) - - ax[0].scatter(rows, mean_p_values, marker='o') - ax[0].set_title('P-values for Equality of Means') - ax[0].set_xlabel('Row Index') - ax[0].set_ylabel('P-value') - - ax[1].scatter(rows, variance_p_values, marker='o') - ax[1].set_title('P-values for Equality of Variances') - ax[1].set_xlabel('Row Index') - ax[1].set_ylabel('P-value') - - plt.tight_layout() - plt.show() - - -def calculate_row_means(df): - return df.mean(axis=1) - - -def calculate_row_variances(df): - return df.var(axis=1) - - -def bootstrap_p_values(list1, list2, n_bootstrap=1000): - bootstrapped_p_values = [] - - for _ in range(n_bootstrap): - sample1 = resample(list1, replace=True) - sample2 = resample(list2, replace=True) - - t_stat, p_value = ttest_ind(sample1, sample2, equal_var=False) - bootstrapped_p_values.append(p_value) - - return bootstrapped_p_values - - -def permutation_test(list1, list2, n_permutations=10000): - observed_diff = abs(np.mean(list1) - np.mean(list2)) - combined = np.concatenate([list1, list2]) - perm_diffs = np.zeros(n_permutations) - - for i in range(n_permutations): - np.random.shuffle(combined) - perm_list1 = combined[:len(list1)] - perm_list2 = combined[len(list1):] - perm_diffs[i] = abs(np.mean(perm_list1) - np.mean(perm_list2)) - - p_value = np.sum(perm_diffs >= observed_diff) / n_permutations - return p_value - - -def test_normality(quality_df, markov_preds_df): - """Tests normality of quality scores within each row using the Shapiro-Wilk test.""" - - quality_df_normality = [] - markov_preds_df_normality = [] - - for i in range(len(quality_df)): - original = quality_df.iloc[i].values - predicted = markov_preds_df.iloc[i].values - - # test normality for the original data - - stat, p_value = shapiro(original) - quality_df_normality.append(p_value > 0.05) - - # test normality for the predicted data - - stat, p_value = shapiro(predicted) - markov_preds_df_normality.append(p_value > 0.05) - - return quality_df_normality, markov_preds_df_normality - - -def plot_normality_results(quality_df_normality, markov_preds_df_normality): - """Plots the normality results for quality scores in side by side heatmaps.""" - - fig, ax = plt.subplots(1, 2, figsize=(12, 6)) - - quality_df_normality = np.array(quality_df_normality).reshape(-1, 1) - markov_preds_df_normality = np.array(markov_preds_df_normality).reshape(-1, 1) - - # create a color map for the heatmap - - cmap = sns.color_palette(["orange", "blue"]) - - sns.heatmap(quality_df_normality, ax=ax[0], cbar=False, cmap=cmap) - ax[0].set_title('Normality of Original Quality Scores') - ax[0].set_xlabel('Quality Scores') - ax[0].set_ylabel('Row Index') - - sns.heatmap(markov_preds_df_normality, ax=ax[1], cbar=False, cmap=cmap) - ax[1].set_title('Normality of Predicted Quality Scores') - ax[1].set_xlabel('Quality Scores') - - legend_elements = [Patch(facecolor='orange', edgecolor='black', label='Non-normal'), - Patch(facecolor='blue', edgecolor='black', label='Normal')] - fig.legend(handles=legend_elements, loc='lower right', title='Distribution') - - plt.tight_layout() - plt.show() - - -# example usage - -# bam_file = "/Users/keshavgandhi/Downloads/H1N1.bam" - -bam_file = "/Users/keshavgandhi/Downloads/subsample_3.125.bam" -quality_df = make_qual_score_list(bam_file) - -markov_preds_df = apply_markov_chain(quality_df) - -# plot_heatmap(markov_preds_df, 'markov_chain_heatmap.svg') -# save_to_csv_and_pickle(markov_preds_df, 'markov_preds.csv', 'markov_preds.pickle') - -sns.heatmap(quality_df, vmin=0, vmax=max(quality_df.max()), cmap='viridis') -sns.heatmap(markov_preds_df, vmin=0, vmax=max(markov_preds_df.max()), cmap='viridis') - -# markov_preds_df - -# for i in range (1, max(markov_preds_df)): -# print(max(markov_preds_df[i])) - -# quality_df.iloc[100][25:75] -# markov_preds_df.iloc[100][25:75] - -bam_file = "/Users/keshavgandhi/Downloads/H1N1.bam" -test_df = make_qual_score_list(bam_file) -markov_preds_df = apply_markov_chain(test_df) - -# compare quality scores - -mean_p_values, variance_p_values = compare_quality_scores(test_df, markov_preds_df) - -markov_means = calculate_row_means(markov_preds_df).tolist() -quality_means = calculate_row_means(test_df).tolist() - -markov_variances = calculate_row_variances(markov_preds_df).tolist() -quality_variances = calculate_row_variances(test_df).tolist() - -# perform permutation test - -permutation_p_value_means = permutation_test(markov_means, quality_means) - -# perform two-sample t-test - -t_stat_means, ttest_p_value_means = ttest_ind(markov_means, quality_means, equal_var=False) - -# bootstrap analysis for means - -bootstrapped_p_values_means = bootstrap_p_values(markov_means, quality_means) -mean_bootstrap_p_value_means = np.mean(bootstrapped_p_values_means) -std_bootstrap_p_value_means = np.std(bootstrapped_p_values_means) - -print(f'Permutation test p-value (means): {permutation_p_value_means}') -print(f'Two-sample t-test p-value (means): {ttest_p_value_means}') -print(f'Bootstrap mean p-value (means): {mean_bootstrap_p_value_means}') -print(f'Bootstrap p-value standard deviation (means): {std_bootstrap_p_value_means}') - -# perform permutation test for variances - -permutation_p_value_variances = permutation_test(markov_variances, quality_variances) - -# perform two-sample t-test for variances - -t_stat_variances, ttest_p_value_variances = ttest_ind(markov_variances, quality_variances, equal_var=False) - -# perform bootstrap analysis for variances - -bootstrapped_p_values_variances = bootstrap_p_values(markov_variances, quality_variances) -mean_bootstrap_p_value_variances = np.mean(bootstrapped_p_values_variances) -std_bootstrap_p_value_variances = np.std(bootstrapped_p_values_variances) - -print(f'Permutation test p-value (variances): {permutation_p_value_variances}') -print(f'Two-sample t-test p-value (variances): {ttest_p_value_variances}') -print(f'Bootstrap mean p-value (variances): {mean_bootstrap_p_value_variances}') -print(f'Bootstrap p-value standard deviation (variances): {std_bootstrap_p_value_variances}') - -# plot comparison results - -plot_comparison_results(mean_p_values, variance_p_values) - -# test normality - -quality_df_normality_test, markov_preds_df_normality_test = test_normality(test_df, markov_preds_df) - -# plot normality results - -plot_normality_results(quality_df_normality_test, markov_preds_df_normality_test) diff --git a/neat/quality_score_modeling/__init__.py b/neat/quality_score_modeling/__init__.py new file mode 100644 index 00000000..3e110be4 --- /dev/null +++ b/neat/quality_score_modeling/__init__.py @@ -0,0 +1,7 @@ +""" +Load quality score modeling utilities for NEAT +""" + +__all__ = ["build_markov_model"] + +from .markov_utils import build_markov_model \ No newline at end of file diff --git a/neat/quality_score_modeling/markov_utils.py b/neat/quality_score_modeling/markov_utils.py new file mode 100644 index 00000000..3288fd15 --- /dev/null +++ b/neat/quality_score_modeling/markov_utils.py @@ -0,0 +1,226 @@ +""" +Utility functions for building an empirical quality score model. + +The functions in this module extract per-read quality scores from FASTQ files +and compute the statistics required by the ``MarkovQualityModel``. +""" + +import logging +from collections import defaultdict +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +from ..model_sequencing_error.utils import convert_quality_string +from ..common import open_input + +_LOG = logging.getLogger(__name__) + +__all__ = [ + "read_quality_lists", + "compute_initial_distribution", + "compute_position_distributions", + "build_markov_model", +] + + +def read_quality_lists( + files: Iterable[str], + max_reads: int, + offset: int, +) -> Tuple[List[List[int]], int]: + """Read per-read quality scores from one or more FASTQ files. + + Parameters + ---------- + files: + An iterable of FASTQ filenames. Only the first ``max_reads`` reads + will be consumed from each file. If a file contains fewer than + ``max_reads`` reads, the entire file is processed. + max_reads: + Maximum number of reads per file to consume. A value of ``-1`` + or ``float('inf')`` indicates that all reads in the file should be + processed. + offset: + Numeric quality score offset (usually 33 for Sanger encoding). + + Returns + ------- + Tuple[List[List[int]], int] + A tuple containing the list of quality score lists and the read + length inferred from the first record. If no reads are found this + function returns an empty list and zero. + """ + + qualities: List[List[int]] = [] + read_length = 0 + + for file in files: + + path = Path(file) + if not path.exists(): + raise FileNotFoundError(f"Input FASTQ file not found: {file}") + + reads_to_parse = max_reads + + if reads_to_parse in (None, -1): + reads_to_parse = float("inf") + + reads_read = 0 + + with open_input(file) as fq_in: + + while reads_read < reads_to_parse: + + # FASTQ format: four lines per record (header, sequence, plus, quality) + header = fq_in.readline() + + if not header: + break # end of file + + seq = fq_in.readline() + plus = fq_in.readline() + qual = fq_in.readline() + + if not qual: + break # truncated file + + quality_scores = convert_quality_string(qual.strip(), offset) + + if not read_length: + read_length = len(quality_scores) + + # Skip reads that do not match the inferred read length + if len(quality_scores) != read_length: + _LOG.debug( + "Skipping record of length %d (expected %d)", + len(quality_scores), + read_length, + ) + continue + + qualities.append(quality_scores) + reads_read += 1 + + return qualities, read_length + + +def compute_initial_distribution(qualities: Iterable[List[int]]) -> Dict[int, float]: + """Compute the empirical distribution of first quality scores. + + Parameters + ---------- + qualities: + An iterable of quality score lists. The first element of each list + will be used to populate the distribution. If a list is empty, it + contributes nothing. + + Returns + ------- + Dict[int, float] + A dictionary mapping quality scores to counts. + """ + + counts: Dict[int, float] = defaultdict(float) + + for qlist in qualities: + if not qlist: + continue + + counts[qlist[0]] += 1.0 + + return dict(counts) + + +def compute_position_distributions( + qualities: Iterable[List[int]], + read_length: int, +) -> List[Dict[int, float]]: + """Compute per-position empirical distributions of quality scores. + + Parameters + ---------- + qualities: + An iterable of quality score lists. All lists are expected to have + length ``read_length``. Any that do not are ignored. + read_length: + Length of reads used for training. + + Returns + ------- + List[Dict[int, float]] + A list of length ``read_length``. Element ``i`` is a dictionary + mapping quality scores to counts at position ``i``. + """ + + if read_length <= 0: + return [] + + # One histogram per position + histograms: List[Dict[int, float]] = [ + defaultdict(float) for _ in range(read_length) + ] + + for qlist in qualities: + + if len(qlist) != read_length: + continue + + for i, q in enumerate(qlist): + histograms[i][int(q)] += 1.0 + + # Convert default dicts to plain dicts + return [dict(h) for h in histograms] + + +def build_markov_model( + files: Iterable[str], + max_reads: int, + offset: int, +) -> Tuple[Dict[int, float], List[Dict[int, float]], int, int]: + """Build distributions and max quality for the empirical quality model. + + This is a convenience wrapper around :func:`read_quality_lists`, + :func:`compute_initial_distribution`, and + :func:`compute_position_distributions`. It returns the empirical + distributions and the maximum observed quality, along with the read + length used for training. + + Parameters + ---------- + files: + An iterable of FASTQ filenames. + max_reads: + Maximum number of reads per file to consume. ``-1`` or + ``float('inf')`` indicates that all reads should be processed. + offset: + Numeric quality score offset (usually 33 for Sanger encoding). + + Returns + ------- + Tuple[Dict[int, float], List[Dict[int, float]], int, int] + A tuple ``(initial_distribution, position_distributions, + max_quality, read_length)``. Distributions are returned as raw + counts; the caller is responsible for normalisation. + """ + + qualities, read_length = read_quality_lists(files, max_reads, offset) + + if not qualities: + raise ValueError("No quality scores could be read from the input files.") + + init_dist = compute_initial_distribution(qualities) + pos_dists = compute_position_distributions(qualities, read_length) + + # Determine maximum observed quality + max_quality = 0 + + for qlist in qualities: + if not qlist: + continue + + mq = max(qlist) + + if mq > max_quality: + max_quality = mq + + return init_dist, pos_dists, max_quality, read_length \ No newline at end of file diff --git a/tests/test_models/test_qual_score_models.py b/tests/test_models/test_qual_score_models.py new file mode 100644 index 00000000..cd9c021a --- /dev/null +++ b/tests/test_models/test_qual_score_models.py @@ -0,0 +1,80 @@ +""" +Unit tests for MarkovQualityModel +""" + +import pytest +import numpy as np +from numpy.random import default_rng + +from neat.models.markov_quality_model import MarkovQualityModel + + +def test_markov_quality_model_shapes_and_range(): + """ + Basic sanity check for MarkovQualityModel: output array shape and bounds. + """ + rng = default_rng(11) + # Simple symmetric distributions around a high-quality region + init_dist = {30: 1.0, 31: 1.0, 32: 1.0} + single_pos_dist = {-1: 1.0, 0: 2.0, 1: 1.0} + max_q = 42 + read_length = 151 + # Position-specific distributions that reuse the same shape at every position + position_distributions = [single_pos_dist] * read_length + qm = MarkovQualityModel(init_dist, position_distributions, max_q, read_length) + qs = qm.get_quality_scores(model_read_length=read_length, length=75, rng=rng) + assert isinstance(qs, np.ndarray) + assert len(qs) == 75 + # Ensure we stay within the valid range + assert qs.min() >= 0 + assert qs.max() <= max_q + + +def test_markov_quality_model_quality_scores_property_matches_range(): + """ + quality_scores should expose the full discrete range [0, max_quality]. + """ + init_dist = {35: 1.0} + single_pos_dist = {0: 1.0} + max_q = 40 + read_length = 151 + position_distributions = [single_pos_dist] * read_length + qm = MarkovQualityModel(init_dist, position_distributions, max_q, read_length) + scores = qm.quality_scores + assert isinstance(scores, list) + assert scores[0] == 0 + assert scores[-1] == max_q + # The range should be contiguous + assert scores == list(range(0, max_q + 1)) + + +def test_markov_quality_model_reproducible_with_seed(): + """Markov quality model should be deterministic for a fixed RNG state.""" + init_dist = {30: 1.0, 31: 1.0} + single_pos_dist = {-1: 1.0, 0: 2.0, 1: 1.0} + max_q = 42 + read_length = 151 + position_distributions = [single_pos_dist] * read_length + qm = MarkovQualityModel(init_dist, position_distributions, max_q, read_length) + rng1 = default_rng(12) + rng2 = default_rng(12) + qs1 = qm.get_quality_scores(model_read_length=read_length, length=100, rng=rng1) + qs2 = qm.get_quality_scores(model_read_length=read_length, length=100, rng=rng2) + assert isinstance(qs1, np.ndarray) + assert isinstance(qs2, np.ndarray) + assert np.array_equal(qs1, qs2) + + +def test_markov_quality_model_invalid_initial_distribution_raises(): + """ + The model should reject empty or zero-mass initial distributions. + """ + read_length = 151 + single_pos_dist = {0: 1.0} + position_distributions = [single_pos_dist] * read_length + # Empty initial distribution + with pytest.raises(ValueError): + MarkovQualityModel({}, position_distributions, 40, read_length) + # Zero total mass in initial distribution + with pytest.raises(ValueError): + MarkovQualityModel({30: 0.0}, position_distributions, 40, read_length) \ No newline at end of file From b0ad46aa64ce17582b852d515c3504d3585d744c Mon Sep 17 00:00:00 2001 From: Keshav Date: Fri, 27 Feb 2026 16:15:09 +0100 Subject: [PATCH 07/13] README updates including Markov stuff. --- README.md | 139 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 1bcf291a..28e9d27a 100755 --- a/README.md +++ b/README.md @@ -3,17 +3,18 @@ Welcome to the NEAT project, the NExt-generation sequencing Analysis Toolkit, version 4.3.5. This release of NEAT 4.3.5 includes several fixes and a little bit of restructuring, including a parallel process for running `neat read-simulator`. Our tests show much improved performance. If the logs seem excessive, you might try using the `--log-level ERROR` to reduce the output from the logs. See the [ChangeLog](ChangeLog.md) for notes. NEAT 4.3.5 is the official release of NEAT 4.0. It represents a lot of hard work from several contributors at NCSA and beyond. With the addition of parallel processing, we feel that the code is ready for production, and future releases will focus on compatibility, bug fixes, and testing. Future releases for the time being will be enumerations of 4.3.X. ## NEAT v4.3.5 -Neat 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to you run, simply add the "threads" parameter in your configuration and run read-simulator as normal. NEAT will take care of the rest. You can customize the parameters in you configuration file, as needed. + +NEAT 4.3.5 marked the officially 'complete' version of NEAT 4.3, implementing parallelization. To add parallelization to your run, simply add the `threads` parameter in your configuration file and run `read-simulator` as normal. NEAT will take care of the rest. You can customize the parameters in your configuration file, as needed. We have completed major revisions on NEAT since 3.4 and consider NEAT 4.3.5 to be a stable release, in that we will continue to update and provide bug fixes and support. We will consider new features and pull requests. Please include justification for major changes. See [contribute](CONTRIBUTING.md) for more information. If you'd like to use some of our code in your own, no problem! Just review the [license](LICENSE.md), first. -We've deprecated NEAT's command-line interface options for the most part, opting to simplify things with configuration files. If you require the CLI for legacy purposes, NEAT 3.4 was our last release to be fully command-line interface. Please convert your CLI commands to the corresponding yaml configuration for future runs. +We've deprecated NEAT's command-line interface options for the most part, opting to simplify things with configuration files. If you require the CLI for legacy purposes, NEAT 3.4 was our last release to be fully supported via command-line interface. Please convert your CLI commands to the corresponding configuration file for future runs. ### Statement of Need Developing and validating bioinformatics pipelines depends on access to genomic data with known ground truth. As a result, many research groups rely on simulated reads, and it can be useful to vary the parameters of the sequencing process itself. NEAT addresses this need as an open-source Python package that can integrate seamlessly with existing bioinformatics workflows—its simulations account for a wide range of sequencing parameters (e.g., coverage, fragment length, sequencing error models, mutational frequencies, ploidy, etc.) and allow users to customize their sequencing data. -NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads, but it handles long-read simulation as well and is adaptable to any machine, with custom error models and the capability to handle single-base substitutions and indel errors. Unlike many simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. +NEAT is a fine-grained read simulator that simulates real-looking data using models learned from specific datasets. It was originally designed to simulate short reads and is adaptable to different machines, with custom error models and the capability to handle single-base substitutions, indel errors, and other types of mutations. Unlike simulators that rely solely on fixed error profiles, NEAT can learn empirical mutation and sequencing models from real datasets and use these models to generate realistic sequencing data, providing outputs in several common file formats (e.g., FASTQ, BAM, and VCF). There are several supporting utilities for generating models used for simulation and for comparing the outputs of alignment and variant calling to the golden BAM and golden VCF produced by NEAT. To cite this work, please use: @@ -40,7 +41,10 @@ To cite this work, please use: * [`neat model-fraglen`](#neat-model-fraglen) * [`neat gen-mut-model`](#neat-gen-mut-model) * [`neat model-seq-err`](#neat-model-seq-err) + * [`neat model-qual-score`](#neat-model-qual-score) * [`neat vcf_compare`](#neat-vcf_compare) + * [Tests](#tests) + * [Guide to run locally](#guide-to-run-locally) * [Note on Sensitive Patient Data](#note-on-sensitive-patient-data) ## Prerequisites @@ -77,32 +81,30 @@ $ git clone git@github.com:ncsa/NEAT.git $ cd NEAT ``` -A quick form of installation uses `bioconda`. Once `conda` is installed, the following command can be run for easy setup. -In the NEAT repo, at the base level is the environment.yml file you will need. Change directories into the neat repository -and run: +A quick form of installation uses `bioconda`. You must run these commands inside the NEAT project directory. ```bash -(base) $ conda env create -f environment.yml +(base) $ conda create -n neat -c conda-forge -c bioconda neat (base) $ conda activate neat -(neat) $ poetry install (neat) $ neat --help # tests that NEAT has installed correctly ``` Alternatively, instead of the `bioconda` method, you can use the `poetry` module in build a wheel file, which can then be `pip` installed. -You will need to run these commands from within the NEAT directory: +Once `conda` is installed, the following command can be run for easy setup. +In the NEAT repository, at the base level is the `environment.yml` file you will need. Change directories into the NEAT repository +and run: ```bash (base) $ conda env create -f environment.yml (base) $ conda activate neat -(neat) $ poetry build -(neat) $ pip install dist/neat*whl +(neat) $ poetry install (neat) $ neat --help # tests that NEAT has installed correctly ``` Assuming you have installed `conda`, run `source activate` or `conda activate`. -Please note that these installation instructions support MacOS, Windows, and Linux. However, if you are on MacOS, you need to remove the line `libgcc=14` from `environment.yml`. A solution for some non-Linux users is simple to remove the version specification (e.g., `libgcc`). +Please note that these installation instructions support MacOS, Windows, and Linux. Alternatively, if you wish to work with NEAT in the development-only environment, you can use `poetry install` within the NEAT repo, after creating the `conda` environment: @@ -156,42 +158,50 @@ description of the potential inputs in the config file. See `NEAT/config_templat To run the simulator in multithreaded mode, set the `threads` value in the config to something greater than 1. -`reference`: full path to a fasta file to generate reads from. -`read_len`: The length of the reads for the fastq (if using). _Integer value, default 101._ -`coverage`: desired coverage value. _Float or integer, default = 10._ -`ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ -`paired_ended`: If paired-ended reads are desired, set this to True. Setting this to true requires either entering values for fragment_mean and fragment_st_dev or entering the path to a valid fragment_model. -`fragment_mean`: Use with paired-ended reads, set a fragment length mean manually -`fragment_st_dev`: Use with paired-ended reads, set the standard deviation of the fragment length dataset +`reference`: Full path to a FASTA file to generate reads from. + +`read_len`: The length of the reads for the FASTQ (if using). _Integer value, default 101._ + +`coverage`: Desired coverage value. _Float or integer, default = 10._ + +`ploidy`: Desired value for ploidy (# of copies of each chromosome in the organism, where if ploidy > 2, "heterozygous" mutates floor(ploidy / 2) chromosomes). _Default is 2._ + +`paired_ended`: If paired-ended reads are desired, set this to `True`. Setting this to `True` requires either entering values for `fragment_mean` and `fragment_st_dev` or entering the path to a valid `fragment_model`. + +`fragment_mean`: Use with paired-ended reads, setting a fragment length mean manually. + +`fragment_st_dev`: Use with paired-ended reads, setting the standard deviation of the fragment length dataset. + +The following values can be set to `True` or omitted to use defaults. If `True`, NEAT will produce the file type. -The following values can be set to true or omitted to use defaults. If True, NEAT will produce the file type. The default is given: -`produce_bam`: False -`produce_vcf`: False -`produce_fastq`: True - - -| Parameter | Description | -|---------------------|-------------| -| `error_model` | Full path to an error model generated by NEAT. Leave empty to use default model (default model based on human, sequenced by Illumina). | -| `mutation_model` | Full path to a mutation model generated by NEAT. Leave empty to use a default model (default model based on human data sequenced by Illumina). | -| `fragment_model` | Full path to fragment length model generated by NEAT. Leave empty to use default model (default model based on human data sequenced by Illumina). | -| `threads` | The number of threads for NEAT to use. Increasing the number will speed up read generation. | -| `avg_seq_error` | Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. Float between 0 and 0.3. Default is set by the error model. | -| `rescale_qualities` | Rescale the quality scores to reflect the avg_seq_error rate above. Set True to activate if you notice issues with the sequencing error rates in your dataset. | -| `include_vcf` | Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding fastq and bam files, if requested. | -| `target_bed` | Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. | -| `discard_bed` | Full path to a list of regions to discard, in BED format. | -| `mutation_rate` | Desired rate of mutation for the dataset. Float between 0.0 and 0.3 (default is determined by the mutation model). | -| `mutation_bed` | Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column as, e.g., mut_rate=0.00. | -| `rng_seed` | Manually enter a seed for the random number generator. Used for repeating runs. Must be an integer. | -| `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. | -| `threads` | Number of threads to use. More than 1 will use multithreading parallelism to speed up processing. | -| `mode` | 'size' or 'contig' whether to divide the contigs into blocks or just by contig. By contig is the default, try by size. Varying the size parameter may help if default values are not sufficient. | -| `size` | Default value of 500,000. | -| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to False, and splits files will be deleted at the end of the run. | -| `reuse_splits` | If an existing splits file exists in the output folder, it will use those splits, if this value is set to True. | +`produce_bam`: `False` +`produce_vcf`: `False` +`produce_fastq`: `True` + +More parameters are below: + +| Parameter | Description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `error_model` | Full path to an error model or quality score model generated by NEAT. Leave empty to use default model (default model based on human, sequenced by Illumina). | +| `mutation_model` | Full path to a mutation model generated by NEAT. Leave empty to use a default model (default model based on human data sequenced by Illumina). | +| `fragment_model` | Full path to fragment length model generated by NEAT. Leave empty to use default model (default model based on human data sequenced by Illumina). | +| `threads` | The number of threads for NEAT to use. Increasing the number will speed up read generation. | +| `avg_seq_error` | Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. Float between 0 and 0.3. Default is set by the error model. | +| `rescale_qualities` | Rescale the quality scores to reflect the `avg_seq_error` rate above. Set `True` to activate if you notice issues with the sequencing error rates in your dataset. | +| `include_vcf` | Full path to list of variants in VCF format to include in the simulation. These will be inserted as they appear in the input VCF into the final VCF, and the corresponding FASTQ and BAM files, if requested. | +| `target_bed` | Full path to list of regions in BED format to target. All areas outside these regions will have coverage of 0. | +| `discard_bed` | Full path to a list of regions to discard, in BED format. | +| `mutation_rate` | Desired rate of mutation for the dataset. Float between 0.0 and 0.3 (default is determined by the mutation model). | +| `mutation_bed` | Full path to a list of regions with a column describing the mutation rate of that region, as a float with values between 0 and 0.3. The mutation rate must be in the third column as, e.g., `mut_rate`=0.00. | +| `rng_seed` | Manually enter a seed for the random number generator. Used for repeating runs. Must be an integer. | +| `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. | +| `threads` | Number of threads to use. More than 1 will use multi-threading to speed up processing. | +| `mode` | `size` or `contig` whether to divide the contigs into blocks or just by contig. By `contig` is the default, but division by `size` may speed up your run. | +| `size` | Default value of 500,000. | +| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. | +| `reuse_splits` | If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`. | The command line options for NEAT are as follows: @@ -205,7 +215,7 @@ Universal options can be applied to any subfunction. The commands should come be | --log-detail VALUE | VALUE must be one of [LOW, MEDIUM, HIGH] - how much info to write for each log record | | --silent-mode | Writes logs, but suppresses stdout messages | -read-simulator command line options +`read-simulator` command line options | Option | Description | |---------------------|-------------------------------------| | -c VALUE, --config VALUE | The VALUE should be the name of the config file to use for this run | @@ -229,7 +239,7 @@ Features: - Can accurately simulate large, single-end reads with high indel error rates (PacBio-like) given a model - Specify simple fragment length model with mean and standard deviation or an empirically learned fragment distribution - Simulates quality scores using either the default model or empirically learned quality scores using `neat gen_mut_model` -- Introduces sequencing substitution errors using either the default model or empirically learned from utilities/ +- Introduces sequencing substitution errors using either the default model or empirically learned in `utilities` - Output a VCF file with the 'golden' set of true positive variants. These can be compared to bioinformatics workflow output (includes coverage and allele balance information) - Output a BAM file with the 'golden' set of aligned reads. These indicate where each read originated and how it should be aligned with the reference - Create paired tumour/normal datasets using characteristics learned from real tumour data @@ -282,7 +292,7 @@ Here we enabled NEAT’s parallelized mode (“small filtering”), which splits | *S. cerevisiae* | 12,310,392 | 139,148 | 2.3191 | | Honeybee | 228,091,137 | 3,040,336 | 50.6723 | | Rice | 394,543,607 | 4,335,126 | 72.2521 | -| *Miscanthus* | 2,718,242,062 | 24876744 | 414.6 | +| *Miscanthus* | 2,718,242,062 | 24,876,744 | 414.6 | For mid-sized genomes (e.g., *E. coli* and *S. cerevisiae*), enabling parallelization reduced runtimes by roughly a factor of two to three compared to the base configuration. For larger genomes (honeybee and rice), the parallel configuration may make multi-hour simulations feasible. @@ -409,7 +419,7 @@ neat read-simulator \ Several scripts are distributed with `gen_reads` that are used to generate the models used for simulation. -## `neat model-fraglen` +### `neat model-fraglen` Computes empirical fragment length distribution from sample paired-end data. NEAT uses the template length (tlen) attribute calculated from paired-ended alignments to generate summary statistics for fragment lengths, which can be input into NEAT. @@ -421,7 +431,7 @@ Computes empirical fragment length distribution from sample paired-end data. NEA and creates `fraglen.pickle.gz` model in working directory. -## `neat gen-mut-model` +### `neat gen-mut-model` Takes reference genome and VCF file to generate mutation models: @@ -440,7 +450,7 @@ Trinucleotides are identified in the reference genome and the variant file. The | --human-sample | Use to skip unnumbered scaffolds in human references | | --skip-common | Do not save common snps or high mutation areas | -## `neat model-seq-err` +### `neat model-seq-err` Generates sequencing error model for NEAT. @@ -477,7 +487,28 @@ neat model-seq-err \ Please note that `-i2` can be used in place of `-i` to produce paired data. -## `neat vcf_compare` +### `neat model-qual-score` + +Typical usage: + +```bash +neat model-qual-score \ + -i input_reads.fastq(.gz) \ + -q 33 \ + -Q 42 \ + -m 1000000 \ + --markov \ + -o /path/to/models \ + -p my_qual_model +``` + +Similarly, use `-i2` to produce a model for paired-ended data. `-q` denotes the quality score offset, while `-Q` is the maximum quality score. + +`-m` denotes the maximum number of reads to process. Use a large number or input -1 to use all reads. `--markov` fits a quality model from the input data using a Markov chain process instead of the baseline quality score model (optional). + +Finally, `-o` is the output directory for the model file and `-p` is the prefix for the output model, such that the file will be written as `.p.gz` inside the output folder. + +### `neat vcf_compare` Tool for comparing VCF files (Not yet implemented in NEAT 4.3.5). @@ -504,7 +535,7 @@ neat vcf_compare We provide unit tests (e.g., mutation and sequencing error models) and basic integration tests for the CLI. -### Run locally +### Guide to run locally ```bash conda env create -f environment.yml conda activate neat @@ -515,4 +546,4 @@ pytest -q tests Please see `CONTRIBUTING.md` for more information and further instructions. ### Note on Sensitive Patient Data -ICGC's "Access Controlled Data" documentation can be found at https://docs.icgc.org/portal/access/. To have access to controlled germline data, a DACO must be submitted. Open tier data can be obtained without a DACO, but germline alleles that do not match the reference genome are masked and replaced with the reference allele. Controlled data includes unmasked germline alleles. +ICGC's "Access Controlled Data" documentation can be found at https://docs.icgc.org/portal/access/. To have access to controlled germline data, a DACO must be submitted. Open tier data can be obtained without a DACO, but germline alleles that do not match the reference genome are masked and replaced with the reference allele. Controlled data includes unmasked germline alleles. \ No newline at end of file From af8ba09b968ecddbf4f4971b0e3bd130d55b6081 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sat, 28 Feb 2026 13:45:29 +0100 Subject: [PATCH 08/13] Removing accidental log file. --- neat/model_quality_score/1764714781.403029_NEAT.log | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 neat/model_quality_score/1764714781.403029_NEAT.log diff --git a/neat/model_quality_score/1764714781.403029_NEAT.log b/neat/model_quality_score/1764714781.403029_NEAT.log deleted file mode 100644 index f9b4ac75..00000000 --- a/neat/model_quality_score/1764714781.403029_NEAT.log +++ /dev/null @@ -1,2 +0,0 @@ -2025-12-02 23:33:02,707:INFO:neat.common.logging:writing log to: /Users/keshavgandhi/PycharmProjects/NEAT/neat/model_quality_score/1764714781.403029_NEAT.log -2025-12-02 23:33:02,707:ERROR:neat.common.io:Path 'data/baby.fastq' does not exist or not a file From e94e785986ca8bc7b2f2c88e1d11954f74339e98 Mon Sep 17 00:00:00 2001 From: Keshav Date: Thu, 5 Mar 2026 03:45:03 +0100 Subject: [PATCH 09/13] Transition matrices better accounted for. --- neat/model_quality_score/runner.py | 152 +++++--------- neat/models/markov_quality_model.py | 121 +++++++---- neat/quality_score_modeling/markov_utils.py | 218 ++++++++++---------- 3 files changed, 249 insertions(+), 242 deletions(-) diff --git a/neat/model_quality_score/runner.py b/neat/model_quality_score/runner.py index c30dbbd9..179a659b 100644 --- a/neat/model_quality_score/runner.py +++ b/neat/model_quality_score/runner.py @@ -3,8 +3,6 @@ This module implements the core logic for generating quality score models from input FASTQ files. - -The resulting models are saved as a gzip-compressed pickle file. """ import gzip @@ -16,8 +14,8 @@ import numpy as np from ..common import validate_input_path, validate_output_path -from ..models import SequencingErrorModel, TraditionalQualityModel from ..model_sequencing_error.utils import parse_file +from ..models import SequencingErrorModel, TraditionalQualityModel from ..models.markov_quality_model import MarkovQualityModel from ..quality_score_modeling.markov_utils import build_markov_model @@ -27,56 +25,40 @@ def _prepare_quality_scores_argument( - qual_scores: Iterable[int | float], -) -> Tuple[List[int], bool]: - """Normalize the quality scores argument to a list of ints.""" + qual_scores: int | Iterable[int | float], +) -> Tuple[List[int], Optional[List[int]]]: + """ + Returns: + - full_range_scores: required by parse_file indexing + - allowed_bins: None (no binning) or sorted unique user bins (for Markov binning) + """ if isinstance(qual_scores, int): max_q = int(qual_scores) - return list(range(0, max_q + 1)), False + return list(range(0, max_q + 1)), None + + bins = sorted({int(x) for x in qual_scores}) - sorted_scores = sorted(int(x) for x in qual_scores) - max_q = max(sorted_scores) + if not bins: + raise ValueError("quality_scores list must not be empty.") - return list(range(0, max_q + 1)), True + max_q = max(bins) + + # parse_file requires indices up to max_q (it indexes by raw Q value) + return list(range(0, max_q + 1)), bins def model_qual_score_runner( files: List[str], offset: int, - qual_scores: Iterable[int | float], + qual_scores: int | Iterable[int | float], max_reads: int, use_markov: bool, overwrite: bool, output_dir: str, output_prefix: str, ) -> None: - """Create and save a quality score model from FASTQ data. - - Parameters - ---------- - files: - A list of FASTQ file names. - offset: - Quality score offset (e.g. 33 for Sanger encoding). - qual_scores: - Either a single integer specifying the maximum quality score or an - iterable of integers specifying the allowed quality scores (binned - model). - max_reads: - Maximum number of reads to process from each input file. A value of - ``-1`` or ``float('inf')`` processes all reads. - use_markov: - If ``True``, construct a Markov chain–based quality model. If - ``False``, only the traditional quality model is produced. - overwrite: - If ``True``, allow an existing output file to be overwritten. - output_dir: - Directory to write the output model file. - output_prefix: - Prefix to use for the output file name. The file will be named - ``{output_prefix}.p.gz`` in the ``output_dir``. - """ + """Create and save a quality score model from FASTQ data.""" if len(files) > 2: _LOG.info("Only processing the first two input files") @@ -89,25 +71,23 @@ def model_qual_score_runner( _LOG.debug("Input files: %s", ", ".join(str(x) for x in files)) _LOG.debug("Quality offset: %d", offset) - final_quality_scores, binned = _prepare_quality_scores_argument(qual_scores) - _LOG.debug("Quality scores: %s", final_quality_scores) + final_quality_scores, allowed_bins = _prepare_quality_scores_argument(qual_scores) + _LOG.debug("Quality scores range: %s", final_quality_scores) + + if allowed_bins is not None: + _LOG.debug("Markov binning enabled with bins: %s", allowed_bins) - # Determine maximum number of reads to process if max_reads in (-1, None): num_records_to_process = float("inf") - else: num_records_to_process = max_reads - _LOG.debug( - "Maximum number of records to process: %s", - "all" if num_records_to_process == float("inf") else num_records_to_process, - ) # Validate output directory and file validate_output_path(output_dir, is_file=False) output_path = Path(output_dir) output_file = output_path / f"{output_prefix}.p.gz" validate_output_path(output_file, overwrite=overwrite) + _LOG.info("Writing output to: %s", output_file) # Containers for per-file quality model parameters @@ -115,13 +95,10 @@ def model_qual_score_runner( average_errors: List[float] = [] read_length = 0 - # Collect traditional model parameters using existing NEAT utilities - file_num = 0 + # Traditional model parameters (existing NEAT utility) + for idx_file, file in enumerate(files, start=1): - for file in files: - - file_num += 1 - _LOG.info("Reading file %d of %d", file_num, len(files)) + _LOG.info("Reading file %d of %d", idx_file, len(files)) params_by_position, file_avg_error, read_length = parse_file( file, @@ -133,12 +110,11 @@ def model_qual_score_runner( read_parameters.append(params_by_position) average_errors.append(file_avg_error) - _LOG.info("Finished reading file %d", file_num) + + _LOG.info("Finished reading file %d", idx_file) if not read_parameters: - raise RuntimeError( - "No quality score parameters were computed; check input FASTQ files." - ) + raise RuntimeError("No quality score parameters were computed. Check input FASTQ files.") average_error = float(np.average(average_errors)) if average_errors else 0.0 _LOG.info("Average sequencing error across files: %f", average_error) @@ -149,10 +125,7 @@ def model_qual_score_runner( for idx in range(len(read_parameters)): # Sequencing error model (always produced) - seq_err_model = SequencingErrorModel( - avg_seq_error=average_error, - read_length=read_length, - ) + seq_err_model = SequencingErrorModel(avg_seq_error=average_error, read_length=read_length) # Traditional quality model (always produced) trad_model = TraditionalQualityModel( @@ -161,49 +134,36 @@ def model_qual_score_runner( qual_score_probs=read_parameters[idx], ) - # Optionally build Markov quality model markov_model: Optional[MarkovQualityModel] = None + # Optionally build Markov quality model + if use_markov: - try: - ( - init_dist, - pos_dists, - max_quality, - train_read_length, - ) = build_markov_model( - [files[idx]], - num_records_to_process, - offset, - ) - - markov_model = MarkovQualityModel( - initial_distribution=init_dist, - position_distributions=pos_dists, - max_quality=max_quality, - read_length=train_read_length, - ) - - except Exception as exc: - _LOG.error( - "Failed to construct Markov model for %s: %s", - files[idx], - exc, - ) - raise - models.append((seq_err_model, trad_model, markov_model)) + # Position-specific transition matrices + init_dist, pos_dists, trans_dists, max_quality, train_read_length = build_markov_model( + [files[idx]], + num_records_to_process, + offset, + allowed_quality_scores=allowed_bins, + ) - _LOG.debug("Constructed %d model(s)", len(models)) + markov_model = MarkovQualityModel( + initial_distribution=init_dist, + position_distributions=pos_dists, + max_quality=max_quality, + read_length=train_read_length, + transition_distributions=trans_dists, + ) + + models.append((seq_err_model, trad_model, markov_model)) # Write out the models + with gzip.open(output_file, "wb") as out_model: - if not models: - raise RuntimeError( - "Internal error: no models constructed, so there is nothing to write." - ) if len(models) == 1: + seq_err1, trad1, markov1 = models[0] pickle.dump( { @@ -216,6 +176,7 @@ def model_qual_score_runner( ) elif len(models) == 2: + (seq_err1, trad1, markov1), (seq_err2, trad2, markov2) = models pickle.dump( { @@ -228,9 +189,8 @@ def model_qual_score_runner( ) else: + # NEAT's read simulator only understands one or two models - raise RuntimeError( - f"Expected at most two quality models, but constructed {len(models)}." - ) + raise RuntimeError(f"Expected at most two quality models, but constructed {len(models)}.") - _LOG.info("Quality score model saved to %s", output_file) \ No newline at end of file + _LOG.info("Quality score model saved to %s", output_file) diff --git a/neat/models/markov_quality_model.py b/neat/models/markov_quality_model.py index ba8050e7..622f2831 100644 --- a/neat/models/markov_quality_model.py +++ b/neat/models/markov_quality_model.py @@ -1,21 +1,16 @@ """ -Empirical position-specific quality score model for NEAT. +Position-specific quality score model for NEAT. The model consists of an initial distribution giving the probability of observing each -quality score at the first position of a read and a sequence of per-position empirical -distributions giving the probability of observing each quality score at each subsequent -position. +quality score at the first position of a read, per-position marginals, and per-position +transition matrices. -At generation time, the model samples a quality score independently at -each position from the corresponding empirical distribution. - -Although this model exists outside of ``neat.models.error_models``, it -parallels the ``TraditionalQualityModel`` class from NEAT’s core and can -be used interchangeably when constructing quality models. +If transition_distributions is None, the model independently samples from +per-position marginals. """ from dataclasses import dataclass -from typing import Dict, List +from typing import Dict, List, Optional import numpy as np @@ -23,8 +18,7 @@ @dataclass class MarkovQualityModel: - """Empirical, position-specific quality score model. - + """ Parameters ---------- initial_distribution: @@ -40,12 +34,15 @@ class MarkovQualityModel: read_length: The length of reads used during training determines how many position-specific distributions exist. + transition_distributions: + Optional list trans[i][q_prev][q_next] = count/prob. """ initial_distribution: Dict[int, float] position_distributions: List[Dict[int, float]] max_quality: int read_length: int + transition_distributions: Optional[List[Dict[int, Dict[int, float]]]] = None def __post_init__(self) -> None: @@ -58,24 +55,20 @@ def __post_init__(self) -> None: ) self.initial_distribution = { - int(k): float(v) / total_init - for k, v in self.initial_distribution.items() + int(k): float(v) / total_init for k, v in self.initial_distribution.items() } - # Normalize each position-specific distribution + # Normalize per-position marginals norm_positions: List[Dict[int, float]] = [] for dist in self.position_distributions: total = float(sum(dist.values())) if total <= 0: - # Fall back to a single mass at quality 0 norm_positions.append({0: 1.0}) continue - norm_positions.append( - {int(k): float(v) / total for k, v in dist.items()} - ) + norm_positions.append({int(k): float(v) / total for k, v in dist.items()}) self.position_distributions = norm_positions @@ -98,6 +91,38 @@ def __post_init__(self) -> None: self._values_by_pos.append(np.asarray(keys, dtype=int)) self._probs_by_pos.append(np.asarray(vals, dtype=float)) + # Normalize and cache transitions + self._has_transitions = False + self._trans_values_by_pos: List[Dict[int, np.ndarray]] = [] + self._trans_probs_by_pos: List[Dict[int, np.ndarray]] = [] + + if self.transition_distributions is not None: + if len(self.transition_distributions) not in (0, max(0, self.read_length - 1)): + raise ValueError( + "transition_distributions must have length read_length-1 " + f"(expected {max(0, self.read_length - 1)}, got {len(self.transition_distributions)})." + ) + + # Normalize each row q_prev -> distribution over q_next + self._has_transitions = len(self.transition_distributions) > 0 + + for pos_trans in self.transition_distributions: + cache_vals: Dict[int, np.ndarray] = {} + cache_probs: Dict[int, np.ndarray] = {} + + for q_prev, next_map in pos_trans.items(): + total = float(sum(next_map.values())) + if total <= 0: + continue + + keys = [int(k) for k in next_map.keys()] + probs = [float(next_map[k]) / total for k in next_map.keys()] + cache_vals[int(q_prev)] = np.asarray(keys, dtype=int) + cache_probs[int(q_prev)] = np.asarray(probs, dtype=float) + + self._trans_values_by_pos.append(cache_vals) + self._trans_probs_by_pos.append(cache_probs) + @property def quality_scores(self) -> List[int]: """List of supported quality scores.""" @@ -126,36 +151,48 @@ def get_quality_scores( length: int, rng: np.random.Generator, ) -> np.ndarray: - """Generate a synthetic quality score array using this empirical model. - - Parameters - ---------- - model_read_length: - The length of reads used during training. - length: - Desired length of the returned quality array. - rng: - A :class:`numpy.random.Generator` instance used for sampling. - - Returns - ------- - numpy.ndarray - An array of integers representing quality scores. + """Generate a synthetic quality score array. + + If transitions are provided, use the Markov-based model. + + If transitions are not provided, use an empirical version of the model. """ + if length <= 0: return np.zeros(0, dtype=int) qualities = np.zeros(length, dtype=int) # Sample initial quality from the starting distribution - qualities[0] = rng.choice(self._init_scores, p=self._init_probs) + qualities[0] = int(rng.choice(self._init_scores, p=self._init_probs)) - # Sample each subsequent position independently from its empirical per-position distribution for i in range(1, length): - p_idx = self._position_index_for_length(i, length) - vals = self._values_by_pos[p_idx] - probs = self._probs_by_pos[p_idx] - q = int(rng.choice(vals, p=probs)) + p_cur = self._position_index_for_length(i, length) + + q = None + + if self._has_transitions and self._trans_values_by_pos: + + p_prev = self._position_index_for_length(i - 1, length) + + # Transitions are defined for positions + p_prev = min(p_prev, len(self._trans_values_by_pos) - 1) + + q_prev = int(qualities[i - 1]) + vals_map = self._trans_values_by_pos[p_prev] + probs_map = self._trans_probs_by_pos[p_prev] + + if q_prev in vals_map: + vals = vals_map[q_prev] + probs = probs_map[q_prev] + q = int(rng.choice(vals, p=probs)) + + # Use marginal method if no transition row + if q is None: + p_idx = min(p_cur, len(self._values_by_pos) - 1) + vals = self._values_by_pos[p_idx] + probs = self._probs_by_pos[p_idx] + q = int(rng.choice(vals, p=probs)) # Clip to valid range if q < 0: @@ -165,4 +202,4 @@ def get_quality_scores( qualities[i] = q - return qualities \ No newline at end of file + return qualities diff --git a/neat/quality_score_modeling/markov_utils.py b/neat/quality_score_modeling/markov_utils.py index 3288fd15..4f22c743 100644 --- a/neat/quality_score_modeling/markov_utils.py +++ b/neat/quality_score_modeling/markov_utils.py @@ -1,17 +1,18 @@ """ -Utility functions for building an empirical quality score model. +Utility functions for building a position-specific Markov quality score model. The functions in this module extract per-read quality scores from FASTQ files -and compute the statistics required by the ``MarkovQualityModel``. +and compute the information required by the ``MarkovQualityModel``. """ import logging +from bisect import bisect_right from collections import defaultdict from pathlib import Path -from typing import Dict, Iterable, List, Tuple +from typing import Dict, Iterable, List, Optional, Tuple -from ..model_sequencing_error.utils import convert_quality_string from ..common import open_input +from ..model_sequencing_error.utils import convert_quality_string _LOG = logging.getLogger(__name__) @@ -19,38 +20,48 @@ "read_quality_lists", "compute_initial_distribution", "compute_position_distributions", + "compute_transition_distributions", "build_markov_model", ] +def _down_bin_quality(q: int, allowed: List[int]) -> int: + """ + Map q to the greatest allowed value <= q (down-binning). + If q is below the smallest allowed, map to allowed[0]. + """ + + if not allowed: + return int(q) + + q = int(q) + i = bisect_right(allowed, q) - 1 + + if i < 0: + return int(allowed[0]) + + return int(allowed[i]) + + def read_quality_lists( files: Iterable[str], max_reads: int, offset: int, + allowed_quality_scores: Optional[Iterable[int]] = None, ) -> Tuple[List[List[int]], int]: """Read per-read quality scores from one or more FASTQ files. - Parameters - ---------- - files: - An iterable of FASTQ filenames. Only the first ``max_reads`` reads - will be consumed from each file. If a file contains fewer than - ``max_reads`` reads, the entire file is processed. - max_reads: - Maximum number of reads per file to consume. A value of ``-1`` - or ``float('inf')`` indicates that all reads in the file should be - processed. - offset: - Numeric quality score offset (usually 33 for Sanger encoding). - - Returns - ------- - Tuple[List[List[int]], int] - A tuple containing the list of quality score lists and the read - length inferred from the first record. If no reads are found this - function returns an empty list and zero. + Returns (qualities, read_length). If no reads are found, returns ([], 0). """ + allowed_sorted: Optional[List[int]] = None + + if allowed_quality_scores is not None: + allowed_sorted = sorted({int(x) for x in allowed_quality_scores}) + + if not allowed_sorted: + allowed_sorted = None + qualities: List[List[int]] = [] read_length = 0 @@ -71,33 +82,36 @@ def read_quality_lists( while reads_read < reads_to_parse: - # FASTQ format: four lines per record (header, sequence, plus, quality) + # FASTQ format: four lines per record header = fq_in.readline() if not header: break # end of file - seq = fq_in.readline() - plus = fq_in.readline() + _ = fq_in.readline() # seq + _ = fq_in.readline() # plus qual = fq_in.readline() if not qual: - break # truncated file + break - quality_scores = convert_quality_string(qual.strip(), offset) + qlist = convert_quality_string(qual.strip(), offset) if not read_length: - read_length = len(quality_scores) + read_length = len(qlist) # Skip reads that do not match the inferred read length - if len(quality_scores) != read_length: + if len(qlist) != read_length: _LOG.debug( "Skipping record of length %d (expected %d)", - len(quality_scores), + len(qlist), read_length, ) continue + if allowed_sorted is not None: + qlist = [_down_bin_quality(q, allowed_sorted) for q in qlist] + qualities.append(quality_scores) reads_read += 1 @@ -105,26 +119,13 @@ def read_quality_lists( def compute_initial_distribution(qualities: Iterable[List[int]]) -> Dict[int, float]: - """Compute the empirical distribution of first quality scores. - - Parameters - ---------- - qualities: - An iterable of quality score lists. The first element of each list - will be used to populate the distribution. If a list is empty, it - contributes nothing. - - Returns - ------- - Dict[int, float] - A dictionary mapping quality scores to counts. - """ + """Counts of Q at position 0.""" counts: Dict[int, float] = defaultdict(float) for qlist in qualities: - if not qlist: - continue + if qlist: + counts[int(qlist[0])] += 1.0 counts[qlist[0]] += 1.0 @@ -135,30 +136,12 @@ def compute_position_distributions( qualities: Iterable[List[int]], read_length: int, ) -> List[Dict[int, float]]: - """Compute per-position empirical distributions of quality scores. - - Parameters - ---------- - qualities: - An iterable of quality score lists. All lists are expected to have - length ``read_length``. Any that do not are ignored. - read_length: - Length of reads used for training. - - Returns - ------- - List[Dict[int, float]] - A list of length ``read_length``. Element ``i`` is a dictionary - mapping quality scores to counts at position ``i``. - """ + """Counts of Q at position i.""" if read_length <= 0: return [] - # One histogram per position - histograms: List[Dict[int, float]] = [ - defaultdict(float) for _ in range(read_length) - ] + histograms: List[Dict[int, float]] = [defaultdict(float) for _ in range(read_length)] for qlist in qualities: @@ -168,59 +151,86 @@ def compute_position_distributions( for i, q in enumerate(qlist): histograms[i][int(q)] += 1.0 - # Convert default dicts to plain dicts return [dict(h) for h in histograms] +def compute_transition_distributions( + qualities: Iterable[List[int]], + read_length: int, +) -> List[Dict[int, Dict[int, float]]]: + """ + Transition counts per position i (0..read_length-2): + trans[i][q_prev][q_next] += 1 + """ + + if read_length <= 1: + return [] + + trans: List[Dict[int, Dict[int, float]]] = [ + defaultdict(lambda: defaultdict(float)) for _ in range(read_length - 1) + ] + + for qlist in qualities: + + if len(qlist) != read_length: + continue + + for i in range(read_length - 1): + q_prev = int(qlist[i]) + q_next = int(qlist[i + 1]) + trans[i][q_prev][q_next] += 1.0 + + # Convert nested defaultdicts to plain dicts + out: List[Dict[int, Dict[int, float]]] = [] + + for i in range(read_length - 1): + + pos_dict: Dict[int, Dict[int, float]] = {} + + for q_prev, nexts in trans[i].items(): + pos_dict[int(q_prev)] = {int(qn): float(c) for qn, c in nexts.items()} + + out.append(pos_dict) + + return out + + def build_markov_model( files: Iterable[str], max_reads: int, offset: int, -) -> Tuple[Dict[int, float], List[Dict[int, float]], int, int]: - """Build distributions and max quality for the empirical quality model. - - This is a convenience wrapper around :func:`read_quality_lists`, - :func:`compute_initial_distribution`, and - :func:`compute_position_distributions`. It returns the empirical - distributions and the maximum observed quality, along with the read - length used for training. - - Parameters - ---------- - files: - An iterable of FASTQ filenames. - max_reads: - Maximum number of reads per file to consume. ``-1`` or - ``float('inf')`` indicates that all reads should be processed. - offset: - Numeric quality score offset (usually 33 for Sanger encoding). - - Returns - ------- - Tuple[Dict[int, float], List[Dict[int, float]], int, int] - A tuple ``(initial_distribution, position_distributions, - max_quality, read_length)``. Distributions are returned as raw - counts; the caller is responsible for normalisation. - """ - - qualities, read_length = read_quality_lists(files, max_reads, offset) + allowed_quality_scores: Optional[Iterable[int]] = None, +) -> Tuple[ + Dict[int, float], + List[Dict[int, float]], + List[Dict[int, Dict[int, float]]], + int, + int, +]: + """Wrapper to create the Markov model.""" + + qualities, read_length = read_quality_lists( + files, max_reads, offset, allowed_quality_scores=allowed_quality_scores + ) if not qualities: raise ValueError("No quality scores could be read from the input files.") init_dist = compute_initial_distribution(qualities) pos_dists = compute_position_distributions(qualities, read_length) + trans_dists = compute_transition_distributions(qualities, read_length) - # Determine maximum observed quality + # Determine maximum quality (post-binning, if applied) max_quality = 0 - for qlist in qualities: - if not qlist: - continue + if qlist: + max_quality = max(max_quality, max(qlist)) - mq = max(qlist) + # If user supplied bins, max_quality should not exceed the max bin. + if allowed_quality_scores is not None: + bins = sorted({int(x) for x in allowed_quality_scores}) - if mq > max_quality: - max_quality = mq + if bins: + max_quality = min(max_quality, max(bins)) - return init_dist, pos_dists, max_quality, read_length \ No newline at end of file + return init_dist, pos_dists, trans_dists, int(max_quality), int(read_length) From cfb8323799e869b994f699c7e4be6356ebdf28e2 Mon Sep 17 00:00:00 2001 From: Keshav Date: Thu, 5 Mar 2026 03:49:08 +0100 Subject: [PATCH 10/13] Bug fixing and cleaning up. --- neat/models/markov_quality_model.py | 11 ++++++++++- neat/quality_score_modeling/markov_utils.py | 4 +--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/neat/models/markov_quality_model.py b/neat/models/markov_quality_model.py index 622f2831..6a1bb832 100644 --- a/neat/models/markov_quality_model.py +++ b/neat/models/markov_quality_model.py @@ -116,7 +116,14 @@ def __post_init__(self) -> None: continue keys = [int(k) for k in next_map.keys()] - probs = [float(next_map[k]) / total for k in next_map.keys()] + + keys = [] + probs = [] + + for qn, c in next_map.items(): + keys.append(int(qn)) + probs.append(float(c) / total) + cache_vals[int(q_prev)] = np.asarray(keys, dtype=int) cache_probs[int(q_prev)] = np.asarray(probs, dtype=float) @@ -158,6 +165,8 @@ def get_quality_scores( If transitions are not provided, use an empirical version of the model. """ + _ = model_read_length + if length <= 0: return np.zeros(0, dtype=int) diff --git a/neat/quality_score_modeling/markov_utils.py b/neat/quality_score_modeling/markov_utils.py index 4f22c743..0851b0df 100644 --- a/neat/quality_score_modeling/markov_utils.py +++ b/neat/quality_score_modeling/markov_utils.py @@ -112,7 +112,7 @@ def read_quality_lists( if allowed_sorted is not None: qlist = [_down_bin_quality(q, allowed_sorted) for q in qlist] - qualities.append(quality_scores) + qualities.append(qlist) reads_read += 1 return qualities, read_length @@ -127,8 +127,6 @@ def compute_initial_distribution(qualities: Iterable[List[int]]) -> Dict[int, fl if qlist: counts[int(qlist[0])] += 1.0 - counts[qlist[0]] += 1.0 - return dict(counts) From 96085c0fc89a2bf781d1c10c14665be949066e9f Mon Sep 17 00:00:00 2001 From: Keshav Date: Thu, 5 Mar 2026 03:52:29 +0100 Subject: [PATCH 11/13] Type bug and cleaning up. --- neat/models/markov_quality_model.py | 2 -- neat/quality_score_modeling/markov_utils.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/neat/models/markov_quality_model.py b/neat/models/markov_quality_model.py index 6a1bb832..15e16dc4 100644 --- a/neat/models/markov_quality_model.py +++ b/neat/models/markov_quality_model.py @@ -115,8 +115,6 @@ def __post_init__(self) -> None: if total <= 0: continue - keys = [int(k) for k in next_map.keys()] - keys = [] probs = [] diff --git a/neat/quality_score_modeling/markov_utils.py b/neat/quality_score_modeling/markov_utils.py index 0851b0df..d75c7a47 100644 --- a/neat/quality_score_modeling/markov_utils.py +++ b/neat/quality_score_modeling/markov_utils.py @@ -45,7 +45,7 @@ def _down_bin_quality(q: int, allowed: List[int]) -> int: def read_quality_lists( files: Iterable[str], - max_reads: int, + max_reads: int | float, offset: int, allowed_quality_scores: Optional[Iterable[int]] = None, ) -> Tuple[List[List[int]], int]: From ff13ea9d4f18482d44c6c038b8a8dc1d920d8705 Mon Sep 17 00:00:00 2001 From: keshav-gandhi <87151245+keshav-gandhi@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:56:18 +0100 Subject: [PATCH 12/13] Formatting in template_neat_config.yml --- config_template/template_neat_config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config_template/template_neat_config.yml b/config_template/template_neat_config.yml index cfbb7e2a..a0db4968 100644 --- a/config_template/template_neat_config.yml +++ b/config_template/template_neat_config.yml @@ -151,8 +151,8 @@ parallel_block_size: . # type = int | required: no | default = all available threads: . -# Delete the 'splits' directory after stitching completes. -# Note if threads == 1, this option has no effect. +# Delete the 'splits' directory after stitching completes +# Note: If threads == 1, this option has no effect. # type = bool | required: no | default = true cleanup_splits: . From 0b69f8d9b4474cf1de5ec9663c8e14385066d413 Mon Sep 17 00:00:00 2001 From: keshav-gandhi <87151245+keshav-gandhi@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:57:32 +0100 Subject: [PATCH 13/13] Delete version.py (we already have a 4.3.6 and version tracker now) --- version.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 version.py diff --git a/version.py b/version.py deleted file mode 100644 index 4b48e9d3..00000000 --- a/version.py +++ /dev/null @@ -1,10 +0,0 @@ -from importlib.metadata import PackageNotFoundError, version as _version - -def neat_version() -> str: - """ - Return NEAT's package version. - """ - try: - return _version("neat") - except PackageNotFoundError: - return "unknown"