Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8aa2032
Developed script for wrapping bacterial chromosomes
karenhx2 Oct 4, 2025
fbca683
Cleaned up files
karenhx2 Oct 4, 2025
1440b21
Merge branch 'main' into feature/karen_bacterial_wrapper
joshfactorial Oct 16, 2025
edc67b8
Worked on testing functions for stitching outputs together
karenhx2 Oct 16, 2025
ba60be7
Merge branch 'feature/karen_bacterial_wrapper' of github.com:ncsa/NEA…
karenhx2 Oct 16, 2025
541b3b1
Removed reference and data files
karenhx2 Oct 16, 2025
85a9c51
Updated script to include paired-ended runs
karenhx2 Nov 12, 2025
acef491
Implemented CLI for wrapper
karenhx2 Dec 18, 2025
e75e596
Removed extra copy of runner code
karenhx2 Jan 26, 2026
8459c8b
Fixed bugs with config files and output stitching
karenhx2 Feb 1, 2026
3fd4eb2
Merge branch 'develop' into feature/karen_bacterial_wrapper
joshfactorial Feb 4, 2026
dc28b19
Finished final debugging
karenhx2 Feb 12, 2026
aaaeed4
Accidentally removed poetry file and re-adding it
karenhx2 Feb 12, 2026
ec1c70a
Merge branch 'develop' of github.com:ncsa/NEAT into feature/karen_bac…
karenhx2 Feb 22, 2026
349857c
Merge branch 'develop' of github.com:ncsa/NEAT into feature/karen_bac…
karenhx2 Feb 26, 2026
4a33b46
Updated wrapper
karenhx2 Feb 26, 2026
383a4ef
Developed script for wrapping bacterial chromosomes
karenhx2 Oct 4, 2025
49ada66
Cleaned up files
karenhx2 Oct 4, 2025
7c9c546
Worked on testing functions for stitching outputs together
karenhx2 Oct 16, 2025
17a5340
Removed reference and data files
karenhx2 Oct 16, 2025
7d9798a
Updated script to include paired-ended runs
karenhx2 Nov 12, 2025
7c92f0f
Implemented CLI for wrapper
karenhx2 Dec 18, 2025
4e482af
Removed extra copy of runner code
karenhx2 Jan 26, 2026
a73ce4c
Fixed bugs with config files and output stitching
karenhx2 Feb 1, 2026
1a8dd50
Finished final debugging
karenhx2 Feb 12, 2026
1b7e179
Accidentally removed poetry file and re-adding it
karenhx2 Feb 12, 2026
625be66
Updated wrapper
karenhx2 Feb 26, 2026
0464d64
A couple of changes related to filetypes
joshfactorial Feb 26, 2026
d109128
Fixing a small error introduced by merging
joshfactorial Feb 26, 2026
be62d97
Resolved merging conflicts
karenhx2 Feb 26, 2026
f55f87d
Fixed pytest error with reuse splits
karenhx2 Mar 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions neat/bacterial_wrapper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
Load modules needed for other parts of the program
"""
from .runner import *
203 changes: 203 additions & 0 deletions neat/bacterial_wrapper/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import subprocess
import gzip
import shutil
import yaml
import pysam
import unittest
import os

from pathlib import Path
from typing import List
from Bio import bgzf
from Bio.bgzf import BgzfWriter, BgzfReader


# Rearranges the bacterial chromosome by wrapping it around

def wrapper(seq):
length = len(seq)

if (length % 2 == 0):
half_index = length // 2
else:
half_index = (length // 2) + 1

first_half = seq[:half_index]
second_half = seq[half_index:]

new_seq = second_half + first_half

return new_seq


# Writes the newly rearranged chromosome's sequence to a new fasta file

def write_fasta_file(new_seq, bacteria_name, fasta_header, output_dir_path, type):
fasta_file_name = f"{type}_{bacteria_name}.fna"
fasta_file_path = output_dir_path / fasta_file_name
fasta_file = open(fasta_file_path, "w")

fasta_file.write(fasta_header + "\n" + new_seq)

fasta_file.close()

return fasta_file_path


# Writes a yml configuration file for the newly rearranged chromosome's fasta sequence
# Splits the coverage in half for the reference and new config files
# These use default values for all other parameters for NEAT

def write_config_file(ref_config_file, rearranged_seq_file, orig_seq_file, bacteria_name, output_dir_path):
new_config_file_name = f"new_{bacteria_name}_config_test.yml"
old_config_file_name = f"{bacteria_name}_config_test.yml"

new_config_file_path = output_dir_path / new_config_file_name
old_config_file_path = output_dir_path / old_config_file_name

with open(ref_config_file, 'r') as ref_file, open(new_config_file_path, 'w') as new_file, open(old_config_file_path, 'w') as old_file:
for line in ref_file:
if line.find("reference:") != -1:
new_file.write(f"reference: {rearranged_seq_file}\n")
old_file.write(f"reference: {orig_seq_file}\n")
elif line.find("coverage:") != -1:
if line.strip() == "coverage: .":
new_coverage = 5.0
else:
new_coverage = float((line.split(" "))[1].strip()) // 2

new_file.write(f"coverage: {new_coverage}\n")
old_file.write(f"coverage: {new_coverage}\n")
else:
new_file.write(line)
old_file.write(line)


ref_file.close()
new_file.close()
old_file.close()

return old_config_file_path, new_config_file_path


# Runs the NEAT read simulator using the given config file

def run_neat(config_file, output_dir, prefix):
subprocess.run(["neat", "read-simulator", "-c", config_file, "-o", output_dir + "/" + prefix])


# General function for bacterial wrapper that calls all of the functions defined above

def bacterial_wrapper(reference_file, bacteria_name, ref_config_file, output_dir):

orig_seq = ""

f = open(reference_file)
fasta_header = f.readline().strip()

plasmids = False

for line in f:
if line[0] != ">":
orig_seq += line.strip()
elif line.lower().find("plasmid") != -1: # exclude plasmids from the sequence to be rearranged
plasmids = True
break

f.close()

output_dir_path = Path(output_dir)

rearranged_seq = wrapper(orig_seq)
rearranged_seq_file = write_fasta_file(rearranged_seq, bacteria_name, fasta_header, output_dir_path, "wrapped")

orig_seq_file = reference_file
if plasmids:
orig_seq_file = write_fasta_file(orig_seq, bacteria_name, fasta_header, output_dir_path, "orig")

config_files = write_config_file(ref_config_file, rearranged_seq_file, orig_seq_file, bacteria_name, output_dir_path)
old_config_file = config_files[0]
new_config_file = config_files[1]

run_neat(old_config_file, output_dir, "Regular")
run_neat(new_config_file, output_dir, "Wrapped")


# Stitching all outputs together - Keshav's script

def concat_fq(input_files: List[Path], dest: Path) -> None:

if not input_files:
# Nothing to do, and no error to throw
return

with gzip.open(dest, 'wt') as out_f:
for input_file in input_files:
with gzip.open(input_file, 'rt') as in_f:
shutil.copyfileobj(in_f, out_f)

def merge_bam(bams: List[Path], dest: Path, threads: int) -> None:

if not bams:
return

unsorted = dest.with_suffix(".unsorted.bam")
pysam.merge("--no-PG", "-@", str(threads), "-f", str(unsorted), *map(str, bams))
pysam.sort("-@", str(threads), "-o", str(dest), str(unsorted))
unsorted.unlink(missing_ok=True)

def merge_vcf(vcfs: List[Path], dest: Path) -> None:
if not vcfs:
return

first, *rest = vcfs
shutil.copy(first, dest)

with dest.open("ab") as out_f:
for vcf in rest:
with vcf.open("rb") as fh:
for line in fh:
if not line.startswith(b"#"):
out_f.write(line)

def stitch_all_outputs(files: List[Path], output_dir) -> None:
fq1_list = []
fq2_list = []
vcf_list = []
bam_list = []

for file in files:
file_name = file.stem # use stem to differentiate fq1 and fq2
suffixes = file.suffixes # use suffixes to catch vcf and bam files

if "r2.fastq" in file_name:
fq2_list.append(file)
elif "r1.fastq" in file_name or ".fastq" in suffixes:
fq1_list.append(file)
elif ".vcf" in suffixes and ".tbi" not in suffixes:
vcf_list.append(file)
elif ".bam" in suffixes and ".bai" not in suffixes:
bam_list.append(file)

dest_fq1 = Path(f"{output_dir}/stitched_fq1.gz")
dest_bam = Path(f"{output_dir}/stitched.bam")
dest_vcf = Path(f"{output_dir}/stitched.vcf")

concat_fq(fq1_list, dest_fq1)

if (fq2_list):
dest_fq2 = Path(f"{output_dir}/stitched_fq2.gz")
concat_fq(fq2_list, dest_fq2)

merge_bam(bam_list, dest_bam, 2)
merge_vcf(vcf_list, dest_vcf)


# Testing functions

class TestWrapper(unittest.TestCase):
def test_even(self):
self.assertEqual(wrapper("ABBCBB"), "CBBABB")

def test_odd(self):
self.assertEqual(wrapper("ABBCBBC"), "BBCABBC")
78 changes: 78 additions & 0 deletions neat/cli/commands/bacterial_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Command line interface for NEAT's bacterial wrapper function
"""

import argparse
import subprocess
import os
from pathlib import Path

from ...bacterial_wrapper import bacterial_wrapper
from ...bacterial_wrapper import stitch_all_outputs
from .base import BaseCommand
from .options import output_group


class Command(BaseCommand):
"""
Class that generates wrapped bacterial models
"""
name = "bacterial-wrapper"
description = "Generate wrapped bacterial model reads"

def add_arguments(self, parser: argparse.ArgumentParser):
"""
Add the command's arguments to its parser

:param parser: The parser to add arguments to
"""

parser.add_argument('reference',
type=str,
metavar='reference.fa',
help="Reference file for organism in fasta format.")

parser.add_argument('bacteria_name',
type=str,
metavar='bacteria_name',
help="Name of the bacteria.")

parser.add_argument(
"-c", "--config",
metavar="config",
type=str,
required=True,
help="Path (including filename) to the configuration file for the reference run."
)

output_group.add_to_parser(parser)

def execute(self, arguments: argparse.Namespace):
"""
Execute the command

:param arguments: The namespace with arguments and their values.
"""
bacterial_wrapper(arguments.reference, arguments.bacteria_name, arguments.config, arguments.output_dir)

output_path = Path(arguments.output_dir)
file_list = os.listdir(output_path / "Regular") # same file names for both Regular and Wrapped folders

output_files = []

for file in file_list:
reg_file_path = output_path / "Regular" / file
wrap_file_path = output_path / "Wrapped" / file

if ("vcf" in file):
subprocess.run(["gzip", "-d", reg_file_path])
subprocess.run(["gzip", "-d", wrap_file_path])

file = file[:-3]
reg_file_path = output_path / "Regular" / file
wrap_file_path = output_path / "Wrapped" / file

output_files.append(reg_file_path)
output_files.append(wrap_file_path)

stitch_all_outputs(output_files, arguments.output_dir)
11 changes: 7 additions & 4 deletions neat/read_simulator/utils/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,15 +249,13 @@ def from_cli(output_dir: Path,
for key, (_, default, _, _) in defs.items():
if key not in input_args:
input_args[key] = default

base_options = Options(
output_dir=output_dir,
output_prefix=output_prefix
)

# Read the config file using the definitions for validation
base_options.read_yaml(config_file, defs)

# Merge validated config values over defaults
final_args = dict(input_args)
for key, val in defs.items():
Expand Down Expand Up @@ -299,6 +297,8 @@ def read_yaml(self, config_yaml: Path, args: dict):
But I'm not sure how else to accomplish this.
"""
config = yaml.load(open(config_yaml, 'r'), Loader=Loader)


for key, value in config.items():
if key in args:
type_of_var, default, criteria1, criteria2 = args[key]
Expand Down Expand Up @@ -447,7 +447,10 @@ def log_configuration(self):
if splits_dir.is_dir():
_LOG.info(f'Reusing existing splits {splits_dir}.')
else:
_LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits')
if self.reuse_splits:
raise FileNotFoundError(f'reuse_splits=True')
else:
_LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits')
_LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
else:
splits_dir = self.temp_dir_path / "splits"
Expand Down Expand Up @@ -509,4 +512,4 @@ def log_configuration(self):
_LOG.info(f'Custom average mutation rate for the run: {self.mutation_rate}')
if self.mutation_bed:
_LOG.info(f'BED of mutation rates of different regions: {self.mutation_bed}')
_LOG.info(f'RNG seed value for run: {self.rng_seed}')
_LOG.info(f'RNG seed value for run: {self.rng_seed}')
4 changes: 4 additions & 0 deletions neat/read_simulator/utils/output_file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@

import os
import re
import shutil
import time
from struct import pack
import logging
from typing import Any

from Bio import bgzf
from Bio import SeqIO
from pathlib import Path
from numpy.random import Generator

#gzip for temp outs, bgzip for final outs
import gzip
Expand Down
7 changes: 6 additions & 1 deletion neat/read_simulator/utils/stitch_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
_LOG = logging.getLogger(__name__)

def concat(files_to_join: List[Path], dest_file: gzip.GzipFile) -> None:
if not files_to_join:
# Nothing to do, and no error to throw
_LOG.warning(f"Concat called but there are no files to join: {files_to_join}" )
return

for f in files_to_join:
with gzip.open(f, 'rt') as in_f:
shutil.copyfileobj(in_f, dest_file)
Expand All @@ -33,7 +38,7 @@ def merge_vcfs(vcfs: List[Path], ofw: OutputFileWriter) -> None:
def merge_bam(bam_files: List[Path], ofw: OutputFileWriter, threads: int):
merged_file = ofw.tmp_dir / "temp_merged.bam"
intermediate_files = []
# Note 1000 is abritrary. May need to be a user parameter/adjustable/a function
# Note 1000 is arbitrary. May need to be a user parameter/adjustable/a function
for i in range(0, len(bam_files), 500):
temp_file = str(ofw.tmp_dir / f"temp_merged_{i}.bam")
pysam.merge("--no-PG", "-f", temp_file, *map(str, bam_files[i:i+500]))
Expand Down
Loading