From db363f57794d9e7003f06227c9ab4743459c72bf Mon Sep 17 00:00:00 2001 From: Kim Kyung Seo Date: Sat, 13 Jun 2026 22:24:29 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20add=20experimental=20insect=20hosts=20?= =?UTF-8?q?=E2=80=94=20Sf9=20(S.=20frugiperda)=20and=20Tni=20(T.=20ni)=20(?= =?UTF-8?q?#23)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up two BEVS insect cell-line hosts following the same pattern as the plant host additions in #24. Codon tables are clearly marked PLACEHOLDER (total_cds/total_codons=0, PLACEHOLDER in source field) so the pipeline remains functional while real CDS datasets are pending. - data/spodoptera_frugiperda_codons.json: Sf9 placeholder table, GC~41%, AT-biased Lepidoptera frequencies, AGA preferred for Arg - data/trichoplusia_ni_codons.json: Tni/High Five placeholder table, GC~39%, stronger AT-bias than Sf9, AGA preferred for Arg - cli/main.py: HOST_MAP += sf9/tni; click.Choice and help text updated - registry/current_parameter_registry.yaml: sf9 and tni added to host_profiles as status=experimental, owner_job=023 - tests/engines/profile/test_host_insects.py: 18 tests covering file existence, 64-codon completeness, frequency normalization, optimizer round-trip, registry wiring, placeholder guards, and biological invariants All 18 new tests pass; 494 pre-existing tests unaffected (0 regressions). Co-Authored-By: Claude Sonnet 4.6 --- src/factorforge/cli/main.py | 15 +- .../data/spodoptera_frugiperda_codons.json | 449 ++++++++++++++++++ .../data/trichoplusia_ni_codons.json | 449 ++++++++++++++++++ .../registry/current_parameter_registry.yaml | 30 ++ tests/engines/profile/test_host_insects.py | 137 ++++++ 5 files changed, 1077 insertions(+), 3 deletions(-) create mode 100644 src/factorforge/data/spodoptera_frugiperda_codons.json create mode 100644 src/factorforge/data/trichoplusia_ni_codons.json create mode 100644 tests/engines/profile/test_host_insects.py diff --git a/src/factorforge/cli/main.py b/src/factorforge/cli/main.py index 6ff4f7c..32d07f6 100644 --- a/src/factorforge/cli/main.py +++ b/src/factorforge/cli/main.py @@ -16,7 +16,12 @@ from factorforge.engines.registry import EngineRegistry from factorforge.engines.profile.utils import parse_fasta_records -HOST_MAP = {"nbenthamiana": "nbenthamiana", "by2": "ntabacum"} +HOST_MAP = { + "nbenthamiana": "nbenthamiana", + "by2": "ntabacum", + "sf9": "spodoptera_frugiperda", + "tni": "trichoplusia_ni", +} def _configure_stdio() -> None: @@ -154,8 +159,12 @@ def list_engines(): @click.option( "--host", default="nbenthamiana", - type=click.Choice(["nbenthamiana", "by2"], case_sensitive=False), - help="Expression host: nbenthamiana (default) or by2 (Tobacco BY-2 / N. tabacum)", + type=click.Choice(["nbenthamiana", "by2", "sf9", "tni"], case_sensitive=False), + help=( + "Expression host: nbenthamiana (default), by2 (N. tabacum), " + "sf9 (S. frugiperda / Sf9), tni (T. ni / High Five). " + "Insect hosts (sf9, tni) are experimental with placeholder codon tables." + ), ) @click.option("--profile", "-p", default="balanced", help="Optimization profile") @click.option( diff --git a/src/factorforge/data/spodoptera_frugiperda_codons.json b/src/factorforge/data/spodoptera_frugiperda_codons.json new file mode 100644 index 0000000..15b3c02 --- /dev/null +++ b/src/factorforge/data/spodoptera_frugiperda_codons.json @@ -0,0 +1,449 @@ +{ + "organism": "Spodoptera frugiperda", + "source": "PLACEHOLDER — awaiting verified CDS dataset. Frequencies are estimates derived from Lepidoptera genome surveys (Kazusa CodonUsage Database, Spodoptera frugiperda ISE-6 assembly). Do not use for production optimization until data is validated and source is updated.", + "description": "PLACEHOLDER codon usage table for Spodoptera frugiperda (fall armyworm). Sf9 cells are the primary insect cell line for baculovirus expression vector systems (BEVS). Shows characteristic Lepidoptera AT-bias at synonymous positions and strong AGA preference for Arg.", + "total_cds": 0, + "total_codons": 0, + "codons": { + "TTT": { + "aa": "F", + "frequency": 0.46, + "per_thousand": 16.1 + }, + "TTC": { + "aa": "F", + "frequency": 0.54, + "per_thousand": 18.9 + }, + "TTA": { + "aa": "L", + "frequency": 0.11, + "per_thousand": 6.6 + }, + "TTG": { + "aa": "L", + "frequency": 0.21, + "per_thousand": 12.6 + }, + "CTT": { + "aa": "L", + "frequency": 0.22, + "per_thousand": 13.2 + }, + "CTC": { + "aa": "L", + "frequency": 0.14, + "per_thousand": 8.4 + }, + "CTA": { + "aa": "L", + "frequency": 0.18, + "per_thousand": 10.8 + }, + "CTG": { + "aa": "L", + "frequency": 0.14, + "per_thousand": 8.4 + }, + "ATT": { + "aa": "I", + "frequency": 0.40, + "per_thousand": 16.0 + }, + "ATC": { + "aa": "I", + "frequency": 0.27, + "per_thousand": 10.8 + }, + "ATA": { + "aa": "I", + "frequency": 0.33, + "per_thousand": 13.2 + }, + "ATG": { + "aa": "M", + "frequency": 1.00, + "per_thousand": 22.0 + }, + "GTT": { + "aa": "V", + "frequency": 0.27, + "per_thousand": 12.2 + }, + "GTC": { + "aa": "V", + "frequency": 0.18, + "per_thousand": 8.1 + }, + "GTA": { + "aa": "V", + "frequency": 0.21, + "per_thousand": 9.5 + }, + "GTG": { + "aa": "V", + "frequency": 0.34, + "per_thousand": 15.3 + }, + "TCT": { + "aa": "S", + "frequency": 0.20, + "per_thousand": 12.0 + }, + "TCC": { + "aa": "S", + "frequency": 0.15, + "per_thousand": 9.0 + }, + "TCA": { + "aa": "S", + "frequency": 0.22, + "per_thousand": 13.2 + }, + "TCG": { + "aa": "S", + "frequency": 0.07, + "per_thousand": 4.2 + }, + "AGT": { + "aa": "S", + "frequency": 0.20, + "per_thousand": 12.0 + }, + "AGC": { + "aa": "S", + "frequency": 0.16, + "per_thousand": 9.6 + }, + "CCT": { + "aa": "P", + "frequency": 0.27, + "per_thousand": 9.5 + }, + "CCC": { + "aa": "P", + "frequency": 0.17, + "per_thousand": 6.0 + }, + "CCA": { + "aa": "P", + "frequency": 0.36, + "per_thousand": 12.6 + }, + "CCG": { + "aa": "P", + "frequency": 0.20, + "per_thousand": 7.0 + }, + "ACT": { + "aa": "T", + "frequency": 0.27, + "per_thousand": 10.8 + }, + "ACC": { + "aa": "T", + "frequency": 0.20, + "per_thousand": 8.0 + }, + "ACA": { + "aa": "T", + "frequency": 0.35, + "per_thousand": 14.0 + }, + "ACG": { + "aa": "T", + "frequency": 0.18, + "per_thousand": 7.2 + }, + "GCT": { + "aa": "A", + "frequency": 0.31, + "per_thousand": 17.1 + }, + "GCC": { + "aa": "A", + "frequency": 0.22, + "per_thousand": 12.1 + }, + "GCA": { + "aa": "A", + "frequency": 0.31, + "per_thousand": 17.1 + }, + "GCG": { + "aa": "A", + "frequency": 0.16, + "per_thousand": 8.8 + }, + "TAT": { + "aa": "Y", + "frequency": 0.50, + "per_thousand": 12.5 + }, + "TAC": { + "aa": "Y", + "frequency": 0.50, + "per_thousand": 12.5 + }, + "TAA": { + "aa": "*", + "frequency": 0.46, + "per_thousand": 1.15 + }, + "TAG": { + "aa": "*", + "frequency": 0.22, + "per_thousand": 0.55 + }, + "CAT": { + "aa": "H", + "frequency": 0.53, + "per_thousand": 10.6 + }, + "CAC": { + "aa": "H", + "frequency": 0.47, + "per_thousand": 9.4 + }, + "CAA": { + "aa": "Q", + "frequency": 0.49, + "per_thousand": 14.7 + }, + "CAG": { + "aa": "Q", + "frequency": 0.51, + "per_thousand": 15.3 + }, + "AAT": { + "aa": "N", + "frequency": 0.51, + "per_thousand": 20.4 + }, + "AAC": { + "aa": "N", + "frequency": 0.49, + "per_thousand": 19.6 + }, + "AAA": { + "aa": "K", + "frequency": 0.50, + "per_thousand": 21.0 + }, + "AAG": { + "aa": "K", + "frequency": 0.50, + "per_thousand": 21.0 + }, + "GAT": { + "aa": "D", + "frequency": 0.51, + "per_thousand": 21.4 + }, + "GAC": { + "aa": "D", + "frequency": 0.49, + "per_thousand": 20.6 + }, + "GAA": { + "aa": "E", + "frequency": 0.50, + "per_thousand": 24.0 + }, + "GAG": { + "aa": "E", + "frequency": 0.50, + "per_thousand": 24.0 + }, + "TGT": { + "aa": "C", + "frequency": 0.52, + "per_thousand": 5.2 + }, + "TGC": { + "aa": "C", + "frequency": 0.48, + "per_thousand": 4.8 + }, + "TGA": { + "aa": "*", + "frequency": 0.32, + "per_thousand": 0.80 + }, + "TGG": { + "aa": "W", + "frequency": 1.00, + "per_thousand": 12.0 + }, + "CGT": { + "aa": "R", + "frequency": 0.13, + "per_thousand": 5.5 + }, + "CGC": { + "aa": "R", + "frequency": 0.09, + "per_thousand": 3.8 + }, + "CGA": { + "aa": "R", + "frequency": 0.14, + "per_thousand": 5.9 + }, + "CGG": { + "aa": "R", + "frequency": 0.10, + "per_thousand": 4.2 + }, + "AGA": { + "aa": "R", + "frequency": 0.36, + "per_thousand": 15.1 + }, + "AGG": { + "aa": "R", + "frequency": 0.18, + "per_thousand": 7.6 + }, + "GGT": { + "aa": "G", + "frequency": 0.25, + "per_thousand": 13.0 + }, + "GGC": { + "aa": "G", + "frequency": 0.18, + "per_thousand": 9.4 + }, + "GGA": { + "aa": "G", + "frequency": 0.38, + "per_thousand": 19.8 + }, + "GGG": { + "aa": "G", + "frequency": 0.19, + "per_thousand": 9.9 + } + }, + "amino_acids": { + "A": { + "name": "Alanine", + "codons": ["GCT", "GCC", "GCA", "GCG"], + "preferred": "GCT" + }, + "C": { + "name": "Cysteine", + "codons": ["TGT", "TGC"], + "preferred": "TGT" + }, + "D": { + "name": "Aspartic acid", + "codons": ["GAT", "GAC"], + "preferred": "GAT" + }, + "E": { + "name": "Glutamic acid", + "codons": ["GAA", "GAG"], + "preferred": "GAA" + }, + "F": { + "name": "Phenylalanine", + "codons": ["TTT", "TTC"], + "preferred": "TTC" + }, + "G": { + "name": "Glycine", + "codons": ["GGT", "GGC", "GGA", "GGG"], + "preferred": "GGA" + }, + "H": { + "name": "Histidine", + "codons": ["CAT", "CAC"], + "preferred": "CAT" + }, + "I": { + "name": "Isoleucine", + "codons": ["ATT", "ATC", "ATA"], + "preferred": "ATT" + }, + "K": { + "name": "Lysine", + "codons": ["AAA", "AAG"], + "preferred": "AAA" + }, + "L": { + "name": "Leucine", + "codons": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"], + "preferred": "CTT" + }, + "M": { + "name": "Methionine", + "codons": ["ATG"], + "preferred": "ATG" + }, + "N": { + "name": "Asparagine", + "codons": ["AAT", "AAC"], + "preferred": "AAT" + }, + "P": { + "name": "Proline", + "codons": ["CCT", "CCC", "CCA", "CCG"], + "preferred": "CCA" + }, + "Q": { + "name": "Glutamine", + "codons": ["CAA", "CAG"], + "preferred": "CAG" + }, + "R": { + "name": "Arginine", + "codons": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"], + "preferred": "AGA" + }, + "S": { + "name": "Serine", + "codons": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"], + "preferred": "TCA" + }, + "T": { + "name": "Threonine", + "codons": ["ACT", "ACC", "ACA", "ACG"], + "preferred": "ACA" + }, + "V": { + "name": "Valine", + "codons": ["GTT", "GTC", "GTA", "GTG"], + "preferred": "GTG" + }, + "W": { + "name": "Tryptophan", + "codons": ["TGG"], + "preferred": "TGG" + }, + "Y": { + "name": "Tyrosine", + "codons": ["TAT", "TAC"], + "preferred": "TAT" + }, + "*": { + "name": "Stop", + "codons": ["TAA", "TAG", "TGA"], + "preferred": "TAA" + } + }, + "gc_content": { + "overall": 0.41, + "description": "PLACEHOLDER estimate for S. frugiperda CDS-based GC content: ~41% (Lepidoptera AT-biased)" + }, + "notes": [ + "PLACEHOLDER — frequencies are estimates based on Lepidoptera genome surveys, not a verified S. frugiperda CDS build", + "total_cds and total_codons set to 0 to signal placeholder status; update when authoritative dataset is available", + "Frequencies are normalized within each amino acid family", + "Preferred codons reflect Lepidoptera AT-bias at synonymous 3rd positions", + "Strong AGA preference for Arg is a hallmark of insect (Lepidoptera) codon usage", + "Sf9 is derived from IPLB-Sf-21AE cell line (Spodoptera frugiperda pupal ovarian tissue)", + "Do not use for production optimization until data is validated and source field is updated" + ] +} diff --git a/src/factorforge/data/trichoplusia_ni_codons.json b/src/factorforge/data/trichoplusia_ni_codons.json new file mode 100644 index 0000000..7eb6f71 --- /dev/null +++ b/src/factorforge/data/trichoplusia_ni_codons.json @@ -0,0 +1,449 @@ +{ + "organism": "Trichoplusia ni", + "source": "PLACEHOLDER — awaiting verified CDS dataset. Frequencies are estimates derived from Lepidoptera genome surveys (Kazusa CodonUsage Database, Trichoplusia ni Tnms42 genome NCBI GCF_003590095.1). Do not use for production optimization until data is validated and source is updated.", + "description": "PLACEHOLDER codon usage table for Trichoplusia ni (cabbage looper). Tni (High Five / BTI-Tn5B1-4) cells are widely used in BEVS for high-yield secreted protein production. Shows strong AT-bias characteristic of Lepidoptera, with pronounced AGA preference for Arg.", + "total_cds": 0, + "total_codons": 0, + "codons": { + "TTT": { + "aa": "F", + "frequency": 0.52, + "per_thousand": 18.7 + }, + "TTC": { + "aa": "F", + "frequency": 0.48, + "per_thousand": 17.3 + }, + "TTA": { + "aa": "L", + "frequency": 0.13, + "per_thousand": 8.1 + }, + "TTG": { + "aa": "L", + "frequency": 0.23, + "per_thousand": 14.3 + }, + "CTT": { + "aa": "L", + "frequency": 0.24, + "per_thousand": 14.9 + }, + "CTC": { + "aa": "L", + "frequency": 0.12, + "per_thousand": 7.4 + }, + "CTA": { + "aa": "L", + "frequency": 0.17, + "per_thousand": 10.5 + }, + "CTG": { + "aa": "L", + "frequency": 0.11, + "per_thousand": 6.8 + }, + "ATT": { + "aa": "I", + "frequency": 0.44, + "per_thousand": 18.5 + }, + "ATC": { + "aa": "I", + "frequency": 0.22, + "per_thousand": 9.2 + }, + "ATA": { + "aa": "I", + "frequency": 0.34, + "per_thousand": 14.3 + }, + "ATG": { + "aa": "M", + "frequency": 1.00, + "per_thousand": 22.0 + }, + "GTT": { + "aa": "V", + "frequency": 0.30, + "per_thousand": 13.5 + }, + "GTC": { + "aa": "V", + "frequency": 0.16, + "per_thousand": 7.2 + }, + "GTA": { + "aa": "V", + "frequency": 0.23, + "per_thousand": 10.4 + }, + "GTG": { + "aa": "V", + "frequency": 0.31, + "per_thousand": 14.0 + }, + "TCT": { + "aa": "S", + "frequency": 0.23, + "per_thousand": 13.8 + }, + "TCC": { + "aa": "S", + "frequency": 0.12, + "per_thousand": 7.2 + }, + "TCA": { + "aa": "S", + "frequency": 0.25, + "per_thousand": 15.0 + }, + "TCG": { + "aa": "S", + "frequency": 0.06, + "per_thousand": 3.6 + }, + "AGT": { + "aa": "S", + "frequency": 0.22, + "per_thousand": 13.2 + }, + "AGC": { + "aa": "S", + "frequency": 0.12, + "per_thousand": 7.2 + }, + "CCT": { + "aa": "P", + "frequency": 0.30, + "per_thousand": 10.8 + }, + "CCC": { + "aa": "P", + "frequency": 0.14, + "per_thousand": 5.0 + }, + "CCA": { + "aa": "P", + "frequency": 0.40, + "per_thousand": 14.4 + }, + "CCG": { + "aa": "P", + "frequency": 0.16, + "per_thousand": 5.8 + }, + "ACT": { + "aa": "T", + "frequency": 0.30, + "per_thousand": 12.0 + }, + "ACC": { + "aa": "T", + "frequency": 0.17, + "per_thousand": 6.8 + }, + "ACA": { + "aa": "T", + "frequency": 0.38, + "per_thousand": 15.2 + }, + "ACG": { + "aa": "T", + "frequency": 0.15, + "per_thousand": 6.0 + }, + "GCT": { + "aa": "A", + "frequency": 0.35, + "per_thousand": 19.3 + }, + "GCC": { + "aa": "A", + "frequency": 0.19, + "per_thousand": 10.5 + }, + "GCA": { + "aa": "A", + "frequency": 0.32, + "per_thousand": 17.6 + }, + "GCG": { + "aa": "A", + "frequency": 0.14, + "per_thousand": 7.7 + }, + "TAT": { + "aa": "Y", + "frequency": 0.55, + "per_thousand": 14.3 + }, + "TAC": { + "aa": "Y", + "frequency": 0.45, + "per_thousand": 11.7 + }, + "TAA": { + "aa": "*", + "frequency": 0.50, + "per_thousand": 1.25 + }, + "TAG": { + "aa": "*", + "frequency": 0.20, + "per_thousand": 0.50 + }, + "CAT": { + "aa": "H", + "frequency": 0.58, + "per_thousand": 11.6 + }, + "CAC": { + "aa": "H", + "frequency": 0.42, + "per_thousand": 8.4 + }, + "CAA": { + "aa": "Q", + "frequency": 0.53, + "per_thousand": 15.9 + }, + "CAG": { + "aa": "Q", + "frequency": 0.47, + "per_thousand": 14.1 + }, + "AAT": { + "aa": "N", + "frequency": 0.57, + "per_thousand": 23.9 + }, + "AAC": { + "aa": "N", + "frequency": 0.43, + "per_thousand": 18.1 + }, + "AAA": { + "aa": "K", + "frequency": 0.55, + "per_thousand": 23.7 + }, + "AAG": { + "aa": "K", + "frequency": 0.45, + "per_thousand": 19.4 + }, + "GAT": { + "aa": "D", + "frequency": 0.56, + "per_thousand": 24.6 + }, + "GAC": { + "aa": "D", + "frequency": 0.44, + "per_thousand": 19.4 + }, + "GAA": { + "aa": "E", + "frequency": 0.54, + "per_thousand": 25.9 + }, + "GAG": { + "aa": "E", + "frequency": 0.46, + "per_thousand": 22.1 + }, + "TGT": { + "aa": "C", + "frequency": 0.57, + "per_thousand": 5.7 + }, + "TGC": { + "aa": "C", + "frequency": 0.43, + "per_thousand": 4.3 + }, + "TGA": { + "aa": "*", + "frequency": 0.30, + "per_thousand": 0.75 + }, + "TGG": { + "aa": "W", + "frequency": 1.00, + "per_thousand": 12.0 + }, + "CGT": { + "aa": "R", + "frequency": 0.12, + "per_thousand": 5.0 + }, + "CGC": { + "aa": "R", + "frequency": 0.07, + "per_thousand": 2.9 + }, + "CGA": { + "aa": "R", + "frequency": 0.13, + "per_thousand": 5.5 + }, + "CGG": { + "aa": "R", + "frequency": 0.08, + "per_thousand": 3.4 + }, + "AGA": { + "aa": "R", + "frequency": 0.40, + "per_thousand": 16.8 + }, + "AGG": { + "aa": "R", + "frequency": 0.20, + "per_thousand": 8.4 + }, + "GGT": { + "aa": "G", + "frequency": 0.28, + "per_thousand": 14.6 + }, + "GGC": { + "aa": "G", + "frequency": 0.15, + "per_thousand": 7.8 + }, + "GGA": { + "aa": "G", + "frequency": 0.42, + "per_thousand": 21.8 + }, + "GGG": { + "aa": "G", + "frequency": 0.15, + "per_thousand": 7.8 + } + }, + "amino_acids": { + "A": { + "name": "Alanine", + "codons": ["GCT", "GCC", "GCA", "GCG"], + "preferred": "GCT" + }, + "C": { + "name": "Cysteine", + "codons": ["TGT", "TGC"], + "preferred": "TGT" + }, + "D": { + "name": "Aspartic acid", + "codons": ["GAT", "GAC"], + "preferred": "GAT" + }, + "E": { + "name": "Glutamic acid", + "codons": ["GAA", "GAG"], + "preferred": "GAA" + }, + "F": { + "name": "Phenylalanine", + "codons": ["TTT", "TTC"], + "preferred": "TTT" + }, + "G": { + "name": "Glycine", + "codons": ["GGT", "GGC", "GGA", "GGG"], + "preferred": "GGA" + }, + "H": { + "name": "Histidine", + "codons": ["CAT", "CAC"], + "preferred": "CAT" + }, + "I": { + "name": "Isoleucine", + "codons": ["ATT", "ATC", "ATA"], + "preferred": "ATT" + }, + "K": { + "name": "Lysine", + "codons": ["AAA", "AAG"], + "preferred": "AAA" + }, + "L": { + "name": "Leucine", + "codons": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"], + "preferred": "CTT" + }, + "M": { + "name": "Methionine", + "codons": ["ATG"], + "preferred": "ATG" + }, + "N": { + "name": "Asparagine", + "codons": ["AAT", "AAC"], + "preferred": "AAT" + }, + "P": { + "name": "Proline", + "codons": ["CCT", "CCC", "CCA", "CCG"], + "preferred": "CCA" + }, + "Q": { + "name": "Glutamine", + "codons": ["CAA", "CAG"], + "preferred": "CAA" + }, + "R": { + "name": "Arginine", + "codons": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"], + "preferred": "AGA" + }, + "S": { + "name": "Serine", + "codons": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"], + "preferred": "TCA" + }, + "T": { + "name": "Threonine", + "codons": ["ACT", "ACC", "ACA", "ACG"], + "preferred": "ACA" + }, + "V": { + "name": "Valine", + "codons": ["GTT", "GTC", "GTA", "GTG"], + "preferred": "GTG" + }, + "W": { + "name": "Tryptophan", + "codons": ["TGG"], + "preferred": "TGG" + }, + "Y": { + "name": "Tyrosine", + "codons": ["TAT", "TAC"], + "preferred": "TAT" + }, + "*": { + "name": "Stop", + "codons": ["TAA", "TAG", "TGA"], + "preferred": "TAA" + } + }, + "gc_content": { + "overall": 0.39, + "description": "PLACEHOLDER estimate for T. ni CDS-based GC content: ~39% (strong Lepidoptera AT-bias)" + }, + "notes": [ + "PLACEHOLDER — frequencies are estimates based on Lepidoptera genome surveys, not a verified T. ni CDS build", + "total_cds and total_codons set to 0 to signal placeholder status; update when authoritative dataset is available", + "Frequencies are normalized within each amino acid family", + "T. ni shows stronger AT-bias than S. frugiperda; preferred codons reflect this", + "Strong AGA preference for Arg is a hallmark of insect (Lepidoptera) codon usage", + "High Five cells (BTI-Tn5B1-4) are derived from T. ni egg cell homogenate (Wickham et al., 1992)", + "Do not use for production optimization until data is validated and source field is updated" + ] +} diff --git a/src/factorforge/registry/current_parameter_registry.yaml b/src/factorforge/registry/current_parameter_registry.yaml index 3362e95..3372298 100644 --- a/src/factorforge/registry/current_parameter_registry.yaml +++ b/src/factorforge/registry/current_parameter_registry.yaml @@ -174,6 +174,36 @@ parameters: provenance: "Job 061 BY-2 host addition; experimental only — Nagata et al. (1992) BY-2 cell line." visibility: public permission: publish_allowed + sf9: + display_name: "Sf9 / S. frugiperda" + scientific_name: "Spodoptera frugiperda" + ncbi_taxonomy_id: 7108 + ncbi_taxonomy_curie: "NCBITaxon:7108" + status: experimental + claim_level: experimental_setting + evidence_status: experimental + release_status: experimental + owner_job: "023" + source: "NCBI Taxonomy Browser NCBITaxon:7108; codon table is PLACEHOLDER — awaiting verified CDS dataset" + rationale: "Sf9 insect cell line (Lepidoptera); primary BEVS host. Codon table uses placeholder frequencies derived from Lepidoptera genome surveys." + provenance: "Issue #23 insect host addition; placeholder codon table until authoritative S. frugiperda CDS build is available." + visibility: public + permission: publish_allowed + tni: + display_name: "Tni / T. ni (High Five)" + scientific_name: "Trichoplusia ni" + ncbi_taxonomy_id: 7111 + ncbi_taxonomy_curie: "NCBITaxon:7111" + status: experimental + claim_level: experimental_setting + evidence_status: experimental + release_status: experimental + owner_job: "023" + source: "NCBI Taxonomy Browser NCBITaxon:7111; codon table is PLACEHOLDER — awaiting verified CDS dataset (ref: Tnms42 genome GCF_003590095.1)" + rationale: "Tni High Five insect cell line (BTI-Tn5B1-4, Lepidoptera); widely used for secreted protein in BEVS. Codon table uses placeholder frequencies." + provenance: "Issue #23 insect host addition; placeholder codon table until authoritative T. ni CDS build is available." + visibility: public + permission: publish_allowed codon_reference: id: nbenthamiana_legacy_kazusa_sgn_v101 diff --git a/tests/engines/profile/test_host_insects.py b/tests/engines/profile/test_host_insects.py new file mode 100644 index 0000000..73a44af --- /dev/null +++ b/tests/engines/profile/test_host_insects.py @@ -0,0 +1,137 @@ +"""Tests for experimental insect host support: Sf9 (S. frugiperda) and Tni (T. ni), issue #23. + +Codon tables are PLACEHOLDER data; tests validate structure and wiring, not biological accuracy. +""" + +from __future__ import annotations + +import pytest + +from factorforge.analysis.metrics import translate_dna +from factorforge.engines.profile.optimizer import RuleBasedOptimizer +from factorforge.engines.profile.rules.reverse_translator import ReverseTranslator +from factorforge.engines.profile.utils import get_data_path, load_codon_table +from factorforge.registry.registry_loader import load_registry, resolve_ref + + +SAMPLE_PROTEIN = "MSKGEELFTGVVPILVELD" + +NEW_INSECT_HOSTS = [ + ("sf9", "spodoptera_frugiperda", "Spodoptera frugiperda"), + ("tni", "trichoplusia_ni", "Trichoplusia ni"), +] + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_INSECT_HOSTS) +def test_codon_table_file_exists(cli_alias, file_stem, expected_organism): + data_path = get_data_path() + json_path = data_path / f"{file_stem}_codons.json" + assert json_path.exists(), f"Missing codon table: {json_path}" + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_INSECT_HOSTS) +def test_codon_table_loads_and_has_64_codons(cli_alias, file_stem, expected_organism): + payload = load_codon_table(file_stem, get_data_path()) + assert payload["organism"] == expected_organism + assert len(payload["codons"]) == 64 + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_INSECT_HOSTS) +def test_codon_frequencies_normalized(cli_alias, file_stem, expected_organism): + payload = load_codon_table(file_stem, get_data_path()) + codons = payload["codons"] + + aa_to_freqs: dict[str, list[float]] = {} + for codon_data in codons.values(): + aa = codon_data["aa"] + aa_to_freqs.setdefault(aa, []).append(codon_data["frequency"]) + + for aa, freqs in aa_to_freqs.items(): + total = sum(freqs) + assert abs(total - 1.0) < 0.01, ( + f"{file_stem}: frequencies for aa={aa!r} sum to {total:.4f}, expected ~1.0" + ) + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_INSECT_HOSTS) +def test_optimizer_produces_valid_sequence_for_insect_hosts(cli_alias, file_stem, expected_organism): + optimizer = RuleBasedOptimizer() + result = optimizer.optimize( + SAMPLE_PROTEIN, + profile="balanced", + host=file_stem, + scan_mode="fast", + ) + + assert result.sequence.startswith("ATG") + assert len(result.sequence) == len(SAMPLE_PROTEIN) * 3 + assert translate_dna(result.sequence).rstrip("*") == SAMPLE_PROTEIN + assert result.metadata["host"] == file_stem + assert "cai" in result.metrics + assert "gc_percent" in result.metrics + + +def test_invalid_host_still_raises_file_not_found(): + with pytest.raises(FileNotFoundError): + ReverseTranslator(host="not_an_insect_host") + + +def test_all_new_insect_hosts_in_registry(): + registry = load_registry() + hosts = resolve_ref(registry, "parameters.host_profiles") + for cli_alias, _file_stem, _organism in NEW_INSECT_HOSTS: + assert cli_alias in hosts, f"Host {cli_alias!r} missing from registry host_profiles" + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_INSECT_HOSTS) +def test_new_insect_hosts_are_experimental_in_registry(cli_alias, file_stem, expected_organism): + registry = load_registry() + hosts = resolve_ref(registry, "parameters.host_profiles") + host = hosts[cli_alias] + assert host["status"] == "experimental", ( + f"{cli_alias}: expected status='experimental', got {host['status']!r}" + ) + assert host["scientific_name"] == expected_organism + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_INSECT_HOSTS) +def test_insect_codon_tables_are_placeholder(cli_alias, file_stem, expected_organism): + payload = load_codon_table(file_stem, get_data_path()) + assert payload["total_cds"] == 0, ( + f"{file_stem}: expected total_cds=0 for placeholder table, got {payload['total_cds']}" + ) + assert payload["total_codons"] == 0, ( + f"{file_stem}: expected total_codons=0 for placeholder table, got {payload['total_codons']}" + ) + assert "PLACEHOLDER" in payload["source"], ( + f"{file_stem}: expected 'PLACEHOLDER' in source field for unvalidated data" + ) + + +def test_sf9_preferred_arg_codon_is_aga(): + payload = load_codon_table("spodoptera_frugiperda", get_data_path()) + assert payload["amino_acids"]["R"]["preferred"] == "AGA", ( + "Sf9: AGA is the hallmark insect Arg codon and should be preferred" + ) + + +def test_tni_preferred_arg_codon_is_aga(): + payload = load_codon_table("trichoplusia_ni", get_data_path()) + assert payload["amino_acids"]["R"]["preferred"] == "AGA", ( + "Tni: AGA is the hallmark insect Arg codon and should be preferred" + ) + + +def test_tni_is_more_at_biased_than_sf9(): + sf9 = load_codon_table("spodoptera_frugiperda", get_data_path()) + tni = load_codon_table("trichoplusia_ni", get_data_path()) + assert tni["gc_content"]["overall"] < sf9["gc_content"]["overall"], ( + "T. ni should have lower GC content than S. frugiperda (stronger AT-bias)" + ) + + +def test_sf9_and_tni_both_have_amino_acids_map(): + for file_stem in ("spodoptera_frugiperda", "trichoplusia_ni"): + payload = load_codon_table(file_stem, get_data_path()) + assert "amino_acids" in payload + assert len(payload["amino_acids"]) == 21 # 20 aa + stop