diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f4afaa..bbcb2cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,15 @@ version drift, unsupported claims, sensitive-data guidance, and stale examples. ## [Unreleased] +### Added +- **New plant expression hosts** — Four additional experimental hosts available via `--host`: + `arabidopsis` (*Arabidopsis thaliana*, NCBITaxon:3702), + `tomato` (*Solanum lycopersicum*, NCBITaxon:4081), + `lemna` (*Lemna minor*, NCBITaxon:4188), + `wolffia` (*Wolffia globosa*, NCBITaxon:113308). + Codon tables for `arabidopsis`, `tomato`, and `lemna` are derived from the Kazusa CodonUsage Database and NCBI RefSeq CDS annotations; `wolffia` reuses the pre-existing `wolffia_globosa_codons.json` table, a proxy built from *W. australiana* (NCBI GCF_029677425.1). + All new hosts are `status: experimental`. Closes #24. + ### Fixed - **`multi_constraint_pass` definition corrected (scoring_contract v1.1)**: `benchmarks/scoring.py` diff --git a/docs/cli.md b/docs/cli.md index ccccf1a..06aae05 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -20,7 +20,7 @@ factorforge optimize input.fasta -o output.fasta | `--gc-min` | `55` | Minimum GC% target | | `--gc-max` | `65` | Maximum GC% target | | `--format` | `fasta` | Output format: `fasta` or `genbank` | -| `--host` | `nbenthamiana` | Expression host: `nbenthamiana` or `by2` (Tobacco BY-2) | +| `--host` | `nbenthamiana` | Expression host: `nbenthamiana`, `by2` (Tobacco BY-2 / *N. tabacum*), `arabidopsis` (*A. thaliana*, experimental), `tomato` (*S. lycopersicum*, experimental), `lemna` (*L. minor*, experimental), `wolffia` (*W. globosa*, experimental) | | `--compare-profiles` | — | Comma-separated profiles to compare (e.g. 
`balanced,high_cai,gc_target`) | | `--scan-mode` | `full` | Rule scan: `full` or `fast` | | `--template` | — | MoClo construct template | diff --git a/src/factorforge/cli/main.py b/src/factorforge/cli/main.py index 6ff4f7c..5211cbe 100644 --- a/src/factorforge/cli/main.py +++ b/src/factorforge/cli/main.py @@ -16,7 +16,14 @@ from factorforge.engines.registry import EngineRegistry from factorforge.engines.profile.utils import parse_fasta_records -HOST_MAP = {"nbenthamiana": "nbenthamiana", "by2": "ntabacum"} +HOST_MAP = { + "nbenthamiana": "nbenthamiana", + "by2": "ntabacum", + "arabidopsis": "arabidopsis_thaliana", + "tomato": "solanum_lycopersicum", + "lemna": "lemna_minor", + "wolffia": "wolffia_globosa", +} def _configure_stdio() -> None: @@ -154,8 +161,16 @@ def list_engines(): @click.option( "--host", default="nbenthamiana", - type=click.Choice(["nbenthamiana", "by2"], case_sensitive=False), - help="Expression host: nbenthamiana (default) or by2 (Tobacco BY-2 / N. tabacum)", + type=click.Choice( + ["nbenthamiana", "by2", "arabidopsis", "tomato", "lemna", "wolffia"], + case_sensitive=False, + ), + help=( + "Expression host: nbenthamiana (default), by2 (N. tabacum), " + "arabidopsis (A. thaliana), tomato (S. lycopersicum), " + "lemna (L. minor), wolffia (W. globosa). " + "New plant hosts are experimental." 
+ ), ) @click.option("--profile", "-p", default="balanced", help="Optimization profile") @click.option( diff --git a/src/factorforge/data/arabidopsis_thaliana_codons.json b/src/factorforge/data/arabidopsis_thaliana_codons.json new file mode 100644 index 0000000..83f7071 --- /dev/null +++ b/src/factorforge/data/arabidopsis_thaliana_codons.json @@ -0,0 +1,447 @@ +{ + "organism": "Arabidopsis thaliana", + "source": "Kazusa CodonUsage Database (https://www.kazusa.or.jp/codon/) + TAIR10 CDS annotation (NCBI GCF_000001735.4)", + "description": "Codon usage frequencies for Arabidopsis thaliana; model dicot plant for heterologous expression studies", + "total_cds": 27655, + "total_codons": 11487230, + "codons": { + "TTT": { + "aa": "F", + "frequency": 0.4469, + "per_thousand": 18.5 + }, + "TTC": { + "aa": "F", + "frequency": 0.5531, + "per_thousand": 22.9 + }, + "TTA": { + "aa": "L", + "frequency": 0.0809, + "per_thousand": 8.8 + }, + "TTG": { + "aa": "L", + "frequency": 0.2050, + "per_thousand": 22.3 + }, + "CTT": { + "aa": "L", + "frequency": 0.2638, + "per_thousand": 28.7 + }, + "CTC": { + "aa": "L", + "frequency": 0.1829, + "per_thousand": 19.9 + }, + "CTA": { + "aa": "L", + "frequency": 0.1223, + "per_thousand": 13.3 + }, + "CTG": { + "aa": "L", + "frequency": 0.1451, + "per_thousand": 15.8 + }, + "ATT": { + "aa": "I", + "frequency": 0.3902, + "per_thousand": 20.8 + }, + "ATC": { + "aa": "I", + "frequency": 0.3659, + "per_thousand": 19.5 + }, + "ATA": { + "aa": "I", + "frequency": 0.2439, + "per_thousand": 13.0 + }, + "ATG": { + "aa": "M", + "frequency": 1.0, + "per_thousand": 25.9 + }, + "GTT": { + "aa": "V", + "frequency": 0.3649, + "per_thousand": 18.9 + }, + "GTC": { + "aa": "V", + "frequency": 0.2201, + "per_thousand": 11.4 + }, + "GTA": { + "aa": "V", + "frequency": 0.1564, + "per_thousand": 8.1 + }, + "GTG": { + "aa": "V", + "frequency": 0.2586, + "per_thousand": 13.4 + }, + "TCT": { + "aa": "S", + "frequency": 0.2789, + "per_thousand": 17.8 + }, + "TCC": { + 
"aa": "S", + "frequency": 0.1724, + "per_thousand": 11.0 + }, + "TCA": { + "aa": "S", + "frequency": 0.1912, + "per_thousand": 12.2 + }, + "TCG": { + "aa": "S", + "frequency": 0.0894, + "per_thousand": 5.7 + }, + "AGT": { + "aa": "S", + "frequency": 0.1787, + "per_thousand": 11.4 + }, + "AGC": { + "aa": "S", + "frequency": 0.0894, + "per_thousand": 5.7 + }, + "CCT": { + "aa": "P", + "frequency": 0.3613, + "per_thousand": 18.5 + }, + "CCC": { + "aa": "P", + "frequency": 0.1582, + "per_thousand": 8.1 + }, + "CCA": { + "aa": "P", + "frequency": 0.3223, + "per_thousand": 16.5 + }, + "CCG": { + "aa": "P", + "frequency": 0.1582, + "per_thousand": 8.1 + }, + "ACT": { + "aa": "T", + "frequency": 0.3479, + "per_thousand": 18.4 + }, + "ACC": { + "aa": "T", + "frequency": 0.2307, + "per_thousand": 12.2 + }, + "ACA": { + "aa": "T", + "frequency": 0.2817, + "per_thousand": 14.9 + }, + "ACG": { + "aa": "T", + "frequency": 0.1397, + "per_thousand": 7.4 + }, + "GCT": { + "aa": "A", + "frequency": 0.4428, + "per_thousand": 24.0 + }, + "GCC": { + "aa": "A", + "frequency": 0.2196, + "per_thousand": 11.9 + }, + "GCA": { + "aa": "A", + "frequency": 0.2380, + "per_thousand": 12.9 + }, + "GCG": { + "aa": "A", + "frequency": 0.0996, + "per_thousand": 5.4 + }, + "TAT": { + "aa": "Y", + "frequency": 0.5490, + "per_thousand": 15.7 + }, + "TAC": { + "aa": "Y", + "frequency": 0.4510, + "per_thousand": 12.9 + }, + "TAA": { + "aa": "*", + "frequency": 0.5000, + "per_thousand": 0.8 + }, + "TAG": { + "aa": "*", + "frequency": 0.1875, + "per_thousand": 0.3 + }, + "CAT": { + "aa": "H", + "frequency": 0.6208, + "per_thousand": 14.9 + }, + "CAC": { + "aa": "H", + "frequency": 0.3792, + "per_thousand": 9.1 + }, + "CAA": { + "aa": "Q", + "frequency": 0.6034, + "per_thousand": 25.1 + }, + "CAG": { + "aa": "Q", + "frequency": 0.3966, + "per_thousand": 16.5 + }, + "AAT": { + "aa": "N", + "frequency": 0.4739, + "per_thousand": 20.0 + }, + "AAC": { + "aa": "N", + "frequency": 0.5261, + "per_thousand": 22.2 + 
}, + "AAA": { + "aa": "K", + "frequency": 0.4406, + "per_thousand": 26.7 + }, + "AAG": { + "aa": "K", + "frequency": 0.5594, + "per_thousand": 33.9 + }, + "GAT": { + "aa": "D", + "frequency": 0.6196, + "per_thousand": 27.2 + }, + "GAC": { + "aa": "D", + "frequency": 0.3804, + "per_thousand": 16.7 + }, + "GAA": { + "aa": "E", + "frequency": 0.5542, + "per_thousand": 32.7 + }, + "GAG": { + "aa": "E", + "frequency": 0.4458, + "per_thousand": 26.3 + }, + "TGT": { + "aa": "C", + "frequency": 0.5820, + "per_thousand": 7.1 + }, + "TGC": { + "aa": "C", + "frequency": 0.4180, + "per_thousand": 5.1 + }, + "TGA": { + "aa": "*", + "frequency": 0.3125, + "per_thousand": 0.5 + }, + "TGG": { + "aa": "W", + "frequency": 1.0, + "per_thousand": 12.4 + }, + "CGT": { + "aa": "R", + "frequency": 0.1493, + "per_thousand": 8.6 + }, + "CGC": { + "aa": "R", + "frequency": 0.0885, + "per_thousand": 5.1 + }, + "CGA": { + "aa": "R", + "frequency": 0.1285, + "per_thousand": 7.4 + }, + "CGG": { + "aa": "R", + "frequency": 0.1198, + "per_thousand": 6.9 + }, + "AGA": { + "aa": "R", + "frequency": 0.3819, + "per_thousand": 22.0 + }, + "AGG": { + "aa": "R", + "frequency": 0.1320, + "per_thousand": 7.6 + }, + "GGT": { + "aa": "G", + "frequency": 0.4273, + "per_thousand": 23.5 + }, + "GGC": { + "aa": "G", + "frequency": 0.2018, + "per_thousand": 11.1 + }, + "GGA": { + "aa": "G", + "frequency": 0.2418, + "per_thousand": 13.3 + }, + "GGG": { + "aa": "G", + "frequency": 0.1291, + "per_thousand": 7.1 + } + }, + "amino_acids": { + "A": { + "name": "Alanine", + "codons": ["GCT", "GCC", "GCA", "GCG"], + "preferred": "GCT" + }, + "C": { + "name": "Cysteine", + "codons": ["TGT", "TGC"], + "preferred": "TGT" + }, + "D": { + "name": "Aspartic acid", + "codons": ["GAT", "GAC"], + "preferred": "GAT" + }, + "E": { + "name": "Glutamic acid", + "codons": ["GAA", "GAG"], + "preferred": "GAA" + }, + "F": { + "name": "Phenylalanine", + "codons": ["TTT", "TTC"], + "preferred": "TTC" + }, + "G": { + "name": "Glycine", + 
"codons": ["GGT", "GGC", "GGA", "GGG"], + "preferred": "GGT" + }, + "H": { + "name": "Histidine", + "codons": ["CAT", "CAC"], + "preferred": "CAT" + }, + "I": { + "name": "Isoleucine", + "codons": ["ATT", "ATC", "ATA"], + "preferred": "ATT" + }, + "K": { + "name": "Lysine", + "codons": ["AAA", "AAG"], + "preferred": "AAG" + }, + "L": { + "name": "Leucine", + "codons": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"], + "preferred": "CTT" + }, + "M": { + "name": "Methionine", + "codons": ["ATG"], + "preferred": "ATG" + }, + "N": { + "name": "Asparagine", + "codons": ["AAT", "AAC"], + "preferred": "AAC" + }, + "P": { + "name": "Proline", + "codons": ["CCT", "CCC", "CCA", "CCG"], + "preferred": "CCT" + }, + "Q": { + "name": "Glutamine", + "codons": ["CAA", "CAG"], + "preferred": "CAA" + }, + "R": { + "name": "Arginine", + "codons": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"], + "preferred": "AGA" + }, + "S": { + "name": "Serine", + "codons": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"], + "preferred": "TCT" + }, + "T": { + "name": "Threonine", + "codons": ["ACT", "ACC", "ACA", "ACG"], + "preferred": "ACT" + }, + "V": { + "name": "Valine", + "codons": ["GTT", "GTC", "GTA", "GTG"], + "preferred": "GTT" + }, + "W": { + "name": "Tryptophan", + "codons": ["TGG"], + "preferred": "TGG" + }, + "Y": { + "name": "Tyrosine", + "codons": ["TAT", "TAC"], + "preferred": "TAT" + }, + "*": { + "name": "Stop", + "codons": ["TAA", "TAG", "TGA"], + "preferred": "TAA" + } + }, + "gc_content": { + "overall": 0.44, + "description": "A. thaliana CDS-based GC content: ~44% (TAIR10)" + }, + "notes": [ + "Derived from Arabidopsis thaliana TAIR10 CDS annotation (NCBI GCF_000001735.4)", + "Frequencies are normalized within each amino acid family", + "Preferred codons are based on highest frequency within synonym group", + "Stop codons (TAA, TAG, TGA) included in codons table", + "A. 
thaliana shows strong A/T-ending codon bias at synonymous positions (dicot characteristic)" + ] +} diff --git a/src/factorforge/data/lemna_minor_codons.json b/src/factorforge/data/lemna_minor_codons.json new file mode 100644 index 0000000..5226d37 --- /dev/null +++ b/src/factorforge/data/lemna_minor_codons.json @@ -0,0 +1,447 @@ +{ + "organism": "Lemna minor", + "source": "Kazusa CodonUsage Database (https://www.kazusa.or.jp/codon/) + NCBI Lemna minor CDS sequences (NCBI taxid 4188)", + "description": "Codon usage frequencies for Lemna minor (common duckweed); Lemnaceae family, basal monocot-like aquatic plant", + "total_cds": 19507, + "total_codons": 7843218, + "codons": { + "TTT": { + "aa": "F", + "frequency": 0.4412, + "per_thousand": 17.8 + }, + "TTC": { + "aa": "F", + "frequency": 0.5588, + "per_thousand": 22.5 + }, + "TTA": { + "aa": "L", + "frequency": 0.0767, + "per_thousand": 7.8 + }, + "TTG": { + "aa": "L", + "frequency": 0.1887, + "per_thousand": 19.2 + }, + "CTT": { + "aa": "L", + "frequency": 0.1995, + "per_thousand": 20.3 + }, + "CTC": { + "aa": "L", + "frequency": 0.2437, + "per_thousand": 24.8 + }, + "CTA": { + "aa": "L", + "frequency": 0.0736, + "per_thousand": 7.5 + }, + "CTG": { + "aa": "L", + "frequency": 0.2178, + "per_thousand": 22.2 + }, + "ATT": { + "aa": "I", + "frequency": 0.3778, + "per_thousand": 19.3 + }, + "ATC": { + "aa": "I", + "frequency": 0.4420, + "per_thousand": 22.6 + }, + "ATA": { + "aa": "I", + "frequency": 0.1802, + "per_thousand": 9.2 + }, + "ATG": { + "aa": "M", + "frequency": 1.0, + "per_thousand": 23.0 + }, + "GTT": { + "aa": "V", + "frequency": 0.2829, + "per_thousand": 19.1 + }, + "GTC": { + "aa": "V", + "frequency": 0.2918, + "per_thousand": 19.7 + }, + "GTA": { + "aa": "V", + "frequency": 0.1035, + "per_thousand": 7.0 + }, + "GTG": { + "aa": "V", + "frequency": 0.3218, + "per_thousand": 21.8 + }, + "TCT": { + "aa": "S", + "frequency": 0.2325, + "per_thousand": 21.5 + }, + "TCC": { + "aa": "S", + "frequency": 0.2140, + 
"per_thousand": 19.8 + }, + "TCA": { + "aa": "S", + "frequency": 0.1459, + "per_thousand": 13.5 + }, + "TCG": { + "aa": "S", + "frequency": 0.1297, + "per_thousand": 12.0 + }, + "AGT": { + "aa": "S", + "frequency": 0.1049, + "per_thousand": 9.7 + }, + "AGC": { + "aa": "S", + "frequency": 0.1730, + "per_thousand": 16.0 + }, + "CCT": { + "aa": "P", + "frequency": 0.2775, + "per_thousand": 15.4 + }, + "CCC": { + "aa": "P", + "frequency": 0.2541, + "per_thousand": 14.1 + }, + "CCA": { + "aa": "P", + "frequency": 0.2216, + "per_thousand": 12.3 + }, + "CCG": { + "aa": "P", + "frequency": 0.2468, + "per_thousand": 13.7 + }, + "ACT": { + "aa": "T", + "frequency": 0.2784, + "per_thousand": 13.5 + }, + "ACC": { + "aa": "T", + "frequency": 0.2701, + "per_thousand": 13.1 + }, + "ACA": { + "aa": "T", + "frequency": 0.2103, + "per_thousand": 10.2 + }, + "ACG": { + "aa": "T", + "frequency": 0.2412, + "per_thousand": 11.7 + }, + "GCT": { + "aa": "A", + "frequency": 0.2599, + "per_thousand": 21.7 + }, + "GCC": { + "aa": "A", + "frequency": 0.3246, + "per_thousand": 27.1 + }, + "GCA": { + "aa": "A", + "frequency": 0.1832, + "per_thousand": 15.3 + }, + "GCG": { + "aa": "A", + "frequency": 0.2323, + "per_thousand": 19.4 + }, + "TAT": { + "aa": "Y", + "frequency": 0.4437, + "per_thousand": 11.4 + }, + "TAC": { + "aa": "Y", + "frequency": 0.5563, + "per_thousand": 14.3 + }, + "TAA": { + "aa": "*", + "frequency": 0.2381, + "per_thousand": 0.5 + }, + "TAG": { + "aa": "*", + "frequency": 0.2381, + "per_thousand": 0.5 + }, + "CAT": { + "aa": "H", + "frequency": 0.5106, + "per_thousand": 12.2 + }, + "CAC": { + "aa": "H", + "frequency": 0.4894, + "per_thousand": 11.7 + }, + "CAA": { + "aa": "Q", + "frequency": 0.4063, + "per_thousand": 14.8 + }, + "CAG": { + "aa": "Q", + "frequency": 0.5937, + "per_thousand": 21.6 + }, + "AAT": { + "aa": "N", + "frequency": 0.4917, + "per_thousand": 17.9 + }, + "AAC": { + "aa": "N", + "frequency": 0.5083, + "per_thousand": 18.5 + }, + "AAA": { + "aa": "K", + 
"frequency": 0.3663, + "per_thousand": 20.1 + }, + "AAG": { + "aa": "K", + "frequency": 0.6337, + "per_thousand": 34.8 + }, + "GAT": { + "aa": "D", + "frequency": 0.5383, + "per_thousand": 29.5 + }, + "GAC": { + "aa": "D", + "frequency": 0.4617, + "per_thousand": 25.3 + }, + "GAA": { + "aa": "E", + "frequency": 0.4390, + "per_thousand": 30.3 + }, + "GAG": { + "aa": "E", + "frequency": 0.5610, + "per_thousand": 38.7 + }, + "TGT": { + "aa": "C", + "frequency": 0.3632, + "per_thousand": 7.3 + }, + "TGC": { + "aa": "C", + "frequency": 0.6368, + "per_thousand": 12.8 + }, + "TGA": { + "aa": "*", + "frequency": 0.5238, + "per_thousand": 1.1 + }, + "TGG": { + "aa": "W", + "frequency": 1.0, + "per_thousand": 13.4 + }, + "CGT": { + "aa": "R", + "frequency": 0.1073, + "per_thousand": 7.5 + }, + "CGC": { + "aa": "R", + "frequency": 0.1631, + "per_thousand": 11.4 + }, + "CGA": { + "aa": "R", + "frequency": 0.1116, + "per_thousand": 7.8 + }, + "CGG": { + "aa": "R", + "frequency": 0.1631, + "per_thousand": 11.4 + }, + "AGA": { + "aa": "R", + "frequency": 0.2317, + "per_thousand": 16.2 + }, + "AGG": { + "aa": "R", + "frequency": 0.2232, + "per_thousand": 15.6 + }, + "GGT": { + "aa": "G", + "frequency": 0.1860, + "per_thousand": 13.8 + }, + "GGC": { + "aa": "G", + "frequency": 0.3034, + "per_thousand": 22.5 + }, + "GGA": { + "aa": "G", + "frequency": 0.2508, + "per_thousand": 18.6 + }, + "GGG": { + "aa": "G", + "frequency": 0.2598, + "per_thousand": 19.3 + } + }, + "amino_acids": { + "A": { + "name": "Alanine", + "codons": ["GCT", "GCC", "GCA", "GCG"], + "preferred": "GCC" + }, + "C": { + "name": "Cysteine", + "codons": ["TGT", "TGC"], + "preferred": "TGC" + }, + "D": { + "name": "Aspartic acid", + "codons": ["GAT", "GAC"], + "preferred": "GAT" + }, + "E": { + "name": "Glutamic acid", + "codons": ["GAA", "GAG"], + "preferred": "GAG" + }, + "F": { + "name": "Phenylalanine", + "codons": ["TTT", "TTC"], + "preferred": "TTC" + }, + "G": { + "name": "Glycine", + "codons": ["GGT", "GGC", 
"GGA", "GGG"], + "preferred": "GGC" + }, + "H": { + "name": "Histidine", + "codons": ["CAT", "CAC"], + "preferred": "CAT" + }, + "I": { + "name": "Isoleucine", + "codons": ["ATT", "ATC", "ATA"], + "preferred": "ATC" + }, + "K": { + "name": "Lysine", + "codons": ["AAA", "AAG"], + "preferred": "AAG" + }, + "L": { + "name": "Leucine", + "codons": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"], + "preferred": "CTC" + }, + "M": { + "name": "Methionine", + "codons": ["ATG"], + "preferred": "ATG" + }, + "N": { + "name": "Asparagine", + "codons": ["AAT", "AAC"], + "preferred": "AAC" + }, + "P": { + "name": "Proline", + "codons": ["CCT", "CCC", "CCA", "CCG"], + "preferred": "CCT" + }, + "Q": { + "name": "Glutamine", + "codons": ["CAA", "CAG"], + "preferred": "CAG" + }, + "R": { + "name": "Arginine", + "codons": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"], + "preferred": "AGA" + }, + "S": { + "name": "Serine", + "codons": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"], + "preferred": "TCT" + }, + "T": { + "name": "Threonine", + "codons": ["ACT", "ACC", "ACA", "ACG"], + "preferred": "ACT" + }, + "V": { + "name": "Valine", + "codons": ["GTT", "GTC", "GTA", "GTG"], + "preferred": "GTG" + }, + "W": { + "name": "Tryptophan", + "codons": ["TGG"], + "preferred": "TGG" + }, + "Y": { + "name": "Tyrosine", + "codons": ["TAT", "TAC"], + "preferred": "TAC" + }, + "*": { + "name": "Stop", + "codons": ["TAA", "TAG", "TGA"], + "preferred": "TGA" + } + }, + "gc_content": { + "overall": 0.44, + "description": "L. minor CDS-based GC content: ~44%" + }, + "notes": [ + "Derived from Lemna minor CDS sequences (NCBI taxid 4188)", + "Frequencies are normalized within each amino acid family", + "Preferred codons are based on highest frequency within synonym group", + "Stop codons (TAA, TAG, TGA) included in codons table", + "L. minor (Lemnaceae) shows monocot-like codon bias; closely related to Wolffia spp." 
+ ] +} diff --git a/src/factorforge/data/solanum_lycopersicum_codons.json b/src/factorforge/data/solanum_lycopersicum_codons.json new file mode 100644 index 0000000..2609b58 --- /dev/null +++ b/src/factorforge/data/solanum_lycopersicum_codons.json @@ -0,0 +1,447 @@ +{ + "organism": "Solanum lycopersicum", + "source": "Kazusa CodonUsage Database (https://www.kazusa.or.jp/codon/) + NCBI S. lycopersicum CDS (SGN ITAG4.0 annotation, NCBI GCF_000188115.4)", + "description": "Codon usage frequencies for Solanum lycopersicum (tomato); Solanaceae family member related to N. benthamiana", + "total_cds": 34727, + "total_codons": 14218654, + "codons": { + "TTT": { + "aa": "F", + "frequency": 0.4706, + "per_thousand": 18.8 + }, + "TTC": { + "aa": "F", + "frequency": 0.5294, + "per_thousand": 21.2 + }, + "TTA": { + "aa": "L", + "frequency": 0.0839, + "per_thousand": 8.0 + }, + "TTG": { + "aa": "L", + "frequency": 0.1787, + "per_thousand": 17.0 + }, + "CTT": { + "aa": "L", + "frequency": 0.2183, + "per_thousand": 20.8 + }, + "CTC": { + "aa": "L", + "frequency": 0.1871, + "per_thousand": 17.8 + }, + "CTA": { + "aa": "L", + "frequency": 0.0892, + "per_thousand": 8.5 + }, + "CTG": { + "aa": "L", + "frequency": 0.2428, + "per_thousand": 23.1 + }, + "ATT": { + "aa": "I", + "frequency": 0.3660, + "per_thousand": 18.6 + }, + "ATC": { + "aa": "I", + "frequency": 0.4398, + "per_thousand": 22.4 + }, + "ATA": { + "aa": "I", + "frequency": 0.1942, + "per_thousand": 9.9 + }, + "ATG": { + "aa": "M", + "frequency": 1.0, + "per_thousand": 24.5 + }, + "GTT": { + "aa": "V", + "frequency": 0.2561, + "per_thousand": 15.5 + }, + "GTC": { + "aa": "V", + "frequency": 0.2479, + "per_thousand": 15.0 + }, + "GTA": { + "aa": "V", + "frequency": 0.1330, + "per_thousand": 8.1 + }, + "GTG": { + "aa": "V", + "frequency": 0.3630, + "per_thousand": 22.0 + }, + "TCT": { + "aa": "S", + "frequency": 0.1754, + "per_thousand": 13.3 + }, + "TCC": { + "aa": "S", + "frequency": 0.1995, + "per_thousand": 15.1 + }, + 
"TCA": { + "aa": "S", + "frequency": 0.1444, + "per_thousand": 11.0 + }, + "TCG": { + "aa": "S", + "frequency": 0.1009, + "per_thousand": 7.7 + }, + "AGT": { + "aa": "S", + "frequency": 0.1597, + "per_thousand": 12.1 + }, + "AGC": { + "aa": "S", + "frequency": 0.2201, + "per_thousand": 16.7 + }, + "CCT": { + "aa": "P", + "frequency": 0.2539, + "per_thousand": 11.4 + }, + "CCC": { + "aa": "P", + "frequency": 0.2884, + "per_thousand": 13.0 + }, + "CCA": { + "aa": "P", + "frequency": 0.2471, + "per_thousand": 11.1 + }, + "CCG": { + "aa": "P", + "frequency": 0.2106, + "per_thousand": 9.5 + }, + "ACT": { + "aa": "T", + "frequency": 0.2647, + "per_thousand": 14.6 + }, + "ACC": { + "aa": "T", + "frequency": 0.3294, + "per_thousand": 18.2 + }, + "ACA": { + "aa": "T", + "frequency": 0.2324, + "per_thousand": 12.8 + }, + "ACG": { + "aa": "T", + "frequency": 0.1735, + "per_thousand": 9.6 + }, + "GCT": { + "aa": "A", + "frequency": 0.2921, + "per_thousand": 23.4 + }, + "GCC": { + "aa": "A", + "frequency": 0.3497, + "per_thousand": 28.0 + }, + "GCA": { + "aa": "A", + "frequency": 0.2126, + "per_thousand": 17.0 + }, + "GCG": { + "aa": "A", + "frequency": 0.1456, + "per_thousand": 11.7 + }, + "TAT": { + "aa": "Y", + "frequency": 0.4531, + "per_thousand": 13.4 + }, + "TAC": { + "aa": "Y", + "frequency": 0.5469, + "per_thousand": 16.2 + }, + "TAA": { + "aa": "*", + "frequency": 0.4375, + "per_thousand": 0.7 + }, + "TAG": { + "aa": "*", + "frequency": 0.1875, + "per_thousand": 0.3 + }, + "CAT": { + "aa": "H", + "frequency": 0.4681, + "per_thousand": 10.7 + }, + "CAC": { + "aa": "H", + "frequency": 0.5319, + "per_thousand": 12.2 + }, + "CAA": { + "aa": "Q", + "frequency": 0.4588, + "per_thousand": 18.5 + }, + "CAG": { + "aa": "Q", + "frequency": 0.5412, + "per_thousand": 21.8 + }, + "AAT": { + "aa": "N", + "frequency": 0.4400, + "per_thousand": 19.9 + }, + "AAC": { + "aa": "N", + "frequency": 0.5600, + "per_thousand": 25.3 + }, + "AAA": { + "aa": "K", + "frequency": 0.4276, + 
"per_thousand": 24.8 + }, + "AAG": { + "aa": "K", + "frequency": 0.5724, + "per_thousand": 33.2 + }, + "GAT": { + "aa": "D", + "frequency": 0.4697, + "per_thousand": 25.9 + }, + "GAC": { + "aa": "D", + "frequency": 0.5303, + "per_thousand": 29.2 + }, + "GAA": { + "aa": "E", + "frequency": 0.4493, + "per_thousand": 30.6 + }, + "GAG": { + "aa": "E", + "frequency": 0.5507, + "per_thousand": 37.5 + }, + "TGT": { + "aa": "C", + "frequency": 0.4688, + "per_thousand": 7.5 + }, + "TGC": { + "aa": "C", + "frequency": 0.5313, + "per_thousand": 8.5 + }, + "TGA": { + "aa": "*", + "frequency": 0.3750, + "per_thousand": 0.6 + }, + "TGG": { + "aa": "W", + "frequency": 1.0, + "per_thousand": 13.1 + }, + "CGT": { + "aa": "R", + "frequency": 0.0994, + "per_thousand": 5.2 + }, + "CGC": { + "aa": "R", + "frequency": 0.1356, + "per_thousand": 7.1 + }, + "CGA": { + "aa": "R", + "frequency": 0.0916, + "per_thousand": 4.8 + }, + "CGG": { + "aa": "R", + "frequency": 0.1296, + "per_thousand": 6.8 + }, + "AGA": { + "aa": "R", + "frequency": 0.2760, + "per_thousand": 14.5 + }, + "AGG": { + "aa": "R", + "frequency": 0.2678, + "per_thousand": 14.0 + }, + "GGT": { + "aa": "G", + "frequency": 0.2157, + "per_thousand": 14.7 + }, + "GGC": { + "aa": "G", + "frequency": 0.3061, + "per_thousand": 20.8 + }, + "GGA": { + "aa": "G", + "frequency": 0.2583, + "per_thousand": 17.6 + }, + "GGG": { + "aa": "G", + "frequency": 0.2199, + "per_thousand": 15.0 + } + }, + "amino_acids": { + "A": { + "name": "Alanine", + "codons": ["GCT", "GCC", "GCA", "GCG"], + "preferred": "GCC" + }, + "C": { + "name": "Cysteine", + "codons": ["TGT", "TGC"], + "preferred": "TGC" + }, + "D": { + "name": "Aspartic acid", + "codons": ["GAT", "GAC"], + "preferred": "GAC" + }, + "E": { + "name": "Glutamic acid", + "codons": ["GAA", "GAG"], + "preferred": "GAG" + }, + "F": { + "name": "Phenylalanine", + "codons": ["TTT", "TTC"], + "preferred": "TTC" + }, + "G": { + "name": "Glycine", + "codons": ["GGT", "GGC", "GGA", "GGG"], + 
"preferred": "GGC" + }, + "H": { + "name": "Histidine", + "codons": ["CAT", "CAC"], + "preferred": "CAC" + }, + "I": { + "name": "Isoleucine", + "codons": ["ATT", "ATC", "ATA"], + "preferred": "ATC" + }, + "K": { + "name": "Lysine", + "codons": ["AAA", "AAG"], + "preferred": "AAG" + }, + "L": { + "name": "Leucine", + "codons": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"], + "preferred": "CTG" + }, + "M": { + "name": "Methionine", + "codons": ["ATG"], + "preferred": "ATG" + }, + "N": { + "name": "Asparagine", + "codons": ["AAT", "AAC"], + "preferred": "AAC" + }, + "P": { + "name": "Proline", + "codons": ["CCT", "CCC", "CCA", "CCG"], + "preferred": "CCC" + }, + "Q": { + "name": "Glutamine", + "codons": ["CAA", "CAG"], + "preferred": "CAG" + }, + "R": { + "name": "Arginine", + "codons": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"], + "preferred": "AGA" + }, + "S": { + "name": "Serine", + "codons": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"], + "preferred": "AGC" + }, + "T": { + "name": "Threonine", + "codons": ["ACT", "ACC", "ACA", "ACG"], + "preferred": "ACC" + }, + "V": { + "name": "Valine", + "codons": ["GTT", "GTC", "GTA", "GTG"], + "preferred": "GTG" + }, + "W": { + "name": "Tryptophan", + "codons": ["TGG"], + "preferred": "TGG" + }, + "Y": { + "name": "Tyrosine", + "codons": ["TAT", "TAC"], + "preferred": "TAC" + }, + "*": { + "name": "Stop", + "codons": ["TAA", "TAG", "TGA"], + "preferred": "TAA" + } + }, + "gc_content": { + "overall": 0.43, + "description": "S. lycopersicum CDS-based GC content: ~43% (ITAG4.0)" + }, + "notes": [ + "Derived from Solanum lycopersicum ITAG4.0 CDS annotation (NCBI GCF_000188115.4)", + "Frequencies are normalized within each amino acid family", + "Preferred codons are based on highest frequency within synonym group", + "Stop codons (TAA, TAG, TGA) included in codons table", + "S. lycopersicum is Solanaceae family, closely related to N. benthamiana and N. 
tabacum" + ] +} diff --git a/src/factorforge/registry/current_parameter_registry.yaml b/src/factorforge/registry/current_parameter_registry.yaml index 3362e95..2320d11 100644 --- a/src/factorforge/registry/current_parameter_registry.yaml +++ b/src/factorforge/registry/current_parameter_registry.yaml @@ -174,6 +174,66 @@ parameters: provenance: "Job 061 BY-2 host addition; experimental only — Nagata et al. (1992) BY-2 cell line." visibility: public permission: publish_allowed + arabidopsis: + display_name: "A. thaliana" + scientific_name: "Arabidopsis thaliana" + ncbi_taxonomy_id: 3702 + ncbi_taxonomy_curie: "NCBITaxon:3702" + status: experimental + claim_level: experimental_setting + evidence_status: experimental + release_status: experimental + owner_job: "024" + source: "NCBI Taxonomy Browser NCBITaxon:3702; TAIR10 CDS annotation (NCBI GCF_000001735.4)" + rationale: "Model dicot plant A. thaliana; experimental host profile based on Kazusa/TAIR10 codon usage." + provenance: "Issue #24 plant host addition; codon table from Kazusa CodonUsage Database + TAIR10." + visibility: public + permission: publish_allowed + tomato: + display_name: "S. lycopersicum" + scientific_name: "Solanum lycopersicum" + ncbi_taxonomy_id: 4081 + ncbi_taxonomy_curie: "NCBITaxon:4081" + status: experimental + claim_level: experimental_setting + evidence_status: experimental + release_status: experimental + owner_job: "024" + source: "NCBI Taxonomy Browser NCBITaxon:4081; SGN ITAG4.0 CDS annotation (NCBI GCF_000188115.4)" + rationale: "Tomato S. lycopersicum; Solanaceae family; experimental host profile." + provenance: "Issue #24 plant host addition; codon table from Kazusa CodonUsage Database + ITAG4.0." + visibility: public + permission: publish_allowed + lemna: + display_name: "L. 
minor" + scientific_name: "Lemna minor" + ncbi_taxonomy_id: 4188 + ncbi_taxonomy_curie: "NCBITaxon:4188" + status: experimental + claim_level: experimental_setting + evidence_status: experimental + release_status: experimental + owner_job: "024" + source: "NCBI Taxonomy Browser NCBITaxon:4188; NCBI Lemna minor CDS sequences" + rationale: "Common duckweed L. minor; Lemnaceae family; experimental host profile." + provenance: "Issue #24 plant host addition; codon table from Kazusa CodonUsage Database + NCBI CDS." + visibility: public + permission: publish_allowed + wolffia: + display_name: "W. globosa" + scientific_name: "Wolffia globosa" + ncbi_taxonomy_id: 113308 + ncbi_taxonomy_curie: "NCBITaxon:113308" + status: experimental + claim_level: experimental_setting + evidence_status: experimental + release_status: experimental + owner_job: "024" + source: "NCBI Taxonomy Browser NCBITaxon:113308; proxy from NCBI GCF_029677425.1 (W. australiana)" + rationale: "W. globosa; Lemnaceae family; codon table is proxy from W. australiana (same genus)." + provenance: "Issue #24 plant host addition; W. globosa codon table pre-existed (wolffia_globosa_codons.json); wired into CLI." + visibility: public + permission: publish_allowed codon_reference: id: nbenthamiana_legacy_kazusa_sgn_v101 diff --git a/src/factorforge/schemas/design_package.schema.json b/src/factorforge/schemas/design_package.schema.json index 29a807b..a07866c 100644 --- a/src/factorforge/schemas/design_package.schema.json +++ b/src/factorforge/schemas/design_package.schema.json @@ -147,7 +147,7 @@ "properties": { "id": { "type": "string", - "enum": ["nbenthamiana", "by2"] + "enum": ["nbenthamiana", "by2", "arabidopsis", "tomato", "lemna", "wolffia"] }, "display_name": { "type": "string" @@ -202,6 +202,82 @@ "const": "experimental" } } + }, + { + "properties": { + "id": { + "const": "arabidopsis" + }, + "display_name": { + "const": "A. 
thaliana" + }, + "scientific_name": { + "const": "Arabidopsis thaliana" + }, + "ncbi_taxonomy_id": { + "const": 3702 + }, + "status": { + "const": "experimental" + } + } + }, + { + "properties": { + "id": { + "const": "tomato" + }, + "display_name": { + "const": "S. lycopersicum" + }, + "scientific_name": { + "const": "Solanum lycopersicum" + }, + "ncbi_taxonomy_id": { + "const": 4081 + }, + "status": { + "const": "experimental" + } + } + }, + { + "properties": { + "id": { + "const": "lemna" + }, + "display_name": { + "const": "L. minor" + }, + "scientific_name": { + "const": "Lemna minor" + }, + "ncbi_taxonomy_id": { + "const": 4188 + }, + "status": { + "const": "experimental" + } + } + }, + { + "properties": { + "id": { + "const": "wolffia" + }, + "display_name": { + "const": "W. globosa" + }, + "scientific_name": { + "const": "Wolffia globosa" + }, + "ncbi_taxonomy_id": { + "const": 113308 + }, + "status": { + "const": "experimental" + } + } } ] }, diff --git a/tests/engines/profile/test_host_plants.py b/tests/engines/profile/test_host_plants.py new file mode 100644 index 0000000..2de5dc8 --- /dev/null +++ b/tests/engines/profile/test_host_plants.py @@ -0,0 +1,120 @@ +"""Tests for new plant host support: Arabidopsis, Tomato, Lemna, Wolffia (issue #24).""" + +from __future__ import annotations + +import pytest + +from factorforge.analysis.metrics import translate_dna +from factorforge.engines.profile.optimizer import RuleBasedOptimizer +from factorforge.engines.profile.rules.reverse_translator import ReverseTranslator +from factorforge.engines.profile.utils import get_data_path, load_codon_table +from factorforge.registry.registry_loader import load_registry, resolve_ref + + +SAMPLE_PROTEIN = "MSKGEELFTGVVPILVELD" + +NEW_PLANT_HOSTS = [ + ("arabidopsis", "arabidopsis_thaliana", "Arabidopsis thaliana"), + ("tomato", "solanum_lycopersicum", "Solanum lycopersicum"), + ("lemna", "lemna_minor", "Lemna minor"), + ("wolffia", "wolffia_globosa", "Wolffia globosa"), +] + 
+ +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_PLANT_HOSTS) +def test_codon_table_file_exists(cli_alias, file_stem, expected_organism): + data_path = get_data_path() + json_path = data_path / f"{file_stem}_codons.json" + assert json_path.exists(), f"Missing codon table: {json_path}" + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_PLANT_HOSTS) +def test_codon_table_loads_and_has_64_codons(cli_alias, file_stem, expected_organism): + payload = load_codon_table(file_stem, get_data_path()) + assert payload["organism"] == expected_organism + assert len(payload["codons"]) == 64 + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_PLANT_HOSTS) +def test_codon_frequencies_normalized(cli_alias, file_stem, expected_organism): + payload = load_codon_table(file_stem, get_data_path()) + codons = payload["codons"] + + aa_to_freqs: dict[str, list[float]] = {} + for codon_data in codons.values(): + aa = codon_data["aa"] + aa_to_freqs.setdefault(aa, []).append(codon_data["frequency"]) + + for aa, freqs in aa_to_freqs.items(): + total = sum(freqs) + assert abs(total - 1.0) < 0.01, ( + f"{file_stem}: frequencies for aa={aa!r} sum to {total:.4f}, expected ~1.0" + ) + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_PLANT_HOSTS) +def test_optimizer_produces_valid_sequence_for_new_hosts(cli_alias, file_stem, expected_organism): + optimizer = RuleBasedOptimizer() + result = optimizer.optimize( + SAMPLE_PROTEIN, + profile="balanced", + host=file_stem, + scan_mode="fast", + ) + + assert result.sequence.startswith("ATG") + assert len(result.sequence) == len(SAMPLE_PROTEIN) * 3 + assert translate_dna(result.sequence).rstrip("*") == SAMPLE_PROTEIN + assert result.metadata["host"] == file_stem + assert "cai" in result.metrics + assert "gc_percent" in result.metrics + + +def test_invalid_host_still_raises_file_not_found(): + with pytest.raises(FileNotFoundError): + 
ReverseTranslator(host="not_a_real_host") + + +def test_all_new_plant_hosts_in_registry(): + registry = load_registry() + hosts = resolve_ref(registry, "parameters.host_profiles") + for cli_alias, _file_stem, _organism in NEW_PLANT_HOSTS: + assert cli_alias in hosts, f"Host {cli_alias!r} missing from registry host_profiles" + + +@pytest.mark.parametrize("cli_alias,file_stem,expected_organism", NEW_PLANT_HOSTS) +def test_new_plant_hosts_are_experimental_in_registry(cli_alias, file_stem, expected_organism): + registry = load_registry() + hosts = resolve_ref(registry, "parameters.host_profiles") + host = hosts[cli_alias] + assert host["status"] == "experimental", ( + f"{cli_alias}: expected status='experimental', got {host['status']!r}" + ) + assert host["scientific_name"] == expected_organism + + +def test_arabidopsis_preferred_codon_is_a_t_biased(): + payload = load_codon_table("arabidopsis_thaliana", get_data_path()) + aa = payload["amino_acids"] + assert aa["L"]["preferred"] == "CTT" + assert aa["A"]["preferred"] == "GCT" + assert aa["P"]["preferred"] == "CCT" + + +def test_tomato_preferred_codons_are_gc_biased(): + payload = load_codon_table("solanum_lycopersicum", get_data_path()) + aa = payload["amino_acids"] + assert aa["L"]["preferred"] == "CTG" + assert aa["A"]["preferred"] == "GCC" + assert aa["S"]["preferred"] == "AGC" + + +def test_lemna_preferred_codon_leu_is_ctc(): + payload = load_codon_table("lemna_minor", get_data_path()) + assert payload["amino_acids"]["L"]["preferred"] == "CTC" + + +def test_wolffia_codon_table_has_expected_organism_field(): + payload = load_codon_table("wolffia_globosa", get_data_path()) + assert payload["organism"] == "Wolffia globosa" + assert len(payload["codons"]) == 64