From 66926deede82831ad2d2a75e16309b1e885d247e Mon Sep 17 00:00:00 2001 From: Pawel-N-pl Date: Sun, 7 Jun 2026 10:02:03 +0200 Subject: [PATCH 1/2] fix(build-state-doc): write frontmatter scalars as UTF-8 so non-ASCII round-trips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lambda repls (#18) stopped build-state-doc crashing on non-ASCII, but the values are still emitted with json.dumps' default ensure_ascii=True, i.e. as "\uXXXX" escapes. The frontmatter reader (unquote_scalar) only strips the surrounding quotes — it does not JSON-decode — so those escapes survive a write->parse cycle as the literal text "\uXXXX". On a non-English project (epicName / customInstructions / aiCommand with an em-dash or Polish letters) every such scalar reads back corrupted, e.g. aiCommand "claude — go" parses as 'claude — go'. Fix: route value rendering through a small _yaml_value() helper that dumps with ensure_ascii=False, so the document carries real UTF-8 that quote-stripping recovers intact. Also convert the remaining agentConfig block re.sub from a raw-string repl to a callable one — non-ASCII in an agent/model id is unlikely, but a backslash (a "\g"- or "\b"-looking token) is realistic and would raise re.error: bad escape or corrupt into a backspace, the same hazard the scalar sites already guard against. Tests: extend test_replacement_unicode with frontmatter round-trip assertions (the existing tests assertIn on the whole document, which passes via the body {{token}} even when the frontmatter line is escaped) and an agentConfig case covering non-ASCII + backslash content. --- .../src/story_automator/commands/state.py | 35 ++++++++++++------- tests/test_replacement_unicode.py | 34 ++++++++++++++++++ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/skills/bmad-story-automator/src/story_automator/commands/state.py b/skills/bmad-story-automator/src/story_automator/commands/state.py index 38990141..e9aeea16 100644 --- a/skills/bmad-story-automator/src/story_automator/commands/state.py +++ b/skills/bmad-story-automator/src/story_automator/commands/state.py @@ -11,6 +11,14 @@ from ..core.utils import count_matches, ensure_dir, file_exists, get_project_root, now_utc, now_utc_z, read_text, write_json +def _yaml_value(value: Any) -> str: + # Emit non-ASCII as raw UTF-8, not \uXXXX escapes. These values are read back + # with unquote_scalar (frontmatter.py), which strips the surrounding quotes + # WITHOUT JSON-decoding, so an ensure_ascii escape would round-trip as the + # literal text "\uXXXX" and corrupt the value. + return json.dumps(value, ensure_ascii=False) + + def cmd_build_state_doc(args: list[str]) -> int: template = "" output_folder = "" @@ -76,7 +84,7 @@ def cmd_build_state_doc(args: list[str]) -> int: f" maxParallel: {int(overrides.get('maxParallel', 1) or 1)}\n", text, ) - custom_instructions = json.dumps(config.get("customInstructions", "")) + custom_instructions = _yaml_value(config.get("customInstructions", "")) text = re.sub(r"(?m)^customInstructions:.*$", lambda m: f"customInstructions: {custom_instructions}", text) agent_config = config.get("agentConfig") if isinstance(agent_config, dict): @@ -97,8 +105,8 @@ def cmd_build_state_doc(args: list[str]) -> int: lines = [ "agentConfig:", - f" defaultPrimary: {json.dumps(default_primary)}", - f" defaultFallback: {json.dumps(default_fallback)}", + f" defaultPrimary: {_yaml_value(default_primary)}", + f" defaultFallback: {_yaml_value(default_fallback)}", ] # Model serialization preserves three states so round-trips through # `_load_agent_config_from_state` + `resolve_agent` keep the same @@ -113,7 +121,7 @@ def cmd_build_state_doc(args: list[str]) -> int: # motivated this — without preserving the explicit clear, retro/dev # tasks silently re-inherited `defaultModel` after persistence. if "defaultModel" in agent_config: - lines.append(f" defaultModel: {json.dumps(_model_or_none(agent_config.get('defaultModel')))}") + lines.append(f" defaultModel: {_yaml_value(_model_or_none(agent_config.get('defaultModel')))}") if isinstance(per_task, dict) and per_task: lines.append(" perTask:") for task in sorted(per_task): @@ -122,12 +130,12 @@ def cmd_build_state_doc(args: list[str]) -> int: continue lines.append(f" {task}:") if "primary" in entry: - lines.append(f" primary: {json.dumps(entry['primary'])}") + lines.append(f" primary: {_yaml_value(entry['primary'])}") if "fallback" in entry: value = entry["fallback"] - lines.append(f" fallback: {'false' if value is False else json.dumps(value)}") + lines.append(f" fallback: {'false' if value is False else _yaml_value(value)}") if "model" in entry: - lines.append(f" model: {json.dumps(_model_or_none(entry.get('model')))}") + lines.append(f" model: {_yaml_value(_model_or_none(entry.get('model')))}") complexity_overrides = agent_config.get("complexityOverrides", {}) if isinstance(complexity_overrides, dict) and complexity_overrides: lines.append(" complexityOverrides:") @@ -142,16 +150,19 @@ def cmd_build_state_doc(args: list[str]) -> int: continue lines.append(f" {task}:") if "primary" in entry: - lines.append(f" primary: {json.dumps(entry['primary'])}") + lines.append(f" primary: {_yaml_value(entry['primary'])}") if "fallback" in entry: value = entry["fallback"] - lines.append(f" fallback: {'false' if value is False else json.dumps(value)}") + lines.append(f" fallback: {'false' if value is False else _yaml_value(value)}") if "model" in entry: - lines.append(f" model: {json.dumps(_model_or_none(entry.get('model')))}") + lines.append(f" model: {_yaml_value(_model_or_none(entry.get('model')))}") block = "\n".join(lines) + "\n" - text = re.sub(r"(?m)^agentConfig:\n(?:(?:\s{2}.*\n)*)", block, text) + # Callable repl: the block is inserted verbatim, never parsed for \g / \1 + # backrefs, so a "\uXXXX" or backslash in an agent/model name can't raise + # re.error: bad escape (the same hazard fixed below for scalar values). + text = re.sub(r"(?m)^agentConfig:\n(?:(?:\s{2}.*\n)*)", lambda m: block, text) for key, value in replacements.items(): - text = re.sub(rf"(?m)^{re.escape(key)}:.*$", lambda m, k=key, v=value: f"{k}: {json.dumps(v)}", text) + text = re.sub(rf"(?m)^{re.escape(key)}:.*$", lambda m, k=key, v=value: f"{k}: {_yaml_value(v)}", text) story_range = [item for item in config.get("storyRange", []) if isinstance(item, str)] progress_rows = "\n".join(f"| {story_id} | ⏳ | ⏳ | ⏳ | ⏳ | ⏳ | pending |" for story_id in story_range) body = { diff --git a/tests/test_replacement_unicode.py b/tests/test_replacement_unicode.py index e5286dbd..59827908 100644 --- a/tests/test_replacement_unicode.py +++ b/tests/test_replacement_unicode.py @@ -19,6 +19,7 @@ from story_automator.commands.orchestrator import cmd_orchestrator_helper from story_automator.commands.state import cmd_build_state_doc +from story_automator.core.frontmatter import parse_simple_frontmatter REPO_ROOT = Path(__file__).resolve().parents[1] @@ -111,6 +112,39 @@ def test_replacement_value_with_backslash_in_string(self) -> None: text = state_file.read_text(encoding="utf-8") self.assertIn(r"\to", text) + def test_frontmatter_scalars_round_trip_non_ascii(self) -> None: + # Regression: scalars must survive a write->parse cycle. The frontmatter + # reader (unquote_scalar) strips quotes without JSON-decoding, so values + # have to be written as raw UTF-8 — an ensure_ascii "\uXXXX" escape would + # read back as literal "\uXXXX" text. assertIn on the whole document does + # NOT catch this because the body {{token}} carries the real char. + config = self._default_config() + config["epicName"] = "upmon — Epic 3 obsłuż" + config["aiCommand"] = "claude — go" + config["customInstructions"] = "run probe — ciężki ż" + fields = parse_simple_frontmatter(self._build_state(config).read_text(encoding="utf-8")) + self.assertEqual(fields.get("epicName"), "upmon — Epic 3 obsłuż") + self.assertEqual(fields.get("aiCommand"), "claude — go") + self.assertEqual(fields.get("customInstructions"), "run probe — ciężki ż") + + def test_agent_config_special_chars_do_not_break_resub(self) -> None: + # The agentConfig block is spliced in with re.sub too. A non-ASCII model + # id is unlikely, but a backslash in a model/agent string is realistic + # (e.g. a "\g"- or "\b"-looking token) — as a raw repl it would raise + # re.error: bad escape or corrupt "\b" into a backspace. Both must + # survive verbatim and never become a "\uXXXX" escape. + config = self._default_config() + config["agentConfig"] = { + "defaultPrimary": "claude", + "defaultFallback": "codex", + "perTask": {"dev": {"primary": r"agent\g<0>", "model": "opus — żółw"}}, + } + # _build_state returning a path at all proves the agentConfig re.sub did + # not raise re.error on the "\g<0>" backslash content. + text = self._build_state(config).read_text(encoding="utf-8") + self.assertIn("opus — żółw", text) + self.assertNotIn(r"\u", text) + class StateUpdateUnicodeTests(_FixtureMixin, unittest.TestCase): def test_state_update_with_unicode_value(self) -> None: From 605554fd969ac070b2706084c8be934403b9366e Mon Sep 17 00:00:00 2001 From: Pawel-N-pl Date: Sun, 7 Jun 2026 10:11:58 +0200 Subject: [PATCH 2/2] fix(build-state-doc): force UTF-8 when writing the state doc + pin backslash test Addresses review feedback on the non-ASCII serialization change: - output_path.write_text(text) had no explicit encoding. Now that the document carries raw UTF-8 (not \uXXXX escapes), the bare write_text would use the platform default locale and could raise UnicodeEncodeError / mojibake on a non-UTF-8 locale (Windows cp1252, POSIX C). Pass encoding="utf-8", matching the repo's read_text helper. (Augment medium / CodeRabbit major.) - Strengthen the agentConfig regression test: assert the backslash value is emitted verbatim ("agent\\g<0>", JSON-doubled) so the test pins the value survives the splice, not merely that re.sub didn't raise. (CodeRabbit nitpick.) --- .../bmad-story-automator/src/story_automator/commands/state.py | 2 +- tests/test_replacement_unicode.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/skills/bmad-story-automator/src/story_automator/commands/state.py b/skills/bmad-story-automator/src/story_automator/commands/state.py index e9aeea16..af0baaf2 100644 --- a/skills/bmad-story-automator/src/story_automator/commands/state.py +++ b/skills/bmad-story-automator/src/story_automator/commands/state.py @@ -177,7 +177,7 @@ def cmd_build_state_doc(args: list[str]) -> int: for key, value in body.items(): text = text.replace(key, value) text = text.replace("", progress_rows) - output_path.write_text(text) + output_path.write_text(text, encoding="utf-8") write_json({"ok": True, "path": str(output_path), "createdAt": now}) return 0 diff --git a/tests/test_replacement_unicode.py b/tests/test_replacement_unicode.py index 59827908..6c71615a 100644 --- a/tests/test_replacement_unicode.py +++ b/tests/test_replacement_unicode.py @@ -143,6 +143,9 @@ def test_agent_config_special_chars_do_not_break_resub(self) -> None: # not raise re.error on the "\g<0>" backslash content. text = self._build_state(config).read_text(encoding="utf-8") self.assertIn("opus — żółw", text) + # JSON doubles the backslash, so the literal block carries "agent\\g<0>"; + # asserting it proves the value survived the splice, not just that it ran. + self.assertIn(r"agent\\g<0>", text) self.assertNotIn(r"\u", text)