From 66926deede82831ad2d2a75e16309b1e885d247e Mon Sep 17 00:00:00 2001
From: Pawel-N-pl <ashi.one@gmail.com>
Date: Sun, 7 Jun 2026 10:02:03 +0200
Subject: [PATCH 1/2] fix(build-state-doc): write frontmatter scalars as UTF-8
 so non-ASCII round-trips
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The lambda repls (#18) stopped build-state-doc crashing on non-ASCII, but the
values are still emitted with json.dumps' default ensure_ascii=True, i.e. as
"\uXXXX" escapes. The frontmatter reader (unquote_scalar) only strips the
surrounding quotes — it does not JSON-decode — so those escapes survive a
write->parse cycle as the literal text "\uXXXX". On a non-English project
(epicName / customInstructions / aiCommand with an em-dash or Polish letters)
every such scalar reads back corrupted, e.g. aiCommand "claude — go" parses as
'claude — go'.

Fix: route value rendering through a small _yaml_value() helper that dumps with
ensure_ascii=False, so the document carries real UTF-8 that quote-stripping
recovers intact. Also convert the remaining agentConfig block re.sub from a
raw-string repl to a callable one — non-ASCII in an agent/model id is unlikely,
but a backslash (a "\g"- or "\b"-looking token) is realistic and would raise
re.error: bad escape or corrupt into a backspace, the same hazard the scalar
sites already guard against.

Tests: extend test_replacement_unicode with frontmatter round-trip assertions
(the existing tests assertIn on the whole document, which passes via the body
{{token}} even when the frontmatter line is escaped) and an agentConfig case
covering non-ASCII + backslash content.
---
 .../src/story_automator/commands/state.py     | 35 ++++++++++++-------
 tests/test_replacement_unicode.py             | 34 ++++++++++++++++++
 2 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/skills/bmad-story-automator/src/story_automator/commands/state.py b/skills/bmad-story-automator/src/story_automator/commands/state.py
index 38990141..e9aeea16 100644
--- a/skills/bmad-story-automator/src/story_automator/commands/state.py
+++ b/skills/bmad-story-automator/src/story_automator/commands/state.py
@@ -11,6 +11,14 @@
 from ..core.utils import count_matches, ensure_dir, file_exists, get_project_root, now_utc, now_utc_z, read_text, write_json
 
 
+def _yaml_value(value: Any) -> str:
+    # Emit non-ASCII as raw UTF-8, not \uXXXX escapes. These values are read back
+    # with unquote_scalar (frontmatter.py), which strips the surrounding quotes
+    # WITHOUT JSON-decoding, so an ensure_ascii escape would round-trip as the
+    # literal text "\uXXXX" and corrupt the value.
+    return json.dumps(value, ensure_ascii=False)
+
+
 def cmd_build_state_doc(args: list[str]) -> int:
     template = ""
     output_folder = ""
@@ -76,7 +84,7 @@ def cmd_build_state_doc(args: list[str]) -> int:
         f"  maxParallel: {int(overrides.get('maxParallel', 1) or 1)}\n",
         text,
     )
-    custom_instructions = json.dumps(config.get("customInstructions", ""))
+    custom_instructions = _yaml_value(config.get("customInstructions", ""))
     text = re.sub(r"(?m)^customInstructions:.*$", lambda m: f"customInstructions: {custom_instructions}", text)
     agent_config = config.get("agentConfig")
     if isinstance(agent_config, dict):
@@ -97,8 +105,8 @@ def cmd_build_state_doc(args: list[str]) -> int:
 
         lines = [
             "agentConfig:",
-            f"  defaultPrimary: {json.dumps(default_primary)}",
-            f"  defaultFallback: {json.dumps(default_fallback)}",
+            f"  defaultPrimary: {_yaml_value(default_primary)}",
+            f"  defaultFallback: {_yaml_value(default_fallback)}",
         ]
         # Model serialization preserves three states so round-trips through
         # `_load_agent_config_from_state` + `resolve_agent` keep the same
@@ -113,7 +121,7 @@ def cmd_build_state_doc(args: list[str]) -> int:
         # motivated this — without preserving the explicit clear, retro/dev
         # tasks silently re-inherited `defaultModel` after persistence.
         if "defaultModel" in agent_config:
-            lines.append(f"  defaultModel: {json.dumps(_model_or_none(agent_config.get('defaultModel')))}")
+            lines.append(f"  defaultModel: {_yaml_value(_model_or_none(agent_config.get('defaultModel')))}")
         if isinstance(per_task, dict) and per_task:
             lines.append("  perTask:")
             for task in sorted(per_task):
@@ -122,12 +130,12 @@ def cmd_build_state_doc(args: list[str]) -> int:
                     continue
                 lines.append(f"    {task}:")
                 if "primary" in entry:
-                    lines.append(f"      primary: {json.dumps(entry['primary'])}")
+                    lines.append(f"      primary: {_yaml_value(entry['primary'])}")
                 if "fallback" in entry:
                     value = entry["fallback"]
-                    lines.append(f"      fallback: {'false' if value is False else json.dumps(value)}")
+                    lines.append(f"      fallback: {'false' if value is False else _yaml_value(value)}")
                 if "model" in entry:
-                    lines.append(f"      model: {json.dumps(_model_or_none(entry.get('model')))}")
+                    lines.append(f"      model: {_yaml_value(_model_or_none(entry.get('model')))}")
         complexity_overrides = agent_config.get("complexityOverrides", {})
         if isinstance(complexity_overrides, dict) and complexity_overrides:
             lines.append("  complexityOverrides:")
@@ -142,16 +150,19 @@ def cmd_build_state_doc(args: list[str]) -> int:
                         continue
                     lines.append(f"      {task}:")
                     if "primary" in entry:
-                        lines.append(f"        primary: {json.dumps(entry['primary'])}")
+                        lines.append(f"        primary: {_yaml_value(entry['primary'])}")
                     if "fallback" in entry:
                         value = entry["fallback"]
-                        lines.append(f"        fallback: {'false' if value is False else json.dumps(value)}")
+                        lines.append(f"        fallback: {'false' if value is False else _yaml_value(value)}")
                     if "model" in entry:
-                        lines.append(f"        model: {json.dumps(_model_or_none(entry.get('model')))}")
+                        lines.append(f"        model: {_yaml_value(_model_or_none(entry.get('model')))}")
         block = "\n".join(lines) + "\n"
-        text = re.sub(r"(?m)^agentConfig:\n(?:(?:\s{2}.*\n)*)", block, text)
+        # Callable repl: the block is inserted verbatim, never parsed for \g / \1
+        # backrefs, so a "\uXXXX" or backslash in an agent/model name can't raise
+        # re.error: bad escape (the same hazard fixed below for scalar values).
+        text = re.sub(r"(?m)^agentConfig:\n(?:(?:\s{2}.*\n)*)", lambda m: block, text)
     for key, value in replacements.items():
-        text = re.sub(rf"(?m)^{re.escape(key)}:.*$", lambda m, k=key, v=value: f"{k}: {json.dumps(v)}", text)
+        text = re.sub(rf"(?m)^{re.escape(key)}:.*$", lambda m, k=key, v=value: f"{k}: {_yaml_value(v)}", text)
     story_range = [item for item in config.get("storyRange", []) if isinstance(item, str)]
     progress_rows = "\n".join(f"| {story_id} | ⏳ | ⏳ | ⏳ | ⏳ | ⏳ | pending |" for story_id in story_range)
     body = {
diff --git a/tests/test_replacement_unicode.py b/tests/test_replacement_unicode.py
index e5286dbd..59827908 100644
--- a/tests/test_replacement_unicode.py
+++ b/tests/test_replacement_unicode.py
@@ -19,6 +19,7 @@
 
 from story_automator.commands.orchestrator import cmd_orchestrator_helper
 from story_automator.commands.state import cmd_build_state_doc
+from story_automator.core.frontmatter import parse_simple_frontmatter
 
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
@@ -111,6 +112,39 @@ def test_replacement_value_with_backslash_in_string(self) -> None:
         text = state_file.read_text(encoding="utf-8")
         self.assertIn(r"\to", text)
 
+    def test_frontmatter_scalars_round_trip_non_ascii(self) -> None:
+        # Regression: scalars must survive a write->parse cycle. The frontmatter
+        # reader (unquote_scalar) strips quotes without JSON-decoding, so values
+        # have to be written as raw UTF-8 — an ensure_ascii "\uXXXX" escape would
+        # read back as literal "\uXXXX" text. assertIn on the whole document does
+        # NOT catch this because the body {{token}} carries the real char.
+        config = self._default_config()
+        config["epicName"] = "upmon — Epic 3 obsłuż"
+        config["aiCommand"] = "claude — go"
+        config["customInstructions"] = "run probe — ciężki ż"
+        fields = parse_simple_frontmatter(self._build_state(config).read_text(encoding="utf-8"))
+        self.assertEqual(fields.get("epicName"), "upmon — Epic 3 obsłuż")
+        self.assertEqual(fields.get("aiCommand"), "claude — go")
+        self.assertEqual(fields.get("customInstructions"), "run probe — ciężki ż")
+
+    def test_agent_config_special_chars_do_not_break_resub(self) -> None:
+        # The agentConfig block is spliced in with re.sub too. A non-ASCII model
+        # id is unlikely, but a backslash in a model/agent string is realistic
+        # (e.g. a "\g"- or "\b"-looking token) — as a raw repl it would raise
+        # re.error: bad escape or corrupt "\b" into a backspace. Both must
+        # survive verbatim and never become a "\uXXXX" escape.
+        config = self._default_config()
+        config["agentConfig"] = {
+            "defaultPrimary": "claude",
+            "defaultFallback": "codex",
+            "perTask": {"dev": {"primary": r"agent\g<0>", "model": "opus — żółw"}},
+        }
+        # _build_state returning a path at all proves the agentConfig re.sub did
+        # not raise re.error on the "\g<0>" backslash content.
+        text = self._build_state(config).read_text(encoding="utf-8")
+        self.assertIn("opus — żółw", text)
+        self.assertNotIn(r"\u", text)
+
 
 class StateUpdateUnicodeTests(_FixtureMixin, unittest.TestCase):
     def test_state_update_with_unicode_value(self) -> None:

From 605554fd969ac070b2706084c8be934403b9366e Mon Sep 17 00:00:00 2001
From: Pawel-N-pl <ashi.one@gmail.com>
Date: Sun, 7 Jun 2026 10:11:58 +0200
Subject: [PATCH 2/2] fix(build-state-doc): force UTF-8 when writing the state
 doc + pin backslash test

Addresses review feedback on the non-ASCII serialization change:

- output_path.write_text(text) had no explicit encoding. Now that the document
  carries raw UTF-8 (not \uXXXX escapes), the bare write_text would use the
  platform default locale and could raise UnicodeEncodeError / mojibake on a
  non-UTF-8 locale (Windows cp1252, POSIX C). Pass encoding="utf-8", matching
  the repo's read_text helper. (Augment medium / CodeRabbit major.)
- Strengthen the agentConfig regression test: assert the backslash value is
  emitted verbatim ("agent\\g<0>", JSON-doubled) so the test pins the value
  survives the splice, not merely that re.sub didn't raise. (CodeRabbit nitpick.)
---
 .../bmad-story-automator/src/story_automator/commands/state.py | 2 +-
 tests/test_replacement_unicode.py                              | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/skills/bmad-story-automator/src/story_automator/commands/state.py b/skills/bmad-story-automator/src/story_automator/commands/state.py
index e9aeea16..af0baaf2 100644
--- a/skills/bmad-story-automator/src/story_automator/commands/state.py
+++ b/skills/bmad-story-automator/src/story_automator/commands/state.py
@@ -177,7 +177,7 @@ def cmd_build_state_doc(args: list[str]) -> int:
     for key, value in body.items():
         text = text.replace(key, value)
     text = text.replace("<!-- Progress rows will be appended here -->", progress_rows)
-    output_path.write_text(text)
+    output_path.write_text(text, encoding="utf-8")
     write_json({"ok": True, "path": str(output_path), "createdAt": now})
     return 0
 
diff --git a/tests/test_replacement_unicode.py b/tests/test_replacement_unicode.py
index 59827908..6c71615a 100644
--- a/tests/test_replacement_unicode.py
+++ b/tests/test_replacement_unicode.py
@@ -143,6 +143,9 @@ def test_agent_config_special_chars_do_not_break_resub(self) -> None:
         # not raise re.error on the "\g<0>" backslash content.
         text = self._build_state(config).read_text(encoding="utf-8")
         self.assertIn("opus — żółw", text)
+        # JSON doubles the backslash, so the literal block carries "agent\\g<0>";
+        # asserting it proves the value survived the splice, not just that it ran.
+        self.assertIn(r"agent\\g<0>", text)
         self.assertNotIn(r"\u", text)