diff --git a/.agents/skills/watchlist-md/SKILL.md b/.agents/skills/watchlist-md/SKILL.md index ea0ae49..bfb4351 100644 --- a/.agents/skills/watchlist-md/SKILL.md +++ b/.agents/skills/watchlist-md/SKILL.md @@ -77,11 +77,12 @@ Use this item shape: - next_step_on_fail: ``` -For open items, populate: ID, status, priority, owner, due_at, created_at, -source, trigger, action, and done_when. Use safe source pointers only; never store -signed, tokenized, private, or credential-bearing links. Keep last_checked_at and -result blank until checked. Use `assistant_on_review` for explicit-review help; -treat legacy `owner: agent` as `assistant_on_review`. +For open items, keep field keys and enum values in English; populate: ID, +status, priority, owner, due_at, created_at, source, trigger, action, and +done_when. Localize only titles and free-text values. Use safe pointers; never +store signed, tokenized, private, or credential-bearing links. Keep +last_checked_at and result blank until checked. Use `assistant_on_review` for +explicit-review help; treat legacy `owner: agent` as `assistant_on_review`. Generate IDs from the WATCHLIST timezone: WATCHLIST.md `timezone:` field > explicit user timezone > environment/user timezone > Asia/Seoul. Re-read diff --git a/evals/cases/localized-schema-tokens-kr.json b/evals/cases/localized-schema-tokens-kr.json new file mode 100644 index 0000000..1363138 --- /dev/null +++ b/evals/cases/localized-schema-tokens-kr.json @@ -0,0 +1,65 @@ +{ + "id": "localized-schema-tokens-kr", + "prompt": "한국어로 작성하되 WATCHLIST.md에 추가해줘. 
def validate_schema_tokens(
    case_id: str,
    locale: object,
    expected: dict[str, object],
    errors: list[str],
) -> None:
    """Validate the optional ``expected.schema_tokens`` object of a case.

    Nothing is checked when ``schema_tokens`` is absent. When present it must
    be an object carrying three string lists:

    * ``must_use_field_keys`` - has to contain every English schema field key.
    * ``must_use_enum_values`` - has to contain ``open``, ``P1`` and
      ``assistant_on_review``.
    * ``must_not_use_localized_schema_tokens`` - for Korean cases only, has to
      contain the core Korean translations that must never replace a schema
      token.

    Args:
        case_id: Case identifier used as the prefix of every error message.
        locale: The case's ``locale`` value. Only Korean enables the
            localized-token check; any primary-language spelling of Korean
            (``ko``, ``ko-KR``, ``ko_KR``, ``KO``) counts.
        expected: The case's ``expected`` object.
        errors: Accumulator that problems are appended to; nothing is raised.
    """
    schema_tokens = expected.get("schema_tokens")
    if schema_tokens is None:
        return
    if not isinstance(schema_tokens, dict):
        errors.append(f"{case_id}: expected.schema_tokens must be an object")
        return

    context = "expected.schema_tokens"
    require_keys(
        schema_tokens,
        {
            "must_use_field_keys",
            "must_use_enum_values",
            "must_not_use_localized_schema_tokens",
        },
        case_id,
        errors,
        context,
    )

    # The helper's return type is not visible from here. Coerce every result
    # to a set so the membership checks below behave the same for a list, a
    # set, or an empty/None result, instead of raising TypeError on
    # ``set - list``.
    field_keys = set(
        require_string_list(
            schema_tokens, "must_use_field_keys", case_id, errors, context
        )
        or ()
    )
    enum_values = set(
        require_string_list(
            schema_tokens, "must_use_enum_values", case_id, errors, context
        )
        or ()
    )
    localized_tokens = set(
        require_string_list(
            schema_tokens,
            "must_not_use_localized_schema_tokens",
            case_id,
            errors,
            context,
        )
        or ()
    )

    required_field_keys = {
        "schema_version",
        "automation",
        "timezone",
        "status",
        "priority",
        "owner",
        "due_at",
        "created_at",
        "source",
        "trigger",
        "action",
        "done_when",
        "last_checked_at",
        "result",
        "next_step_on_fail",
    }
    missing_field_keys = sorted(required_field_keys - field_keys)
    if missing_field_keys:
        errors.append(
            f"{case_id}: schema_tokens.must_use_field_keys missing "
            f"{', '.join(missing_field_keys)}"
        )

    if not {"open", "P1", "assistant_on_review"} <= enum_values:
        errors.append(
            f"{case_id}: schema_tokens.must_use_enum_values must include "
            "open, P1, and assistant_on_review"
        )

    # Compare only the primary language subtag so region-qualified or
    # differently-cased Korean locales are not silently exempted.
    language = ""
    if isinstance(locale, str):
        language = locale.replace("_", "-").split("-", 1)[0].strip().lower()
    if language != "ko":
        return

    if not {"상태", "우선순위", "담당자", "기한", "열림"} <= localized_tokens:
        errors.append(
            f"{case_id}: schema_tokens.must_not_use_localized_schema_tokens "
            "must include 상태, 우선순위, 담당자, 기한, and 열림"
        )
errors, case.get("locale")) elif operation == "archive_items": validate_archive_items(case_id, expected, errors) elif operation == "complete_item": diff --git a/evals/prompts.csv b/evals/prompts.csv index 66cb240..b6dd7e0 100644 --- a/evals/prompts.csv +++ b/evals/prompts.csv @@ -1,5 +1,6 @@ id,should_trigger,prompt,expected add-kr-01,true,"WATCHLIST.md에 추가해줘. 오늘 17:00에 GitHub Actions 결과 확인.","creates one open item; due_at is ISO-8601 +09:00 when current time is available; scheduler status says none" +localized-schema-tokens-kr,true,"한국어로 작성하되 WATCHLIST.md에 추가해줘. 오늘 17:00에 CI 확인.","creates one open item using English schema field keys and enum values while allowing Korean title and free-text values" add-kr-02,true,"배포가 방금 시작됐어. 30분 뒤에 에러 로그 확인해야 해.","creates a deferred check with concrete due_at when current time is available, otherwise uses unscheduled and records ambiguity" review-kr-01,true,"오늘 확인할 WATCHLIST.md 보여줘.","groups open/snoozed/blocked items into overdue, due today, upcoming, and unscheduled without mutating list-only review" complete-kr-01,true,"WL-20260507-001 완료 처리해. CI 모두 pass 했어.","sets status done; fills last_checked_at and result; moves the item under ## Done by default when that section exists; does not delete the item" diff --git a/evals/self_checks.yaml b/evals/self_checks.yaml index b601008..2392c7a 100644 --- a/evals/self_checks.yaml +++ b/evals/self_checks.yaml @@ -20,6 +20,49 @@ cases: - trigger - action - done_when + - id: localized-schema-tokens-kr + prompt: "한국어로 작성하되 WATCHLIST.md에 추가해줘. 오늘 17:00에 CI 확인." 
+ expected: + status: open + due_at: "2026-05-14T17:00:00+09:00" + required_fields: + - source + - trigger + - action + - done_when + schema_tokens: + must_use_field_keys: + - schema_version + - automation + - timezone + - status + - priority + - owner + - due_at + - created_at + - source + - trigger + - action + - done_when + - last_checked_at + - result + - next_step_on_fail + must_use_enum_values: + - open + - P1 + - assistant_on_review + must_not_use_localized_schema_tokens: + - 상태 + - 우선순위 + - 담당자 + - 기한 + - 생성일 + - 출처 + - 계기 + - 작업 + - 완료조건 + - 열림 + - 어시스턴트 - id: add-kr-02 prompt: "배포가 방금 시작됐어. 30분 뒤에 에러 로그 확인해야 해." expected: diff --git a/evals/test_check_watchlist.py b/evals/test_check_watchlist.py index 5443c13..ba826b8 100644 --- a/evals/test_check_watchlist.py +++ b/evals/test_check_watchlist.py @@ -593,8 +593,9 @@ def test_skill_runtime_polish_markers_stay_precise(self): "> explicit user timezone > environment/user timezone > Asia/Seoul." ) required_information = ( - "For open items, populate: ID, status, priority, owner, due_at, " - "created_at, source, trigger, action, and done_when." + "For open items, keep field keys and enum values in English; " + "populate: ID, status, priority, owner, due_at, created_at, source, " + "trigger, action, and done_when." ) self.assertIn("pending result for later review", text) @@ -610,6 +611,7 @@ def test_skill_runtime_polish_markers_stay_precise(self): self.assertIn("due_at", text) self.assertNotIn("due time", text) self.assertIn(required_information, " ".join(text.split())) + self.assertIn("Localize only titles and free-text values", " ".join(text.split())) self.assertNotIn("done condition", text) self.assertIn("confirm ID, due_at, action, done_when, and scheduler status", " ".join(text.split())) self.assertIn("watchlist timezone", normalized_text)