Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions .agents/skills/watchlist-md/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,12 @@ Use this item shape:
- next_step_on_fail:
```

For open items, populate: ID, status, priority, owner, due_at, created_at,
source, trigger, action, and done_when. Use safe source pointers only; never store
signed, tokenized, private, or credential-bearing links. Keep last_checked_at and
result blank until checked. Use `assistant_on_review` for explicit-review help;
treat legacy `owner: agent` as `assistant_on_review`.
For open items, keep field keys and enum values in English; populate: ID,
status, priority, owner, due_at, created_at, source, trigger, action, and
done_when. Localize only titles and free-text values. Use safe pointers; never
store signed, tokenized, private, or credential-bearing links. Keep
last_checked_at and result blank until checked. Use `assistant_on_review` for
explicit-review help; treat legacy `owner: agent` as `assistant_on_review`.

Generate IDs from the WATCHLIST timezone: WATCHLIST.md `timezone:` field >
explicit user timezone > environment/user timezone > Asia/Seoul. Re-read
Expand Down
65 changes: 65 additions & 0 deletions evals/cases/localized-schema-tokens-kr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"id": "localized-schema-tokens-kr",
"prompt": "한국어로 작성하되 WATCHLIST.md에 추가해줘. 오늘 17:00에 CI 확인.",
"locale": "ko",
"fixed_now": "2026-05-14T16:30:00+09:00",
"fixture": "empty.watchlist.md",
"should_trigger_skill": true,
"expected": {
"operation": "add_item",
"status": "open",
"due_at": "2026-05-14T17:00:00+09:00",
"scheduler": "none",
"required_fields": [
"source",
"trigger",
"action",
"done_when"
],
"forbidden_response_substrings": [
"I'll remind you",
"I will remind you",
"I'll check later",
"I will check later",
"자동으로 알려드릴게요",
"제가 나중에 확인할게요"
],
"schema_tokens": {
"must_use_field_keys": [
"schema_version",
"automation",
"timezone",
"status",
"priority",
"owner",
"due_at",
"created_at",
"source",
"trigger",
"action",
"done_when",
"last_checked_at",
"result",
"next_step_on_fail"
],
"must_use_enum_values": [
"open",
"P1",
"assistant_on_review"
],
"must_not_use_localized_schema_tokens": [
"상태",
"우선순위",
"담당자",
"기한",
"생성일",
"출처",
"계기",
"작업",
"완료조건",
"열림",
"어시스턴트"
]
}
}
}
2 changes: 1 addition & 1 deletion evals/check_policy_markers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"pending result for later review",
"safe link",
"Scope pre-authorized watchlist recording",
"For open items, populate",
"keep field keys and enum values in English",
"confirm ID, due_at",
"WATCHLIST.md `timezone:` field",
"environment/user timezone",
Expand Down
89 changes: 88 additions & 1 deletion evals/check_semantic_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def validate_add_item(
case_id: str,
expected: dict[str, object],
errors: list[str],
locale: object = None,
) -> None:
require_keys(
expected,
Expand Down Expand Up @@ -248,6 +249,92 @@ def validate_add_item(
f"{forbidden_operation}"
)

validate_schema_tokens(case_id, locale, expected, errors)


def validate_schema_tokens(
    case_id: str,
    locale: object,
    expected: dict[str, object],
    errors: list[str],
) -> None:
    """Validate the optional ``schema_tokens`` section of an eval case.

    Nothing is checked when ``expected`` has no ``schema_tokens`` entry.
    Otherwise the entry must be a dict carrying three string lists, and:

    * ``must_use_field_keys`` has to cover every canonical schema field key;
    * ``must_use_enum_values`` has to include ``open``, ``P1`` and
      ``assistant_on_review``;
    * for the ``"ko"`` locale only, ``must_not_use_localized_schema_tokens``
      has to include the core Korean translations that must never replace
      the English schema tokens.

    Problems are reported by appending messages to ``errors``; the function
    itself returns nothing.
    """
    section = "expected.schema_tokens"

    tokens = expected.get("schema_tokens")
    if tokens is None:
        return
    if not isinstance(tokens, dict):
        errors.append(f"{case_id}: expected.schema_tokens must be an object")
        return

    require_keys(
        tokens,
        {
            "must_use_field_keys",
            "must_use_enum_values",
            "must_not_use_localized_schema_tokens",
        },
        case_id,
        errors,
        section,
    )

    # Helper calls stay in this order so any errors they append keep the
    # same ordering in the report.
    declared_keys = require_string_list(
        tokens, "must_use_field_keys", case_id, errors, section
    )
    declared_enums = require_string_list(
        tokens, "must_use_enum_values", case_id, errors, section
    )
    banned_tokens = require_string_list(
        tokens, "must_not_use_localized_schema_tokens", case_id, errors, section
    )

    canonical_keys = {
        "schema_version",
        "automation",
        "timezone",
        "status",
        "priority",
        "owner",
        "due_at",
        "created_at",
        "source",
        "trigger",
        "action",
        "done_when",
        "last_checked_at",
        "result",
        "next_step_on_fail",
    }
    # NOTE(review): the set difference assumes require_string_list returns a
    # set-like value -- confirm against the helper's definition.
    absent_keys = sorted(canonical_keys - declared_keys)
    if absent_keys:
        joined = ", ".join(absent_keys)
        errors.append(
            f"{case_id}: schema_tokens.must_use_field_keys missing {joined}"
        )

    canonical_enums = {"open", "P1", "assistant_on_review"}
    if not canonical_enums.issubset(declared_enums):
        errors.append(
            f"{case_id}: schema_tokens.must_use_enum_values must include "
            "open, P1, and assistant_on_review"
        )

    # The localized-token requirement is written for Korean cases only;
    # every other locale skips it.
    if locale == "ko":
        korean_core = {"상태", "우선순위", "담당자", "기한", "열림"}
        if not korean_core.issubset(banned_tokens):
            errors.append(
                f"{case_id}: schema_tokens.must_not_use_localized_schema_tokens "
                "must include 상태, 우선순위, 담당자, 기한, and 열림"
            )
Comment on lines +332 to +336
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion for localized schema tokens is hardcoded to Korean words (상태, 우선순위, etc.). If non-Korean localized test cases (e.g., -jp or -es) are added in the future, this validator will fail.

We should condition this check so it only runs for Korean cases (e.g., by checking if "-kr" is in the case_id).

Suggested change
if not {"상태", "우선순위", "담당자", "기한", "열림"}.issubset(localized_tokens):
errors.append(
f"{case_id}: schema_tokens.must_not_use_localized_schema_tokens "
"must include 상태, 우선순위, 담당자, 기한, and 열림"
)
if "-kr" in case_id and not {"상태", "우선순위", "담당자", "기한", "열림"}.issubset(localized_tokens):
errors.append(
f"{case_id}: schema_tokens.must_not_use_localized_schema_tokens "
"must include 상태, 우선순위, 담당자, 기한, and 열림"
)



def validate_storage_contract(
case_id: str,
Expand Down Expand Up @@ -537,7 +624,7 @@ def validate_case(
return

if operation == "add_item":
validate_add_item(case_id, expected, errors)
validate_add_item(case_id, expected, errors, case.get("locale"))
elif operation == "archive_items":
validate_archive_items(case_id, expected, errors)
elif operation == "complete_item":
Expand Down
1 change: 1 addition & 0 deletions evals/prompts.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
id,should_trigger,prompt,expected
add-kr-01,true,"WATCHLIST.md에 추가해줘. 오늘 17:00에 GitHub Actions 결과 확인.","creates one open item; due_at is ISO-8601 +09:00 when current time is available; scheduler status says none"
localized-schema-tokens-kr,true,"한국어로 작성하되 WATCHLIST.md에 추가해줘. 오늘 17:00에 CI 확인.","creates one open item using English schema field keys and enum values while allowing Korean title and free-text values"
add-kr-02,true,"배포가 방금 시작됐어. 30분 뒤에 에러 로그 확인해야 해.","creates a deferred check with concrete due_at when current time is available, otherwise uses unscheduled and records ambiguity"
review-kr-01,true,"오늘 확인할 WATCHLIST.md 보여줘.","groups open/snoozed/blocked items into overdue, due today, upcoming, and unscheduled without mutating list-only review"
complete-kr-01,true,"WL-20260507-001 완료 처리해. CI 모두 pass 했어.","sets status done; fills last_checked_at and result; moves the item under ## Done by default when that section exists; does not delete the item"
Expand Down
43 changes: 43 additions & 0 deletions evals/self_checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,49 @@ cases:
- trigger
- action
- done_when
- id: localized-schema-tokens-kr
prompt: "한국어로 작성하되 WATCHLIST.md에 추가해줘. 오늘 17:00에 CI 확인."
expected:
status: open
due_at: "2026-05-14T17:00:00+09:00"
required_fields:
- source
- trigger
- action
- done_when
schema_tokens:
must_use_field_keys:
- schema_version
- automation
- timezone
- status
- priority
- owner
- due_at
- created_at
- source
- trigger
- action
- done_when
- last_checked_at
- result
- next_step_on_fail
must_use_enum_values:
- open
- P1
- assistant_on_review
must_not_use_localized_schema_tokens:
- 상태
- 우선순위
- 담당자
- 기한
- 생성일
- 출처
- 계기
- 작업
- 완료조건
- 열림
- 어시스턴트
- id: add-kr-02
prompt: "배포가 방금 시작됐어. 30분 뒤에 에러 로그 확인해야 해."
expected:
Expand Down
6 changes: 4 additions & 2 deletions evals/test_check_watchlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,8 +593,9 @@ def test_skill_runtime_polish_markers_stay_precise(self):
"> explicit user timezone > environment/user timezone > Asia/Seoul."
)
required_information = (
"For open items, populate: ID, status, priority, owner, due_at, "
"created_at, source, trigger, action, and done_when."
"For open items, keep field keys and enum values in English; "
"populate: ID, status, priority, owner, due_at, created_at, source, "
"trigger, action, and done_when."
)

self.assertIn("pending result for later review", text)
Expand All @@ -610,6 +611,7 @@ def test_skill_runtime_polish_markers_stay_precise(self):
self.assertIn("due_at", text)
self.assertNotIn("due time", text)
self.assertIn(required_information, " ".join(text.split()))
self.assertIn("Localize only titles and free-text values", " ".join(text.split()))
self.assertNotIn("done condition", text)
self.assertIn("confirm ID, due_at, action, done_when, and scheduler status", " ".join(text.split()))
self.assertIn("watchlist timezone", normalized_text)
Expand Down
Loading