From e6de4ad327dafe727e139835bd05535d81249c65 Mon Sep 17 00:00:00 2001 From: Alexandre Balmes Date: Thu, 4 Jun 2026 00:51:33 +0200 Subject: [PATCH] feat(codex): implement F103 NDJSON output parity for Codex provider - `.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal`: Add ZPM PR tracking journal - `.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl`: Add ZPM PR knowledge base schema - `CHANGELOG.md`: Document F103 presence-aware output extraction and Response population - `README.md`: Update output formatting feature description to include Response auto-population - `docs/reference/interpolation.md`: Expand Response section with agent step examples and JSON/Response comparison - `docs/user-guide/agent-steps.md`: Update Output/Response field descriptions and output_format table for F103 - `internal/infrastructure/agents/codex_provider.go`: Add extractCodexAssistantText/extractCodexTextContent helpers; wire extractTextContent hook; implement presence-aware Output overwrite and Response population in Execute and ExecuteConversation - `internal/infrastructure/agents/codex_provider_delegation_test.go`: Remove stub-era conditionals; add OutputExtraction, HookWired, PlainTextFallback, NDJSONOutput delegation tests - `internal/infrastructure/agents/codex_provider_test_helpers_test.go`: Add ndjsonLine test helper - `internal/infrastructure/agents/codex_provider_unit_test.go`: Expand unit test coverage for F103 extraction paths; remove duplicated ProviderName/TimestampOrdering tests - `internal/infrastructure/agents/helpers.go`: Remove stale comment from appendCodexOptions Closes #363 --- .../journal.wal | 166 +++++ .../knowledge.pl | 62 ++ CHANGELOG.md | 4 + README.md | 2 +- docs/reference/interpolation.md | 77 ++- docs/user-guide/agent-steps.md | 32 +- .../infrastructure/agents/codex_provider.go | 145 ++++- .../agents/codex_provider_delegation_test.go | 177 ++++-- .../agents/codex_provider_exec_test.go | 7 +- ...odex_provider_parse_display_events_test.go | 44 ++ .../agents/codex_provider_resume_test.go | 37 +- .../agents/codex_provider_session_test.go | 12 +- .../codex_provider_test_helpers_test.go | 15 + .../agents/codex_provider_unit_test.go | 583 +++++++++++++++--- internal/infrastructure/agents/helpers.go | 4 +- 15 files changed, 1142 insertions(+), 225 deletions(-) create mode 100644 .zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal create mode 100644 .zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl create mode 100644 internal/infrastructure/agents/codex_provider_test_helpers_test.go diff --git a/.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal b/.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal new file mode 100644 index 00000000..9be492db --- /dev/null +++ b/.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal @@ -0,0 +1,166 @@ +{"ts":1780517469,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:todo(_, _, _, _)"} +{"ts":1780517470,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub(_, _, _)"} +{"ts":1780517470,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock(_, _, _)"} +{"ts":1780517470,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:not_impl(_, _, _)"} +{"ts":1780517470,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file(_, _)"} +{"ts":1780517470,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file('.zpm/mounts.json', changed)"} +{"ts":1780519586,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:todo(_, _, _, _)"} +{"ts":1780519586,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub(_, _, _)"} +{"ts":1780519586,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock(_, _, _)"} +{"ts":1780519586,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:not_impl(_, _, _)"} +{"ts":1780519586,"op":"retractall","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file(_, _)"} +{"ts":1780519586,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file('.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal', changed)"} +{"ts":1780519587,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_1_2', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal', 'unknown')"} +{"ts":1780519587,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_1_3', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal', 'unknown')"} +{"ts":1780519587,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_1_8', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal', 'unknown')"} +{"ts":1780519587,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_1_9', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/journal.wal', 'unknown')"} +{"ts":1780519587,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file('.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', changed)"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:todo('issue_2_8', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 8, 'found in changed code')"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_2_9', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_2_10', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_2_17', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_2_18', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_2_26', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519588,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_2_27', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_2_28', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_2_29', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_2_36', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:stub('issue_2_37', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_2_38', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_2_39', '.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl', 'unknown')"} +{"ts":1780519589,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file('.zpm/mounts.json', changed)"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file('internal/infrastructure/agents/codex_provider.go', changed)"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:pr_file('internal/infrastructure/agents/codex_provider_unit_test.go', test)"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_10', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_16', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_23', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_29', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'main')"} +{"ts":1780519590,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_36', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519591,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_43', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519591,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_49', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'complex')"} +{"ts":1780519591,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_55', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519591,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_62', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519591,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_63', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519591,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_64', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_87', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_94', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_101', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_108', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_115', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_122', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519592,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_129', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_130', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_131', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_136', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_169', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_170', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_171', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519593,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_179', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_295', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_296', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_297', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_342', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_343', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_344', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519594,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_359', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_364', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_369', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_374', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_379', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_386', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_387', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519595,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_388', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519596,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_446', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519596,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_447', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519596,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_448', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519596,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_449', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519596,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_463', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519596,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_468', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_473', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_478', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_483', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_490', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_491', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_492', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519597,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_504', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_505', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_506', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_518', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_519', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_520', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_535', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519598,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_542', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'main')"} +{"ts":1780519599,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_550', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519599,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_558', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'func')"} +{"ts":1780519599,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_565', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519599,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_566', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519599,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_567', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519599,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_584', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_585', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_586', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_615', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_616', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_617', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_630', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519600,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_631', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519601,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_632', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519601,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_653', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519601,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_654', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519601,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_655', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519601,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_674', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519601,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_675', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_676', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_758', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_759', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_760', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_806', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_807', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519602,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_808', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_824', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_829', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_834', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_841', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_842', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_843', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519603,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_890', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519604,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_891', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519604,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_892', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519604,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_898', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519604,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_939', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519604,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_946', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519604,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_953', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_960', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_967', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_974', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_975', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_976', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_981', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519605,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_991', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_992', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_993', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_998', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1010', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1011', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1012', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519606,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1018', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519607,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1163', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519607,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1164', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519607,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1165', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519607,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1179', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519607,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1180', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519607,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1181', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1193', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1194', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1195', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1206', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1207', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1208', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519608,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1209', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519609,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1216', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519609,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1223', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519609,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1224', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519609,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1225', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519609,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1237', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519609,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1238', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519610,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1239', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} +{"ts":1780519610,"op":"assert","clause":"pr_feature_f103_codex_provider_jsonl_output_parity:mock('issue_5_1246', 'internal/infrastructure/agents/codex_provider_unit_test.go', 'unknown')"} diff --git a/.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl b/.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl new file mode 100644 index 00000000..40504f08 --- /dev/null +++ b/.zpm/kb/pr_feature_f103_codex_provider_jsonl_output_parity/knowledge.pl @@ -0,0 +1,62 @@ +:- module(pr_feature_f103_codex_provider_jsonl_output_parity, []). +% ─── PR Tracking Schema ────────────────────────────────────────────────────── +% Memory segment: pr_ +% Lifecycle: created at implement start, gated before commit, archived on merge. +% +% Facts (asserted by scan scripts and LLM): +% pr_file(Path, ChangeType) — file in PR scope (changed | added | test) +% todo(Id, File, Line, Desc) — TODO/FIXME found in changed code +% stub(Id, File, Symbol) — stub/placeholder implementation +% mock(Id, File, Symbol) — mock that should be replaced with real impl +% not_impl(Id, File, Desc) — "not yet implemented" marker +% resolved(Type, Id) — marks a tracked issue as resolved +% +% Dynamic declarations (required by Trealla Prolog for runtime assertion). +:- dynamic(pr_file/2). +:- dynamic(todo/4). +:- dynamic(stub/3). +:- dynamic(mock/3). +:- dynamic(not_impl/3). +:- dynamic(resolved/2). + +% ─── Unresolved queries ───────────────────────────────────────────────────── +% Convenience predicates for querying unresolved issues by type. +unresolved_todo(Id, File, Line, Desc) :- + todo(Id, File, Line, Desc), \+ resolved(todo, Id). +unresolved_stub(Id, File, Symbol) :- + stub(Id, File, Symbol), \+ resolved(stub, Id). +unresolved_mock(Id, File, Symbol) :- + mock(Id, File, Symbol), \+ resolved(mock, Id). +unresolved_not_impl(Id, File, Desc) :- + not_impl(Id, File, Desc), \+ resolved(not_impl, Id). + +% A blocking issue is any tracked issue that has not been resolved. +blocking_issue(Id, todo, File, Desc) :- + todo(Id, File, _, Desc), \+ resolved(todo, Id). +blocking_issue(Id, stub, File, Symbol) :- + stub(Id, File, Symbol), \+ resolved(stub, Id). +blocking_issue(Id, mock, File, Symbol) :- + mock(Id, File, Symbol), \+ resolved(mock, Id). +blocking_issue(Id, not_impl, File, Desc) :- + not_impl(Id, File, Desc), \+ resolved(not_impl, Id). + +% PR is ready ONLY when zero blocking issues remain. +pr_ready :- \+ blocking_issue(_, _, _, _). + +% Health summary — counts by category. +pr_health(blocking, N) :- + findall(I, blocking_issue(I, _, _, _), L), length(L, N). +pr_health(resolved, N) :- + findall(I, resolved(_, I), L), length(L, N). +pr_health(files, N) :- + findall(F, pr_file(F, _), L), length(L, N). + +% Coverage gap: source file changed without corresponding test file. +coverage_gap(File) :- + pr_file(File, changed), + \+ pr_file(File, test), + \+ test_file(File, _). + +% List all blocking issues as Id-Type-File-Desc tuples. +all_blockers(Blockers) :- + findall(blocker(Id, Type, File, Desc), blocking_issue(Id, Type, File, Desc), Blockers). diff --git a/CHANGELOG.md b/CHANGELOG.md index 21c032cd..87ec8903 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- **F103**: Codex provider output parity — `state.Output` and `ConversationResult.Output` for the `codex` provider now contain clean aggregated assistant text instead of raw NDJSON, regardless of `output_format` value (`json`, `stream-json`, `text`, or absent), closing the gap left by B015 in 0.7.1 which only covered Claude, Gemini, and OpenCode. Aggregation is presence-aware: when the NDJSON stream contains ≥1 `assistant_message` event, the extracted text overwrites `Output` (empty `assistant_message` yields `Output == ""`); when no events are parsed (plain-text mocks, pre-result lifecycle events only), the base output is preserved. `state.Response` and `ConversationResult.Response` are now populated via `tryParseJSONResponse` on the aggregated assistant text when it is a valid JSON object — matching Gemini/OpenCode semantics. Estimated token recount runs on the extracted text rather than the raw NDJSON length. Conversation parity achieved through a Codex-targeted override in `ExecuteConversation` plus an `extractTextContent` hook wired into `cliProviderHooks`; base provider and other CLI providers (Claude, Gemini, OpenCode, GitHub Copilot) are bit-identical. NUL bytes, multi-event streams, empty assistant messages, and non-JSON plain text are all handled without panic or truncation. Workflow authors can now reference `{{.states.codex_step.Output}}` and `{{.states.codex_step.Response}}` with the same semantics as other providers, unblocking multi-step workflows that pipe Codex output downstream. + ## [0.10.0] - 2026-06-01 ### Changed diff --git a/README.md b/README.md index e9fbedca..c87b77a7 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A Go CLI tool for orchestrating AI agents (Claude, Gemini, Codex, GitHub Copilot - **State Machine Execution** - Define workflows as state machines with conditional transitions based on exit codes, command output, or custom expressions - **Inline Error Handling** - Specify error messages and exit codes directly on steps without creating separate terminal states - **Agent Steps** - Invoke AI agents via CLI tools (Claude, Codex, Gemini, GitHub Copilot) or direct HTTP (OpenAI, Ollama, vLLM, Groq) with prompt templates, response parsing, and accurate token tracking -- **Output Formatting for Agent Steps** - Automatically strip markdown code fences and validate JSON output; human-readable streaming display controlled by `output_format` field (text vs raw NDJSON); unified display-event abstraction across all 6 providers with optional verbose mode showing tool-use markers (`[tool: Name(Arg)]`) +- **Output Formatting for Agent Steps** - Automatically strip markdown code fences and validate JSON output; human-readable streaming display controlled by `output_format` field (text vs raw NDJSON); unified display-event abstraction across all 6 providers with optional verbose mode showing tool-use markers (`[tool: Name(Arg)]`); automatic `state.Response` population for structured JSON output (heuristic across all providers) - **Agent Skills** - Inject deterministic domain knowledge into agent steps via `skills:` declarations in workflow YAML; filesystem-based multi-directory discovery (project `.awf/skills/`, `.agents/skills/`, `.claude/skills/`, XDG global) with priority ordering; SKILL.md frontmatter stripping and agentskills.io-compliant `` structured wrapping with bundled resource enumeration; validated by `awf validate` - **Agent Roles** - Inject reusable personas into agent steps via `role:` field referencing AGENTS.md files; filesystem-based multi-directory discovery in a dedicated `roles/` namespace (project `.awf/roles/`, `.agents/roles/`, `$XDG_CONFIG_HOME/awf/roles/`, `~/.agents/roles/`) with priority ordering and `AWF_ROLES_PATH` exclusive override; AGENTS.md frontmatter stripping and system prompt injection with optional composition via inline `system_prompt` field; validated by `awf validate` - **External Prompt Files** - Load agent prompts from `.md` files with full template interpolation, helper functions, and local override support diff --git a/docs/reference/interpolation.md b/docs/reference/interpolation.md index a0d9b366..7c424366 100644 --- a/docs/reference/interpolation.md +++ b/docs/reference/interpolation.md @@ -159,9 +159,18 @@ transitions: **Note**: `TokensUsed` replaced deprecated `states.step_name.Tokens` field. If migrating from earlier versions, update workflow YAML expressions from `{{.states.step_name.Tokens}}` to `{{.states.step_name.TokensUsed}}`. -#### Response (Operation Outputs) +#### Response (Structured Outputs) -Operation steps (e.g., `github.get_issue`, `http.request`) return structured data accessible via `Response`: +Both agent steps and operation steps return structured data accessible via `Response` when the output is valid JSON. + +**Agent steps** automatically populate `Response` when the agent's output contains valid JSON (regardless of `output_format` setting): + +```yaml +{{.states.step_name.Response.field}} # Parsed field from agent JSON output +{{.states.step_name.Response}} # Full parsed JSON object +``` + +**Operation steps** (e.g., `github.get_issue`, `http.request`) always return structured data via `Response`: ```yaml {{.states.step_name.Response.title}} # Parsed field from operation result @@ -169,25 +178,25 @@ Operation steps (e.g., `github.get_issue`, `http.request`) return structured dat {{.states.step_name.Response.labels}} # Array field ``` -Use `Output` for raw JSON, `Response.field` for parsed fields: +Example with agent step returning JSON: ```yaml states: - initial: get_issue - - get_issue: - type: operation - operation: github.get_issue - inputs: - number: "{{.inputs.issue_number}}" - on_success: show_title + initial: analyze + + analyze: + type: agent + provider: claude + prompt: "Return JSON analysis with 'issues' and 'severity' fields" + on_success: process on_failure: error - show_title: + process: type: step - command: echo "Issue: {{.states.get_issue.Response.title}}" + command: | + echo "Severity: {{.states.analyze.Response.severity}}" + echo "Issues: {{.states.analyze.Response.issues}}" on_success: done - on_failure: error done: type: terminal @@ -196,6 +205,15 @@ states: status: failure ``` +If agent returns: +```json +{"issues": ["buffer overflow", "memory leak"], "severity": "high"} +``` + +Then: +- `{{.states.analyze.Response.severity}}` = `"high"` +- `{{.states.analyze.Response.issues}}` = `["buffer overflow", "memory leak"]` + **HTTP operation outputs** follow the same pattern: ```yaml @@ -205,7 +223,12 @@ states: {{.states.step_name.Response.body_truncated}} # true if body was truncated ``` -See [Workflow Syntax - Operation State](../user-guide/workflow-syntax.md#operation-state) for the full list of available operations and their output fields. +**Difference from `JSON`:** +- `Response` is populated automatically for all agent outputs if valid JSON is detected (heuristic) +- `JSON` is only populated when `output_format: json` is explicitly set on the agent step +- Use `Response.field` for automatic best-effort parsing; use `JSON.field` for explicit, validated JSON output + +See [Agent Steps - Output Formatting](../user-guide/agent-steps.md#output-formatting) for examples and [Workflow Syntax - Operation State](../user-guide/workflow-syntax.md#operation-state) for available operations. #### JSON (Explicit Output Formatting) @@ -217,9 +240,13 @@ When an agent step uses `output_format: json`, the parsed JSON is accessible via ``` **Key differences from `Response`:** -- `JSON` is only populated when `output_format: json` is explicitly set on the agent step -- `Response` is populated automatically for all agent outputs if valid JSON is detected (heuristic) -- `JSON` represents explicitly formatted output; `Response` is automatic best-effort parsing +- `JSON` is only populated when `output_format: json` is explicitly set on the agent step (and validation passes) +- `Response` is populated automatically for all agent outputs if valid JSON is detected (heuristic, regardless of `output_format`) +- Use `JSON.field` for strict, validated output; use `Response.field` for lenient, automatic parsing + +**When to use each:** +- **`JSON.field`**: You explicitly requested `output_format: json` and want strict validation +- **`Response.field`**: You want to access JSON from any agent output without requiring `output_format: json` Example with `output_format: json`: @@ -238,8 +265,9 @@ states: process: type: step command: | - echo "Severity: {{.states.analyze.JSON.severity}}" - echo "Issues: {{.states.analyze.JSON.issues}}" + # Both work — JSON is strict, Response is lenient + echo "Severity (JSON): {{.states.analyze.JSON.severity}}" + echo "Severity (Response): {{.states.analyze.Response.severity}}" on_success: done done: @@ -254,9 +282,14 @@ If agent returns: {"issues": ["buffer overflow", "memory leak"], "severity": "high"} ``` -Then: +Then both work identically: - `{{.states.analyze.JSON.severity}}` = `"high"` -- `{{.states.analyze.JSON.issues}}` = `["buffer overflow", "memory leak"]` +- `{{.states.analyze.Response.severity}}` = `"high"` + +**Difference becomes apparent when `output_format: json` is omitted:** +- Without `output_format: json`, JSON validation is skipped +- `JSON` remains empty (never populated) +- `Response` still populates if valid JSON is detected See [Agent Steps - Output Formatting](../user-guide/agent-steps.md#output-formatting) for detailed examples and best practices. diff --git a/docs/user-guide/agent-steps.md b/docs/user-guide/agent-steps.md index ccd68cad..1d08d3be 100644 --- a/docs/user-guide/agent-steps.md +++ b/docs/user-guide/agent-steps.md @@ -1132,8 +1132,8 @@ Agent responses are automatically captured in the execution state: | Field | Type | Description | |-------|------|-------------| -| `{{.states.step_name.Output}}` | string | Raw response text (or cleaned text if `output_format` is set) | -| `{{.states.step_name.Response}}` | object | Parsed JSON response (automatic heuristic) | +| `{{.states.step_name.Output}}` | string | Aggregated assistant text. For CLI providers emitting NDJSON (Claude, Codex, Gemini, OpenCode, GitHub Copilot), the raw stream is extracted to clean text regardless of `output_format`; for HTTP providers, the response body text. `output_format: json` additionally strips markdown code fences. | +| `{{.states.step_name.Response}}` | object | Parsed JSON response (automatic heuristic — populated when the assistant text is a valid JSON object) | | `{{.states.step_name.JSON}}` | object | Parsed JSON from `output_format: json` (explicit, see [Output Formatting](#output-formatting)) | | `{{.states.step_name.TokensUsed}}` | int | Total tokens consumed by this step | | `{{.states.step_name.TokensInput}}` | int | Input tokens (prompt + context). `0` in single-turn mode. | @@ -1165,10 +1165,11 @@ process_response: ## Output Formatting -The `output_format` field serves two purposes: +The `output_format` field serves three purposes: -1. **Post-processing**: Strips markdown code fences and optionally validates JSON (F065) -2. **Display filtering**: Controls how agent responses appear on terminal during streaming and buffered execution, with optional verbose tool-use markers (F082, F085) +1. **Text Extraction**: For CLI providers, extracts clean assistant text from streaming event logs (F103, F082) +2. **Post-processing**: Strips markdown code fences and optionally validates JSON (F065) +3. **Display filtering**: Controls how agent responses appear on terminal during streaming and buffered execution, with optional verbose tool-use markers (F082, F085) When an agent wraps its output in markdown code fences (common with many LLMs), use `output_format` to automatically strip the fences and optionally validate the content: @@ -1210,6 +1211,7 @@ process_results: - Strips outermost markdown code fences (e.g., ````json ... ``` ``) - Validates stripped content as valid JSON - Stores parsed JSON in `{{.states.step_name.JSON}}` +- Automatically populates `{{.states.step_name.Response}}` with parsed JSON (available across all providers) - If validation fails, step fails with a descriptive error - Works with both objects and arrays @@ -1225,6 +1227,8 @@ The analysis shows the following: - `{{.states.analyze.Output}}` = `{"issues": ["buffer overflow", "memory leak"], "severity": "high"}` - `{{.states.analyze.JSON.issues}}` = `["buffer overflow", "memory leak"]` - `{{.states.analyze.JSON.severity}}` = `"high"` +- `{{.states.analyze.Response.issues}}` = `["buffer overflow", "memory leak"]` (same as JSON) +- `{{.states.analyze.Response.severity}}` = `"high"` (same as JSON) #### `text` Format @@ -1248,6 +1252,7 @@ save_code: - Strips outermost markdown code fences (e.g., ````python ... ``` ``) - Returns clean text in `{{.states.step_name.Output}}` - Does not populate `{{.states.step_name.JSON}}` +- Automatically populates `{{.states.step_name.Response}}` if the output happens to be valid JSON (heuristic) **Example agent output:** ``` @@ -1262,6 +1267,8 @@ def fibonacci(n): **After processing:** - `{{.states.generate_code.Output}}` = `def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)` +- `{{.states.generate_code.JSON}}` = empty/not populated (no `output_format: json`) +- `{{.states.generate_code.Response}}` = populated only if output happens to be valid JSON #### No Format (Default) @@ -1279,10 +1286,15 @@ analyze: The `output_format` field controls how agent responses appear on the terminal (F082). Additionally, the `--verbose` flag displays tool-use activity markers (F085) — showing which tools the agent invoked — alongside agent output when running with `awf run --output streaming` or `--output buffered`: -| `output_format` | Streaming Display | Buffered Display | Raw Storage | +| `output_format` | Streaming Display | Buffered Display | `state.Output` Storage | |---|---|---|---| -| `text` (or omitted) | Human-readable filtered text | Filtered text in summary | Raw NDJSON | -| `json` | Raw NDJSON (unfiltered) | Raw NDJSON (unfiltered) | Raw NDJSON | +| `text` (or omitted) | Human-readable filtered text | Filtered text in summary | Extracted assistant text | +| `json` | Raw NDJSON (unfiltered) | Raw NDJSON (unfiltered) | Extracted assistant text (markdown code fences stripped) | + +**Output Parity Across All Providers (F103):** For CLI-based providers (Claude, Codex, Gemini, OpenCode), the system automatically extracts clean assistant text from NDJSON events and stores it in `state.Output`. Additionally: +- `state.Response` is automatically populated when the output is valid JSON (heuristic, regardless of `output_format`) +- Both `output_format: json` and omitted formats produce semantically equivalent output shapes across all providers +- Codex now has feature parity with Claude, Gemini, and OpenCode for output handling #### Streaming Mode (`--output streaming`) @@ -1347,10 +1359,10 @@ Silent mode suppresses all display regardless of `output_format`: ```bash awf run code-review --output silent # No output displayed (silent mode is absolute) -# state.Output still contains raw NDJSON for template interpolation +# state.Output still contains the aggregated assistant text for template interpolation ``` -**Note:** `state.Output` always contains the raw NDJSON regardless of display filtering. Filtering only affects terminal display, not data storage. +**Note:** `state.Output` is populated independently of display filtering. For CLI providers emitting NDJSON (Claude, Codex, Gemini, OpenCode, GitHub Copilot), the assistant text is extracted from the raw stream and stored — so `{{.states.step.Output}}` resolves to clean text regardless of `output_format`. Filtering only affects terminal display, not data storage. #### Provider Event Cadence diff --git a/internal/infrastructure/agents/codex_provider.go b/internal/infrastructure/agents/codex_provider.go index 2db39aa9..71512cb0 100644 --- a/internal/infrastructure/agents/codex_provider.go +++ b/internal/infrastructure/agents/codex_provider.go @@ -27,12 +27,7 @@ type CodexProvider struct { } func NewCodexProvider() *CodexProvider { - p := &CodexProvider{ - logger: logger.NopLogger{}, - executor: NewExecCLIExecutor(), - } - p.base = p.newBase() - return p + return NewCodexProviderWithOptions() } func NewCodexProviderWithOptions(opts ...CodexProviderOption) *CodexProvider { @@ -52,6 +47,7 @@ func (p *CodexProvider) newBase() *baseCLIProvider { buildExecuteArgs: p.buildExecuteArgs, buildConversationArgs: p.buildConversationArgs, extractSessionID: p.extractSessionID, + extractTextContent: p.extractCodexTextContent, validateOptions: validateCodexOptions, parseDisplayEvents: p.parseCodexDisplayEvents, extractTokenUsage: p.extractCodexTokenUsage, @@ -69,28 +65,63 @@ func (p *CodexProvider) Execute(ctx context.Context, prompt string, options map[ return nil, err } - // Codex CLI is always invoked with `exec --json` (NDJSON). For text intent, - // aggregate assistant message content for state.Output so downstream - // interpolation ({{states.step.Output}}) is human-readable (F082). - userFormat, _ := getStringOption(options, "output_format") - if userFormat != "json" && userFormat != "stream-json" { - if extracted := extractDisplayTextFromEvents(rawOutput, p.parseCodexDisplayEvents); extracted != "" { - result.Output = extracted - if result.TokensEstimated { - tokens, _ := p.base.tokenizer.CountTokens(extracted) //nolint:errcheck // ApproximationTokenizer never errors with a valid ratio - result.Tokens = tokens + // F103: presence-aware extraction — when the NDJSON carried an assistant_message event (even an + // empty one), overwrite Output with the aggregated assistant text and re-derive every dependent + // field (Response, token estimate) from it, regardless of output_format. Gating all of them on + // hadText mirrors the result.Output != "" guard in ExecuteConversation: an empty assistant_message + // must not leave a JSON Response or a token count estimated from the raw NDJSON envelope. + if extracted, hadText := p.extractCodexAssistantText(rawOutput); hadText { + result.Output = extracted + if extracted != "" { + // Array payloads ([…]) leave Response nil — tryParseJSONResponse matches HasPrefix("{") only. + if jsonResp := tryParseJSONResponse(extracted); jsonResp != nil { + result.Response = jsonResp } } + if result.TokensEstimated { + tokens, _ := p.base.tokenizer.CountTokens(extracted) //nolint:errcheck // ApproximationTokenizer never errors with a valid ratio + result.Tokens = tokens + } } return result, nil } func (p *CodexProvider) ExecuteConversation(ctx context.Context, state *workflow.ConversationState, prompt string, options map[string]any, stdout, stderr io.Writer) (*workflow.ConversationResult, error) { - result, _, err := p.base.executeConversation(ctx, state, prompt, options, stdout, stderr) + result, rawOutput, err := p.base.executeConversation(ctx, state, prompt, options, stdout, stderr) if err != nil { return nil, err } + if text, hadText := p.extractCodexAssistantText(rawOutput); hadText { + // F103: presence-aware overwrite. The base derived Output, the trailing assistant + // turn's Content, and the estimated token counts from the raw NDJSON envelope; once + // Output is replaced with the aggregated assistant text we must re-derive all three so + // they stay consistent (mirrors the token recount performed in Execute). Without this, + // an empty assistant_message leaves inflated TokensOutput/TokensTotal and a turn whose + // Content still holds the raw NDJSON, violating the turn.Content == Output invariant. + result.Output = text + if result.TokensEstimated { + tokens, _ := p.base.tokenizer.CountTokens(text) //nolint:errcheck // ApproximationTokenizer never errors with a valid ratio + result.TokensOutput = tokens + result.TokensTotal = result.TokensInput + tokens + } + if result.State != nil { + if n := len(result.State.Turns); n > 0 { + if last := &result.State.Turns[n-1]; last.Role == workflow.TurnRoleAssistant { + last.Content = text + if result.TokensEstimated { + last.Tokens = result.TokensOutput + } + } + } + } + } + if result.Output != "" { + // F103: array payloads ([…]) leave Response nil — tryParseJSONResponse matches HasPrefix("{") only. F107 will revisit. + if jsonResp := tryParseJSONResponse(result.Output); jsonResp != nil { + result.Response = jsonResp + } + } return result, nil } @@ -115,8 +146,11 @@ func (p *CodexProvider) buildExecuteArgs(prompt string, options map[string]any) func (p *CodexProvider) buildConversationArgs(state *workflow.ConversationState, prompt string, options map[string]any) ([]string, error) { var args []string if state.SessionID != "" { - // Resume an existing thread using the native resume subcommand. - args = []string{"resume", state.SessionID, "--json", prompt} + // Resume an existing thread headlessly. The top-level `codex resume` subcommand is the + // interactive TUI (session picker, no --json); headless JSONL resume lives under + // `codex exec resume [PROMPT] --json` (codex-cli >= 0.x). Using the TUI + // resume here makes the CLI reject --json with "unexpected argument '--json'". + args = []string{"exec", "resume", state.SessionID, "--json", prompt} } else { // Codex CLI has no --system-prompt flag; inline the system prompt into // the first-turn message only when a session is not yet established. @@ -127,7 +161,6 @@ func (p *CodexProvider) buildConversationArgs(state *workflow.ConversationState, return args, nil } -// appendCodexOptions appends Codex CLI flags from options; unknown keys are silently ignored. func appendCodexOptions(args []string, options map[string]any) []string { if model, ok := getStringOption(options, "model"); ok && model != "" { args = append(args, "--model", model) @@ -182,6 +215,48 @@ func (p *CodexProvider) extractCodexTokenUsage(rawOutput string) *tokenUsage { } } +// extractCodexTextContent adapts extractCodexAssistantText to the extractTextContent hook +// signature (func(string) string) required by cliProviderHooks. The hadText bool is deliberately +// discarded: the base provider only needs the aggregated text to build outputStr, and it already +// treats "" as "fall back to TrimSpace(rawOutput)". +// +// CONTRACT (do not "simplify" away the ExecuteConversation override): because this hook cannot +// surface hadText, an empty assistant_message (hadText=true, text="") is indistinguishable from +// non-NDJSON plain text (hadText=false) at the base layer — both yield "". CodexProvider.Execute +// and CodexProvider.ExecuteConversation therefore re-run extractCodexAssistantText to recover +// hadText and apply the presence-aware overwrite that corrects Output, the token counts, and the +// trailing turn Content for the empty-message case. This intentional re-walk of the NDJSON is the +// accepted cost of keeping the base hook signature uniform across all providers. +func (p *CodexProvider) extractCodexTextContent(output string) string { + text, _ := p.extractCodexAssistantText(output) + return text +} + +// extractCodexAssistantText aggregates assistant text from NDJSON output via +// p.parseCodexDisplayEvents, mirroring the join semantics of extractDisplayTextFromEvents. +// hadText is true iff at least one EventText was produced (even when its .Text is empty), +// enabling callers to distinguish "empty assistant_message" from "non-NDJSON plain text". +func (p *CodexProvider) extractCodexAssistantText(output string) (string, bool) { + var result strings.Builder + hadText := false + for line := range strings.SplitSeq(output, "\n") { + if line == "" { + continue + } + for _, evt := range p.parseCodexDisplayEvents([]byte(line)) { + if evt.Kind != EventText { + continue + } + hadText = true + if result.Len() > 0 { + result.WriteRune('\n') + } + result.WriteString(evt.Text) + } + } + return result.String(), hadText +} + func validateCodexOptions(options map[string]any) error { if options == nil { return nil @@ -247,17 +322,30 @@ func (p *CodexProvider) codexMCPInjector(_ context.Context, args []string, cfg * } // parseCodexDisplayEvents parses a single NDJSON line from Codex CLI output into -// DisplayEvents. It emits EventText for assistant_message items and EventToolUse -// for function_call items. All other event types return nil (skip signal). +// DisplayEvents. It emits EventText for assistant-message items and EventToolUse +// for function-call items. All other event types return nil (skip signal). +// +// Codex CLI versions disagree on the item discriminator and its values: +// - legacy schema: item.item_type == "assistant_message" / "function_call" +// - codex-cli >= 0.133.0: item.type == "agent_message" / "command_execution" +// +// We read BOTH discriminator keys (item_type wins when present) and accept both naming +// families, so the same parser handles old fixtures and the current CLI. When the schema +// is not recognized, extraction yields no text and the base provider falls back to the raw +// NDJSON — which is exactly the leak this dual-schema handling prevents. func (p *CodexProvider) parseCodexDisplayEvents(line []byte) []DisplayEvent { // Replace NUL bytes (0x00) with the 6-byte JSON unicode escape sequence // {0x5c,0x75,0x30,0x30,0x30,0x30} = backslash + u + 0 + 0 + 0 + 0. // json.Unmarshal decodes this escape back to NUL, preserving string content. + // This diverges intentionally from the Claude provider, which replaces NUL with a space + // (discarding the byte): Codex tool output may embed NUL as a meaningful sentinel, so we + // preserve it rather than mangle it. Do not align the two without re-checking that contract. sanitized := bytes.ReplaceAll(line, []byte{0x00}, []byte{0x5c, 0x75, 0x30, 0x30, 0x30, 0x30}) var evt struct { Type string `json:"type"` Item *struct { - ItemType string `json:"item_type"` + ItemType string `json:"item_type"` // legacy schema discriminator + Kind string `json:"type"` // codex-cli >= 0.133.0 discriminator Text string `json:"text"` Name string `json:"name"` Arguments string `json:"arguments"` @@ -270,10 +358,15 @@ func (p *CodexProvider) parseCodexDisplayEvents(line []byte) []DisplayEvent { return nil } if evt.Type == "item.completed" && evt.Item != nil { - switch evt.Item.ItemType { - case "assistant_message": + // item_type (legacy) takes precedence; fall back to type (current CLI). + itemKind := evt.Item.ItemType + if itemKind == "" { + itemKind = evt.Item.Kind + } + switch itemKind { + case "assistant_message", "agent_message": return []DisplayEvent{{Type: evt.Type, Kind: EventText, Text: evt.Item.Text}} - case "function_call": + case "function_call", "command_execution": // Codex does not emit tool-call IDs; ID is always empty. preview := extractArgPreview(evt.Item.Arguments) return []DisplayEvent{{Type: evt.Type, Kind: EventToolUse, Name: evt.Item.Name, Arg: preview, ID: ""}} diff --git a/internal/infrastructure/agents/codex_provider_delegation_test.go b/internal/infrastructure/agents/codex_provider_delegation_test.go index c8d81a32..19400f44 100644 --- a/internal/infrastructure/agents/codex_provider_delegation_test.go +++ b/internal/infrastructure/agents/codex_provider_delegation_test.go @@ -2,6 +2,7 @@ package agents import ( "context" + "errors" "strings" "testing" @@ -11,11 +12,6 @@ import ( "github.com/stretchr/testify/require" ) -// T004: Codex provider delegation tests verify that CodexProvider correctly -// delegates Execute and ExecuteConversation to baseCLIProvider through hooks. -// Tests fail against stub (buildExecuteArgs/buildConversationArgs return nil) -// and pass after implementation provides proper CLI arguments. - func TestCodexProvider_Execute_DelegationToBase(t *testing.T) { tests := []struct { name string @@ -23,7 +19,6 @@ func TestCodexProvider_Execute_DelegationToBase(t *testing.T) { options map[string]any mockStdout []byte wantOutput string - // Note: stub returns nil args, real impl will return ["exec", "--json", ...] }{ { name: "simple prompt delegation", @@ -55,18 +50,11 @@ func TestCodexProvider_Execute_DelegationToBase(t *testing.T) { provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) result, err := provider.Execute(context.Background(), tt.prompt, tt.options, nil, nil) - if err != nil { - // Stub returns nil args, causing executor to be called with zero args - // Real implementation will return proper args ["exec", "--json", ...] - t.Logf("implementation incomplete: %v", err) - assert.Nil(t, result) - return - } - + require.NoError(t, err) require.NotNil(t, result) assert.Equal(t, "codex", result.Provider) assert.Equal(t, tt.wantOutput, result.Output) - assert.True(t, result.TokensEstimated, "Execute must set TokensEstimated=true (FR-007)") + assert.True(t, result.TokensEstimated, "all CLI providers estimate tokens (no real-time token API)") assert.NotZero(t, result.Tokens) }) } @@ -78,10 +66,10 @@ func TestCodexProvider_Execute_EmptyPrompt_ValidationByBase(t *testing.T) { result, err := provider.Execute(context.Background(), "", nil, nil, nil) - // Base validates empty prompt before calling hooks assert.Error(t, err) assert.Nil(t, result) - assert.Contains(t, err.Error(), "prompt") + assert.Contains(t, err.Error(), "prompt cannot be empty") + assert.Empty(t, mockExec.GetCalls()) } func TestCodexProvider_Execute_ContextCancellation(t *testing.T) { @@ -93,9 +81,10 @@ func TestCodexProvider_Execute_ContextCancellation(t *testing.T) { result, err := provider.Execute(ctx, "test", nil, nil, nil) - // Base checks context error before hook execution assert.Error(t, err) assert.Nil(t, result) + assert.True(t, errors.Is(err, context.Canceled), "error must wrap context.Canceled") + assert.Empty(t, mockExec.GetCalls()) } func TestCodexProvider_ExecuteConversation_DelegationToBase(t *testing.T) { @@ -142,18 +131,12 @@ func TestCodexProvider_ExecuteConversation_DelegationToBase(t *testing.T) { state := workflow.NewConversationState(tt.systemPrompt) result, err := provider.ExecuteConversation(context.Background(), state, tt.userPrompt, tt.options, nil, nil) - if err != nil { - // Stub returns nil args → executor error (expected against stub) - t.Logf("stub error (expected): %v", err) - assert.Nil(t, result) - return - } - + require.NoError(t, err) require.NotNil(t, result) assert.Equal(t, "codex", result.Provider) assert.NotNil(t, result.State) - // Verify empty output gets " " fallback (FR-008) + // empty output gets " " fallback if tt.mockStdout == "" { assert.Equal(t, " ", result.Output) } else { @@ -182,6 +165,8 @@ func TestCodexProvider_ExecuteConversation_ContextCancellation(t *testing.T) { assert.Error(t, err) assert.Nil(t, result) + assert.True(t, errors.Is(err, context.Canceled), "error must wrap context.Canceled") + assert.Empty(t, mockExec.GetCalls()) } func TestCodexProvider_ExecuteConversation_TurnManagement(t *testing.T) { @@ -199,12 +184,7 @@ func TestCodexProvider_ExecuteConversation_TurnManagement(t *testing.T) { state.TotalTokens = 100 result, err := provider.ExecuteConversation(context.Background(), state, "Second question", nil, nil, nil) - if err != nil { - // Stub behavior - expected against incomplete implementation - t.Logf("stub error: %v", err) - return - } - + require.NoError(t, err) require.NotNil(t, result) require.NotNil(t, result.State) @@ -248,10 +228,8 @@ func TestCodexProvider_ExecuteConversation_SessionIDExtraction(t *testing.T) { state := workflow.NewConversationState("System") result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) - if err != nil { - // Stub case - return - } + require.NoError(t, err) + require.NotNil(t, result) if tt.wantThreadID != "" { assert.Equal(t, tt.wantThreadID, result.State.SessionID) @@ -260,15 +238,6 @@ func TestCodexProvider_ExecuteConversation_SessionIDExtraction(t *testing.T) { } } -func TestCodexProvider_Validate_BinaryNotFound(t *testing.T) { - provider := NewCodexProvider() - // This will fail on systems without Codex CLI, which is expected - err := provider.Validate() - // Error is acceptable (Codex not installed) - // Success is acceptable (Codex installed) - _ = err -} - func TestCodexProvider_ExecuteConversation_MultipleEvents(t *testing.T) { output := strings.Join([]string{ `{"type":"message","content":"thinking..."}`, @@ -282,13 +251,11 @@ func TestCodexProvider_ExecuteConversation_MultipleEvents(t *testing.T) { state := workflow.NewConversationState("Assistant") result, err := provider.ExecuteConversation(context.Background(), state, "multi-event test", nil, nil, nil) - if err != nil { - // Stub case - return - } + require.NoError(t, err) + require.NotNil(t, result) // Even with multiple events, session ID should be extracted - if result != nil && result.State != nil && result.State.SessionID != "" { + if result.State != nil && result.State.SessionID != "" { assert.Equal(t, "thread-multi", result.State.SessionID) } } @@ -299,13 +266,8 @@ func TestCodexProvider_Execute_AllTokensEstimatedTrue(t *testing.T) { provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) result, err := provider.Execute(context.Background(), "test prompt", nil, nil, nil) - if err != nil { - // Stub case - return - } - + require.NoError(t, err) require.NotNil(t, result) - // FR-007: TokensEstimated must be true for all CLI providers assert.True(t, result.TokensEstimated) } @@ -316,12 +278,105 @@ func TestCodexProvider_ExecuteConversation_AllTokensEstimatedTrue(t *testing.T) state := workflow.NewConversationState("System") result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) - if err != nil { - // Stub case - return + require.NoError(t, err) + require.NotNil(t, result) + assert.True(t, result.TokensEstimated) +} + +func TestCodexProvider_ExecuteConversation_OutputExtraction(t *testing.T) { + ndjson := ndjsonLine("extracted assistant text") + + tests := []struct { + name string + outputFormat string + }{ + {name: "default (no output_format)", outputFormat: ""}, + {name: "json output_format", outputFormat: "json"}, + {name: "text output_format", outputFormat: "text"}, + {name: "stream-json output_format", outputFormat: "stream-json"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + opts := map[string]any{} + if tt.outputFormat != "" { + opts["output_format"] = tt.outputFormat + } + + state := workflow.NewConversationState("System") + result, err := provider.ExecuteConversation(context.Background(), state, "test prompt", opts, nil, nil) + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "extracted assistant text", result.Output) + assert.NotContains(t, result.Output, "item.completed", "raw NDJSON envelope must not appear in Output") + // Verify the executor was actually invoked (proves mock isolation) + assert.Len(t, mockExec.GetCalls(), 1, "executor must be called exactly once per conversation turn") + }) } +} + +func TestCodexProvider_ExecuteConversation_HookWired(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"hook extracted text"}}` + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + state := workflow.NewConversationState("System") + result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) + require.NoError(t, err) require.NotNil(t, result) - // FR-007: TokensEstimated must be true for all CLI providers - assert.True(t, result.TokensEstimated) + + assert.Equal(t, "hook extracted text", result.Output) + assert.NotContains(t, result.Output, "item.completed", "raw NDJSON envelope must not appear in Output") + // Verify the executor was invoked with the NDJSON input (proves hook ran on real output) + assert.Len(t, mockExec.GetCalls(), 1, "executor must be called exactly once") +} + +// TestCodexProvider_ExecuteConversation_PlainTextFallback proves the hook runs at runtime: +// when extractCodexTextContent returns "" (non-NDJSON input), base_cli_provider.go:332-334 +// falls through to strings.TrimSpace(rawOutput). Without the hook wired, the base provider +// would use the raw bytes directly (no trim). With the hook wired, "" triggers the fallback. +func TestCodexProvider_ExecuteConversation_PlainTextFallback(t *testing.T) { + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(" plain text response "), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + state := workflow.NewConversationState("System") + result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) + require.NoError(t, err) + require.NotNil(t, result) + + // Hook returns "" for non-NDJSON; base falls back to strings.TrimSpace(rawOutput). + assert.Equal(t, "plain text response", result.Output, "plain text must be trimmed via base fallback") + assert.NotContains(t, result.Output, "item.completed") + assert.Len(t, mockExec.GetCalls(), 1) +} + +func TestCodexProvider_ExecuteConversation_NDJSONOutput(t *testing.T) { + multiEvent := strings.Join([]string{ + `{"type":"thread.started","thread_id":"thread-ndjson"}`, + ndjsonLine("first part"), + ndjsonLine("second part"), + `{"type":"item.completed","item":{"item_type":"function_call","name":"bash","arguments":"{}"}}`, + ndjsonLine("third part"), + }, "\n") + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(multiEvent), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + state := workflow.NewConversationState("System") + result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) + require.NoError(t, err) + require.NotNil(t, result) + + assert.Equal(t, "first part\nsecond part\nthird part", result.Output) + assert.NotContains(t, result.Output, "item.completed", "raw NDJSON must not appear in Output") + // Verify executor was invoked: proves the mock was the actual dependency under test + assert.Len(t, mockExec.GetCalls(), 1, "executor must be called exactly once") } diff --git a/internal/infrastructure/agents/codex_provider_exec_test.go b/internal/infrastructure/agents/codex_provider_exec_test.go index 72f912b0..93ce9832 100644 --- a/internal/infrastructure/agents/codex_provider_exec_test.go +++ b/internal/infrastructure/agents/codex_provider_exec_test.go @@ -214,9 +214,10 @@ func TestCodexProvider_ExecuteConversation_ResumeTurnResumeCommand(t *testing.T) calls := mockExec.GetCalls() require.Len(t, calls, 1) args := calls[0].Args - assert.Equal(t, "resume", args[0], "resume turn should use 'resume' subcommand") - assert.Equal(t, "xyz789", args[1], "should have session ID as arg[1]") - assert.Equal(t, "--json", args[2], "should have --json flag after session ID") + assert.Equal(t, "exec", args[0], "resume turn should use exec subcommand") + assert.Equal(t, "resume", args[1], "should use the exec resume subcommand") + assert.Equal(t, "xyz789", args[2], "should have session ID as arg[2]") + assert.Equal(t, "--json", args[3], "should have --json flag after session ID") assert.Contains(t, strings.Join(args, " "), "continue", "should contain prompt") } diff --git a/internal/infrastructure/agents/codex_provider_parse_display_events_test.go b/internal/infrastructure/agents/codex_provider_parse_display_events_test.go index 6778114d..6a90823f 100644 --- a/internal/infrastructure/agents/codex_provider_parse_display_events_test.go +++ b/internal/infrastructure/agents/codex_provider_parse_display_events_test.go @@ -135,6 +135,50 @@ func TestCodexProvider_parseCodexDisplayEvents_FunctionCall(t *testing.T) { } } +// TestCodexProvider_parseCodexDisplayEvents_CurrentSchema covers the codex-cli >= 0.133.0 +// event shape, where the item discriminator moved to item.type with new values +// (agent_message / command_execution) instead of the legacy item.item_type +// (assistant_message / function_call). Regression guard for the format drift that left raw +// NDJSON leaking into state.Output. +func TestCodexProvider_parseCodexDisplayEvents_CurrentSchema(t *testing.T) { + provider := NewCodexProvider() + + t.Run("agent_message surfaces text (real codex-cli 0.133.0 line)", func(t *testing.T) { + line := []byte(`{"type":"item.completed","item":{"id":"item_0","type":"agent_message","text":"PONG"}}`) + got := provider.parseCodexDisplayEvents(line) + require.Len(t, got, 1) + assert.Equal(t, "item.completed", got[0].Type) + assert.Equal(t, EventText, got[0].Kind) + assert.Equal(t, "PONG", got[0].Text) + }) + + t.Run("agent_message with empty text still produces an EventText", func(t *testing.T) { + line := []byte(`{"type":"item.completed","item":{"id":"i1","type":"agent_message","text":""}}`) + got := provider.parseCodexDisplayEvents(line) + require.Len(t, got, 1) + assert.Equal(t, EventText, got[0].Kind) + assert.Equal(t, "", got[0].Text) + }) + + t.Run("command_execution maps to EventToolUse", func(t *testing.T) { + line := []byte(`{"type":"item.completed","item":{"id":"i2","type":"command_execution","name":"shell","arguments":"{\"command\":\"ls -la\"}"}}`) + got := provider.parseCodexDisplayEvents(line) + require.Len(t, got, 1) + assert.Equal(t, EventToolUse, got[0].Kind) + assert.Equal(t, "shell", got[0].Name) + assert.Equal(t, "ls -la", got[0].Arg) + }) + + t.Run("legacy item_type wins when both keys present", func(t *testing.T) { + // item_type takes precedence over type, so a legacy assistant_message is honored. + line := []byte(`{"type":"item.completed","item":{"item_type":"assistant_message","type":"agent_message","text":"hi"}}`) + got := provider.parseCodexDisplayEvents(line) + require.Len(t, got, 1) + assert.Equal(t, EventText, got[0].Kind) + assert.Equal(t, "hi", got[0].Text) + }) +} + func TestCodexProvider_parseCodexDisplayEvents_NonAssistantItems(t *testing.T) { provider := NewCodexProvider() diff --git a/internal/infrastructure/agents/codex_provider_resume_test.go b/internal/infrastructure/agents/codex_provider_resume_test.go index 66f50a05..0e5b2db6 100644 --- a/internal/infrastructure/agents/codex_provider_resume_test.go +++ b/internal/infrastructure/agents/codex_provider_resume_test.go @@ -53,11 +53,12 @@ func TestCodexProvider_ExecuteConversation_T005_ResumeTurnResumeSubcommand(t *te call := calls[0] assert.Equal(t, "codex", call.Name) - assert.GreaterOrEqual(t, len(call.Args), 4) - assert.Equal(t, "resume", call.Args[0], "resume turn should use resume subcommand") - assert.Equal(t, "codex-sess-123", call.Args[1], "session ID should be args[1]") - assert.Equal(t, "--json", call.Args[2], "resume should have --json flag after session ID") - assert.Equal(t, "add error handling", call.Args[3], "prompt should follow --json") + assert.GreaterOrEqual(t, len(call.Args), 5) + assert.Equal(t, "exec", call.Args[0], "resume turn should use exec subcommand") + assert.Equal(t, "resume", call.Args[1], "exec resume subcommand") + assert.Equal(t, "codex-sess-123", call.Args[2], "session ID should be args[2]") + assert.Equal(t, "--json", call.Args[3], "resume should have --json flag after session ID") + assert.Equal(t, "add error handling", call.Args[4], "prompt should follow --json") } // T005: ExecuteConversation uses resume for any non-empty session ID (no prefix required) @@ -77,10 +78,11 @@ func TestCodexProvider_ExecuteConversation_T005_NonCodexPrefixedSessionIDFallbac require.Len(t, calls, 1) call := calls[0] - assert.Equal(t, "resume", call.Args[0], "any non-empty session ID triggers resume subcommand") - assert.Equal(t, "non-codex-prefixed-id", call.Args[1], "session ID passed after resume") - assert.Equal(t, "--json", call.Args[2]) - assert.Equal(t, "continue", call.Args[3]) + assert.Equal(t, "exec", call.Args[0], "any non-empty session ID triggers exec resume") + assert.Equal(t, "resume", call.Args[1], "exec resume subcommand") + assert.Equal(t, "non-codex-prefixed-id", call.Args[2], "session ID passed after resume") + assert.Equal(t, "--json", call.Args[3]) + assert.Equal(t, "continue", call.Args[4]) } // T005: Quiet flag is NOT passed (replaced by --json on exec/resume) @@ -204,7 +206,7 @@ func TestCodexProvider_ExecuteConversation_T005_OptionsWithSubcommands(t *testin name: "resume turn with model", sessionID: "codex-sess-abc", options: map[string]any{"model": "gpt-4o"}, - expectedSubcmd: "resume", + expectedSubcmd: "exec", expectedFlag: "--model", }, { @@ -218,7 +220,7 @@ func TestCodexProvider_ExecuteConversation_T005_OptionsWithSubcommands(t *testin name: "resume turn with dangerously_skip_permissions", sessionID: "codex-sess-xyz", options: map[string]any{"dangerously_skip_permissions": true}, - expectedSubcmd: "resume", + expectedSubcmd: "exec", expectedFlag: "--dangerously-bypass-approvals-and-sandbox", }, } @@ -239,6 +241,9 @@ func TestCodexProvider_ExecuteConversation_T005_OptionsWithSubcommands(t *testin call := calls[0] assert.Equal(t, tt.expectedSubcmd, call.Args[0]) + if tt.sessionID != "" { + assert.Equal(t, "resume", call.Args[1], "resume turn nests the resume subcommand under exec") + } assert.Contains(t, call.Args, tt.expectedFlag, "expected flag should be in args") assert.NotContains(t, call.Args, "--language", "--language is not supported") assert.NotContains(t, call.Args, "--yolo", "--yolo is not supported") @@ -316,7 +321,7 @@ func TestCodexProvider_ExecuteConversation_T005_SystemPromptHandling(t *testing. name: "resume turn with system prompt (should ignore)", sessionID: "codex-sess-abc", hasSystemPrompt: true, - expectedSubcmd: "resume", + expectedSubcmd: "exec", shouldInline: false, }, } @@ -345,11 +350,11 @@ func TestCodexProvider_ExecuteConversation_T005_SystemPromptHandling(t *testing. assert.NotContains(t, args, "--system-prompt", "codex has no --system-prompt flag") var promptArg string - switch args[0] { - case "exec": + switch { + case args[0] == "exec" && len(args) > 1 && args[1] == "resume": + promptArg = args[4] + case args[0] == "exec": promptArg = args[2] - case "resume": - promptArg = args[3] } if tt.shouldInline { diff --git a/internal/infrastructure/agents/codex_provider_session_test.go b/internal/infrastructure/agents/codex_provider_session_test.go index a5b671f3..a3b43820 100644 --- a/internal/infrastructure/agents/codex_provider_session_test.go +++ b/internal/infrastructure/agents/codex_provider_session_test.go @@ -240,14 +240,14 @@ func TestCodexProvider_ExecuteConversation_SystemPromptFirstTurnOnly(t *testing. assert.NotContains(t, args, "--system-prompt", "codex has no --system-prompt flag") // The prompt arg sits immediately after the subcommand positional args. - // exec: args = ["exec", "--json", , ...] - // resume: args = ["resume", , "--json", , ...] + // first turn: args = ["exec", "--json", , ...] + // resume: args = ["exec", "resume", , "--json", , ...] var promptArg string - switch args[0] { - case "exec": + switch { + case args[0] == "exec" && len(args) > 1 && args[1] == "resume": + promptArg = args[4] + case args[0] == "exec": promptArg = args[2] - case "resume": - promptArg = args[3] default: t.Fatalf("unexpected subcommand %q", args[0]) } diff --git a/internal/infrastructure/agents/codex_provider_test_helpers_test.go b/internal/infrastructure/agents/codex_provider_test_helpers_test.go new file mode 100644 index 00000000..23122498 --- /dev/null +++ b/internal/infrastructure/agents/codex_provider_test_helpers_test.go @@ -0,0 +1,15 @@ +package agents + +import "encoding/json" + +// ndjsonLine builds a single Codex NDJSON `item.completed` envelope wrapping an +// assistant_message with the given text. Shared across the codex provider test +// files (unit + delegation) so the envelope shape lives in one place. +// +// The text is JSON-escaped via json.Marshal so callers may pass quotes, +// backslashes, or control characters without producing a malformed envelope — +// matching the project convention of testing handlers with special characters. +func ndjsonLine(text string) string { + escaped, _ := json.Marshal(text) //nolint:errcheck // json.Marshal never fails for a Go string value + return `{"type":"item.completed","item":{"item_type":"assistant_message","text":` + string(escaped) + `}}` +} diff --git a/internal/infrastructure/agents/codex_provider_unit_test.go b/internal/infrastructure/agents/codex_provider_unit_test.go index bdbb862c..15d58423 100644 --- a/internal/infrastructure/agents/codex_provider_unit_test.go +++ b/internal/infrastructure/agents/codex_provider_unit_test.go @@ -3,6 +3,7 @@ package agents import ( "context" "errors" + "strings" "testing" "time" @@ -12,9 +13,6 @@ import ( "github.com/stretchr/testify/require" ) -// Component: C025 - Unit Tests for CodexProvider (WITHOUT integration build tag) -// These tests use MockCLIExecutor to avoid external CLI dependencies - func TestCodexProvider_Execute_Success(t *testing.T) { tests := []struct { name string @@ -69,7 +67,6 @@ func TestCodexProvider_Execute_Success(t *testing.T) { require.NotNil(t, result) assert.Equal(t, "codex", result.Provider) assert.Equal(t, tt.wantOutput, result.Output) - // Token estimation is ~4 chars per token expectedTokens := len(tt.wantOutput) / 4 assert.Equal(t, expectedTokens, result.Tokens) assert.False(t, result.StartedAt.IsZero()) @@ -175,7 +172,6 @@ func TestCodexProvider_Execute_EmptyPrompt(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), tt.wantErr) assert.Nil(t, result) - // Executor should not be called calls := mockExec.GetCalls() assert.Empty(t, calls) }) @@ -189,7 +185,6 @@ func TestCodexProvider_Execute_ValidationErrors(t *testing.T) { options map[string]any wantErr string }{ - // Valid models (should not error) { name: "valid model: gpt-4o", prompt: "test", @@ -226,7 +221,6 @@ func TestCodexProvider_Execute_ValidationErrors(t *testing.T) { options: map[string]any{"model": "o4-mini"}, wantErr: "", }, - // Invalid models (should error) { name: "invalid model: claude-3-opus (wrong provider)", prompt: "test", @@ -269,7 +263,6 @@ func TestCodexProvider_Execute_ValidationErrors(t *testing.T) { options: map[string]any{"model": "oracle"}, wantErr: "must start with 'gpt-', 'codex-', or be an o-series", }, - // Options that are silently ignored (no validation error) { name: "max_tokens is silently ignored (no validation error)", prompt: "test", @@ -396,6 +389,10 @@ func TestCodexProvider_Execute_CLIErrors(t *testing.T) { } } +// TestCodexProvider_Execute_StdoutStderrCombination exercises the non-NDJSON fallback path: +// plain stdout/stderr carries no assistant_message event, so extractCodexAssistantText reports +// hadText=false and Output is whatever baseCLIProvider.combineOutput produced. It validates the +// base combine/fallback behavior, not the F103 NDJSON extraction overwrite. func TestCodexProvider_Execute_StdoutStderrCombination(t *testing.T) { tests := []struct { name string @@ -459,30 +456,13 @@ func TestCodexProvider_Execute_StdoutStderrCombination(t *testing.T) { func TestCodexProvider_Execute_TokenEstimation(t *testing.T) { tests := []struct { - name string - mockStdout []byte - expectedTokens int + name string + mockStdout []byte }{ - { - name: "small output", - mockStdout: []byte("test"), - expectedTokens: 1, // 4 chars / 4 = 1 - }, - { - name: "medium output", - mockStdout: []byte("This is a longer output with multiple words"), - expectedTokens: 10, // 44 chars / 4 = 10 (integer division) - }, - { - name: "large output", - mockStdout: make([]byte, 1000), - expectedTokens: 250, // 1000 / 4 = 250 - }, - { - name: "empty output", - mockStdout: []byte(""), - expectedTokens: 0, - }, + {name: "small output", mockStdout: []byte("test")}, + {name: "medium output", mockStdout: []byte("This is a longer output with multiple words")}, + {name: "large output", mockStdout: make([]byte, 1000)}, + {name: "empty output", mockStdout: []byte("")}, } for _, tt := range tests { @@ -495,37 +475,18 @@ func TestCodexProvider_Execute_TokenEstimation(t *testing.T) { require.NoError(t, err) require.NotNil(t, result) - assert.Equal(t, tt.expectedTokens, result.Tokens) + // Plain (non-NDJSON) output is used verbatim, except an empty body becomes the " " + // fallback. The ApproximationTokenizer estimates len/4 tokens; derive the expectation + // from that contract instead of hardcoding so the test tracks the tokenizer's ratio. + wantOutput := string(tt.mockStdout) + if wantOutput == "" { + wantOutput = " " + } + assert.Equal(t, len(wantOutput)/4, result.Tokens) }) } } -func TestCodexProvider_Execute_TimestampOrdering(t *testing.T) { - mockExec := mocks.NewMockCLIExecutor() - mockExec.SetOutput([]byte("code"), nil) - provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) - - result, err := provider.Execute(context.Background(), "test", nil, nil, nil) - - require.NoError(t, err) - require.NotNil(t, result) - assert.False(t, result.StartedAt.IsZero()) - assert.False(t, result.CompletedAt.IsZero()) - assert.True(t, result.CompletedAt.After(result.StartedAt) || result.CompletedAt.Equal(result.StartedAt)) -} - -func TestCodexProvider_Execute_ProviderName(t *testing.T) { - mockExec := mocks.NewMockCLIExecutor() - mockExec.SetOutput([]byte("code"), nil) - provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) - - result, err := provider.Execute(context.Background(), "test", nil, nil, nil) - - require.NoError(t, err) - require.NotNil(t, result) - assert.Equal(t, "codex", result.Provider) -} - func TestCodexProvider_ExecuteConversation_Success(t *testing.T) { tests := []struct { name string @@ -698,7 +659,6 @@ func TestCodexProvider_ExecuteConversation_ValidationErrors(t *testing.T) { options map[string]any wantErr string }{ - // Valid models (should not error) { name: "valid model: gpt-4o", options: map[string]any{"model": "gpt-4o"}, @@ -714,7 +674,6 @@ func TestCodexProvider_ExecuteConversation_ValidationErrors(t *testing.T) { options: map[string]any{"model": "o1"}, wantErr: "", }, - // Invalid models (should error) { name: "invalid model: claude-3-opus", options: map[string]any{"model": "claude-3-opus"}, @@ -735,7 +694,6 @@ func TestCodexProvider_ExecuteConversation_ValidationErrors(t *testing.T) { options: map[string]any{"model": "ollama"}, wantErr: "must start with 'gpt-', 'codex-', or be an o-series", }, - // Options that are silently ignored (no validation error) { name: "temperature is silently ignored (no validation error)", options: map[string]any{"temperature": 2.5}, @@ -911,15 +869,12 @@ func TestCodexProvider_Name(t *testing.T) { assert.Equal(t, "codex", provider.Name()) } -func TestCodexProvider_Validate_Success(t *testing.T) { - // Note: This test will fail if 'codex' is not in PATH - // For unit testing purposes, we skip this test if codex is not available +func TestCodexProvider_Validate_DoesNotPanic(t *testing.T) { + // Validate() resolves 'codex' in PATH, so its result is environment-specific + // (error when absent, nil when installed). This test only guarantees the method + // can be invoked without panicking. provider := NewCodexProvider() - err := provider.Validate() - - // We don't assert anything specific here because this depends on the system - // The test verifies the method can be called without panicking - _ = err + _ = provider.Validate() } func TestCodexProvider_NewCodexProvider_DefaultExecutor(t *testing.T) { @@ -930,7 +885,6 @@ func TestCodexProvider_NewCodexProvider_DefaultExecutor(t *testing.T) { assert.True(t, ok, "default executor should be ExecCLIExecutor") } -// T008: Explicit tests for `exec --json` arg structure assertion func TestCodexProvider_Execute_ExecJSONStructure(t *testing.T) { tests := []struct { name string @@ -986,7 +940,6 @@ func TestCodexProvider_Execute_ExecJSONStructure(t *testing.T) { } } -// T008: Verify --json flag is first arg after exec subcommand func TestCodexProvider_Execute_JSONFlagPosition(t *testing.T) { mockExec := mocks.NewMockCLIExecutor() mockExec.SetOutput([]byte("result"), nil) @@ -1005,7 +958,6 @@ func TestCodexProvider_Execute_JSONFlagPosition(t *testing.T) { assert.Equal(t, "test prompt", args[2]) } -// T008: ExecuteConversation uses exec --json on first turn func TestCodexProvider_ExecuteConversation_FirstTurnExecJSON(t *testing.T) { mockExec := mocks.NewMockCLIExecutor() mockExec.SetOutput([]byte("response"), nil) @@ -1025,7 +977,481 @@ func TestCodexProvider_ExecuteConversation_FirstTurnExecJSON(t *testing.T) { assert.Equal(t, "first prompt", args[2], "third arg should be prompt") } -// T008: ExecuteConversation uses resume subcommand on resume turn +func TestCodexProvider_extractCodexTextContent(t *testing.T) { + tests := []struct { + name string + input string + wantText string + }{ + { + name: "single assistant_message", + input: ndjsonLine("hello"), + wantText: "hello", + }, + { + name: "multiple assistant_messages joined with newline", + input: ndjsonLine("foo") + "\n" + ndjsonLine("bar"), + wantText: "foo\nbar", + }, + { + name: "empty assistant_message text", + input: ndjsonLine(""), + wantText: "", + }, + { + name: "plain non-NDJSON text", + input: "second response", + wantText: "", + }, + { + name: "empty input", + input: "", + wantText: "", + }, + { + name: "NUL byte inside JSON string value", + input: "{\"type\":\"item.completed\",\"item\":{\"item_type\":\"assistant_message\",\"text\":\"hell\x00o\"}}", + wantText: "hell\x00o", + }, + { + name: "function_call only produces no text", + input: `{"type":"item.completed","item":{"item_type":"function_call","name":"bash","arguments":"{}"}}`, + wantText: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := NewCodexProvider() + got := p.extractCodexTextContent(tt.input) + assert.Equal(t, tt.wantText, got) + }) + } +} + +func TestCodexProvider_extractCodexAssistantText(t *testing.T) { + tests := []struct { + name string + input string + wantText string + wantHadText bool + }{ + { + name: "single assistant_message with text", + input: ndjsonLine("hello"), + wantText: "hello", + wantHadText: true, + }, + { + name: "multiple assistant_messages joined with newline", + input: ndjsonLine("foo") + "\n" + ndjsonLine("bar"), + wantText: "foo\nbar", + wantHadText: true, + }, + { + name: "empty assistant_message text — hadText true despite empty string", + input: ndjsonLine(""), + wantText: "", + wantHadText: true, + }, + { + name: "plain non-NDJSON text — hadText false", + input: "second response", + wantText: "", + wantHadText: false, + }, + { + name: "empty input", + input: "", + wantText: "", + wantHadText: false, + }, + { + name: "NUL byte inside JSON string value — no truncation", + input: "{\"type\":\"item.completed\",\"item\":{\"item_type\":\"assistant_message\",\"text\":\"hell\x00o\"}}", + wantText: "hell\x00o", + wantHadText: true, + }, + { + name: "function_call only — hadText false", + input: `{"type":"item.completed","item":{"item_type":"function_call","name":"bash","arguments":"{}"}}`, + wantText: "", + wantHadText: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := NewCodexProvider() + gotText, gotHadText := p.extractCodexAssistantText(tt.input) + assert.Equal(t, tt.wantText, gotText) + assert.Equal(t, tt.wantHadText, gotHadText) + }) + } +} + +func TestCodexProvider_extractCodexTokenUsage(t *testing.T) { + tests := []struct { + name string + input string + wantNil bool + wantInput int + wantOutput int + wantTotal int + }{ + { + name: "turn.completed with full usage", + input: `{"type":"turn.completed","usage":{"input_tokens":10,"output_tokens":5}}`, + wantInput: 10, + wantOutput: 5, + wantTotal: 15, + }, + { + name: "usage among multiple events", + input: `{"type":"thread.started","thread_id":"t1"}` + "\n" + `{"type":"turn.completed","usage":{"input_tokens":3,"output_tokens":7}}`, + wantInput: 3, + wantOutput: 7, + wantTotal: 10, + }, + { + name: "partial usage defaults missing field to zero", + input: `{"type":"turn.completed","usage":{"input_tokens":4}}`, + wantInput: 4, + wantOutput: 0, + wantTotal: 4, + }, + { + name: "turn.completed without usage field", + input: `{"type":"turn.completed"}`, + wantNil: true, + }, + { + name: "usage present but not an object", + input: `{"type":"turn.completed","usage":"nope"}`, + wantNil: true, + }, + { + name: "no turn.completed event", + input: `{"type":"item.completed","item":{"item_type":"assistant_message","text":"hi"}}`, + wantNil: true, + }, + { + name: "empty output", + input: "", + wantNil: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := NewCodexProvider() + usage := p.extractCodexTokenUsage(tt.input) + if tt.wantNil { + assert.Nil(t, usage) + return + } + require.NotNil(t, usage) + assert.Equal(t, tt.wantInput, usage.InputTokens) + assert.Equal(t, tt.wantOutput, usage.OutputTokens) + assert.Equal(t, tt.wantTotal, usage.TotalTokens) + }) + } +} + +// TestCodexProvider_RealStream_0133 feeds the exact NDJSON stream emitted by +// codex-cli 0.133.0 (captured from `codex exec --json`) and asserts the assistant text +// is extracted cleanly and the real usage is read. Regression guard for the schema drift +// (item.type=agent_message) that previously left the raw envelope in state.Output. +func TestCodexProvider_RealStream_0133(t *testing.T) { + stream := strings.Join([]string{ + `{"type":"thread.started","thread_id":"019e91f0-2fd4-7a20-9fc2-03994766eec3"}`, + `{"type":"turn.started"}`, + `{"type":"item.completed","item":{"id":"item_0","type":"agent_message","text":"PONG"}}`, + `{"type":"turn.completed","usage":{"input_tokens":18809,"cached_input_tokens":3456,"output_tokens":6,"reasoning_output_tokens":0}}`, + }, "\n") + + p := NewCodexProvider() + + text, hadText := p.extractCodexAssistantText(stream) + assert.True(t, hadText, "agent_message must be recognized as assistant text") + assert.Equal(t, "PONG", text, "Output must be the clean answer, not the raw NDJSON envelope") + + usage := p.extractCodexTokenUsage(stream) + require.NotNil(t, usage, "turn.completed.usage must be read") + assert.Equal(t, 18809, usage.InputTokens) + assert.Equal(t, 6, usage.OutputTokens) +} + +func TestCodexProvider_Execute_OutputExtractionAllFormats(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"clean extracted text"}}` + tests := []struct { + name string + outputFormat string + }{ + {name: "json format", outputFormat: "json"}, + {name: "stream-json format", outputFormat: "stream-json"}, + {name: "text format", outputFormat: "text"}, + {name: "empty format", outputFormat: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + options := map[string]any{"output_format": tt.outputFormat} + + result, err := provider.Execute(context.Background(), "test", options, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "clean extracted text", result.Output) + }) + } +} + +func TestCodexProvider_Execute_ResponsePopulation_ValidJSON(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"{\"answer\":42}"}}` + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + require.NotNil(t, result.Response) + assert.Equal(t, float64(42), result.Response["answer"]) +} + +func TestCodexProvider_Execute_ResponsePopulation_InvalidJSON(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"plain prose text"}}` + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Nil(t, result.Response) + assert.Equal(t, "plain prose text", result.Output) +} + +func TestCodexProvider_Execute_ResponsePopulation_EmptyExtraction(t *testing.T) { + rawMock := "raw plain text output" + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(rawMock), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Nil(t, result.Response) + assert.Equal(t, rawMock, result.Output) +} + +func TestCodexProvider_Execute_TokenRecountOnExtracted(t *testing.T) { + // recount uses extracted length ("hi" = 0 tokens), not raw NDJSON length (~19 tokens) + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"hi"}}` + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.True(t, result.TokensEstimated) + assert.Equal(t, len("hi")/4, result.Tokens) +} + +func TestCodexProvider_ExecuteConversation_EmptyAssistantMessage(t *testing.T) { + // Response nil: base uses struct literal init, not make(); update if base switches to pre-init. + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":""}}` + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.ExecuteConversation(context.Background(), workflow.NewConversationState(""), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "", result.Output) + assert.Nil(t, result.Response) + // F103: the presence-aware overwrite must re-derive token counts from the extracted + // (empty) text rather than leaving the inflated count the base computed on the raw NDJSON. + assert.Equal(t, 0, result.TokensOutput) + assert.Equal(t, result.TokensInput, result.TokensTotal) + // And the trailing assistant turn Content must match the overwritten Output. + require.NotEmpty(t, result.State.Turns) + last := result.State.Turns[len(result.State.Turns)-1] + assert.Equal(t, workflow.TurnRoleAssistant, last.Role) + assert.Equal(t, "", last.Content) +} + +func TestCodexProvider_ExecuteConversation_ResponsePopulation_ValidJSON(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"{\"answer\":42}"}}` + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.ExecuteConversation(context.Background(), workflow.NewConversationState(""), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + require.NotNil(t, result.Response) + assert.Equal(t, float64(42), result.Response["answer"]) +} + +func TestCodexProvider_ExecuteConversation_JSONResponse(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"{\"key\":\"val\"}"}}` + tests := []struct { + name string + outputFormat string + }{ + {name: "json format", outputFormat: "json"}, + {name: "stream-json format", outputFormat: "stream-json"}, + {name: "text format", outputFormat: "text"}, + {name: "absent format", outputFormat: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + options := map[string]any{"output_format": tt.outputFormat} + + result, err := provider.ExecuteConversation(context.Background(), workflow.NewConversationState(""), "test", options, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + require.NotNil(t, result.Response, "Response must be populated for all output_format values") + assert.Equal(t, "val", result.Response["key"]) + }) + } +} + +func TestCodexProvider_ExecuteConversation_NilResponseOnPlainText(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"plain prose text"}}` + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.ExecuteConversation(context.Background(), workflow.NewConversationState(""), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Nil(t, result.Response) + assert.Equal(t, "plain prose text", result.Output) +} + +func TestCodexProvider_ExecuteConversation_MultiEventNDJSON(t *testing.T) { + ndjson := ndjsonLine("foo") + "\n" + ndjsonLine("bar") + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + state := workflow.NewConversationState("") + + result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "foo\nbar", result.Output) +} + +func TestCodexProvider_ExecuteConversation_NULBytesInStream(t *testing.T) { + ndjson := "{\"type\":\"item.completed\",\"item\":{\"item_type\":\"assistant_message\",\"text\":\"before\x00after\"}}" + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + state := workflow.NewConversationState("") + + result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Contains(t, result.Output, "after", "post-NUL characters must not be truncated") +} + +func TestCodexProvider_ExecuteConversation_NonJSONText(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"hello world"}}` + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + state := workflow.NewConversationState("") + + result, err := provider.ExecuteConversation(context.Background(), state, "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "hello world", result.Output) + assert.Nil(t, result.Response) +} + +func TestCodexProvider_Execute_MultiEventNDJSON(t *testing.T) { + ndjson := ndjsonLine("foo") + "\n" + ndjsonLine("bar") + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "foo\nbar", result.Output) +} + +func TestCodexProvider_Execute_EmptyAssistantMessage(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":""}}` + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "", result.Output) + assert.Nil(t, result.Response) +} + +func TestCodexProvider_Execute_NULBytesInStream(t *testing.T) { + ndjson := "{\"type\":\"item.completed\",\"item\":{\"item_type\":\"assistant_message\",\"text\":\"before\x00after\"}}" + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Contains(t, result.Output, "after", "post-NUL characters must not be truncated") +} + +func TestCodexProvider_Execute_NonJSONText(t *testing.T) { + ndjson := `{"type":"item.completed","item":{"item_type":"assistant_message","text":"hello world"}}` + + mockExec := mocks.NewMockCLIExecutor() + mockExec.SetOutput([]byte(ndjson), nil) + provider := NewCodexProviderWithOptions(WithCodexExecutor(mockExec)) + + result, err := provider.Execute(context.Background(), "test", nil, nil, nil) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Equal(t, "hello world", result.Output) + assert.Nil(t, result.Response) +} + func TestCodexProvider_ExecuteConversation_ResumeJSONStructure(t *testing.T) { mockExec := mocks.NewMockCLIExecutor() mockExec.SetOutput([]byte("response"), nil) @@ -1040,9 +1466,10 @@ func TestCodexProvider_ExecuteConversation_ResumeJSONStructure(t *testing.T) { require.Len(t, calls, 1) args := calls[0].Args - require.Len(t, args, 4, "resume turn should have resume, sessionID, --json, prompt") - assert.Equal(t, "resume", args[0], "first arg should be resume subcommand") - assert.Equal(t, "codex-abc123def456", args[1], "second arg should be session ID") - assert.Equal(t, "--json", args[2], "third arg should be --json flag") - assert.Equal(t, "follow up", args[3], "fourth arg should be prompt") + require.Len(t, args, 5, "resume turn should have exec, resume, sessionID, --json, prompt") + assert.Equal(t, "exec", args[0], "first arg should be exec subcommand") + assert.Equal(t, "resume", args[1], "second arg should be resume subcommand") + assert.Equal(t, "codex-abc123def456", args[2], "third arg should be session ID") + assert.Equal(t, "--json", args[3], "fourth arg should be --json flag") + assert.Equal(t, "follow up", args[4], "fifth arg should be prompt") } diff --git a/internal/infrastructure/agents/helpers.go b/internal/infrastructure/agents/helpers.go index 73423837..efbc2b35 100644 --- a/internal/infrastructure/agents/helpers.go +++ b/internal/infrastructure/agents/helpers.go @@ -62,7 +62,7 @@ func tryParseJSONResponse(output string) map[string]any { // findFirstNDJSONEvent scans NDJSON output and returns the first parsed event // whose "type" field equals eventType. Returns nil if no match is found. func findFirstNDJSONEvent(output, eventType string) map[string]any { - for _, line := range strings.Split(output, "\n") { + for line := range strings.SplitSeq(output, "\n") { line = strings.TrimSpace(line) if line == "" { continue @@ -80,7 +80,7 @@ func findFirstNDJSONEvent(output, eventType string) map[string]any { func findLastNDJSONEvent(output, eventType string) map[string]any { var found map[string]any - for _, line := range strings.Split(output, "\n") { + for line := range strings.SplitSeq(output, "\n") { line = strings.TrimSpace(line) if line == "" { continue