From 69a2b46c5708565dac73f8576e1b4bf7bb5982a1 Mon Sep 17 00:00:00 2001
From: 0xDevNinja
Date: Thu, 14 May 2026 14:19:43 +0530
Subject: [PATCH] fix(global-discover): bucket codex by originator + read 128KB for CC cwd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two problems from issue #1315, one diff.

Problem 1: `scanCodex` counted every rollout file as a `codex` session,
conflating Codex Desktop (interactive codex dev) with codex_exec (cron /
subagent) and Claude Code (CC driving codex via MCP). `/retro global`
then narrated "codex was the primary tool, 414 sessions" when codex
actually drove dev for one repo's middle phase and the other ~309
entries were CC firing codex as cross-model review.

`payload.originator` is now normalized into a 4-bucket
`codex_originators: { desktop, exec, claude_code, other }`. Surfaced
under `tools.codex.originators` and per-repo `codex_originators`.
Additive — existing `codex` totals stay, no consumer break.

Problem 2: `extractCwdFromJsonl` read 8KB then parsed. Recent Claude
Code / CCR JSONL files often open with a `queue-operation` event
30-50KB long that carries no `cwd`. The 8KB read truncated that line,
JSON.parse failed, the fallback returned null, and the whole project
dir got skipped. Akagilnc measured ~450 CC files vanishing this way in
one repo's 31d window.

Bumped to 128KB — same buffer size `scanCodex` already uses. Also
exposes `CLAUDE_PROJECTS_DIR` env override (parallel to existing
`CODEX_SESSIONS_DIR`) so the regression test can plant a fake project
dir with a >30KB first line.

Out of scope: the suggested annotation in `/retro global` output that
"sessions" means "tool invocations / file count" (problem 3 in the
issue). Reasonable separate PR.

Fixes #1315 --- bin/gstack-global-discover.ts | 78 ++++++++++++++-- test/global-discover.test.ts | 171 ++++++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 8 deletions(-) diff --git a/bin/gstack-global-discover.ts b/bin/gstack-global-discover.ts index 4e1445b37a..accc292ff6 100644 --- a/bin/gstack-global-discover.ts +++ b/bin/gstack-global-discover.ts @@ -16,9 +16,22 @@ import { homedir } from "os"; // ── Types ────────────────────────────────────────────────────────────────── +// Codex `payload.originator` values map to four buckets so /retro global can +// distinguish real codex dev (Codex Desktop) from subagent invocations +// (codex_exec) and CC-driven calls (Claude Code). See issue #1315. +type CodexOriginator = "desktop" | "exec" | "claude_code" | "other"; + interface Session { tool: "claude_code" | "codex" | "gemini"; cwd: string; + codexOriginator?: CodexOriginator; +} + +interface CodexOriginatorCounts { + desktop: number; + exec: number; + claude_code: number; + other: number; } interface Repo { @@ -26,6 +39,7 @@ interface Repo { remote: string; paths: string[]; sessions: { claude_code: number; codex: number; gemini: number }; + codex_originators: CodexOriginatorCounts; } interface DiscoveryResult { @@ -34,7 +48,7 @@ interface DiscoveryResult { repos: Repo[]; tools: { claude_code: { total_sessions: number; repos: number }; - codex: { total_sessions: number; repos: number }; + codex: { total_sessions: number; repos: number; originators: CodexOriginatorCounts }; gemini: { total_sessions: number; repos: number }; }; total_sessions: number; @@ -178,7 +192,9 @@ function getGitRemote(cwd: string): string | null { // ── Scanners ─────────────────────────────────────────────────────────────── function scanClaudeCode(since: Date): Session[] { - const projectsDir = join(homedir(), ".claude", "projects"); + // `CLAUDE_PROJECTS_DIR` is honored for test injection (mirrors the existing + // `CODEX_SESSIONS_DIR` knob in scanCodex). 
Production paths leave it unset. + const projectsDir = process.env.CLAUDE_PROJECTS_DIR || join(homedir(), ".claude", "projects"); if (!existsSync(projectsDir)) return []; const sessions: Session[] = []; @@ -274,11 +290,17 @@ function resolveClaudeCodeCwd( } function extractCwdFromJsonl(filePath: string): string | null { + // Recent Claude Code / CCR JSONL files often start with a large + // `queue-operation` event (~30-50KB) that carries no `cwd`. The pre-fix + // 8KB buffer truncated mid-line, JSON.parse failed, and the project + // directory was silently dropped from the discovery count. Issue #1315 + // (Akagilnc's diagnosis): ~450 CC jsonl files in a single repo went + // missing this way. 128KB matches scanCodex's existing buffer choice + // and covers the largest first-line events we've seen in the wild. try { - // Read only the first 8KB to avoid loading huge JSONL files into memory const fd = openSync(filePath, "r"); - const buf = Buffer.alloc(8192); - const bytesRead = readSync(fd, buf, 0, 8192, 0); + const buf = Buffer.alloc(131072); + const bytesRead = readSync(fd, buf, 0, 131072, 0); closeSync(fd); const text = buf.toString("utf-8", 0, bytesRead); const lines = text.split("\n").slice(0, 15); @@ -297,6 +319,23 @@ function extractCwdFromJsonl(filePath: string): string | null { return null; } +// Codex rollouts ship a free-form `payload.originator` string. Real values +// seen in the wild: "Codex Desktop" (interactive dev), "codex_exec" (cron / +// scripted / subagent), "Claude Code" (CC's MCP / subagent integration). +// Anything else lands in `other` rather than being silently dropped. 
+function normalizeCodexOriginator(raw: unknown): CodexOriginator { + if (typeof raw !== "string") return "other"; + const v = raw.toLowerCase(); + if (v === "codex desktop" || v === "codex_desktop") return "desktop"; + if (v === "codex_exec" || v === "codex exec") return "exec"; + if (v === "claude code" || v === "claude_code") return "claude_code"; + return "other"; +} + +function emptyOriginatorCounts(): CodexOriginatorCounts { + return { desktop: 0, exec: 0, claude_code: 0, other: 0 }; +} + function scanCodex(since: Date): Session[] { const sessionsDir = process.env.CODEX_SESSIONS_DIR || join(homedir(), ".codex", "sessions"); if (!existsSync(sessionsDir)) return []; @@ -346,7 +385,11 @@ function scanCodex(since: Date): Session[] { if (!firstLine) continue; const meta = JSON.parse(firstLine); if (meta.type === "session_meta" && meta.payload?.cwd) { - sessions.push({ tool: "codex", cwd: meta.payload.cwd }); + sessions.push({ + tool: "codex", + cwd: meta.payload.cwd, + codexOriginator: normalizeCodexOriginator(meta.payload.originator), + }); } } catch { console.error(`Warning: could not parse Codex session ${filePath}`); @@ -508,8 +551,12 @@ async function resolveAndDeduplicate(sessions: Session[]): Promise { } const sessionCounts = { claude_code: 0, codex: 0, gemini: 0 }; + const codexOriginators = emptyOriginatorCounts(); for (const s of data.sessions) { sessionCounts[s.tool]++; + if (s.tool === "codex") { + codexOriginators[s.codexOriginator ?? 
"other"]++; + } } repos.push({ @@ -517,6 +564,7 @@ async function resolveAndDeduplicate(sessions: Session[]): Promise { remote, paths: data.paths, sessions: sessionCounts, + codex_originators: codexOriginators, }); } @@ -559,13 +607,18 @@ async function main() { const codexRepos = new Set(repos.filter((r) => r.sessions.codex > 0).map((r) => r.remote)).size; const geminiRepos = new Set(repos.filter((r) => r.sessions.gemini > 0).map((r) => r.remote)).size; + const codexOriginatorTotals = emptyOriginatorCounts(); + for (const s of codexSessions) { + codexOriginatorTotals[s.codexOriginator ?? "other"]++; + } + const result: DiscoveryResult = { window: since, start_date: startDate, repos, tools: { claude_code: { total_sessions: ccSessions.length, repos: ccRepos }, - codex: { total_sessions: codexSessions.length, repos: codexRepos }, + codex: { total_sessions: codexSessions.length, repos: codexRepos, originators: codexOriginatorTotals }, gemini: { total_sessions: geminiSessions.length, repos: geminiRepos }, }, total_sessions: allSessions.length, @@ -578,13 +631,22 @@ async function main() { // Summary format console.log(`Window: ${since} (since ${startDate})`); console.log(`Sessions: ${allSessions.length} total (CC: ${ccSessions.length}, Codex: ${codexSessions.length}, Gemini: ${geminiSessions.length})`); + if (codexSessions.length > 0) { + const o = codexOriginatorTotals; + console.log(` Codex originators: desktop=${o.desktop}, exec=${o.exec}, claude_code=${o.claude_code}, other=${o.other}`); + } console.log(`Repos: ${repos.length} unique`); console.log(""); for (const repo of repos) { const total = repo.sessions.claude_code + repo.sessions.codex + repo.sessions.gemini; const tools = []; if (repo.sessions.claude_code > 0) tools.push(`CC:${repo.sessions.claude_code}`); - if (repo.sessions.codex > 0) tools.push(`Codex:${repo.sessions.codex}`); + if (repo.sessions.codex > 0) { + const o = repo.codex_originators; + // Show the desktop/exec split inline when codex sessions 
are present — + // a single number hid real-dev vs subagent activity in /retro global. + tools.push(`Codex:${repo.sessions.codex} (desktop=${o.desktop}, exec=${o.exec}, cc=${o.claude_code}${o.other > 0 ? `, other=${o.other}` : ""})`); + } if (repo.sessions.gemini > 0) tools.push(`Gemini:${repo.sessions.gemini}`); console.log(` ${repo.name} (${total} sessions) — ${tools.join(", ")}`); console.log(` Remote: ${repo.remote}`); diff --git a/test/global-discover.test.ts b/test/global-discover.test.ts index e541644c2e..cc395baabd 100644 --- a/test/global-discover.test.ts +++ b/test/global-discover.test.ts @@ -290,6 +290,157 @@ describe("gstack-global-discover", () => { }); }); + describe("codex originator bucketing (issue #1315)", () => { + let tmpDir: string; + let codexDir: string; + let repoDir: string; + + beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "gstack-codex-orig-")); + const now = new Date(); + const y = now.getFullYear().toString(); + const m = String(now.getMonth() + 1).padStart(2, "0"); + const d = String(now.getDate()).padStart(2, "0"); + codexDir = join(tmpDir, "codex-home", "sessions", y, m, d); + mkdirSync(codexDir, { recursive: true }); + + repoDir = join(tmpDir, "fake-repo"); + mkdirSync(repoDir); + spawnSync("git", ["init"], { cwd: repoDir, stdio: "pipe" }); + spawnSync("git", ["commit", "--allow-empty", "-m", "init"], { + cwd: repoDir, + stdio: "pipe", + }); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + }); + + function writeCodex(originator: string) { + const line = JSON.stringify({ + timestamp: new Date().toISOString(), + type: "session_meta", + payload: { id: `t-${Math.random()}`, timestamp: new Date().toISOString(), cwd: repoDir, originator }, + }); + const name = `rollout-${new Date().toISOString().replace(/[:.]/g, "-")}-${Math.random().toString(36).slice(2)}.jsonl`; + writeFileSync(join(codexDir, name), line + "\n"); + } + + function discover() { + const r = spawnSync( + "bun", + ["run", 
scriptPath, "--since", "1h", "--format", "json"], + { + encoding: "utf-8", + timeout: 30000, + env: { ...process.env, CODEX_SESSIONS_DIR: join(tmpDir, "codex-home", "sessions") }, + } + ); + expect(r.status).toBe(0); + return JSON.parse(r.stdout); + } + + test("'Codex Desktop' originator → desktop bucket", () => { + writeCodex("Codex Desktop"); + const json = discover(); + expect(json.tools.codex.originators.desktop).toBe(1); + expect(json.tools.codex.originators.exec).toBe(0); + expect(json.tools.codex.originators.claude_code).toBe(0); + }); + + test("'codex_exec' originator → exec bucket", () => { + writeCodex("codex_exec"); + const json = discover(); + expect(json.tools.codex.originators.exec).toBe(1); + expect(json.tools.codex.originators.desktop).toBe(0); + }); + + test("'Claude Code' originator → claude_code bucket", () => { + writeCodex("Claude Code"); + const json = discover(); + expect(json.tools.codex.originators.claude_code).toBe(1); + expect(json.tools.codex.originators.desktop).toBe(0); + expect(json.tools.codex.originators.exec).toBe(0); + }); + + test("unknown originator → other bucket (not silently dropped)", () => { + writeCodex("future-agent-name-not-yet-mapped"); + const json = discover(); + expect(json.tools.codex.originators.other).toBe(1); + expect(json.tools.codex.total_sessions).toBe(1); + }); + + test("per-repo codex_originators sums to per-repo codex count", () => { + writeCodex("Codex Desktop"); + writeCodex("codex_exec"); + writeCodex("codex_exec"); + writeCodex("Claude Code"); + const json = discover(); + // The fake repo's normalized remote will be local: form; just find it. 
+ const repo = json.repos.find((r: any) => r.paths.includes(repoDir)); + expect(repo).toBeDefined(); + const o = repo.codex_originators; + expect(o.desktop + o.exec + o.claude_code + o.other).toBe(repo.sessions.codex); + expect(o.desktop).toBe(1); + expect(o.exec).toBe(2); + expect(o.claude_code).toBe(1); + }); + }); + + describe("CC jsonl with >8KB first line (issue #1315 Problem 2)", () => { + let tmpDir: string; + let ccProjectsDir: string; + let realRepoDir: string; + + beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "gstack-cc-bigline-")); + // Real repo on disk so resolveClaudeCodeCwd can verify it. + realRepoDir = join(tmpDir, "real-repo"); + mkdirSync(realRepoDir); + spawnSync("git", ["init"], { cwd: realRepoDir, stdio: "pipe" }); + spawnSync("git", ["commit", "--allow-empty", "-m", "init"], { cwd: realRepoDir, stdio: "pipe" }); + // CC project dir is a CCR / cron-style decoded path that does NOT exist + // on disk, so resolveClaudeCodeCwd falls to extractCwdFromJsonl. + const fakeProjectName = "-tmp-does-not-exist-on-disk-blogger-lab"; + ccProjectsDir = join(tmpDir, "claude-home", "projects", fakeProjectName); + mkdirSync(ccProjectsDir, { recursive: true }); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + }); + + test("first-line queue-operation >8KB no longer hides cwd on later line", () => { + // Recreate the Akagilnc scenario: first line is a huge queue-operation + // event with no `cwd`, second line carries the real cwd. 
+ const bigLine = JSON.stringify({ + type: "queue-operation", + payload: { junk: "x".repeat(40000) }, + }); + expect(bigLine.length).toBeGreaterThan(30000); + const cwdLine = JSON.stringify({ type: "summary", cwd: realRepoDir }); + const jsonl = bigLine + "\n" + cwdLine + "\n"; + writeFileSync(join(ccProjectsDir, "session-1.jsonl"), jsonl); + + const r = spawnSync( + "bun", + ["run", scriptPath, "--since", "1h", "--format", "json"], + { + encoding: "utf-8", + timeout: 30000, + env: { ...process.env, HOME: join(tmpDir, "claude-home"), CLAUDE_PROJECTS_DIR: join(tmpDir, "claude-home", "projects") }, + } + ); + expect(r.status).toBe(0); + const json = JSON.parse(r.stdout); + // The fake repo should now be discovered as a CC session. + const found = json.repos.find((repo: any) => repo.paths.includes(realRepoDir)); + expect(found).toBeDefined(); + expect(found.sessions.claude_code).toBeGreaterThanOrEqual(1); + }); + }); + describe("discovery output structure", () => { test("repos have required fields", () => { const result = spawnSync( @@ -329,6 +480,26 @@ describe("gstack-global-discover", () => { expect(json.total_sessions).toBe(toolTotal); }); + test("repos expose codex_originators breakdown", () => { + const result = spawnSync( + "bun", + ["run", scriptPath, "--since", "30d", "--format", "json"], + { encoding: "utf-8", timeout: 30000 } + ); + const json = JSON.parse(result.stdout); + expect(json.tools.codex).toHaveProperty("originators"); + const o = json.tools.codex.originators; + for (const k of ["desktop", "exec", "claude_code", "other"]) { + expect(o).toHaveProperty(k); + expect(typeof o[k]).toBe("number"); + } + // Sum of originators must equal codex total_sessions. + expect(o.desktop + o.exec + o.claude_code + o.other).toBe(json.tools.codex.total_sessions); + for (const repo of json.repos) { + expect(repo).toHaveProperty("codex_originators"); + } + }); + test("deduplicates Conductor workspaces by remote", () => { const result = spawnSync( "bun",