Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,7 @@ export async function runEvaluation(
testId: '__before_all__',
evalRunId,
evalDir,
workspaceFileDir: suiteWorkspace?.workspaceFileDir,
};
try {
beforeAllOutput = await executeWorkspaceScript(
Expand All @@ -988,6 +989,7 @@ export async function runEvaluation(
testId: '__before_all__',
evalRunId,
evalDir,
workspaceFileDir: suiteWorkspace?.workspaceFileDir,
};
try {
const output = await executeWorkspaceScript(
Expand Down Expand Up @@ -1408,6 +1410,7 @@ export async function runEvaluation(
testId: '__after_all__',
evalRunId,
evalDir,
workspaceFileDir: suiteWorkspace?.workspaceFileDir,
};
try {
const afterAllOutput = await executeWorkspaceScript(
Expand Down Expand Up @@ -1859,6 +1862,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
beforeAllOutput = await executeWorkspaceScript(
Expand Down Expand Up @@ -1938,6 +1942,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
beforeEachOutput = await executeWorkspaceScript(
Expand Down Expand Up @@ -2195,6 +2200,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
caseInput: evalCase.question,
caseMetadata: evalCase.metadata,
evalDir,
workspaceFileDir: evalCase.workspace?.workspaceFileDir,
};
try {
afterEachOutput = await executeWorkspaceScript(
Expand Down
4 changes: 4 additions & 0 deletions packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,10 @@ export type WorkspaceConfig = {
readonly path?: string;
/** Docker-based workspace: run grader commands inside a container */
readonly docker?: DockerWorkspaceConfig;
/** Directory containing the workspace file when workspace is a file reference.
* Used as default cwd for hook commands so that file-referenced templates resolve
* relative paths from their own directory, not the eval file's directory. */
readonly workspaceFileDir?: string;
};

export type CodeEvaluatorConfig = {
Expand Down
8 changes: 6 additions & 2 deletions packages/core/src/evaluation/workspace/script-executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@ export interface ScriptExecutionContext {
readonly evalRunId: string;
readonly caseInput?: string;
readonly caseMetadata?: Record<string, unknown>;
/** Directory containing the eval YAML file. Used as default cwd. */
/** Directory containing the eval YAML file. Used as fallback cwd. */
readonly evalDir?: string;
/** Directory containing the workspace file (when workspace is a file reference).
* Takes priority over evalDir as default cwd so that file-referenced templates
* resolve relative paths from their own directory. */
readonly workspaceFileDir?: string;
}

export type ScriptFailureMode = 'fatal' | 'warn';
Expand Down Expand Up @@ -57,7 +61,7 @@ export async function executeWorkspaceScript(
});

const timeoutMs = config.timeout_ms ?? (failureMode === 'fatal' ? 60000 : 30000);
const cwd = config.cwd ?? context.evalDir;
const cwd = config.cwd ?? context.workspaceFileDir ?? context.evalDir;

// Support both command (canonical) and script (deprecated alias)
if (config.script !== undefined && config.command === undefined) {
Expand Down
3 changes: 2 additions & 1 deletion packages/core/src/evaluation/yaml-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ async function resolveWorkspaceConfig(
const workspaceFileDir = path.dirname(workspaceFilePath);
const resolvedWorkspace = parseWorkspaceConfig(parsed, workspaceFileDir);
if (resolvedWorkspace) {
return resolvedWorkspace;
return { ...resolvedWorkspace, workspaceFileDir };
}

const parsedObject = parsed as Record<string, unknown>;
Expand Down Expand Up @@ -882,6 +882,7 @@ function mergeWorkspaceConfigs(
mode: caseLevel.mode ?? suiteLevel.mode,
path: caseLevel.path ?? suiteLevel.path,
docker: caseLevel.docker ?? suiteLevel.docker,
workspaceFileDir: caseLevel.workspaceFileDir ?? suiteLevel.workspaceFileDir,
};
}

Expand Down
56 changes: 56 additions & 0 deletions packages/core/test/evaluation/workspace-config-parsing.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,62 @@ tests:
expect(cases[0].workspace?.template).toBe(path.join(wsDir, 'my-template'));
// cwd resolved relative to workspace file dir
expect(cases[0].workspace?.hooks?.before_all?.cwd).toBe(path.join(wsDir, 'scripts'));
// workspaceFileDir is set to the workspace file's directory
expect(cases[0].workspace?.workspaceFileDir).toBe(wsDir);
});

// Checks that an eval file whose `workspace` entry is a path to a separate YAML file
// produces a workspace config whose workspaceFileDir is the directory of that file.
// NOTE(review): the YAML nesting inside the template literals below appears flattened
// in this view — confirm the indentation against the real test file before relying on it.
it('should set workspaceFileDir when workspace is a file reference', async () => {
// Directory that holds the referenced workspace file; also the expected workspaceFileDir.
const wsDir = path.join(testDir, 'wsfiledir-test');
await mkdir(wsDir, { recursive: true });

// Stand-alone workspace file that only declares a before_all hook (no explicit cwd).
const workspaceFile = path.join(wsDir, 'workspace.yaml');
await writeFile(
workspaceFile,
`
hooks:
before_all:
command: ["echo", "hello"]
`,
);

// Eval file lives in testDir and points at the workspace file via a relative path.
const evalFile = path.join(testDir, 'wsfiledir-eval.yaml');
await writeFile(
evalFile,
`
workspace: ./wsfiledir-test/workspace.yaml

tests:
- id: wsfiledir-test-1
input: "Do something"
criteria: "Should work"
`,
);

const cases = await loadTests(evalFile, testDir);
expect(cases).toHaveLength(1);
// The loaded case must carry the workspace file's directory, not the eval file's directory.
expect(cases[0].workspace?.workspaceFileDir).toBe(wsDir);
});

// Checks the complementary case: when the workspace is declared inline in the eval file
// (not as a reference to another file), the loaded config has no workspaceFileDir.
// NOTE(review): the YAML nesting inside the template literal below appears flattened
// in this view — confirm the indentation against the real test file before relying on it.
it('should not set workspaceFileDir for inline workspace config', async () => {
const evalFile = path.join(testDir, 'inline-workspace.yaml');
await writeFile(
evalFile,
`
workspace:
hooks:
before_all:
command: ["echo", "hello"]

tests:
- id: inline-test-1
input: "Do something"
criteria: "Should work"
`,
);

const cases = await loadTests(evalFile, testDir);
expect(cases).toHaveLength(1);
// No workspace file was involved, so the field is expected to be absent.
expect(cases[0].workspace?.workspaceFileDir).toBeUndefined();
});

it('should throw a clear error when workspace file is not found', async () => {
Expand Down
65 changes: 65 additions & 0 deletions packages/core/test/evaluation/workspace/script-executor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -344,4 +344,69 @@ process.stdout.write(JSON.stringify(args));
await rm(explicitDir, { recursive: true, force: true });
}
});

it('defaults cwd to workspaceFileDir over evalDir when workspace is a file reference', async () => {
  // Two distinct scratch directories so the assertion can tell which one became the cwd.
  const fallbackDir = path.join(tmpdir(), `agentv-evaldir-${randomUUID()}`);
  const wsFileDir = path.join(tmpdir(), `agentv-wsfiledir-${randomUUID()}`);
  const scratchDirs = [fallbackDir, wsFileDir];
  for (const dir of scratchDirs) {
    await mkdir(dir, { recursive: true });
  }

  try {
    // Helper script that writes its own working directory to stdout.
    const printCwdScript = path.join(testDir, 'print-cwd3.js');
    await writeFile(printCwdScript, 'process.stdout.write(process.cwd());');

    // The script config deliberately omits cwd, so the executor has to choose a default.
    const scriptConfig: WorkspaceScriptConfig = { command: ['node', printCwdScript] };

    // Both candidate directories are supplied; workspaceFileDir is expected to win.
    const scriptContext: ScriptExecutionContext = {
      workspacePath: '/tmp/workspace',
      testId: 'wsfiledir-default-test',
      evalRunId: 'run-cwd-3',
      evalDir: fallbackDir,
      workspaceFileDir: wsFileDir,
    };

    const reportedCwd = await executeWorkspaceScript(scriptConfig, scriptContext, 'fatal');
    expect(reportedCwd).toBe(wsFileDir);
  } finally {
    // Remove the scratch directories in creation order, regardless of the test outcome.
    for (const dir of scratchDirs) {
      await rm(dir, { recursive: true, force: true });
    }
  }
});

it('uses explicit cwd over workspaceFileDir', async () => {
  // Three distinct scratch directories: one per possible source of the working directory.
  const fallbackDir = path.join(tmpdir(), `agentv-evaldir-${randomUUID()}`);
  const wsFileDir = path.join(tmpdir(), `agentv-wsfiledir-${randomUUID()}`);
  const configuredDir = path.join(tmpdir(), `agentv-explicit-${randomUUID()}`);
  const scratchDirs = [fallbackDir, wsFileDir, configuredDir];
  for (const dir of scratchDirs) {
    await mkdir(dir, { recursive: true });
  }

  try {
    // Helper script that writes its own working directory to stdout.
    const printCwdScript = path.join(testDir, 'print-cwd4.js');
    await writeFile(printCwdScript, 'process.stdout.write(process.cwd());');

    // An explicit cwd on the script config should beat both context-provided directories.
    const scriptConfig: WorkspaceScriptConfig = {
      command: ['node', printCwdScript],
      cwd: configuredDir,
    };

    const scriptContext: ScriptExecutionContext = {
      workspacePath: '/tmp/workspace',
      testId: 'explicit-over-wsfiledir-test',
      evalRunId: 'run-cwd-4',
      evalDir: fallbackDir,
      workspaceFileDir: wsFileDir,
    };

    const reportedCwd = await executeWorkspaceScript(scriptConfig, scriptContext, 'fatal');
    expect(reportedCwd).toBe(configuredDir);
  } finally {
    // Remove the scratch directories in creation order, regardless of the test outcome.
    for (const dir of scratchDirs) {
      await rm(dir, { recursive: true, force: true });
    }
  }
});
});
Loading