diff --git a/benchmark-models/SKILL.md b/benchmark-models/SKILL.md
index 5e5e6bd667..7318a6bcc3 100644
--- a/benchmark-models/SKILL.md
+++ b/benchmark-models/SKILL.md
@@ -4,17 +4,19 @@ preamble-tier: 1
 version: 1.0.0
 description: |
   Cross-model benchmark for gstack skills. Runs the same prompt through Claude,
-  GPT (via Codex CLI), and Gemini side-by-side — compares latency, tokens, cost,
-  and optionally quality via LLM judge. Answers "which model is actually best
-  for this skill?" with data instead of vibes. Separate from /benchmark, which
-  measures web page performance. Use when: "benchmark models", "compare models",
-  "which model is best for X", "cross-model comparison", "model shootout". (gstack)
+  GPT (via Codex CLI), Gemini, and Ollama (local) side-by-side — compares
+  latency, tokens, cost, and optionally quality via LLM judge. Answers "which
+  model is actually best for this skill?" with data instead of vibes. Separate
+  from /benchmark, which measures web page performance. Use when: "benchmark
+  models", "compare models", "which model is best for X", "cross-model
+  comparison", "model shootout". (gstack)
   Voice triggers (speech-to-text aliases): "compare models", "model shootout", "which model is best".
 triggers:
   - cross model benchmark
-  - compare claude gpt gemini
+  - compare claude gpt gemini ollama
   - benchmark skill across models
   - which model should I use
+  - local vs cloud model comparison
 allowed-tools:
   - Bash
   - Read
@@ -522,12 +524,12 @@ If C: ask for the path. Verify it exists. Use as positional argument.
 ## Step 2: Choose providers
 
 ```bash
-"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini --dry-run
+"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini,ollama --dry-run
 ```
 
 Show the dry-run output. The "Adapter availability" section tells the user which providers will actually run (OK) vs skip (NOT READY — remediation hint included).
 
-If ALL three show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, or `gemini login` / `export GOOGLE_API_KEY`.
+If ALL four show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, `gemini login` / `export GOOGLE_API_KEY`, or for local: install Ollama from https://ollama.com then `ollama pull qwen2.5-coder:7b && ollama serve`.
 
 If at least one is OK: AskUserQuestion:
 - **Simplify:** "Which models should we include? The dry-run above showed which are authed. Unauthed ones will be skipped cleanly — they won't abort the batch."
diff --git a/benchmark-models/SKILL.md.tmpl b/benchmark-models/SKILL.md.tmpl
index 034cda1824..8d41067565 100644
--- a/benchmark-models/SKILL.md.tmpl
+++ b/benchmark-models/SKILL.md.tmpl
@@ -4,20 +4,22 @@ preamble-tier: 1
 version: 1.0.0
 description: |
   Cross-model benchmark for gstack skills. Runs the same prompt through Claude,
-  GPT (via Codex CLI), and Gemini side-by-side — compares latency, tokens, cost,
-  and optionally quality via LLM judge. Answers "which model is actually best
-  for this skill?" with data instead of vibes. Separate from /benchmark, which
-  measures web page performance. Use when: "benchmark models", "compare models",
-  "which model is best for X", "cross-model comparison", "model shootout". (gstack)
+  GPT (via Codex CLI), Gemini, and Ollama (local) side-by-side — compares
+  latency, tokens, cost, and optionally quality via LLM judge. Answers "which
+  model is actually best for this skill?" with data instead of vibes. Separate
+  from /benchmark, which measures web page performance. Use when: "benchmark
+  models", "compare models", "which model is best for X", "cross-model
+  comparison", "model shootout". (gstack)
 voice-triggers:
   - "compare models"
   - "model shootout"
   - "which model is best"
 triggers:
   - cross model benchmark
-  - compare claude gpt gemini
+  - compare claude gpt gemini ollama
   - benchmark skill across models
   - which model should I use
+  - local vs cloud model comparison
 allowed-tools:
   - Bash
   - Read
@@ -69,12 +71,12 @@ If C: ask for the path. Verify it exists. Use as positional argument.
 ## Step 2: Choose providers
 
 ```bash
-"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini --dry-run
+"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini,ollama --dry-run
 ```
 
 Show the dry-run output. The "Adapter availability" section tells the user which providers will actually run (OK) vs skip (NOT READY — remediation hint included).
 
-If ALL three show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, or `gemini login` / `export GOOGLE_API_KEY`.
+If ALL four show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, `gemini login` / `export GOOGLE_API_KEY`, or for local: install Ollama from https://ollama.com then `ollama pull qwen2.5-coder:7b && ollama serve`.
 
 If at least one is OK: AskUserQuestion:
 - **Simplify:** "Which models should we include? The dry-run above showed which are authed. Unauthed ones will be skipped cleanly — they won't abort the batch."
diff --git a/bin/gstack-model-benchmark b/bin/gstack-model-benchmark
index 7c48c910b0..9346738177 100755
--- a/bin/gstack-model-benchmark
+++ b/bin/gstack-model-benchmark
@@ -7,7 +7,9 @@
  * gstack-model-benchmark [options]
  *
  * Options:
- *   --models claude,gpt,gemini    Comma-separated provider list (default: claude)
+ *   --models claude,gpt,gemini,ollama
+ *                                 Comma-separated provider list (default: claude).
+ *                                 Valid: claude, gpt, gemini, ollama (local).
* --prompt "" Inline prompt instead of a file * --workdir Working dir passed to each CLI (default: cwd) * --timeout-ms Per-provider timeout (default: 300000) @@ -27,16 +29,21 @@ import * as fs from 'fs'; import * as path from 'path'; import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner'; +import type { Family } from '../test/helpers/providers/types'; import { ClaudeAdapter } from '../test/helpers/providers/claude'; import { GptAdapter } from '../test/helpers/providers/gpt'; import { GeminiAdapter } from '../test/helpers/providers/gemini'; +import { OllamaAdapter } from '../test/helpers/providers/ollama'; const ADAPTER_FACTORIES = { claude: () => new ClaudeAdapter(), gpt: () => new GptAdapter(), gemini: () => new GeminiAdapter(), + ollama: () => new OllamaAdapter(), }; +const VALID_PROVIDERS: Family[] = ['claude', 'gpt', 'gemini', 'ollama']; + type OutputFormat = 'table' | 'json' | 'markdown'; function arg(name: string, def?: string): string | undefined { @@ -51,13 +58,13 @@ function flag(name: string): boolean { return process.argv.includes(name); } -function parseProviders(s: string | undefined): Array<'claude' | 'gpt' | 'gemini'> { +function parseProviders(s: string | undefined): Family[] { if (!s) return ['claude']; - const seen = new Set<'claude' | 'gpt' | 'gemini'>(); + const seen = new Set(); for (const p of s.split(',').map(x => x.trim()).filter(Boolean)) { - if (p === 'claude' || p === 'gpt' || p === 'gemini') seen.add(p); + if ((VALID_PROVIDERS as string[]).includes(p)) seen.add(p as Family); else { - console.error(`WARN: unknown provider '${p}' — skipping. Valid: claude, gpt, gemini.`); + console.error(`WARN: unknown provider '${p}' — skipping. Valid: ${VALID_PROVIDERS.join(', ')}.`); } } return seen.size ? 
Array.from(seen) : ['claude']; @@ -124,7 +131,7 @@ async function main(): Promise { async function dryRunReport(opts: { prompt: string; - providers: Array<'claude' | 'gpt' | 'gemini'>; + providers: Family[]; workdir: string; timeoutMs: number; output: OutputFormat; diff --git a/test/benchmark-cli.test.ts b/test/benchmark-cli.test.ts index 2932ec0c4c..fb4a26a248 100644 --- a/test/benchmark-cli.test.ts +++ b/test/benchmark-cli.test.ts @@ -147,6 +147,23 @@ describe('gstack-model-benchmark --dry-run', () => { // Summary truncates to 80 chars + ellipsis expect(r.stdout).toMatch(/prompt:\s+x{80}…/); }); + + test('ollama is accepted in --models whitelist', () => { + const r = run(['--prompt', 'hi', '--models', 'claude,ollama', '--dry-run']); + expect(r.status).toBe(0); + expect(r.stdout).toContain('providers: claude, ollama'); + // Ollama adapter must appear in availability section with OK or NOT READY + expect(r.stdout).toMatch(/ollama:\s+(OK|NOT READY)/); + // Stderr must NOT contain the unknown-provider warning for ollama + expect(r.stderr).not.toContain("unknown provider 'ollama'"); + }); + + test('valid provider list in WARN message includes ollama', () => { + const r = run(['--prompt', 'hi', '--models', 'fake-provider', '--dry-run']); + expect(r.status).toBe(0); + expect(r.stderr).toContain('ollama'); + expect(r.stderr).toContain('Valid:'); + }); }); describe('gstack-model-benchmark prompt resolution', () => { diff --git a/test/benchmark-runner.test.ts b/test/benchmark-runner.test.ts index ecd503ea8f..6508011765 100644 --- a/test/benchmark-runner.test.ts +++ b/test/benchmark-runner.test.ts @@ -42,6 +42,12 @@ test('PRICING table covers the key model families', () => { expect(PRICING['claude-sonnet-4-6']).toBeDefined(); expect(PRICING['gpt-5.4']).toBeDefined(); expect(PRICING['gemini-2.5-pro']).toBeDefined(); + expect(PRICING['qwen2.5-coder:7b']).toBeDefined(); +}); + +test('Ollama models are priced at $0 (local inference)', () => { + expect(estimateCostUsd({ input: 1_000_000, output: 1_000_000 }, 'qwen2.5-coder:7b')).toBe(0); + expect(estimateCostUsd({ input: 1_000_000, output: 1_000_000 }, 'llama3.2:3b')).toBe(0); }); test('missingTools reports unsupported tools per provider', () => { @@ -51,12 +57,15 @@ test('missingTools reports unsupported tools per provider', () => { expect(missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read'])).toEqual([]); // Gemini has very limited agentic surface expect(missingTools('gemini', ['Bash', 'Edit'])).toEqual(['Bash', 'Edit']); + // Ollama /api/generate has zero agentic surface — every tool is unsupported + expect(missingTools('ollama', ['Read', 'Bash', 'Edit'])).toEqual(['Read', 'Bash', 'Edit']); }); -test('TOOL_COMPATIBILITY is populated for all three families', () => { +test('TOOL_COMPATIBILITY is populated for all four families', () => { expect(TOOL_COMPATIBILITY.claude).toBeDefined(); expect(TOOL_COMPATIBILITY.gpt).toBeDefined(); expect(TOOL_COMPATIBILITY.gemini).toBeDefined(); + expect(TOOL_COMPATIBILITY.ollama).toBeDefined(); }); test('formatTable handles a report with mixed success/error/unavailable entries', () => { diff --git a/test/helpers/benchmark-runner.ts b/test/helpers/benchmark-runner.ts index cbef4107b2..e02fdfd130 100644 --- a/test/helpers/benchmark-runner.ts +++ b/test/helpers/benchmark-runner.ts @@ -7,26 +7,27 @@ * one. Per-provider auth/timeout/rate-limit errors don't abort the batch. 
  */
 
-import type { ProviderAdapter, RunOpts, RunResult } from './providers/types';
+import type { ProviderAdapter, RunOpts, RunResult, Family } from './providers/types';
 import { ClaudeAdapter } from './providers/claude';
 import { GptAdapter } from './providers/gpt';
 import { GeminiAdapter } from './providers/gemini';
+import { OllamaAdapter } from './providers/ollama';
 
 export interface BenchmarkInput {
   prompt: string;
   workdir: string;
   timeoutMs?: number;
-  /** Adapter names to run (e.g., ['claude', 'gpt', 'gemini']). */
-  providers: Array<'claude' | 'gpt' | 'gemini'>;
+  /** Adapter names to run (e.g., ['claude', 'gpt', 'gemini', 'ollama']). */
+  providers: Family[];
   /** Optional per-provider model overrides. */
-  models?: Partial<Record<'claude' | 'gpt' | 'gemini', string>>;
+  models?: Partial<Record<Family, string>>;
   /** If true, skip providers whose available() returns !ok. If false, include them with error. */
   skipUnavailable?: boolean;
 }
 
 export interface BenchmarkEntry {
   provider: string;
-  family: 'claude' | 'gpt' | 'gemini';
+  family: Family;
   available: boolean;
   unavailable_reason?: string;
   result?: RunResult;
@@ -44,10 +45,11 @@ export interface BenchmarkReport {
   entries: BenchmarkEntry[];
 }
 
-const ADAPTERS: Record<'claude' | 'gpt' | 'gemini', () => ProviderAdapter> = {
+const ADAPTERS: Record<Family, () => ProviderAdapter> = {
   claude: () => new ClaudeAdapter(),
   gpt: () => new GptAdapter(),
   gemini: () => new GeminiAdapter(),
+  ollama: () => new OllamaAdapter(),
 };
 
 export async function runBenchmark(input: BenchmarkInput): Promise<BenchmarkReport> {
diff --git a/test/helpers/pricing.ts b/test/helpers/pricing.ts
index 71e456f434..acfe9ecd87 100644
--- a/test/helpers/pricing.ts
+++ b/test/helpers/pricing.ts
@@ -32,6 +32,13 @@ export const PRICING: Record<string, { input_per_mtok: number; output_per_mtok: number; as_of: string }> = {
   // Google
   'gemini-2.5-pro': { input_per_mtok: 1.25, output_per_mtok: 5.00, as_of: '2026-04' },
   'gemini-2.5-flash': { input_per_mtok: 0.30, output_per_mtok: 1.20, as_of: '2026-04' },
+
+  // Ollama (local inference — no API cost). Entries exist so estimateCostUsd
+  // returns 0 without emitting a "no pricing" WARN. If you ever run Ollama
+  // against a paid GPU host, override the rates per-model here.
+  'qwen2.5-coder:7b': { input_per_mtok: 0, output_per_mtok: 0, as_of: '2026-04' },
+  'llama3.2:3b': { input_per_mtok: 0, output_per_mtok: 0, as_of: '2026-04' },
+  'nomic-embed-text': { input_per_mtok: 0, output_per_mtok: 0, as_of: '2026-04' },
 };
 
 const WARNED = new Set<string>();
diff --git a/test/helpers/providers/ollama.ts b/test/helpers/providers/ollama.ts
new file mode 100644
index 0000000000..e5efb8edd4
--- /dev/null
+++ b/test/helpers/providers/ollama.ts
@@ -0,0 +1,130 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+
+/**
+ * Ollama adapter — wraps a local Ollama daemon's HTTP API.
+ *
+ * Unlike Claude/GPT/Gemini (which shell out to a CLI binary), Ollama exposes a
+ * native HTTP server on `http://localhost:11434` by default. The adapter talks
+ * directly via `fetch()`. No CLI dependency, no auth — the daemon is local.
+ *
+ * Default model: `qwen2.5-coder:7b` (general-purpose code-leaning model).
+ * Override per-run via `RunOpts.model` or globally via `GSTACK_OLLAMA_MODEL`.
+ * Override daemon URL via `GSTACK_OLLAMA_URL` (e.g. for a remote / non-default port).
+ *
+ * Tool-call counting is 0 — the `/api/generate` endpoint emits no tool events.
+ * If a future benchmark needs tool calls, switch to `/api/chat` with `tools[]`.
+ * Cost is always 0 — Ollama runs locally on the user's machine.
+ */
+export class OllamaAdapter implements ProviderAdapter {
+  readonly name = 'ollama';
+  readonly family = 'ollama' as const;
+
+  private get baseUrl(): string {
+    return (process.env.GSTACK_OLLAMA_URL ?? 'http://localhost:11434').replace(/\/+$/, '');
+  }
+
+  private get defaultModel(): string {
+    return process.env.GSTACK_OLLAMA_MODEL ?? 'qwen2.5-coder:7b';
+  }
+
+  async available(): Promise<AvailabilityCheck> {
+    // Probe the tags endpoint with a tight timeout. A live daemon responds in
+    // ~5-50ms; a missing one fails immediately with ECONNREFUSED.
+    const ctrl = new AbortController();
+    const tid = setTimeout(() => ctrl.abort(), 2000);
+    try {
+      const res = await fetch(`${this.baseUrl}/api/tags`, { signal: ctrl.signal });
+      if (!res.ok) {
+        return { ok: false, reason: `Ollama daemon at ${this.baseUrl} returned HTTP ${res.status}. Is it healthy? Try \`ollama serve\`.` };
+      }
+      const body = await res.json() as { models?: Array<{ name: string }> };
+      if (!body.models || body.models.length === 0) {
+        return { ok: false, reason: `Ollama daemon at ${this.baseUrl} has no models pulled. Run \`ollama pull ${this.defaultModel}\`.` };
+      }
+      return { ok: true };
+    } catch (err) {
+      const msg = (err as Error).message ?? String(err);
+      if (/abort/i.test(msg)) {
+        return { ok: false, reason: `Ollama daemon at ${this.baseUrl} did not respond within 2s. Start it with \`ollama serve\` or set GSTACK_OLLAMA_URL.` };
+      }
+      return { ok: false, reason: `Ollama daemon not reachable at ${this.baseUrl} (${msg.slice(0, 200)}). Install from https://ollama.com or set GSTACK_OLLAMA_URL.` };
+    } finally {
+      clearTimeout(tid);
+    }
+  }
+
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    const model = opts.model ?? this.defaultModel;
+    const ctrl = new AbortController();
+    const tid = setTimeout(() => ctrl.abort(), opts.timeoutMs);
+
+    try {
+      const res = await fetch(`${this.baseUrl}/api/generate`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          model,
+          prompt: opts.prompt,
+          stream: false,
+        }),
+        signal: ctrl.signal,
+      });
+      if (!res.ok) {
+        const text = await res.text().catch(() => '');
+        const durationMs = Date.now() - start;
+        if (res.status === 404) {
+          return this.emptyResult(durationMs, model, { code: 'unknown', reason: `model '${model}' not found. Pull it with \`ollama pull ${model}\`. ${text.slice(0, 200)}` });
+        }
+        return this.emptyResult(durationMs, model, { code: 'unknown', reason: `HTTP ${res.status}: ${text.slice(0, 400)}` });
+      }
+      const body = await res.json() as {
+        response?: string;
+        model?: string;
+        prompt_eval_count?: number;
+        eval_count?: number;
+        done?: boolean;
+      };
+      return {
+        output: body.response ?? '',
+        tokens: {
+          input: body.prompt_eval_count ?? 0,
+          output: body.eval_count ?? 0,
+        },
+        durationMs: Date.now() - start,
+        toolCalls: 0,
+        modelUsed: body.model ?? model,
+      };
+    } catch (err) {
+      const durationMs = Date.now() - start;
+      const msg = (err as Error).message ?? String(err);
+      if (/abort/i.test(msg)) {
+        return this.emptyResult(durationMs, model, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` });
+      }
+      if (/ECONNREFUSED|fetch failed|getaddrinfo/i.test(msg)) {
+        return this.emptyResult(durationMs, model, { code: 'binary_missing', reason: `Ollama daemon not reachable at ${this.baseUrl}. Start it with \`ollama serve\`.` });
+      }
+      return this.emptyResult(durationMs, model, { code: 'unknown', reason: msg.slice(0, 400) });
+    } finally {
+      clearTimeout(tid);
+    }
+  }
+
+  estimateCost(_tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    // Local inference — no API cost. Pass through to pricing table anyway so
+    // future cloud-hosted Ollama runners (e.g. via paid GPU) can override.
+    return estimateCostUsd(_tokens, model ?? this.defaultModel);
+  }
+
+  private emptyResult(durationMs: number, model: string, error: RunResult['error']): RunResult {
+    return {
+      output: '',
+      tokens: { input: 0, output: 0 },
+      durationMs,
+      toolCalls: 0,
+      modelUsed: model,
+      error,
+    };
+  }
+}
diff --git a/test/helpers/providers/types.ts b/test/helpers/providers/types.ts
index 1680d0ceb1..2217643954 100644
--- a/test/helpers/providers/types.ts
+++ b/test/helpers/providers/types.ts
@@ -55,7 +55,7 @@ export interface AvailabilityCheck {
   reason?: string;
 }
 
-export type Family = 'claude' | 'gpt' | 'gemini';
+export type Family = 'claude' | 'gpt' | 'gemini' | 'ollama';
 
 export interface ProviderAdapter {
   /** Stable name used in output tables and config (e.g., 'claude', 'gpt', 'gemini'). */
diff --git a/test/helpers/tool-map.ts b/test/helpers/tool-map.ts
index 9fcf8e7f9b..a8bec08f8a 100644
--- a/test/helpers/tool-map.ts
+++ b/test/helpers/tool-map.ts
@@ -26,7 +26,7 @@ export type ToolName =
   | 'WebSearch'
   | 'WebFetch';
 
-export const TOOL_COMPATIBILITY: Record<'claude' | 'gpt' | 'gemini', Record<ToolName, boolean>> = {
+export const TOOL_COMPATIBILITY: Record<'claude' | 'gpt' | 'gemini' | 'ollama', Record<ToolName, boolean>> = {
   claude: {
     Read: true,
     Write: true,
@@ -67,6 +67,22 @@ export const TOOL_COMPATIBILITY: Record<'claude' | 'gpt' | 'gemini', Record<ToolName, boolean>> = {
     WebSearch: false,
     WebFetch: false,
   },
+  ollama: {
+    Read: false,
+    Write: false,
+    Edit: false,
+    Glob: false,
+    Grep: false,
+    Bash: false,
+    WebSearch: false,
+    WebFetch: false,
+  },
 };
diff --git a/test/helpers/providers/ollama.test.ts b/test/helpers/providers/ollama.test.ts
new file mode 100644
--- /dev/null
+++ b/test/helpers/providers/ollama.test.ts
+import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
+import { OllamaAdapter } from './ollama';
+
+const REAL_FETCH = globalThis.fetch;
+const REAL_OLLAMA_URL = process.env.GSTACK_OLLAMA_URL;
+const REAL_OLLAMA_MODEL = process.env.GSTACK_OLLAMA_MODEL;
+
+type FetchStub = (input: RequestInfo | URL, init?: RequestInit) => Promise<Response>;
+
+function stubFetch(impl: FetchStub): void {
+  (globalThis as { fetch: FetchStub }).fetch = impl;
+}
+
+function restoreFetch(): void {
+  (globalThis as { fetch: typeof REAL_FETCH }).fetch = REAL_FETCH;
+}
+
+describe('OllamaAdapter.available', () => {
+  beforeEach(() => {
+    delete process.env.GSTACK_OLLAMA_URL;
+    delete process.env.GSTACK_OLLAMA_MODEL;
+  });
+  afterEach(() => {
+    restoreFetch();
+    if (REAL_OLLAMA_URL !== undefined) process.env.GSTACK_OLLAMA_URL = REAL_OLLAMA_URL;
+    if (REAL_OLLAMA_MODEL !== undefined) process.env.GSTACK_OLLAMA_MODEL = REAL_OLLAMA_MODEL;
+  });
+
+  test('returns ok when daemon responds with at least one model', async () => {
+    stubFetch(async () => new Response(JSON.stringify({ models: [{ name: 'qwen2.5-coder:7b' }] }), { status: 200 }));
+    const adapter = new OllamaAdapter();
+    const check = await adapter.available();
+    expect(check.ok).toBe(true);
+  });
+
+  test('returns not-ok with remediation hint when daemon has no models', async () => {
+    stubFetch(async () => new Response(JSON.stringify({ models: [] }), { status: 200 }));
+    const adapter = new OllamaAdapter();
+    const check = await adapter.available();
+    expect(check.ok).toBe(false);
+    expect(check.reason).toMatch(/ollama pull/);
+  });
+
+  test('returns not-ok when daemon is unreachable', async () => {
+    stubFetch(async () => { throw new Error('fetch failed: ECONNREFUSED'); });
+    const adapter = new OllamaAdapter();
+    const check = await adapter.available();
+    expect(check.ok).toBe(false);
+    expect(check.reason).toMatch(/not reachable|Install|ollama serve/);
+  });
+
+  test('returns not-ok when daemon returns non-2xx', async () => {
+    stubFetch(async () => new Response('Internal error', { status: 500 }));
+    const adapter = new OllamaAdapter();
+    const check = await adapter.available();
+    expect(check.ok).toBe(false);
+    expect(check.reason).toMatch(/HTTP 500/);
+  });
+
+  test('honors GSTACK_OLLAMA_URL override', async () => {
+    process.env.GSTACK_OLLAMA_URL = 'http://example.local:9999';
+    let calledUrl = '';
+    stubFetch(async (input) => {
+      calledUrl = String(input);
+      return new Response(JSON.stringify({ models: [{ name: 'foo' }] }), { status: 200 });
+    });
+    const adapter = new OllamaAdapter();
+    await adapter.available();
+    expect(calledUrl).toBe('http://example.local:9999/api/tags');
+  });
+});
+
+describe('OllamaAdapter.run', () => {
+  beforeEach(() => {
+    delete process.env.GSTACK_OLLAMA_URL;
+    delete process.env.GSTACK_OLLAMA_MODEL;
+  });
+  afterEach(() => {
+    restoreFetch();
+    if (REAL_OLLAMA_URL !== undefined) process.env.GSTACK_OLLAMA_URL = REAL_OLLAMA_URL;
+    if (REAL_OLLAMA_MODEL !== undefined) process.env.GSTACK_OLLAMA_MODEL = REAL_OLLAMA_MODEL;
+  });
+
+  test('parses successful response into RunResult', async () => {
+    stubFetch(async () => new Response(JSON.stringify({
+      response: 'hello world',
+      model: 'qwen2.5-coder:7b',
+      prompt_eval_count: 12,
+      eval_count: 34,
+      done: true,
+    }), { status: 200 }));
+    const adapter = new OllamaAdapter();
+    const res = await adapter.run({ prompt: 'hi', workdir: '/tmp', timeoutMs: 5000 });
+    expect(res.output).toBe('hello world');
+    expect(res.tokens.input).toBe(12);
+    expect(res.tokens.output).toBe(34);
+    expect(res.toolCalls).toBe(0);
+    expect(res.modelUsed).toBe('qwen2.5-coder:7b');
+    expect(res.error).toBeUndefined();
+  });
+
+  test('sends correct POST body to /api/generate', async () => {
+    let capturedBody: unknown = null;
+    let capturedUrl = '';
+    stubFetch(async (input, init) => {
+      capturedUrl = String(input);
+      capturedBody = init?.body ? JSON.parse(String(init.body)) : null;
+      return new Response(JSON.stringify({ response: '', eval_count: 0, prompt_eval_count: 0 }), { status: 200 });
+    });
+    const adapter = new OllamaAdapter();
+    await adapter.run({ prompt: 'test prompt', workdir: '/tmp', timeoutMs: 5000, model: 'llama3.2:3b' });
+    expect(capturedUrl).toBe('http://localhost:11434/api/generate');
+    expect(capturedBody).toEqual({ model: 'llama3.2:3b', prompt: 'test prompt', stream: false });
+  });
+
+  test('uses default model when none specified', async () => {
+    let capturedModel = '';
+    stubFetch(async (_input, init) => {
+      const body = init?.body ? JSON.parse(String(init.body)) : {};
+      capturedModel = body.model;
+      return new Response(JSON.stringify({ response: '', eval_count: 0, prompt_eval_count: 0 }), { status: 200 });
+    });
+    const adapter = new OllamaAdapter();
+    await adapter.run({ prompt: 'x', workdir: '/tmp', timeoutMs: 5000 });
+    expect(capturedModel).toBe('qwen2.5-coder:7b');
+  });
+
+  test('honors GSTACK_OLLAMA_MODEL override', async () => {
+    process.env.GSTACK_OLLAMA_MODEL = 'custom-model:13b';
+    let capturedModel = '';
+    stubFetch(async (_input, init) => {
+      const body = init?.body ? JSON.parse(String(init.body)) : {};
+      capturedModel = body.model;
+      return new Response(JSON.stringify({ response: '', eval_count: 0, prompt_eval_count: 0 }), { status: 200 });
+    });
+    const adapter = new OllamaAdapter();
+    await adapter.run({ prompt: 'x', workdir: '/tmp', timeoutMs: 5000 });
+    expect(capturedModel).toBe('custom-model:13b');
+  });
+
+  test('returns binary_missing error on ECONNREFUSED', async () => {
+    stubFetch(async () => { throw new Error('fetch failed: ECONNREFUSED'); });
+    const adapter = new OllamaAdapter();
+    const res = await adapter.run({ prompt: 'x', workdir: '/tmp', timeoutMs: 5000 });
+    expect(res.error?.code).toBe('binary_missing');
+    expect(res.error?.reason).toMatch(/ollama serve/);
+    expect(res.output).toBe('');
+  });
+
+  test('returns unknown error with helpful message on 404 (model not pulled)', async () => {
+    stubFetch(async () => new Response('model "missing" not found', { status: 404 }));
+    const adapter = new OllamaAdapter();
+    const res = await adapter.run({ prompt: 'x', workdir: '/tmp', timeoutMs: 5000, model: 'missing' });
+    expect(res.error?.code).toBe('unknown');
+    expect(res.error?.reason).toMatch(/ollama pull missing/);
+  });
+
+  test('returns timeout error when fetch is aborted', async () => {
+    stubFetch(async (_input, init) => {
+      return new Promise<Response>((_resolve, reject) => {
+        init?.signal?.addEventListener('abort', () => reject(new Error('The operation was aborted')));
+      });
+    });
+    const adapter = new OllamaAdapter();
+    const res = await adapter.run({ prompt: 'x', workdir: '/tmp', timeoutMs: 50 });
+    expect(res.error?.code).toBe('timeout');
+  });
+});
+
+describe('OllamaAdapter.estimateCost', () => {
+  test('returns 0 for known Ollama models', () => {
+    const adapter = new OllamaAdapter();
+    expect(adapter.estimateCost({ input: 1_000_000, output: 500_000 }, 'qwen2.5-coder:7b')).toBe(0);
+    expect(adapter.estimateCost({ input: 1_000_000, output: 500_000 }, 'llama3.2:3b')).toBe(0);
+  });
+});
+
+describe('OllamaAdapter identity', () => {
+  test('exposes stable name and family', () => {
+    const adapter = new OllamaAdapter();
+    expect(adapter.name).toBe('ollama');
+    expect(adapter.family).toBe('ollama');
+  });
+});
diff --git a/test/skill-e2e-benchmark-providers.test.ts b/test/skill-e2e-benchmark-providers.test.ts
index 12456ec231..8ad5e80a42 100644
--- a/test/skill-e2e-benchmark-providers.test.ts
+++ b/test/skill-e2e-benchmark-providers.test.ts
@@ -22,6 +22,7 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
 import { ClaudeAdapter } from './helpers/providers/claude';
 import { GptAdapter } from './helpers/providers/gpt';
 import { GeminiAdapter } from './helpers/providers/gemini';
+import { OllamaAdapter } from './helpers/providers/ollama';
 import { runBenchmark } from './helpers/benchmark-runner';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -39,6 +40,7 @@ const PROMPT = 'Reply with exactly this text and nothing else: ok';
 const claude = new ClaudeAdapter();
 const gpt = new GptAdapter();
 const gemini = new GeminiAdapter();
+const ollama = new OllamaAdapter();
 
 // Use a temp working directory so provider CLIs can't accidentally touch the repo.
 // Created in beforeAll / cleaned in afterAll so concurrent CI runs don't leak.
@@ -80,6 +82,15 @@ describeIfEvals('multi-provider benchmark adapters (live)', () => {
     }
   });
 
+  test('ollama: available() returns structured ok/reason', async () => {
+    const check = await ollama.available();
+    expect(check).toHaveProperty('ok');
+    if (!check.ok) {
+      expect(typeof check.reason).toBe('string');
+      expect(check.reason!.length).toBeGreaterThan(0);
+    }
+  });
+
   test('claude: trivial prompt produces parseable output', async () => {
     const check = await claude.available();
     if (!check.ok) {
@@ -144,6 +155,28 @@
     expect(typeof result.modelUsed).toBe('string');
   }, 150_000);
 
+  test('ollama: trivial prompt produces parseable output', async () => {
+    const check = await ollama.available();
+    if (!check.ok) {
+      process.stderr.write(`\nollama live smoke: SKIPPED — ${check.reason}\n`);
+      return;
+    }
+    const result = await ollama.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
+    if (result.error) {
+      throw new Error(`ollama errored: ${result.error.code} — ${result.error.reason}`);
+    }
+    // Local inference — model+quantization quality varies, so don't grep for "ok".
+    // Assert structural correctness: text output, parseable tokens, valid duration.
+    expect(typeof result.output).toBe('string');
+    expect(result.tokens.input).toBeGreaterThan(0);
+    expect(result.tokens.output).toBeGreaterThan(0);
+    expect(result.durationMs).toBeGreaterThan(0);
+    expect(typeof result.modelUsed).toBe('string');
+    expect(result.modelUsed.length).toBeGreaterThan(0);
+    // Local inference has zero per-token cost.
+    expect(ollama.estimateCost(result.tokens, result.modelUsed)).toBe(0);
+  }, 150_000);
+
   test('timeout error surfaces as error.code=timeout (no exception)', async () => {
     // Use whatever adapter is available first — all three should share timeout semantics.
     const adapter = (await claude.available()).ok ? claude
@@ -164,18 +197,18 @@ describeIfEvals('multi-provider benchmark adapters (live)', () => {
   }, 30_000);
 
   test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => {
-    // Use the full runner with all three providers — whichever are unauthed should
+    // Use the full runner with all four providers — whichever are unauthed should
     // return entries with available=false and not crash the batch.
     const report = await runBenchmark({
       prompt: PROMPT,
       workdir,
-      providers: ['claude', 'gpt', 'gemini'],
+      providers: ['claude', 'gpt', 'gemini', 'ollama'],
      timeoutMs: 120_000,
       skipUnavailable: false,
     });
-    expect(report.entries).toHaveLength(3);
+    expect(report.entries).toHaveLength(4);
     for (const e of report.entries) {
-      expect(['claude', 'gpt', 'gemini']).toContain(e.family);
+      expect(['claude', 'gpt', 'gemini', 'ollama']).toContain(e.family);
       if (e.available) {
         expect(e.result).toBeDefined();
       } else {
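
Reviewer smoke test (a minimal sketch, not part of the patch): every flag, env var, and model name below comes from this diff; `$BIN` stands for the resolved path to `bin/gstack-model-benchmark`, as in the SKILL steps, and `gpu-box` is a placeholder hostname.

```bash
# Pull the adapter's default model and start the daemon (skip if already running).
ollama pull qwen2.5-coder:7b
ollama serve &

# Dry-run: the "Adapter availability" section should now report "ollama: OK".
"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini,ollama --dry-run

# Real run: unauthed cloud providers are skipped cleanly without aborting the batch.
"$BIN" --prompt "Reply with exactly this text and nothing else: ok" --models claude,ollama

# Non-default daemon or model via the env overrides the adapter reads.
GSTACK_OLLAMA_URL=http://gpu-box:11434 GSTACK_OLLAMA_MODEL=llama3.2:3b \
  "$BIN" --prompt "hi" --models ollama
```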