18 changes: 10 additions & 8 deletions benchmark-models/SKILL.md
@@ -4,17 +4,19 @@ preamble-tier: 1
version: 1.0.0
description: |
Cross-model benchmark for gstack skills. Runs the same prompt through Claude,
GPT (via Codex CLI), and Gemini side-by-side — compares latency, tokens, cost,
and optionally quality via LLM judge. Answers "which model is actually best
for this skill?" with data instead of vibes. Separate from /benchmark, which
measures web page performance. Use when: "benchmark models", "compare models",
"which model is best for X", "cross-model comparison", "model shootout". (gstack)
GPT (via Codex CLI), Gemini, and Ollama (local) side-by-side — compares
latency, tokens, cost, and optionally quality via LLM judge. Answers "which
model is actually best for this skill?" with data instead of vibes. Separate
from /benchmark, which measures web page performance. Use when: "benchmark
models", "compare models", "which model is best for X", "cross-model
comparison", "model shootout". (gstack)
Voice triggers (speech-to-text aliases): "compare models", "model shootout", "which model is best".
triggers:
- cross model benchmark
- compare claude gpt gemini
- compare claude gpt gemini ollama
- benchmark skill across models
- which model should I use
- local vs cloud model comparison
allowed-tools:
- Bash
- Read
@@ -522,12 +524,12 @@ If C: ask for the path. Verify it exists. Use as positional argument.
## Step 2: Choose providers

```bash
"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini --dry-run
"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini,ollama --dry-run
```

Show the dry-run output. The "Adapter availability" section tells the user which providers will actually run (OK) vs skip (NOT READY — remediation hint included).
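A dry-run report looks roughly like this (an illustrative sketch, not verbatim output; the `providers:` line and the per-adapter `OK` / `NOT READY` markers are the parts the CLI tests pin down):

```
providers: claude, gpt, gemini, ollama
prompt:    unused, dry-run

Adapter availability:
  claude: OK
  gpt:    NOT READY — codex CLI not logged in. Try `codex login`.
  gemini: NOT READY — no GOOGLE_API_KEY and gemini CLI not logged in.
  ollama: OK
```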

If ALL three show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, or `gemini login` / `export GOOGLE_API_KEY`.
If ALL four show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, `gemini login` / `export GOOGLE_API_KEY`, or for local: install Ollama from https://ollama.com then `ollama pull qwen2.5-coder:7b && ollama serve`.

If at least one is OK: AskUserQuestion:
- **Simplify:** "Which models should we include? The dry-run above showed which are authed. Unauthed ones will be skipped cleanly — they won't abort the batch."
18 changes: 10 additions & 8 deletions benchmark-models/SKILL.md.tmpl
@@ -4,20 +4,22 @@ preamble-tier: 1
version: 1.0.0
description: |
Cross-model benchmark for gstack skills. Runs the same prompt through Claude,
GPT (via Codex CLI), and Gemini side-by-side — compares latency, tokens, cost,
and optionally quality via LLM judge. Answers "which model is actually best
for this skill?" with data instead of vibes. Separate from /benchmark, which
measures web page performance. Use when: "benchmark models", "compare models",
"which model is best for X", "cross-model comparison", "model shootout". (gstack)
GPT (via Codex CLI), Gemini, and Ollama (local) side-by-side — compares
latency, tokens, cost, and optionally quality via LLM judge. Answers "which
model is actually best for this skill?" with data instead of vibes. Separate
from /benchmark, which measures web page performance. Use when: "benchmark
models", "compare models", "which model is best for X", "cross-model
comparison", "model shootout". (gstack)
voice-triggers:
- "compare models"
- "model shootout"
- "which model is best"
triggers:
- cross model benchmark
- compare claude gpt gemini
- compare claude gpt gemini ollama
- benchmark skill across models
- which model should I use
- local vs cloud model comparison
allowed-tools:
- Bash
- Read
@@ -69,12 +71,12 @@ If C: ask for the path. Verify it exists. Use as positional argument.
## Step 2: Choose providers

```bash
"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini --dry-run
"$BIN" --prompt "unused, dry-run" --models claude,gpt,gemini,ollama --dry-run
```

Show the dry-run output. The "Adapter availability" section tells the user which providers will actually run (OK) vs skip (NOT READY — remediation hint included).

If ALL three show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, or `gemini login` / `export GOOGLE_API_KEY`.
If ALL four show NOT READY: stop with a clear message — benchmark can't run without at least one authed provider. Suggest `claude login`, `codex login`, `gemini login` / `export GOOGLE_API_KEY`, or for local: install Ollama from https://ollama.com then `ollama pull qwen2.5-coder:7b && ollama serve`.

If at least one is OK: AskUserQuestion:
- **Simplify:** "Which models should we include? The dry-run above showed which are authed. Unauthed ones will be skipped cleanly — they won't abort the batch."
19 changes: 13 additions & 6 deletions bin/gstack-model-benchmark
@@ -7,7 +7,9 @@
* gstack-model-benchmark <skill-or-prompt-file> [options]
*
* Options:
* --models claude,gpt,gemini Comma-separated provider list (default: claude)
* --models claude,gpt,gemini,ollama
* Comma-separated provider list (default: claude).
* Valid: claude, gpt, gemini, ollama (local).
* --prompt "<text>" Inline prompt instead of a file
* --workdir <path> Working dir passed to each CLI (default: cwd)
* --timeout-ms <n> Per-provider timeout (default: 300000)
@@ -27,16 +29,21 @@
import * as fs from 'fs';
import * as path from 'path';
import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner';
import type { Family } from '../test/helpers/providers/types';
import { ClaudeAdapter } from '../test/helpers/providers/claude';
import { GptAdapter } from '../test/helpers/providers/gpt';
import { GeminiAdapter } from '../test/helpers/providers/gemini';
import { OllamaAdapter } from '../test/helpers/providers/ollama';

const ADAPTER_FACTORIES = {
claude: () => new ClaudeAdapter(),
gpt: () => new GptAdapter(),
gemini: () => new GeminiAdapter(),
ollama: () => new OllamaAdapter(),
};

const VALID_PROVIDERS: Family[] = ['claude', 'gpt', 'gemini', 'ollama'];

type OutputFormat = 'table' | 'json' | 'markdown';

function arg(name: string, def?: string): string | undefined {
@@ -51,13 +58,13 @@ function flag(name: string): boolean {
return process.argv.includes(name);
}

function parseProviders(s: string | undefined): Array<'claude' | 'gpt' | 'gemini'> {
function parseProviders(s: string | undefined): Family[] {
if (!s) return ['claude'];
const seen = new Set<'claude' | 'gpt' | 'gemini'>();
const seen = new Set<Family>();
for (const p of s.split(',').map(x => x.trim()).filter(Boolean)) {
if (p === 'claude' || p === 'gpt' || p === 'gemini') seen.add(p);
if ((VALID_PROVIDERS as string[]).includes(p)) seen.add(p as Family);
else {
console.error(`WARN: unknown provider '${p}' — skipping. Valid: claude, gpt, gemini.`);
console.error(`WARN: unknown provider '${p}' — skipping. Valid: ${VALID_PROVIDERS.join(', ')}.`);
}
}
return seen.size ? Array.from(seen) : ['claude'];
@@ -124,7 +131,7 @@ async function main(): Promise<void> {

async function dryRunReport(opts: {
prompt: string;
providers: Array<'claude' | 'gpt' | 'gemini'>;
providers: Family[];
workdir: string;
timeoutMs: number;
output: OutputFormat;
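For reference, two plausible invocations assembled only from the options documented above (the prompt-file path is hypothetical):

```bash
# Compare all four providers on a saved prompt, with a 2-minute cap per provider.
gstack-model-benchmark prompts/refactor-task.md \
  --models claude,gpt,gemini,ollama \
  --timeout-ms 120000

# Quick inline comparison: Claude vs. a local Ollama model, run from a specific workdir.
gstack-model-benchmark --prompt "Summarize the test strategy in this repo" \
  --models claude,ollama \
  --workdir ./my-project
```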
17 changes: 17 additions & 0 deletions test/benchmark-cli.test.ts
@@ -147,6 +147,23 @@ describe('gstack-model-benchmark --dry-run', () => {
// Summary truncates to 80 chars + ellipsis
expect(r.stdout).toMatch(/prompt:\s+x{80}…/);
});

test('ollama is accepted in --models whitelist', () => {
const r = run(['--prompt', 'hi', '--models', 'claude,ollama', '--dry-run']);
expect(r.status).toBe(0);
expect(r.stdout).toContain('providers: claude, ollama');
// Ollama adapter must appear in availability section with OK or NOT READY
expect(r.stdout).toMatch(/ollama:\s+(OK|NOT READY)/);
// Stderr must NOT contain the unknown-provider warning for ollama
expect(r.stderr).not.toContain("unknown provider 'ollama'");
});

test('valid provider list in WARN message includes ollama', () => {
const r = run(['--prompt', 'hi', '--models', 'fake-provider', '--dry-run']);
expect(r.status).toBe(0);
expect(r.stderr).toContain('ollama');
expect(r.stderr).toContain('Valid:');
});
});

describe('gstack-model-benchmark prompt resolution', () => {
11 changes: 10 additions & 1 deletion test/benchmark-runner.test.ts
@@ -42,6 +42,12 @@ test('PRICING table covers the key model families', () => {
expect(PRICING['claude-sonnet-4-6']).toBeDefined();
expect(PRICING['gpt-5.4']).toBeDefined();
expect(PRICING['gemini-2.5-pro']).toBeDefined();
expect(PRICING['qwen2.5-coder:7b']).toBeDefined();
});

test('Ollama models are priced at $0 (local inference)', () => {
expect(estimateCostUsd({ input: 1_000_000, output: 1_000_000 }, 'qwen2.5-coder:7b')).toBe(0);
expect(estimateCostUsd({ input: 1_000_000, output: 1_000_000 }, 'llama3.2:3b')).toBe(0);
});

test('missingTools reports unsupported tools per provider', () => {
@@ -51,12 +57,15 @@ test('missingTools reports unsupported tools per provider', () => {
expect(missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read'])).toEqual([]);
// Gemini has very limited agentic surface
expect(missingTools('gemini', ['Bash', 'Edit'])).toEqual(['Bash', 'Edit']);
// Ollama /api/generate has zero agentic surface — every tool is unsupported
expect(missingTools('ollama', ['Read', 'Bash', 'Edit'])).toEqual(['Read', 'Bash', 'Edit']);
});

test('TOOL_COMPATIBILITY is populated for all three families', () => {
test('TOOL_COMPATIBILITY is populated for all four families', () => {
expect(TOOL_COMPATIBILITY.claude).toBeDefined();
expect(TOOL_COMPATIBILITY.gpt).toBeDefined();
expect(TOOL_COMPATIBILITY.gemini).toBeDefined();
expect(TOOL_COMPATIBILITY.ollama).toBeDefined();
});

test('formatTable handles a report with mixed success/error/unavailable entries', () => {
14 changes: 8 additions & 6 deletions test/helpers/benchmark-runner.ts
@@ -7,26 +7,27 @@
* one. Per-provider auth/timeout/rate-limit errors don't abort the batch.
*/

import type { ProviderAdapter, RunOpts, RunResult } from './providers/types';
import type { ProviderAdapter, RunOpts, RunResult, Family } from './providers/types';
import { ClaudeAdapter } from './providers/claude';
import { GptAdapter } from './providers/gpt';
import { GeminiAdapter } from './providers/gemini';
import { OllamaAdapter } from './providers/ollama';

export interface BenchmarkInput {
prompt: string;
workdir: string;
timeoutMs?: number;
/** Adapter names to run (e.g., ['claude', 'gpt', 'gemini']). */
providers: Array<'claude' | 'gpt' | 'gemini'>;
/** Adapter names to run (e.g., ['claude', 'gpt', 'gemini', 'ollama']). */
providers: Family[];
/** Optional per-provider model overrides. */
models?: Partial<Record<'claude' | 'gpt' | 'gemini', string>>;
models?: Partial<Record<Family, string>>;
/** If true, skip providers whose available() returns !ok. If false, include them with error. */
skipUnavailable?: boolean;
}

export interface BenchmarkEntry {
provider: string;
family: 'claude' | 'gpt' | 'gemini';
family: Family;
available: boolean;
unavailable_reason?: string;
result?: RunResult;
@@ -44,10 +45,11 @@ export interface BenchmarkReport {
entries: BenchmarkEntry[];
}

const ADAPTERS: Record<'claude' | 'gpt' | 'gemini', () => ProviderAdapter> = {
const ADAPTERS: Record<Family, () => ProviderAdapter> = {
claude: () => new ClaudeAdapter(),
gpt: () => new GptAdapter(),
gemini: () => new GeminiAdapter(),
ollama: () => new OllamaAdapter(),
};

export async function runBenchmark(input: BenchmarkInput): Promise<BenchmarkReport> {
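A minimal programmatic sketch against the interfaces above, assuming `runBenchmark` is imported from this module and that unavailable providers should be skipped rather than surfaced as errors (`skipUnavailable: true`):

```ts
import { runBenchmark, type BenchmarkInput } from './benchmark-runner';

async function main(): Promise<void> {
  const input: BenchmarkInput = {
    prompt: 'Write a one-line description of this repository.',
    workdir: process.cwd(),
    timeoutMs: 120_000,
    providers: ['claude', 'ollama'],
    // Hypothetical per-provider override: pin Ollama to a smaller local model.
    models: { ollama: 'llama3.2:3b' },
    skipUnavailable: true,
  };

  const report = await runBenchmark(input);
  for (const e of report.entries) {
    if (!e.available) console.log(`${e.provider}: skipped (${e.unavailable_reason})`);
    else if (e.result?.error) console.log(`${e.provider}: ${e.result.error.code}: ${e.result.error.reason}`);
    else console.log(`${e.provider}: ${e.result?.durationMs}ms, ${e.result?.tokens.output} output tokens`);
  }
}

main();
```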
7 changes: 7 additions & 0 deletions test/helpers/pricing.ts
@@ -32,6 +32,13 @@ export const PRICING: Record<string, ModelPricing> = {
// Google
'gemini-2.5-pro': { input_per_mtok: 1.25, output_per_mtok: 5.00, as_of: '2026-04' },
'gemini-2.5-flash': { input_per_mtok: 0.30, output_per_mtok: 1.20, as_of: '2026-04' },

// Ollama (local inference — no API cost). Entries exist so estimateCostUsd
// returns 0 without emitting a "no pricing" WARN. If you ever run Ollama
// against a paid GPU host, override the rates per-model here.
'qwen2.5-coder:7b': { input_per_mtok: 0, output_per_mtok: 0, as_of: '2026-04' },
'llama3.2:3b': { input_per_mtok: 0, output_per_mtok: 0, as_of: '2026-04' },
'nomic-embed-text': { input_per_mtok: 0, output_per_mtok: 0, as_of: '2026-04' },
};

const WARNED = new Set<string>();
130 changes: 130 additions & 0 deletions test/helpers/providers/ollama.ts
@@ -0,0 +1,130 @@
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
import { estimateCostUsd } from '../pricing';

/**
* Ollama adapter — wraps a local Ollama daemon's HTTP API.
*
* Unlike Claude/GPT/Gemini (which shell out to a CLI binary), Ollama exposes a
* native HTTP server on `http://localhost:11434` by default. The adapter talks
* directly via `fetch()`. No CLI dependency, no auth — the daemon is local.
*
* Default model: `qwen2.5-coder:7b` (general-purpose code-leaning model).
* Override per-run via `RunOpts.model` or globally via `GSTACK_OLLAMA_MODEL`.
* Override daemon URL via `GSTACK_OLLAMA_URL` (e.g. for a remote / non-default port).
*
* Tool-call counting is 0 — the `/api/generate` endpoint emits no tool events.
* If a future benchmark needs tool calls, switch to `/api/chat` with `tools[]`.
* Cost is always 0 — Ollama runs locally on the user's machine.
*/
export class OllamaAdapter implements ProviderAdapter {
readonly name = 'ollama';
readonly family = 'ollama' as const;

private get baseUrl(): string {
return (process.env.GSTACK_OLLAMA_URL ?? 'http://localhost:11434').replace(/\/+$/, '');
}

private get defaultModel(): string {
return process.env.GSTACK_OLLAMA_MODEL ?? 'qwen2.5-coder:7b';
}

async available(): Promise<AvailabilityCheck> {
// Probe the tags endpoint with a tight timeout. A live daemon responds in
// ~5-50ms; a missing one fails immediately with ECONNREFUSED.
const ctrl = new AbortController();
const tid = setTimeout(() => ctrl.abort(), 2000);
try {
const res = await fetch(`${this.baseUrl}/api/tags`, { signal: ctrl.signal });
if (!res.ok) {
return { ok: false, reason: `Ollama daemon at ${this.baseUrl} returned HTTP ${res.status}. Is it healthy? Try \`ollama serve\`.` };
}
const body = await res.json() as { models?: Array<{ name: string }> };
if (!body.models || body.models.length === 0) {
return { ok: false, reason: `Ollama daemon at ${this.baseUrl} has no models pulled. Run \`ollama pull ${this.defaultModel}\`.` };
}
return { ok: true };
} catch (err) {
const msg = (err as Error).message ?? String(err);
if (/abort/i.test(msg)) {
return { ok: false, reason: `Ollama daemon at ${this.baseUrl} did not respond within 2s. Start it with \`ollama serve\` or set GSTACK_OLLAMA_URL.` };
}
return { ok: false, reason: `Ollama daemon not reachable at ${this.baseUrl} (${msg.slice(0, 200)}). Install from https://ollama.com or set GSTACK_OLLAMA_URL.` };
} finally {
clearTimeout(tid);
}
}

async run(opts: RunOpts): Promise<RunResult> {
const start = Date.now();
const model = opts.model ?? this.defaultModel;
const ctrl = new AbortController();
const tid = setTimeout(() => ctrl.abort(), opts.timeoutMs);

try {
const res = await fetch(`${this.baseUrl}/api/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model,
prompt: opts.prompt,
stream: false,
}),
signal: ctrl.signal,
});
if (!res.ok) {
const text = await res.text().catch(() => '');
const durationMs = Date.now() - start;
if (res.status === 404) {
return this.emptyResult(durationMs, model, { code: 'unknown', reason: `model '${model}' not found. Pull it with \`ollama pull ${model}\`. ${text.slice(0, 200)}` });
}
return this.emptyResult(durationMs, model, { code: 'unknown', reason: `HTTP ${res.status}: ${text.slice(0, 400)}` });
}
const body = await res.json() as {
response?: string;
model?: string;
prompt_eval_count?: number;
eval_count?: number;
done?: boolean;
};
return {
output: body.response ?? '',
tokens: {
input: body.prompt_eval_count ?? 0,
output: body.eval_count ?? 0,
},
durationMs: Date.now() - start,
toolCalls: 0,
modelUsed: body.model ?? model,
};
} catch (err) {
const durationMs = Date.now() - start;
const msg = (err as Error).message ?? String(err);
if (/abort/i.test(msg)) {
return this.emptyResult(durationMs, model, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` });
}
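      // Daemon unreachable: Ollama has no CLI binary to miss, so 'binary_missing'
      // here means "local runtime absent" rather than a missing executable.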
if (/ECONNREFUSED|fetch failed|getaddrinfo/i.test(msg)) {
return this.emptyResult(durationMs, model, { code: 'binary_missing', reason: `Ollama daemon not reachable at ${this.baseUrl}. Start it with \`ollama serve\`.` });
}
return this.emptyResult(durationMs, model, { code: 'unknown', reason: msg.slice(0, 400) });
} finally {
clearTimeout(tid);
}
}

estimateCost(_tokens: { input: number; output: number; cached?: number }, model?: string): number {
// Local inference — no API cost. Pass through to pricing table anyway so
// future cloud-hosted Ollama runners (e.g. via paid GPU) can override.
return estimateCostUsd(_tokens, model ?? this.defaultModel);
}

private emptyResult(durationMs: number, model: string, error: RunResult['error']): RunResult {
return {
output: '',
tokens: { input: 0, output: 0 },
durationMs,
toolCalls: 0,
modelUsed: model,
error,
};
}
}
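A direct-use sketch of the adapter above. The `RunOpts` fields are inferred from how `run()` consumes them, plus the `workdir` field the runner passes through (unused by this adapter); the model resolves via `GSTACK_OLLAMA_MODEL` or the default:

```ts
import { OllamaAdapter } from './ollama';

async function probe(): Promise<void> {
  const ollama = new OllamaAdapter();

  const check = await ollama.available();
  if (!check.ok) {
    console.error(check.reason); // daemon down, wrong URL, or no models pulled
    return;
  }

  const result = await ollama.run({
    prompt: 'Reply with the single word: pong',
    workdir: process.cwd(),
    timeoutMs: 60_000,
  });

  if (result.error) console.error(`${result.error.code}: ${result.error.reason}`);
  else console.log(result.modelUsed, `${result.durationMs}ms`, result.output.trim());
}

probe();
```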
2 changes: 1 addition & 1 deletion test/helpers/providers/types.ts
@@ -55,7 +55,7 @@ export interface AvailabilityCheck {
reason?: string;
}

export type Family = 'claude' | 'gpt' | 'gemini';
export type Family = 'claude' | 'gpt' | 'gemini' | 'ollama';

export interface ProviderAdapter {
/** Stable name used in output tables and config (e.g., 'claude', 'gpt', 'gemini'). */