wesleysimplicio · wesleysimplicio · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
@@ -15,3 +15,7 @@ test-results/
 
 # Rust build artifacts
 rust/*/target/
+
+# Local GGUF weights (Path 4 in-process llama backend)
+*.gguf
+/models/
@@ -5,6 +5,34 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.0] — 2026-05-31
+
+### Added
+- **Path 4: in-process local inference via `llama-cpp-python`** (issue #42).
+  A new offline-first provider runs a GGUF model directly in the Python
+  process — zero API key, zero HTTP overhead. The model is loaded once and
+  reused across calls.
+  - **Default local model:** `Qwen2.5-Coder-1.5B-Instruct-Q5_K_M` from
+    `bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF`, fetched once from the
+    Hugging Face Hub.
+  - **Auto-default:** when neither `SIMPLICIO_MODEL` nor `SIMPLICIO_BASE_URL`
+    is set, simplicio now routes to this local model instead of erroring.
+  - **Explicit route:** `SIMPLICIO_MODEL=local-llama/<repo>::<file.gguf>`,
+    `local-llama/default`, or `local-llama//abs/path/model.gguf`.
+  - **`simplicio task --local`** forces the local model regardless of ambient
+    config.
+  - **Tuning knobs:** `SIMPLICIO_LOCAL_MODEL_PATH`, `SIMPLICIO_LOCAL_MODEL_REPO`,
+    `SIMPLICIO_LOCAL_MODEL_FILE`, `SIMPLICIO_LOCAL_CTX`,
+    `SIMPLICIO_LOCAL_THREADS`, `SIMPLICIO_LOCAL_GPU_LAYERS`,
+    `SIMPLICIO_LOCAL_MAX_TOKENS`, `SIMPLICIO_LOCAL_TEMP`.
+  - New optional extra: `pip install 'simplicio-cli[local]'`
+    (`llama-cpp-python>=0.3.2`, `huggingface-hub>=0.23`).
+
+### Changed
+- `simplicio` with no provider configured no longer exits with a
+  "set SIMPLICIO_MODEL" error — it falls back to the local Qwen model
+  (offline-first; requires the `local` extra). Set `SIMPLICIO_BASE_URL` or
+  `SIMPLICIO_MODEL` to opt back into a remote provider.
+
 ## [0.4.4] — 2026-05-30
 
 ### Added

@@ -500,6 +500,7 @@ user prompt. UserPromptSubmit is the right pre-hook for routing decisions.
 | DeepSeek | `deepseek-chat` | `https://api.deepseek.com` |
 | OpenAI | `gpt-4.1` | `https://api.openai.com/v1` |
 | Local (Ollama) | `llama3` | `http://localhost:11434/v1` |
+| Local (in-process) | `local-llama/default` | *(leave unset)* |
 | Anthropic native | `claude-opus-4-7` | *(leave unset)* |
 
 If `SIMPLICIO_BASE_URL` is unset and the key is `ANTHROPIC_API_KEY`, it uses the
@@ -510,6 +511,40 @@ your `base_url` — so **any** OpenAI-like provider works without code changes.
 simplicio smoke      # prints provider config + one test call
 ```
 
+### Path 4 — offline-first local model (zero key, zero HTTP)
+
+simplicio ships an **in-process** backend powered by
+[`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python). When **no
+provider is configured** (`SIMPLICIO_MODEL` *and* `SIMPLICIO_BASE_URL` both
+unset), it runs **Qwen2.5-Coder-1.5B-Instruct (Q5_K_M GGUF)** directly — small,
+code-specialized, fast on CPU, no API key, no Ollama, no HTTP overhead. The
+6-layer contract is what makes a 1.5B model usable: it lifts the same model from
+~34% to ~88% pass-rate on the local benchmark.
+
+```bash
+pip install 'simplicio-cli[local]'          # pulls llama-cpp-python + huggingface-hub
+
+simplicio task "add input validation to createUser" \
+  --target src/users.ts --local              # forces the local model
+
+# the GGUF is fetched once from the Hugging Face Hub, then cached + reused
+```
+
+Explicit routes (override the default model/weights):
+
+```bash
+SIMPLICIO_MODEL=local-llama/default                                  # built-in default (downloaded on first use)
+SIMPLICIO_MODEL=local-llama/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF::Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
+SIMPLICIO_MODEL=local-llama//models/my-model.gguf                    # direct local path
+SIMPLICIO_LOCAL_MODEL_PATH=/models/my-model.gguf                     # always wins
+```
+
+Tuning knobs (all optional): `SIMPLICIO_LOCAL_CTX` (context window, default
+`8192`), `SIMPLICIO_LOCAL_THREADS`, `SIMPLICIO_LOCAL_GPU_LAYERS` (offload to GPU,
+default `0`), `SIMPLICIO_LOCAL_MAX_TOKENS` (generation cap),
+`SIMPLICIO_LOCAL_TEMP` (default `0.1`), `SIMPLICIO_LOCAL_MODEL_REPO` /
+`SIMPLICIO_LOCAL_MODEL_FILE`.
+
 ### The pipeline (both paths)
 
 Whichever entry point you use, each task runs through the same engine:

@@ -1,6 +1,6 @@
 [project]
 name = "simplicio-cli"
-version = "0.4.4"
+version = "0.5.0"
 description = "Portable task-to-code pipeline that works with any LLM. Turn a one-line task into a verified code change — diff + test + verify loop. +55 pts on a 156-check benchmark, 21% faster, ~same tokens."
 readme = "README.md"
 license = { text = "MIT" }
@@ -55,6 +55,9 @@ dependencies = [
 
 [project.optional-dependencies]
 bench = ["fpdf2>=2.7"]
+# Offline-first in-process inference (Path 4). Pulls the llama.cpp Python
+# bindings plus huggingface-hub to fetch the default Qwen2.5-Coder-1.5B GGUF.
+local = ["llama-cpp-python>=0.3.2", "huggingface-hub>=0.23"]
 
 [project.urls]
 Homepage = "https://github.com/wesleysimplicio/simplicio-cli"

@@ -1 +1 @@
-__version__ = "0.4.4"
+__version__ = "0.5.0"
@@ -99,6 +99,12 @@ def main(argv=None):
         default=[],
         help="glob limiting which paths the task may change; repeatable",
     )
+    pt.add_argument(
+        "--local",
+        action="store_true",
+        help="force the in-process local model (Qwen2.5-Coder-1.5B GGUF, "
+        "no API key); overrides SIMPLICIO_MODEL/SIMPLICIO_BASE_URL",
+    )
 
     pb = sub.add_parser("bench", help="compare with vs without (real numbers)")
     pb.add_argument("--root", default=".")
@@ -187,6 +193,12 @@ def main(argv=None):
     else:
         from .pipeline import run, run_task
 
+        if getattr(a, "local", False):
+            # Force Path 4: pin the local model and drop any HTTP endpoint so the
+            # in-process llama backend wins regardless of the ambient config.
+            os.environ["SIMPLICIO_MODEL"] = "local-llama/default"
+            os.environ.pop("SIMPLICIO_BASE_URL", None)
+
         if a.json or a.dry_run_task:
             result = run_task(
                 a.root,

@@ -1,7 +1,7 @@
 """
 providers.py — provider-agnostic. Does NOT list specific models.
 
-Three modes, picked by SIMPLICIO_MODEL prefix:
+Four modes, picked by SIMPLICIO_MODEL prefix (or by absence of config):
 
 1. Native Anthropic SDK
      SIMPLICIO_MODEL=claude-opus-4-7
@@ -20,6 +20,16 @@
      to be logged in (Claude Code session or `codex login`). Subprocess is
      given SIMPLICIO_HOOK_GUARD=1 so the inner CLI does not re-trigger the
      simplicio UserPromptSubmit hook (recursion guard).
+
+4. In-process local inference via llama-cpp-python (offline-first, zero key)
+     SIMPLICIO_MODEL=local-llama/<repo>::<file.gguf>   -> explicit HF GGUF
+     SIMPLICIO_MODEL=local-llama/default               -> built-in default model
+     SIMPLICIO_MODEL=local-llama//abs/path/model.gguf  -> direct local path
+     This is also the DEFAULT when neither SIMPLICIO_MODEL nor
+     SIMPLICIO_BASE_URL is set: simplicio runs Qwen2.5-Coder-1.5B-Instruct
+     (Q5_K_M GGUF) on CPU with no HTTP overhead. The GGUF is fetched once from
+     the Hugging Face Hub and the model is loaded once, then reused. Requires
+     the `local` extra: pip install 'simplicio-cli[local]'.
 """
 
 import os
@@ -56,7 +66,125 @@ def _inline_feedback(prompt, feedback):
     return f"{prompt}\n\nThe test FAILED:\n{feedback}\nFix it. Same output format."
 
 
+# --------------------------------------------------------------------------- #
+# Path 4: in-process local inference (llama-cpp-python). Offline-first default.
+# --------------------------------------------------------------------------- #
+
+# bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF is a small, code-specialized model
+# that runs fast on CPU. Q5_K_M is the speed/quality sweet spot for the 1.5B.
+# Both defaults can be overridden at runtime via SIMPLICIO_LOCAL_MODEL_REPO and
+# SIMPLICIO_LOCAL_MODEL_FILE.
+LOCAL_DEFAULT_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
+LOCAL_DEFAULT_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q5_K_M.gguf"
+# SIMPLICIO_MODEL values starting with this prefix route to the local backend.
+LOCAL_MODEL_PREFIX = "local-llama/"
+
+# Loaded Llama instances, keyed by (gguf_path, n_ctx, n_threads, n_gpu_layers).
+# A model load is expensive (weights -> RAM), so we keep it for the process.
+# Entries are only ever added here; nothing in this section evicts them.
+_LOCAL_LLAMA_CACHE = {}
+
+
+def _is_local(model, base):
+    """Decide whether generate() should use the in-process llama backend.
+
+    Returns True for an explicit `local-llama/` model id, and also for the
+    offline-first default where neither a model nor an OpenAI-compatible
+    base_url is configured. Any other model id returns False.
+    """
+    if not model:
+        # No model selected: local is the fallback only when no base_url is set.
+        return not base
+    return model.startswith(LOCAL_MODEL_PREFIX)
+
+
+def _local_spec(model):
+    """Translate a local-llama model id into a (repo, file, path) triple.
+
+    Exactly one of the two sources is populated: either (repo, file, None) for
+    a Hugging Face download, or (None, None, path) for weights already on disk.
+
+    Recognized forms after the `local-llama/` prefix:
+      "" / "default" / "auto"   -> default repo + default file (env-overridable)
+      "<repo>::<file.gguf>"     -> explicit HF repo + filename
+      "/abs/path/model.gguf"    -> direct local path (no download)
+      "<repo>"                  -> HF repo + default/SIMPLICIO_LOCAL_MODEL_FILE
+    A non-empty SIMPLICIO_LOCAL_MODEL_PATH short-circuits all of the above.
+    """
+    override = os.environ.get("SIMPLICIO_LOCAL_MODEL_PATH")
+    if override:
+        return None, None, override
+
+    default_file = os.environ.get("SIMPLICIO_LOCAL_MODEL_FILE", LOCAL_DEFAULT_FILE)
+
+    # Ids without the prefix (including None/"") are treated like "default".
+    has_prefix = bool(model) and model.startswith(LOCAL_MODEL_PREFIX)
+    tail = model[len(LOCAL_MODEL_PREFIX) :].strip() if has_prefix else ""
+
+    if tail in ("", "default", "auto"):
+        default_repo = os.environ.get("SIMPLICIO_LOCAL_MODEL_REPO", LOCAL_DEFAULT_REPO)
+        return default_repo, default_file, None
+
+    hf_repo, marker, gguf_name = tail.partition("::")
+    if marker:
+        return hf_repo.strip(), gguf_name.strip(), None
+
+    # A .gguf suffix alone is not enough; the id must also look like a path.
+    looks_like_path = os.sep in tail or tail.startswith((".", "/"))
+    if tail.endswith(".gguf") and looks_like_path:
+        return None, None, tail
+
+    return tail, default_file, None
+
+
+def _resolve_local_path(repo, fname, path):
+    """Return a filesystem path to the GGUF, downloading from HF if needed.
+
+    Args:
+        repo: Hugging Face repo id; used only when `path` is falsy.
+        fname: GGUF filename inside `repo`; used only when `path` is falsy.
+        path: Direct path to a local .gguf file, or None/"" to use the Hub.
+
+    Returns:
+        `path` unchanged when it names an existing file, otherwise whatever
+        `hf_hub_download` returns for (repo, fname).
+
+    Raises:
+        SystemExit: `path` is set but is not an existing regular file, or
+            huggingface-hub is not installed.
+    """
+    if path:
+        # isfile rather than exists: the message promises "an existing .gguf
+        # file", so a directory must be rejected here with the actionable hint
+        # instead of surfacing later as a model-load failure.
+        if not os.path.isfile(path):
+            raise SystemExit(
+                f"simplicio: local model not found at {path}. Point "
+                "SIMPLICIO_LOCAL_MODEL_PATH at an existing .gguf file."
+            )
+        return path
+    try:
+        from huggingface_hub import hf_hub_download
+    except ImportError:
+        # `from None`: the chained ImportError adds nothing to the install hint.
+        raise SystemExit(
+            "simplicio: local backend needs huggingface-hub. "
+            "Install extras: pip install 'simplicio-cli[local]'"
+        ) from None
+    return hf_hub_download(repo_id=repo, filename=fname)
+
+
+def _local_llama(model):
+    """Load (or reuse) the Llama instance for the given local model id.
+
+    Reads SIMPLICIO_LOCAL_CTX (default 8192), SIMPLICIO_LOCAL_THREADS (default:
+    passed to Llama as None) and SIMPLICIO_LOCAL_GPU_LAYERS (default 0). A
+    variable that is set but empty is treated as unset for all three.
+
+    Returns:
+        The cached Llama for (gguf path, n_ctx, n_threads, n_gpu_layers), or a
+        newly constructed one that is stored in `_LOCAL_LLAMA_CACHE`.
+
+    Raises:
+        SystemExit: llama-cpp-python is not installed, one of the integer
+            settings is not a valid integer, or the weights cannot be resolved.
+    """
+    try:
+        from llama_cpp import Llama
+    except ImportError:
+        raise SystemExit(
+            "simplicio: local backend needs llama-cpp-python. "
+            "Install extras: pip install 'simplicio-cli[local]'"
+        ) from None
+    # Validate the tuning knobs before resolving the weights, so a typo fails
+    # immediately instead of after a potential Hub download.
+    try:
+        n_ctx = int(os.environ.get("SIMPLICIO_LOCAL_CTX") or "8192")
+        threads = os.environ.get("SIMPLICIO_LOCAL_THREADS")
+        n_threads = int(threads) if threads else None
+        n_gpu_layers = int(os.environ.get("SIMPLICIO_LOCAL_GPU_LAYERS") or "0")
+    except ValueError as err:
+        raise SystemExit(
+            "simplicio: SIMPLICIO_LOCAL_CTX, SIMPLICIO_LOCAL_THREADS and "
+            f"SIMPLICIO_LOCAL_GPU_LAYERS must be integers ({err})"
+        ) from None
+    repo, fname, path = _local_spec(model)
+    gguf = _resolve_local_path(repo, fname, path)
+    cache_key = (gguf, n_ctx, n_threads, n_gpu_layers)
+    llm = _LOCAL_LLAMA_CACHE.get(cache_key)
+    if llm is None:
+        llm = Llama(
+            model_path=gguf,
+            n_ctx=n_ctx,
+            n_threads=n_threads,
+            n_gpu_layers=n_gpu_layers,
+            verbose=False,
+        )
+        _LOCAL_LLAMA_CACHE[cache_key] = llm
+    return llm
+
+
+def _local_generate(prompt, feedback, model, max_tokens):
+    """Generate a completion in-process via llama-cpp-python.
+
+    Args:
+        prompt: Task prompt, passed to `_msgs` together with `feedback`.
+        feedback: Optional failure feedback from a previous attempt.
+        model: Local model id, resolved by `_local_llama`.
+        max_tokens: Generation cap used unless SIMPLICIO_LOCAL_MAX_TOKENS is
+            set to a non-empty value, which then takes precedence.
+
+    Returns:
+        The first choice's message content, or "" when that content is falsy.
+
+    Raises:
+        SystemExit: SIMPLICIO_LOCAL_MAX_TOKENS is not an integer or
+            SIMPLICIO_LOCAL_TEMP is not a number (plus anything
+            `_local_llama` exits with).
+    """
+    # Parse the knobs before loading the model so a bad value fails fast. A
+    # set-but-empty SIMPLICIO_LOCAL_TEMP falls back to the 0.1 default, the
+    # same way an empty SIMPLICIO_LOCAL_MAX_TOKENS falls back to max_tokens.
+    try:
+        cap = os.environ.get("SIMPLICIO_LOCAL_MAX_TOKENS")
+        out_tokens = int(cap) if cap else max_tokens
+        temperature = float(os.environ.get("SIMPLICIO_LOCAL_TEMP") or "0.1")
+    except ValueError as err:
+        raise SystemExit(
+            "simplicio: SIMPLICIO_LOCAL_MAX_TOKENS must be an integer and "
+            f"SIMPLICIO_LOCAL_TEMP a number ({err})"
+        ) from None
+    llm = _local_llama(model)
+    r = llm.create_chat_completion(
+        messages=_msgs(prompt, feedback),
+        max_tokens=out_tokens,
+        temperature=temperature,
+    )
+    return r["choices"][0]["message"]["content"] or ""
+
+
 def _provider_id(model, base):
+    if model and model.startswith(LOCAL_MODEL_PREFIX):
+        return "local-llama"
     if model.startswith("claude-cli/"):
         return "claude-cli"
     if model.startswith("codex-cli/"):
@@ -130,10 +258,35 @@ def _shell_out_codex(prompt, model):
 def generate(prompt, feedback=None, max_tokens=4000):
     c = _cfg()
     model = c["model"]
+
+    # Path 4: in-process local inference. Explicit `local-llama/` model, or the
+    # offline-first default when nothing is configured. No API key, no HTTP.
+    if _is_local(model, c["base"]):
+        eff_model = model or (LOCAL_MODEL_PREFIX + "default")
+        # Fold the resolved weights into the cache key: two different GGUFs can
+        # both route as `local-llama/default` (via SIMPLICIO_LOCAL_MODEL_PATH /
+        # _REPO / _FILE), and must NOT share cached completions.
+        repo, fname, path = _local_spec(eff_model)
+        weights = path or f"{repo}/{fname}"
+        key = make_key(
+            "local-llama",
+            eff_model,
+            prompt,
+            feedback=feedback,
+            max_tokens=max_tokens,
+            weights=weights,
+        )
+        cached = cache().get(key)
+        if cached is not None:
+            return cached.completion
+        out = _local_generate(prompt, feedback, eff_model, max_tokens)
+        cache().put(key, CacheEntry(out, provider_id="local-llama", model=eff_model))
+        return out
+
     if not model:
         raise SystemExit(
             "set SIMPLICIO_MODEL (e.g. anthropic/claude-opus-4, claude-cli/sonnet, "
-            "codex-cli/gpt-5, glm-4.6, llama3, claude-opus-4-7)"
+            "codex-cli/gpt-5, local-llama/default, glm-4.6, llama3, claude-opus-4-7)"
         )
     provider_id = _provider_id(model, c["base"])
     key = make_key(
@@ -194,6 +347,14 @@ def generate(prompt, feedback=None, max_tokens=4000):
 
 def info():
     c = _cfg()
+    if _is_local(c["model"], c["base"]):
+        eff_model = c["model"] or (LOCAL_MODEL_PREFIX + "default (auto)")
+        repo, fname, path = _local_spec(c["model"] or "")
+        target = path or f"{repo}/{fname}"
+        return (
+            f"model={eff_model} provider=local-llama "
+            f"(in-process, llama-cpp-python) target={target} key=not-needed"
+        )
     model = c["model"] or "(unset)"
     if model.startswith("claude-cli/"):
         return f"model={model} provider=claude-cli (shell-out, uses Claude Code OAuth) key=not-needed"

@@ -7,7 +7,7 @@
 def test_package_version_matches_release_metadata() -> None:
     project = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))["project"]
 
-    assert project["version"] == "0.4.4"
+    assert project["version"] == "0.5.0"
     assert __version__ == project["version"]