diff --git a/.gitignore b/.gitignore
index 0e8f8c7..87a6b2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ test-results/
 
 # Rust build artifacts
 rust/*/target/
+
+# Local GGUF weights (Path 4 in-process llama backend)
+*.gguf
+/models/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00785f2..4610767 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,34 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.0] — 2026-05-31
+
+### Added
+- **Path 4: in-process local inference via `llama-cpp-python`** (issue #42).
+  A new offline-first provider runs a GGUF model directly in the Python
+  process — zero API key, zero HTTP overhead. The model is loaded once and
+  reused across calls.
+  - **Default local model:** `Qwen2.5-Coder-1.5B-Instruct-Q5_K_M` from
+    `bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF`, fetched once from the
+    Hugging Face Hub.
+  - **Auto-default:** when neither `SIMPLICIO_MODEL` nor `SIMPLICIO_BASE_URL`
+    is set, simplicio now routes to this local model instead of erroring.
+  - **Explicit route:** `SIMPLICIO_MODEL=local-llama/<repo>::<file.gguf>`,
+    `local-llama/default`, or `local-llama//abs/path/model.gguf`.
+  - **`simplicio task --local`** forces the local model regardless of ambient
+    config.
+  - **Tuning knobs:** `SIMPLICIO_LOCAL_MODEL_PATH`, `SIMPLICIO_LOCAL_MODEL_REPO`,
+    `SIMPLICIO_LOCAL_MODEL_FILE`, `SIMPLICIO_LOCAL_CTX`,
+    `SIMPLICIO_LOCAL_THREADS`, `SIMPLICIO_LOCAL_GPU_LAYERS`,
+    `SIMPLICIO_LOCAL_MAX_TOKENS`, `SIMPLICIO_LOCAL_TEMP`.
+  - New optional extra: `pip install 'simplicio-cli[local]'`
+    (`llama-cpp-python>=0.3.2`, `huggingface-hub>=0.23`).
+
+### Changed
+- `simplicio` with no provider configured now falls back to the local Qwen
+  model (offline-first; needs the `local` extra) instead of raising. Set
+  `SIMPLICIO_MODEL` (plus `SIMPLICIO_BASE_URL` if needed) for a remote provider.
+
 ## [0.4.4] — 2026-05-30
 
 ### Added
diff --git a/README.md b/README.md
index 8621f16..0da8abb 100644
--- a/README.md
+++ b/README.md
@@ -500,6 +500,7 @@ user prompt. UserPromptSubmit is the right pre-hook for routing decisions.
 | DeepSeek | `deepseek-chat` | `https://api.deepseek.com` |
 | OpenAI | `gpt-4.1` | `https://api.openai.com/v1` |
 | Local (Ollama) | `llama3` | `http://localhost:11434/v1` |
+| Local (in-process) | `local-llama/default` | *(leave unset)* |
 | Anthropic native | `claude-opus-4-7` | *(leave unset)* |
 
 If `SIMPLICIO_BASE_URL` is unset and the key is `ANTHROPIC_API_KEY`, it uses the
@@ -510,6 +511,40 @@ your `base_url` — so **any** OpenAI-like provider works without code changes.
 simplicio smoke      # prints provider config + one test call
 ```
 
+### Path 4 — offline-first local model (zero key, zero HTTP)
+
+simplicio ships an **in-process** backend powered by
+[`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python). When **no
+provider is configured** (`SIMPLICIO_MODEL` *and* `SIMPLICIO_BASE_URL` both
+unset), it runs **Qwen2.5-Coder-1.5B-Instruct (Q5_K_M GGUF)** directly — small,
+code-specialized, fast on CPU, no API key, no Ollama, no HTTP overhead. The
+6-layer contract is what makes a 1.5B usable: it lifts the same model from ~34%
+to ~88% pass-rate on the local benchmark.
+
+```bash
+pip install 'simplicio-cli[local]'          # pulls llama-cpp-python + huggingface-hub
+
+simplicio task "add input validation to createUser" \
+  --target src/users.ts --local              # forces the local model
+
+# the GGUF is fetched once from the Hugging Face Hub, then cached + reused
+```
+
+Explicit routes (override the default model/weights):
+
+```bash
+SIMPLICIO_MODEL=local-llama/default                                  # built-in default
+SIMPLICIO_MODEL=local-llama/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF::Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
+SIMPLICIO_MODEL=local-llama//models/my-model.gguf                    # direct local path
+SIMPLICIO_LOCAL_MODEL_PATH=/models/my-model.gguf                     # always wins
+```
+
+Tuning knobs (all optional): `SIMPLICIO_LOCAL_CTX` (context window, default
+`8192`), `SIMPLICIO_LOCAL_THREADS`, `SIMPLICIO_LOCAL_GPU_LAYERS` (offload to GPU,
+default `0`), `SIMPLICIO_LOCAL_MAX_TOKENS` (overrides the per-call output
+limit), `SIMPLICIO_LOCAL_TEMP` (default `0.1`), `SIMPLICIO_LOCAL_MODEL_REPO` /
+`SIMPLICIO_LOCAL_MODEL_FILE`.
+
 ### The pipeline (both paths)
 
 Whichever entry point you use, each task runs through the same engine:
diff --git a/pyproject.toml b/pyproject.toml
index a52fe08..eff7f06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "simplicio-cli"
-version = "0.4.4"
+version = "0.5.0"
 description = "Portable task-to-code pipeline that works with any LLM. Turn a one-line task into a verified code change — diff + test + verify loop. +55 pts on a 156-check benchmark, 21% faster, ~same tokens."
 readme = "README.md"
 license = { text = "MIT" }
@@ -55,6 +55,9 @@ dependencies = [
 
 [project.optional-dependencies]
 bench = ["fpdf2>=2.7"]
+# Offline-first in-process inference (Path 4). Pulls the llama.cpp Python
+# bindings plus huggingface-hub to fetch the default Qwen2.5-Coder-1.5B GGUF.
+local = ["llama-cpp-python>=0.3.2", "huggingface-hub>=0.23"]
 
 [project.urls]
 Homepage = "https://github.com/wesleysimplicio/simplicio-cli"
diff --git a/simplicio/__init__.py b/simplicio/__init__.py
index cd1ee63..3d18726 100644
--- a/simplicio/__init__.py
+++ b/simplicio/__init__.py
@@ -1 +1 @@
-__version__ = "0.4.4"
+__version__ = "0.5.0"
diff --git a/simplicio/cli.py b/simplicio/cli.py
index f1d8276..88d9f52 100644
--- a/simplicio/cli.py
+++ b/simplicio/cli.py
@@ -99,6 +99,12 @@ def main(argv=None):
         default=[],
         help="glob limiting which paths the task may change; repeatable",
     )
+    pt.add_argument(
+        "--local",
+        action="store_true",
+        help="force the in-process local model (Qwen2.5-Coder-1.5B GGUF, "
+        "no API key); overrides SIMPLICIO_MODEL/SIMPLICIO_BASE_URL",
+    )
 
     pb = sub.add_parser("bench", help="compare with vs without (real numbers)")
     pb.add_argument("--root", default=".")
@@ -187,6 +193,12 @@ def main(argv=None):
     else:
         from .pipeline import run, run_task
 
+        if getattr(a, "local", False):
+            # Force Path 4: pin the local model and drop any HTTP endpoint so the
+            # in-process llama backend wins regardless of the ambient config.
+            os.environ["SIMPLICIO_MODEL"] = "local-llama/default"
+            os.environ.pop("SIMPLICIO_BASE_URL", None)
+
         if a.json or a.dry_run_task:
             result = run_task(
                 a.root,
diff --git a/simplicio/providers.py b/simplicio/providers.py
index ad82805..233440f 100644
--- a/simplicio/providers.py
+++ b/simplicio/providers.py
@@ -1,7 +1,7 @@
 """
 providers.py — provider-agnostic. Does NOT list specific models.
 
-Three modes, picked by SIMPLICIO_MODEL prefix:
+Four modes, picked by SIMPLICIO_MODEL prefix (or by absence of config):
 
 1. Native Anthropic SDK
      SIMPLICIO_MODEL=claude-opus-4-7
@@ -20,6 +20,16 @@
      to be logged in (Claude Code session or `codex login`). Subprocess is
      given SIMPLICIO_HOOK_GUARD=1 so the inner CLI does not re-trigger the
      simplicio UserPromptSubmit hook (recursion guard).
+
+4. In-process local inference via llama-cpp-python (offline-first, zero key)
+     SIMPLICIO_MODEL=local-llama/<repo>::<file.gguf>   -> explicit HF GGUF
+     SIMPLICIO_MODEL=local-llama/default               -> built-in default
+     SIMPLICIO_MODEL=local-llama//abs/path/model.gguf  -> direct local path
+     This is also the DEFAULT when neither SIMPLICIO_MODEL nor
+     SIMPLICIO_BASE_URL is set: simplicio runs Qwen2.5-Coder-1.5B-Instruct
+     (Q5_K_M GGUF) on CPU with no HTTP overhead. The GGUF is fetched once from
+     the Hugging Face Hub and the model is loaded once, then reused. Requires
+     the `local` extra: pip install 'simplicio-cli[local]'.
 """
 
 import os
@@ -56,7 +66,125 @@ def _inline_feedback(prompt, feedback):
     return f"{prompt}\n\nThe test FAILED:\n{feedback}\nFix it. Same output format."
 
 
+# --------------------------------------------------------------------------- #
+# Path 4: in-process local inference (llama-cpp-python). Offline-first default.
+# --------------------------------------------------------------------------- #
+
+# bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF is a small, code-specialized model
+# that runs fast on CPU. Q5_K_M is the speed/quality sweet spot for the 1.5B.
+LOCAL_DEFAULT_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"  # HF repo id
+LOCAL_DEFAULT_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q5_K_M.gguf"  # file in that repo
+LOCAL_MODEL_PREFIX = "local-llama/"  # model ids with this prefix route to Path 4
+
+# Loaded Llama instances, keyed by (gguf_path, n_ctx, n_threads, n_gpu_layers).
+# A model load is expensive (weights -> RAM), so we keep it for the process.
+_LOCAL_LLAMA_CACHE = {}
+
+
+def _is_local(model, base):
+    """Decide whether generate() must use the in-process llama backend.
+
+    A falsy `model` selects it only when `base` is falsy too (nothing is
+    configured); any other `model` selects it iff it starts with `local-llama/`.
+    """
+    if not model:
+        return not base
+    return model.startswith(LOCAL_MODEL_PREFIX)
+
+
+def _local_spec(model):
+    """Resolve (repo, file, path) for a local-llama model id.
+
+    Forms after the `local-llama/` prefix:
+      "" / "default" / "auto"   -> default Qwen2.5-Coder-1.5B Q5_K_M (HF download)
+      "<repo>::<file.gguf>"     -> explicit HF repo + filename
+      "/abs/path/model.gguf"    -> direct local path (no download)
+      "<repo>"                  -> HF repo + default/SIMPLICIO_LOCAL_MODEL_FILE
+    SIMPLICIO_LOCAL_MODEL_PATH always wins when set; blank env vars count as unset.
+    """
+    path = os.environ.get("SIMPLICIO_LOCAL_MODEL_PATH")
+    if path:
+        return None, None, path
+    file_env = os.environ.get("SIMPLICIO_LOCAL_MODEL_FILE") or LOCAL_DEFAULT_FILE
+    spec = ""
+    if model and model.startswith(LOCAL_MODEL_PREFIX):
+        spec = model[len(LOCAL_MODEL_PREFIX) :].strip()
+    if spec and spec not in ("default", "auto"):
+        if "::" in spec:
+            repo, fname = spec.split("::", 1)
+            return repo.strip(), fname.strip(), None
+        if spec.endswith(".gguf") and (os.sep in spec or spec.startswith((".", "/"))):
+            return None, None, spec  # looks like a filesystem path, not a repo id
+        return spec, file_env, None  # bare repo id: pair it with the file default
+    repo = os.environ.get("SIMPLICIO_LOCAL_MODEL_REPO") or LOCAL_DEFAULT_REPO
+    return repo, file_env, None
+
+
+def _resolve_local_path(repo, fname, path):
+    """Return `path` if it is an existing file, else download repo/fname from HF."""
+    if path:
+        if not os.path.isfile(path):  # isfile: a directory is not loadable weights
+            raise SystemExit(
+                f"simplicio: local model not found at {path}. Point "
+                "SIMPLICIO_LOCAL_MODEL_PATH at an existing .gguf file."
+            )
+        return path
+    try:
+        from huggingface_hub import hf_hub_download  # optional `local` extra
+    except ImportError:
+        raise SystemExit(
+            "simplicio: local backend needs huggingface-hub. "
+            "Install extras: pip install 'simplicio-cli[local]'"
+        ) from None  # install hint only; hide the chained ImportError
+    return hf_hub_download(repo_id=repo, filename=fname)
+
+
+def _local_llama(model):
+    """Load (or reuse) the Llama instance for the given local model id.
+
+    Blank tuning env vars mean "use the default"; SystemExit without llama_cpp.
+    """
+    try:
+        from llama_cpp import Llama  # optional `local` extra
+    except ImportError:
+        raise SystemExit(
+            "simplicio: local backend needs llama-cpp-python. "
+            "Install extras: pip install 'simplicio-cli[local]'"
+        ) from None
+    gguf = _resolve_local_path(*_local_spec(model))
+    n_ctx = int(os.environ.get("SIMPLICIO_LOCAL_CTX") or "8192")
+    threads = os.environ.get("SIMPLICIO_LOCAL_THREADS")
+    n_threads = int(threads) if threads else None  # unset/blank: pass None through
+    n_gpu_layers = int(os.environ.get("SIMPLICIO_LOCAL_GPU_LAYERS") or "0")
+    cache_key = (gguf, n_ctx, n_threads, n_gpu_layers)
+    if cache_key not in _LOCAL_LLAMA_CACHE:  # load once per distinct configuration
+        _LOCAL_LLAMA_CACHE[cache_key] = Llama(
+            model_path=gguf,
+            n_ctx=n_ctx,
+            n_threads=n_threads,
+            n_gpu_layers=n_gpu_layers,
+            verbose=False,
+        )
+    return _LOCAL_LLAMA_CACHE[cache_key]
+
+
+def _local_generate(prompt, feedback, model, max_tokens):
+    """Generate a completion in-process via llama-cpp-python.
+
+    SIMPLICIO_LOCAL_MAX_TOKENS, if set, replaces `max_tokens`; blank TEMP -> 0.1.
+    """
+    cap = os.environ.get("SIMPLICIO_LOCAL_MAX_TOKENS")
+    out_tokens = int(cap) if cap else max_tokens
+    temperature = float(os.environ.get("SIMPLICIO_LOCAL_TEMP") or "0.1")
+    r = _local_llama(model).create_chat_completion(
+        messages=_msgs(prompt, feedback), max_tokens=out_tokens, temperature=temperature
+    )
+    return r["choices"][0]["message"]["content"] or ""
+
+
 def _provider_id(model, base):
+    if model and model.startswith(LOCAL_MODEL_PREFIX):
+        return "local-llama"
     if model.startswith("claude-cli/"):
         return "claude-cli"
     if model.startswith("codex-cli/"):
@@ -130,10 +258,35 @@ def _shell_out_codex(prompt, model):
 def generate(prompt, feedback=None, max_tokens=4000):
     c = _cfg()
     model = c["model"]
+
+    # Path 4: in-process local inference. Explicit `local-llama/` model, or the
+    # offline-first default when nothing is configured. No API key, no HTTP.
+    if _is_local(model, c["base"]):
+        eff_model = model or (LOCAL_MODEL_PREFIX + "default")
+        # Fold the resolved weights into the cache key: two different GGUFs can
+        # both route as `local-llama/default` (via SIMPLICIO_LOCAL_MODEL_PATH /
+        # _REPO / _FILE), and must NOT share cached completions.
+        repo, fname, path = _local_spec(eff_model)
+        weights = path or f"{repo}/{fname}"
+        key = make_key(
+            "local-llama",
+            eff_model,
+            prompt,
+            feedback=feedback,
+            max_tokens=max_tokens,
+            weights=weights,
+        )
+        cached = cache().get(key)
+        if cached is not None:
+            return cached.completion
+        out = _local_generate(prompt, feedback, eff_model, max_tokens)
+        cache().put(key, CacheEntry(out, provider_id="local-llama", model=eff_model))
+        return out
+
     if not model:
         raise SystemExit(
             "set SIMPLICIO_MODEL (e.g. anthropic/claude-opus-4, claude-cli/sonnet, "
-            "codex-cli/gpt-5, glm-4.6, llama3, claude-opus-4-7)"
+            "codex-cli/gpt-5, local-llama/default, glm-4.6, llama3, claude-opus-4-7)"
         )
     provider_id = _provider_id(model, c["base"])
     key = make_key(
@@ -194,6 +347,14 @@ def generate(prompt, feedback=None, max_tokens=4000):
 
 def info():
     c = _cfg()
+    if _is_local(c["model"], c["base"]):
+        eff_model = c["model"] or (LOCAL_MODEL_PREFIX + "default (auto)")
+        repo, fname, path = _local_spec(c["model"] or "")
+        target = path or f"{repo}/{fname}"
+        return (
+            f"model={eff_model} provider=local-llama "
+            f"(in-process, llama-cpp-python) target={target} key=not-needed"
+        )
     model = c["model"] or "(unset)"
     if model.startswith("claude-cli/"):
         return f"model={model} provider=claude-cli (shell-out, uses Claude Code OAuth) key=not-needed"
diff --git a/tests/python/test_package_metadata.py b/tests/python/test_package_metadata.py
index 300c8fa..329edec 100644
--- a/tests/python/test_package_metadata.py
+++ b/tests/python/test_package_metadata.py
@@ -7,7 +7,7 @@
 def test_package_version_matches_release_metadata() -> None:
     project = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))["project"]
 
-    assert project["version"] == "0.4.4"
+    assert project["version"] == "0.5.0"
     assert __version__ == project["version"]
 
 
diff --git a/tests/python/test_providers_local.py b/tests/python/test_providers_local.py
new file mode 100644
index 0000000..b9656fb
--- /dev/null
+++ b/tests/python/test_providers_local.py
@@ -0,0 +1,326 @@
+"""Tests for Path 4: in-process local inference (llama-cpp-python).
+
+The llama-cpp-python and huggingface-hub libs are optional extras that are not
+installed in CI, so we test the routing/spec resolution directly and stub the
+heavy model load (`_local_llama`) when exercising generate().
+"""
+
+import os
+import sys
+import types
+from unittest.mock import MagicMock
+
+import pytest
+
+from simplicio import providers
+from simplicio._cache import reset_for_tests
+
+
+@pytest.fixture(autouse=True)
+def _clean(tmp_path, monkeypatch):
+    for v in (  # start each test with none of these variables set
+        "SIMPLICIO_MODEL",
+        "SIMPLICIO_BASE_URL",
+        "SIMPLICIO_API_KEY",
+        "OPENROUTER_API_KEY",
+        "ANTHROPIC_API_KEY",
+        "SIMPLICIO_LOCAL_MODEL_PATH",
+        "SIMPLICIO_LOCAL_MODEL_REPO",
+        "SIMPLICIO_LOCAL_MODEL_FILE",
+        "SIMPLICIO_LOCAL_CTX",
+        "SIMPLICIO_LOCAL_THREADS",
+        "SIMPLICIO_LOCAL_GPU_LAYERS",
+        "SIMPLICIO_LOCAL_MAX_TOKENS",
+        "SIMPLICIO_LOCAL_TEMP",
+    ):
+        monkeypatch.delenv(v, raising=False)
+    monkeypatch.setenv("SIMPLICIO_CACHE_DIR", str(tmp_path / "cache"))  # per-test dir
+    monkeypatch.delenv("SIMPLICIO_BUST_CACHE", raising=False)
+    providers._LOCAL_LLAMA_CACHE.clear()  # forget Llama instances from earlier tests
+    reset_for_tests()  # NOTE(review): presumably resets the completion cache — confirm
+    yield  # the test body runs here
+    providers._LOCAL_LLAMA_CACHE.clear()  # same cleanup again on teardown
+    reset_for_tests()
+
+
+# --------------------------------------------------------------------------- #
+# _is_local
+# --------------------------------------------------------------------------- #
+
+
+def test_is_local_explicit_prefix():  # the prefix wins even when a base_url is set
+    assert providers._is_local("local-llama/default", None) is True
+    assert providers._is_local("local-llama/repo::a.gguf", "http://x") is True
+
+
+def test_is_local_auto_default_when_nothing_configured():  # None and "" both unset
+    assert providers._is_local(None, None) is True
+    assert providers._is_local("", "") is True
+
+
+def test_is_local_false_when_base_set():  # a base_url alone is not a local route
+    assert providers._is_local(None, "http://localhost:11434/v1") is False
+
+
+def test_is_local_false_when_other_model_set():  # ids without the local prefix
+    assert providers._is_local("claude-opus-4-7", None) is False
+    assert providers._is_local("claude-cli/sonnet", None) is False
+
+
+# --------------------------------------------------------------------------- #
+# _local_spec
+# --------------------------------------------------------------------------- #
+
+
+def test_local_spec_default():  # "" carries no prefix, so the defaults apply
+    repo, fname, path = providers._local_spec("")
+    assert repo == providers.LOCAL_DEFAULT_REPO
+    assert fname == providers.LOCAL_DEFAULT_FILE
+    assert path is None
+
+
+def test_local_spec_default_keyword():  # NOTE(review): `path` is never asserted
+    repo, fname, path = providers._local_spec("local-llama/default")
+    assert repo == providers.LOCAL_DEFAULT_REPO
+    assert fname == providers.LOCAL_DEFAULT_FILE
+
+
+def test_local_spec_repo_and_file():  # "::" splits the HF repo from the filename
+    repo, fname, path = providers._local_spec("local-llama/owner/repo::weights.gguf")
+    assert repo == "owner/repo"
+    assert fname == "weights.gguf"
+    assert path is None
+
+
+def test_local_spec_direct_path():  # "//" = prefix slash + an absolute path
+    repo, fname, path = providers._local_spec("local-llama//models/x.gguf")
+    assert repo is None and fname is None
+    assert path == "/models/x.gguf"
+
+
+def test_local_spec_path_env_wins(monkeypatch):  # env path beats the model id
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", "/data/custom.gguf")
+    repo, fname, path = providers._local_spec("local-llama/owner/repo::w.gguf")
+    assert path == "/data/custom.gguf"
+    assert repo is None and fname is None
+
+
+def test_local_spec_file_env_override(monkeypatch):  # env replaces default file
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_FILE", "Q4_K_M.gguf")
+    repo, fname, _ = providers._local_spec("")
+    assert fname == "Q4_K_M.gguf"
+
+
+def test_local_spec_bare_repo_uses_default_file():  # no "::" -> default filename
+    repo, fname, path = providers._local_spec("local-llama/some/repo")
+    assert repo == "some/repo"
+    assert fname == providers.LOCAL_DEFAULT_FILE
+
+
+# --------------------------------------------------------------------------- #
+# _provider_id / info
+# --------------------------------------------------------------------------- #
+
+
+def test_provider_id_local():  # the local prefix maps to the "local-llama" id
+    assert providers._provider_id("local-llama/default", None) == "local-llama"
+
+
+def test_info_local_auto_default():  # `_clean` left model and base_url unset
+    s = providers.info()
+    assert "local-llama" in s
+    assert "in-process" in s
+    assert "key=not-needed" in s
+    assert providers.LOCAL_DEFAULT_FILE in s
+
+
+def test_info_local_explicit(monkeypatch):
+    monkeypatch.setenv("SIMPLICIO_MODEL", "local-llama/owner/repo::w.gguf")
+    s = providers.info()
+    assert "local-llama" in s
+    assert "owner/repo/w.gguf" in s  # info() renders the target as repo/file
+
+
+# --------------------------------------------------------------------------- #
+# _resolve_local_path
+# --------------------------------------------------------------------------- #
+
+
+def test_resolve_local_path_missing_file_raises():  # explicit path must exist
+    with pytest.raises(SystemExit) as exc:
+        providers._resolve_local_path(None, None, "/nope/missing.gguf")
+    assert "not found" in str(exc.value)
+
+
+def test_resolve_local_path_existing_file(tmp_path):  # returned unchanged
+    f = tmp_path / "m.gguf"
+    f.write_bytes(b"x")
+    assert providers._resolve_local_path(None, None, str(f)) == str(f)
+
+
+def test_resolve_local_path_downloads_from_hf(monkeypatch):
+    fake = types.ModuleType("huggingface_hub")
+    fake.hf_hub_download = MagicMock(return_value="/cache/weights.gguf")
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake)  # import finds stub
+    out = providers._resolve_local_path("owner/repo", "weights.gguf", None)
+    assert out == "/cache/weights.gguf"
+    fake.hf_hub_download.assert_called_once_with(
+        repo_id="owner/repo", filename="weights.gguf"
+    )
+
+
+def test_resolve_local_path_no_hf_lib_raises(monkeypatch):
+    monkeypatch.setitem(sys.modules, "huggingface_hub", None)  # None => ImportError
+    with pytest.raises(SystemExit) as exc:
+        providers._resolve_local_path("owner/repo", "w.gguf", None)
+    assert "huggingface-hub" in str(exc.value)
+    assert "simplicio-cli[local]" in str(exc.value)
+
+
+# --------------------------------------------------------------------------- #
+# _local_llama missing backend
+# --------------------------------------------------------------------------- #
+
+
+def test_local_llama_missing_backend_raises(monkeypatch):
+    monkeypatch.setitem(sys.modules, "llama_cpp", None)  # None => ImportError
+    with pytest.raises(SystemExit) as exc:
+        providers._local_llama("local-llama/default")
+    assert "llama-cpp-python" in str(exc.value)
+    assert "simplicio-cli[local]" in str(exc.value)
+
+
+def test_local_llama_loads_and_caches(monkeypatch):  # same config -> one load
+    f = MagicMock(name="LlamaInstance")
+    Llama = MagicMock(return_value=f)
+    fake = types.ModuleType("llama_cpp")
+    fake.Llama = Llama
+    monkeypatch.setitem(sys.modules, "llama_cpp", fake)  # stub the optional backend
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", _touch(monkeypatch))
+
+    a = providers._local_llama("local-llama/default")
+    b = providers._local_llama("local-llama/default")
+    assert a is f and b is f
+    Llama.assert_called_once()  # second call reused the cached instance
+    kwargs = Llama.call_args[1]
+    assert kwargs["n_ctx"] == 8192
+    assert kwargs["n_gpu_layers"] == 0
+    assert kwargs["verbose"] is False
+
+
+def test_local_llama_honours_ctx_threads_gpu(monkeypatch):  # env knobs -> Llama()
+    Llama = MagicMock(return_value=MagicMock())
+    fake = types.ModuleType("llama_cpp")
+    fake.Llama = Llama
+    monkeypatch.setitem(sys.modules, "llama_cpp", fake)  # stub the optional backend
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", _touch(monkeypatch))
+    monkeypatch.setenv("SIMPLICIO_LOCAL_CTX", "16384")
+    monkeypatch.setenv("SIMPLICIO_LOCAL_THREADS", "6")
+    monkeypatch.setenv("SIMPLICIO_LOCAL_GPU_LAYERS", "20")
+
+    providers._local_llama("local-llama/default")
+    kwargs = Llama.call_args[1]
+    assert kwargs["n_ctx"] == 16384  # strings from the environment arrive as ints
+    assert kwargs["n_threads"] == 6
+    assert kwargs["n_gpu_layers"] == 20
+
+
+# --------------------------------------------------------------------------- #
+# generate() local routing
+# --------------------------------------------------------------------------- #
+
+
+def test_generate_routes_to_local_by_default(monkeypatch):  # nothing configured
+    calls = []
+
+    def fake_local(prompt, feedback, model, max_tokens):
+        calls.append((prompt, model, max_tokens))
+        return "LOCAL OK"
+
+    monkeypatch.setattr(providers, "_local_generate", fake_local)
+    out = providers.generate("do x", max_tokens=128)
+    assert out == "LOCAL OK"
+    assert calls[0][1] == "local-llama/default"  # effective model id filled in
+    assert calls[0][2] == 128
+
+
+def test_generate_local_explicit_prefix(monkeypatch):  # id is passed through
+    seen = {}
+
+    def fake_local(prompt, feedback, model, max_tokens):
+        seen["model"] = model
+        return "OK"
+
+    monkeypatch.setenv("SIMPLICIO_MODEL", "local-llama/owner/repo::w.gguf")
+    monkeypatch.setattr(providers, "_local_generate", fake_local)
+    providers.generate("x")
+    assert seen["model"] == "local-llama/owner/repo::w.gguf"
+
+
+def test_generate_local_uses_completion_cache(monkeypatch):
+    n = {"calls": 0}  # dict so the nested stub can mutate the counter
+
+    def fake_local(prompt, feedback, model, max_tokens):
+        n["calls"] += 1
+        return "CACHED"
+
+    monkeypatch.setattr(providers, "_local_generate", fake_local)
+    assert providers.generate("same prompt") == "CACHED"
+    assert providers.generate("same prompt") == "CACHED"
+    assert n["calls"] == 1  # second call served from cache
+
+
+def test_generate_no_model_with_base_still_raises(monkeypatch):
+    monkeypatch.setenv("SIMPLICIO_BASE_URL", "http://localhost:11434/v1")
+    with pytest.raises(SystemExit) as exc:  # base set, model unset: no local default
+        providers.generate("x")
+    assert "SIMPLICIO_MODEL" in str(exc.value)
+    assert "local-llama" in str(exc.value)
+
+
+def test_local_generate_caps_tokens_and_temp(monkeypatch):
+    llm = MagicMock()
+    llm.create_chat_completion.return_value = {
+        "choices": [{"message": {"content": "hi"}}]
+    }
+    monkeypatch.setattr(providers, "_local_llama", lambda model: llm)  # no real load
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MAX_TOKENS", "256")
+    monkeypatch.setenv("SIMPLICIO_LOCAL_TEMP", "0.4")
+
+    out = providers._local_generate("p", None, "local-llama/default", 4000)
+    assert out == "hi"
+    kwargs = llm.create_chat_completion.call_args[1]
+    assert kwargs["max_tokens"] == 256  # cap overrides the 4000 arg
+    assert kwargs["temperature"] == 0.4
+
+
+def test_generate_cache_key_includes_weights(monkeypatch):
+    # Two different GGUFs both routed as the default must not collide in cache.
+    seen = []
+
+    def fake_local(prompt, feedback, model, max_tokens):
+        seen.append(os.environ.get("SIMPLICIO_LOCAL_MODEL_PATH"))
+        return f"out:{os.environ.get('SIMPLICIO_LOCAL_MODEL_PATH')}"
+
+    monkeypatch.setenv("SIMPLICIO_CACHE", "1")  # NOTE(review): presumably enables it
+    monkeypatch.setattr(providers, "_local_generate", fake_local)
+
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", "/models/a.gguf")
+    out_a = providers.generate("same prompt")
+    monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", "/models/b.gguf")
+    out_b = providers.generate("same prompt")
+
+    assert out_a == "out:/models/a.gguf"
+    assert out_b == "out:/models/b.gguf"  # not served stale from model A
+    assert seen == ["/models/a.gguf", "/models/b.gguf"]  # backend ran both times
+
+
+def _touch(monkeypatch):
+    """Create an empty throwaway .gguf file and return its path.
+
+    Lives beside SIMPLICIO_CACHE_DIR (tmp_path via `_clean`) so pytest removes it.
+    """
+    import tempfile
+    parent = os.path.dirname(os.environ.get("SIMPLICIO_CACHE_DIR", "")) or None
+    with tempfile.NamedTemporaryFile(suffix=".gguf", dir=parent, delete=False) as fh:
+        return fh.name  # `monkeypatch` is unused; kept for the existing call sites
diff --git a/tests/python/test_providers_shellout.py b/tests/python/test_providers_shellout.py
index 221c17b..894349f 100644
--- a/tests/python/test_providers_shellout.py
+++ b/tests/python/test_providers_shellout.py
@@ -166,8 +166,11 @@ def test_native_path_still_requires_key(monkeypatch):
     assert "claude-cli" in str(exc.value)
 
 
-def test_no_model_raises_with_hint(monkeypatch):
+def test_no_model_with_base_raises_with_hint(monkeypatch):
+    # With no model but an OpenAI-compatible base_url set, the local default
+    # does NOT kick in (the base signals a remote endpoint) so we still raise.
     monkeypatch.delenv("SIMPLICIO_MODEL", raising=False)
+    monkeypatch.setenv("SIMPLICIO_BASE_URL", "http://localhost:11434/v1")
 
     with pytest.raises(SystemExit) as exc:
         providers.generate("x")
@@ -175,3 +178,14 @@ def test_no_model_raises_with_hint(monkeypatch):
     assert "SIMPLICIO_MODEL" in msg
     assert "claude-cli" in msg
     assert "codex-cli" in msg
+    assert "local-llama" in msg
+
+
+def test_no_config_at_all_routes_to_local_default(monkeypatch):
+    # No model AND no base -> offline-first local default (Path 4), not a raise.
+    monkeypatch.delenv("SIMPLICIO_MODEL", raising=False)
+    monkeypatch.delenv("SIMPLICIO_BASE_URL", raising=False)
+    monkeypatch.setattr(
+        providers, "_local_generate", lambda p, f, m, mt: f"local:{m}"  # echo model
+    )
+    assert providers.generate("x") == "local:local-llama/default"