diff --git a/.gitignore b/.gitignore index 0e8f8c7..87a6b2b 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,7 @@ test-results/ # Rust build artifacts rust/*/target/ + +# Local GGUF weights (Path 4 in-process llama backend) +*.gguf +/models/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 00785f2..4610767 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,34 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.5.0] — 2026-05-31 + +### Added +- **Path 4: in-process local inference via `llama-cpp-python`** (issue #42). + A new offline-first provider runs a GGUF model directly in the Python + process — zero API key, zero HTTP overhead. The model is loaded once and + reused across calls. + - **Default local model:** `Qwen2.5-Coder-1.5B-Instruct-Q5_K_M` from + `bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF`, fetched once from the + Hugging Face Hub. + - **Auto-default:** when neither `SIMPLICIO_MODEL` nor `SIMPLICIO_BASE_URL` + is set, simplicio now routes to this local model instead of erroring. + - **Explicit route:** `SIMPLICIO_MODEL=local-llama/::`, + `local-llama/default`, or `local-llama//abs/path/model.gguf`. + - **`simplicio task --local`** forces the local model regardless of ambient + config. + - **Tuning knobs:** `SIMPLICIO_LOCAL_MODEL_PATH`, `SIMPLICIO_LOCAL_MODEL_REPO`, + `SIMPLICIO_LOCAL_MODEL_FILE`, `SIMPLICIO_LOCAL_CTX`, + `SIMPLICIO_LOCAL_THREADS`, `SIMPLICIO_LOCAL_GPU_LAYERS`, + `SIMPLICIO_LOCAL_MAX_TOKENS`, `SIMPLICIO_LOCAL_TEMP`. + - New optional extra: `pip install 'simplicio-cli[local]'` + (`llama-cpp-python>=0.3.2`, `huggingface-hub>=0.23`). + +### Changed +- `simplicio` with no provider configured no longer raises — it falls back to + the local Qwen model (offline-first). Set `SIMPLICIO_BASE_URL` or + `SIMPLICIO_MODEL` to opt back into a remote provider. + ## [0.4.4] — 2026-05-30 ### Added diff --git a/README.md b/README.md index 8621f16..0da8abb 100644 --- a/README.md +++ b/README.md @@ -500,6 +500,7 @@ user prompt. UserPromptSubmit is the right pre-hook for routing decisions. | DeepSeek | `deepseek-chat` | `https://api.deepseek.com` | | OpenAI | `gpt-4.1` | `https://api.openai.com/v1` | | Local (Ollama) | `llama3` | `http://localhost:11434/v1` | +| Local (in-process) | `local-llama/default` | *(leave unset)* | | Anthropic native | `claude-opus-4-7` | *(leave unset)* | If `SIMPLICIO_BASE_URL` is unset and the key is `ANTHROPIC_API_KEY`, it uses the @@ -510,6 +511,40 @@ your `base_url` — so **any** OpenAI-like provider works without code changes. simplicio smoke # prints provider config + one test call ``` +### Path 4 — offline-first local model (zero key, zero HTTP) + +simplicio ships an **in-process** backend powered by +[`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python). When **no +provider is configured** (`SIMPLICIO_MODEL` *and* `SIMPLICIO_BASE_URL` both +unset), it runs **Qwen2.5-Coder-1.5B-Instruct (Q5_K_M GGUF)** directly — small, +code-specialized, fast on CPU, no API key, no Ollama, no HTTP overhead. The +6-layer contract is what makes a 1.5B usable: it lifts the same model from ~34% +to ~88% pass-rate on the local benchmark. + +```bash +pip install 'simplicio-cli[local]' # pulls llama-cpp-python + huggingface-hub + +simplicio task "add input validation to createUser" \ + --target src/users.ts --local # forces the local model + +# the GGUF is fetched once from the Hugging Face Hub, then cached + reused +``` + +Explicit routes (override the default model/weights): + +```bash +SIMPLICIO_MODEL=local-llama/default # bundled default +SIMPLICIO_MODEL=local-llama/bartowski/Qwen2.5-Coder-7B-Instruct-GGUF::Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf +SIMPLICIO_MODEL=local-llama//models/my-model.gguf # direct local path +SIMPLICIO_LOCAL_MODEL_PATH=/models/my-model.gguf # always wins +``` + +Tuning knobs (all optional): `SIMPLICIO_LOCAL_CTX` (context window, default +`8192`), `SIMPLICIO_LOCAL_THREADS`, `SIMPLICIO_LOCAL_GPU_LAYERS` (offload to GPU, +default `0`), `SIMPLICIO_LOCAL_MAX_TOKENS` (generation cap), +`SIMPLICIO_LOCAL_TEMP` (default `0.1`), `SIMPLICIO_LOCAL_MODEL_REPO` / +`SIMPLICIO_LOCAL_MODEL_FILE`. + ### The pipeline (both paths) Whichever entry point you use, each task runs through the same engine: diff --git a/pyproject.toml b/pyproject.toml index a52fe08..eff7f06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "simplicio-cli" -version = "0.4.4" +version = "0.5.0" description = "Portable task-to-code pipeline that works with any LLM. Turn a one-line task into a verified code change — diff + test + verify loop. +55 pts on a 156-check benchmark, 21% faster, ~same tokens." readme = "README.md" license = { text = "MIT" } @@ -55,6 +55,9 @@ dependencies = [ [project.optional-dependencies] bench = ["fpdf2>=2.7"] +# Offline-first in-process inference (Path 4). Pulls the llama.cpp Python +# bindings plus huggingface-hub to fetch the default Qwen2.5-Coder-1.5B GGUF. +local = ["llama-cpp-python>=0.3.2", "huggingface-hub>=0.23"] [project.urls] Homepage = "https://github.com/wesleysimplicio/simplicio-cli" diff --git a/simplicio/__init__.py b/simplicio/__init__.py index cd1ee63..3d18726 100644 --- a/simplicio/__init__.py +++ b/simplicio/__init__.py @@ -1 +1 @@ -__version__ = "0.4.4" +__version__ = "0.5.0" diff --git a/simplicio/cli.py b/simplicio/cli.py index f1d8276..88d9f52 100644 --- a/simplicio/cli.py +++ b/simplicio/cli.py @@ -99,6 +99,12 @@ def main(argv=None): default=[], help="glob limiting which paths the task may change; repeatable", ) + pt.add_argument( + "--local", + action="store_true", + help="force the in-process local model (Qwen2.5-Coder-1.5B GGUF, " + "no API key); overrides SIMPLICIO_MODEL/SIMPLICIO_BASE_URL", + ) pb = sub.add_parser("bench", help="compare with vs without (real numbers)") pb.add_argument("--root", default=".") @@ -187,6 +193,12 @@ def main(argv=None): else: from .pipeline import run, run_task + if getattr(a, "local", False): + # Force Path 4: pin the local model and drop any HTTP endpoint so the + # in-process llama backend wins regardless of the ambient config. + os.environ["SIMPLICIO_MODEL"] = "local-llama/default" + os.environ.pop("SIMPLICIO_BASE_URL", None) + if a.json or a.dry_run_task: result = run_task( a.root, diff --git a/simplicio/providers.py b/simplicio/providers.py index ad82805..233440f 100644 --- a/simplicio/providers.py +++ b/simplicio/providers.py @@ -1,7 +1,7 @@ """ providers.py — provider-agnostic. Does NOT list specific models. -Three modes, picked by SIMPLICIO_MODEL prefix: +Four modes, picked by SIMPLICIO_MODEL prefix (or by absence of config): 1. Native Anthropic SDK SIMPLICIO_MODEL=claude-opus-4-7 @@ -20,6 +20,16 @@ to be logged in (Claude Code session or `codex login`). Subprocess is given SIMPLICIO_HOOK_GUARD=1 so the inner CLI does not re-trigger the simplicio UserPromptSubmit hook (recursion guard). + +4. In-process local inference via llama-cpp-python (offline-first, zero key) + SIMPLICIO_MODEL=local-llama/:: -> explicit HF GGUF + SIMPLICIO_MODEL=local-llama/default -> bundled default + SIMPLICIO_MODEL=local-llama//abs/path/model.gguf -> direct local path + This is also the DEFAULT when neither SIMPLICIO_MODEL nor + SIMPLICIO_BASE_URL is set: simplicio runs Qwen2.5-Coder-1.5B-Instruct + (Q5_K_M GGUF) on CPU with no HTTP overhead. The GGUF is fetched once from + the Hugging Face Hub and the model is loaded once, then reused. Requires + the `local` extra: pip install 'simplicio-cli[local]'. """ import os @@ -56,7 +66,125 @@ def _inline_feedback(prompt, feedback): return f"{prompt}\n\nThe test FAILED:\n{feedback}\nFix it. Same output format." +# --------------------------------------------------------------------------- # +# Path 4: in-process local inference (llama-cpp-python). Offline-first default. +# --------------------------------------------------------------------------- # + +# bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF is a small, code-specialized model +# that runs fast on CPU. Q5_K_M is the speed/quality sweet spot for the 1.5B. +LOCAL_DEFAULT_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF" +LOCAL_DEFAULT_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q5_K_M.gguf" +LOCAL_MODEL_PREFIX = "local-llama/" + +# Loaded Llama instances, keyed by (gguf_path, n_ctx, n_threads, n_gpu_layers). +# A model load is expensive (weights -> RAM), so we keep it for the process. +_LOCAL_LLAMA_CACHE = {} + + +def _is_local(model, base): + """True when generate() should route to the in-process llama backend. + + Either an explicit `local-llama/` model, or the offline-first default: + nothing configured at all (no model, no OpenAI-compatible base_url). + """ + if model and model.startswith(LOCAL_MODEL_PREFIX): + return True + return not model and not base + + +def _local_spec(model): + """Resolve (repo, file, path) for a local-llama model id. + + Forms after the `local-llama/` prefix: + "" / "default" / "auto" -> bundled Qwen2.5-Coder-1.5B Q5_K_M default + "::" -> explicit HF repo + filename + "/abs/path/model.gguf" -> direct local path (no download) + "" -> HF repo + default/SIMPLICIO_LOCAL_MODEL_FILE + SIMPLICIO_LOCAL_MODEL_PATH always wins when set. + """ + path = os.environ.get("SIMPLICIO_LOCAL_MODEL_PATH") + if path: + return None, None, path + file_env = os.environ.get("SIMPLICIO_LOCAL_MODEL_FILE", LOCAL_DEFAULT_FILE) + spec = "" + if model and model.startswith(LOCAL_MODEL_PREFIX): + spec = model[len(LOCAL_MODEL_PREFIX) :].strip() + if spec and spec not in ("default", "auto"): + if "::" in spec: + repo, fname = spec.split("::", 1) + return repo.strip(), fname.strip(), None + if spec.endswith(".gguf") and (os.sep in spec or spec.startswith((".", "/"))): + return None, None, spec + return spec, file_env, None + repo = os.environ.get("SIMPLICIO_LOCAL_MODEL_REPO", LOCAL_DEFAULT_REPO) + return repo, file_env, None + + +def _resolve_local_path(repo, fname, path): + """Return a filesystem path to the GGUF, downloading from HF if needed.""" + if path: + if not os.path.exists(path): + raise SystemExit( + f"simplicio: local model not found at {path}. Point " + "SIMPLICIO_LOCAL_MODEL_PATH at an existing .gguf file." + ) + return path + try: + from huggingface_hub import hf_hub_download + except ImportError: + raise SystemExit( + "simplicio: local backend needs huggingface-hub. " + "Install extras: pip install 'simplicio-cli[local]'" + ) + return hf_hub_download(repo_id=repo, filename=fname) + + +def _local_llama(model): + """Load (or reuse) the Llama instance for the given local model id.""" + try: + from llama_cpp import Llama + except ImportError: + raise SystemExit( + "simplicio: local backend needs llama-cpp-python. " + "Install extras: pip install 'simplicio-cli[local]'" + ) + repo, fname, path = _local_spec(model) + gguf = _resolve_local_path(repo, fname, path) + n_ctx = int(os.environ.get("SIMPLICIO_LOCAL_CTX", "8192")) + threads = os.environ.get("SIMPLICIO_LOCAL_THREADS") + n_threads = int(threads) if threads else None + n_gpu_layers = int(os.environ.get("SIMPLICIO_LOCAL_GPU_LAYERS", "0")) + cache_key = (gguf, n_ctx, n_threads, n_gpu_layers) + llm = _LOCAL_LLAMA_CACHE.get(cache_key) + if llm is None: + llm = Llama( + model_path=gguf, + n_ctx=n_ctx, + n_threads=n_threads, + n_gpu_layers=n_gpu_layers, + verbose=False, + ) + _LOCAL_LLAMA_CACHE[cache_key] = llm + return llm + + +def _local_generate(prompt, feedback, model, max_tokens): + """Generate a completion in-process via llama-cpp-python.""" + llm = _local_llama(model) + cap = os.environ.get("SIMPLICIO_LOCAL_MAX_TOKENS") + out_tokens = int(cap) if cap else max_tokens + temperature = float(os.environ.get("SIMPLICIO_LOCAL_TEMP", "0.1")) + r = llm.create_chat_completion( + messages=_msgs(prompt, feedback), + max_tokens=out_tokens, + temperature=temperature, + ) + return r["choices"][0]["message"]["content"] or "" + + def _provider_id(model, base): + if model and model.startswith(LOCAL_MODEL_PREFIX): + return "local-llama" if model.startswith("claude-cli/"): return "claude-cli" if model.startswith("codex-cli/"): @@ -130,10 +258,35 @@ def _shell_out_codex(prompt, model): def generate(prompt, feedback=None, max_tokens=4000): c = _cfg() model = c["model"] + + # Path 4: in-process local inference. Explicit `local-llama/` model, or the + # offline-first default when nothing is configured. No API key, no HTTP. + if _is_local(model, c["base"]): + eff_model = model or (LOCAL_MODEL_PREFIX + "default") + # Fold the resolved weights into the cache key: two different GGUFs can + # both route as `local-llama/default` (via SIMPLICIO_LOCAL_MODEL_PATH / + # _REPO / _FILE), and must NOT share cached completions. + repo, fname, path = _local_spec(eff_model) + weights = path or f"{repo}/{fname}" + key = make_key( + "local-llama", + eff_model, + prompt, + feedback=feedback, + max_tokens=max_tokens, + weights=weights, + ) + cached = cache().get(key) + if cached is not None: + return cached.completion + out = _local_generate(prompt, feedback, eff_model, max_tokens) + cache().put(key, CacheEntry(out, provider_id="local-llama", model=eff_model)) + return out + if not model: raise SystemExit( "set SIMPLICIO_MODEL (e.g. anthropic/claude-opus-4, claude-cli/sonnet, " - "codex-cli/gpt-5, glm-4.6, llama3, claude-opus-4-7)" + "codex-cli/gpt-5, local-llama/default, glm-4.6, llama3, claude-opus-4-7)" ) provider_id = _provider_id(model, c["base"]) key = make_key( @@ -194,6 +347,14 @@ def generate(prompt, feedback=None, max_tokens=4000): def info(): c = _cfg() + if _is_local(c["model"], c["base"]): + eff_model = c["model"] or (LOCAL_MODEL_PREFIX + "default (auto)") + repo, fname, path = _local_spec(c["model"] or "") + target = path or f"{repo}/{fname}" + return ( + f"model={eff_model} provider=local-llama " + f"(in-process, llama-cpp-python) target={target} key=not-needed" + ) model = c["model"] or "(unset)" if model.startswith("claude-cli/"): return f"model={model} provider=claude-cli (shell-out, uses Claude Code OAuth) key=not-needed" diff --git a/tests/python/test_package_metadata.py b/tests/python/test_package_metadata.py index 300c8fa..329edec 100644 --- a/tests/python/test_package_metadata.py +++ b/tests/python/test_package_metadata.py @@ -7,7 +7,7 @@ def test_package_version_matches_release_metadata() -> None: project = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))["project"] - assert project["version"] == "0.4.4" + assert project["version"] == "0.5.0" assert __version__ == project["version"] diff --git a/tests/python/test_providers_local.py b/tests/python/test_providers_local.py new file mode 100644 index 0000000..b9656fb --- /dev/null +++ b/tests/python/test_providers_local.py @@ -0,0 +1,326 @@ +"""Tests for Path 4: in-process local inference (llama-cpp-python). + +The llama-cpp-python and huggingface-hub libs are optional extras that are not +installed in CI, so we test the routing/spec resolution directly and stub the +heavy model load (`_local_llama`) when exercising generate(). +""" + +import os +import sys +import types +from unittest.mock import MagicMock + +import pytest + +from simplicio import providers +from simplicio._cache import reset_for_tests + + +@pytest.fixture(autouse=True) +def _clean(tmp_path, monkeypatch): + for v in ( + "SIMPLICIO_MODEL", + "SIMPLICIO_BASE_URL", + "SIMPLICIO_API_KEY", + "OPENROUTER_API_KEY", + "ANTHROPIC_API_KEY", + "SIMPLICIO_LOCAL_MODEL_PATH", + "SIMPLICIO_LOCAL_MODEL_REPO", + "SIMPLICIO_LOCAL_MODEL_FILE", + "SIMPLICIO_LOCAL_CTX", + "SIMPLICIO_LOCAL_THREADS", + "SIMPLICIO_LOCAL_GPU_LAYERS", + "SIMPLICIO_LOCAL_MAX_TOKENS", + "SIMPLICIO_LOCAL_TEMP", + ): + monkeypatch.delenv(v, raising=False) + monkeypatch.setenv("SIMPLICIO_CACHE_DIR", str(tmp_path / "cache")) + monkeypatch.delenv("SIMPLICIO_BUST_CACHE", raising=False) + providers._LOCAL_LLAMA_CACHE.clear() + reset_for_tests() + yield + providers._LOCAL_LLAMA_CACHE.clear() + reset_for_tests() + + +# --------------------------------------------------------------------------- # +# _is_local +# --------------------------------------------------------------------------- # + + +def test_is_local_explicit_prefix(): + assert providers._is_local("local-llama/default", None) is True + assert providers._is_local("local-llama/repo::a.gguf", "http://x") is True + + +def test_is_local_auto_default_when_nothing_configured(): + assert providers._is_local(None, None) is True + assert providers._is_local("", "") is True + + +def test_is_local_false_when_base_set(): + assert providers._is_local(None, "http://localhost:11434/v1") is False + + +def test_is_local_false_when_other_model_set(): + assert providers._is_local("claude-opus-4-7", None) is False + assert providers._is_local("claude-cli/sonnet", None) is False + + +# --------------------------------------------------------------------------- # +# _local_spec +# --------------------------------------------------------------------------- # + + +def test_local_spec_default(): + repo, fname, path = providers._local_spec("") + assert repo == providers.LOCAL_DEFAULT_REPO + assert fname == providers.LOCAL_DEFAULT_FILE + assert path is None + + +def test_local_spec_default_keyword(): + repo, fname, path = providers._local_spec("local-llama/default") + assert repo == providers.LOCAL_DEFAULT_REPO + assert fname == providers.LOCAL_DEFAULT_FILE + + +def test_local_spec_repo_and_file(): + repo, fname, path = providers._local_spec("local-llama/owner/repo::weights.gguf") + assert repo == "owner/repo" + assert fname == "weights.gguf" + assert path is None + + +def test_local_spec_direct_path(): + repo, fname, path = providers._local_spec("local-llama//models/x.gguf") + assert repo is None and fname is None + assert path == "/models/x.gguf" + + +def test_local_spec_path_env_wins(monkeypatch): + monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", "/data/custom.gguf") + repo, fname, path = providers._local_spec("local-llama/owner/repo::w.gguf") + assert path == "/data/custom.gguf" + assert repo is None and fname is None + + +def test_local_spec_file_env_override(monkeypatch): + monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_FILE", "Q4_K_M.gguf") + repo, fname, _ = providers._local_spec("") + assert fname == "Q4_K_M.gguf" + + +def test_local_spec_bare_repo_uses_default_file(): + repo, fname, path = providers._local_spec("local-llama/some/repo") + assert repo == "some/repo" + assert fname == providers.LOCAL_DEFAULT_FILE + + +# --------------------------------------------------------------------------- # +# _provider_id / info +# --------------------------------------------------------------------------- # + + +def test_provider_id_local(): + assert providers._provider_id("local-llama/default", None) == "local-llama" + + +def test_info_local_auto_default(): + s = providers.info() + assert "local-llama" in s + assert "in-process" in s + assert "key=not-needed" in s + assert providers.LOCAL_DEFAULT_FILE in s + + +def test_info_local_explicit(monkeypatch): + monkeypatch.setenv("SIMPLICIO_MODEL", "local-llama/owner/repo::w.gguf") + s = providers.info() + assert "local-llama" in s + assert "owner/repo/w.gguf" in s + + +# --------------------------------------------------------------------------- # +# _resolve_local_path +# --------------------------------------------------------------------------- # + + +def test_resolve_local_path_missing_file_raises(): + with pytest.raises(SystemExit) as exc: + providers._resolve_local_path(None, None, "/nope/missing.gguf") + assert "not found" in str(exc.value) + + +def test_resolve_local_path_existing_file(tmp_path): + f = tmp_path / "m.gguf" + f.write_bytes(b"x") + assert providers._resolve_local_path(None, None, str(f)) == str(f) + + +def test_resolve_local_path_downloads_from_hf(monkeypatch): + fake = types.ModuleType("huggingface_hub") + fake.hf_hub_download = MagicMock(return_value="/cache/weights.gguf") + monkeypatch.setitem(sys.modules, "huggingface_hub", fake) + out = providers._resolve_local_path("owner/repo", "weights.gguf", None) + assert out == "/cache/weights.gguf" + fake.hf_hub_download.assert_called_once_with( + repo_id="owner/repo", filename="weights.gguf" + ) + + +def test_resolve_local_path_no_hf_lib_raises(monkeypatch): + monkeypatch.setitem(sys.modules, "huggingface_hub", None) + with pytest.raises(SystemExit) as exc: + providers._resolve_local_path("owner/repo", "w.gguf", None) + assert "huggingface-hub" in str(exc.value) + assert "simplicio-cli[local]" in str(exc.value) + + +# --------------------------------------------------------------------------- # +# _local_llama missing backend +# --------------------------------------------------------------------------- # + + +def test_local_llama_missing_backend_raises(monkeypatch): + monkeypatch.setitem(sys.modules, "llama_cpp", None) + with pytest.raises(SystemExit) as exc: + providers._local_llama("local-llama/default") + assert "llama-cpp-python" in str(exc.value) + assert "simplicio-cli[local]" in str(exc.value) + + +def test_local_llama_loads_and_caches(monkeypatch): + f = MagicMock(name="LlamaInstance") + Llama = MagicMock(return_value=f) + fake = types.ModuleType("llama_cpp") + fake.Llama = Llama + monkeypatch.setitem(sys.modules, "llama_cpp", fake) + monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", _touch(monkeypatch)) + + a = providers._local_llama("local-llama/default") + b = providers._local_llama("local-llama/default") + assert a is f and b is f + Llama.assert_called_once() # second call reused the cached instance + kwargs = Llama.call_args[1] + assert kwargs["n_ctx"] == 8192 + assert kwargs["n_gpu_layers"] == 0 + assert kwargs["verbose"] is False + + +def test_local_llama_honours_ctx_threads_gpu(monkeypatch): + Llama = MagicMock(return_value=MagicMock()) + fake = types.ModuleType("llama_cpp") + fake.Llama = Llama + monkeypatch.setitem(sys.modules, "llama_cpp", fake) + monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", _touch(monkeypatch)) + monkeypatch.setenv("SIMPLICIO_LOCAL_CTX", "16384") + monkeypatch.setenv("SIMPLICIO_LOCAL_THREADS", "6") + monkeypatch.setenv("SIMPLICIO_LOCAL_GPU_LAYERS", "20") + + providers._local_llama("local-llama/default") + kwargs = Llama.call_args[1] + assert kwargs["n_ctx"] == 16384 + assert kwargs["n_threads"] == 6 + assert kwargs["n_gpu_layers"] == 20 + + +# --------------------------------------------------------------------------- # +# generate() local routing +# --------------------------------------------------------------------------- # + + +def test_generate_routes_to_local_by_default(monkeypatch): + calls = [] + + def fake_local(prompt, feedback, model, max_tokens): + calls.append((prompt, model, max_tokens)) + return "LOCAL OK" + + monkeypatch.setattr(providers, "_local_generate", fake_local) + out = providers.generate("do x", max_tokens=128) + assert out == "LOCAL OK" + assert calls[0][1] == "local-llama/default" + assert calls[0][2] == 128 + + +def test_generate_local_explicit_prefix(monkeypatch): + seen = {} + + def fake_local(prompt, feedback, model, max_tokens): + seen["model"] = model + return "OK" + + monkeypatch.setenv("SIMPLICIO_MODEL", "local-llama/owner/repo::w.gguf") + monkeypatch.setattr(providers, "_local_generate", fake_local) + providers.generate("x") + assert seen["model"] == "local-llama/owner/repo::w.gguf" + + +def test_generate_local_uses_completion_cache(monkeypatch): + n = {"calls": 0} + + def fake_local(prompt, feedback, model, max_tokens): + n["calls"] += 1 + return "CACHED" + + monkeypatch.setattr(providers, "_local_generate", fake_local) + assert providers.generate("same prompt") == "CACHED" + assert providers.generate("same prompt") == "CACHED" + assert n["calls"] == 1 # second call served from cache + + +def test_generate_no_model_with_base_still_raises(monkeypatch): + monkeypatch.setenv("SIMPLICIO_BASE_URL", "http://localhost:11434/v1") + with pytest.raises(SystemExit) as exc: + providers.generate("x") + assert "SIMPLICIO_MODEL" in str(exc.value) + assert "local-llama" in str(exc.value) + + +def test_local_generate_caps_tokens_and_temp(monkeypatch): + llm = MagicMock() + llm.create_chat_completion.return_value = { + "choices": [{"message": {"content": "hi"}}] + } + monkeypatch.setattr(providers, "_local_llama", lambda model: llm) + monkeypatch.setenv("SIMPLICIO_LOCAL_MAX_TOKENS", "256") + monkeypatch.setenv("SIMPLICIO_LOCAL_TEMP", "0.4") + + out = providers._local_generate("p", None, "local-llama/default", 4000) + assert out == "hi" + kwargs = llm.create_chat_completion.call_args[1] + assert kwargs["max_tokens"] == 256 # cap overrides the 4000 arg + assert kwargs["temperature"] == 0.4 + + +def test_generate_cache_key_includes_weights(monkeypatch): + # Two different GGUFs both routed as the default must not collide in cache. + seen = [] + + def fake_local(prompt, feedback, model, max_tokens): + seen.append(os.environ.get("SIMPLICIO_LOCAL_MODEL_PATH")) + return f"out:{os.environ.get('SIMPLICIO_LOCAL_MODEL_PATH')}" + + monkeypatch.setenv("SIMPLICIO_CACHE", "1") + monkeypatch.setattr(providers, "_local_generate", fake_local) + + monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", "/models/a.gguf") + out_a = providers.generate("same prompt") + monkeypatch.setenv("SIMPLICIO_LOCAL_MODEL_PATH", "/models/b.gguf") + out_b = providers.generate("same prompt") + + assert out_a == "out:/models/a.gguf" + assert out_b == "out:/models/b.gguf" # not served stale from model A + assert seen == ["/models/a.gguf", "/models/b.gguf"] + + +def _touch(monkeypatch): + """Create a throwaway .gguf file and return its path.""" + import tempfile + + fd, path = tempfile.mkstemp(suffix=".gguf") + import os as _os + + _os.close(fd) + return path diff --git a/tests/python/test_providers_shellout.py b/tests/python/test_providers_shellout.py index 221c17b..894349f 100644 --- a/tests/python/test_providers_shellout.py +++ b/tests/python/test_providers_shellout.py @@ -166,8 +166,11 @@ def test_native_path_still_requires_key(monkeypatch): assert "claude-cli" in str(exc.value) -def test_no_model_raises_with_hint(monkeypatch): +def test_no_model_with_base_raises_with_hint(monkeypatch): + # With no model but an OpenAI-compatible base_url set, the local default + # does NOT kick in (the base signals a remote endpoint) so we still raise. monkeypatch.delenv("SIMPLICIO_MODEL", raising=False) + monkeypatch.setenv("SIMPLICIO_BASE_URL", "http://localhost:11434/v1") with pytest.raises(SystemExit) as exc: providers.generate("x") @@ -175,3 +178,14 @@ def test_no_model_raises_with_hint(monkeypatch): assert "SIMPLICIO_MODEL" in msg assert "claude-cli" in msg assert "codex-cli" in msg + assert "local-llama" in msg + + +def test_no_config_at_all_routes_to_local_default(monkeypatch): + # No model AND no base -> offline-first local default (Path 4), not a raise. + monkeypatch.delenv("SIMPLICIO_MODEL", raising=False) + monkeypatch.delenv("SIMPLICIO_BASE_URL", raising=False) + monkeypatch.setattr( + providers, "_local_generate", lambda p, f, m, mt: f"local:{m}" + ) + assert providers.generate("x") == "local:local-llama/default"