diff --git a/.env.example b/.env.example
index 18164a8..21b7f48 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,27 @@
-# Copy to .env and fill in. .env is gitignored.
+# 复制为 .env 后填写；.env 已在 .gitignore 中忽略
 TCAPTCHA_BASE_URL=https://t.captcha.qq.com
-TCAPTCHA_LLM_API_KEY=sk-your-relay-key-here
-TCAPTCHA_LLM_BASE_URL=https://your-relay.example.com
+
+# --- LLM 视觉求解器 ---
+# 只有 image_select pipeline 需要；word_click 已切换到本地 YOLO + Siamese
+# （见 `--extra word-click`），除非你要用 image_select，否则下面几项留空即可。
+TCAPTCHA_LLM_API_KEY=
+TCAPTCHA_LLM_BASE_URL=
 TCAPTCHA_LLM_MODEL=gpt-5.4
 TCAPTCHA_LLM_TIMEOUT=30
+
+# --- word_click / ONNX Runtime 调优 ---
+# 执行后端：默认 "auto" 按 CUDA > ROCm > DML > CoreML > CPU 的顺序挑选。
+# macOS 下 CoreML 首次图编译较慢，通常固定为 "cpu" 更快。
+# 可选值：auto | cpu | cuda | rocm | dml | coreml
+# TCAPTCHA_ORT_BACKEND=cpu
+#
+# ORT intra-op 线程数，默认 min(4, os.cpu_count())。
+# 52×52 的 Siamese 模型在超过 4 线程后反而会变慢，未经实测请勿调整。
+# TCAPTCHA_ORT_INTRA_OP_THREADS=4
+
+# --- serve 模式（crack-tcaptcha serve） ---
+# POST /solve 的共享密钥；设置后客户端必须在请求头带 `X-SK`。
+# TCAPTCHA_SERVE_SK=change-me
+# TCAPTCHA_SERVE_HOST=127.0.0.1
+# TCAPTCHA_SERVE_PORT=9991
+# TCAPTCHA_SERVE_WORKERS=4
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..d355526
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+src/crack_tcaptcha/solvers/models/*.onnx filter=lfs diff=lfs merge=lfs -text
+src/crack_tcaptcha/solvers/models/*.ttf filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index f441298..77ec3fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,5 +13,4 @@ site
 *.so
 *.whl
 .env
-*.onnx
 origin_papers/
diff --git a/AGENTS.md b/AGENTS.md
index c0d5d75..1c4fc26 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -24,7 +24,8 @@ Python >= 3.10, `uv` is the canonical package manager.
 uv sync
 
 # Install with optional extras
-uv sync --extra icon-click   # adds ddddocr + onnxruntime (needed for icon_click and word_click)
+uv sync --extra icon-click   # ddddocr + onnxruntime (icon_click pipeline)
+uv sync --extra word-click   # onnxruntime + opencv-python-headless + ddddocr (word_click pipeline, local YOLO+Siamese)
 uv sync --extra dev          # pytest, respx, ruff, hypothesis
 uv sync --extra docs         # mkdocs-material
 
@@ -40,9 +41,15 @@ uv run pytest tests/pipelines/ -q         # a single directory
 uv run ruff check .
 uv run ruff format .
 
-# CLI
+# CLI — one-shot
 uv run crack-tcaptcha solve --appid YOUR_APPID --entry-url https://your-site.example/login
 
+# CLI — long-running HTTP service (recommended for repeated use; models load once)
+uv run crack-tcaptcha serve --port 9991 --workers 4
+#   POST http://127.0.0.1:9991/solve  {"appid":"YOUR_APPID","retries":3}
+#   GET  http://127.0.0.1:9991/health
+#   set TCAPTCHA_SERVE_SK to require an X-SK header.
+
 # Docs
 uv run mkdocs serve
 ```
@@ -53,7 +60,8 @@ uv run mkdocs serve
 src/crack_tcaptcha/
 ├── __init__.py          # public API: solve()
 ├── captcha_type.py      # pure-function classifier (dyn_show_info → type)
-├── cli.py               # argparse entry point
+├── cli.py               # argparse entry point (solve / serve subcommands)
+├── server.py            # long-running HTTP service (stdlib http.server)
 ├── client.py            # HTTP three-phase + JSONP unwrap (scrapling / curl_cffi)
 ├── exceptions.py        # NetworkError, SolveError, PowError, TDCError
 ├── models.py            # pydantic models for prehandle / verify responses
@@ -64,10 +72,13 @@ src/crack_tcaptcha/
 │   ├── _common.py       # run_async, finish_with_verify (shared tail)
 │   ├── slide.py         # NCC template match
 │   ├── icon_click.py    # ddddocr detect + template match
-│   ├── word_click.py    # ddddocr detect + LLM vision (+ OCR fallback)
+│   ├── word_click.py    # YOLO detect + Siamese match (local ONNX); ddddocr OCR fallback
 │   └── image_select.py  # LLM region matching
 ├── solvers/
-│   └── llm_vision.py    # OpenAI-compatible vision client
+│   ├── ort_provider.py  # ONNX Runtime execution-provider selection
+│   ├── word_ocr.py      # YOLOv8 + Siamese solver for word_click (fast path)
+│   ├── llm_vision.py    # OpenAI-compatible vision client (image_select only)
+│   └── models/          # bundled ONNX models + font.ttf (force-included in wheel)
 └── tdc/
     ├── provider.py      # TDCProvider Protocol (DI point)
     ├── nodejs_jsdom.py  # Node.js subprocess implementation
@@ -77,6 +88,8 @@ src/crack_tcaptcha/
 Dependency direction is strictly top-down: `pipelines/` depends on
 `solvers/`, `tdc/`, `client.py`, `pow.py`, `trajectory.py`. `solvers/` and
 `tdc/` are independent of each other and must not import from `pipelines/`.
+`server.py` depends on `__init__.solve` and may trigger `solvers/word_ocr.warmup`
+at startup — it must not import from `pipelines/` directly.
 
 ## 4. Key Conventions
 
@@ -124,9 +137,19 @@ Dependency direction is strictly top-down: `pipelines/` depends on
   `DynAnswerType_UC`, `elem_id=""`, `data="<region_id>"`.
 - **Trajectory jitter.** Ease-in-out cubic with ±1 px jitter currently
   passes. Perfectly smooth trajectories get detected.
-- **LLM retry semantics.** `locate_chars` / `match_region` each retry once
+- **LLM retry semantics.** `match_region` (image_select) retries once
   internally on transport errors. Outer retries are the pipeline's
   `max_retries` (entire prehandle → verify loop).
+- **word_click model files are bundled.** `src/crack_tcaptcha/solvers/models/`
+  ships `word_click_detector.onnx` (YOLOv8, 10 MB),
+  `word_click_matcher.onnx` (Siamese, 29 MB), and `font.ttf` (4.6 MB).
+  These are `force-include`d into the wheel via hatch config. Don't
+  rename them without updating `word_ocr.py` and `pyproject.toml`.
+- **ORT cold-start hides behind warmup.** `crack-tcaptcha solve` spawns a
+  background thread that calls `solvers.word_ocr.warmup()` while the
+  first HTTP round-trip is in flight; `crack-tcaptcha serve` warms up at
+  boot. On macOS, `TCAPTCHA_ORT_BACKEND=cpu` is usually faster than the
+  default CoreML auto-pick because CoreML pays a one-off graph compile.
 
 ## 6. Testing Guidelines
 
@@ -173,11 +196,15 @@ Dependency direction is strictly top-down: `pipelines/` depends on
 
 - **Node.js >= 18** for the TDC.js bridge (`tdc/js/tdc_executor.js`,
   runs `tdc.js` inside jsdom). Install deps with `cd src/crack_tcaptcha/tdc/js && npm install`.
-- **`ddddocr`** (optional extra `icon-click`) for icon/character
-  detection. Required by `icon_click` and `word_click` pipelines. Pulls
-  in `onnxruntime`.
-- **OpenAI-compatible LLM relay** for `word_click` (recommended) and
-  `image_select` (required). Configure via `TCAPTCHA_LLM_API_KEY`,
+- **`ddddocr`** (optional extra `icon-click`, and part of `word-click`)
+  for icon / character detection. Required by `icon_click` and used as
+  the `word_click` fallback path. Pulls in `onnxruntime`.
+- **`onnxruntime` + `opencv-python-headless`** (optional extra
+  `word-click`, alongside `ddddocr`). Required for the primary
+  `word_click` path (local YOLOv8 detector + Siamese matcher shipped
+  under `solvers/models/`). No external API calls.
+- **OpenAI-compatible LLM relay** for `image_select` (required). No
+  longer required for `word_click`. Configure via `TCAPTCHA_LLM_API_KEY`,
   `TCAPTCHA_LLM_BASE_URL`, `TCAPTCHA_LLM_MODEL`, `TCAPTCHA_LLM_TIMEOUT`
   in `.env`. Any `/v1/chat/completions` endpoint that accepts
   `image_url` content blocks works.
diff --git a/CLAUDE.md b/CLAUDE.md
index 02da5ee..6a01359 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -65,3 +65,21 @@ after — don't batch updates.
   against live risk-control signals)
 - Deleting or renaming files under `tdc/js/` (tdc.js is vendored
   intentionally)
+- Deleting, renaming, or re-quantizing files under
+  `src/crack_tcaptcha/solvers/models/` (bundled ONNX models + font are
+  force-included into the wheel; a rename means editing `word_ocr.py`
+  and `pyproject.toml` in lockstep)
+
+### word_click / serve mode
+
+- Primary `word_click` path is local (YOLO detector + Siamese matcher
+  ONNX models under `solvers/models/`). LLM is no longer required.
+- When iterating on `solvers/word_ocr.py`, prefer the serve mode to
+  avoid per-run ONNX cold-start:
+  ```bash
+  uv run crack-tcaptcha serve --port 9991 --workers 2
+  # then hit POST /solve repeatedly
+  ```
+- On macOS, if solve feels slow, check provider selection: CoreML EP
+  pays a per-process graph-compile cost. Force CPU with
+  `TCAPTCHA_ORT_BACKEND=cpu` when benchmarking.
diff --git a/README.md b/README.md
index 6bfb3dd..6d9442e 100644
--- a/README.md
+++ b/README.md
@@ -5,26 +5,29 @@
 
 ![verify ok](images/word-click-success.png)
 
-> 上图为 word_click（文字点选）流水线的真实运行日志：从 `prehandle` → `getcapbysig` 下载背景图 → LLM 视觉给出点击坐标 → `nodejs_jsdom` 采集 TDC collect / eks / pow → `cap_union_new_verify` 一次通过，`ok=true`。
+> 上图为 word_click（文字点选）流水线的真实运行日志：从 `prehandle` → `getcapbysig` 下载背景图 → **本地 YOLO + Siamese 模型**给出点击坐标 → `nodejs_jsdom` 采集 TDC collect / eks / pow → `cap_union_new_verify` 一次通过，`ok=true`。
 
 ## 特性
 
 - **4 种验证类型**：`slider`（滑块）、`icon_click`（图标点击）、`word_click`（文字点选）、`image_select`（图像选择）
 - **无头浏览器依赖**：`nodejs_jsdom` 在 Node.js 进程里用 jsdom 跑官方 TDC.js，生成 `collect / eks / tokenid / pow_answer`
-- **策略化求解器**：滑块使用 OpenCV 模板匹配；点击类支持 `ddddocr` / 任意 OpenAI 兼容的 LLM vision
+- **策略化求解器**：滑块使用 OpenCV 模板匹配；`word_click` 走本地 **YOLOv8 检测 + Siamese 匹配**（纯 ONNX Runtime，单次 ~200 ms）；`icon_click` 使用 `ddddocr`；`image_select` 使用 OpenAI 兼容 LLM vision
+- **常驻 HTTP 服务**：`crack-tcaptcha serve` 让模型只加载一次，每次求解只付推理时间（零进程冷启动）
 - **工程化**：pydantic-settings 配置、结构化日志、CLI、pytest，类型完整
 
 ## 当前测试状态
 
 | 类型 | 状态 | 备注 |
 |---|---|---|
-| `word_click`（文字点选） | ✅ 已跑通（见上图） | LLM vision 映射字→bbox，一次通过 |
+| `word_click`（文字点选） | ✅ 已跑通（见上图） | 本地 YOLO + Siamese 模型，一次通过 |
 | `slider`（滑块） | 🧪 未充分验证 | pipeline 已实现，仅做过少量手工测试 |
 | `icon_click`（图标点击） | 🧪 未充分验证 | pipeline 已实现，依赖 `ddddocr`，待回归 |
 | `image_select`（图像选择） | 🧪 未充分验证 | pipeline 已实现，待回归 |
 
 > 目前项目重点打磨 `word_click`，其它类型欢迎 PR 补测试样本 / 回归用例。
 
+> 📎 **历史方案**：早期 `word_click` 使用 GPT / OpenAI 兼容 LLM vision 接口识别文字坐标的实现，已保留在 [`legacy-llm-vision`](../../tree/legacy-llm-vision) 分支，供参考或回退使用。主分支已切换为本地 YOLOv8 + Siamese 纯 ONNX 方案，无需任何外部大模型 API。
+
 ## 安装
 
 ### 按需求选择
@@ -33,26 +36,32 @@
 # 最小安装：仅 slider pipeline（HTTP + 轨迹生成，无 ML 依赖）
 uv add crack-tcaptcha
 
-# 推荐：图标点击 + 文字点选（word_click 也依赖 ddddocr）
+# 文字点选（本地 YOLO + Siamese 模型；含 ddddocr 作为 OCR 兜底）
+uv add "crack-tcaptcha[word-click]"
+
+# 图标点击（仅 ddddocr）
 uv add "crack-tcaptcha[icon-click]"
 
 # 中文图像选择（cn-clip / torch，下载模型约数百 MB）
 uv add "crack-tcaptcha[clip]"
 
-# 全功能一键装（= icon-click + clip）
+# 全功能一键装（= word-click + icon-click + clip）
 uv add "crack-tcaptcha[all]"
 ```
 
-也可以用 `pip` 替代 `uv add`，语法一致：`pip install 'crack-tcaptcha[icon-click]'`。
+也可以用 `pip` 替代 `uv add`，语法一致：`pip install 'crack-tcaptcha[word-click]'`。
 
 | Extra | 引入依赖 | 启用的 pipeline |
 |---|---|---|
-| _(none)_ | 仅 httpx / pydantic / numpy / Pillow | `slider` |
-| `icon-click` | `ddddocr`（+ onnxruntime） | `icon_click`、`word_click` |
+| _(none)_ | httpx / pydantic / numpy / Pillow / scrapling | `slider` |
+| `icon-click` | `ddddocr`（+ onnxruntime） | `icon_click` |
+| `word-click` | `onnxruntime` + `opencv-python-headless` + `ddddocr` | `word_click`（本地 YOLO + Siamese，OCR 兜底） |
 | `clip` | `cn2an`、`cn-clip`、`torch` | `image_select`（CLIP backend） |
 | `all` | 以上全部 | 所有 pipeline |
 
-> 运行 `word_click` / `icon_click` 前未装 `[icon-click]` 会得到清晰的 ModuleNotFoundError 提示。
+> 未装 `[word-click]` / `[icon-click]` 时，对应 pipeline 抛出清晰的 `SolveError` 提示应装哪个 extra。
+
+> `word_click` 的本地模型（`word_click_detector.onnx` 10 MB、`word_click_matcher.onnx` 29 MB、`font.ttf` 4.6 MB）已随 wheel 打包，安装后开箱即用，无需额外下载。
 
 ### 前置要求
 
@@ -83,13 +92,30 @@ if result.ok:
 ### 命令行
 
 ```bash
-# 通用求解：--appid 替换为你自己的 APP_ID
+# 一次性求解：--appid 替换为你自己的 APP_ID
 crack-tcaptcha solve --appid YOUR_APPID --retries 3 --json
 
 # 指定来源页（会带上对应 Referer / Origin）
 crack-tcaptcha solve --appid YOUR_APPID --entry-url https://example.com/login --json
 ```
 
+### 常驻 HTTP 服务（推荐用于重复调用）
+
+一次性 CLI 每次运行都要冷启动 Python 并重新加载 ONNX 模型，单次可能要多花几秒。常驻模式下模型只加载一次，后续请求只需付出推理时间：
+
+```bash
+# 启动（鉴权可选：导出 TCAPTCHA_SERVE_SK 后客户端需带 X-SK header）
+export TCAPTCHA_SERVE_SK=change-me
+crack-tcaptcha serve --port 9991 --workers 4
+
+# 客户端：POST /solve
+curl -H 'X-SK: change-me' -X POST http://127.0.0.1:9991/solve \
+    -d '{"appid":"YOUR_APPID","retries":3}'
+
+# 健康检查
+curl http://127.0.0.1:9991/health
+```
+
 > 命令行示例中的 `YOUR_APPID` 仅为占位符，请替换为你自己的 appid；仓库不提供任何真实业务 appid。
 
 ## 本地测试页
@@ -111,18 +137,24 @@ crack-tcaptcha solve --appid YOUR_APPID --entry-url http://localhost:8765/tcap2_
 
 ```
 src/crack_tcaptcha/
-├── client.py              # HTTPX 客户端：prehandle / getcapbysig / verify
+├── client.py              # HTTP 三段式：prehandle / getcapbysig / verify
+├── cli.py                 # argparse 入口（solve / serve 子命令）
+├── server.py              # 常驻 HTTP 服务
 ├── pow.py                 # PoW 求解
-├── trajectory.py          # 轨迹/点击序列合成
+├── trajectory.py          # 轨迹 / 点击序列合成
 ├── captcha_type.py        # 类型分发路由
 ├── pipelines/             # 每种验证类型一个 pipeline
 │   ├── slide.py
 │   ├── icon_click.py
-│   ├── word_click.py      # 文字点选（对应截图演示）
+│   ├── word_click.py      # 本地 YOLO 检测 + Siamese 匹配（含 ddddocr 兜底）
 │   └── image_select.py
-├── solvers/llm_vision.py  # OpenAI 兼容 LLM 视觉求解器
+├── solvers/
+│   ├── ort_provider.py    # ORT execution-provider 选择（CUDA/ROCm/DML/CoreML/CPU）
+│   ├── word_ocr.py        # YOLO + Siamese 求解器（word_click 主路径）
+│   ├── llm_vision.py      # OpenAI 兼容 LLM vision（image_select 用）
+│   └── models/            # 打包的 ONNX 模型 + font.ttf
 └── tdc/                   # TDC.js 桥
-    ├── js/                # npm install 后放 node_modules
+    ├── js/                # npm install 后的 node_modules
     └── nodejs_jsdom.py    # jsdom NodeProvider
 ```
 
@@ -141,9 +173,18 @@ src/crack_tcaptcha/
 | `TCAPTCHA_TDC_TIMEOUT` | `60.0` | TDC.js 桥超时 |
 | `TCAPTCHA_TDC_DEBUG` | `false` | 打开后保留 jsdom 调试日志 |
 | `TCAPTCHA_PROXY` | `None` | `http://user:pass@host:port` |
-| `TCAPTCHA_LLM_API_KEY` | `""` | LLM vision 求解器（`image_select` / `word_click`） |
+| `TCAPTCHA_LLM_API_KEY` | `""` | LLM vision 求解器（仅 `image_select` 需要） |
 | `TCAPTCHA_LLM_BASE_URL` | `""` | OpenAI 兼容接口根 |
 | `TCAPTCHA_LLM_MODEL` | `gpt-5.4` | 模型名 |
+| `TCAPTCHA_ORT_BACKEND` | `auto` | ONNX 执行后端：`auto` / `cpu` / `cuda` / `rocm` / `dml` / `coreml` |
+| `TCAPTCHA_ORT_INTRA_OP_THREADS` | `min(4, cpu_count)` | ORT 线程数（Siamese 在 >4 时反而更慢） |
+| `TCAPTCHA_SERVE_SK` | `""` | 常驻服务鉴权 secret；非空时请求必须带 `X-SK` header |
+| `TCAPTCHA_SERVE_HOST` | `127.0.0.1` | `serve` 子命令监听地址 |
+| `TCAPTCHA_SERVE_PORT` | `9991` | `serve` 子命令监听端口 |
+| `TCAPTCHA_SERVE_WORKERS` | `4` | `serve` 并发 solve 上限 |
+
+> macOS 下若首次求解明显慢，通常是 CoreML 后端的图编译开销；
+> 导出 `TCAPTCHA_ORT_BACKEND=cpu` 往往比默认更快。
 
 ## 开发
 
@@ -154,47 +195,6 @@ uv run pytest -x -ra
 uv run pytest -m "not network"   # 跳过联网用例
 ```
 
-## 推荐 — 用本地模型替换 LLM vision
-
-当前 `word_click` / `image_select` 走 OpenAI 兼容接口，单次推理 **1~3 s** 起步（受网络、排队、token 数影响），是整条链路里最慢的一步。
-本地模型可以把这一步压到 **≤200 ms**，且无调用成本 / 限流 / 数据出站风险。
-
-两类任务本质都是 **"把一张图映射到一个确定的类别 / 索引"**，不需要真正的生成式 VLM：
-
-### 方案 A：PaddleOCR + 轻量匹配（推荐）
-
-| 子任务 | 本地替代 |
-|---|---|
-| `word_click`：识别背景图 3 个 bbox 里各是什么汉字 | **PaddleOCR** (`ch_PP-OCRv4`)，单字裁剪后 OCR → 与指令中的字做字符串匹配 |
-| `image_select`：在 N 宫格里挑"哪个是苹果" | **PaddleClas PP-LCNet / PP-ShiTu** 或 **cn-clip ViT-B/16**（已列在 `[clip]` extras） |
-
-优点：CPU 可跑、模型 <20 MB、推理 10~50 ms；PaddleOCR 对中文场景文字鲁棒性很好。
-
-### 方案 B：CLIP 类零样本匹配
-
-直接复用仓库里已经声明过的 `cn-clip` 依赖：
-
-```bash
-uv add "crack-tcaptcha[clip]"
-```
-
-- `word_click`：把每个 bbox 裁剪图与 "一张写着'X'字的图" 做 image-text 相似度 argmax（但中文单字 CLIP 准确率一般，建议配合 OCR 投票）
-- `image_select`：把指令"请选出所有包含苹果的图片"直接作为 text query，对 N 个格子打分排序，取 top-k
-
-优点：一个模型吃下所有"图→文"匹配场景；缺点：模型 ~400 MB，冷启动有成本。
-
-### 方案 C：ddddocr + 本地分类头（最轻）
-
-- `icon_click` 已经在用 `ddddocr`；`word_click` 的 bbox 识别也可以换 `ddddocr.DdddOcr(det=False)`（纯 OCR 模式）
-- 对 `image_select` 训一个 **PP-LCNet** 分类头（常见类别就那几类：动物、交通工具、食物...）+ "其它"兜底走 CLIP
-
-### 落地建议
-
-1. 在 `solvers/` 下新增 `paddle_ocr.py` 和 `cn_clip.py`，实现与 `llm_vision.py` 同签名（`match_region` / `locate_chars`）
-2. 在 `settings.py` 加 `solver_backend: Literal["llm", "paddle", "clip", "ddddocr"] = "llm"`
-3. pipeline 启动时根据 backend 路由，保留 LLM 作为兜底（本地模型置信度 < 阈值时回退）
-4. 评估指标：单验证码平均耗时、端到端通过率、CPU / 显存占用，基准样本集可用 `tests/samples/`
-
 ## 免责声明
 
 本项目 **仅用于个人安全研究、技术学习与学术交流**，不代表任何商业机构的立场。
diff --git a/docs/architecture.md b/docs/architecture.md
index 807fd2e..c4b0737 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -5,6 +5,7 @@
 ```
 ┌───────────────────────────────────────────────────────────────┐
 │  solve(appid, ...)                  ← crack_tcaptcha/__init__ │
+│  crack-tcaptcha serve               ← crack_tcaptcha/server.py │
 └──────────────┬────────────────────────────────────────────────┘
                │  classify(dyn)  →  captcha_type
                ▼
@@ -12,17 +13,25 @@
 │  pipelines/                                                    │
 │    ├─ slide.py         ── OpenCV NCC                           │
 │    ├─ icon_click.py    ── ddddocr                              │
-│    ├─ word_click.py    ── ddddocr + llm_vision                 │
+│    ├─ word_click.py    ── word_ocr (YOLO + Siamese)            │
+│    │                       (回退: ddddocr)                     │
 │    ├─ image_select.py  ── llm_vision                           │
 │    └─ _common.py       ── finish_with_verify / run_async       │
 └──────────────┬────────────────────────────────────────────────┘
                │ uses                                   uses
                ▼                                        ▼
-   ┌──────────────────────┐             ┌──────────────────────┐
-   │ solvers/             │             │ tdc/                 │
-   │   └─ llm_vision.py   │             │   ├─ provider.py     │
-   │      (OpenAI 兼容)    │             │   └─ nodejs_jsdom.py │
-   └──────────────────────┘             └──────────────────────┘
+   ┌──────────────────────────────┐     ┌──────────────────────┐
+   │ solvers/                      │     │ tdc/                 │
+   │   ├─ ort_provider.py          │     │   ├─ provider.py     │
+   │   ├─ word_ocr.py              │     │   └─ nodejs_jsdom.py │
+   │   │    (YOLO + Siamese ONNX)  │     │                      │
+   │   ├─ llm_vision.py            │     │                      │
+   │   │    (OpenAI 兼容)           │     │                      │
+   │   └─ models/                  │     │                      │
+   │       word_click_detector.onnx│     │                      │
+   │       word_click_matcher.onnx │     │                      │
+   │       font.ttf                │     │                      │
+   └──────────────────────────────┘     └──────────────────────┘
                │                                        │
                └──────────┬─────────────────────────────┘
                           ▼
@@ -35,7 +44,7 @@
       └──────────────────────────────────────────┘
 ```
 
-依赖方向严格自上而下：`pipelines` 依赖 `solvers`、`tdc`、`client`；`solvers` 和 `tdc` 互不依赖。
+依赖方向严格自上而下：`pipelines` 依赖 `solvers`、`tdc`、`client`；`solvers` 和 `tdc` 互不依赖。`server.py` 只依赖 `__init__.solve` 与 `solvers/word_ocr.warmup`，不直接导入 `pipelines/`。
 
 ## 三段式协议
 
@@ -78,7 +87,7 @@ POST 字段：`ans`、`pow_answer`、`pow_calc_time`、`collect`、`tlg`、`eks`
 |---|---|---|---|
 | `slide` | `solvers`（内嵌 NCC，见 `pipelines/slide.py` 的 `SliderSolver`） | —— | `numpy`、`Pillow` |
 | `icon_click` | `ddddocr` 检测 + 模板匹配 | —— | `ddddocr`（extra `icon-click`） |
-| `word_click` | `solvers/llm_vision.locate_chars` | `ddddocr` 按 bbox 分类 + 子串匹配 | `ddddocr` + OpenAI 兼容 API |
+| `word_click` | `solvers/word_ocr.locate_chars_by_siamese`（YOLO + Siamese ONNX，本地） | `ddddocr` 检测 + 分类兜底 | `onnxruntime` + `opencv-python-headless` + `ddddocr`（extra `word-click`） |
 | `image_select` | `solvers/llm_vision.match_region` | —— | OpenAI 兼容 API |
 
 自动路由：`captcha_type.classify(dyn_show_info)` 是一个纯函数分类器，按规则顺序返回 `slide` / `icon_click` / `word_click` / `image_select` / `unknown`。规则命中后 `pipelines.dispatch` 将请求分发到对应的 pipeline。
diff --git a/docs/index.md b/docs/index.md
index d5e25e4..a0885a1 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -8,7 +8,7 @@
 |---|---|---|
 | 滑块 `slider` | OpenCV NCC 模板匹配 | [slider.md](slider.md) |
 | 图标点击 `icon_click` | ddddocr 检测 + 模板匹配 | [icon-click.md](icon-click.md) |
-| 文字点选 `word_click` | ddddocr 检测 + LLM vision | [word-click.md](word-click.md) |
+| 文字点选 `word_click` | 本地 YOLO + Siamese（ddddocr 兜底） | [word-click.md](word-click.md) |
 | 图像选择 `image_select` | LLM vision | [image-select.md](image-select.md) |
 
 ## 快速导航
diff --git a/docs/word-click.md b/docs/word-click.md
index 2c8406c..216d642 100644
--- a/docs/word-click.md
+++ b/docs/word-click.md
@@ -6,33 +6,71 @@ TCaptcha 2.0 的文字点选挑战。提示词（目标汉字序列）内嵌在
 
 1. `prehandle` 返回 `instruction`（含目标汉字）、背景图 URL、`pow_cfg`；`fg_elem_list` 为空
 2. 下载背景图（`bg_elem_cfg.size_2d = [672, 480]`）
-3. `ddddocr` 在背景上检测候选汉字 bbox
-4. **主路径：LLM vision** —— 给每个候选 bbox 画红框 + 编号（1..N），和目标汉字一起发给 LLM，要求返回 `{char: box_index}` JSON
-5. **回退路径：ddddocr OCR 分类** —— 对 LLM 没映射上的汉字，逐 bbox 跑 `ddddocr.classification` 做子串匹配
-6. 按提示顺序把每个汉字映射到 bbox 中心坐标
-7. 生成点击轨迹，通过 TDC 桥取 `collect` / `eks`
-8. POST `verify`
+3. **主路径：YOLOv8 检测 + Siamese 匹配（本地 ONNX，纯 CPU 约 200 ms）**
+   1. `word_click_detector.onnx`（YOLOv8）在背景图上定位所有字符 bbox
+   2. 用打包在仓库里的 `font.ttf` 把 `instruction` 里每个目标汉字渲染成 52×52 参考图
+   3. 对每一对 `(bbox 裁剪图, 目标参考图)` 跑 `word_click_matcher.onnx`（Siamese）得相似度
+   4. 对每个目标按指令顺序贪心取当前未使用的最高分 bbox
+4. **回退路径：ddddocr**（仅在 YOLO 检测结果为 0、或未安装 `[word-click]` extra 时走） —— `detection + classification` 子串匹配
+5. 按提示顺序把每个汉字映射到 bbox 中心坐标
+6. 生成点击轨迹，通过 TDC 桥取 `collect` / `eks`
+7. POST `verify`
+
+详见 `src/crack_tcaptcha/pipelines/word_click.py` 与 `src/crack_tcaptcha/solvers/word_ocr.py`。
+
+## 模型与依赖
+
+装 `word-click` extra：
+
+```bash
+uv sync --extra word-click
+# 或
+pip install 'crack-tcaptcha[word-click]'
+```
+
+这会安装：
 
-详见 `src/crack_tcaptcha/pipelines/word_click.py`。
+| 包 | 作用 |
+|---|---|
+| `onnxruntime` | 跑 YOLO + Siamese ONNX |
+| `opencv-python-headless` | 图像预处理（letterbox、裁剪、colorspace） |
+| `ddddocr` | 回退路径（YOLO 检测失败时兜底） |
 
-## LLM 配置
+模型文件已通过 hatch `force-include` 打包进 wheel，开箱即用：
 
-求解器实现在 `src/crack_tcaptcha/solvers/llm_vision.py`（`locate_chars` 函数），走 OpenAI 兼容的 `/v1/chat/completions` 接口。
+| 文件 | 大小 | 作用 |
+|---|---|---|
+| `solvers/models/word_click_detector.onnx` | 10 MB | YOLOv8 检测器，输出字符 bbox |
+| `solvers/models/word_click_matcher.onnx` | 29 MB | Siamese 相似度网络（输入 52×52×3×2，输出 1 个相似度） |
+| `solvers/models/font.ttf` | 4.6 MB | 目标字渲染字体（颜色 BGR `(56,178,227)` 模拟验证码配色） |
 
-需要的环境变量（pydantic-settings 自动读 `.env` 或环境变量）：
+## 性能调优
 
-| 环境变量 | 说明 | 默认值 |
+ONNX Runtime 执行后端通过环境变量选择；默认值 `auto` 按 `CUDA > ROCm > DML > CoreML > CPU` 的优先顺序挑选：
+
+| 环境变量 | 默认 | 说明 |
 |---|---|---|
-| `TCAPTCHA_LLM_API_KEY` | Bearer token | `""`（未配置时 LLM 路径会跳过） |
-| `TCAPTCHA_LLM_BASE_URL` | 不含 `/v1/...` 的基础 URL | `""` |
-| `TCAPTCHA_LLM_MODEL` | 模型名 | `gpt-5.4` |
-| `TCAPTCHA_LLM_TIMEOUT` | HTTP 超时秒数 | `30` |
+| `TCAPTCHA_ORT_BACKEND` | `auto` | 强制后端：`cpu` / `cuda` / `rocm` / `dml` / `coreml` / `auto` |
+| `TCAPTCHA_ORT_INTRA_OP_THREADS` | `min(4, cpu_count)` | ORT 线程数；Siamese 太小，>4 反而更慢 |
+
+**macOS 提示**：默认会选 CoreML EP，首次推理需要一次性的 graph-compile 开销（几秒），总耗时可能比纯 CPU 还高。重复调用请用常驻服务模式，或 `export TCAPTCHA_ORT_BACKEND=cpu` 固定 CPU。
 
-**支持的后端**：任何 OpenAI 兼容的中继服务（官方 OpenAI、Azure OpenAI 代理、自建 vLLM / llama.cpp 的 OpenAI shim 等）。请求体用 `image_url` 传 base64 图片，返回必须是能解析出 JSON 对象的文本。
+### 常驻服务模式
+
+一次性 CLI 每次都付 Python 启动 + ONNX 加载的冷启动成本。生产 / 重复调用请用 `serve` 子命令：
+
+```bash
+# 启动（模型在进程启动时只加载一次）
+TCAPTCHA_SERVE_SK=change-me crack-tcaptcha serve --port 9991 --workers 4
+
+# 客户端
+curl -H 'X-SK: change-me' -X POST http://127.0.0.1:9991/solve \
+    -d '{"appid":"YOUR_APPID","retries":3}'
+```
 
-**Prompt 位置**：`_build_word_click_prompt()` 和 `_build_prompt()` 函数在 `solvers/llm_vision.py` 顶部，prompt 硬编码在 Python 中，方便直接修改；调整后不需要改 pipeline。
+## LLM 不再需要
 
-如果没有配置 LLM（key / base_url 任一为空），pipeline 会走纯 ddddocr OCR 回退；经验上 ddddocr 对验证码字体识别率明显低于 vision LLM，推荐配置 LLM。
+之前版本主路径走 OpenAI 兼容 `/v1/chat/completions`；本地模型 + ddddocr 兜底足够覆盖所有已知 word_click 样本，LLM 相关环境变量（`TCAPTCHA_LLM_*`）只对 `image_select` 还有意义，可留空。
 
 ## Answer 格式
 
@@ -48,7 +86,9 @@ TCaptcha 2.0 的文字点选挑战。提示词（目标汉字序列）内嵌在
 
 ## 常见坑
 
-- **LLM 返回非 JSON** —— `solvers/llm_vision.py` 内部会做一次重试；仍失败则抛 `SolveError`，由 pipeline 走 ddddocr 回退
-- **bbox 数量少于目标字数** —— 说明 detection 漏检，记录 warning 后仍会尽力回答，但通过率会下降
 - **提示词里混入标点或空格** —— `_parse_target_chars` 用 `[\u4e00-\u9fff]` 正则只抓 CJK 汉字
-- **最后兜底** —— 如果所有方法都没给某个字找到 box，会把它分配到第一个未被使用的 bbox，至少保证一次可见的点击（而不是 `(0,0)` 的显式错误）
+- **bbox 数量少于目标字数** —— 说明 YOLO 漏检，pipeline 会记录 warning 后仍尽力回答；验证失败时由外层 `max_retries` 重试（重新走一遍 prehandle → verify）
+- **YOLO 返回 0 bbox** —— 自动走 ddddocr detect + classification 兜底
+- **CoreML EP 首次慢** —— 见上文「性能调优」一节；改用 `serve` 模式或设置 `TCAPTCHA_ORT_BACKEND=cpu` 即可解决
+- **模型文件被安全软件误删** —— 重装 `[word-click]` extra 即可；模型在 wheel 里
+- **最后兜底** —— 如果所有方法都没给某个字找到 bbox（极少见），会把它分配到第一个未被使用的 bbox，至少保证一次可见的点击（而不是 `(0,0)` 的显式错误）
diff --git a/pyproject.toml b/pyproject.toml
index 599c5c8..19d1514 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,13 +24,22 @@ dependencies = [
 
 [project.optional-dependencies]
 icon-click = ["ddddocr>=1.4"]
+word-click = [
+    "onnxruntime>=1.17",
+    "opencv-python-headless>=4.8",
+    "ddddocr>=1.4",
+]
 clip = ["cn2an>=0.5", "cn-clip>=1.0", "torch>=2.0"]
 all = [
     "ddddocr>=1.4",
+    "onnxruntime>=1.17",
+    "opencv-python-headless>=4.8",
     "cn2an>=0.5",
     "cn-clip>=1.0",
     "torch>=2.0",
 ]
+
+[dependency-groups]
 dev = [
     "pytest>=8.0",
     "pytest-asyncio>=0.23",
@@ -53,6 +62,18 @@ build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 packages = ["src/crack_tcaptcha"]
 
+[tool.hatch.build.targets.wheel.force-include]
+"src/crack_tcaptcha/solvers/models/word_click_detector.onnx" = "crack_tcaptcha/solvers/models/word_click_detector.onnx"
+"src/crack_tcaptcha/solvers/models/word_click_matcher.onnx" = "crack_tcaptcha/solvers/models/word_click_matcher.onnx"
+"src/crack_tcaptcha/solvers/models/font.ttf" = "crack_tcaptcha/solvers/models/font.ttf"
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "src/crack_tcaptcha/**",
+    "src/crack_tcaptcha/solvers/models/*.onnx",
+    "src/crack_tcaptcha/solvers/models/*.ttf",
+]
+
 [tool.ruff]
 line-length = 130
 target-version = "py310"
diff --git a/src/crack_tcaptcha/cli.py b/src/crack_tcaptcha/cli.py
index 2227842..4d43018 100644
--- a/src/crack_tcaptcha/cli.py
+++ b/src/crack_tcaptcha/cli.py
@@ -6,6 +6,21 @@
 import json
 import logging
 import sys
+import threading
+
+
+def _warmup_word_click() -> None:
+    """Best-effort: preload YOLO + Siamese sessions in background.
+
+    Runs concurrently with the first prehandle HTTP request so the model
+    load latency is hidden behind network wait time. Never raises.
+    """
+    try:
+        from crack_tcaptcha.solvers.word_ocr import warmup
+
+        warmup()
+    except Exception as exc:  # extra missing or model load failed — non-fatal, but leave a trace
+        logging.getLogger(__name__).debug("word_click warmup skipped: %r", exc)
 
 
 def main(argv: list[str] | None = None) -> None:
@@ -18,18 +33,49 @@ def main(argv: list[str] | None = None) -> None:
     parser = argparse.ArgumentParser(prog="crack-tcaptcha", description="TCaptcha automated solver")
     sub = parser.add_subparsers(dest="command")
 
-    solve_p = sub.add_parser("solve", help="Solve a TCaptcha challenge")
+    solve_p = sub.add_parser("solve", help="Solve a TCaptcha challenge (one-shot)")
     solve_p.add_argument("--appid", required=True, help="TCaptcha APP_ID")
     solve_p.add_argument("--retries", type=int, default=3, help="Max retry attempts")
     solve_p.add_argument("--entry-url", default="", help="Parent page URL (optional)")
     solve_p.add_argument("--json", action="store_true", dest="as_json", help="Output as JSON")
+    solve_p.add_argument(
+        "--no-warmup",
+        action="store_true",
+        help="Skip background ONNX model warmup (useful for benchmarking cold-start)",
+    )
+
+    serve_p = sub.add_parser(
+        "serve",
+        help="Run a long-lived HTTP server (models load once; best for repeated use)",
+    )
+    serve_p.add_argument("--host", default="127.0.0.1")
+    serve_p.add_argument("--port", type=int, default=9991)
+    serve_p.add_argument("--workers", type=int, default=4, help="Max concurrent solves")
 
     args = parser.parse_args(argv)
 
+    if args.command == "serve":
+        import os as _os
+
+        from crack_tcaptcha.server import run as serve_run
+
+        serve_run(
+            host=args.host,
+            port=args.port,
+            workers=args.workers,
+            sk=_os.environ.get("TCAPTCHA_SERVE_SK") or None,
+        )
+        return
+
     if args.command != "solve":
         parser.print_help()
         sys.exit(1)
 
+    # Kick off model warmup in a daemon thread so it overlaps with the
+    # first HTTP round-trip.
+    if not args.no_warmup:
+        threading.Thread(target=_warmup_word_click, name="word_click-warmup", daemon=True).start()
+
     from crack_tcaptcha import solve
 
     result = solve(appid=args.appid, max_retries=args.retries, entry_url=args.entry_url)
diff --git a/src/crack_tcaptcha/pipelines/word_click.py b/src/crack_tcaptcha/pipelines/word_click.py
index cc113bc..44d299c 100644
--- a/src/crack_tcaptcha/pipelines/word_click.py
+++ b/src/crack_tcaptcha/pipelines/word_click.py
@@ -6,11 +6,13 @@
   - fg_elem_list: [] (hint chars are inline in instruction text)
   - bg_elem_cfg.size_2d: [672, 480]
 
-Solver pipeline:
-  1. ddddocr detection → candidate char bboxes on bg
-  2. LLM vision (primary): sends annotated bg + target chars → gets {char: bbox_idx}
-  3. ddddocr OCR (fallback): if LLM unavailable or returns 0/miss for a char,
-     run per-bbox ddddocr classification and substring-match.
+Solver pipeline (primary path, local, ~50-200ms CPU):
+  1. YOLOv8 detection → candidate char bboxes on bg
+  2. Siamese similarity: render each target char with the bundled font
+     and match it against every bbox crop; pick highest-scoring unused bbox.
+
+Fallback (when the word-click extra is not installed or the YOLO stage
+returns 0 bboxes): the legacy ddddocr detection + per-bbox OCR path.
 
 Answer format (confirmed against real Chrome verify body):
   [{"elem_id": 1, "type": "DynAnswerType_POS", "data": "x,y"}, ...]
@@ -27,7 +29,6 @@
 from crack_tcaptcha.models import PrehandleResp, VerifyResp
 from crack_tcaptcha.pipelines._common import finish_with_verify
 from crack_tcaptcha.pow import solve_pow
-from crack_tcaptcha.settings import settings
 from crack_tcaptcha.tdc.provider import TDCProvider
 from crack_tcaptcha.trajectory import generate_click_trajectory, merge_trajectories
 
@@ -40,71 +41,16 @@ def _parse_target_chars(instruction: str) -> list[str]:
     return re.findall(r"[\u4e00-\u9fff]", after)
 
 
-def _bbox_center(bbox: tuple[int, int, int, int]) -> tuple[int, int]:
-    x1, y1, x2, y2 = bbox
-    return ((x1 + x2) // 2, (y1 + y2) // 2)
-
-
-def _fallback_match_by_ocr(
-    bg_bytes: bytes,
-    bboxes: list[tuple[int, int, int, int]],
-    targets: list[str],
-    already_assigned: dict[str, int],
-) -> dict[str, int]:
-    """Per-bbox ddddocr classify + substring match for chars LLM missed."""
-    import io
-
-    from PIL import Image
-
-    from crack_tcaptcha._legacy.icon_match import _get_ocr
-
-    missing = [ch for ch in targets if already_assigned.get(ch, 0) <= 0]
-    if not missing:
-        return already_assigned
-
-    ocr = _get_ocr()
-    bg_img = Image.open(io.BytesIO(bg_bytes)).convert("RGB")
-    bg_w, bg_h = bg_img.size
-    used_indices = {v for v in already_assigned.values() if v > 0}
-    bbox_ocr: dict[int, str] = {}
-    for i, (x1, y1, x2, y2) in enumerate(bboxes, start=1):
-        if i in used_indices:
-            continue
-        pad = 2
-        crop = bg_img.crop((max(0, x1 - pad), max(0, y1 - pad), min(bg_w, x2 + pad), min(bg_h, y2 + pad)))
-        buf = io.BytesIO()
-        crop.save(buf, "PNG")
-        try:
-            text = ocr.classification(buf.getvalue()) or ""
-        except Exception as e:  # pragma: no cover - defensive
-            log.warning("word_click fallback ocr error on bbox %d: %s", i, e)
-            text = ""
-        text = re.sub(r"[^\u4e00-\u9fff]", "", text)
-        bbox_ocr[i] = text
-    log.info("word_click fallback ocr on %d unused bboxes: %s", len(bbox_ocr), bbox_ocr)
-
-    result = dict(already_assigned)
-    for ch in missing:
-        for i, text in bbox_ocr.items():
-            if i in used_indices:
-                continue
-            if ch in text:
-                result[ch] = i
-                used_indices.add(i)
-                log.info("word_click fallback: %r → bbox %d via ocr=%r", ch, i, text)
-                break
-    # Final fallback: assign any remaining char to first unused bbox (visible click
-    # better than (0,0) which is guaranteed wrong).
-    for ch in targets:
-        if result.get(ch, 0) > 0:
-            continue
-        for i in range(1, len(bboxes) + 1):
-            if i not in used_indices:
-                result[ch] = i
-                used_indices.add(i)
-                log.info("word_click fallback: %r → bbox %d (last-resort)", ch, i)
-                break
-    return result
+def _fallback_ddddocr(bg_bytes: bytes, targets: list[str]) -> list[tuple[int, int]]:
+    """Legacy ddddocr fallback; raises SolveError when the legacy matcher cannot be imported."""
+    try:
+        from crack_tcaptcha._legacy.icon_match import match_words
+    except ImportError as e:
+        # Keep the actionable install hint instead of surfacing a bare ImportError.
+        raise SolveError("word_click requires ddddocr: `uv sync --extra icon-click`") from e
+
+    log.info("word_click: falling back to ddddocr match_words")
+    return match_words(bg_bytes, targets)
 
 
 def solve_one_attempt(
@@ -125,48 +71,16 @@ def solve_one_attempt(
         len(bg_bytes),
     )
 
+    # Primary path: local YOLO + Siamese (fast, no network, no API cost)
+    click_coords: list[tuple[int, int]]
     try:
-        from crack_tcaptcha._legacy.icon_match import detect_icons
-    except ImportError as e:
-        raise SolveError("word_click requires ddddocr: `uv sync --extra icon-click`") from e
-
-    bboxes = detect_icons(bg_bytes)
-    if len(bboxes) < len(targets):
-        log.warning(
-            "word_click: only %d bboxes detected for %d targets",
-            len(bboxes),
-            len(targets),
-        )
-    if not bboxes:
-        raise SolveError("word_click: detector returned 0 bboxes")
-    log.info("word_click detection: %d bboxes=%s", len(bboxes), bboxes)
-
-    # Primary: LLM vision (more reliable than ddddocr OCR on captcha fonts)
-    char_to_box: dict[str, int] = {}
-    llm_ok = bool(settings.llm_api_key and settings.llm_base_url)
-    if llm_ok:
-        try:
-            from crack_tcaptcha.solvers.llm_vision import locate_chars
-
-            char_to_box = locate_chars(bg_bytes, targets=targets, bboxes=bboxes)
-        except SolveError as e:
-            log.warning("word_click: LLM locate_chars failed, falling back: %s", e)
-    else:
-        log.info("word_click: LLM not configured, using ddddocr fallback only")
-
-    # Fallback for any char LLM returned 0 / miss
-    char_to_box = _fallback_match_by_ocr(bg_bytes, bboxes, targets, char_to_box)
-
-    click_coords: list[tuple[int, int]] = []
-    for ch in targets:
-        idx = char_to_box.get(ch, 0)
-        if 1 <= idx <= len(bboxes):
-            click_coords.append(_bbox_center(bboxes[idx - 1]))
-        else:
-            # Should not happen after fallback, but be safe.
-            cx, cy = _bbox_center(bboxes[0])
-            click_coords.append((cx, cy))
-            log.warning("word_click: char %r unresolved, using bbox 1", ch)
+        from crack_tcaptcha.solvers.word_ocr import locate_chars_by_siamese
+
+        click_coords = locate_chars_by_siamese(bg_bytes, targets)
+    except SolveError as e:
+        log.warning("word_click siamese path failed: %s — using ddddocr fallback", e)
+        click_coords = _fallback_ddddocr(bg_bytes, targets)
+
     log.info("word_click click_coords=%s for targets=%s", click_coords, targets)
 
     pow_answer, pow_calc_time = solve_pow(
diff --git a/src/crack_tcaptcha/server.py b/src/crack_tcaptcha/server.py
new file mode 100644
index 0000000..71261bf
--- /dev/null
+++ b/src/crack_tcaptcha/server.py
@@ -0,0 +1,220 @@
+"""Long-running HTTP server for crack-tcaptcha.
+
+Exposes ``POST /solve`` that wraps :func:`crack_tcaptcha.solve`. Models
+load **once** at startup (warmup), so every request pays only the
+inference cost — no process cold-start, no ONNX reload. This is the
+recommended mode for any non-one-shot use (scripts hammering captchas,
+bench, integrations, …).
+
+The server uses stdlib ``http.server`` so we don't need to pull
+``fastapi``/``uvicorn`` into the default dependency set.
+
+Endpoints::
+
+    GET  /health           → {"status": "ok", "providers": [...], "uptime_s": 12.3}
+    POST /solve            → request body: {"appid": "...", "retries": 3, "entry_url": ""}
+                             → response:    SolveResult model as JSON
+
+Authentication (optional): set ``TCAPTCHA_SERVE_SK`` and every request
+must send it in the ``X-SK`` header.
+
+Concurrency: the server spawns a small worker thread pool so independent
+solves run in parallel. The ONNX sessions are shared between workers;
+ONNX Runtime allows concurrent ``run()`` calls on a single session.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from typing import Any
+
+log = logging.getLogger(__name__)
+
+_DEFAULT_HOST = "127.0.0.1"
+_DEFAULT_PORT = 9991
+_DEFAULT_WORKERS = 4
+
+
+def _warmup_all() -> list[str]:
+    """Warm up the word_click ONNX sessions and describe their providers.
+
+    Any failure is logged and swallowed, so the result may be empty or
+    partial; it is the list of strings that /health reports.
+    """
+    described: list[str] = []
+    try:
+        from crack_tcaptcha.solvers import word_ocr
+
+        word_ocr.warmup()
+        # Both sessions are fetched before either one is described.
+        sessions = {
+            "yolo": word_ocr._get_yolo_session(),
+            "siamese": word_ocr._get_siamese_session(),
+        }
+        for label, session in sessions.items():
+            described.append(f"{label}={session.get_providers()}")
+    except Exception as e:
+        log.warning("serve: word_click warmup failed (%s) — fallbacks still work", e)
+    return described
+
+
+class _State:
+    """Mutable state shared by all requests: solve executor, auth key, /health data."""
+
+    def __init__(self, workers: int, sk: str | None) -> None:
+        self.started_at = time.time()
+        self.providers: list[str] = []
+        self.sk = sk
+        self.executor = ThreadPoolExecutor(max_workers=workers, thread_name_prefix="solve")
+
+
+class _Handler(BaseHTTPRequestHandler):
+    """Request handler for ``GET /health`` and ``POST /solve``; every reply is JSON."""
+
+    # Class-level shared state; populated by run() before the server starts.
+    state: _State = None  # type: ignore[assignment]
+
+    def log_message(self, format: str, *args: Any) -> None:  # noqa: A002 — stdlib signature
+        log.info("%s - %s", self.address_string(), format % args)
+
+    # ---- response helpers ------------------------------------------------
+
+    def _send_json(self, code: int, payload: dict[str, Any]) -> None:
+        """Send ``payload`` as a UTF-8 JSON body with HTTP status ``code``."""
+        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
+        self.send_response(code)
+        self.send_header("Content-Type", "application/json; charset=utf-8")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def _check_auth(self) -> bool:
+        """Return True if no key is configured or ``X-SK`` matches it; otherwise reply 401."""
+        import hmac
+        sk, got = self.state.sk, self.headers.get("X-SK") or ""
+        # compare_digest is constant-time, so the key cannot be probed via response timing.
+        if not sk or hmac.compare_digest(got.encode("utf-8"), sk.encode("utf-8")):
+            return True
+        self._send_json(401, {"status": "error", "msg": "unauthorized"})
+        return False
+
+    # ---- routes ----------------------------------------------------------
+
+    def do_GET(self) -> None:  # noqa: N802 (stdlib convention)
+        """Serve ``/health``; any other path gets a 404."""
+        if self.path != "/health":
+            self._send_json(404, {"status": "error", "msg": "not found"})
+            return
+        uptime = round(time.time() - self.state.started_at, 1)
+        self._send_json(200, {"status": "ok", "providers": self.state.providers, "uptime_s": uptime})
+
+    def do_POST(self) -> None:  # noqa: N802
+        """Run one solve for ``/solve``: 400 on a malformed request, 500 if solve() raises."""
+        if self.path != "/solve":
+            self._send_json(404, {"status": "error", "msg": "not found"})
+            return
+        if not self._check_auth():
+            return
+
+        # One guard for all client-controlled input: any parse failure is a 400, not a traceback.
+        try:
+            length = int(self.headers.get("Content-Length") or 0)
+            raw = self.rfile.read(length) if length > 0 else b""
+            body = json.loads(raw.decode("utf-8")) if raw else {}
+            if not isinstance(body, dict):
+                raise ValueError("body must be a JSON object")
+            retries = int(body.get("retries", body.get("max_retries", 3)))
+        except (ValueError, TypeError) as e:
+            self._send_json(400, {"status": "error", "msg": f"invalid request: {e}"})
+            return
+        appid = body.get("appid") or body.get("app_id")
+        if not appid:
+            self._send_json(400, {"status": "error", "msg": "missing appid"})
+            return
+        entry_url = body.get("entry_url", "")
+
+        from crack_tcaptcha import solve
+
+        # The executor bounds concurrency; this request thread blocks until the solve is done.
+        fut = self.state.executor.submit(solve, appid=str(appid), max_retries=retries, entry_url=entry_url)
+        t0 = time.time()
+        try:
+            result = fut.result()
+        except Exception as e:  # pragma: no cover - defensive
+            log.exception("solve crashed")
+            self._send_json(500, {"status": "error", "msg": str(e)})
+            return
+        payload = result.model_dump()
+        payload["_cost_s"] = round(time.time() - t0, 3)
+        self._send_json(200, payload)
+
+
+def run(host: str, port: int, workers: int, sk: str | None) -> None:
+    """Warm up the models, then serve HTTP until SIGINT or SIGTERM.
+
+    ``workers`` caps concurrent solves; ``sk`` (if set) is the required ``X-SK`` value.
+    """
+    import signal
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s] %(levelname)s %(name)s: %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    log.info("crack-tcaptcha serve: warming up models...")
+    providers = _warmup_all()
+    for p in providers:
+        log.info("  %s", p)
+
+    state = _State(workers=workers, sk=sk)
+    state.providers = providers
+    _Handler.state = state
+
+    server = ThreadingHTTPServer((host, port), _Handler)
+    log.info("listening on http://%s:%d (workers=%d, auth=%s)", host, port, workers, "on" if sk else "off")
+
+    def _shutdown(*_: Any) -> None:
+        log.info("shutting down...")
+        # The handler runs in the thread that is blocked in serve_forever(), and
+        # shutdown() waits for that loop to exit, so it must run on another thread.
+        threading.Thread(target=server.shutdown, daemon=True).start()
+
+    signal.signal(signal.SIGINT, _shutdown)
+    signal.signal(signal.SIGTERM, _shutdown)
+
+    try:
+        server.serve_forever()
+    finally:
+        state.executor.shutdown(wait=True, cancel_futures=True)
+        server.server_close()
+        log.info("stopped")
+
+
+def main(argv: list[str] | None = None) -> None:
+    """Entry point for ``crack-tcaptcha serve``: parse options (env vars give defaults), then run()."""
+    import argparse
+
+    # Env values stay strings: argparse runs ``type=int`` on string defaults, so a bad value is a usage error.
+    parser = argparse.ArgumentParser(prog="crack-tcaptcha serve", description="Long-running solver HTTP service")
+    parser.add_argument("--host", default=os.environ.get("TCAPTCHA_SERVE_HOST") or _DEFAULT_HOST)
+    parser.add_argument("--port", type=int, default=os.environ.get("TCAPTCHA_SERVE_PORT") or str(_DEFAULT_PORT))
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=os.environ.get("TCAPTCHA_SERVE_WORKERS") or str(_DEFAULT_WORKERS),
+        help="Max concurrent solves",
+    )
+    args = parser.parse_args(argv)
+    sk = os.environ.get("TCAPTCHA_SERVE_SK") or None
+    run(host=args.host, port=args.port, workers=args.workers, sk=sk)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/crack_tcaptcha/solvers/models/font.ttf b/src/crack_tcaptcha/solvers/models/font.ttf
new file mode 100644
index 0000000..fc3ace0
--- /dev/null
+++ b/src/crack_tcaptcha/solvers/models/font.ttf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ea1d8f8aed5cee8cfa964b452c2085a1135acc02eb61c963f2f2b45bfbf4dbc
+size 4825100
diff --git a/src/crack_tcaptcha/solvers/models/word_click_detector.onnx b/src/crack_tcaptcha/solvers/models/word_click_detector.onnx
new file mode 100644
index 0000000..58b39c8
--- /dev/null
+++ b/src/crack_tcaptcha/solvers/models/word_click_detector.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3d606a40ca327b9f5cff5c9cc6dc924c1c5da1b75a4050c358a84782e682e82
+size 10457464
diff --git a/src/crack_tcaptcha/solvers/models/word_click_matcher.onnx b/src/crack_tcaptcha/solvers/models/word_click_matcher.onnx
new file mode 100644
index 0000000..4664414
--- /dev/null
+++ b/src/crack_tcaptcha/solvers/models/word_click_matcher.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f48e175f8bb8d0fcadba8d22a263dcb429c87f659acd56542b1bdc3d8dab9e2
+size 29974422
diff --git a/src/crack_tcaptcha/solvers/ort_provider.py b/src/crack_tcaptcha/solvers/ort_provider.py
new file mode 100644
index 0000000..b214117
--- /dev/null
+++ b/src/crack_tcaptcha/solvers/ort_provider.py
@@ -0,0 +1,60 @@
+"""ONNX Runtime execution-provider selection.
+
+Honours the ``TCAPTCHA_ORT_BACKEND`` env var (``cuda`` / ``rocm`` / ``dml`` /
+``coreml`` / ``cpu`` / ``auto``, default ``auto``). Falls back to CPU when the
+requested backend is unavailable.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+
+log = logging.getLogger(__name__)
+
+_BACKEND_MAP = {
+    "cuda": "CUDAExecutionProvider",
+    "rocm": "ROCMExecutionProvider",
+    "dml": "DmlExecutionProvider",
+    "coreml": "CoreMLExecutionProvider",
+    "cpu": "CPUExecutionProvider",
+}
+
+_AUTO_PRIORITY = (
+    "CUDAExecutionProvider",
+    "ROCMExecutionProvider",
+    "DmlExecutionProvider",
+    "CoreMLExecutionProvider",
+    "CPUExecutionProvider",
+)
+
+
+def resolve_providers() -> list[str]:
+    """Return the ORT providers list for new sessions, always ending with CPU.
+
+    ``TCAPTCHA_ORT_BACKEND`` picks the backend; ``auto``, an unknown name or an
+    unavailable backend falls through to the first available priority entry.
+    """
+    import onnxruntime as ort
+
+    cpu = _BACKEND_MAP["cpu"]
+    available = set(ort.get_available_providers())
+    pref = os.environ.get("TCAPTCHA_ORT_BACKEND", "auto").strip().lower()
+
+    wanted = _BACKEND_MAP.get(pref)
+    if wanted is None and pref not in ("", "auto"):
+        # A typo used to be silently treated as "auto"; say so.
+        log.warning("TCAPTCHA_ORT_BACKEND=%s is not one of %s — falling back to auto", pref, sorted(_BACKEND_MAP))
+    elif wanted is not None and wanted not in available:
+        log.warning(
+            "TCAPTCHA_ORT_BACKEND=%s requested %s, not available (%s) — falling back to auto",
+            pref,
+            wanted,
+            sorted(available),
+        )
+    candidates = ([wanted] if wanted in available else []) + [p for p in _AUTO_PRIORITY if p in available]
+    chosen = candidates[0] if candidates else cpu
+    return [chosen] if chosen == cpu else [chosen, cpu]
+
+
+__all__ = ["resolve_providers"]
diff --git a/src/crack_tcaptcha/solvers/word_ocr.py b/src/crack_tcaptcha/solvers/word_ocr.py
new file mode 100644
index 0000000..a4fec87
--- /dev/null
+++ b/src/crack_tcaptcha/solvers/word_ocr.py
@@ -0,0 +1,405 @@
+"""word-click solver: YOLOv8 detection + Siamese similarity matching.
+
+Replaces the slow LLM-vision path for ``word_click``. Uses two ONNX models
+shipped with the package:
+
+* ``word_click_detector.onnx`` — YOLOv8 detector, finds candidate character
+  bboxes on the bg image.
+* ``word_click_matcher.onnx`` — Siamese network, takes two 52×52 3-channel
+  crops and returns a similarity score.
+
+Target characters are rendered with the bundled ``font.ttf`` into 52×52
+reference images and compared against every detected bbox crop. The
+highest-scoring unused bbox is picked for each target, in order.
+
+Performance notes:
+
+* Sessions are module-level singletons (one load per Python process).
+* ``SessionOptions`` enables all graph optimisations and pins thread count
+  to a sensible default.
+* Siamese inference is **batched**: for each target char we stack all
+  candidate crops into one call (N forward passes → 1 call), which is the
+  main win over a naïve loop.
+* ``warmup()`` can be called at startup to amortise first-inference cost
+  (CoreML / CUDA graph compile).
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import threading
+from pathlib import Path
+
+import numpy as np
+
+from crack_tcaptcha.exceptions import SolveError
+from crack_tcaptcha.solvers.ort_provider import resolve_providers
+
+log = logging.getLogger(__name__)
+
+_MODEL_DIR = Path(__file__).resolve().parent / "models"
+_YOLO_PATH = _MODEL_DIR / "word_click_detector.onnx"
+_SIAMESE_PATH = _MODEL_DIR / "word_click_matcher.onnx"
+_FONT_PATH = _MODEL_DIR / "font.ttf"
+
+_YOLO_CONFIDENCE = 0.5
+_YOLO_IOU = 0.7
+_SIAMESE_INPUT = (52, 52)
+_CHAR_RENDER_SIZE = 52
+_CHAR_RENDER_FONT_SIZE = 40
+_CHAR_RENDER_COLOR = (227, 178, 56)  # RGB fill passed to PIL; the render is converted to BGR before the siamese sees it
+
+
+# --- lazy model singletons ---------------------------------------------------
+
+_yolo_lock = threading.Lock()
+_siamese_lock = threading.Lock()
+_yolo_session = None
+_siamese_session = None
+_siamese_input_names: tuple[str, str] | None = None
+_siamese_batch_supported: bool | None = None
+
+
+def _import_onnx() -> None:
+    try:  # availability probe only; the session getters import onnxruntime themselves right after
+        import onnxruntime  # noqa: F401
+    except ImportError as e:  # pragma: no cover - import guard
+        raise SolveError("word_click requires onnxruntime: `uv sync --extra word-click`") from e
+
+
+def _import_cv2():
+    try:  # imported lazily so a missing cv2 surfaces as SolveError with an install hint
+        import cv2
+    except ImportError as e:  # pragma: no cover - import guard
+        raise SolveError("word_click requires opencv-python-headless: `uv sync --extra word-click`") from e
+    return cv2  # callers use the returned module object
+
+
+def _make_session_options():
+    """Build the ``SessionOptions`` used for both models.
+
+    Intra-op threads: ``TCAPTCHA_ORT_INTRA_OP_THREADS`` when it is a positive
+    integer, otherwise ``min(4, os.cpu_count())``.
+    """
+    import onnxruntime as ort
+
+    # The siamese net is tiny (52×52 input): more than ~4 intra-op threads measurably regresses.
+    env_threads = (os.environ.get("TCAPTCHA_ORT_INTRA_OP_THREADS") or "").strip()
+    # isdecimal(), not isdigit(): isdigit() also accepts characters such as "²" that int() rejects.
+    if env_threads.isdecimal() and int(env_threads) > 0:
+        threads = int(env_threads)
+    else:
+        threads = min(4, os.cpu_count() or 4)  # cpu_count() may return None
+    so = ort.SessionOptions()
+    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+    so.intra_op_num_threads = threads
+    so.inter_op_num_threads = 1
+    so.log_severity_level = 3  # ERROR only
+    return so
+
+
+def _get_yolo_session():
+    """Return the process-wide YOLO session, creating it under ``_yolo_lock`` on first use."""
+    global _yolo_session
+    if _yolo_session is None:
+        with _yolo_lock:
+            # Re-check: another thread may have built the session while we waited.
+            if _yolo_session is None:
+                _import_onnx()
+                import onnxruntime as ort
+
+                if not _YOLO_PATH.is_file():
+                    raise SolveError(f"word_click: missing yolo model at {_YOLO_PATH}")
+                opts = _make_session_options()
+                _yolo_session = ort.InferenceSession(str(_YOLO_PATH), sess_options=opts, providers=resolve_providers())
+                log.info("word_click yolo session providers=%s", _yolo_session.get_providers())
+    return _yolo_session
+
+
+def _get_siamese_session():
+    """Return the process-wide Siamese session, creating it under ``_siamese_lock`` on first use."""
+    global _siamese_session, _siamese_input_names
+    if _siamese_session is not None:
+        return _siamese_session
+    with _siamese_lock:
+        if _siamese_session is not None:
+            return _siamese_session
+        _import_onnx()
+        import onnxruntime as ort
+
+        if not _SIAMESE_PATH.is_file():
+            raise SolveError(f"word_click: missing siamese model at {_SIAMESE_PATH}")
+        so = _make_session_options()
+        session = ort.InferenceSession(str(_SIAMESE_PATH), sess_options=so, providers=resolve_providers())
+        inputs = session.get_inputs()
+        shapes = [(i.name, i.shape) for i in inputs]
+        log.info("word_click siamese session providers=%s inputs=%s", session.get_providers(), shapes)
+        # Publish the session last: the lock-free fast path treats non-None as fully initialised.
+        _siamese_input_names = (inputs[0].name, inputs[1].name)
+        _siamese_session = session
+    return _siamese_session
+
+
+# --- image helpers -----------------------------------------------------------
+
+
+def _bytes_to_bgr(byte_data: bytes) -> np.ndarray:
+    """Decode image bytes to a 3-channel BGR array; an alpha channel is blended onto white."""
+    cv2 = _import_cv2()
+    buf = np.frombuffer(byte_data, np.uint8)
+    # An empty buffer makes imdecode raise cv2.error instead of returning None.
+    img = cv2.imdecode(buf, cv2.IMREAD_UNCHANGED) if buf.size else None
+    if img is None:
+        raise SolveError("word_click: failed to decode bg image")
+    if img.ndim == 2:
+        return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+    if img.shape[-1] == 4:
+        alpha = img[..., 3:4].astype(np.float32) / 255.0
+        return (img[..., :3].astype(np.float32) * alpha + 255.0 * (1 - alpha)).astype(np.uint8)
+    return img
+
+
+def _render_char(char: str) -> np.ndarray:
+    """Render one CJK char to a 52×52 BGR image using the bundled font."""
+    from PIL import Image, ImageDraw, ImageFont
+
+    if not _FONT_PATH.is_file():
+        raise SolveError(f"word_click: missing font at {_FONT_PATH}")
+    img = Image.new("RGB", (_CHAR_RENDER_SIZE, _CHAR_RENDER_SIZE), color="white")
+    draw = ImageDraw.Draw(img)
+    font = ImageFont.truetype(str(_FONT_PATH), _CHAR_RENDER_FONT_SIZE)  # loaded anew on every call
+    bbox = font.getbbox(char)  # (left, top, right, bottom) of the glyph
+    text_w = bbox[2] - bbox[0]
+    x = (_CHAR_RENDER_SIZE - text_w) // 2  # centre horizontally
+    y = -3  # NOTE(review): fixed vertical offset, presumably tuned to the model's training renders — confirm
+    draw.text((x, y), char, fill=_CHAR_RENDER_COLOR, font=font)
+    cv2 = _import_cv2()
+    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)  # PIL renders RGB; convert to cv2's BGR order
+
+
+# --- YOLOv8 detection --------------------------------------------------------
+
+
+def _letterbox(
+    img: np.ndarray,
+    new_shape: tuple[int, int],  # target (height, width)
+) -> tuple[np.ndarray, tuple[float, float], tuple[float, float]]:  # (padded image, (ratio, ratio), (left, top))
+    cv2 = _import_cv2()
+    h, w = img.shape[:2]
+    r = min(new_shape[0] / h, new_shape[1] / w)  # one scale factor for both axes, so the aspect ratio is kept
+    new_unpad = (int(round(w * r)), int(round(h * r)))  # (width, height), the order cv2.resize takes
+    dw = (new_shape[1] - new_unpad[0]) / 2  # horizontal padding per side (may end in .5)
+    dh = (new_shape[0] - new_unpad[1]) / 2  # vertical padding per side (may end in .5)
+    resized = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+    top = int(round(dh - 0.1))  # the ∓0.1 nudge puts the extra pixel of an odd padding on bottom/right
+    bottom = int(round(dh + 0.1))
+    left = int(round(dw - 0.1))
+    right = int(round(dw + 0.1))
+    padded = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(255, 255, 255))
+    return padded, (r, r), (left, top)  # the pad offsets returned are the integer left/top actually applied
+
+
+def _yolo_detect(bg_bgr: np.ndarray) -> list[tuple[int, int, int, int]]:
+    """Run YOLOv8 and return [(x1, y1, x2, y2), ...] on the original image."""
+    cv2 = _import_cv2()
+    sess = _get_yolo_session()
+    inp = sess.get_inputs()[0]
+    _, _, ih, iw = inp.shape  # NOTE(review): assumes a static 4-D (N, C, H, W) input shape — confirm for the model
+
+    letter, (rx, ry), (pad_x, pad_y) = _letterbox(bg_bgr, (ih, iw))
+    rgb = cv2.cvtColor(letter, cv2.COLOR_BGR2RGB)
+    data = (np.array(rgb) / 255.0).transpose(2, 0, 1)[None].astype(np.float32)  # HWC → (1, C, H, W), scaled to [0, 1]
+
+    outputs = sess.run(None, {inp.name: data})[0]
+    preds = np.transpose(np.squeeze(outputs))  # rows are read below as: cx, cy, w, h, then class scores
+
+    boxes: list[list[float]] = []
+    scores: list[float] = []
+    for row in preds:
+        class_scores = row[4:]
+        max_score = float(np.amax(class_scores))
+        if max_score < _YOLO_CONFIDENCE:
+            continue
+        x, y, w, h = row[0], row[1], row[2], row[3]
+        left = int(((x - w / 2) - pad_x) / rx)  # undo the letterbox: remove padding, then undo the scale
+        top = int(((y - h / 2) - pad_y) / ry)
+        width = int(w / rx)
+        height = int(h / ry)
+        boxes.append([left, top, width, height])  # [x, y, w, h] in original-image pixels
+        scores.append(max_score)
+
+    if not boxes:
+        return []
+
+    indices = cv2.dnn.NMSBoxes(boxes, scores, _YOLO_CONFIDENCE, _YOLO_IOU)  # indices of the boxes kept by NMS
+    bh, bw = bg_bgr.shape[:2]
+    out: list[tuple[int, int, int, int]] = []
+    for i in np.array(indices).flatten():
+        x, y, w, h = boxes[int(i)]
+        x1 = int(max(0, x))  # clamp to the image bounds
+        y1 = int(max(1, y))  # mirror tx-word quirk: y1<=0 clamped to 1
+        x2 = int(min(bw, x + w))
+        y2 = int(min(bh, y + h))
+        if x2 > x1 and y2 > y1:  # drop boxes that collapse to nothing after clamping
+            out.append((x1, y1, x2, y2))
+    return out
+
+
+# --- Siamese matching --------------------------------------------------------
+
+
+def _prep_siamese(img: np.ndarray) -> np.ndarray:
+    """Preprocess a single BGR crop to (1, 3, 52, 52) float32 [0,1]."""
+    cv2 = _import_cv2()
+    resized = cv2.resize(img, _SIAMESE_INPUT)  # plain resize: the crop's aspect ratio is not preserved
+    arr = np.transpose(resized, (2, 0, 1)).astype(np.float32) / 255.0  # HWC → CHW; channel order is left as-is
+    return arr[None, ...]  # prepend the batch axis
+
+
+def _siamese_score_batch(crops: list[np.ndarray], ref: np.ndarray) -> list[float]:
+    """Score every crop against ``ref``, using as few ORT calls as possible.
+
+    1. Batched inference: all crops in one ``run`` call. The first failure
+       (exception or unexpected output size) is cached and the batch path is
+       never retried.
+    2. Otherwise one ``run`` call per crop, sequentially.
+
+    Returns one similarity score per crop, in input order.
+    """
+    global _siamese_batch_supported
+
+    sess = _get_siamese_session()
+    names = _siamese_input_names
+    if names is None:
+        # Defensive: a concurrent first call can observe the session before its names are stored.
+        model_inputs = sess.get_inputs()
+        names = (model_inputs[0].name, model_inputs[1].name)
+    n0, n1 = names
+
+    if not crops:
+        return []
+
+    prepped = [_prep_siamese(c) for c in crops]  # each (1,3,52,52)
+    ref_prepped = _prep_siamese(ref)  # (1,3,52,52)
+
+    if _siamese_batch_supported is not False:
+        try:
+            batch = np.concatenate(prepped, axis=0)  # (N,3,52,52)
+            refs = np.repeat(ref_prepped, batch.shape[0], axis=0)  # (N,3,52,52)
+            arr = np.asarray(sess.run(None, {n0: batch, n1: refs})[0]).reshape(-1)
+            if arr.size == batch.shape[0]:
+                _siamese_batch_supported = True
+                return [float(v) for v in arr]
+            log.info("word_click siamese batch returned %d scores for %d crops, using per-pair", arr.size, len(crops))
+        except Exception as e:
+            log.info("word_click siamese batch not supported, using per-pair: %s", e)
+        _siamese_batch_supported = False
+
+    # Per-pair path. ORT's own intra-op thread pool does the heavy lifting; an outer
+    # ThreadPoolExecutor oversubscribes it and measurably slows things down — don't add one.
+    out: list[float] = []
+    for p in prepped:
+        pred = sess.run(None, {n0: p, n1: ref_prepped})[0]
+        out.append(float(np.asarray(pred).reshape(-1)[0]))
+    return out
+
+
+# --- public API --------------------------------------------------------------
+
+
+def warmup() -> None:
+    """Load both sessions and run one dummy inference each; never raises.
+
+    Call this once at process start (e.g. CLI entry point) to amortise the
+    first-request cost (graph optimisation, kernel JIT, CoreML compile).
+    """
+    try:
+        _get_yolo_session()
+        _get_siamese_session()
+    except Exception as e:  # SolveError, but also ORT load errors (e.g. an unfetched LFS pointer file)
+        log.warning("word_click warmup: sessions unavailable (%s)", e)
+        return
+
+    try:
+        # Dummy 672×480 bg to exercise yolo preprocess/infer path.
+        dummy_bg = np.full((480, 672, 3), 255, dtype=np.uint8)
+        _yolo_detect(dummy_bg)
+        dummy_crop = np.full((52, 52, 3), 200, dtype=np.uint8)
+        dummy_ref = np.full((52, 52, 3), 100, dtype=np.uint8)
+        _siamese_score_batch([dummy_crop], dummy_ref)
+        log.info("word_click warmup: done")
+    except Exception as e:  # defensive; warmup must never break the caller
+        log.warning("word_click warmup: dummy inference failed: %s", e)
+
+
+def locate_chars_by_siamese(
+    bg_bytes: bytes,
+    targets: list[str],
+) -> list[tuple[int, int]]:
+    """Return the click point ``(cx, cy)`` for each target char, in target order.
+
+    Steps: decode the background, detect candidate boxes with YOLO, score
+    every rendered target against every crop with the Siamese model, then
+    assign greedily in target order — each target takes its best-scoring
+    crop that no earlier target has claimed.
+
+    When every crop is already claimed (more targets than crops) a target
+    reuses its best crop overall, so the result always has one point per
+    target; the caller decides whether to accept or retry.
+
+    Raises ``SolveError`` if the image cannot be decoded, if YOLO returns no
+    boxes, if all crops are empty, or if no crop scores above -1.0 for some
+    target.
+    """
+
+    def pick(row: list[float], taken: set[int]) -> tuple[int, float]:
+        """Return (index, score) of the first best entry above -1.0 not in ``taken``, else (-1, -1.0)."""
+        top_idx, top_score = -1, -1.0
+        for idx, score in enumerate(row):
+            if idx not in taken and score > top_score:
+                top_idx, top_score = idx, score
+        return top_idx, top_score
+
+    bg = _bytes_to_bgr(bg_bytes)
+    bboxes = _yolo_detect(bg)
+    if not bboxes:
+        raise SolveError("word_click: yolo returned 0 bboxes")
+    log.info("word_click yolo: %d bboxes %s", len(bboxes), bboxes)
+
+    # Crops and their centres stay index-aligned; empty crops are dropped
+    # from both lists together.
+    candidates = [
+        (bg[y1:y2, x1:x2], (int((x1 + x2) // 2), int((y1 + y2) // 2)))
+        for x1, y1, x2, y2 in bboxes
+    ]
+    candidates = [pair for pair in candidates if pair[0].size != 0]
+    if not candidates:
+        raise SolveError("word_click: all yolo bboxes produced empty crops")
+    crops = [crop for crop, _ in candidates]
+    centers = [centre for _, centre in candidates]
+
+    # One row of scores per target (columns = crop indices), computed up
+    # front so the assignment below is pure bookkeeping.
+    score_matrix = [_siamese_score_batch(crops, _render_char(ch)) for ch in targets]
+
+    # Greedy assignment in instruction order, which is also the click order.
+    # (Hungarian would be optimal, but targets rarely collide — greedy
+    # matches tx-word's proven approach.)
+    result: list[tuple[int, int]] = []
+    used: set[int] = set()
+    for ch, row in zip(targets, score_matrix):
+        best_idx, best_score = pick(row, used)
+        if best_idx < 0:
+            # Nothing unclaimed qualifies — reuse the best crop overall.
+            best_idx, best_score = pick(row, set())
+        if best_idx < 0:
+            raise SolveError(f"word_click: no candidate for target {ch!r}")
+        used.add(best_idx)
+        centre = centers[best_idx]
+        result.append(centre)
+        log.info("word_click: %r → %s (score=%.3f)", ch, centre, best_score)
+    return result
+
+
+__all__ = ["locate_chars_by_siamese", "warmup"]
diff --git a/tests/test_pipeline_word_click.py b/tests/test_pipeline_word_click.py
index fbc7c52..57d66e5 100644
--- a/tests/test_pipeline_word_click.py
+++ b/tests/test_pipeline_word_click.py
@@ -1,4 +1,4 @@
-"""Tests for pipelines/word_click.py."""
+"""Tests for pipelines/word_click.py (local YOLO + Siamese path)."""
 
 from __future__ import annotations
 
@@ -19,8 +19,6 @@
     VerifyResp,
 )
 from crack_tcaptcha.pipelines.word_click import (
-    _bbox_center,
-    _fallback_match_by_ocr,
     _parse_target_chars,
     solve_one_attempt,
 )
@@ -44,16 +42,8 @@ def test_no_chars_returns_empty(self):
         assert _parse_target_chars("请依次点击：abc 123") == []
 
 
-class TestBboxCenter:
-    def test_integer_center(self):
-        assert _bbox_center((10, 20, 30, 40)) == (20, 30)
-
-    def test_floor_division(self):
-        assert _bbox_center((0, 0, 3, 3)) == (1, 1)
-
-
 # ---------------------------------------------------------------------------
-# _fallback_match_by_ocr
+# solve_one_attempt
 # ---------------------------------------------------------------------------
 
 
@@ -64,63 +54,6 @@ def _fake_bg_bytes(w: int = 100, h: int = 80) -> bytes:
     return buf.getvalue()
 
 
-class TestFallbackMatchByOcr:
-    def test_already_assigned_short_circuits(self, monkeypatch):
-        called = {"ocr": False}
-
-        def fake_get_ocr():
-            called["ocr"] = True
-            return MagicMock()
-
-        monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", fake_get_ocr)
-
-        bboxes = [(0, 0, 10, 10), (20, 0, 30, 10)]
-        result = _fallback_match_by_ocr(
-            _fake_bg_bytes(),
-            bboxes,
-            targets=["甲", "乙"],
-            already_assigned={"甲": 1, "乙": 2},
-        )
-        assert result == {"甲": 1, "乙": 2}
-        assert called["ocr"] is False
-
-    def test_fills_missing_via_ocr_text(self, monkeypatch):
-        # First unused bbox returns "甲甲", second returns noise then "乙"
-        ocr = MagicMock()
-        ocr.classification.side_effect = ["甲", "乙foo"]
-        monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", lambda: ocr)
-
-        bboxes = [(0, 0, 10, 10), (20, 0, 30, 10), (40, 0, 50, 10)]
-        result = _fallback_match_by_ocr(
-            _fake_bg_bytes(),
-            bboxes,
-            targets=["甲", "乙"],
-            already_assigned={},
-        )
-        assert result["甲"] == 1
-        assert result["乙"] == 2
-
-    def test_last_resort_assigns_unused_bbox(self, monkeypatch):
-        # OCR returns nothing useful; char must still map to SOME unused bbox
-        ocr = MagicMock()
-        ocr.classification.return_value = ""
-        monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", lambda: ocr)
-
-        bboxes = [(0, 0, 10, 10), (20, 0, 30, 10)]
-        result = _fallback_match_by_ocr(
-            _fake_bg_bytes(),
-            bboxes,
-            targets=["甲"],
-            already_assigned={},
-        )
-        assert 1 <= result["甲"] <= 2
-
-
-# ---------------------------------------------------------------------------
-# solve_one_attempt
-# ---------------------------------------------------------------------------
-
-
 def _make_pre(instruction: str = "请依次点击：甲 乙 ") -> PrehandleResp:
     return PrehandleResp(
         sess="sess_x",
@@ -149,6 +82,20 @@ def stub_pow(monkeypatch):
     )
 
 
+@pytest.fixture()
+def stub_finish(monkeypatch):
+    """Short-circuit finish_with_verify to skip TDC / trajectory plumbing."""
+
+    def fake_finish(client, pre, tdc_provider, *, ans_json, pow_answer, pow_calc_time, trajectory):
+        return client.verify(
+            ans=ans_json,
+            pow_answer=pow_answer,
+            pow_calc_time=pow_calc_time,
+        )
+
+    monkeypatch.setattr("crack_tcaptcha.pipelines.word_click.finish_with_verify", fake_finish)
+
+
 class TestSolveOneAttempt:
     def test_raises_when_no_cjk_chars(self, stub_pow):
         client, tdc = _mock_client_and_tdc()
@@ -156,86 +103,48 @@ def test_raises_when_no_cjk_chars(self, stub_pow):
         with pytest.raises(SolveError, match="no CJK chars"):
             solve_one_attempt(client, pre, tdc)
 
-    def test_raises_when_detector_returns_empty(self, monkeypatch, stub_pow):
-        client, tdc = _mock_client_and_tdc()
-        pre = _make_pre()
-
-        monkeypatch.setattr(
-            "crack_tcaptcha._legacy.icon_match.detect_icons",
-            lambda _bg: [],
-        )
-        with pytest.raises(SolveError, match="returned 0 bboxes"):
-            solve_one_attempt(client, pre, tdc)
-
-    def test_raises_when_ddddocr_missing(self, monkeypatch, stub_pow):
-        client, tdc = _mock_client_and_tdc()
-        pre = _make_pre()
-
-        # Make the lazy import inside solve_one_attempt blow up with ImportError
-        import builtins
-
-        real_import = builtins.__import__
-
-        def fake_import(name, globals=None, locals=None, fromlist=(), level=0):
-            if name == "crack_tcaptcha._legacy.icon_match" and fromlist and "detect_icons" in fromlist:
-                raise ImportError("no ddddocr")
-            return real_import(name, globals, locals, fromlist, level)
-
-        monkeypatch.setattr(builtins, "__import__", fake_import)
-
-        with pytest.raises(SolveError, match="requires ddddocr"):
-            solve_one_attempt(client, pre, tdc)
-
-    def test_success_with_llm_path(self, monkeypatch, stub_pow):
+    def test_success_with_siamese_path(self, monkeypatch, stub_pow, stub_finish):
         client, tdc = _mock_client_and_tdc()
         pre = _make_pre(instruction="请依次点击：甲 乙 ")
 
-        bboxes = [(0, 0, 20, 20), (40, 0, 60, 20)]
-        monkeypatch.setattr(
-            "crack_tcaptcha._legacy.icon_match.detect_icons",
-            lambda _bg: bboxes,
-        )
-        # Pretend LLM is configured
-        monkeypatch.setattr(
-            "crack_tcaptcha.pipelines.word_click.settings",
-            MagicMock(llm_api_key="k", llm_base_url="u"),
-        )
+        # Primary path: siamese returns explicit click coords for each target.
         monkeypatch.setattr(
-            "crack_tcaptcha.solvers.llm_vision.locate_chars",
-            lambda _bg, targets, bboxes: {"甲": 1, "乙": 2},
+            "crack_tcaptcha.solvers.word_ocr.locate_chars_by_siamese",
+            lambda _bg, targets: [(10, 10), (50, 10)],
         )
-        # finish_with_verify's TDC collect goes through; just rubber-stamp
-        monkeypatch.setattr("crack_tcaptcha.pipelines._common.resolve_tdc_url", lambda p: p)
 
         resp = solve_one_attempt(client, pre, tdc)
         assert resp.ok
-        # verify called with ans JSON describing click center coords
+
         kwargs = client.verify.call_args.kwargs
         ans = json.loads(kwargs["ans"])
         assert [a["type"] for a in ans] == ["DynAnswerType_POS", "DynAnswerType_POS"]
-        # bbox 1 center = (10,10), bbox 2 center = (50,10)
         assert ans[0]["data"] == "10,10"
         assert ans[1]["data"] == "50,10"
         assert kwargs["pow_answer"] == "p_42"
         assert kwargs["pow_calc_time"] == 3
 
-    def test_success_with_llm_absent_uses_ocr_only(self, monkeypatch, stub_pow):
+    def test_falls_back_to_ddddocr_when_siamese_fails(self, monkeypatch, stub_pow, stub_finish):
         client, tdc = _mock_client_and_tdc()
         pre = _make_pre(instruction="请依次点击：甲 ")
 
+        # Primary path raises SolveError → pipeline should fall back.
+        def raising_siamese(_bg, _targets):
+            raise SolveError("siamese unavailable")
+
         monkeypatch.setattr(
-            "crack_tcaptcha._legacy.icon_match.detect_icons",
-            lambda _bg: [(0, 0, 10, 10), (20, 0, 30, 10)],
+            "crack_tcaptcha.solvers.word_ocr.locate_chars_by_siamese",
+            raising_siamese,
         )
-        # LLM not configured
+        # Fallback path: _fallback_ddddocr imports match_words from _legacy.icon_match.
         monkeypatch.setattr(
-            "crack_tcaptcha.pipelines.word_click.settings",
-            MagicMock(llm_api_key="", llm_base_url=""),
+            "crack_tcaptcha._legacy.icon_match.match_words",
+            lambda _bg, _targets: [(33, 44)],
         )
-        ocr = MagicMock()
-        ocr.classification.return_value = "甲"
-        monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", lambda: ocr)
-        monkeypatch.setattr("crack_tcaptcha.pipelines._common.resolve_tdc_url", lambda p: p)
 
         resp = solve_one_attempt(client, pre, tdc)
         assert resp.ok
+
+        kwargs = client.verify.call_args.kwargs
+        ans = json.loads(kwargs["ans"])
+        assert ans == [{"elem_id": 1, "type": "DynAnswerType_POS", "data": "33,44"}]
diff --git a/uv.lock b/uv.lock
index d860cdc..4410119 100644
--- a/uv.lock
+++ b/uv.lock
@@ -359,6 +359,9 @@ all = [
     { name = "cn-clip" },
     { name = "cn2an" },
     { name = "ddddocr" },
+    { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "onnxruntime", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "opencv-python-headless" },
     { name = "torch" },
 ]
 clip = [
@@ -366,6 +369,17 @@ clip = [
     { name = "cn2an" },
     { name = "torch" },
 ]
+icon-click = [
+    { name = "ddddocr" },
+]
+word-click = [
+    { name = "ddddocr" },
+    { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "onnxruntime", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "opencv-python-headless" },
+]
+
+[package.dev-dependencies]
 dev = [
     { name = "httpx" },
     { name = "hypothesis" },
@@ -377,9 +391,6 @@ dev = [
 docs = [
     { name = "mkdocs-material" },
 ]
-icon-click = [
-    { name = "ddddocr" },
-]
 
 [package.metadata]
 requires-dist = [
@@ -389,23 +400,32 @@ requires-dist = [
     { name = "cn2an", marker = "extra == 'clip'", specifier = ">=0.5" },
     { name = "ddddocr", marker = "extra == 'all'", specifier = ">=1.4" },
     { name = "ddddocr", marker = "extra == 'icon-click'", specifier = ">=1.4" },
+    { name = "ddddocr", marker = "extra == 'word-click'", specifier = ">=1.4" },
     { name = "httpx", specifier = ">=0.27" },
-    { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.27" },
-    { name = "hypothesis", marker = "extra == 'dev'", specifier = ">=6.0" },
-    { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.0" },
     { name = "numpy", specifier = ">=1.24" },
+    { name = "onnxruntime", marker = "extra == 'all'", specifier = ">=1.17" },
+    { name = "onnxruntime", marker = "extra == 'word-click'", specifier = ">=1.17" },
+    { name = "opencv-python-headless", marker = "extra == 'all'", specifier = ">=4.8" },
+    { name = "opencv-python-headless", marker = "extra == 'word-click'", specifier = ">=4.8" },
     { name = "pillow", specifier = ">=10.0" },
     { name = "pydantic", specifier = ">=2.0" },
     { name = "pydantic-settings", specifier = ">=2.0" },
-    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
-    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" },
-    { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21" },
-    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" },
     { name = "scrapling", extras = ["fetchers"], specifier = ">=0.4.3" },
     { name = "torch", marker = "extra == 'all'", specifier = ">=2.0" },
     { name = "torch", marker = "extra == 'clip'", specifier = ">=2.0" },
 ]
-provides-extras = ["icon-click", "clip", "all", "dev", "docs"]
+provides-extras = ["icon-click", "word-click", "clip", "all"]
+
+[package.metadata.requires-dev]
+dev = [
+    { name = "httpx", specifier = ">=0.27" },
+    { name = "hypothesis", specifier = ">=6.0" },
+    { name = "pytest", specifier = ">=8.0" },
+    { name = "pytest-asyncio", specifier = ">=0.23" },
+    { name = "respx", specifier = ">=0.21" },
+    { name = "ruff", specifier = ">=0.4" },
+]
+docs = [{ name = "mkdocs-material", specifier = ">=9.0" }]
 
 [[package]]
 name = "cssselect"
@@ -1602,10 +1622,14 @@ dependencies = [
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/79/42/2310883be3b8826ac58c3f2787b9358a2d46923d61f88fedf930bc59c60c/opencv_python_headless-4.13.0.92-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:1a7d040ac656c11b8c38677cc8cccdc149f98535089dbe5b081e80a4e5903209", size = 46247192, upload-time = "2026-02-05T07:01:35.187Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/1e/6f9e38005a6f7f22af785df42a43139d0e20f169eb5787ce8be37ee7fcc9/opencv_python_headless-4.13.0.92-cp37-abi3-macosx_14_0_x86_64.whl", hash = "sha256:3e0a6f0a37994ec6ce5f59e936be21d5d6384a4556f2d2da9c2f9c5dc948394c", size = 32568914, upload-time = "2026-02-05T07:01:51.989Z" },
     { url = "https://files.pythonhosted.org/packages/21/76/9417a6aef9def70e467a5bf560579f816148a4c658b7d525581b356eda9e/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c8cfc8e87ed452b5cecb9419473ee5560a989859fe1d10d1ce11ae87b09a2cb", size = 33703709, upload-time = "2026-02-05T10:24:46.469Z" },
     { url = "https://files.pythonhosted.org/packages/92/ce/bd17ff5772938267fd49716e94ca24f616ff4cb1ff4c6be13085108037be/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0525a3d2c0b46c611e2130b5fdebc94cf404845d8fa64d2f3a3b679572a5bd22", size = 56016764, upload-time = "2026-02-05T10:26:48.904Z" },
     { url = "https://files.pythonhosted.org/packages/8f/b4/b7bcbf7c874665825a8c8e1097e93ea25d1f1d210a3e20d4451d01da30aa/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eb60e36b237b1ebd40a912da5384b348df8ed534f6f644d8e0b4f103e272ba7d", size = 35010236, upload-time = "2026-02-05T10:28:11.031Z" },
     { url = "https://files.pythonhosted.org/packages/4b/33/b5db29a6c00eb8f50708110d8d453747ca125c8b805bc437b289dbdcc057/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0bd48544f77c68b2941392fcdf9bcd2b9cdf00e98cb8c29b2455d194763cf99e", size = 60391106, upload-time = "2026-02-05T10:30:14.236Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/c3/52cfea47cd33e53e8c0fbd6e7c800b457245c1fda7d61660b4ffe9596a7f/opencv_python_headless-4.13.0.92-cp37-abi3-win32.whl", hash = "sha256:a7cf08e5b191f4ebb530791acc0825a7986e0d0dee2a3c491184bd8599848a4b", size = 30812232, upload-time = "2026-02-05T07:02:29.594Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/90/b338326131ccb2aaa3c2c85d00f41822c0050139a4bfe723cfd95455bd2d/opencv_python_headless-4.13.0.92-cp37-abi3-win_amd64.whl", hash = "sha256:77a82fe35ddcec0f62c15f2ba8a12ecc2ed4207c17b0902c7a3151ae29f37fb6", size = 40070414, upload-time = "2026-02-05T07:02:26.448Z" },
 ]
 
 [[package]]