diff --git a/.env.example b/.env.example index 18164a8..21b7f48 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,27 @@ -# Copy to .env and fill in. .env is gitignored. +# 复制为 .env 后填写;.env 已在 .gitignore 中忽略 TCAPTCHA_BASE_URL=https://t.captcha.qq.com -TCAPTCHA_LLM_API_KEY=sk-your-relay-key-here -TCAPTCHA_LLM_BASE_URL=https://your-relay.example.com + +# --- LLM 视觉求解器 --- +# 只有 image_select pipeline 需要;word_click 已切换到本地 YOLO + Siamese +# (见 `--extra word-click`),除非你要用 image_select,否则下面几项留空即可。 +TCAPTCHA_LLM_API_KEY= +TCAPTCHA_LLM_BASE_URL= TCAPTCHA_LLM_MODEL=gpt-5.4 TCAPTCHA_LLM_TIMEOUT=30 + +# --- word_click / ONNX Runtime 调优 --- +# 执行后端:默认 "auto" 按 CUDA > ROCm > DML > CoreML > CPU 的顺序挑选。 +# macOS 下 CoreML 首次图编译较慢,通常固定为 "cpu" 更快。 +# 可选值:auto | cpu | cuda | rocm | dml | coreml +# TCAPTCHA_ORT_BACKEND=cpu +# +# ORT intra-op 线程数,默认 min(4, os.cpu_count())。 +# 52×52 的 Siamese 模型过 4 线程后会反向变慢,没实测过就别动。 +# TCAPTCHA_ORT_INTRA_OP_THREADS=4 + +# --- serve 模式(crack-tcaptcha serve) --- +# POST /solve 的共享密钥;设置后客户端必须在请求头带 `X-SK`。 +# TCAPTCHA_SERVE_SK=change-me +# TCAPTCHA_SERVE_HOST=127.0.0.1 +# TCAPTCHA_SERVE_PORT=9991 +# TCAPTCHA_SERVE_WORKERS=4 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d355526 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +src/crack_tcaptcha/solvers/models/*.onnx filter=lfs diff=lfs merge=lfs -text +src/crack_tcaptcha/solvers/models/*.ttf filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index f441298..77ec3fa 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,4 @@ site *.so *.whl .env -*.onnx origin_papers/ diff --git a/AGENTS.md b/AGENTS.md index c0d5d75..1c4fc26 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,7 +24,8 @@ Python >= 3.10, `uv` is the canonical package manager. uv sync # Install with optional extras -uv sync --extra icon-click # adds ddddocr + onnxruntime (needed for icon_click and word_click) +uv sync --extra icon-click # ddddocr + onnxruntime (icon_click pipeline) +uv sync --extra word-click # onnxruntime + opencv-headless + ddddocr (word_click pipeline, local YOLO+Siamese) uv sync --extra dev # pytest, respx, ruff, hypothesis uv sync --extra docs # mkdocs-material @@ -40,9 +41,15 @@ uv run pytest tests/pipelines/ -q # a single directory uv run ruff check . uv run ruff format . -# CLI +# CLI — one-shot uv run crack-tcaptcha solve --appid YOUR_APPID --entry-url https://your-site.example/login +# CLI — long-running HTTP service (recommended for repeated use; models load once) +uv run crack-tcaptcha serve --port 9991 --workers 4 +# POST http://127.0.0.1:9991/solve {"appid":"YOUR_APPID","retries":3} +# GET http://127.0.0.1:9991/health +# set TCAPTCHA_SERVE_SK to require an X-SK header. + # Docs uv run mkdocs serve ``` @@ -53,7 +60,8 @@ uv run mkdocs serve src/crack_tcaptcha/ ├── __init__.py # public API: solve() ├── captcha_type.py # pure-function classifier (dyn_show_info → type) -├── cli.py # argparse entry point +├── cli.py # argparse entry point (solve / serve subcommands) +├── server.py # long-running HTTP service (stdlib http.server) ├── client.py # HTTP three-phase + JSONP unwrap (scrapling / curl_cffi) ├── exceptions.py # NetworkError, SolveError, PowError, TDCError ├── models.py # pydantic models for prehandle / verify responses @@ -64,10 +72,13 @@ src/crack_tcaptcha/ │ ├── _common.py # run_async, finish_with_verify (shared tail) │ ├── slide.py # NCC template match │ ├── icon_click.py # ddddocr detect + template match -│ ├── word_click.py # ddddocr detect + LLM vision (+ OCR fallback) +│ ├── word_click.py # YOLO detect + Siamese match (local ONNX); ddddocr OCR fallback │ └── image_select.py # LLM region matching ├── solvers/ -│ └── llm_vision.py # OpenAI-compatible vision client +│ ├── ort_provider.py # ONNX Runtime execution-provider selection +│ ├── word_ocr.py # YOLOv8 + Siamese solver for word_click (fast path) +│ ├── llm_vision.py # OpenAI-compatible vision client (image_select only) +│ └── models/ # bundled ONNX models + font.ttf (force-included in wheel) └── tdc/ ├── provider.py # TDCProvider Protocol (DI point) ├── nodejs_jsdom.py # Node.js subprocess implementation @@ -77,6 +88,8 @@ src/crack_tcaptcha/ Dependency direction is strictly top-down: `pipelines/` depends on `solvers/`, `tdc/`, `client.py`, `pow.py`, `trajectory.py`. `solvers/` and `tdc/` are independent of each other and must not import from `pipelines/`. +`server.py` depends on `__init__.solve` and may trigger `solvers/word_ocr.warmup` +at startup — it must not import from `pipelines/` directly. ## 4. Key Conventions @@ -124,9 +137,19 @@ Dependency direction is strictly top-down: `pipelines/` depends on `DynAnswerType_UC`, `elem_id=""`, `data=""`. - **Trajectory jitter.** Ease-in-out cubic with ±1 px jitter currently passes. Perfectly smooth trajectories get detected. -- **LLM retry semantics.** `locate_chars` / `match_region` each retry once +- **LLM retry semantics.** `match_region` (image_select) retries once internally on transport errors. Outer retries are the pipeline's `max_retries` (entire prehandle → verify loop). +- **word_click model files are bundled.** `src/crack_tcaptcha/solvers/models/` + ships `word_click_detector.onnx` (YOLOv8, 10 MB), + `word_click_matcher.onnx` (Siamese, 29 MB), and `font.ttf` (4.6 MB). + These are `force-include`d into the wheel via hatch config. Don't + rename them without updating `word_ocr.py` and `pyproject.toml`. +- **ORT cold-start hides behind warmup.** `crack-tcaptcha solve` spawns a + background thread that calls `solvers.word_ocr.warmup()` while the + first HTTP round-trip is in flight; `crack-tcaptcha serve` warms up at + boot. On macOS, `TCAPTCHA_ORT_BACKEND=cpu` is usually faster than the + default CoreML auto-pick because CoreML pays a one-off graph compile. ## 6. Testing Guidelines @@ -173,11 +196,15 @@ Dependency direction is strictly top-down: `pipelines/` depends on - **Node.js >= 18** for the TDC.js bridge (`tdc/js/tdc_executor.js`, runs `tdc.js` inside jsdom). Install deps with `cd src/crack_tcaptcha/tdc/js && npm install`. -- **`ddddocr`** (optional extra `icon-click`) for icon/character - detection. Required by `icon_click` and `word_click` pipelines. Pulls - in `onnxruntime`. -- **OpenAI-compatible LLM relay** for `word_click` (recommended) and - `image_select` (required). Configure via `TCAPTCHA_LLM_API_KEY`, +- **`ddddocr`** (optional extra `icon-click`, and part of `word-click`) + for icon / character detection. Required by `icon_click` and used as + the `word_click` fallback path. Pulls in `onnxruntime`. +- **`onnxruntime` + `opencv-python-headless`** (optional extra + `word-click`, alongside `ddddocr`). Required for the primary + `word_click` path (local YOLOv8 detector + Siamese matcher shipped + under `solvers/models/`). No external API calls. +- **OpenAI-compatible LLM relay** for `image_select` (required). No + longer required for `word_click`. Configure via `TCAPTCHA_LLM_API_KEY`, `TCAPTCHA_LLM_BASE_URL`, `TCAPTCHA_LLM_MODEL`, `TCAPTCHA_LLM_TIMEOUT` in `.env`. Any `/v1/chat/completions` endpoint that accepts `image_url` content blocks works. diff --git a/CLAUDE.md b/CLAUDE.md index 02da5ee..6a01359 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -65,3 +65,21 @@ after — don't batch updates. against live risk-control signals) - Deleting or renaming files under `tdc/js/` (tdc.js is vendored intentionally) +- Deleting, renaming, or re-quantizing files under + `src/crack_tcaptcha/solvers/models/` (bundled ONNX models + font are + force-included into the wheel; a rename means editing `word_ocr.py` + and `pyproject.toml` in lockstep) + +### word_click / serve mode + +- Primary `word_click` path is local (YOLO detector + Siamese matcher + ONNX models under `solvers/models/`). LLM is no longer required. +- When iterating on `solvers/word_ocr.py`, prefer the serve mode to + avoid per-run ONNX cold-start: + ```bash + uv run crack-tcaptcha serve --port 9991 --workers 2 + # then hit POST /solve repeatedly + ``` +- On macOS, if solve feels slow, check provider selection: CoreML EP + pays a per-process graph-compile cost. Force CPU with + `TCAPTCHA_ORT_BACKEND=cpu` when benchmarking. diff --git a/README.md b/README.md index 6bfb3dd..6d9442e 100644 --- a/README.md +++ b/README.md @@ -5,26 +5,29 @@ ![verify ok](images/word-click-success.png) -> 上图为 word_click(文字点选)流水线的真实运行日志:从 `prehandle` → `getcapbysig` 下载背景图 → LLM 视觉给出点击坐标 → `nodejs_jsdom` 采集 TDC collect / eks / pow → `cap_union_new_verify` 一次通过,`ok=true`。 +> 上图为 word_click(文字点选)流水线的真实运行日志:从 `prehandle` → `getcapbysig` 下载背景图 → **本地 YOLO + Siamese 模型**给出点击坐标 → `nodejs_jsdom` 采集 TDC collect / eks / pow → `cap_union_new_verify` 一次通过,`ok=true`。 ## 特性 - **4 种验证类型**:`slider`(滑块)、`icon_click`(图标点击)、`word_click`(文字点选)、`image_select`(图像选择) - **无头浏览器依赖**:`nodejs_jsdom` 在 Node.js 进程里用 jsdom 跑官方 TDC.js,生成 `collect / eks / tokenid / pow_answer` -- **策略化求解器**:滑块使用 OpenCV 模板匹配;点击类支持 `ddddocr` / 任意 OpenAI 兼容的 LLM vision +- **策略化求解器**:滑块使用 OpenCV 模板匹配;`word_click` 走本地 **YOLOv8 检测 + Siamese 匹配**(纯 ONNX Runtime,单次 ~200 ms);`icon_click` 使用 `ddddocr`;`image_select` 使用 OpenAI 兼容 LLM vision +- **常驻 HTTP 服务**:`crack-tcaptcha serve` 让模型只加载一次,每次求解只付推理时间(零进程冷启动) - **工程化**:pydantic-settings 配置、结构化日志、CLI、pytest,类型完整 ## 当前测试状态 | 类型 | 状态 | 备注 | |---|---|---| -| `word_click`(文字点选) | ✅ 已跑通(见上图) | LLM vision 映射字→bbox,一次通过 | +| `word_click`(文字点选) | ✅ 已跑通(见上图) | 本地 YOLO + Siamese 模型,一次通过 | | `slider`(滑块) | 🧪 未充分验证 | pipeline 已实现,仅做过少量手工测试 | | `icon_click`(图标点击) | 🧪 未充分验证 | pipeline 已实现,依赖 `ddddocr`,待回归 | | `image_select`(图像选择) | 🧪 未充分验证 | pipeline 已实现,待回归 | > 目前项目重点打磨 `word_click`,其它类型欢迎 PR 补测试样本 / 回归用例。 +> 📎 **历史方案**:早期 `word_click` 使用 GPT / OpenAI 兼容 LLM vision 接口识别文字坐标的实现,已保留在 [`legacy-llm-vision`](../../tree/legacy-llm-vision) 分支,供参考或回退使用。主分支已切换为本地 YOLOv8 + Siamese 纯 ONNX 方案,无需任何外部大模型 API。 + ## 安装 ### 按需求选择 @@ -33,26 +36,32 @@ # 最小安装:仅 slider pipeline(HTTP + 轨迹生成,无 ML 依赖) uv add crack-tcaptcha -# 推荐:图标点击 + 文字点选(word_click 也依赖 ddddocr) +# 文字点选(本地 YOLO + Siamese 模型;含 ddddocr 作为 OCR 兜底) +uv add "crack-tcaptcha[word-click]" + +# 图标点击(仅 ddddocr) uv add "crack-tcaptcha[icon-click]" # 中文图像选择(cn-clip / torch,下载模型约数百 MB) uv add "crack-tcaptcha[clip]" -# 全功能一键装(= icon-click + clip) +# 全功能一键装(= word-click + icon-click + clip) uv add "crack-tcaptcha[all]" ``` -也可以用 `pip` 替代 `uv add`,语法一致:`pip install 'crack-tcaptcha[icon-click]'`。 +也可以用 `pip` 替代 `uv add`,语法一致:`pip install 'crack-tcaptcha[word-click]'`。 | Extra | 引入依赖 | 启用的 pipeline | |---|---|---| -| _(none)_ | 仅 httpx / pydantic / numpy / Pillow | `slider` | -| `icon-click` | `ddddocr`(+ onnxruntime) | `icon_click`、`word_click` | +| _(none)_ | httpx / pydantic / numpy / Pillow / scrapling | `slider` | +| `icon-click` | `ddddocr`(+ onnxruntime) | `icon_click` | +| `word-click` | `onnxruntime` + `opencv-python-headless` + `ddddocr` | `word_click`(本地 YOLO + Siamese,OCR 兜底) | | `clip` | `cn2an`、`cn-clip`、`torch` | `image_select`(CLIP backend) | | `all` | 以上全部 | 所有 pipeline | -> 运行 `word_click` / `icon_click` 前未装 `[icon-click]` 会得到清晰的 ModuleNotFoundError 提示。 +> 未装 `[word-click]` / `[icon-click]` 时,对应 pipeline 抛出清晰的 `SolveError` 提示应装哪个 extra。 + +> `word_click` 的本地模型(`word_click_detector.onnx` 10 MB、`word_click_matcher.onnx` 29 MB、`font.ttf` 4.6 MB)已随 wheel 打包,安装后开箱即用,无需额外下载。 ### 前置要求 @@ -83,13 +92,30 @@ if result.ok: ### 命令行 ```bash -# 通用求解:--appid 替换为你自己的 APP_ID +# 一次性求解:--appid 替换为你自己的 APP_ID crack-tcaptcha solve --appid YOUR_APPID --retries 3 --json # 指定来源页(会带上对应 Referer / Origin) crack-tcaptcha solve --appid YOUR_APPID --entry-url https://example.com/login --json ``` +### 常驻 HTTP 服务(推荐用于重复调用) + +一次性 CLI 每次都要冷启动 Python + 加载 ONNX 模型,首次可能要花几秒。常驻模式模型只加载一次,后续请求只付推理时间: + +```bash +# 启动(鉴权可选:导出 TCAPTCHA_SERVE_SK 后客户端需带 X-SK header) +export TCAPTCHA_SERVE_SK=change-me +crack-tcaptcha serve --port 9991 --workers 4 + +# 客户端:POST /solve +curl -H 'X-SK: change-me' -X POST http://127.0.0.1:9991/solve \ + -d '{"appid":"YOUR_APPID","retries":3}' + +# 健康检查 +curl http://127.0.0.1:9991/health +``` + > 命令行示例中的 `YOUR_APPID` 仅为占位符,请替换为你自己的 appid;仓库不提供任何真实业务 appid。 ## 本地测试页 @@ -111,18 +137,24 @@ crack-tcaptcha solve --appid YOUR_APPID --entry-url http://localhost:8765/tcap2_ ``` src/crack_tcaptcha/ -├── client.py # HTTPX 客户端:prehandle / getcapbysig / verify +├── client.py # HTTP 三段式:prehandle / getcapbysig / verify +├── cli.py # argparse 入口(solve / serve 子命令) +├── server.py # 常驻 HTTP 服务 ├── pow.py # PoW 求解 -├── trajectory.py # 轨迹/点击序列合成 +├── trajectory.py # 轨迹 / 点击序列合成 ├── captcha_type.py # 类型分发路由 ├── pipelines/ # 每种验证类型一个 pipeline │ ├── slide.py │ ├── icon_click.py -│ ├── word_click.py # 文字点选(对应截图演示) +│ ├── word_click.py # 本地 YOLO 检测 + Siamese 匹配(含 ddddocr 兜底) │ └── image_select.py -├── solvers/llm_vision.py # OpenAI 兼容 LLM 视觉求解器 +├── solvers/ +│ ├── ort_provider.py # ORT execution-provider 选择(CUDA/ROCm/DML/CoreML/CPU) +│ ├── word_ocr.py # YOLO + Siamese 求解器(word_click 主路径) +│ ├── llm_vision.py # OpenAI 兼容 LLM vision(image_select 用) +│ └── models/ # 打包的 ONNX 模型 + font.ttf └── tdc/ # TDC.js 桥 - ├── js/ # npm install 后放 node_modules + ├── js/ # npm install 后的 node_modules └── nodejs_jsdom.py # jsdom NodeProvider ``` @@ -141,9 +173,18 @@ src/crack_tcaptcha/ | `TCAPTCHA_TDC_TIMEOUT` | `60.0` | TDC.js 桥超时 | | `TCAPTCHA_TDC_DEBUG` | `false` | 打开后保留 jsdom 调试日志 | | `TCAPTCHA_PROXY` | `None` | `http://user:pass@host:port` | -| `TCAPTCHA_LLM_API_KEY` | `""` | LLM vision 求解器(`image_select` / `word_click`) | +| `TCAPTCHA_LLM_API_KEY` | `""` | LLM vision 求解器(仅 `image_select` 需要) | | `TCAPTCHA_LLM_BASE_URL` | `""` | OpenAI 兼容接口根 | | `TCAPTCHA_LLM_MODEL` | `gpt-5.4` | 模型名 | +| `TCAPTCHA_ORT_BACKEND` | `auto` | ONNX 执行后端:`auto` / `cpu` / `cuda` / `rocm` / `dml` / `coreml` | +| `TCAPTCHA_ORT_INTRA_OP_THREADS` | `min(4, cpu_count)` | ORT 线程数(Siamese 在 >4 时反而更慢) | +| `TCAPTCHA_SERVE_SK` | `""` | 常驻服务鉴权 secret;非空时请求必须带 `X-SK` header | +| `TCAPTCHA_SERVE_HOST` | `127.0.0.1` | `serve` 子命令监听地址 | +| `TCAPTCHA_SERVE_PORT` | `9991` | `serve` 子命令监听端口 | +| `TCAPTCHA_SERVE_WORKERS` | `4` | `serve` 并发 solve 上限 | + +> macOS 下若首次求解明显慢,通常是 CoreML 后端的图编译开销; +> 导出 `TCAPTCHA_ORT_BACKEND=cpu` 往往比默认更快。 ## 开发 @@ -154,47 +195,6 @@ uv run pytest -x -ra uv run pytest -m "not network" # 跳过联网用例 ``` -## 推荐 — 用本地模型替换 LLM vision - -当前 `word_click` / `image_select` 走 OpenAI 兼容接口,单次推理 **1~3 s** 起步(受网络、排队、token 数影响),是整条链路里最慢的一步。 -本地模型可以把这一步压到 **≤200 ms**,且无调用成本 / 限流 / 数据出站风险。 - -两类任务本质都是 **"把一张图映射到一个确定的类别 / 索引"**,不需要真正的生成式 VLM: - -### 方案 A:PaddleOCR + 轻量匹配(推荐) - -| 子任务 | 本地替代 | -|---|---| -| `word_click`:识别背景图 3 个 bbox 里各是什么汉字 | **PaddleOCR** (`ch_PP-OCRv4`),单字裁剪后 OCR → 与指令中的字做字符串匹配 | -| `image_select`:在 N 宫格里挑"哪个是苹果" | **PaddleClas PP-LCNet / PP-ShiTu** 或 **cn-clip ViT-B/16**(已列在 `[clip]` extras) | - -优点:CPU 可跑、模型 <20 MB、推理 10~50 ms;PaddleOCR 对中文场景文字鲁棒性很好。 - -### 方案 B:CLIP 类零样本匹配 - -直接复用仓库里已经声明过的 `cn-clip` 依赖: - -```bash -uv add "crack-tcaptcha[clip]" -``` - -- `word_click`:把每个 bbox 裁剪图与 "一张写着'X'字的图" 做 image-text 相似度 argmax(但中文单字 CLIP 准确率一般,建议配合 OCR 投票) -- `image_select`:把指令"请选出所有包含苹果的图片"直接作为 text query,对 N 个格子打分排序,取 top-k - -优点:一个模型吃下所有"图→文"匹配场景;缺点:模型 ~400 MB,冷启动有成本。 - -### 方案 C:ddddocr + 本地分类头(最轻) - -- `icon_click` 已经在用 `ddddocr`;`word_click` 的 bbox 识别也可以换 `ddddocr.DdddOcr(det=False)`(纯 OCR 模式) -- 对 `image_select` 训一个 **PP-LCNet** 分类头(常见类别就那几类:动物、交通工具、食物...)+ "其它"兜底走 CLIP - -### 落地建议 - -1. 在 `solvers/` 下新增 `paddle_ocr.py` 和 `cn_clip.py`,实现与 `llm_vision.py` 同签名(`match_region` / `locate_chars`) -2. 在 `settings.py` 加 `solver_backend: Literal["llm", "paddle", "clip", "ddddocr"] = "llm"` -3. pipeline 启动时根据 backend 路由,保留 LLM 作为兜底(本地模型置信度 < 阈值时回退) -4. 评估指标:单验证码平均耗时、端到端通过率、CPU / 显存占用,基准样本集可用 `tests/samples/` - ## 免责声明 本项目 **仅用于个人安全研究、技术学习与学术交流**,不代表任何商业机构的立场。 diff --git a/docs/architecture.md b/docs/architecture.md index 807fd2e..c4b0737 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -5,6 +5,7 @@ ``` ┌───────────────────────────────────────────────────────────────┐ │ solve(appid, ...) ← crack_tcaptcha/__init__ │ +│ crack-tcaptcha serve ← crack_tcaptcha/server.py │ └──────────────┬────────────────────────────────────────────────┘ │ classify(dyn) → captcha_type ▼ @@ -12,17 +13,25 @@ │ pipelines/ │ │ ├─ slide.py ── OpenCV NCC │ │ ├─ icon_click.py ── ddddocr │ -│ ├─ word_click.py ── ddddocr + llm_vision │ +│ ├─ word_click.py ── word_ocr (YOLO + Siamese) │ +│ │ (回退: ddddocr) │ │ ├─ image_select.py ── llm_vision │ │ └─ _common.py ── finish_with_verify / run_async │ └──────────────┬────────────────────────────────────────────────┘ │ uses uses ▼ ▼ - ┌──────────────────────┐ ┌──────────────────────┐ - │ solvers/ │ │ tdc/ │ - │ └─ llm_vision.py │ │ ├─ provider.py │ - │ (OpenAI 兼容) │ │ └─ nodejs_jsdom.py │ - └──────────────────────┘ └──────────────────────┘ + ┌──────────────────────────────┐ ┌──────────────────────┐ + │ solvers/ │ │ tdc/ │ + │ ├─ ort_provider.py │ │ ├─ provider.py │ + │ ├─ word_ocr.py │ │ └─ nodejs_jsdom.py │ + │ │ (YOLO + Siamese ONNX) │ │ │ + │ ├─ llm_vision.py │ │ │ + │ │ (OpenAI 兼容) │ │ │ + │ └─ models/ │ │ │ + │ word_click_detector.onnx│ │ │ + │ word_click_matcher.onnx │ │ │ + │ font.ttf │ │ │ + └──────────────────────────────┘ └──────────────────────┘ │ │ └──────────┬─────────────────────────────┘ ▼ @@ -35,7 +44,7 @@ └──────────────────────────────────────────┘ ``` -依赖方向严格自上而下:`pipelines` 依赖 `solvers`、`tdc`、`client`;`solvers` 和 `tdc` 互不依赖。 +依赖方向严格自上而下:`pipelines` 依赖 `solvers`、`tdc`、`client`;`solvers` 和 `tdc` 互不依赖。`server.py` 只依赖 `__init__.solve` 与 `solvers/word_ocr.warmup`,不直接导入 `pipelines/`。 ## 三段式协议 @@ -78,7 +87,7 @@ POST 字段:`ans`、`pow_answer`、`pow_calc_time`、`collect`、`tlg`、`eks` |---|---|---|---| | `slide` | `solvers`(内嵌 NCC,见 `pipelines/slide.py` 的 `SliderSolver`) | —— | `numpy`、`Pillow` | | `icon_click` | `ddddocr` 检测 + 模板匹配 | —— | `ddddocr`(extra `icon-click`) | -| `word_click` | `solvers/llm_vision.locate_chars` | `ddddocr` 按 bbox 分类 + 子串匹配 | `ddddocr` + OpenAI 兼容 API | +| `word_click` | `solvers/word_ocr.locate_chars_by_siamese`(YOLO + Siamese ONNX,本地) | `ddddocr` 检测 + 分类兜底 | `onnxruntime` + `opencv-python-headless` + `ddddocr`(extra `word-click`) | | `image_select` | `solvers/llm_vision.match_region` | —— | OpenAI 兼容 API | 自动路由:`captcha_type.classify(dyn_show_info)` 是一个纯函数分类器,按规则顺序返回 `slide` / `icon_click` / `word_click` / `image_select` / `unknown`。规则命中后 `pipelines.dispatch` 将请求分发到对应的 pipeline。 diff --git a/docs/index.md b/docs/index.md index d5e25e4..a0885a1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,7 +8,7 @@ |---|---|---| | 滑块 `slider` | OpenCV NCC 模板匹配 | [slider.md](slider.md) | | 图标点击 `icon_click` | ddddocr 检测 + 模板匹配 | [icon-click.md](icon-click.md) | -| 文字点选 `word_click` | ddddocr 检测 + LLM vision | [word-click.md](word-click.md) | +| 文字点选 `word_click` | 本地 YOLO + Siamese(ddddocr 兜底) | [word-click.md](word-click.md) | | 图像选择 `image_select` | LLM vision | [image-select.md](image-select.md) | ## 快速导航 diff --git a/docs/word-click.md b/docs/word-click.md index 2c8406c..216d642 100644 --- a/docs/word-click.md +++ b/docs/word-click.md @@ -6,33 +6,71 @@ TCaptcha 2.0 的文字点选挑战。提示词(目标汉字序列)内嵌在 1. `prehandle` 返回 `instruction`(含目标汉字)、背景图 URL、`pow_cfg`;`fg_elem_list` 为空 2. 下载背景图(`bg_elem_cfg.size_2d = [672, 480]`) -3. `ddddocr` 在背景上检测候选汉字 bbox -4. **主路径:LLM vision** —— 给每个候选 bbox 画红框 + 编号(1..N),和目标汉字一起发给 LLM,要求返回 `{char: box_index}` JSON -5. **回退路径:ddddocr OCR 分类** —— 对 LLM 没映射上的汉字,逐 bbox 跑 `ddddocr.classification` 做子串匹配 -6. 按提示顺序把每个汉字映射到 bbox 中心坐标 -7. 生成点击轨迹,通过 TDC 桥取 `collect` / `eks` -8. POST `verify` +3. **主路径:YOLOv8 检测 + Siamese 匹配(本地 ONNX,纯 CPU 约 200 ms)** + 1. `word_click_detector.onnx`(YOLOv8)在背景图上定位所有字符 bbox + 2. 用打包在仓库里的 `font.ttf` 把 `instruction` 里每个目标汉字渲染成 52×52 参考图 + 3. 对每一对 `(bbox 裁剪图, 目标参考图)` 跑 `word_click_matcher.onnx`(Siamese)得相似度 + 4. 对每个目标按指令顺序贪心取当前未使用的最高分 bbox +4. **回退路径:ddddocr**(仅在 YOLO 检测结果为 0、或未安装 `[word-click]` extra 时走) —— `detection + classification` 子串匹配 +5. 按提示顺序把每个汉字映射到 bbox 中心坐标 +6. 生成点击轨迹,通过 TDC 桥取 `collect` / `eks` +7. POST `verify` + +详见 `src/crack_tcaptcha/pipelines/word_click.py` 与 `src/crack_tcaptcha/solvers/word_ocr.py`。 + +## 模型与依赖 + +装 `word-click` extra: + +```bash +uv sync --extra word-click +# 或 +pip install 'crack-tcaptcha[word-click]' +``` + +这会安装: -详见 `src/crack_tcaptcha/pipelines/word_click.py`。 +| 包 | 作用 | +|---|---| +| `onnxruntime` | 跑 YOLO + Siamese ONNX | +| `opencv-python-headless` | 图像预处理(letterbox、裁剪、colorspace) | +| `ddddocr` | 回退路径(YOLO 检测失败时兜底) | -## LLM 配置 +模型文件已通过 hatch `force-include` 打包进 wheel,开箱即用: -求解器实现在 `src/crack_tcaptcha/solvers/llm_vision.py`(`locate_chars` 函数),走 OpenAI 兼容的 `/v1/chat/completions` 接口。 +| 文件 | 大小 | 作用 | +|---|---|---| +| `solvers/models/word_click_detector.onnx` | 10 MB | YOLOv8 检测器,输出字符 bbox | +| `solvers/models/word_click_matcher.onnx` | 29 MB | Siamese 相似度网络(输入 52×52×3×2,输出 1 个相似度) | +| `solvers/models/font.ttf` | 4.6 MB | 目标字渲染字体(颜色 BGR `(56,178,227)` 模拟验证码配色) | -需要的环境变量(pydantic-settings 自动读 `.env` 或环境变量): +## 性能调优 -| 环境变量 | 说明 | 默认值 | +ONNX Runtime 执行后端通过环境变量选择,默认 `auto` 优先挑 `CUDA > ROCm > DML > CoreML > CPU`: + +| 环境变量 | 默认 | 说明 | |---|---|---| -| `TCAPTCHA_LLM_API_KEY` | Bearer token | `""`(未配置时 LLM 路径会跳过) | -| `TCAPTCHA_LLM_BASE_URL` | 不含 `/v1/...` 的基础 URL | `""` | -| `TCAPTCHA_LLM_MODEL` | 模型名 | `gpt-5.4` | -| `TCAPTCHA_LLM_TIMEOUT` | HTTP 超时秒数 | `30` | +| `TCAPTCHA_ORT_BACKEND` | `auto` | 强制后端:`cpu` / `cuda` / `rocm` / `dml` / `coreml` / `auto` | +| `TCAPTCHA_ORT_INTRA_OP_THREADS` | `min(4, cpu_count)` | ORT 线程数;Siamese 太小,>4 反而更慢 | + +**macOS 提示**:默认会选 CoreML EP,首次推理需要一次性的 graph-compile 开销(几秒),总耗时可能比纯 CPU 还高。重复调用请用常驻服务模式,或 `export TCAPTCHA_ORT_BACKEND=cpu` 固定 CPU。 -**支持的后端**:任何 OpenAI 兼容的中继服务(官方 OpenAI、Azure OpenAI 代理、自建 vLLM / llama.cpp 的 OpenAI shim 等)。请求体用 `image_url` 传 base64 图片,返回必须是能解析出 JSON 对象的文本。 +### 常驻服务模式 + +一次性 CLI 每次都付 Python 启动 + ONNX 加载的冷启动成本。生产 / 重复调用请用 `serve` 子命令: + +```bash +# 启动(模型在进程启动时只加载一次) +TCAPTCHA_SERVE_SK=change-me crack-tcaptcha serve --port 9991 --workers 4 + +# 客户端 +curl -H 'X-SK: change-me' -X POST http://127.0.0.1:9991/solve \ + -d '{"appid":"YOUR_APPID","retries":3}' +``` -**Prompt 位置**:`_build_word_click_prompt()` 和 `_build_prompt()` 函数在 `solvers/llm_vision.py` 顶部,prompt 硬编码在 Python 中,方便直接修改;调整后不需要改 pipeline。 +## LLM 不再需要 -如果没有配置 LLM(key / base_url 任一为空),pipeline 会走纯 ddddocr OCR 回退;经验上 ddddocr 对验证码字体识别率明显低于 vision LLM,推荐配置 LLM。 +之前版本主路径走 OpenAI 兼容 `/v1/chat/completions`;本地模型 + ddddocr 兜底足够覆盖所有已知 word_click 样本,LLM 相关环境变量(`TCAPTCHA_LLM_*`)只对 `image_select` 还有意义,可留空。 ## Answer 格式 @@ -48,7 +86,9 @@ TCaptcha 2.0 的文字点选挑战。提示词(目标汉字序列)内嵌在 ## 常见坑 -- **LLM 返回非 JSON** —— `solvers/llm_vision.py` 内部会做一次重试;仍失败则抛 `SolveError`,由 pipeline 走 ddddocr 回退 -- **bbox 数量少于目标字数** —— 说明 detection 漏检,记录 warning 后仍会尽力回答,但通过率会下降 - **提示词里混入标点或空格** —— `_parse_target_chars` 用 `[\u4e00-\u9fff]` 正则只抓 CJK 汉字 -- **最后兜底** —— 如果所有方法都没给某个字找到 box,会把它分配到第一个未被使用的 bbox,至少保证一次可见的点击(而不是 `(0,0)` 的显式错误) +- **bbox 数量少于目标字数** —— YOLO 漏检,pipeline 会记 warning 后仍尽力回答;验证失败时外层 `max_retries` 会重拉一次 prehandle +- **YOLO 返回 0 bbox** —— 自动走 ddddocr detect + classification 兜底 +- **CoreML EP 首次慢** —— 见上节;`serve` 模式或 `TCAPTCHA_ORT_BACKEND=cpu` 解决 +- **模型文件被安全软件误删** —— 重装 `[word-click]` extra 即可;模型在 wheel 里 +- **最后兜底** —— 如果所有方法都没给某个字找到 bbox(极少见),会把它分配到第一个未被使用的 bbox,至少保证一次可见的点击(而不是 `(0,0)` 的显式错误) diff --git a/pyproject.toml b/pyproject.toml index 599c5c8..19d1514 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,13 +24,22 @@ dependencies = [ [project.optional-dependencies] icon-click = ["ddddocr>=1.4"] +word-click = [ + "onnxruntime>=1.17", + "opencv-python-headless>=4.8", + "ddddocr>=1.4", +] clip = ["cn2an>=0.5", "cn-clip>=1.0", "torch>=2.0"] all = [ "ddddocr>=1.4", + "onnxruntime>=1.17", + "opencv-python-headless>=4.8", "cn2an>=0.5", "cn-clip>=1.0", "torch>=2.0", ] + +[dependency-groups] dev = [ "pytest>=8.0", "pytest-asyncio>=0.23", @@ -53,6 +62,18 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["src/crack_tcaptcha"] +[tool.hatch.build.targets.wheel.force-include] +"src/crack_tcaptcha/solvers/models/word_click_detector.onnx" = "crack_tcaptcha/solvers/models/word_click_detector.onnx" +"src/crack_tcaptcha/solvers/models/word_click_matcher.onnx" = "crack_tcaptcha/solvers/models/word_click_matcher.onnx" +"src/crack_tcaptcha/solvers/models/font.ttf" = "crack_tcaptcha/solvers/models/font.ttf" + +[tool.hatch.build.targets.sdist] +include = [ + "src/crack_tcaptcha/**", + "src/crack_tcaptcha/solvers/models/*.onnx", + "src/crack_tcaptcha/solvers/models/*.ttf", +] + [tool.ruff] line-length = 130 target-version = "py310" diff --git a/src/crack_tcaptcha/cli.py b/src/crack_tcaptcha/cli.py index 2227842..4d43018 100644 --- a/src/crack_tcaptcha/cli.py +++ b/src/crack_tcaptcha/cli.py @@ -6,6 +6,21 @@ import json import logging import sys +import threading + + +def _warmup_word_click() -> None: + """Best-effort: preload YOLO + Siamese sessions in background. + + Runs concurrently with the first prehandle HTTP request so the model + load latency is hidden behind network wait time. + """ + try: + from crack_tcaptcha.solvers.word_ocr import warmup + + warmup() + except Exception: # word-click extra not installed — that's fine + pass def main(argv: list[str] | None = None) -> None: @@ -18,18 +33,49 @@ def main(argv: list[str] | None = None) -> None: parser = argparse.ArgumentParser(prog="crack-tcaptcha", description="TCaptcha automated solver") sub = parser.add_subparsers(dest="command") - solve_p = sub.add_parser("solve", help="Solve a TCaptcha challenge") + solve_p = sub.add_parser("solve", help="Solve a TCaptcha challenge (one-shot)") solve_p.add_argument("--appid", required=True, help="TCaptcha APP_ID") solve_p.add_argument("--retries", type=int, default=3, help="Max retry attempts") solve_p.add_argument("--entry-url", default="", help="Parent page URL (optional)") solve_p.add_argument("--json", action="store_true", dest="as_json", help="Output as JSON") + solve_p.add_argument( + "--no-warmup", + action="store_true", + help="Skip background ONNX model warmup (useful for benchmarking cold-start)", + ) + + serve_p = sub.add_parser( + "serve", + help="Run a long-lived HTTP server (models load once; best for repeated use)", + ) + serve_p.add_argument("--host", default="127.0.0.1") + serve_p.add_argument("--port", type=int, default=9991) + serve_p.add_argument("--workers", type=int, default=4, help="Max concurrent solves") args = parser.parse_args(argv) + if args.command == "serve": + import os as _os + + from crack_tcaptcha.server import run as serve_run + + serve_run( + host=args.host, + port=args.port, + workers=args.workers, + sk=_os.environ.get("TCAPTCHA_SERVE_SK") or None, + ) + return + if args.command != "solve": parser.print_help() sys.exit(1) + # Kick off model warmup in a daemon thread so it overlaps with the + # first HTTP round-trip. + if not args.no_warmup: + threading.Thread(target=_warmup_word_click, name="word_click-warmup", daemon=True).start() + from crack_tcaptcha import solve result = solve(appid=args.appid, max_retries=args.retries, entry_url=args.entry_url) diff --git a/src/crack_tcaptcha/pipelines/word_click.py b/src/crack_tcaptcha/pipelines/word_click.py index cc113bc..44d299c 100644 --- a/src/crack_tcaptcha/pipelines/word_click.py +++ b/src/crack_tcaptcha/pipelines/word_click.py @@ -6,11 +6,13 @@ - fg_elem_list: [] (hint chars are inline in instruction text) - bg_elem_cfg.size_2d: [672, 480] -Solver pipeline: - 1. ddddocr detection → candidate char bboxes on bg - 2. LLM vision (primary): sends annotated bg + target chars → gets {char: bbox_idx} - 3. ddddocr OCR (fallback): if LLM unavailable or returns 0/miss for a char, - run per-bbox ddddocr classification and substring-match. +Solver pipeline (primary path, local, ~50-200ms CPU): + 1. YOLOv8 detection → candidate char bboxes on bg + 2. Siamese similarity: render each target char with the bundled font + and match it against every bbox crop; pick highest-scoring unused bbox. + +Fallback (when the siamese extra is not installed or the YOLO stage +returns 0 bboxes): the legacy ddddocr detection + per-bbox OCR path. Answer format (confirmed against real Chrome verify body): [{"elem_id": 1, "type": "DynAnswerType_POS", "data": "x,y"}, ...] @@ -27,7 +29,6 @@ from crack_tcaptcha.models import PrehandleResp, VerifyResp from crack_tcaptcha.pipelines._common import finish_with_verify from crack_tcaptcha.pow import solve_pow -from crack_tcaptcha.settings import settings from crack_tcaptcha.tdc.provider import TDCProvider from crack_tcaptcha.trajectory import generate_click_trajectory, merge_trajectories @@ -40,71 +41,16 @@ def _parse_target_chars(instruction: str) -> list[str]: return re.findall(r"[\u4e00-\u9fff]", after) -def _bbox_center(bbox: tuple[int, int, int, int]) -> tuple[int, int]: - x1, y1, x2, y2 = bbox - return ((x1 + x2) // 2, (y1 + y2) // 2) - - -def _fallback_match_by_ocr( - bg_bytes: bytes, - bboxes: list[tuple[int, int, int, int]], - targets: list[str], - already_assigned: dict[str, int], -) -> dict[str, int]: - """Per-bbox ddddocr classify + substring match for chars LLM missed.""" - import io - - from PIL import Image - - from crack_tcaptcha._legacy.icon_match import _get_ocr - - missing = [ch for ch in targets if already_assigned.get(ch, 0) <= 0] - if not missing: - return already_assigned - - ocr = _get_ocr() - bg_img = Image.open(io.BytesIO(bg_bytes)).convert("RGB") - bg_w, bg_h = bg_img.size - used_indices = {v for v in already_assigned.values() if v > 0} - bbox_ocr: dict[int, str] = {} - for i, (x1, y1, x2, y2) in enumerate(bboxes, start=1): - if i in used_indices: - continue - pad = 2 - crop = bg_img.crop((max(0, x1 - pad), max(0, y1 - pad), min(bg_w, x2 + pad), min(bg_h, y2 + pad))) - buf = io.BytesIO() - crop.save(buf, "PNG") - try: - text = ocr.classification(buf.getvalue()) or "" - except Exception as e: # pragma: no cover - defensive - log.warning("word_click fallback ocr error on bbox %d: %s", i, e) - text = "" - text = re.sub(r"[^\u4e00-\u9fff]", "", text) - bbox_ocr[i] = text - log.info("word_click fallback ocr on %d unused bboxes: %s", len(bbox_ocr), bbox_ocr) - - result = dict(already_assigned) - for ch in missing: - for i, text in bbox_ocr.items(): - if i in used_indices: - continue - if ch in text: - result[ch] = i - used_indices.add(i) - log.info("word_click fallback: %r → bbox %d via ocr=%r", ch, i, text) - break - # Final fallback: assign any remaining char to first unused bbox (visible click - # better than (0,0) which is guaranteed wrong). - for ch in targets: - if result.get(ch, 0) > 0: - continue - for i in range(1, len(bboxes) + 1): - if i not in used_indices: - result[ch] = i - used_indices.add(i) - log.info("word_click fallback: %r → bbox %d (last-resort)", ch, i) - break - return result +def _fallback_ddddocr(bg_bytes: bytes, targets: list[str]) -> list[tuple[int, int]]: + """Legacy ddddocr detect + per-bbox OCR fallback. + + Only used if the primary siamese path is unavailable (onnxruntime/cv2 not + installed, model files missing, or YOLO produces no bboxes). + """ + from crack_tcaptcha._legacy.icon_match import match_words + + log.info("word_click: falling back to ddddocr match_words") + return match_words(bg_bytes, targets) def solve_one_attempt( @@ -125,48 +71,16 @@ def solve_one_attempt( len(bg_bytes), ) + # Primary path: local YOLO + Siamese (fast, no network, no API cost) + click_coords: list[tuple[int, int]] try: - from crack_tcaptcha._legacy.icon_match import detect_icons - except ImportError as e: - raise SolveError("word_click requires ddddocr: `uv sync --extra icon-click`") from e - - bboxes = detect_icons(bg_bytes) - if len(bboxes) < len(targets): - log.warning( - "word_click: only %d bboxes detected for %d targets", - len(bboxes), - len(targets), - ) - if not bboxes: - raise SolveError("word_click: detector returned 0 bboxes") - log.info("word_click detection: %d bboxes=%s", len(bboxes), bboxes) - - # Primary: LLM vision (more reliable than ddddocr OCR on captcha fonts) - char_to_box: dict[str, int] = {} - llm_ok = bool(settings.llm_api_key and settings.llm_base_url) - if llm_ok: - try: - from crack_tcaptcha.solvers.llm_vision import locate_chars - - char_to_box = locate_chars(bg_bytes, targets=targets, bboxes=bboxes) - except SolveError as e: - log.warning("word_click: LLM locate_chars failed, falling back: %s", e) - else: - log.info("word_click: LLM not configured, using ddddocr fallback only") - - # Fallback for any char LLM returned 0 / miss - char_to_box = _fallback_match_by_ocr(bg_bytes, bboxes, targets, char_to_box) - - click_coords: list[tuple[int, int]] = [] - for ch in targets: - idx = char_to_box.get(ch, 0) - if 1 <= idx <= len(bboxes): - click_coords.append(_bbox_center(bboxes[idx - 1])) - else: - # Should not happen after fallback, but be safe. - cx, cy = _bbox_center(bboxes[0]) - click_coords.append((cx, cy)) - log.warning("word_click: char %r unresolved, using bbox 1", ch) + from crack_tcaptcha.solvers.word_ocr import locate_chars_by_siamese + + click_coords = locate_chars_by_siamese(bg_bytes, targets) + except SolveError as e: + log.warning("word_click siamese path failed: %s — using ddddocr fallback", e) + click_coords = _fallback_ddddocr(bg_bytes, targets) + log.info("word_click click_coords=%s for targets=%s", click_coords, targets) pow_answer, pow_calc_time = solve_pow( diff --git a/src/crack_tcaptcha/server.py b/src/crack_tcaptcha/server.py new file mode 100644 index 0000000..71261bf --- /dev/null +++ b/src/crack_tcaptcha/server.py @@ -0,0 +1,220 @@ +"""Long-running HTTP server for crack-tcaptcha. + +Exposes ``POST /solve`` that wraps :func:`crack_tcaptcha.solve`. Models +load **once** at startup (warmup), so every request pays only the +inference cost — no process cold-start, no ONNX reload. This is the +recommended mode for any non-one-shot use (scripts hammering captchas, +bench, integrations, …). + +The server uses stdlib ``http.server`` so we don't need to pull +``fastapi``/``uvicorn`` into the default dependency set. + +Endpoints:: + + GET /health → {"status": "ok", "providers": [...]} + POST /solve → request body: {"appid": "...", "retries": 3, "entry_url": ""} + → response: SolveResult model as JSON + +Authentication (optional): set ``TCAPTCHA_SERVE_SK`` and every request +must send it in the ``X-SK`` header. + +Concurrency: the server spawns a small worker thread pool so independent +solves run in parallel. ONNX sessions are thread-safe when created with +``intra_op_num_threads>=1`` (the default), which we do. +""" + +from __future__ import annotations + +import json +import logging +import os +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from typing import Any + +log = logging.getLogger(__name__) + +_DEFAULT_HOST = "127.0.0.1" +_DEFAULT_PORT = 9991 +_DEFAULT_WORKERS = 4 + + +def _warmup_all() -> list[str]: + """Preload all ONNX sessions and run one dummy inference each. + + Returns a list of human-readable provider strings for /health. + """ + providers: list[str] = [] + try: + from crack_tcaptcha.solvers.word_ocr import ( + _get_siamese_session, + _get_yolo_session, + warmup, + ) + + warmup() + yolo = _get_yolo_session() + siamese = _get_siamese_session() + providers.append(f"yolo={yolo.get_providers()}") + providers.append(f"siamese={siamese.get_providers()}") + except Exception as e: + log.warning("serve: word_click warmup failed (%s) — fallbacks still work", e) + return providers + + +class _State: + """Shared server state: one executor + cached warmup info.""" + + def __init__(self, workers: int, sk: str | None) -> None: + self.executor = ThreadPoolExecutor(max_workers=workers, thread_name_prefix="solve") + self.sk = sk + self.providers: list[str] = [] + self.started_at = time.time() + + +class _Handler(BaseHTTPRequestHandler): + # class-level; populated by run() + state: _State = None # type: ignore[assignment] + + def log_message(self, format: str, *args: Any) -> None: # noqa: A002 — stdlib signature + log.info("%s - %s", self.address_string(), format % args) + + # ---- response helpers ------------------------------------------------ + + def _send_json(self, code: int, payload: dict[str, Any]) -> None: + body = json.dumps(payload, ensure_ascii=False).encode("utf-8") + self.send_response(code) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _check_auth(self) -> bool: + if not self.state.sk: + return True + if self.headers.get("X-SK") == self.state.sk: + return True + self._send_json(401, {"status": "error", "msg": "unauthorized"}) + return False + + # ---- routes ---------------------------------------------------------- + + def do_GET(self) -> None: # noqa: N802 (stdlib convention) + if self.path == "/health": + self._send_json( + 200, + { + "status": "ok", + "providers": self.state.providers, + "uptime_s": round(time.time() - self.state.started_at, 1), + }, + ) + return + self._send_json(404, {"status": "error", "msg": "not found"}) + + def do_POST(self) -> None: # noqa: N802 + if self.path != "/solve": + self._send_json(404, {"status": "error", "msg": "not found"}) + return + if not self._check_auth(): + return + + length = int(self.headers.get("Content-Length", "0") or 0) + raw = self.rfile.read(length) if length > 0 else b"" + try: + body = json.loads(raw.decode("utf-8")) if raw else {} + except json.JSONDecodeError as e: + self._send_json(400, {"status": "error", "msg": f"invalid json: {e}"}) + return + + appid = body.get("appid") or body.get("app_id") + if not appid: + self._send_json(400, {"status": "error", "msg": "missing appid"}) + return + retries = int(body.get("retries", body.get("max_retries", 3))) + entry_url = body.get("entry_url", "") + + from crack_tcaptcha import solve + + # Run in the executor so concurrent /solve requests don't block each + # other. The HTTP server is already ThreadingHTTPServer, so this is + # actually just enforcing a bounded concurrency. + fut = self.state.executor.submit(solve, appid=str(appid), max_retries=retries, entry_url=entry_url) + t0 = time.time() + try: + result = fut.result() + except Exception as e: # pragma: no cover - defensive + log.exception("solve crashed") + self._send_json(500, {"status": "error", "msg": str(e)}) + return + cost = round(time.time() - t0, 3) + payload = result.model_dump() + payload["_cost_s"] = cost + self._send_json(200, payload) + + +def run(host: str, port: int, workers: int, sk: str | None) -> None: + """Start the server. Blocks until Ctrl-C.""" + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s %(name)s: %(message)s", + datefmt="%H:%M:%S", + ) + + log.info("crack-tcaptcha serve: warming up models...") + providers = _warmup_all() + for p in providers: + log.info(" %s", p) + + state = _State(workers=workers, sk=sk) + state.providers = providers + _Handler.state = state + + server = ThreadingHTTPServer((host, port), _Handler) + log.info("listening on http://%s:%d (workers=%d, auth=%s)", host, port, workers, "on" if sk else "off") + + stop_evt = threading.Event() + + def _shutdown(*_: Any) -> None: + log.info("shutting down...") + stop_evt.set() + # serve_forever blocks in the main thread; use shutdown() from another + # thread to break it out. + threading.Thread(target=server.shutdown, daemon=True).start() + + import signal + + signal.signal(signal.SIGINT, _shutdown) + signal.signal(signal.SIGTERM, _shutdown) + + try: + server.serve_forever() + finally: + state.executor.shutdown(wait=True, cancel_futures=True) + server.server_close() + log.info("stopped") + + +def main(argv: list[str] | None = None) -> None: + """Entry point registered as `crack-tcaptcha serve` subcommand.""" + import argparse + + parser = argparse.ArgumentParser(prog="crack-tcaptcha serve", description="Long-running solver HTTP service") + parser.add_argument("--host", default=os.environ.get("TCAPTCHA_SERVE_HOST", _DEFAULT_HOST)) + parser.add_argument("--port", type=int, default=int(os.environ.get("TCAPTCHA_SERVE_PORT", _DEFAULT_PORT))) + parser.add_argument( + "--workers", + type=int, + default=int(os.environ.get("TCAPTCHA_SERVE_WORKERS", _DEFAULT_WORKERS)), + help="Max concurrent solves", + ) + args = parser.parse_args(argv) + + sk = os.environ.get("TCAPTCHA_SERVE_SK") or None + run(host=args.host, port=args.port, workers=args.workers, sk=sk) + + +if __name__ == "__main__": + main() diff --git a/src/crack_tcaptcha/solvers/models/font.ttf b/src/crack_tcaptcha/solvers/models/font.ttf new file mode 100644 index 0000000..fc3ace0 --- /dev/null +++ b/src/crack_tcaptcha/solvers/models/font.ttf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea1d8f8aed5cee8cfa964b452c2085a1135acc02eb61c963f2f2b45bfbf4dbc +size 4825100 diff --git a/src/crack_tcaptcha/solvers/models/word_click_detector.onnx b/src/crack_tcaptcha/solvers/models/word_click_detector.onnx new file mode 100644 index 0000000..58b39c8 --- /dev/null +++ b/src/crack_tcaptcha/solvers/models/word_click_detector.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3d606a40ca327b9f5cff5c9cc6dc924c1c5da1b75a4050c358a84782e682e82 +size 10457464 diff --git a/src/crack_tcaptcha/solvers/models/word_click_matcher.onnx b/src/crack_tcaptcha/solvers/models/word_click_matcher.onnx new file mode 100644 index 0000000..4664414 --- /dev/null +++ b/src/crack_tcaptcha/solvers/models/word_click_matcher.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f48e175f8bb8d0fcadba8d22a263dcb429c87f659acd56542b1bdc3d8dab9e2 +size 29974422 diff --git a/src/crack_tcaptcha/solvers/ort_provider.py b/src/crack_tcaptcha/solvers/ort_provider.py new file mode 100644 index 0000000..b214117 --- /dev/null +++ b/src/crack_tcaptcha/solvers/ort_provider.py @@ -0,0 +1,60 @@ +"""ONNX Runtime execution-provider selection. + +Honours the ``TCAPTCHA_ORT_BACKEND`` env var (``cuda`` / ``rocm`` / ``dml`` / +``coreml`` / ``cpu`` / ``auto``, default ``auto``). Falls back to CPU when the +requested backend is unavailable. +""" + +from __future__ import annotations + +import logging +import os + +log = logging.getLogger(__name__) + +_BACKEND_MAP = { + "cuda": "CUDAExecutionProvider", + "rocm": "ROCMExecutionProvider", + "dml": "DmlExecutionProvider", + "coreml": "CoreMLExecutionProvider", + "cpu": "CPUExecutionProvider", +} + +_AUTO_PRIORITY = ( + "CUDAExecutionProvider", + "ROCMExecutionProvider", + "DmlExecutionProvider", + "CoreMLExecutionProvider", + "CPUExecutionProvider", +) + + +def resolve_providers() -> list[str]: + """Return an ORT providers list, always terminated by CPU as a fallback.""" + import onnxruntime as ort + + available = set(ort.get_available_providers()) + pref = os.environ.get("TCAPTCHA_ORT_BACKEND", "auto").strip().lower() + + wanted = _BACKEND_MAP.get(pref) + if wanted: + if wanted in available: + if wanted == "CPUExecutionProvider": + return [wanted] + return [wanted, "CPUExecutionProvider"] + log.warning( + "TCAPTCHA_ORT_BACKEND=%s requested %s, not available (%s) — falling back to auto", + pref, + wanted, + sorted(available), + ) + + for p in _AUTO_PRIORITY: + if p in available: + if p == "CPUExecutionProvider": + return [p] + return [p, "CPUExecutionProvider"] + return ["CPUExecutionProvider"] + + +__all__ = ["resolve_providers"] diff --git a/src/crack_tcaptcha/solvers/word_ocr.py b/src/crack_tcaptcha/solvers/word_ocr.py new file mode 100644 index 0000000..a4fec87 --- /dev/null +++ b/src/crack_tcaptcha/solvers/word_ocr.py @@ -0,0 +1,405 @@ +"""word-click solver: YOLOv8 detection + Siamese similarity matching. + +Replaces the slow LLM-vision path for ``word_click``. Uses two ONNX models +shipped with the package: + +* ``yolo_word.onnx`` — YOLOv8 detector, finds candidate character bboxes on + the bg image. +* ``siamese_word.onnx`` — Siamese network, takes two 52×52 RGB crops and + returns a similarity score. + +Target characters are rendered with the bundled ``font.ttf`` into 52×52 +reference images and compared against every detected bbox crop. The +highest-scoring unused bbox is picked for each target, in order. + +Performance notes: + +* Sessions are module-level singletons (one load per Python process). +* ``SessionOptions`` enables all graph optimisations and pins thread count + to a sensible default. +* Siamese inference is **batched**: for each target char we stack all + candidate crops into one call (N forward passes → 1 call), which is the + main win over a naïve loop. +* ``warmup()`` can be called at startup to amortise first-inference cost + (CoreML / CUDA graph compile). +""" + +from __future__ import annotations + +import logging +import os +import threading +from pathlib import Path + +import numpy as np + +from crack_tcaptcha.exceptions import SolveError +from crack_tcaptcha.solvers.ort_provider import resolve_providers + +log = logging.getLogger(__name__) + +_MODEL_DIR = Path(__file__).resolve().parent / "models" +_YOLO_PATH = _MODEL_DIR / "word_click_detector.onnx" +_SIAMESE_PATH = _MODEL_DIR / "word_click_matcher.onnx" +_FONT_PATH = _MODEL_DIR / "font.ttf" + +_YOLO_CONFIDENCE = 0.5 +_YOLO_IOU = 0.7 +_SIAMESE_INPUT = (52, 52) +_CHAR_RENDER_SIZE = 52 +_CHAR_RENDER_FONT_SIZE = 40 +_CHAR_RENDER_COLOR = (227, 178, 56) # BGR order when fed to siamese + + +# --- lazy model singletons --------------------------------------------------- + +_yolo_lock = threading.Lock() +_siamese_lock = threading.Lock() +_yolo_session = None +_siamese_session = None +_siamese_input_names: tuple[str, str] | None = None +_siamese_batch_supported: bool | None = None + + +def _import_onnx(): + try: + import onnxruntime # noqa: F401 + except ImportError as e: # pragma: no cover - import guard + raise SolveError("word_click requires onnxruntime: `uv sync --extra word-click`") from e + + +def _import_cv2(): + try: + import cv2 + except ImportError as e: # pragma: no cover - import guard + raise SolveError("word_click requires opencv-python-headless: `uv sync --extra word-click`") from e + return cv2 + + +def _make_session_options(): + """Build a tuned SessionOptions instance.""" + import onnxruntime as ort + + so = ort.SessionOptions() + so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + # The siamese net is tiny (52×52 input) — beyond ~4 intra-op threads we + # measurably regress from scheduling overhead. Cap at 4 by default; + # override via TCAPTCHA_ORT_INTRA_OP_THREADS for specific hardware. + env_threads = os.environ.get("TCAPTCHA_ORT_INTRA_OP_THREADS") + if env_threads and env_threads.isdigit() and int(env_threads) > 0: + threads = int(env_threads) + else: + try: + threads = min(4, os.cpu_count() or 4) + except Exception: + threads = 4 + so.intra_op_num_threads = max(1, threads) + so.inter_op_num_threads = 1 + so.log_severity_level = 3 # ERROR only + return so + + +def _get_yolo_session(): + global _yolo_session + if _yolo_session is not None: + return _yolo_session + with _yolo_lock: + if _yolo_session is not None: + return _yolo_session + _import_onnx() + import onnxruntime as ort + + if not _YOLO_PATH.is_file(): + raise SolveError(f"word_click: missing yolo model at {_YOLO_PATH}") + so = _make_session_options() + _yolo_session = ort.InferenceSession(str(_YOLO_PATH), sess_options=so, providers=resolve_providers()) + log.info("word_click yolo session providers=%s", _yolo_session.get_providers()) + return _yolo_session + + +def _get_siamese_session(): + global _siamese_session, _siamese_input_names + if _siamese_session is not None: + return _siamese_session + with _siamese_lock: + if _siamese_session is not None: + return _siamese_session + _import_onnx() + import onnxruntime as ort + + if not _SIAMESE_PATH.is_file(): + raise SolveError(f"word_click: missing siamese model at {_SIAMESE_PATH}") + so = _make_session_options() + _siamese_session = ort.InferenceSession(str(_SIAMESE_PATH), sess_options=so, providers=resolve_providers()) + inputs = _siamese_session.get_inputs() + _siamese_input_names = (inputs[0].name, inputs[1].name) + log.info( + "word_click siamese session providers=%s inputs=%s", + _siamese_session.get_providers(), + [(i.name, i.shape) for i in inputs], + ) + return _siamese_session + + +# --- image helpers ----------------------------------------------------------- + + +def _bytes_to_bgr(byte_data: bytes) -> np.ndarray: + cv2 = _import_cv2() + arr = np.frombuffer(byte_data, np.uint8) + img = cv2.imdecode(arr, cv2.IMREAD_UNCHANGED) + if img is None: + raise SolveError("word_click: failed to decode bg image") + if len(img.shape) == 2: + return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + if img.shape[-1] == 4: + alpha = img[..., 3:4].astype(np.float32) / 255.0 + rgb = img[..., :3].astype(np.float32) + white = np.full_like(rgb, 255.0) + return (rgb * alpha + white * (1 - alpha)).astype(np.uint8) + return img + + +def _render_char(char: str) -> np.ndarray: + """Render one CJK char to a 52×52 BGR image using the bundled font.""" + from PIL import Image, ImageDraw, ImageFont + + if not _FONT_PATH.is_file(): + raise SolveError(f"word_click: missing font at {_FONT_PATH}") + img = Image.new("RGB", (_CHAR_RENDER_SIZE, _CHAR_RENDER_SIZE), color="white") + draw = ImageDraw.Draw(img) + font = ImageFont.truetype(str(_FONT_PATH), _CHAR_RENDER_FONT_SIZE) + bbox = font.getbbox(char) + text_w = bbox[2] - bbox[0] + x = (_CHAR_RENDER_SIZE - text_w) // 2 + y = -3 + draw.text((x, y), char, fill=_CHAR_RENDER_COLOR, font=font) + cv2 = _import_cv2() + return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + + +# --- YOLOv8 detection -------------------------------------------------------- + + +def _letterbox( + img: np.ndarray, + new_shape: tuple[int, int], +) -> tuple[np.ndarray, tuple[float, float], tuple[float, float]]: + cv2 = _import_cv2() + h, w = img.shape[:2] + r = min(new_shape[0] / h, new_shape[1] / w) + new_unpad = (int(round(w * r)), int(round(h * r))) + dw = (new_shape[1] - new_unpad[0]) / 2 + dh = (new_shape[0] - new_unpad[1]) / 2 + resized = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top = int(round(dh - 0.1)) + bottom = int(round(dh + 0.1)) + left = int(round(dw - 0.1)) + right = int(round(dw + 0.1)) + padded = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(255, 255, 255)) + return padded, (r, r), (left, top) + + +def _yolo_detect(bg_bgr: np.ndarray) -> list[tuple[int, int, int, int]]: + """Run YOLOv8 and return [(x1, y1, x2, y2), ...] on the original image.""" + cv2 = _import_cv2() + sess = _get_yolo_session() + inp = sess.get_inputs()[0] + _, _, ih, iw = inp.shape + + letter, (rx, ry), (pad_x, pad_y) = _letterbox(bg_bgr, (ih, iw)) + rgb = cv2.cvtColor(letter, cv2.COLOR_BGR2RGB) + data = (np.array(rgb) / 255.0).transpose(2, 0, 1)[None].astype(np.float32) + + outputs = sess.run(None, {inp.name: data})[0] + preds = np.transpose(np.squeeze(outputs)) + + boxes: list[list[float]] = [] + scores: list[float] = [] + for row in preds: + class_scores = row[4:] + max_score = float(np.amax(class_scores)) + if max_score < _YOLO_CONFIDENCE: + continue + x, y, w, h = row[0], row[1], row[2], row[3] + left = int(((x - w / 2) - pad_x) / rx) + top = int(((y - h / 2) - pad_y) / ry) + width = int(w / rx) + height = int(h / ry) + boxes.append([left, top, width, height]) + scores.append(max_score) + + if not boxes: + return [] + + indices = cv2.dnn.NMSBoxes(boxes, scores, _YOLO_CONFIDENCE, _YOLO_IOU) + bh, bw = bg_bgr.shape[:2] + out: list[tuple[int, int, int, int]] = [] + for i in np.array(indices).flatten(): + x, y, w, h = boxes[int(i)] + x1 = int(max(0, x)) + y1 = int(max(1, y)) # mirror tx-word quirk: y1<=0 clamped to 1 + x2 = int(min(bw, x + w)) + y2 = int(min(bh, y + h)) + if x2 > x1 and y2 > y1: + out.append((x1, y1, x2, y2)) + return out + + +# --- Siamese matching -------------------------------------------------------- + + +def _prep_siamese(img: np.ndarray) -> np.ndarray: + """Preprocess a single BGR crop to (1, 3, 52, 52) float32 [0,1].""" + cv2 = _import_cv2() + resized = cv2.resize(img, _SIAMESE_INPUT) + arr = np.transpose(resized, (2, 0, 1)).astype(np.float32) / 255.0 + return arr[None, ...] + + +def _siamese_score_batch(crops: list[np.ndarray], ref: np.ndarray) -> list[float]: + """Score every crop against the ref in one (or as few as possible) ORT calls. + + Fast paths, in order: + + 1. True batched inference (if the exported graph has a dynamic batch + dim). Cached after first attempt. + 2. Thread-pool parallel per-pair calls. ``session.run`` releases the + GIL, so this gives real parallelism on CPU EP with + ``intra_op_num_threads>=2``. + """ + global _siamese_batch_supported + + sess = _get_siamese_session() + assert _siamese_input_names is not None + n0, n1 = _siamese_input_names + + if not crops: + return [] + + ref_prepped = _prep_siamese(ref) # (1,3,52,52) + + # 1) Try batched inference (only once — decision is cached). + if _siamese_batch_supported is not False: + try: + batch = np.concatenate([_prep_siamese(c) for c in crops], axis=0) # (N,3,52,52) + refs = np.repeat(ref_prepped, batch.shape[0], axis=0) # (N,3,52,52) + pred = sess.run(None, {n0: batch, n1: refs})[0] + arr = np.asarray(pred).reshape(-1) + if arr.size == batch.shape[0]: + _siamese_batch_supported = True + return [float(v) for v in arr] + except Exception as e: + log.info("word_click siamese batch not supported, using per-pair: %s", e) + _siamese_batch_supported = False + + # 2) Per-pair path. Preprocess everything up front (numpy ops, cheap), + # then let ORT's own intra-op thread pool do the heavy lifting. + # An outer ThreadPoolExecutor causes oversubscription vs intra_op_num_threads + # and measurably slows things down — don't add one. + prepped = [_prep_siamese(c) for c in crops] + out: list[float] = [] + for p in prepped: + pred = sess.run(None, {n0: p, n1: ref_prepped})[0] + out.append(float(np.asarray(pred).reshape(-1)[0])) + return out + + +# --- public API -------------------------------------------------------------- + + +def warmup() -> None: + """Load both sessions and run one dummy inference each. + + Call this once at process start (e.g. CLI entry point) to amortise the + first-request cost (graph optimisation, kernel JIT, CoreML compile). + """ + try: + _get_yolo_session() + _get_siamese_session() + except SolveError as e: + log.warning("word_click warmup: sessions unavailable (%s)", e) + return + + try: + # Dummy 672×480 bg to exercise yolo preprocess/infer path. + dummy_bg = np.full((480, 672, 3), 255, dtype=np.uint8) + _yolo_detect(dummy_bg) + dummy_crop = np.full((52, 52, 3), 200, dtype=np.uint8) + dummy_ref = np.full((52, 52, 3), 100, dtype=np.uint8) + _siamese_score_batch([dummy_crop], dummy_ref) + log.info("word_click warmup: done") + except Exception as e: # defensive; warmup must never break the caller + log.warning("word_click warmup: dummy inference failed: %s", e) + + +def locate_chars_by_siamese( + bg_bytes: bytes, + targets: list[str], +) -> list[tuple[int, int]]: + """Return click (cx, cy) for each target char, in order. + + Raises ``SolveError`` if YOLO finds zero bboxes. If YOLO finds fewer + bboxes than targets, each target still gets its best pick; the caller + decides whether to retry or accept. + + Strategy: compute the full ``(targets × crops)`` score matrix once, + then greedily pick the best unused crop per target in input order. + This avoids redundant ORT calls when multiple targets share the same + candidate pool. + """ + bg = _bytes_to_bgr(bg_bytes) + bboxes = _yolo_detect(bg) + if not bboxes: + raise SolveError("word_click: yolo returned 0 bboxes") + log.info("word_click yolo: %d bboxes %s", len(bboxes), bboxes) + + # pre-crop + crops: list[np.ndarray] = [] + centers: list[tuple[int, int]] = [] + for x1, y1, x2, y2 in bboxes: + crop = bg[y1:y2, x1:x2] + if crop.size == 0: + continue + crops.append(crop) + centers.append((int((x1 + x2) // 2), int((y1 + y2) // 2))) + + if not crops: + raise SolveError("word_click: all yolo bboxes produced empty crops") + + # Full score matrix: rows = targets (in order), cols = crop indices. + score_matrix: list[list[float]] = [] + for ch in targets: + ref = _render_char(ch) + score_matrix.append(_siamese_score_batch(crops, ref)) + + # Greedy assignment in target order. (Hungarian would be optimal, but + # targets rarely collide and instruction order is the actual click + # order — greedy matches tx-word's proven approach.) + result: list[tuple[int, int]] = [] + used: set[int] = set() + for ti, ch in enumerate(targets): + scores = score_matrix[ti] + best_idx = -1 + best_score = -1.0 + for i, s in enumerate(scores): + if i in used: + continue + if s > best_score: + best_score = s + best_idx = i + if best_idx < 0: + # all already used — reuse best overall + for i, s in enumerate(scores): + if s > best_score: + best_score = s + best_idx = i + if best_idx < 0: + raise SolveError(f"word_click: no candidate for target {ch!r}") + used.add(best_idx) + result.append(centers[best_idx]) + log.info("word_click: %r → %s (score=%.3f)", ch, centers[best_idx], best_score) + return result + + +__all__ = ["locate_chars_by_siamese", "warmup"] diff --git a/tests/test_pipeline_word_click.py b/tests/test_pipeline_word_click.py index fbc7c52..57d66e5 100644 --- a/tests/test_pipeline_word_click.py +++ b/tests/test_pipeline_word_click.py @@ -1,4 +1,4 @@ -"""Tests for pipelines/word_click.py.""" +"""Tests for pipelines/word_click.py (local YOLO + Siamese path).""" from __future__ import annotations @@ -19,8 +19,6 @@ VerifyResp, ) from crack_tcaptcha.pipelines.word_click import ( - _bbox_center, - _fallback_match_by_ocr, _parse_target_chars, solve_one_attempt, ) @@ -44,16 +42,8 @@ def test_no_chars_returns_empty(self): assert _parse_target_chars("请依次点击:abc 123") == [] -class TestBboxCenter: - def test_integer_center(self): - assert _bbox_center((10, 20, 30, 40)) == (20, 30) - - def test_floor_division(self): - assert _bbox_center((0, 0, 3, 3)) == (1, 1) - - # --------------------------------------------------------------------------- -# _fallback_match_by_ocr +# solve_one_attempt # --------------------------------------------------------------------------- @@ -64,63 +54,6 @@ def _fake_bg_bytes(w: int = 100, h: int = 80) -> bytes: return buf.getvalue() -class TestFallbackMatchByOcr: - def test_already_assigned_short_circuits(self, monkeypatch): - called = {"ocr": False} - - def fake_get_ocr(): - called["ocr"] = True - return MagicMock() - - monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", fake_get_ocr) - - bboxes = [(0, 0, 10, 10), (20, 0, 30, 10)] - result = _fallback_match_by_ocr( - _fake_bg_bytes(), - bboxes, - targets=["甲", "乙"], - already_assigned={"甲": 1, "乙": 2}, - ) - assert result == {"甲": 1, "乙": 2} - assert called["ocr"] is False - - def test_fills_missing_via_ocr_text(self, monkeypatch): - # First unused bbox returns "甲甲", second returns noise then "乙" - ocr = MagicMock() - ocr.classification.side_effect = ["甲", "乙foo"] - monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", lambda: ocr) - - bboxes = [(0, 0, 10, 10), (20, 0, 30, 10), (40, 0, 50, 10)] - result = _fallback_match_by_ocr( - _fake_bg_bytes(), - bboxes, - targets=["甲", "乙"], - already_assigned={}, - ) - assert result["甲"] == 1 - assert result["乙"] == 2 - - def test_last_resort_assigns_unused_bbox(self, monkeypatch): - # OCR returns nothing useful; char must still map to SOME unused bbox - ocr = MagicMock() - ocr.classification.return_value = "" - monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", lambda: ocr) - - bboxes = [(0, 0, 10, 10), (20, 0, 30, 10)] - result = _fallback_match_by_ocr( - _fake_bg_bytes(), - bboxes, - targets=["甲"], - already_assigned={}, - ) - assert 1 <= result["甲"] <= 2 - - -# --------------------------------------------------------------------------- -# solve_one_attempt -# --------------------------------------------------------------------------- - - def _make_pre(instruction: str = "请依次点击:甲 乙 ") -> PrehandleResp: return PrehandleResp( sess="sess_x", @@ -149,6 +82,20 @@ def stub_pow(monkeypatch): ) +@pytest.fixture() +def stub_finish(monkeypatch): + """Short-circuit finish_with_verify to skip TDC / trajectory plumbing.""" + + def fake_finish(client, pre, tdc_provider, *, ans_json, pow_answer, pow_calc_time, trajectory): + return client.verify( + ans=ans_json, + pow_answer=pow_answer, + pow_calc_time=pow_calc_time, + ) + + monkeypatch.setattr("crack_tcaptcha.pipelines.word_click.finish_with_verify", fake_finish) + + class TestSolveOneAttempt: def test_raises_when_no_cjk_chars(self, stub_pow): client, tdc = _mock_client_and_tdc() @@ -156,86 +103,48 @@ def test_raises_when_no_cjk_chars(self, stub_pow): with pytest.raises(SolveError, match="no CJK chars"): solve_one_attempt(client, pre, tdc) - def test_raises_when_detector_returns_empty(self, monkeypatch, stub_pow): - client, tdc = _mock_client_and_tdc() - pre = _make_pre() - - monkeypatch.setattr( - "crack_tcaptcha._legacy.icon_match.detect_icons", - lambda _bg: [], - ) - with pytest.raises(SolveError, match="returned 0 bboxes"): - solve_one_attempt(client, pre, tdc) - - def test_raises_when_ddddocr_missing(self, monkeypatch, stub_pow): - client, tdc = _mock_client_and_tdc() - pre = _make_pre() - - # Make the lazy import inside solve_one_attempt blow up with ImportError - import builtins - - real_import = builtins.__import__ - - def fake_import(name, globals=None, locals=None, fromlist=(), level=0): - if name == "crack_tcaptcha._legacy.icon_match" and fromlist and "detect_icons" in fromlist: - raise ImportError("no ddddocr") - return real_import(name, globals, locals, fromlist, level) - - monkeypatch.setattr(builtins, "__import__", fake_import) - - with pytest.raises(SolveError, match="requires ddddocr"): - solve_one_attempt(client, pre, tdc) - - def test_success_with_llm_path(self, monkeypatch, stub_pow): + def test_success_with_siamese_path(self, monkeypatch, stub_pow, stub_finish): client, tdc = _mock_client_and_tdc() pre = _make_pre(instruction="请依次点击:甲 乙 ") - bboxes = [(0, 0, 20, 20), (40, 0, 60, 20)] - monkeypatch.setattr( - "crack_tcaptcha._legacy.icon_match.detect_icons", - lambda _bg: bboxes, - ) - # Pretend LLM is configured - monkeypatch.setattr( - "crack_tcaptcha.pipelines.word_click.settings", - MagicMock(llm_api_key="k", llm_base_url="u"), - ) + # Primary path: siamese returns explicit click coords for each target. monkeypatch.setattr( - "crack_tcaptcha.solvers.llm_vision.locate_chars", - lambda _bg, targets, bboxes: {"甲": 1, "乙": 2}, + "crack_tcaptcha.solvers.word_ocr.locate_chars_by_siamese", + lambda _bg, targets: [(10, 10), (50, 10)], ) - # finish_with_verify's TDC collect goes through; just rubber-stamp - monkeypatch.setattr("crack_tcaptcha.pipelines._common.resolve_tdc_url", lambda p: p) resp = solve_one_attempt(client, pre, tdc) assert resp.ok - # verify called with ans JSON describing click center coords + kwargs = client.verify.call_args.kwargs ans = json.loads(kwargs["ans"]) assert [a["type"] for a in ans] == ["DynAnswerType_POS", "DynAnswerType_POS"] - # bbox 1 center = (10,10), bbox 2 center = (50,10) assert ans[0]["data"] == "10,10" assert ans[1]["data"] == "50,10" assert kwargs["pow_answer"] == "p_42" assert kwargs["pow_calc_time"] == 3 - def test_success_with_llm_absent_uses_ocr_only(self, monkeypatch, stub_pow): + def test_falls_back_to_ddddocr_when_siamese_fails(self, monkeypatch, stub_pow, stub_finish): client, tdc = _mock_client_and_tdc() pre = _make_pre(instruction="请依次点击:甲 ") + # Primary path raises SolveError → pipeline should fall back. + def raising_siamese(_bg, _targets): + raise SolveError("siamese unavailable") + monkeypatch.setattr( - "crack_tcaptcha._legacy.icon_match.detect_icons", - lambda _bg: [(0, 0, 10, 10), (20, 0, 30, 10)], + "crack_tcaptcha.solvers.word_ocr.locate_chars_by_siamese", + raising_siamese, ) - # LLM not configured + # Fallback path: _fallback_ddddocr imports match_words from _legacy.icon_match. monkeypatch.setattr( - "crack_tcaptcha.pipelines.word_click.settings", - MagicMock(llm_api_key="", llm_base_url=""), + "crack_tcaptcha._legacy.icon_match.match_words", + lambda _bg, _targets: [(33, 44)], ) - ocr = MagicMock() - ocr.classification.return_value = "甲" - monkeypatch.setattr("crack_tcaptcha._legacy.icon_match._get_ocr", lambda: ocr) - monkeypatch.setattr("crack_tcaptcha.pipelines._common.resolve_tdc_url", lambda p: p) resp = solve_one_attempt(client, pre, tdc) assert resp.ok + + kwargs = client.verify.call_args.kwargs + ans = json.loads(kwargs["ans"]) + assert ans == [{"elem_id": 1, "type": "DynAnswerType_POS", "data": "33,44"}] diff --git a/uv.lock b/uv.lock index d860cdc..4410119 100644 --- a/uv.lock +++ b/uv.lock @@ -359,6 +359,9 @@ all = [ { name = "cn-clip" }, { name = "cn2an" }, { name = "ddddocr" }, + { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "onnxruntime", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "opencv-python-headless" }, { name = "torch" }, ] clip = [ @@ -366,6 +369,17 @@ clip = [ { name = "cn2an" }, { name = "torch" }, ] +icon-click = [ + { name = "ddddocr" }, +] +word-click = [ + { name = "ddddocr" }, + { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "onnxruntime", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "opencv-python-headless" }, +] + +[package.dev-dependencies] dev = [ { name = "httpx" }, { name = "hypothesis" }, @@ -377,9 +391,6 @@ dev = [ docs = [ { name = "mkdocs-material" }, ] -icon-click = [ - { name = "ddddocr" }, -] [package.metadata] requires-dist = [ @@ -389,23 +400,32 @@ requires-dist = [ { name = "cn2an", marker = "extra == 'clip'", specifier = ">=0.5" }, { name = "ddddocr", marker = "extra == 'all'", specifier = ">=1.4" }, { name = "ddddocr", marker = "extra == 'icon-click'", specifier = ">=1.4" }, + { name = "ddddocr", marker = "extra == 'word-click'", specifier = ">=1.4" }, { name = "httpx", specifier = ">=0.27" }, - { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.27" }, - { name = "hypothesis", marker = "extra == 'dev'", specifier = ">=6.0" }, - { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.0" }, { name = "numpy", specifier = ">=1.24" }, + { name = "onnxruntime", marker = "extra == 'all'", specifier = ">=1.17" }, + { name = "onnxruntime", marker = "extra == 'word-click'", specifier = ">=1.17" }, + { name = "opencv-python-headless", marker = "extra == 'all'", specifier = ">=4.8" }, + { name = "opencv-python-headless", marker = "extra == 'word-click'", specifier = ">=4.8" }, { name = "pillow", specifier = ">=10.0" }, { name = "pydantic", specifier = ">=2.0" }, { name = "pydantic-settings", specifier = ">=2.0" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, - { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" }, - { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" }, { name = "scrapling", extras = ["fetchers"], specifier = ">=0.4.3" }, { name = "torch", marker = "extra == 'all'", specifier = ">=2.0" }, { name = "torch", marker = "extra == 'clip'", specifier = ">=2.0" }, ] -provides-extras = ["icon-click", "clip", "all", "dev", "docs"] +provides-extras = ["icon-click", "word-click", "clip", "all"] + +[package.metadata.requires-dev] +dev = [ + { name = "httpx", specifier = ">=0.27" }, + { name = "hypothesis", specifier = ">=6.0" }, + { name = "pytest", specifier = ">=8.0" }, + { name = "pytest-asyncio", specifier = ">=0.23" }, + { name = "respx", specifier = ">=0.21" }, + { name = "ruff", specifier = ">=0.4" }, +] +docs = [{ name = "mkdocs-material", specifier = ">=9.0" }] [[package]] name = "cssselect" @@ -1602,10 +1622,14 @@ dependencies = [ { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/79/42/2310883be3b8826ac58c3f2787b9358a2d46923d61f88fedf930bc59c60c/opencv_python_headless-4.13.0.92-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:1a7d040ac656c11b8c38677cc8cccdc149f98535089dbe5b081e80a4e5903209", size = 46247192, upload-time = "2026-02-05T07:01:35.187Z" }, + { url = "https://files.pythonhosted.org/packages/2d/1e/6f9e38005a6f7f22af785df42a43139d0e20f169eb5787ce8be37ee7fcc9/opencv_python_headless-4.13.0.92-cp37-abi3-macosx_14_0_x86_64.whl", hash = "sha256:3e0a6f0a37994ec6ce5f59e936be21d5d6384a4556f2d2da9c2f9c5dc948394c", size = 32568914, upload-time = "2026-02-05T07:01:51.989Z" }, { url = "https://files.pythonhosted.org/packages/21/76/9417a6aef9def70e467a5bf560579f816148a4c658b7d525581b356eda9e/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c8cfc8e87ed452b5cecb9419473ee5560a989859fe1d10d1ce11ae87b09a2cb", size = 33703709, upload-time = "2026-02-05T10:24:46.469Z" }, { url = "https://files.pythonhosted.org/packages/92/ce/bd17ff5772938267fd49716e94ca24f616ff4cb1ff4c6be13085108037be/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0525a3d2c0b46c611e2130b5fdebc94cf404845d8fa64d2f3a3b679572a5bd22", size = 56016764, upload-time = "2026-02-05T10:26:48.904Z" }, { url = "https://files.pythonhosted.org/packages/8f/b4/b7bcbf7c874665825a8c8e1097e93ea25d1f1d210a3e20d4451d01da30aa/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eb60e36b237b1ebd40a912da5384b348df8ed534f6f644d8e0b4f103e272ba7d", size = 35010236, upload-time = "2026-02-05T10:28:11.031Z" }, { url = "https://files.pythonhosted.org/packages/4b/33/b5db29a6c00eb8f50708110d8d453747ca125c8b805bc437b289dbdcc057/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0bd48544f77c68b2941392fcdf9bcd2b9cdf00e98cb8c29b2455d194763cf99e", size = 60391106, upload-time = "2026-02-05T10:30:14.236Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c3/52cfea47cd33e53e8c0fbd6e7c800b457245c1fda7d61660b4ffe9596a7f/opencv_python_headless-4.13.0.92-cp37-abi3-win32.whl", hash = "sha256:a7cf08e5b191f4ebb530791acc0825a7986e0d0dee2a3c491184bd8599848a4b", size = 30812232, upload-time = "2026-02-05T07:02:29.594Z" }, + { url = "https://files.pythonhosted.org/packages/4a/90/b338326131ccb2aaa3c2c85d00f41822c0050139a4bfe723cfd95455bd2d/opencv_python_headless-4.13.0.92-cp37-abi3-win_amd64.whl", hash = "sha256:77a82fe35ddcec0f62c15f2ba8a12ecc2ed4207c17b0902c7a3151ae29f37fb6", size = 40070414, upload-time = "2026-02-05T07:02:26.448Z" }, ] [[package]]