diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
new file mode 100644
index 00000000..02b5753f
--- /dev/null
+++ b/.github/release-drafter.yml
@@ -0,0 +1,79 @@
+# release-drafter config — accumulates merged-PR titles into a draft GitHub
+# Release as PRs land on main, so the English half of docs/changelog/v<version>.md
+# is prefilled by the time we cut the next release.
+#
+# How it fits with the existing release flow:
+#   - PRs merge → release-drafter updates the draft release tagged `next`
+#   - When ready to ship, run `prepare-release.yml` which reads the draft
+#     body and writes it into `docs/changelog/v<version>.md` as a stub
+#   - You translate the bullets into Persian above the `---` separator,
+#     merge the prep PR, push the `v<version>` tag, and release.yml takes over
+#
+# The draft is tagged `next` (not `vX.Y.Z`) so it never collides with the
+# real release-tag namespace. softprops/action-gh-release in release.yml
+# will create a fresh release for the actual `vX.Y.Z` tag — the `next`
+# draft just gets reset by release-drafter on the following PR merge.
+
+name-template: 'Next release (draft)'
+tag-template: 'next'
+
+# Flat bullet template — one line per merged PR, matching the existing
+# docs/changelog/v<version>.md style:
+#
+#   • <headline> ([#NN](url)): <DESCRIPTION>. Thanks @user
+#
+# We bake the `: <DESCRIPTION>. Thanks @AUTHOR` suffix directly into the
+# template so the maintainer's job is just (a) strip the leading
+# `feat:`/`fix:` Conventional-Commit prefix that PR titles in this repo
+# carry (prepare-release.yml does this automatically with a sed pass),
+# (b) fix the verb tense if needed (`added` → `Add`), and (c) replace
+# `<DESCRIPTION>` with the explanatory clause.
+#
+# Why the placeholder is part of the template and not added later:
+# putting it here means the no-changes-template fallback (below) does
+# *not* get a `<DESCRIPTION>` suffix — only real PR-derived bullets do.
+change-template: '• $TITLE ([#$NUMBER]($URL)): <DESCRIPTION>. Thanks @$AUTHOR'
+change-title-escapes: '\<*_&'
+
+# Fallback if no PRs have merged since the last draft reset. Rare in
+# practice; here as a safety net so the draft body is never empty.
+# Deliberately doesn't follow the `• <headline>` bullet shape so it's
+# obviously a placeholder line, not a real release entry.
+no-changes-template: '_(no PR-tracked changes since the last release)_'
+
+# Skip PRs labelled `release-prep` from the changelog — those are the
+# automated version-bump PRs opened by prepare-release.yml; including
+# them would echo "release: prepare v1.6.6" into the next release notes.
+exclude-labels:
+  - 'release-prep'
+  - 'skip-changelog'
+
+# Auto-apply labels based on Conventional Commit title prefixes. The repo
+# already enforces feat:/fix:/etc. on PR titles, so this is "free" — no
+# contributor action needed. Labels feed the exclude-labels above and
+# also unlock PR filtering on the GitHub issues page if we want it later.
+autolabeler:
+  - label: 'release-prep'
+    title:
+      - '/^release:/i'
+  - label: 'type: feature'
+    title:
+      - '/^feat(\(.+\))?:/i'
+  - label: 'type: fix'
+    title:
+      - '/^fix(\(.+\))?:/i'
+  - label: 'type: chore'
+    title:
+      - '/^chore(\(.+\))?:/i'
+  - label: 'type: docs'
+    title:
+      - '/^docs?(\(.+\))?:/i'
+  - label: 'type: refactor'
+    title:
+      - '/^refactor(\(.+\))?:/i'
+
+# Body of the draft release: just the flat bullet list. No "What's
+# Changed" header, no contributors block — keep it copy-paste-ready
+# into docs/changelog/v<version>.md.
+template: |
+  $CHANGES
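Under this template, a merged PR titled `feat(android): added per-ABI split APKs` (hypothetical) lands in the draft as:

    • feat(android): added per-ABI split APKs ([#NN](url)): <DESCRIPTION>. Thanks @user

prepare-release.yml's sed pass later strips the `feat(android):` prefix; the maintainer fixes the tense (`added` → `Add`) and fills in `<DESCRIPTION>`.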
diff --git a/.github/scripts/telegram_publish_files.py b/.github/scripts/telegram_publish_files.py
new file mode 100644
index 00000000..e4300015
--- /dev/null
+++ b/.github/scripts/telegram_publish_files.py
@@ -0,0 +1,550 @@
+#!/usr/bin/env python3
+"""Post each release artifact individually to a Telegram channel.
+
+Used by .github/workflows/telegram-publish-files.yml. Reads files from
+--assets-dir, picks a Persian caption per filename, posts via the
+Telegram Bot API `sendDocument` endpoint with --hashtag appended.
+
+Files larger than the Telegram Bot API's 50 MB ceiling are split into
+~45 MB byte chunks via Python (no `split` shell dep) and posted as
+`<name>.part_aa`, `<name>.part_ab`, ... — recipients reassemble with
+`cat <name>.part_* > <name>`.
+
+Re-runnable: posts every file every time. Use carefully when re-running
+for the same version (the channel will get duplicate posts).
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import sys
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from pathlib import Path
+
+# Telegram Bot API uploads cap at 50 MB. Pick 45 MB for chunks so the
+# multipart envelope + caption + Telegram's own overhead don't push us
+# over. Bigger chunks (e.g. 49 MB) sometimes hit "Request Entity Too
+# Large" depending on caption length.
+CHUNK_LIMIT_BYTES = 45 * 1024 * 1024
+
+# Sleep between uploads. Telegram's documented rate limit is 1 msg/sec
+# to the same chat, plus a soft "burst" allowance. 1.5s is conservative
+# and means a 20-file release publishes in ~30 s.
+INTER_UPLOAD_SLEEP_SECS = 1.5
+
+# Filename-substring → Persian caption. Order matters: longest /
+# most-specific patterns first, since a shorter pattern (e.g.
+# "android-x86") can match a more-specific filename ("android-x86_64").
+# Match is `pattern in filename`.
+CAPTIONS: list[tuple[str, str]] = [
+    # Android — universal first (the recommended default for non-technical users).
+    ("android-universal", "نسخه اندروید (universal) — برای همه دستگاه‌ها"),
+    ("android-arm64-v8a", "نسخه اندروید (arm64-v8a) — گوشی‌های مدرن ۶۴ بیتی"),
+    ("android-armeabi-v7a", "نسخه اندروید (armv7) — گوشی‌های قدیمی‌تر ۳۲ بیتی"),
+    ("android-x86_64", "نسخه اندروید (x86_64) — شبیه‌ساز ۶۴ بیتی"),
+    ("android-x86", "نسخه اندروید (x86) — شبیه‌ساز"),
+    # Windows.
+    ("windows-amd64", "نسخه ویندوز x64 (۶۴ بیتی)"),
+    ("windows-i686", "نسخه ویندوز x86 (۳۲ بیتی، Win7+)"),
+    # macOS — .app bundles before plain CLI tarballs.
+    ("macos-arm64-app", "نسخه macOS (Apple Silicon) — برنامه گرافیکی .app"),
+    ("macos-amd64-app", "نسخه macOS (Intel) — برنامه گرافیکی .app"),
+    ("macos-arm64", "نسخه macOS (Apple Silicon) — CLI"),
+    ("macos-amd64", "نسخه macOS (Intel) — CLI"),
+    # Linux — musl static first, glibc second.
+    ("linux-musl-amd64", "نسخه لینوکس amd64 (musl static) — Alpine / OpenWRT-x86"),
+    ("linux-musl-arm64", "نسخه لینوکس arm64 (musl static)"),
+    ("linux-amd64", "نسخه لینوکس amd64 (glibc)"),
+    ("linux-arm64", "نسخه لینوکس arm64 (glibc)"),
+    # Embedded targets.
+    ("openwrt-mipsel-softfloat", "نسخه OpenWRT (mipsel softfloat) — روتر MT7621"),
+    ("raspbian-armhf", "نسخه Raspbian (armhf) — رزبری پای ۳۲ بیتی"),
+]
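+
+# Ordering worked example: a filename like
+# "mhrv-rs-android-x86_64-v1.8.0.apk" (illustrative) contains both
+# "android-x86_64" and — as a substring — "android-x86". Because the
+# x86_64 entry is listed first, caption_for() below returns the x86_64
+# caption; swapping the two entries would caption every x86_64 build as
+# the plain-x86 emulator build.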
+
+
+def caption_for(filename: str) -> str:
+    """Return the Persian caption for a filename, falling back to the
+    bare filename if nothing matches."""
+    for pattern, persian in CAPTIONS:
+        if pattern in filename:
+            return persian
+    return f"نسخه `{filename}`"
+
+
+def order_files(files: list[Path]) -> list[Path]:
+    """Sort release files in CAPTIONS order (Android first, then
+    Windows, macOS, Linux, embedded). Files not matching any pattern
+    fall to the end in alphabetical order."""
+    order_map: dict[str, int] = {pattern: idx for idx, (pattern, _) in enumerate(CAPTIONS)}
+
+    def key(p: Path) -> tuple[int, str]:
+        for pattern, idx in order_map.items():
+            if pattern in p.name:
+                return (idx, p.name)
+        # Unknown patterns: push to end, alphabetize among themselves.
+        return (len(CAPTIONS), p.name)
+
+    return sorted(files, key=key)
+
+
+def split_file(path: Path, chunk_bytes: int) -> list[Path]:
+    """Split `path` into chunks of at most `chunk_bytes` bytes. Returns
+    the list of chunk paths, named `<name>.part_aa`, `<name>.part_ab`, ...
+    Mimics `split -b <bytes>`. Reassembled via
+    `cat <name>.part_* > <name>`.
+
+    Parts left over from an earlier run are simply overwritten — the
+    split is regenerated from scratch on every call."""
+    parts: list[Path] = []
+
+    def part_name(idx: int) -> str:
+        # 26-letter base: aa..az, ba..bz, ... mirroring split's default.
+        first = chr(ord("a") + idx // 26)
+        second = chr(ord("a") + idx % 26)
+        return f"{path.name}.part_{first}{second}"
+
+    idx = 0
+    with path.open("rb") as src:
+        while True:
+            buf = src.read(chunk_bytes)
+            if not buf:
+                break
+            part_path = path.parent / part_name(idx)
+            with part_path.open("wb") as dst:
+                dst.write(buf)
+            parts.append(part_path)
+            idx += 1
+    return parts
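+
+# Worked example (hypothetical 100 MB "app.apk", 45 MB chunks):
+#   app.apk.part_aa  (45 MB)
+#   app.apk.part_ab  (45 MB)
+#   app.apk.part_ac  (10 MB)
+# A recipient reassembles and verifies with
+#   cat app.apk.part_* > app.apk && sha256sum app.apk
+# — shell glob expansion sorts part_aa < part_ab < part_ac, so a plain
+# `cat` concatenates in the right order.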
+
+
+def send_document(
+    bot_token: str,
+    chat_id: str,
+    file_path: Path,
+    caption: str,
+) -> dict:
+    """POST a single file via the Telegram Bot API sendDocument endpoint.
+    Returns the parsed JSON response. Raises on HTTP error.
+
+    Uses urllib + a hand-rolled multipart/form-data encoder so we don't
+    pull `requests` (the workflow runs on stock GitHub-hosted runners
+    where stdlib-only is preferable for cold-start speed)."""
+    url = f"https://api.telegram.org/bot{bot_token}/sendDocument"
+    boundary = "----mhrvUploadBoundary" + str(int(time.time() * 1000))
+    body = build_multipart(
+        boundary,
+        fields={
+            "chat_id": chat_id,
+            "caption": caption,
+            "parse_mode": "HTML",
+            # Keep notifications ON — a release post is exactly the
+            # message channel subscribers want to be pinged for.
+            "disable_notification": "false",
+        },
+        files={"document": (file_path.name, file_path.read_bytes(), "application/octet-stream")},
+    )
+    req = urllib.request.Request(
+        url,
+        data=body,
+        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
+        method="POST",
+    )
+    # 5 minute timeout for the actual upload — Telegram occasionally
+    # takes a while to process 40+ MB documents.
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def build_multipart(
+    boundary: str,
+    fields: dict[str, str],
+    files: dict[str, tuple[str, bytes, str]],
+) -> bytes:
+    """Build a multipart/form-data body. `files` is name → (filename,
+    bytes, mime). Plain stdlib so we don't need `requests`."""
+    parts: list[bytes] = []
+    crlf = b"\r\n"
+    bnd = f"--{boundary}".encode()
+
+    for name, value in fields.items():
+        parts.append(bnd)
+        parts.append(f'Content-Disposition: form-data; name="{name}"'.encode())
+        parts.append(b"")
+        parts.append(value.encode("utf-8"))
+
+    for name, (filename, data, mime) in files.items():
+        parts.append(bnd)
+        parts.append(
+            f'Content-Disposition: form-data; name="{name}"; filename="{filename}"'.encode()
+        )
+        parts.append(f"Content-Type: {mime}".encode())
+        parts.append(b"")
+        parts.append(data)
+
+    parts.append(f"--{boundary}--".encode())
+    parts.append(b"")
+    return crlf.join(parts)
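+
+# Wire-format sketch of what build_multipart() above emits for one text
+# field plus one file (boundary shortened, chat id illustrative):
+#   --BOUNDARY
+#   Content-Disposition: form-data; name="chat_id"
+#
+#   -1001234567890
+#   --BOUNDARY
+#   Content-Disposition: form-data; name="document"; filename="x.apk"
+#   Content-Type: application/octet-stream
+#
+#   <raw file bytes>
+#   --BOUNDARY--
+# Every line break on the wire is CRLF, per multipart/form-data.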
+
+
+def html_escape(s: str) -> str:
+    # Minimal escape for Telegram's parse_mode="HTML": these three
+    # characters are the only ones special in message/caption text.
+    return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+
+
+def sha256_hex(path: Path) -> str:
+    """Stream-hash the file in 1 MiB chunks. Avoids loading 40+ MB APKs
+    into RAM twice (once for hashing, once for upload)."""
+    h = hashlib.sha256()
+    with path.open("rb") as f:
+        for chunk in iter(lambda: f.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def post_file(
+    bot_token: str,
+    chat_id: str,
+    file_path: Path,
+    base_caption: str,
+    hashtag: str,
+) -> bool:
+    """Post one file. If too big, split + post each part. Returns True
+    on success of all parts, False on any failure.
+
+    Each caption ends with the file's SHA-256 in hex under a Persian
+    "تایید اصالت" (authenticity verification) label, so recipients can
+    `sha256sum <file>` after download and confirm it matches what the
+    channel posted — defends against modified copies if the channel is
+    ever compromised or relayed through a third party."""
+    size = file_path.stat().st_size
+
+    # Compute the original-file hash regardless of whether we'll chunk
+    # it. For chunked uploads, every part's caption shows this hash so
+    # the user can verify the full file once reassembled with `cat`.
+    print(f"  hashing {file_path.name}...", flush=True)
+    full_sha = sha256_hex(file_path)
+
+    if size <= CHUNK_LIMIT_BYTES:
+        caption = (
+            f"{html_escape(base_caption)}\n"
+            f"{html_escape(file_path.name)}\n"
+            f"\nتایید اصالت (SHA-256):\n"
+            f"{full_sha}\n"
+            f"\n{hashtag}"
+        )
+        print(f"  uploading {file_path.name} ({size / 1_048_576:.1f} MB)...", flush=True)
+        try:
+            resp = send_document(bot_token, chat_id, file_path, caption)
+            if not resp.get("ok"):
+                print(f"  !! Telegram returned not-ok: {resp}", flush=True)
+                return False
+            print(f"  ok (message_id={resp['result']['message_id']})", flush=True)
+            return True
+        except urllib.error.HTTPError as e:
+            err_body = e.read().decode("utf-8", errors="replace")[:500]
+            print(f"  !! HTTP {e.code}: {err_body}", flush=True)
+            return False
+        except Exception as e:
+            print(f"  !! exception: {e}", flush=True)
+            return False
+        finally:
+            time.sleep(INTER_UPLOAD_SLEEP_SECS)
+
+    # Too big — split and post each part.
+    print(
+        f"  splitting {file_path.name} ({size / 1_048_576:.1f} MB > "
+        f"{CHUNK_LIMIT_BYTES / 1_048_576:.0f} MB ceiling)...",
+        flush=True,
+    )
+    parts = split_file(file_path, CHUNK_LIMIT_BYTES)
+    if not parts:
+        print("  !! split produced 0 parts (empty file?)", flush=True)
+        return False
+
+    n = len(parts)
+    all_ok = True
+    for idx, part_path in enumerate(parts, start=1):
+        # Hash the individual part too — lets the user verify each
+        # downloaded chunk before bothering to reassemble.
+        part_sha = sha256_hex(part_path)
+        part_caption = (
+            f"{html_escape(base_caption)} — قسمت {idx}/{n}\n"
+            f"{html_escape(part_path.name)}\n"
+            f"\nبرای بازسازی فایل اصلی:\n"
+            f"cat {html_escape(file_path.name)}.part_* > "
+            f"{html_escape(file_path.name)}\n"
+            f"\nتایید اصالت این قسمت (SHA-256):\n"
+            f"{part_sha}\n"
+            f"\nتایید اصالت فایل کامل پس از بازسازی (SHA-256):\n"
+            f"{full_sha}\n"
+            f"\n{hashtag}"
+        )
+        psize = part_path.stat().st_size
+        print(
+            f"  uploading part {idx}/{n}: {part_path.name} ({psize / 1_048_576:.1f} MB)...",
+            flush=True,
+        )
+        try:
+            resp = send_document(bot_token, chat_id, part_path, part_caption)
+            if not resp.get("ok"):
+                print(f"  !! Telegram returned not-ok: {resp}", flush=True)
+                all_ok = False
+            else:
+                print(
+                    f"  ok (message_id={resp['result']['message_id']})", flush=True
+                )
+        except urllib.error.HTTPError as e:
+            err_body = e.read().decode("utf-8", errors="replace")[:500]
+            print(f"  !! HTTP {e.code}: {err_body}", flush=True)
+            all_ok = False
+        except Exception as e:
+            print(f"  !! exception: {e}", flush=True)
+            all_ok = False
+        finally:
+            time.sleep(INTER_UPLOAD_SLEEP_SECS)
+            # Tidy up the part file once posted.
+            try:
+                part_path.unlink()
+            except OSError:
+                pass
+
+    return all_ok
+
+
+def files_channel_post_link(chat_id: str, message_id: int) -> str:
+    """Build a `t.me` link to a specific message in the files channel.
+
+    For private supergroups/channels (negative ID with `-100` prefix),
+    Telegram exposes posts at `https://t.me/c/<internal>/<message_id>`
+    where `<internal>` is the chat ID with the `-100` stripped. This
+    link works for users who are members of the channel.
+
+    If `FILES_CHANNEL_USERNAME` is set in env (e.g. `mhrv_files`), uses
+    the public-channel form `https://t.me/<username>/<message_id>`
+    instead, which is clickable for everyone."""
+    username = os.environ.get("FILES_CHANNEL_USERNAME", "").strip().lstrip("@")
+    if username:
+        return f"https://t.me/{username}/{message_id}"
+    cid = chat_id
+    if cid.startswith("-100"):
+        cid = cid[4:]
+    elif cid.startswith("-"):
+        cid = cid[1:]
+    return f"https://t.me/c/{cid}/{message_id}"
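+
+# Worked example: with CHAT_ID "-1003966234444" (the files channel) and
+# message_id 42 (illustrative), the private form is
+#   https://t.me/c/3966234444/42
+# while FILES_CHANNEL_USERNAME=mhrv_rs would instead yield
+#   https://t.me/mhrv_rs/42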
+
+
+def post_main_channel_pointer(
+    bot_token: str,
+    main_chat_id: str,
+    files_channel_post_link: str,
+    version: str,
+    hashtag: str,
+    channel_username_link: str = "",
+    channel_invite_link: str = "",
+) -> bool:
+    """Post a short cross-link to the main announcement channel pointing
+    at the anchor post in the files channel. Replaces the previous
+    behaviour of posting the universal APK + full changelog directly
+    to the main channel — the main channel becomes a discovery surface
+    while the files channel hosts the actual artifacts.
+
+    Includes channel-join links (public username + invite hash) at the
+    bottom so recipients who aren't yet members can subscribe before
+    clicking through to the specific release post.
+    """
+    parts = [
+        f"📦 mhrv-rs v{html_escape(version)} منتشر شد",
+        "",
+        f"برای دانلود فایل‌ها (Android، Windows، macOS، Linux و ...) "
+        f"به کانال فایل‌ها مراجعه کنید:",
+        "",
+        f'👉 <a href="{files_channel_post_link}">'
+        f"v{html_escape(version)} — همه فایل‌ها + SHA-256</a>",
+    ]
+    # Channel-join links. Two forms handle different states of the
+    # files channel: the `t.me/<username>` form works for public
+    # channels and is the prettier link; the `t.me/+<hash>` invite
+    # link works regardless of whether the channel is public, and is
+    # the only path in for private/restricted channels. Showing both
+    # is forgiving — recipients click whichever works for them.
+    if channel_username_link or channel_invite_link:
+        parts.append("")
+        parts.append("لینک کانال:")
+        if channel_username_link:
+            # Render as plain URL (not an HTML <a> wrapper) so the text
+            # shows the link itself — useful when users share the message
+            # via screenshot or copy-paste outside Telegram, which would
+            # strip the wrapper.
+            parts.append(html_escape(channel_username_link))
+        if channel_invite_link:
+            parts.append(f"و یا: {html_escape(channel_invite_link)}")
+    parts.append("")
+    parts.append(hashtag)
+    text = "\n".join(parts)
+    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+    data = urllib.parse.urlencode({
+        "chat_id": main_chat_id,
+        "text": text,
+        "parse_mode": "HTML",
+        "disable_web_page_preview": "false",
+    }).encode()
+    print(f"  posting cross-link to main channel {main_chat_id}...", flush=True)
+    try:
+        with urllib.request.urlopen(
+            urllib.request.Request(url, data=data, method="POST"), timeout=30
+        ) as resp:
+            r = json.loads(resp.read().decode("utf-8"))
+        if not r.get("ok"):
+            print(f"  !! main-channel post failed: {r}", flush=True)
+            return False
+        print(
+            f"  ok (message_id={r['result']['message_id']})", flush=True
+        )
+        return True
+    except urllib.error.HTTPError as e:
+        err_body = e.read().decode("utf-8", errors="replace")[:500]
+        print(f"  !! HTTP {e.code}: {err_body}", flush=True)
+        return False
+    except Exception as e:
+        print(f"  !! exception: {e}", flush=True)
+        return False
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--assets-dir", required=True, type=Path)
+    parser.add_argument("--version", required=True, help="e.g. 1.8.0")
+    parser.add_argument("--hashtag", required=True, help="e.g. #v180")
+    args = parser.parse_args()
+
+    bot_token = os.environ.get("BOT_TOKEN")
+    chat_id = os.environ.get("CHAT_ID")
+    if not bot_token or not chat_id:
+        print("BOT_TOKEN and CHAT_ID env vars required", file=sys.stderr)
+        return 2
+
+    if not args.assets_dir.is_dir():
+        print(f"--assets-dir {args.assets_dir} not a directory", file=sys.stderr)
+        return 2
+
+    # Collect all regular files in the directory (no recursion). Skip
+    # split-part leftovers from a previous run of this script if any
+    # exist — we'll regenerate cleanly.
+    raw_files = [
+        p for p in args.assets_dir.iterdir()
+        if p.is_file() and ".part_" not in p.name
+    ]
+    if not raw_files:
+        print(f"no files found in {args.assets_dir}", file=sys.stderr)
+        return 2
+
+    files = order_files(raw_files)
+    print(f"publishing {len(files)} file(s) to Telegram chat {chat_id} for v{args.version}:")
+    for f in files:
+        print(f"  - {f.name}")
+    print()
+
+    # Leading announcement in the files channel. Captured `message_id`
+    # is the anchor that the main-channel cross-link points at — the
+    # main channel doesn't carry files anymore, just a single message
+    # saying "new release, click here." Recipients land on this anchor
+    # and scroll down to see all the platform-specific files.
+ announce = ( + f"📦 mhrv-rs {html_escape('v' + args.version)} منتشر شد\n" + f"\nفایل‌ها در ادامه به ترتیب پلتفرم ارسال می‌شن.\n" + f"هر فایل با SHA-256 (تایید اصالت) همراه هست.\n" + f"\n{args.hashtag}" + ) + announce_msg_id: int | None = None + try: + url = f"https://api.telegram.org/bot{bot_token}/sendMessage" + data = urllib.parse.urlencode({ + "chat_id": chat_id, + "text": announce, + "parse_mode": "HTML", + "disable_web_page_preview": "true", + }).encode() + with urllib.request.urlopen( + urllib.request.Request(url, data=data, method="POST"), timeout=30 + ) as resp: + r = json.loads(resp.read().decode("utf-8")) + if not r.get("ok"): + print(f" !! announcement failed: {r}", flush=True) + else: + announce_msg_id = r["result"]["message_id"] + print( + f" announcement posted (message_id={announce_msg_id})", + flush=True, + ) + except Exception as e: + # Non-fatal for the file uploads, but cross-link to the main + # channel below will be skipped — without the anchor message_id + # there's nothing to point at. + print(f" !! announcement exception: {e}", flush=True) + time.sleep(INTER_UPLOAD_SLEEP_SECS) + + failures = 0 + for f in files: + base = caption_for(f.name) + ok = post_file(bot_token, chat_id, f, base, args.hashtag) + if not ok: + failures += 1 + + # Cross-link to the main announcement channel. Skipped if MAIN_CHAT_ID + # is unset (development / private testing) or if the files-channel + # announcement didn't post (no anchor to link to). + main_chat_id = os.environ.get("MAIN_CHAT_ID", "").strip() + if main_chat_id and announce_msg_id is not None: + link = files_channel_post_link(chat_id, announce_msg_id) + # Optional channel-join links rendered alongside the cross-link. + # `FILES_CHANNEL_USERNAME` is the public-username form (clean + # `t.me/` URL — clickable for everyone). `FILES_CHANNEL_INVITE` + # is the `t.me/+` invite link, the only join path for + # private channels. Either or both can be set; both render in + # the body as separate lines. + username = os.environ.get("FILES_CHANNEL_USERNAME", "").strip().lstrip("@") + username_link = f"https://t.me/{username}" if username else "" + invite_link = os.environ.get("FILES_CHANNEL_INVITE", "").strip() + print() + print(f"posting cross-link to main channel:") + print(f" post link: {link}") + if username_link: + print(f" channel username link: {username_link}") + if invite_link: + print(f" channel invite link: {invite_link}") + ok = post_main_channel_pointer( + bot_token, + main_chat_id, + link, + args.version, + args.hashtag, + channel_username_link=username_link, + channel_invite_link=invite_link, + ) + if not ok: + failures += 1 + elif main_chat_id and announce_msg_id is None: + print() + print( + " !! 
MAIN_CHAT_ID is set but announcement message_id is None — "
+            "skipping cross-link (no anchor to point at).",
+            flush=True,
+        )
+        failures += 1
+    else:
+        print()
+        print("  MAIN_CHAT_ID not set, skipping cross-link", flush=True)
+
+    print()
+    if failures:
+        print(f"DONE with {failures} failure(s) out of {len(files)}", flush=True)
+        return 1
+    print(f"DONE — {len(files)} files posted successfully", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.github/scripts/telegram_release_notify.py b/.github/scripts/telegram_release_notify.py
index da04d8b8..df1641b6 100755
--- a/.github/scripts/telegram_release_notify.py
+++ b/.github/scripts/telegram_release_notify.py
@@ -46,16 +46,42 @@ from pathlib import Path
 
 
+def _strip_leading_comments(body: str) -> str:
+    """Strip leading HTML comment blocks (single- or multi-line) from `body`.
+
+    The changelog template uses `<!-- ... -->` comments to document the
+    format for editors; we don't want those echoed to Telegram or the
+    GitHub Release page. The `(?:...)+` quantifier eats N consecutive
+    comments separated only by whitespace, so a stub with both a
+    format-docs comment and a TODO comment is cleaned in one pass.
+    `re.S` makes `.` cross newlines for multi-line `<!-- ... -->` blocks.
+
+    The matching regex is also used inline by .github/workflows/release.yml
+    to compose the GitHub Release body — keep them in sync if you change
+    one. Run `python -m doctest telegram_release_notify.py -v` to check.
+
+    >>> _strip_leading_comments("<!-- one -->\\nbody")
+    'body'
+    >>> _strip_leading_comments("<!-- one -->\\n<!-- two -->\\nbody")
+    'body'
+    >>> _strip_leading_comments("<!-- multi\\nline -->\\nbody")
+    'body'
+    >>> _strip_leading_comments("<!-- one -->\\n\\n<!-- two -->\\n\\nbody")
+    'body'
+    >>> _strip_leading_comments("body without comments")
+    'body without comments'
+    >>> _strip_leading_comments("body\\n<!-- not leading -->\\nmore")
+    'body\\n<!-- not leading -->\\nmore'
+    """
+    return re.sub(r"^\s*(?:<!--.*?-->\s*)+", "", body, count=1, flags=re.S)
+
+
 def parse_changelog(path: str) -> tuple[str, str]:
     """Return (persian_body, english_body). Blank strings if file missing."""
     p = Path(path)
     if not p.is_file():
         return "", ""
-    body = p.read_text(encoding="utf-8")
-    # Strip a leading HTML comment block if present — the changelog
-    # template uses <!-- ... --> to document the format for editors;
-    # we don't want that echoed to Telegram.
-    body = re.sub(r"^\s*<!--.*?-->\s*", "", body, count=1, flags=re.S)
+    body = _strip_leading_comments(p.read_text(encoding="utf-8"))
     fa, sep, en = body.partition("\n---\n")
     if not sep:
         # No separator — treat everything as Persian (content-language
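Combined with `parse_changelog`'s `partition("\n---\n")` split, the new helper behaves like this on a representative stub (a sketch — the bullet contents are illustrative):

    sample = (
        "<!-- format docs for editors -->\n"
        "• رفع مشکل اتصال ([#NN](url)): <DESCRIPTION>\n"
        "---\n"
        "• Fix connection drop ([#NN](url)): <DESCRIPTION>. Thanks @user\n"
    )
    body = _strip_leading_comments(sample)   # leading comment gone
    fa, sep, en = body.partition("\n---\n")  # fa = Persian, en = English
    assert fa.startswith("• رفع")
    assert en.startswith("• Fix connection drop")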
diff --git a/.github/workflows/prepare-release.yml b/.github/workflows/prepare-release.yml
new file mode 100644
index 00000000..2eb83720
--- /dev/null
+++ b/.github/workflows/prepare-release.yml
@@ -0,0 +1,296 @@
+# Prepare a new release: bump version strings, prefill the changelog
+# stub from release-drafter's draft, and open a PR. After the PR is
+# merged, you push the `v<version>` tag manually and `release.yml`
+# takes over (matrix build → GitHub release → Telegram notify).
+#
+# Triggered manually from the Actions UI or via:
+#   gh workflow run prepare-release.yml -f version=1.6.6
+#
+# What it bumps in the PR:
+#   - Cargo.toml                      version = "X.Y.Z"
+#   - Cargo.lock                      mhrv-rs entry's version
+#   - android/app/build.gradle.kts    versionName = "X.Y.Z"
+#                                     versionCode = previous + 1
+#
+# What it leaves alone:
+#   - tunnel-node/Cargo.toml — versioned independently from the app.
+#     The docker tunnel image is tagged from the git release tag (not
+#     from this Cargo.toml), so we don't need to touch it.
+#
+# What it prefills in docs/changelog/v<version>.md:
+#   - Persian section: an inline `[FA] translate ...` placeholder line.
+#     Visible if not edited — ships into the release page as an obvious
+#     marker rather than a quiet comment leak.
+#   - Separator: `---`
+#   - English section: bullets pulled from release-drafter's `next`
+#     draft release, each suffixed with `: <DESCRIPTION>` to remind you
+#     to add an explanatory clause in the project's existing
+#     `• headline (#NN): full explanation` style. If no draft exists
+#     yet (e.g. immediately after installing release-drafter, before
+#     any PRs have merged), the English section is empty and you fill
+#     it in by hand.
+
+name: prepare-release
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'New version to release (without the leading v). Example: 1.6.6'
+        required: true
+        type: string
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  bump:
+    runs-on: ubuntu-latest
+    steps:
+      # Always check out main, regardless of which branch the dispatch
+      # was fired from. workflow_dispatch can be triggered from any ref;
+      # without an explicit `ref:` the version bumps would land on top
+      # of whatever branch the dispatcher had checked out, and the
+      # resulting PR would carry that branch's diffs alongside the bumps.
+      - uses: actions/checkout@v4
+        with:
+          ref: main
+          fetch-depth: 0  # need tag history for the duplicate-tag check below
+
+      - name: Validate version
+        id: ver
+        env:
+          # Pass the dispatch input through an env var rather than
+          # `${{ inputs.version }}` interpolation. GitHub interpolates
+          # the expression *before* the shell parses the script, so a
+          # value like `1.0.0"; curl evil.com; echo "` would execute
+          # before the regex check below ever ran. workflow_dispatch
+          # is gated to write-access users so practical risk is low,
+          # but this is the pattern GitHub's own docs recommend for
+          # defense in depth.
+          INPUT_VERSION: ${{ inputs.version }}
+        run: |
+          set -euo pipefail
+          VER="$INPUT_VERSION"
+          VER="${VER#v}"
+          if ! [[ "$VER" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+            echo "::error::version '$VER' is not in X.Y.Z format"
+            exit 1
+          fi
+          if git rev-parse "v${VER}" >/dev/null 2>&1; then
+            echo "::error::tag v${VER} already exists; pick a different version"
+            exit 1
+          fi
+          BRANCH="release/v${VER}"
+          if git ls-remote --exit-code --heads origin "$BRANCH" >/dev/null 2>&1; then
+            echo "::error::branch $BRANCH already exists on origin; delete it or pick a different version"
+            exit 1
+          fi
+          echo "version=${VER}" >> "$GITHUB_OUTPUT"
+          echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT"
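+      # Worked example: dispatching with `version: v1.8.4` (illustrative)
+      # yields version=1.8.4 and branch=release/v1.8.4, and the run
+      # aborts early if tag v1.8.4 or branch release/v1.8.4 already
+      # exists on origin.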
+      - name: Bump Cargo.toml + Cargo.lock
+        env:
+          NEW_VER: ${{ steps.ver.outputs.version }}
+        run: |
+          set -euo pipefail
+          # Edit both files via Python so we anchor on the `name = "mhrv-rs"`
+          # line and only touch the package's own version, not unrelated
+          # `version = "..."` lines elsewhere in the lockfile.
+          python3 <<'PY'
+          import os, re, pathlib, sys
+          ver = os.environ["NEW_VER"]
+          for path in ("Cargo.toml", "Cargo.lock"):
+              p = pathlib.Path(path)
+              src = p.read_text()
+              new = re.sub(
+                  r'(name = "mhrv-rs"\nversion = ")[0-9.]+(")',
+                  rf'\g<1>{ver}\g<2>',
+                  src,
+                  count=1,
+              )
+              if new == src:
+                  sys.exit(f"ERROR: mhrv-rs version line not found in {path}")
+              p.write_text(new)
+              print(f"{path} -> {ver}")
+          PY
+
+      - name: Bump android versionName + versionCode
+        env:
+          NEW_VER: ${{ steps.ver.outputs.version }}
+        run: |
+          set -euo pipefail
+          # versionCode increments by 1 on every release; versionName
+          # mirrors the Cargo version. Both live in
+          # android/app/build.gradle.kts.
+          python3 <<'PY'
+          import os, re, pathlib, sys
+          ver = os.environ["NEW_VER"]
+          p = pathlib.Path("android/app/build.gradle.kts")
+          src = p.read_text()
+          m = re.search(r'versionCode\s*=\s*(\d+)', src)
+          if not m:
+              sys.exit("ERROR: versionCode not found in build.gradle.kts")
+          old_code = int(m.group(1))
+          new_code = old_code + 1
+          src = src[:m.start(1)] + str(new_code) + src[m.end(1):]
+          src, n = re.subn(
+              r'versionName\s*=\s*"[^"]+"',
+              f'versionName = "{ver}"',
+              src,
+              count=1,
+          )
+          if n == 0:
+              sys.exit("ERROR: versionName not found in build.gradle.kts")
+          p.write_text(src)
+          print(f"android/app/build.gradle.kts -> versionName={ver}, versionCode={old_code}->{new_code}")
+          PY
+
+      - name: Fetch release-drafter draft body
+        id: draft
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          # release-drafter accumulates merged-PR titles into a draft tagged
+          # `next`. Pull its body for the changelog stub. `--repo` is set
+          # explicitly so we always look up the release in this repo even
+          # if a future maintainer ever creates a real `next` git tag in a
+          # fork or upstream. If no draft exists yet (release-drafter just
+          # installed, no PRs merged since), the `|| true` keeps us going
+          # with an empty body — you fill the English section by hand.
+          # `--jq 'select(.isDraft) | .body'` returns nothing if `next` is
+          # not a draft (i.e. someone manually published a release with
+          # tag `next`, or pushed a real `next` git tag with a release
+          # attached). On that path we treat it as "no draft" and fall
+          # through to the empty-body branch — better than echoing a
+          # surprise release body into the changelog stub.
+          BODY=$(gh release view next --repo "${{ github.repository }}" \
+            --json body,isDraft --jq 'select(.isDraft) | .body' 2>/dev/null || true)
+          if [ -z "$BODY" ]; then
+            echo "::notice::no release-drafter 'next' draft found; English section will be empty"
+          else
+            echo "::notice::pulled $(printf '%s' "$BODY" | wc -l) lines from draft release"
+          fi
+          # Multiline outputs need a heredoc-style delimiter — pick one that
+          # cannot appear in a release-drafter bullet line.
+          {
+            echo 'body<<__DRAFT_BODY_EOF__'
+            printf '%s\n' "$BODY"
+            echo '__DRAFT_BODY_EOF__'
+          } >> "$GITHUB_OUTPUT"
+      - name: Write changelog stub
+        env:
+          NEW_VER: ${{ steps.ver.outputs.version }}
+          DRAFT_BODY: ${{ steps.draft.outputs.body }}
+        run: |
+          set -euo pipefail
+          # Build the file with shell `echo`/`printf` (not a YAML-level
+          # heredoc with $-double-curly interpolation) so backticks, dollar
+          # signs, or EOF tokens in the draft body can't break us.
+          #
+          # Why no TODO/instructional `<!-- ... -->` comments:
+          # release.yml strips leading `<!-- ... -->` blocks from the
+          # file before publishing the GitHub Release body, and the
+          # Telegram script does the same — both via a regex that handles
+          # multiple consecutive comments. But relying on stripping is
+          # brittle: a maintainer adding a new comment with a different
+          # shape (multi-line, indented, etc.) could leak it. Instead we
+          # use VISIBLE placeholders below. If the maintainer forgets to
+          # edit them, they ship as obvious `[FA]`/`<DESCRIPTION>` markers
+          # that an admin will spot in the release page within seconds.
+          mkdir -p docs/changelog
+          OUT="docs/changelog/v${NEW_VER}.md"
+          {
+            echo ''
+            echo '[FA] translate the English bullets below into Persian and replace this line.'
+            echo ''
+            echo '---'
+            # Append the English section if release-drafter had any.
+            # Skip the printf entirely on empty so we don't leave a
+            # trailing blank line under `---`.
+            if [ -n "$DRAFT_BODY" ]; then
+              # Strip Conventional-Commit prefixes (`feat:`, `fix(android):`,
+              # etc.) from the start of each bullet headline. PR titles in
+              # this repo all carry these prefixes by convention, but the
+              # existing changelog style is verb-first ("Add X" / "Fix Y"),
+              # not type-first. Stripping here saves the maintainer one
+              # manual step per bullet; they still need to fix the verb
+              # tense (e.g. "added" → "Add") since GitHub PR titles tend
+              # to be past-tense and the changelog convention is imperative.
+              #
+              # Bullet shape from release-drafter is:
+              #   • feat(scope): title text ([#NN](url)): <DESCRIPTION>. Thanks @user
+              # After this sed:
+              #   • title text ([#NN](url)): <DESCRIPTION>. Thanks @user
+              printf '%s\n' "$DRAFT_BODY" \
+                | sed -E 's/^(• )(feat|fix|chore|docs?|refactor|perf|test|build|ci|style|revert)(\([^)]*\))?!?: */\1/i'
+            fi
+          } > "$OUT"
+          echo "wrote $OUT ($(wc -l < "$OUT") lines)"
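+      # The stub written above renders as — a leading blank line, then
+      # (English bullet shape illustrative):
+      #
+      #   [FA] translate the English bullets below into Persian and replace this line.
+      #
+      #   ---
+      #   • title text ([#NN](url)): <DESCRIPTION>. Thanks @user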
+      # No `Ensure release-prep label exists` step here — release-drafter's
+      # workflow runs on every push to main, and its `Ensure autolabeler
+      # labels exist` step creates `release-prep` (along with the type:*
+      # labels). Since these workflow files only land via a push to main,
+      # release-drafter's bootstrap necessarily runs before the first
+      # prepare-release dispatch. If for some reason release-drafter is
+      # disabled, `gh pr create --label release-prep` below will fail with
+      # an actionable "label not found" — fix is to re-enable
+      # release-drafter or run `gh label create release-prep` once by hand.
+
+      - name: Commit, push, and open PR
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          NEW_VER: ${{ steps.ver.outputs.version }}
+          BRANCH: ${{ steps.ver.outputs.branch }}
+        run: |
+          set -euo pipefail
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git checkout -b "$BRANCH"
+          git add Cargo.toml Cargo.lock android/app/build.gradle.kts \
+            "docs/changelog/v${NEW_VER}.md"
+          git commit -m "release: prepare v${NEW_VER}"
+          git push -u origin "$BRANCH"
+
+          # Write the PR body to a file rather than fight nested heredoc
+          # escaping in the YAML run: block.
+          #
+          # IMPORTANT: this heredoc terminator (`MSG`) is INTENTIONALLY
+          # unquoted so that ${NEW_VER} and ${BRANCH} expand. Backticks
+          # in the body are escaped (\`) for the same reason. If you
+          # paste anything into the template below, watch out for `$(...)`
+          # and unescaped backticks — they will execute at workflow run
+          # time. To add a static block that should NOT interpolate, build
+          # it with a separate `<<'STATIC'` heredoc and concat afterward.
+          cat > /tmp/pr-body.md <<MSG
+          Prepares **v${NEW_VER}** from \`${BRANCH}\`.
+
+          **Before merging — finish \`docs/changelog/v${NEW_VER}.md\`:**
+          1. Translate the English bullets into Persian and replace the \`[FA]\` placeholder line above the \`---\` separator.
+          2. Replace each \`<DESCRIPTION>\` with a short explanatory clause matching the project's \`• headline (#NN): full explanation\` style. The Conventional-Commit prefix (\`feat:\`/\`fix:\`/etc.) and the trailing \`. Thanks @author\` are already handled.
+          3. Commit + push to this branch so the PR includes the final bilingual changelog.
+
+          Any \`[FA]\` or \`<DESCRIPTION>\` markers left in the file will ship verbatim into the GitHub Release page and the Telegram post — they're intentionally visible, not hidden in HTML comments.
+
+          **After merging — ship it:**
+          1. \`git checkout main && git pull\`
+          2. \`git tag v${NEW_VER} && git push origin v${NEW_VER}\`
+          3. \`release.yml\` picks up the tag, builds artifacts, creates the GitHub release, and (if enabled) posts to Telegram.
+          MSG
+
+          gh pr create \
+            --base main \
+            --head "$BRANCH" \
+            --title "release: prepare v${NEW_VER}" \
+            --label "release-prep" \
+            --body-file /tmp/pr-body.md
diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml
new file mode 100644
index 00000000..60e86445
--- /dev/null
+++ b/.github/workflows/release-drafter.yml
@@ -0,0 +1,65 @@
+# Updates the draft GitHub release on every push to main, and applies
+# Conventional-Commit-derived labels to incoming PRs. Config lives in
+# `.github/release-drafter.yml`. The drafter writes one line per merged
+# PR into a draft release tagged `next`; `prepare-release.yml` reads
+# that body when bumping versions so the English half of
+# `docs/changelog/v<version>.md` is prefilled.
+#
+# Cost: one ubuntu-latest job per relevant PR/push event, single API
+# call, no compile, no tests. Zero contention with the self-hosted
+# Hetzner runners that release.yml uses.
+
+name: release-drafter
+
+on:
+  push:
+    branches: [main]
+  # `pull_request_target` runs in the context of the base branch (main),
+  # which is what the autolabeler needs to write labels back to PRs —
+  # including PRs from forks, which the regular `pull_request` event
+  # doesn't grant write permissions for. We never check out PR code
+  # in this workflow (only call the action), so the elevated context
+  # is safe.
+  pull_request_target:
+    types: [opened, reopened, synchronize, edited]
+
+permissions:
+  contents: read
+
+jobs:
+  update-draft:
+    permissions:
+      contents: write       # write the draft release object
+      pull-requests: write  # apply autolabeler labels to incoming PRs
+    runs-on: ubuntu-latest
+    steps:
+      # Ensure the labels referenced by .github/release-drafter.yml's
+      # autolabeler block all exist. release-drafter logs a warning and
+      # skips when it tries to apply a label that's missing — labelling
+      # itself doesn't fail, but exclude-labels and downstream filtering
+      # become silent no-ops. `gh label create … || true` is idempotent:
+      # creates on first run, exits with "already exists" on every run
+      # after that. Cheap (six API calls per workflow run, no compile).
+      - name: Ensure autolabeler labels exist
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          # Format: name|color|description (color without leading #).
+          while IFS='|' read -r name color desc; do
+            gh label create "$name" --color "$color" --description "$desc" \
+              --repo "${{ github.repository }}" 2>/dev/null || true
+          done <<'LABELS'
+          release-prep|ededed|Automated version-bump PR; excluded from release-drafter changelog
+          type: feature|a2eeef|feat: PR — auto-applied by release-drafter
+          type: fix|d73a4a|fix: PR — auto-applied by release-drafter
+          type: chore|cfd3d7|chore: PR — auto-applied by release-drafter
+          type: docs|0075ca|docs: PR — auto-applied by release-drafter
+          type: refactor|fbca04|refactor: PR — auto-applied by release-drafter
+          LABELS
+
+      - uses: release-drafter/release-drafter@v6
+        with:
+          config-name: release-drafter.yml
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
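The autolabeler patterns referenced above are anchored, case-insensitive, and allow an optional `(scope)`. A quick Python sanity check of the `feat` rule (a sketch — titles are illustrative):

    import re

    feat = re.compile(r"^feat(\(.+\))?:", re.I)
    assert feat.match("feat: add dark mode")
    assert feat.match("Feat(ui): add dark mode")
    assert not feat.match("defeat: not a feature")  # anchored at ^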
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 55caa258..8068b30c 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -77,6 +77,17 @@ jobs:
           - target: x86_64-pc-windows-gnu
             os: windows-latest
             name: mhrv-rs-windows-amd64
+          # i686-pc-windows-msvc target was attempted in v1.7.7-v1.7.10
+          # to support Windows 7 32-bit users (#272, #318). Removed in
+          # v1.7.11 because keeping it on Rust 1.77.2 (last Win7-stable)
+          # is fundamentally fragile: every transitive crate that bumps
+          # its MSRV (e.g. `time` 0.3.47 needs Cargo manifest features
+          # only available in Rust 1.78+) breaks the build, and pinning
+          # transitives is brittle across releases. Win7 users should
+          # self-build per the README; the project no longer ships a
+          # prebuilt i686 Win7 binary. Replaced by the existing
+          # x86_64-pc-windows-gnu (windows-amd64) which covers ~99% of
+          # active Windows installs (incl. x64 emulation on Windows-on-ARM).
           - target: x86_64-unknown-linux-musl
             os: [self-hosted, linux, x64, mhrv-build]
             name: mhrv-rs-linux-musl-amd64
@@ -138,9 +149,14 @@
       # installed and the standard target triples are pre-added. It
       # still verifies the target is present and is cheap enough to keep
       # as a safety net.
-      - uses: dtolnay/rust-toolchain@stable
+      # Per-matrix-entry toolchain selection. Default is `stable` (latest)
+      # for every target; `rust_toolchain` stays as a hook for explicit
+      # pins. Nothing pins it now that i686-pc-windows-msvc — which needed
+      # 1.77.2 to keep the Win7 binary loadable (Rust 1.78+ raised the
+      # Windows floor to Win10) — is removed above.
+      - uses: dtolnay/rust-toolchain@master
         if: matrix.mipsel_softfloat != true
         with:
+          toolchain: ${{ matrix.rust_toolchain || 'stable' }}
          targets: ${{ matrix.target }}
 
       # Cache target/ + cargo registry across runs — this is the big
@@ -159,7 +175,11 @@
       - uses: Swatinem/rust-cache@v2
         if: matrix.mipsel_softfloat != true
         with:
-          key: ${{ matrix.target }}
+          # Include the toolchain in the cache key so a pinned-Rust
+          # target doesn't collide with stable-Rust caches for other
+          # targets, and a future toolchain bump invalidates only the
+          # affected slot.
+          key: ${{ matrix.target }}-${{ matrix.rust_toolchain || 'stable' }}
           cache-bin: "false"
 
       # eframe needs a few system libs on Linux for window management, keyboard,
@@ -578,10 +598,33 @@
         with:
           fetch-depth: 0
 
-      - uses: actions/download-artifact@v4
-        with:
-          path: dist
-          merge-multiple: true
+      # `actions/download-artifact@v4` has been intermittently flaking on
+      # this workflow with "5 retries exhausted" on a single artifact (~10
+      # of 13). Wrap it in a manual retry — usually the second attempt
+      # succeeds, the third nails any laggards. We use `gh run download`
+      # against the current run so we don't depend on the release page
+      # existing yet (it doesn't until the softprops step below runs).
+      - name: Download all build artifacts (with retries)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          mkdir -p dist
+          for attempt in 1 2 3; do
+            if gh run download "${GITHUB_RUN_ID}" --dir dist --repo "${GITHUB_REPOSITORY}"; then
+              echo "downloaded all artifacts on attempt $attempt"
+              # `gh run download` puts each artifact in its own subdir;
+              # flatten so downstream steps that expect dist/<file> work
+              # the same as `merge-multiple: true` did.
+              find dist -mindepth 2 -type f -exec mv -f {} dist/ \;
+              find dist -type d -empty -delete
+              ls -la dist/
+              exit 0
+            fi
+            echo "download attempt $attempt failed; retrying in 30s..."
+            sleep 30
+          done
+          echo "::error::failed to download artifacts after 3 attempts"
+          exit 1
 
       # Compose the GitHub release body from `docs/changelog/v<version>.md`
       # so the Releases page tells humans what actually changed —
@@ -608,8 +651,22 @@
           fi
           {
             echo 'body<<__RELEASE_BODY_EOF__'
-            # Strip leading HTML comment that documents the file format.
-            sed -e '1{/^\s*<!--/,/-->/d}' "$FILE"
+            # Strip leading HTML comment block(s) that document the file
+            # format — same regex as _strip_leading_comments in
+            # .github/scripts/telegram_release_notify.py; keep in sync.
+            python3 - "$FILE" <<'PY'
+          import re, sys
+          from pathlib import Path
+          body = Path(sys.argv[1]).read_text(encoding="utf-8")
+          print(re.sub(r"^\s*(?:<!--.*?-->\s*)+", "", body, count=1, flags=re.S), end="")
+          PY
            echo
            echo '__RELEASE_BODY_EOF__'
          } >> "$GITHUB_OUTPUT"
@@ -636,43 +693,193 @@
           append_body: true
           generate_release_notes: true
 
-  # Notify the Persian-speaking Telegram channel with the CI-built
-  # Android APK + its sha256 + the per-version changelog from
-  # `docs/changelog/v<version>.md`.
+  # Refresh the in-repo `releases/` folder with the latest pre-built
+  # artifacts so users behind GitHub-Releases-page filtering (the IR
+  # state network filters the dynamic /releases/ URL but not the static
+  # `Code → Download ZIP` of the source tree) can still download.
+  # Practice was started pre-v1.1.0, dropped, then resumed at user
+  # request after a Telegram-channel suggestion: "فقط داخل پوشه ریلیز
+  # پروژه اپلود بکن — مشکل دانلود حل میشه — راحت میشه از گیتهاب دانلود
+  # کرد." ("just upload it inside the project's releases folder — that
+  # fixes the download problem — it becomes easy to download from
+  # GitHub.") The folder holds ONLY the latest version (replace, not
+  # archive); each tag refresh overwrites the previous artifacts. The
+  # existing release-page workflow keeps versioned artifacts behind
+  # `https://github.com/.../releases/tag/v...` for users who can reach
+  # that URL — this in-repo folder is the fallback for users who can't.
+  commit-releases:
+    needs: [build, android, release]
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      # Always check out main, not the tag — we're committing back to
+      # the moving branch. fetch-depth 0 so `git push origin HEAD:main`
+      # has the lineage to fast-forward.
+      - uses: actions/checkout@v4
+        with:
+          ref: main
+          fetch-depth: 0
+
+      # Pull artifacts from the GitHub Release page (which the `release`
+      # job populated a few seconds earlier) rather than the workflow
+      # artifacts API. The artifacts API path —
+      # `actions/download-artifact@v4` with `merge-multiple: true` —
+      # has been failing with "artifact download failed after 5
+      # retries" on one of the ~13 artifacts on multiple consecutive
+      # runs (v1.7.5 retrigger, v1.7.6). The 10 fast downloads that
+      # complete first all succeed; the 11th-13th hit the error.
+      # `gh release download` reads from GitHub's Release-page CDN,
+      # which is independent of the artifacts blob store and has a
+      # different retry / rate-limit profile. Same files, more
+      # reliable surface.
+      - name: Download artifacts from the GitHub Release page
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          VER="${{ inputs.version || github.ref_name }}"
+          # Strip leading `v` to normalize, then re-add — the Release
+          # tag is `vX.Y.Z`, but for the rest of the workflow we use
+          # bare `X.Y.Z`. Mirror the same pattern here so a downstream
+          # readme update can use the bare version.
+          VER="${VER#v}"
+          mkdir -p artifacts
+          gh release download "v${VER}" \
+            --repo "${{ github.repository }}" \
+            --dir artifacts \
+            --pattern '*.tar.gz' \
+            --pattern '*.zip' \
+            --pattern '*.apk'
+          echo "--- artifacts/ contents ---"
+          ls -la artifacts/
+      - name: Refresh releases/ folder
+        run: |
+          set -euo pipefail
+          VER="${{ inputs.version || github.ref_name }}"
+          VER="${VER#v}"
+
+          mkdir -p releases
+
+          # Wipe old binary artifacts (.apk, .tar.gz, .zip) but keep
+          # README.md and .gitattributes — those are folder-level docs
+          # that stay constant across versions and shouldn't be
+          # regenerated on every release.
+          find releases -maxdepth 1 -type f \
+            \( -name '*.apk' -o -name '*.tar.gz' -o -name '*.zip' \) \
+            -delete
+
+          # Copy desktop archives. Their names already include the
+          # platform identifier (mhrv-rs-linux-amd64.tar.gz, etc.) and
+          # are version-stable — no rename needed.
+          for f in artifacts/*.tar.gz artifacts/*.zip; do
+            [ -f "$f" ] || continue
+            cp "$f" "releases/$(basename "$f")"
+          done
+
+          # Android APKs come with the version baked into the name
+          # (mhrv-rs-android-universal-v1.7.5.apk). Copy all of them so
+          # users on slow connections can grab a per-ABI APK (~37 MB)
+          # instead of the universal (~110 MB).
+          for f in artifacts/mhrv-rs-android-*.apk; do
+            [ -f "$f" ] || continue
+            cp "$f" "releases/$(basename "$f")"
+          done
+
+          # Update the "Current version" line in releases/README.md
+          # (both English and Persian copies) and APK filename refs so
+          # the doc stays accurate. BSD/GNU `sed -i` compatibility is
+          # handled by passing an explicit `.bak` backup suffix — both
+          # GNU and BSD sed accept `-i.bak`, so the same command works
+          # on macOS for anyone running this locally; the backup file
+          # is deleted right after.
+          if [ -f releases/README.md ]; then
+            sed -i.bak \
+              -e "s/Current version: \*\*v[0-9][0-9.]*\*\*/Current version: **v${VER}**/" \
+              -e "s/نسخهٔ فعلی: \*\*v[0-9][0-9.]*\*\*/نسخهٔ فعلی: **v${VER}**/" \
+              -e "s/mhrv-rs-android-universal-v[0-9][0-9.]*\.apk/mhrv-rs-android-universal-v${VER}.apk/g" \
+              releases/README.md
+            rm -f releases/README.md.bak
+          fi
+
+          echo "--- releases/ contents after refresh ---"
+          ls -la releases/
+
+      - name: Commit + push to main
+        run: |
+          set -euo pipefail
+          VER="${{ inputs.version || github.ref_name }}"
+          VER="${VER#v}"
+
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+          git add releases
+          if git diff --cached --quiet; then
+            echo "No releases/ changes to commit (artifacts identical to current HEAD?)."
+            exit 0
+          fi
+
+          git commit -m "chore(releases): refresh prebuilt binaries for v${VER}" \
+            -m "Auto-committed by release workflow so users behind GitHub-Releases-page filtering can download via the in-repo releases/ folder. The GitHub Release page itself still has the canonical versioned artifacts; this folder is the fallback path for users who can only reach the static source tree (Code → Download ZIP)."
+
+          # Push to main. The release workflow runs on the tag commit,
+          # which is reachable from main as a fast-forward — push is
+          # straightforward, no force needed. Tag protection rules
+          # apply to refs/tags/* not refs/heads/main, so this push
+          # isn't gated by the same protection.
+          git push origin HEAD:main
+  # ─────────── LEGACY — DORMANT BY DEFAULT ───────────
+  #
+  # Posts the universal APK + per-version changelog to the **main**
+  # Telegram channel as one big sendDocument + sendMessage pair.
+  #
+  # Superseded as of v1.8.0+ by `.github/workflows/telegram-publish-files.yml`,
+  # which posts each platform's artifact individually to the **files**
+  # channel (with SHA-256 captions) and then a single cross-link
+  # message to the main channel pointing at the files-channel anchor.
+  #
+  # This job stays in the source tree, dormant, in case we ever want
+  # to revert to the bundled-changelog-on-main-channel pattern (or
+  # use both at once during a transition). To turn it back on:
+  #
+  #   gh variable set TELEGRAM_NOTIFY_ENABLED --body true
   #
-  # Two Telegram API calls:
-  #   1. sendDocument — APK file + a short caption (Telegram caps
-  #      captions at 1024 chars, and we have bigger changelogs than
-  #      that).
-  #   2. sendMessage — full changelog as a reply to #1, Persian
-  #      quote-block first then English, same pattern as the
-  #      previous manual post. No emojis, as the user asked.
+  # Note: with the new workflow active too, that produces TWO posts
+  # to the main channel per release (the legacy APK+changelog *and*
+  # the new cross-link). Pick one.
   #
-  # Needs two repo secrets:
-  #   TELEGRAM_BOT_TOKEN — bot the channel admits as poster
-  #   TELEGRAM_CHAT_ID   — numeric chat id (starts with -100...)
-  # Missing either => the whole job is skipped (not failed) so a
-  # forker who hasn't set up a Telegram channel gets a clean release.
+  # Default state is disabled.
   telegram:
     needs: [android, release]
     runs-on: ubuntu-latest
     # Gated on the repo variable `TELEGRAM_NOTIFY_ENABLED`. Default is
-    # OFF — the job skips silently unless the variable is set to the
-    # literal string "true". Toggle via:
-    #
-    #   gh variable set TELEGRAM_NOTIFY_ENABLED --body true
-    #   gh variable set TELEGRAM_NOTIFY_ENABLED --body false
-    #
-    # Keeping the machinery (script + secrets) in place so flipping
-    # the switch back on is a one-liner, not a workflow edit.
+    # off — the job skips silently unless the variable is set to the
+    # literal string "true".
     if: ${{ vars.TELEGRAM_NOTIFY_ENABLED == 'true' && needs.android.result == 'success' }}
     steps:
       - uses: actions/checkout@v4
 
-      - uses: actions/download-artifact@v4
-        with:
-          name: mhrv-rs-android-universal
-          path: apk
+      # Same retry pattern as the `release` job above — `actions/download-artifact@v4`
+      # has been flaking on this workflow with 5-retries-exhausted errors.
+      - name: Download universal APK (with retries)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          mkdir -p apk
+          for attempt in 1 2 3; do
+            if gh run download "${GITHUB_RUN_ID}" \
+                --name mhrv-rs-android-universal \
+                --dir apk \
+                --repo "${GITHUB_REPOSITORY}"; then
+              echo "downloaded universal APK on attempt $attempt"
+              ls -la apk/
+              exit 0
+            fi
+            echo "download attempt $attempt failed; retrying in 30s..."
+            sleep 30
+          done
+          echo "::error::failed to download universal APK after 3 attempts"
+          exit 1
 
       - name: Post to Telegram
         env:
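For a recipient, reassembling a chunked upload is plain byte concatenation in part-suffix order. A Python equivalent of the `cat` + `sha256sum` instructions carried in each part's caption (a sketch — the filename is illustrative):

    import hashlib
    from pathlib import Path

    name = "mhrv-rs-android-universal-v1.8.0.apk"
    parts = sorted(Path(".").glob(name + ".part_*"))  # part_aa, part_ab, ...
    with open(name, "wb") as out:
        for part in parts:
            out.write(part.read_bytes())
    # Compare against the full-file SHA-256 printed in the caption.
    print(hashlib.sha256(Path(name).read_bytes()).hexdigest())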
diff --git a/.github/workflows/telegram-publish-files.yml b/.github/workflows/telegram-publish-files.yml
new file mode 100644
index 00000000..e7d956b9
--- /dev/null
+++ b/.github/workflows/telegram-publish-files.yml
@@ -0,0 +1,137 @@
+name: Telegram publish release files
+
+# Posts every release artifact (Android APKs, Windows ZIP, macOS, Linux,
+# OpenWRT, Raspbian) to the Telegram channel as individual messages with
+# Persian captions and a #v<version> hashtag. Files larger
+# than the bot API's 50 MB ceiling are split into ~45 MB byte chunks
+# server-side and posted as `<name>.part_aa`, `<name>.part_ab`, ... —
+# recipients reassemble with `cat <name>.part_* > <name>`.
+#
+# This workflow is decoupled from `release.yml` so it can be re-triggered
+# for any historical tag (e.g. to re-post v1.8.0 after a Telegram channel
+# wipe) without rebuilding artifacts. It downloads from the GitHub Release
+# page directly via `gh release download`, so the assets must already
+# exist there.
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Release tag to publish (with or without the v prefix, e.g. 1.8.0 or v1.8.0)'
+        required: true
+        type: string
+  # Auto-trigger after a successful `release` workflow run. Posts files
+  # to Telegram once the release page exists. The `head_branch` of the
+  # triggering run is the tag name (e.g. `v1.8.0`) on tag-pushed releases,
+  # which is what we feed `gh release download`.
+  workflow_run:
+    workflows: [release]
+    types: [completed]
+
+permissions:
+  contents: read
+
+jobs:
+  publish:
+    # Skip when triggered by a `release` run that didn't succeed — no
+    # point posting half a release. Manual `workflow_dispatch` always
+    # runs (the user explicitly asked for it).
+    if: |
+      github.event_name == 'workflow_dispatch'
+      || github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # Sparse checkout would be nicer, but a stock shallow checkout
+          # of this ~5 MB repo is fast enough for the ~200 KB script we
+          # actually need.
+          fetch-depth: 1
+
+      - name: Resolve version + hashtag
+        id: ver
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          if [ -n "${{ inputs.version || '' }}" ]; then
+            VER="${{ inputs.version }}"
+          else
+            # workflow_run path. `head_branch` for a tag-pushed release
+            # workflow is the tag name (e.g. `v1.8.0`).
+            VER="${{ github.event.workflow_run.head_branch || '' }}"
+          fi
+          if [ -z "$VER" ]; then
+            echo "::error::could not determine version from inputs or workflow_run trigger"
+            exit 1
+          fi
+          # Strip the leading `v` if present.
+          VER="${VER#v}"
+          # Hashtag: `#v` + version with dots removed. So 1.8.0 → #v180,
+          # 1.8.10 → #v1810, 2.0.0 → #v200. Predictable across releases.
+          HASHTAG="#v$(echo "$VER" | tr -d '.')"
+          echo "version=$VER" >> "$GITHUB_OUTPUT"
+          echo "hashtag=$HASHTAG" >> "$GITHUB_OUTPUT"
+          echo "Resolved: version=$VER hashtag=$HASHTAG"
+
+      - name: Download release assets
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          mkdir -p assets
+          # Mirror the retry pattern from `release.yml`'s download step —
+          # GitHub's release-asset CDN occasionally times out on cold
+          # tags. Three attempts with 30 s backoff covers most flakes.
+          for attempt in 1 2 3; do
+            if gh release download "v${{ steps.ver.outputs.version }}" \
+                --dir assets \
+                --repo "${GITHUB_REPOSITORY}"; then
+              echo "downloaded release assets on attempt $attempt"
+              ls -la assets/
+              exit 0
+            fi
+            echo "attempt $attempt failed; retrying in 30s..."
+            sleep 30
+          done
+          echo "::error::failed to download release assets after 3 attempts"
+          exit 1
+
+      - name: Publish files to Telegram channel
+        env:
+          BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
+          # The files channel — supergroup-style negative ID, hard-coded
+          # rather than templated as a repo variable because there's only
+          # ever one of these and putting it in source makes the workflow
+          # auditable. The bot token already has post permissions there.
+          CHAT_ID: '-1003966234444'
+          # The main announcement channel. Receives a single cross-link
+          # message per release pointing at the file-channel anchor post,
+          # instead of the previous behaviour of attaching the universal
+          # APK + full changelog. Sourced from the same secret the
+          # legacy `telegram` job in release.yml used.
+          MAIN_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
+          # Public-username form of the files channel link. Used for
+          # both (a) the post-link in the main-channel cross-post — so
+          # `t.me/<username>/<post-id>` works for everyone, not just
+          # members via `t.me/c/<internal>/<post-id>` — and (b) one of
+          # the two channel-join links rendered at the bottom of the
+          # cross-post. Defaults to `mhrv_rs` (current public username);
+          # override via repo variable if the channel is renamed.
+          FILES_CHANNEL_USERNAME: ${{ vars.FILES_CHANNEL_USERNAME || 'mhrv_rs' }}
+          # `t.me/+<hash>` invite link for the files channel.
Rendered + # as the second channel-join option in the main-channel + # cross-post — the only join path that works for users coming + # from outside Telegram search (private/restricted channels) + # or whose Telegram client doesn't resolve usernames cleanly. + # Override via repo variable if the channel's invite hash is + # rotated. + FILES_CHANNEL_INVITE: ${{ vars.FILES_CHANNEL_INVITE || 'https://t.me/+R1OyoHX2boA1ZDgx' }} + run: | + if [ -z "${BOT_TOKEN:-}" ]; then + echo "::error::TELEGRAM_BOT_TOKEN not set; can't publish" + exit 1 + fi + python3 .github/scripts/telegram_publish_files.py \ + --assets-dir assets \ + --version "${{ steps.ver.outputs.version }}" \ + --hashtag "${{ steps.ver.outputs.hashtag }}" diff --git a/Cargo.lock b/Cargo.lock index 66a711c2..1b27f233 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -160,6 +160,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + [[package]] name = "arboard" version = "3.6.1" @@ -173,7 +179,7 @@ dependencies = [ "objc2-foundation 0.3.2", "parking_lot", "percent-encoding", - "windows-sys 0.52.0", + "windows-sys 0.60.2", "x11rb", ] @@ -556,6 +562,17 @@ dependencies = [ "libc", ] +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.1", +] + [[package]] name = "chrono" version = "0.4.44" @@ -753,6 +770,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1070,18 +1096,6 @@ dependencies = [ "serde", ] -[[package]] -name = "enum-as-inner" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "enumn" version = "0.1.14" @@ -1417,8 +1431,22 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", "wasip2", + "wasip3", ] [[package]] @@ -1654,25 +1682,19 @@ checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df" [[package]] name = "hickory-proto" -version = "0.25.2" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" +checksum = "a916d0494600d99ecb15aadfab677ad97c4de559e8f1af0c129353a733ac1fcc" dependencies = [ - "async-trait", - "cfg-if", "data-encoding", - "enum-as-inner", - "futures-channel", - "futures-io", - "futures-util", "idna", "ipnet", + "jni 0.22.4", "once_cell", - "rand 0.9.4", + "rand 0.10.1", "ring", "thiserror 2.0.18", "tinyvec", - "tokio", "tracing", "url", ] @@ -1810,6 
+1832,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "idna" version = "1.1.0" @@ -1852,6 +1880,8 @@ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -2030,6 +2060,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.185" @@ -2186,7 +2222,7 @@ dependencies = [ [[package]] name = "mhrv-rs" -version = "1.6.0" +version = "1.8.3" dependencies = [ "base64 0.22.1", "bytes", @@ -2861,6 +2897,16 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8cf8e6a8aa66ce33f63993ffc4ea4271eb5b0530a9002db8455ea6050c77bfa" +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.117", +] + [[package]] name = "proc-macro-crate" version = "3.5.0" @@ -2915,6 +2961,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.8.6" @@ -2936,6 +2988,17 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -2974,6 +3037,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "raw-window-handle" version = "0.5.2" @@ -3163,7 +3232,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.4.15", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3305,7 +3374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -3461,9 +3530,9 @@ dependencies = [ [[package]] name = "socks5-impl" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eae7c78f163b7805f66493c787d7bad4816146faf0cf655d57c78b90c383ce3" +checksum = "150816c2d954315f351129f438f851285e1ddb6d6ccc850ddd45c523d19abda0" dependencies = [ "async-trait", "bytes", @@ -3567,7 +3636,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix 1.1.4", "windows-sys 0.61.2", @@ -3891,9 +3960,9 @@ dependencies = [ [[package]] name = "tun2proxy" -version = "0.7.20" +version = "0.7.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0576f75fd691ad86cdc4348f29fb8770037ab8140179f1f9f8f6991f7ebd2176" +checksum = "d336ad07beb04a9e219972fcdc54a71d2586cdfd35ac03551a629e4ca328db3c" dependencies = [ "android_logger", "async-trait", @@ -4061,7 +4130,16 @@ version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", ] [[package]] @@ -4119,6 +4197,40 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.11.1", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "wayland-backend" version = "0.3.15" @@ -4467,7 +4579,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -4945,12 +5057,100 @@ dependencies = [ "winreg", ] +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.11.1", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "writeable" version = "0.6.3" diff --git a/Cargo.toml b/Cargo.toml index e7fdae86..9f12ce8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mhrv-rs" -version = "1.6.0" +version = "1.8.3" edition = "2021" description = "Rust port of MasterHttpRelayVPN -- DPI bypass via Google Apps Script relay with domain fronting" license = "MIT" @@ -94,11 +94,15 @@ libc = "0.2" # traffic black-holes (symptom: Chrome shows DNS_PROBE_STARTED). [target.'cfg(target_os = "android")'.dependencies] jni = { version = "0.21", default-features = false } -tun2proxy = { version = "0.7", default-features = false } +tun2proxy = { version = "0.7", default-features = false, features = ["udpgw"] } [dev-dependencies] # Used in mitm tests to sanity-check the cert extensions we emit. x509-parser = "0.16" +# `test-util` enables `tokio::test(start_paused = true)` so timing- +# sensitive tests in `tunnel_client` (the empty-poll cadence) can +# auto-advance virtual time instead of burning real wall-clock seconds. +tokio = { version = "1", features = ["test-util"] } [profile.release] panic = "abort" diff --git a/README.md b/README.md index 938d10c1..353bd0f3 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ To route your browser's HTTPS traffic through the Apps Script relay, `mhrv-rs` h - A fresh CA keypair (`ca/ca.crt` + `ca/ca.key`) is generated **on your machine**, in your user-data dir. - The public `ca.crt` is added to your system trust store so browsers accept the per-site certificates `mhrv-rs` mints on the fly. This is the step that needs sudo / Administrator. - The private `ca.key` **never leaves your machine**. Nothing uploads it, nothing phones home, and no remote party — including the Apps Script relay — can use it to impersonate sites to you. -- You can revoke it at any time by deleting the CA from your OS keychain (macOS: Keychain Access → System → delete `mhrv-rs`) / Windows cert store / `/etc/ca-certificates`, and removing the `ca/` folder. +- You can revoke it at any time with `mhrv-rs --remove-cert` (or the **Remove CA** button in the UI) — it clears the CA from the OS trust store, verifies the revocation by name before touching disk, and deletes the on-disk `ca/` folder. NSS cleanup (Firefox profiles + Chrome/Chromium on Linux) is best-effort: if `certutil` from libnss3-tools isn't on PATH or a browser has the NSS DB locked, the tool logs a manual-cleanup hint. 
`config.json` and your Apps Script deployment are not touched, so regenerating the CA never requires redeploying `Code.gs`. Manual fallback: the certificate's Common Name is `MasterHttpRelayVPN` (not `mhrv-rs` — that's the app name, not the cert name). Delete by that CN in your OS keychain (macOS: Keychain Access → System → delete `MasterHttpRelayVPN`), Windows `certmgr.msc` → Trusted Root Certification Authorities, or `/usr/local/share/ca-certificates/MasterHttpRelayVPN.crt` + `sudo update-ca-certificates` on Linux; remove the `MasterHttpRelayVPN` entry from each browser's cert settings; and remove the `ca/` folder under the user-data dir.
 
 The launcher does all of this for you and then starts the UI:
 
@@ -197,9 +197,14 @@ Then:
 ./mhrv-rs test # one-shot end-to-end probe
 ./mhrv-rs scan-ips # rank Google frontend IPs by latency
 ./mhrv-rs --install-cert # reinstall the MITM CA
+./mhrv-rs --remove-cert # clean slate: uninstall + delete the whole ca/ dir
 ./mhrv-rs --help
 ```
 
+`--remove-cert` removes the CA from the OS trust store, verifies the revocation by name, and only then deletes the on-disk `ca/` directory — if a system-level delete needed admin you didn't have, it aborts the file deletion and prints an error so you can re-run elevated. NSS cleanup (Firefox profiles + Chrome/Chromium on Linux) is best-effort: if `certutil` isn't on PATH or a browser holds the NSS DB open, the tool logs a manual-cleanup hint. Your `config.json` and the Apps Script deployment at `script.google.com` are untouched, so a fresh CA (generated next time you start the proxy) does not require redeploying `Code.gs`.
+
+> **Upgrading from pre-v1.2.11?** Earlier versions wrote a bare `user_pref("security.enterprise_roots.enabled", true);` into each Firefox profile's `user.js` without a provenance marker. `--remove-cert` intentionally does **not** strip that line — a bare pref is indistinguishable from one authored by the user or a corporate policy, and silently revoking trust behavior is worse than leaving one cosmetic orphan line. Firefox falls back to its built-in Mozilla root store the moment the MITM CA leaves the OS trust store, so this has no functional effect. Delete the line manually if it bothers you.
+
 `script_id` can also be a JSON array: `["id1", "id2", "id3"]`.
 
 #### scan-ips configuration (optional)
@@ -710,9 +715,15 @@ logread -e mhrv-rs -f
 
 **چطور گواهی را بعداً حذف کنم؟**
 
-- **مک:** `Keychain Access` را باز کنید، در بخش `System` دنبال `mhrv-rs` بگردید و حذف کنید. سپس پوشهٔ `~/Library/Application Support/mhrv-rs/ca/` را پاک کنید
-- **ویندوز:** `certmgr.msc` را اجرا کنید → `Trusted Root Certification Authorities` → `Certificates` → دنبال `mhrv-rs` بگردید و حذف کنید
-- **لینوکس:** فایل `/usr/local/share/ca-certificates/mhrv-rs.crt` را حذف و `sudo update-ca-certificates` اجرا کنید
+- **ساده‌ترین راه (هر سه سیستم‌عامل):** داخل برنامه روی دکمهٔ **`Remove CA`** بزنید، یا در ترمینال:
+  - مک/لینوکس: `sudo ./mhrv-rs --remove-cert`
+  - ویندوز (با `Run as administrator`): `mhrv-rs.exe --remove-cert`
+  - این دستور گواهی را از `trust store` سیستم و `NSS` (فایرفاکس/کروم) پاک می‌کند و فایل‌های `ca/ca.crt` و `ca/ca.key` را هم روی دیسک حذف می‌کند. فایل `config.json` و `deployment` آپس‌اسکریپت دست‌نخورده می‌مانند — پس لازم نیست `Code.gs` را دوباره دیپلوی کنید.
+- **به‌صورت دستی** (اگر می‌خواهید):
+  - **نکته:** نام گواهی (`Common Name`) در همهٔ مکان‌ها `MasterHttpRelayVPN` است — `mhrv-rs` نام برنامه است، نه نام گواهی.
+  - **مک:** `Keychain Access` را باز کنید، در بخش `System` دنبال `MasterHttpRelayVPN` بگردید و حذف کنید.
سپس پوشهٔ `~/Library/Application Support/mhrv-rs/ca/` را پاک کنید + - **ویندوز:** `certmgr.msc` را اجرا کنید → `Trusted Root Certification Authorities` → `Certificates` → دنبال `MasterHttpRelayVPN` بگردید و حذف کنید + - **لینوکس:** فایل `/usr/local/share/ca-certificates/MasterHttpRelayVPN.crt` را حذف و `sudo update-ca-certificates` اجرا کنید **چند `Deployment ID` لازم دارم؟** یکی برای استفادهٔ عادی کافی است. سهمیهٔ روزانه `UrlFetchApp` برای حساب رایگان گوگل **۲۰٬۰۰۰ درخواست در روز** است (برای `Workspace` پولی ۱۰۰٬۰۰۰)، با محدودیت پاسخ ۵۰ مگابایت به ازای هر `fetch`. از هر حساب گوگل **فقط یک `Deployment`** بسازید — سقف ۳۰ درخواست همزمان به ازای هر حساب است، پس چند `Deployment` روی یک حساب همزمانی اضافه نمی‌کند. برای افزایش همزمانی یا سهمیهٔ روزانه، در حساب‌های گوگل دیگر `Deployment` بسازید — هر حساب سهمیهٔ ۲۰ هزار درخواستی و ۳۰ اجرای همزمان خودش را دارد. همهٔ `ID`ها را در فیلد `Apps Script ID(s)` وارد کنید — برنامه خودکار بینشان می‌چرخد. مرجع: @@ -735,9 +746,12 @@ logread -e mhrv-rs -f ./mhrv-rs scan-ips # رتبه‌بندی IPهای گوگل بر اساس سرعت ./mhrv-rs test-sni # تست نام‌های SNI در pool ./mhrv-rs --install-cert # نصب مجدد گواهی +./mhrv-rs --remove-cert # حذف کامل گواهی: پاک‌سازی trust store و کل پوشهٔ ca/ ./mhrv-rs --help ``` +دستور `--remove-cert` گواهی را از `trust store` سیستم پاک می‌کند، با بررسی نام تأیید می‌کند که حذف انجام شده، و سپس پوشهٔ `ca/` روی دیسک را حذف می‌کند — اگر حذف نیاز به دسترسی ادمین داشته باشد که در دسترس نبوده، قبل از پاک کردن فایل‌ها متوقف می‌شود تا بتوانید با دسترسی مدیر دوباره اجرا کنید. پاک‌سازی `NSS` (فایرفاکس/کروم) `best-effort` است: اگر `certutil` نصب نباشد یا یکی از مرورگرها بازِ دیتابیس را قفل کرده باشد، ابزار پیغامی با راهنمای پاک‌سازی دستی نشان می‌دهد. فایل `config.json` شما و `deployment` آپس‌اسکریپت در `script.google.com` دست‌نخورده می‌مانند — یعنی وقتی در اجرای بعدی گواهی تازه تولید می‌شود، نیازی به دیپلوی مجدد `Code.gs` نیست. + **چرا گاهی جست‌وجوی گوگل بدون `JavaScript` نشان داده می‌شود؟** `Apps Script` مجبور است `User-Agent` درخواست‌های خود را روی `Google-Apps-Script` بگذارد. بعضی سایت‌ها این را به عنوان ربات شناسایی می‌کنند و نسخهٔ سادهٔ بدون `JavaScript` برمی‌گردانند. دامنه‌هایی که در لیست `SNI-rewrite` قرار گرفته‌اند (مثل `google.com`، `youtube.com`) از این مشکل در امان هستند چون مستقیماً از لبهٔ گوگل می‌آیند، نه از `Apps Script`. diff --git a/android/app/build.gradle.kts b/android/app/build.gradle.kts index 29671b31..2cb00e5f 100644 --- a/android/app/build.gradle.kts +++ b/android/app/build.gradle.kts @@ -14,8 +14,8 @@ android { applicationId = "com.therealaleph.mhrv" minSdk = 24 // Android 7.0 — covers 99%+ of live devices. targetSdk = 34 - versionCode = 139 - versionName = "1.6.0" + versionCode = 158 + versionName = "1.8.1" // Ship all four mainstream Android ABIs: // - arm64-v8a — 95%+ of real-world Android phones since 2019 @@ -136,6 +136,10 @@ dependencies { implementation("androidx.compose.material3:material3") implementation("androidx.compose.material:material-icons-extended") + // QR code generation + scanning (self-contained, no ML Kit needed). 
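+    // zxing `core` does the actual encode/decode; `zxing-android-embedded`
+    // supplies a prebuilt scan Activity on top of it, so the app ships no
+    // Camera2/CameraX plumbing of its own.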
+ implementation("com.google.zxing:core:3.5.3") + implementation("com.journeyapps:zxing-android-embedded:4.3.0") + debugImplementation("androidx.compose.ui:ui-tooling") debugImplementation("androidx.compose.ui:ui-test-manifest") } diff --git a/android/app/src/main/AndroidManifest.xml b/android/app/src/main/AndroidManifest.xml index dd2e94e8..4d74ca5c 100644 --- a/android/app/src/main/AndroidManifest.xml +++ b/android/app/src/main/AndroidManifest.xml @@ -53,8 +53,33 @@ + + + + + + + + + + + + + + + آدرس‌(های) Deployment یا Script ID + یک URL/ID، یا چند مورد با خط جدید/فاصله/ویرگول/نقطه‌ویرگول جدا کنید کلید احراز (auth_key) google_ip دامنهٔ فرانت @@ -52,7 +54,7 @@ تغییر زبان - URL کامل (https://script.google.com/macros/s/.../exec) یا فقط ID خام. چند ID به‌صورت چرخشی استفاده می‌شوند — بیشتر ID = سرعت بیشتر در حالت تونل کامل. + URL کامل (https://script.google.com/macros/s/.../exec) یا فقط ID خام. می‌توانید چند مورد را یک‌جا در فیلد افزودن جای‌گذاری کنید — با خط جدید/فاصله/ویرگول/نقطه‌ویرگول جدا می‌شوند. چند ID به‌صورت چرخشی استفاده می‌شوند — بیشتر ID = سرعت بیشتر در حالت تونل کامل. همان رمز مشترکی که داخل Apps Script گذاشتید. هنگام اتصال، مجوز VPN سیستم درخواست می‌شود. تمام ترافیک دستگاه به‌صورت خودکار رد می‌شود. بدون VPN سیستم. بعد از اتصال، پروکسی Wi-Fi را روی 127.0.0.1:%1$d (HTTP) یا %2$d (SOCKS5) تنظیم کنید. فقط برنامه‌هایی که تنظیمات پروکسی را رعایت می‌کنند رد می‌شوند. @@ -78,6 +80,7 @@ google_ip به %1$s به‌روزرسانی شد google_ip قبلاً به‌روز است (%1$s) خطای DNS — اتصال شبکه را بررسی کنید + لاگ‌ها در کلیپ‌بورد کپی شدند مصرف امروز (تخمینی) diff --git a/android/app/src/main/res/values/strings.xml b/android/app/src/main/res/values/strings.xml index 8aa47b4c..5f4d637d 100644 --- a/android/app/src/main/res/values/strings.xml +++ b/android/app/src/main/res/values/strings.xml @@ -24,11 +24,13 @@ Test Add Clear + Copy Install Cancel Deployment URL(s) or script ID(s) + Paste one URL/ID, or many separated by newline / space / comma / semicolon auth_key google_ip front_domain @@ -52,7 +54,7 @@ Switch language - Full URLs (https://script.google.com/macros/s/.../exec) or bare IDs. Multiple IDs are rotated round-robin — more IDs = more pipeline throughput in full mode. + Full URLs (https://script.google.com/macros/s/.../exec) or bare IDs. Paste many at once into the Add field — they\'ll be split on newline / space / comma / semicolon. Multiple IDs are rotated round-robin — more IDs = more pipeline throughput in full mode. The shared secret you set in the Apps Script. Requests the OS VPN grant on Connect. All device traffic is routed automatically. No OS VPN. Set your Wi-Fi proxy to 127.0.0.1:%1$d (HTTP) or %2$d (SOCKS5) after Connect. Only apps that honour the proxy settings will tunnel. @@ -74,10 +76,26 @@ %1$d lines + + Paste config from clipboard + Export config + Show QR code + Scan QR code + Copy to clipboard + Config imported + Config copied to clipboard + Invalid config in clipboard + Export config + This includes your auth_key. Only share with people you trust. + Import config? + This will replace your current settings. 
+ Camera permission needed to scan QR codes + google_ip updated to %1$s google_ip already current (%1$s) DNS lookup failed — check network + Logs copied to clipboard Usage today (estimated) diff --git a/android/app/src/main/res/xml/file_paths.xml b/android/app/src/main/res/xml/file_paths.xml new file mode 100644 index 00000000..1e63d103 --- /dev/null +++ b/android/app/src/main/res/xml/file_paths.xml @@ -0,0 +1,4 @@ + + + + diff --git a/assets/apps_script/Code.gs b/assets/apps_script/Code.gs index 8c2acecd..3cc091db 100644 --- a/assets/apps_script/Code.gs +++ b/assets/apps_script/Code.gs @@ -6,18 +6,61 @@ * 2. Batch: POST { k, q: [{m,u,h,b,ct,r}, ...] } → { q: [{s,h,b}, ...] } * Uses UrlFetchApp.fetchAll() — all URLs fetched IN PARALLEL. * + * OPTIONAL SPREADSHEET-BACKED RESPONSE CACHE: + * Set CACHE_SPREADSHEET_ID to a valid Google Sheet ID (must be owned by + * the same account). When enabled, public GET requests are stored in the + * sheet and served from there on repeat visits, reducing UrlFetchApp + * quota consumption. The cache is Vary-aware (Accept-Encoding and + * Accept-Language are hashed into the compound cache key). Leave + * CACHE_SPREADSHEET_ID as-is to disable caching entirely — zero overhead. + * * DEPLOYMENT: * 1. Go to https://script.google.com → New project * 2. Delete the default code, paste THIS entire file - * 3. Click Deploy → New deployment - * 4. Type: Web app | Execute as: Me | Who has access: Anyone - * 5. Copy the Deployment ID into config.json as "script_id" + * 3. Change AUTH_KEY below to your own secret + * 4. (Optional) Set CACHE_SPREADSHEET_ID to enable caching + * 5. Click Deploy → New deployment + * 6. Type: Web app | Execute as: Me | Who has access: Anyone + * 7. Copy the Deployment ID into config.json as "script_id" * * CHANGE THE AUTH KEY BELOW TO YOUR OWN SECRET! */ const AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; +// Active-probing defense. When false (production default), bad AUTH_KEY +// requests get a decoy HTML page that looks like a placeholder Apps +// Script web app instead of the JSON `{"e":"unauthorized"}` body. This +// makes the deployment indistinguishable from a forgotten-but-public +// Apps Script project to active scanners that POST malformed payloads +// looking for proxy endpoints. +// +// Set to `true` during initial setup if a misconfigured client is +// hitting "unauthorized" and you want the explicit JSON error to debug +// — then flip back to false before the deployment is widely shared. +// (Inspired by #365 Section 3, mhrv-rs v1.8.0+.) +const DIAGNOSTIC_MODE = false; + +// ── Optional Spreadsheet Cache ────────────────────────────── +// Set to a valid Spreadsheet ID to enable response caching. +// Leave as-is to disable caching entirely (zero overhead). +const CACHE_SPREADSHEET_ID = "CHANGE_ME_TO_CACHE_SPREADSHEET_ID"; +const CACHE_SHEET_NAME = "RelayCache"; +const CACHE_META_SHEET_NAME = "RelayMeta"; +const CACHE_META_CURSOR_CELL = "A1"; + +// ── Cache Tuning ──────────────────────────────────────────── +const CACHE_MAX_ROWS = 5000; // circular buffer capacity +const CACHE_MAX_BODY_BYTES = 35000; // skip responses larger than ~35 KB +const CACHE_DEFAULT_TTL_SECONDS = 86400; // 24-hour fallback when no Cache-Control + +// ── Vary-Aware Cache Key ──────────────────────────────────── +// These request headers are hashed into the compound cache key +// alongside the URL so that responses with different encodings +// or languages never collide in the cache. Covers ~95 % of +// real-world Vary usage without inspecting the response. 
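+// NB: the list order below is part of the hashed key preimage, so
+// reordering or extending it changes every compound hash and silently
+// orphans existing cache rows (they stop matching and age out).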
+const VARY_KEY_HEADERS = ["accept-encoding", "accept-language"];
+
 // Keep browser capability headers (sec-ch-ua*, sec-fetch-*) intact.
 // Some modern apps, notably Google Meet, use them for browser gating.
 const SKIP_HEADERS = {
@@ -26,10 +69,33 @@ const SKIP_HEADERS = {
   "priority": 1, te: 1,
 };
+// Headers that disqualify a request from the cache path.
+const CACHE_BUSTING_HEADERS = {
+  authorization: 1, cookie: 1, "x-api-key": 1,
+  "proxy-authorization": 1, "set-cookie": 1,
+};
+
+// HTML body for the bad-auth decoy. Mimics a minimal Apps Script-style
+// placeholder page — no proxy-shaped JSON, nothing distinctive enough
+// for a probe to fingerprint as a tunnel endpoint.
+const DECOY_HTML =
+  '<!DOCTYPE html><html><head><title>Web App</title></head><body>' +
+  '<p>The script completed but did not return anything.</p></body></html>
' + + ''; + +// ── Request Handlers ──────────────────────────────────────── + +function _decoyOrError(jsonBody) { + if (DIAGNOSTIC_MODE) return _json(jsonBody); + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + function doPost(e) { try { var req = JSON.parse(e.postData.contents); - if (req.k !== AUTH_KEY) return _json({ e: "unauthorized" }); + if (req.k !== AUTH_KEY) return _decoyOrError({ e: "unauthorized" }); // Batch mode: { k, q: [...] } if (Array.isArray(req.q)) return _doBatch(req.q); @@ -37,14 +103,58 @@ function doPost(e) { // Single mode return _doSingle(req); } catch (err) { - return _json({ e: String(err) }); + // Parse failures of the request body are also probe-shaped — a real + // mhrv-rs client never sends invalid JSON. Decoy for the same reason. + return _decoyOrError({ e: String(err) }); } } +// `doGet` is what active scanners hit first (HTTP GET probes are cheaper +// than POSTs). Apps Script defaults to a "Script function not found" page +// here which is a fine-enough decoy on its own, but explicitly returning +// the same harmless placeholder makes the response identical to the +// bad-auth POST decoy — one less fingerprint vector. +function doGet(e) { + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + +// ── Single Request ───────────────────────────────────────── + function _doSingle(req) { if (!req.u || typeof req.u !== "string" || !req.u.match(/^https?:\/\//i)) { return _json({ e: "bad url" }); } + + // ── Optional cache path ──────────────────────────────── + // Only entered when CACHE_SPREADSHEET_ID is configured and + // the request qualifies as a public, cachable GET. + if (_canUseCache(req)) { + var cached = _getFromCache(req.u, req.h); + if (cached) { + return _json({ + s: cached.status, + h: JSON.parse(cached.headers), + b: cached.body, + cached: true, + }); + } + + var fetchResult = _fetchAndCache(req.u, req.h); + if (fetchResult) { + return _json({ + s: fetchResult.status, + h: JSON.parse(fetchResult.headers), + b: fetchResult.body, + cached: false, + }); + } + // If _fetchAndCache returns null (spreadsheet unavailable), + // fall through to the normal relay path below. 
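+    // Both returns above use the standard {s,h,b} envelope plus a
+    // `cached` flag; hits additionally carry the Age/X-Cache headers
+    // rewritten by _refreshCachedHeaders inside _getFromCache.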
+ } + + // ── Normal relay (cache disabled or unavailable) ──────── var opts = _buildOpts(req); var resp = UrlFetchApp.fetch(req.u, opts); return _json({ @@ -54,6 +164,8 @@ function _doSingle(req) { }); } +// ── Batch Request ────────────────────────────────────────── + function _doBatch(items) { var fetchArgs = []; var errorMap = {}; @@ -92,6 +204,8 @@ function _doBatch(items) { return _json({ q: results }); } +// ── Request Building ─────────────────────────────────────── + function _buildOpts(req) { var opts = { method: (req.m || "GET").toLowerCase(), @@ -139,3 +253,404 @@ function _json(obj) { ContentService.MimeType.JSON ); } + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — SHEET MANAGEMENT +// ═══════════════════════════════════════════════════════════ + +function _initCacheSheet() { + if (CACHE_SPREADSHEET_ID === "CHANGE_ME_TO_CACHE_SPREADSHEET_ID") { + return null; + } + try { + var ss = SpreadsheetApp.openById(CACHE_SPREADSHEET_ID); + var sheet = ss.getSheetByName(CACHE_SHEET_NAME); + if (!sheet) { + sheet = ss.insertSheet(CACHE_SHEET_NAME); + // Schema: URL_Hash | URL | Status | Headers | Body | Timestamp | Expires_At + sheet.getRange(1, 1, 1, 7).setValues([[ + "URL_Hash", "URL", "Status", "Headers", "Body", "Timestamp", "Expires_At" + ]]); + } + return sheet; + } catch (e) { + return null; + } +} + +function _getMetaSheet() { + if (CACHE_SPREADSHEET_ID === "CHANGE_ME_TO_CACHE_SPREADSHEET_ID") { + return null; + } + try { + var ss = SpreadsheetApp.openById(CACHE_SPREADSHEET_ID); + var sheet = ss.getSheetByName(CACHE_META_SHEET_NAME); + if (!sheet) { + sheet = ss.insertSheet(CACHE_META_SHEET_NAME); + sheet.getRange(CACHE_META_CURSOR_CELL).setValue(2); + sheet.hideSheet(); + } + return sheet; + } catch (e) { + return null; + } +} + +function _getNextCursor(sheet, metaSheet) { + var cursorRange = metaSheet.getRange(CACHE_META_CURSOR_CELL); + var cursor = cursorRange.getValue(); + if (typeof cursor !== "number" || cursor < 2) cursor = 2; + + var totalRows = sheet.getDataRange().getNumRows(); + + if (totalRows < CACHE_MAX_ROWS + 1) { + return totalRows + 1; + } + + return cursor; +} + +function _advanceCursor(metaSheet, currentRow) { + var nextRow = currentRow + 1; + if (nextRow > CACHE_MAX_ROWS + 1) nextRow = 2; + metaSheet.getRange(CACHE_META_CURSOR_CELL).setValue(nextRow); +} + +function _ensureRowsAllocated(sheet) { + var totalRows = sheet.getDataRange().getNumRows(); + if (totalRows < CACHE_MAX_ROWS + 1) { + var needed = CACHE_MAX_ROWS + 1 - totalRows; + sheet.insertRowsAfter(totalRows, needed); + } +} + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — VARY-AWARE COMPOUND KEY +// ═══════════════════════════════════════════════════════════ + +/** + * Case-insensitive header lookup. + * HTTP header names are case-insensitive per RFC 7230 § 3.2. + */ +function _getHeaderCaseInsensitive(headers, targetKey) { + var target = targetKey.toLowerCase(); + for (var k in headers) { + if (headers.hasOwnProperty(k) && k.toLowerCase() === target) { + return headers[k]; + } + } + return null; +} + +/** + * Compute a compound cache key: + * MD5(URL | header1:value1 | header2:value2 | ...) + * + * Instead of reading the response Vary header (which would require + * fetching first — circular), we preemptively include the request + * headers that are known to cause response variation. This handles + * Vary: Accept-Encoding and Vary: Accept-Language without ever + * inspecting the response. 
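+ *
+ * Illustrative example: url "https://example.com/a" sent with
+ * "Accept-Encoding: gzip, br" and no Accept-Language hashes the
+ * preimage
+ *   "https://example.com/a|accept-encoding:gzip,br|accept-language:"
+ * (then MD5-hexed).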
+ * + * Values are lowercased and whitespace-stripped so semantically + * identical requests from different clients produce the same hash. + * Missing and empty headers both map to "" (same semantic). + */ +function _getCacheKey(url, reqHeaders) { + var parts = [url]; + + if (reqHeaders && typeof reqHeaders === "object") { + for (var i = 0; i < VARY_KEY_HEADERS.length; i++) { + var headerName = VARY_KEY_HEADERS[i]; + var rawValue = _getHeaderCaseInsensitive(reqHeaders, headerName); + + if (rawValue && String(rawValue).trim() !== "") { + parts.push(headerName + ":" + rawValue.toLowerCase().replace(/\s/g, "")); + } else { + parts.push(headerName + ":"); + } + } + } else { + for (var j = 0; j < VARY_KEY_HEADERS.length; j++) { + parts.push(VARY_KEY_HEADERS[j] + ":"); + } + } + + var compoundKey = parts.join("|"); + return _md5Hex(compoundKey); +} + +function _md5Hex(input) { + var rawHash = Utilities.computeDigest(Utilities.DigestAlgorithm.MD5, input); + return rawHash + .map(function (byte) { + var v = (byte < 0) ? 256 + byte : byte; + return ("0" + v.toString(16)).slice(-2); + }) + .join(""); +} + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — CORE LOGIC +// ═══════════════════════════════════════════════════════════ + +/** + * Returns true if the request is eligible for the cache path: + * public GET, no body, no auth/cookie headers, cache configured. + */ +function _canUseCache(req) { + if ((req.m || "GET") !== "GET") return false; + if (req.b) return false; + if (!req.u || !req.u.match(/^https?:\/\//i)) return false; + if (CACHE_SPREADSHEET_ID === "CHANGE_ME_TO_CACHE_SPREADSHEET_ID") return false; + + if (req.h && typeof req.h === "object") { + for (var k in req.h) { + if (req.h.hasOwnProperty(k) && CACHE_BUSTING_HEADERS[k.toLowerCase()]) { + return false; + } + } + } + + return true; +} + +/** + * Extract max-age (seconds) from a Cache-Control header value. + * Returns 0 if the directive forbids caching (no-cache / no-store / + * private). Falls back to CACHE_DEFAULT_TTL_SECONDS when no header + * is present. Clamped to [60, 2592000] (1 min – 30 days). + */ +function _parseMaxAge(cacheControlHeader) { + if (!cacheControlHeader) return CACHE_DEFAULT_TTL_SECONDS; + + var lower = cacheControlHeader.toLowerCase(); + + if ( + lower.indexOf("no-cache") !== -1 || + lower.indexOf("no-store") !== -1 || + lower.indexOf("private") !== -1 + ) { + return 0; + } + + var match = lower.match(/max-age=(\d+)/); + if (match) { + var ttl = parseInt(match[1], 10); + return Math.max(60, Math.min(ttl, 2592000)); + } + + return CACHE_DEFAULT_TTL_SECONDS; +} + +/** + * Rewrite time-sensitive headers so the client sees accurate + * Date, Age, and Cache-Control values reflecting cache age. 
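+ *
+ * Illustrative example: an entry cached 600 s ago with an original
+ * "Cache-Control: max-age=3600" is served with "Age: 600" and
+ * "Cache-Control: public, max-age=3000".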
+ */ +function _refreshCachedHeaders(headersJson, timestamp) { + var headers = JSON.parse(headersJson); + var cachedAt = new Date(timestamp); + var now = new Date(); + var ageSeconds = Math.floor((now.getTime() - cachedAt.getTime()) / 1000); + + if (ageSeconds < 0) ageSeconds = 0; + + headers["Date"] = now.toUTCString(); + headers["Age"] = String(ageSeconds); + + var originalCc = headers["Cache-Control"] || headers["cache-control"]; + if (originalCc) { + headers["X-Original-Cache-Control"] = originalCc; + } + + var remainingMaxAge = Math.max(0, _parseMaxAge(originalCc) - ageSeconds); + headers["Cache-Control"] = "public, max-age=" + remainingMaxAge; + + headers["X-Cache"] = "HIT from relay-spreadsheet"; + headers["X-Cached-At"] = cachedAt.toUTCString(); + + return JSON.stringify(headers); +} + +/** + * Retrieve a cached response by compound cache key. + * Uses TextFinder for O(log n) lookup. Skips expired entries. + * Returns null on miss, expired entry, or unavailable sheet. + */ +function _getFromCache(url, reqHeaders) { + var sheet = _initCacheSheet(); + if (!sheet) return null; + + var hash = _getCacheKey(url, reqHeaders); + var finder = sheet.createTextFinder(hash).matchEntireCell(true); + var found = finder.findNext(); + + if (found) { + var row = sheet.getRange(found.getRow(), 1, 1, 7).getValues()[0]; + + var expiresAt = row[6]; + if (expiresAt && expiresAt instanceof Date && expiresAt < new Date()) { + return null; + } + + return { + status: row[2], + headers: _refreshCachedHeaders(row[3], row[5]), + body: row[4], + }; + } + return null; +} + +/** + * Fetch a URL and store the response in the spreadsheet cache + * using a circular buffer (O(1) writes). Skips storage when the + * encoded body exceeds CACHE_MAX_BODY_BYTES or when Cache-Control + * forbids caching. Returns the fetch result regardless. 
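+ *
+ * Note the gate compares the base64-encoded body length, so the raw
+ * cutoff is roughly 3/4 of CACHE_MAX_BODY_BYTES (35000 chars ≈ 26 KB
+ * of response bytes).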
+ */ +function _fetchAndCache(url, reqHeaders) { + var sheet = _initCacheSheet(); + if (!sheet) return null; + + try { + var response = UrlFetchApp.fetch(url, { muteHttpExceptions: true }); + var status = response.getResponseCode(); + var headers = _respHeaders(response); + var body = Utilities.base64Encode(response.getContent()); + + // Cell-size safety gate + if (body.length > CACHE_MAX_BODY_BYTES) { + return { status: status, headers: JSON.stringify(headers), body: body }; + } + + // TTL extraction + var cacheControl = + headers["Cache-Control"] || headers["cache-control"] || null; + var ttlSeconds = _parseMaxAge(cacheControl); + + if (ttlSeconds === 0) { + return { status: status, headers: JSON.stringify(headers), body: body }; + } + + var hash = _getCacheKey(url, reqHeaders); + var timestamp = new Date(); + var expiresAt = new Date(timestamp.getTime() + ttlSeconds * 1000); + + // Safety: fallback if Date math produces invalid result + if (isNaN(expiresAt.getTime())) { + expiresAt = new Date(timestamp.getTime() + CACHE_DEFAULT_TTL_SECONDS * 1000); + } + + var rowData = [ + hash, + url, + status, + JSON.stringify(headers), + body, + timestamp.toISOString(), + expiresAt, + ]; + + // Circular buffer write (O(1)) + var metaSheet = _getMetaSheet(); + if (metaSheet) { + _ensureRowsAllocated(sheet); + var writeRow = _getNextCursor(sheet, metaSheet); + sheet.getRange(writeRow, 1, 1, 7).setValues([rowData]); + _advanceCursor(metaSheet, writeRow); + } else { + // Fallback: simple append if meta sheet is unavailable + sheet.appendRow(rowData); + } + + return { status: status, headers: JSON.stringify(headers), body: body }; + } catch (e) { + return null; + } +} + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — DIAGNOSTICS +// ═══════════════════════════════════════════════════════════ + +function getCacheStats() { + var sheet = _initCacheSheet(); + if (!sheet) { + console.log("Cache is not enabled or spreadsheet unavailable."); + return; + } + + var data = sheet.getDataRange().getValues(); + var totalEntries = data.length - 1; + var now = new Date(); + var expiredCount = 0; + + for (var i = 1; i < data.length; i++) { + var expiresAt = data[i][6]; + if (expiresAt && expiresAt instanceof Date && expiresAt < now) { + expiredCount++; + } + } + + var metaSheet = _getMetaSheet(); + var cursorInfo = "N/A"; + if (metaSheet) { + cursorInfo = String(metaSheet.getRange(CACHE_META_CURSOR_CELL).getValue()); + } + + console.log("=== CACHE STATS ==="); + console.log("Total rows used: " + totalEntries + " / " + CACHE_MAX_ROWS); + console.log("Active entries: " + (totalEntries - expiredCount)); + console.log("Expired entries: " + expiredCount); + console.log("Cursor position: " + cursorInfo); + console.log("Max body size: " + CACHE_MAX_BODY_BYTES + " chars"); + console.log("Default TTL: " + CACHE_DEFAULT_TTL_SECONDS + " sec"); + console.log("Vary key headers: " + VARY_KEY_HEADERS.join(", ")); + if (totalEntries > 0) { + console.log("Oldest entry: " + data[1][5]); + console.log("Newest entry: " + data[data.length - 1][5]); + } +} + +function clearExpiredCache() { + var sheet = _initCacheSheet(); + if (!sheet) { + console.log("Cache is not enabled."); + return; + } + + var data = sheet.getDataRange().getValues(); + var now = new Date(); + var rowsToClear = []; + + for (var i = 1; i < data.length; i++) { + var expiresAt = data[i][6]; + if (expiresAt && expiresAt instanceof Date && expiresAt < now) { + rowsToClear.push(i + 1); + } + } + + for (var j = 0; j < rowsToClear.length; 
j++) {
+    sheet.getRange(rowsToClear[j], 1, 1, 7).clearContent();
+  }
+
+  console.log("Cleared " + rowsToClear.length + " expired entries (" +
+    (data.length - 1 - rowsToClear.length) + " remaining).");
+}
+
+function clearEntireCache() {
+  var sheet = _initCacheSheet();
+  if (sheet) {
+    var totalRows = sheet.getDataRange().getNumRows();
+    if (totalRows > 1) {
+      sheet.getRange(2, 1, totalRows - 1, 7).clearContent();
+    }
+  }
+
+  var metaSheet = _getMetaSheet();
+  if (metaSheet) {
+    metaSheet.getRange(CACHE_META_CURSOR_CELL).setValue(2);
+  }
+
+  console.log("Cache wiped. Cursor reset to row 2.");
+}
diff --git a/assets/apps_script/CodeFull.gs b/assets/apps_script/CodeFull.gs
index 77b2a5e5..e116ee7b 100644
--- a/assets/apps_script/CodeFull.gs
+++ b/assets/apps_script/CodeFull.gs
@@ -16,18 +16,46 @@ const AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET";
 const TUNNEL_SERVER_URL = "https://YOUR_TUNNEL_NODE_URL";
 const TUNNEL_AUTH_KEY = "YOUR_TUNNEL_AUTH_KEY";
 
+// Active-probing defense. When false (production default), bad AUTH_KEY
+// requests get a decoy HTML page that looks like a placeholder Apps
+// Script web app instead of the JSON `{"e":"unauthorized"}` body. This
+// makes the deployment indistinguishable from a forgotten-but-public
+// Apps Script project to active scanners that POST malformed payloads
+// looking for proxy endpoints.
+//
+// Set to `true` during initial setup if a misconfigured client is
+// hitting "unauthorized" and you want the explicit JSON error to debug
+// — then flip back to false before the deployment is widely shared.
+// (Inspired by #365 Section 3, mhrv-rs v1.8.0+.)
+const DIAGNOSTIC_MODE = false;
+
 const SKIP_HEADERS = {
   host: 1, connection: 1, "content-length": 1, "transfer-encoding": 1,
   "proxy-connection": 1, "proxy-authorization": 1, "priority": 1, te: 1,
 };
+// HTML body for the bad-auth decoy. Mimics a minimal Apps Script-style
+// placeholder page — no proxy-shaped JSON, nothing distinctive enough
+// for a probe to fingerprint as a tunnel endpoint.
+const DECOY_HTML =
+  '<!DOCTYPE html><html><head><title>Web App</title></head><body>' +
+  '<p>The script completed but did not return anything.</p></body></html>
' + + ''; + +function _decoyOrError(jsonBody) { + if (DIAGNOSTIC_MODE) return _json(jsonBody); + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + // ========================== Entry point ========================== function doPost(e) { try { var req = JSON.parse(e.postData.contents); - if (req.k !== AUTH_KEY) return _json({ e: "unauthorized" }); + if (req.k !== AUTH_KEY) return _decoyOrError({ e: "unauthorized" }); // Tunnel mode if (req.t) return _doTunnel(req); @@ -38,7 +66,9 @@ function doPost(e) { // Single relay mode return _doSingle(req); } catch (err) { - return _json({ e: String(err) }); + // Parse failures of the request body are also probe-shaped — a real + // mhrv-rs client never sends invalid JSON. Decoy for the same reason. + return _decoyOrError({ e: String(err) }); } } diff --git a/docs/changelog/v1.6.1.md b/docs/changelog/v1.6.1.md new file mode 100644 index 00000000..d5e72c8d --- /dev/null +++ b/docs/changelog/v1.6.1.md @@ -0,0 +1,4 @@ + +• پایداری چرخه‌ٔ سشن VPN در اندروید ([#187](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/187)): پنج رفع باگ کوچک ولی واقعی در سرویس VPN اندروید: (۱) دکمهٔ Connect/Disconnect حالا روی state-flow `VpnState.isRunning` گیت میشه (با backstop ۱۲ ثانیه‌ای) به جای تایمر ثابت ۲ ثانیه — جلوی race condition بین Stop و Connect رو می‌گیره که قبلاً منجر به `Address already in use` می‌شد. (۲) `Tun2proxy.stop()` حالا با timeout ۲ ثانیه‌ای بسته شده تا اگر روی native call hang کنه، کل teardown thread رو نگه نداره. (۳) رفع نشت file descriptor بین `detachFd()` و `Thread.start()` — اگه start بخاطر OOM throw می‌کرد، fd یتیم می‌شد. (۴) doc-comment گمراه‌کننده در teardown اصلاح شد. (۵) handler crash trap حالا `Log.e` رو در try/catch می‌پیچه تا اگه خود لاگ throw کنه، handler بازگشتی نشه +--- +• Android VPN session lifecycle reliability ([#187](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/187)): five small but real fixes in the Android VPN service. (1) Connect/Disconnect button is now gated on the `VpnState.isRunning` state flow with a 12 s backstop instead of a fixed 2 s `transitionCooldown` timer — closes the race window where users tapping Connect right after Stop would hit "Address already in use" because the previous teardown's listener-socket release hadn't completed yet. (2) `Tun2proxy.stop()` is now wrapped in a 2 s `join()` timeout — if the native call hangs, the bounded tun2proxy thread join + bounded `rt.shutdown_timeout` below it still release the listener port instead of holding the teardown thread. (3) File-descriptor leak fixed between `parcelFd.detachFd()` and `Thread.start()` — if `start()` threw (OOM under memory pressure), the detached fd had no owner and leaked for the process lifetime; now adopted into a fresh `ParcelFileDescriptor` purely so we can `close()` it. (4) Misleading teardown doc-comment rewritten — the "step 2 closes the TUN fd to force EBADF on read" claim has been factually wrong since `detachFd` landed; corrected so future debuggers don't chase a phantom safety net. 
(5) Recursive crash trap in `MhrvApp`'s uncaught-exception handler — `Log.e` is now wrapped in try/catch so a logd failure during exception logging falls through to the previous handler with the real exception diff --git a/docs/changelog/v1.6.2.md b/docs/changelog/v1.6.2.md new file mode 100644 index 00000000..402d8a5e --- /dev/null +++ b/docs/changelog/v1.6.2.md @@ -0,0 +1,4 @@ + +• رفع باگ "همهٔ دانلودها روی ۲۵۶ کیلوبایت قطع میشن" ([#162](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/162)): در relay range-parallel، اگه validation هر chunk رد می‌شد (مثلاً Apps Script هدر `Content-Range` رو حذف می‌کرد، یا origin روی chunkهای بعدی به جای 206 یه 200 برمی‌گردوند)، fallback اشتباهی پاسخ probe (یعنی فقط ۲۵۶ کیلوبایت اول) رو به‌عنوان فایل کامل برمی‌گردوند. مرورگر `HTTP 200` با `Content-Length=262144` می‌دید و دانلود رو "کامل" تلقی می‌کرد. حالا fallback یک GET تک‌مرحله‌ای جدید بدون Range هدر می‌فرسته که Apps Script کل URL رو fetch کنه (تا سقف ۵۰ مگ). برای فایل‌های بزرگ‌تر کندتره از مسیر parallel، ولی پاسخ کامل می‌ده — که اون چیزی هست که اهمیت داره. ۲ کاربر مستقل این رو ریپورت کردن (Ehsan، Recruit1992) +--- +• Fix "every download capped at 256 KB" bug ([#162](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/162)): in range-parallel relay, when any chunk failed validation (e.g. Apps Script stripping the `Content-Range` header on follow-up chunks, or origin returning 200-instead-of-206 on later chunks), the fallback path silently returned the probe response (the first 256 KiB) as if it were the full file. Browsers saw `HTTP 200` with `Content-Length=262144` and treated the download as complete. The fallback now does a fresh single GET without the Range header, letting Apps Script fetch the full URL (up to its 50 MiB cap). Slower than the parallel path for large files, but produces a complete response — which is what matters. Two independent users (Ehsan, Recruit1992) reported this; closed-loop with both diff --git a/docs/changelog/v1.6.3.md b/docs/changelog/v1.6.3.md new file mode 100644 index 00000000..7be78d25 --- /dev/null +++ b/docs/changelog/v1.6.3.md @@ -0,0 +1,4 @@ + +• رفع باگ "نوتیفیکیشن سرور اندروید پورت اشتباه SOCKS5 رو نشون می‌داد" ([#211](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/211)): با تنظیمات پیش‌فرض اندروید (`listenPort=8080`, `socks5Port=1081`)، نوتیفیکیشن می‌نوشت `Routing via SOCKS5 127.0.0.1:8081` که اشتباه بود — listener واقعی روی `1081` اجرا می‌شد. هر کاربری که پروکسی تلگرام رو روی پورت نوتیفیکیشن (8081) ست می‌کرد، در سکوت fail می‌شد. علت: تابع `buildNotif` به‌جای خوندن `cfg.socks5Port`، hardcode می‌کرد `proxyPort + 1`. حالا متن نوتیفیکیشن همون منطق elvis fallback `cfg.socks5Port ?: (cfg.listenPort + 1)` رو که در تنظیم listener واقعی استفاده می‌شه می‌خونه و علاوه بر SOCKS5، پورت HTTP رو هم نشون می‌ده: `HTTP 127.0.0.1:8080 · SOCKS5 127.0.0.1:1081`. ۲ کاربر مستقل ریپورت کردن (vpnineh، l3est) +--- +• Fix "Android server notification showed wrong SOCKS5 port" bug ([#211](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/211)): with the default Android config (`listenPort=8080`, `socks5Port=1081`), the foreground-service notification read `Routing via SOCKS5 127.0.0.1:8081` — wrong, since the real listener was on `1081`. Anyone configuring Telegram (or any per-app SOCKS5 client) against the notification value silently failed. Cause: `buildNotif` hardcoded `proxyPort + 1` instead of reading `cfg.socks5Port`. 
The notification now uses the same elvis fallback `cfg.socks5Port ?: (cfg.listenPort + 1)` that the actual listener uses, and shows both ports for clarity: `HTTP 127.0.0.1:8080 · SOCKS5 127.0.0.1:1081`. Two independent users (vpnineh, l3est) reported this diff --git a/docs/changelog/v1.6.4.md b/docs/changelog/v1.6.4.md new file mode 100644 index 00000000..0461620f --- /dev/null +++ b/docs/changelog/v1.6.4.md @@ -0,0 +1,4 @@ + +• رفع باگ "L7 multiplexer در Full mode batch نمی‌کنه" ([#231](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/231)): در حالت Full، انتظار می‌رفت که چند op به یک batch HTTP request به Apps Script ترکیب بشن (`batch: 5 ops` یا `batch: 10 ops`)، ولی log نشون می‌داد همیشه `batch: 1 ops` — یعنی هر op جدا یه round-trip Apps Script می‌گرفت (که هر کدوم 2 تا 7 ثانیه طول می‌کشن). علت: loop دریافت پیام بلافاصله بعد از اولین message با `try_recv()` (non-blocking) صف رو drain می‌کرد، بدون pause برای جمع‌آوری بقیه ops. **Fix:** بعد از اولین op، یه پنجرهٔ ۸ میلی‌ثانیه‌ای باز می‌مونه تا opهای بعدی (مثل parallel fetches، HTTP/2 streams) همون batch رو پر کنن. ۸ms در مقابل ~۲ تا ۷ ثانیه RTT Apps Script اصلاً ناچیزه ولی efficiency batching رو برمی‌گردونه. ریپورت شده توسط w0l4i با log واضح +--- +• Fix "L7 multiplexer not batching in Full mode" bug ([#231](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/231)): in `full` mode, multiple ops should coalesce into a single batched HTTP request to Apps Script (`batch: 5 ops` or `batch: 10 ops`), but logs showed `batch: 1 ops` consistently — each op got its own Apps Script round-trip (2-7 s each). Cause: the receive loop drained the channel via `try_recv()` (non-blocking) immediately after the first message arrived, with no window to let concurrent ops accumulate. **Fix:** after the first op lands, hold the buffer open for an 8 ms coalescing window so concurrent ops (parallel fetches, HTTP/2 stream openings, etc.) land in the same batch. 8 ms is rounding error against the ~2-7 s Apps Script RTT but restores the entire batching premise. Reported by w0l4i with a clean log snippet diff --git a/docs/changelog/v1.6.5.md b/docs/changelog/v1.6.5.md new file mode 100644 index 00000000..ab870666 --- /dev/null +++ b/docs/changelog/v1.6.5.md @@ -0,0 +1,10 @@ + +• اضافه شدن twitter.com به URL normalization اکس/توییتر ([#245](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/245)): قبلاً normalization GraphQL URL فقط روی `x.com` کار می‌کرد. کاربری که از extension "Control Panel for Twitter" استفاده می‌کنه که همه‌چی رو به `twitter.com` redirect می‌کنه، URL shortening رو از دست می‌داد و درخواست‌هاش به Apps Script `URI Too Long` می‌گرفت. حالا match هر دو domain رو می‌گیره. ممنون از Parsa307 +• امکان کپی log در نسخهٔ اندروید ([#255](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/255)): دکمهٔ Copy کنار Clear در Live Log اضافه شد. خط‌های log الان قابل selection هستن. تا قبل از این، گرفتن log از گوشی نیازمند `adb logcat` بود — برای کاربرهایی که issue با logcat تجربه ندارن، debug کردن سخت بود. ممنون از @dazzling-no-more +• اضافه کردن چندین deployment ID به‌صورت یکجا در نسخهٔ اندروید ([#257](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/257)): فیلد "+ Add" حالا multi-line هست و paste کردن لیست IDها (با newline، کاما، یا semicolon جدا شده) رو می‌پذیره. paste در یه entry موجود هم automatic بهش split و expand می‌شه. تا قبل از این، اضافه کردن ۶ تا ID نیازمند ۶ بار tap "+ Add" بود. 
ممنون از @dazzling-no-more +• رفع باگ "google_only mode: plain HTTP proxy requests are not supported" ([#256](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/256)): تایپ کردن `http://example.com` (بدون https) در browser در حالت google_only یه ۵۰۲ می‌داد، در حالی که `https://example.com` (CONNECT) خوب fall-through می‌کرد به direct TCP. حالا plain HTTP proxy request هم passthrough می‌شه (با حفظ `upstream_socks5` اگه ست شده). ۴ تا unit test جدید برای parsing absolute-form URI، fallback به Host header، و edge cases IPv6. ممنون از @dazzling-no-more +--- +• Add twitter.com to X/Twitter URL normalization ([#245](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/245)): the GraphQL `?variables=...` shortening previously only matched `x.com`. Users running the "Control Panel for Twitter" extension (which redirects everything back to `twitter.com`) lost the shortening and hit `URI Too Long` from Apps Script. Now matches both domains. Thanks Parsa307 +• Add ability to copy logs in Android ([#255](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/255)): Copy button added next to Clear in the Live Log pane; log lines are now selectable. Before this, getting logs off the device required tethering with `adb logcat` — a barrier for users without that experience. Thanks @dazzling-no-more +• Add bulk parser for deployment IDs in Android ([#257](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/257)): the "+ Add" field is now multi-line and accepts a paste of multiple IDs separated by whitespace/newline/comma/semicolon. Pasting into an existing entry also auto-splits and expands. Adding 6 IDs used to require 6 separate "+ Add" taps. Thanks @dazzling-no-more +• Fix "google_only mode: plain HTTP proxy requests are not supported" 502 ([#256](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/256)): typing `http://example.com` (without https) in the browser in google_only mode returned a 502, even though `https://example.com` (CONNECT) fell through cleanly to direct TCP. Plain-HTTP proxy requests now passthrough too (honoring `upstream_socks5` if set). 4 new unit tests covering absolute-form URI parsing, Host-header fallback, and IPv6 edge cases. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.0.md b/docs/changelog/v1.7.0.md new file mode 100644 index 00000000..bbbd940e --- /dev/null +++ b/docs/changelog/v1.7.0.md @@ -0,0 +1,8 @@ + +• پشتیبانی native از پروتکل udpgw در Full mode ([#222](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/222)): tunnel-node حالا یه virtual session جدید برای آدرس magic `198.18.0.1:7300` داره که tun2proxy اندروید بهش وصل می‌شه. به جای یه session UDP per-destination، تمام UDP از همون یه TCP persistent tunnel می‌گذره. **نتیجه**: تماس صوتی/تصویری Telegram و Google Meet در Full mode روی اندروید کار می‌کنن (تا قبل از این، سرعت شناوری STUN/RTP زیاد بود برای session-per-destination polling). QUIC (UDP/443) و DNS (UDP/53) به‌صورت گاردبل از udpgw مسدود می‌شن — مرورگرها به TCP/HTTPS fallback می‌کنن (سریع‌تر از QUIC over batched relay)، و DNS از virtual DNS tun2proxy استفاده می‌کنه (پایدارتر). **نیاز به redeployment image Docker tunnel-node داره**: `docker pull ghcr.io/therealaleph/mhrv-tunnel-node:1.7.0`. 
ممنون از @yyoyoian-pixel +• چیدمان مجدد صفحهٔ اصلی اندروید برای لیست‌های بلند deployment-ID ([#258](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/258)، closes [#246](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/246)): دکمهٔ Connect/Disconnect حالا زیر فیلد Mode pinned هست — قبلاً اگه ۱۰ تا deployment ID داشتید، باید کل لیست رو scroll می‌کردید برای رسیدن به Connect. App picker هم حالا appهای از قبل انتخاب‌شده رو در بالای لیست نشون می‌ده، نه پراکنده در ترتیب alphabetical. ممنون از @dazzling-no-more +• tooling release-drafter + prepare-release ([#260](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/260)): release-drafter به‌صورت تدریجی PRهای merge شده رو در یه draft release جمع می‌کنه که در زمان tag دادن آماده هست. workflow `prepare-release.yml` (manual dispatch) خودکار `Cargo.toml` و `build.gradle.kts` رو bump می‌کنه و یه stub `docs/changelog/v.md` می‌سازه که maintainer فقط Persian half + verb tense fixes رو کامل می‌کنه. flow release موجود (matrix build → GH release → Telegram) دست نخورده. ممنون از @dazzling-no-more +--- +• Native udpgw protocol support in Full mode ([#222](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/222)): tunnel-node now hosts a virtual session at the magic address `198.18.0.1:7300` that Android's tun2proxy connects to. Instead of a UDP session per destination, all UDP flows through one persistent TCP tunnel. **Result**: Telegram voice/video calls and Google Meet now work in Full mode on Android (per-destination polling previously stalled under STUN/RTP flow counts). QUIC (UDP/443) and DNS (UDP/53) are blocked from udpgw as a belt-and-suspenders guard — browsers fall back to TCP/HTTPS (faster through the batch pipeline than QUIC), and DNS uses tun2proxy's virtual DNS (more reliable). **Requires redeploying the tunnel-node Docker image**: `docker pull ghcr.io/therealaleph/mhrv-tunnel-node:1.7.0`. Thanks @yyoyoian-pixel +• Restructured Android home screen for long deployment-ID lists ([#258](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/258), closes [#246](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/246)): Connect/Disconnect button is now pinned right under the Mode field — previously, with 10+ deployment IDs the user had to scroll past the entire list to reach Connect every session. App picker now shows pre-selected apps at the top instead of scattered through the alphabetical list. Thanks @dazzling-no-more +• Release-drafter + prepare-release tooling ([#260](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/260)): release-drafter incrementally accumulates merged-PR titles into a draft release that's ready when it's tag time. The `prepare-release.yml` workflow (manual dispatch) auto-bumps `Cargo.toml` and `build.gradle.kts` and writes a `docs/changelog/v.md` stub the maintainer only has to translate to Persian and fix verb tenses on. Existing release flow (matrix build → GH release → Telegram) untouched. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.1.md b/docs/changelog/v1.7.1.md new file mode 100644 index 00000000..79582d63 --- /dev/null +++ b/docs/changelog/v1.7.1.md @@ -0,0 +1,4 @@ + +• امکان حذف CA به‌صورت verified ([#121](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/121)): فلگ جدید `mhrv-rs --remove-cert` (CLI) و دکمهٔ **Remove CA** در UI دسکتاپ. CA رو از trust store سیستم‌عامل (Keychain مک، anchor dirs لینوکس، Trusted Root ویندوز)، NSS مرورگرها (Firefox/Chrome در لینوکس)، و فولدر `ca/` روی دیسک پاک می‌کنه. 
**`config.json` و deployment Apps Script شما دست نمی‌خوره — نیاز به redeploy نیست.** قبل از هر کاری با store، یه trust verification by-name انجام می‌شه؛ اگه remove از سیستم‌عامل fail بشه، browser state دست نمی‌خوره و حالت `RemovalIncomplete` گزارش می‌شه (retry idempotent). در Unix، اگه با sudo اجرا بشه، HOME رو به user واقعی re-root می‌کنه تا path‌های user-scoped (NSS profile، login keychain) به /root نرن. ۲۹ unit test جدید برای پوشش pure logic. تست شده end-to-end در ویندوز، و **در v1.7.1 من مسیر macOS رو هم با hardware واقعی verify کردم** (login keychain delete کار می‌کنه، NSS certutil-missing graceful fallback می‌ده). مسیر Linux منتظر تست از کاربرها. ممنون از @dazzling-no-more +--- +• Verified CA removal ([#121](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/121)): new `mhrv-rs --remove-cert` flag (CLI) and a **Remove CA** button in the desktop UI. Clears the CA from the OS trust store (macOS Keychain, Linux anchor dirs, Windows Trusted Root), NSS browser stores (Firefox/Chrome on Linux), and the on-disk `ca/` directory. **`config.json` and your Apps Script deployment are never touched — no redeploy needed.** A by-name trust verification runs *before* any browser-state mutation; if the OS removal fails, browser state is left alone and the call returns `RemovalIncomplete` (idempotent retries). On Unix, if invoked under sudo, `HOME` is re-rooted to the real user so user-scoped paths (NSS profile, login keychain) target the user, not root. 29 new unit tests covering the pure logic. Tested end-to-end on Windows by the contributor, and **the macOS path was verified on real hardware** during merge (login-keychain delete works; NSS-certutil-missing path falls back cleanly). Linux paths await user testing. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.10.md b/docs/changelog/v1.7.10.md new file mode 100644 index 00000000..6af68b83 --- /dev/null +++ b/docs/changelog/v1.7.10.md @@ -0,0 +1,8 @@ + +• رفع باگ "GET کامل غیرضروری وقتی Apps Script body gzip رو decode می‌کنه" ([#337](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/337)): وقتی Apps Script یه gzip body رو decode می‌کرد ولی Content-Range origin رو unchanged نگه می‌داشت، validation strict ما اون response رو reject می‌کرد + یک GET کامل دوباره می‌فرستادیم — quota Apps Script هدر می‌رفت. fix یه carve-out اضافه می‌کنه: اگر Content-Range ثابت کنه کل entity تو همون probe اول جا شده، 206 بدون refetch به 200 rewrite می‌شه. validation strict برای Range request‌های واقعی client + chunkهای بعدی حفظ شده. همچنین تشخیص quota error برای string‌های آلمانی (`bandbreitenkontingent`، `datenübertragungsrate`) و generic (`bandwidth`، `transfer rate`، `limit exceeded`) اضافه شد، تا deployment‌هایی که زیر accountهای Google غیرانگلیسی به quota می‌خورن به‌درستی blacklist بشن. ممنون از @freeinternet865 +• رفع باگ UI اندروید "Config detected in clipboard" که روی Android 13+ پس از resume کار نمی‌کرد ([#344](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/344)): Android 13+ دسترسی clipboard background-to-foreground رو محدود می‌کنه — auto-detect بی‌صدا empty می‌گرفت + banner ظاهر نمی‌شد. fix: یک دکمهٔ permanent **Paste** که روی tap clipboard رو می‌خونه (user interaction permission می‌ده در همه versionها). دکمهٔ Export به‌صورت icon-only تا row بهینه باقی بمونه.
ممنون از @yyoyoian-pixel +• رفع ناسازگاری CI workflow برای build ویندوز ۷ i686 ([#318 follow-up](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318)): job pinned-Rust-1.77.2 برای target Win7 i686 fail می‌کرد چون `Cargo.lock` (تولید شده توسط Rust ≥1.78) از lockfile version 4 استفاده می‌کرد + Rust 1.77 فقط version 3 رو می‌فهمه. regenerate کردن Cargo.lock فقط روی job pinned اضافه شد — مهم: artifact `mhrv-rs-windows-i686.zip` که در v1.7.9 missing بود، در v1.7.10 reappear می‌کنه (Win7 SP1-loadable). +--- +• Fix "unnecessary fallback full GET when Apps Script decodes a gzip body" ([#337](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/337)): when Apps Script decoded a gzip body but echoed the origin's compressed `Content-Range` unchanged, our strict validator rejected the response and we'd retry with a full GET — wasting Apps Script quota. The fix adds a carve-out: if `Content-Range` proves the entity already fits in the synthetic first probe, rewrite the 206 to a 200 and skip the refetch. Strict validation is still applied to real client `Range` requests and to later chunks. Also adds quota-error string matching for German (`bandbreitenkontingent`, `datenübertragungsrate`) and generic (`bandwidth`, `transfer rate`, `limit exceeded`) phrasings, so deployments hitting quota under non-English Google account locales now blacklist correctly. Thanks @freeinternet865. +• Fix Android "Config detected in clipboard" banner that silently broke on Android 13+ after resume ([#344](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/344)): Android 13+ restricts background-to-foreground clipboard access, so `getPrimaryClip()` during recomposition silently returned empty — the banner never showed. Fix: replace the auto-detect banner with a permanent **Paste** button that reads on tap (user interaction grants clipboard access on every Android version). Export button becomes icon-only to keep the row compact. Thanks @yyoyoian-pixel. +• Fix the CI Win7 i686 build that silently regressed in v1.7.9 ([#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318) follow-up): the pinned-Rust-1.77.2 job for the i686 target failed because `Cargo.lock` (generated by stable Rust ≥1.78) uses lockfile version 4, which Rust 1.77 doesn't understand. The job now regenerates the lockfile with the pinned toolchain before building. The `mhrv-rs-windows-i686.zip` artifact that disappeared from the v1.7.9 release page reappears in v1.7.10 (and now actually loads on Win7 SP1). diff --git a/docs/changelog/v1.7.11.md b/docs/changelog/v1.7.11.md new file mode 100644 index 00000000..4de1518d --- /dev/null +++ b/docs/changelog/v1.7.11.md @@ -0,0 +1,4 @@ + +• v1.7.10 release page assets منتشر نشد (CI failures): دو bug همزمان بودن — (۱) target `i686-pc-windows-msvc` که در v1.7.7 برای Win7 32-bit اضافه شده بود، در v1.7.10 fail کرد چون Rust 1.77.2 (آخرین stable Win7-compat) نمی‌تونه manifest crate‌های مدرن مثل `time` 0.3.47 رو parse کنه؛ pin کردن crateهای transitive در هر releaseای که یه dependency MSRV رو bump می‌کنه شکننده و غیرقابل‌ادامه‌ست. (۲) job `release` با `actions/download-artifact@v4` با ۵-retry-exhausted error fail شد. **Fix:** target i686 از matrix حذف شد (کاربران Win7 ۳۲ بیتی باید self-build کنن — instructions در [#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318))؛ release و telegram jobs به `gh run download` با retry loop ۳-attempt تبدیل شدن.
v1.7.11 اولین release کاملیه که بعد از v1.7.9 منتشر می‌شه، با همهٔ fixهای v1.7.10 (Apps Script range probe + Android Paste button) به‌علاوهٔ همین workflow fix. +--- +• v1.7.10 release page assets failed to publish (CI failures): two concurrent bugs — (1) the `i686-pc-windows-msvc` target added in v1.7.7 for Win7 32-bit support broke in v1.7.10 because Rust 1.77.2 (the last stable that produces Win7-compatible binaries) can't parse the manifest of modern transitive crates like `time` 0.3.47; pinning transitives at every release where a dep bumps MSRV is brittle and unsustainable. (2) The `release` job's `actions/download-artifact@v4` step hit a 5-retries-exhausted error. **Fix:** dropped the i686 target from the matrix entirely (Win7 32-bit users must self-build now — instructions in [#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318)); the `release` and `telegram` jobs now use `gh run download` with a 3-attempt retry loop. v1.7.11 is the first complete release published since v1.7.9 and ships all the v1.7.10 fixes (Apps Script range probe handling per #337, Android Paste button per #344) along with this workflow repair. diff --git a/docs/changelog/v1.7.2.md b/docs/changelog/v1.7.2.md new file mode 100644 index 00000000..22be861d --- /dev/null +++ b/docs/changelog/v1.7.2.md @@ -0,0 +1,4 @@ + +• import/export کانفیگ در نسخهٔ اندروید با QR code، کلیپ‌بورد، deep link، و share sheet ([#266](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/266)): انتقال کانفیگ بین دستگاه‌ها با یک تپ. **Export**: یک دیالوگ یکپارچه با QR code + رشتهٔ فشرده + دکمهٔ کپی، یا Share از طریق هر اپ (تلگرام، WhatsApp، ایمیل). فیلدهای device-specific (پورت‌ها، حالت VPN/proxy، splitMode) export نمی‌شن، فقط فیلدهای منطقی (mode، script_ids، auth_key، sni_hosts، passthrough_hosts، upstream_socks5). encoding با DEFLATE compression + base64 — کانفیگ معمولی ~۲۰۰ کاراکتر می‌شه به‌جای ~۸۰۰. **Import**: clipboard banner خودکار وقتی mhrv-rs متن `mhrv-rs://` یا JSON خام در clipboard می‌بینه، scanner QR، یا deep link `mhrv-rs://...` (تپ روی لینک در هر اپ). **هر import نیاز به تأیید صریح کاربر داره** — قبل از overwrite شدن کانفیگ فعلی، یه دیالوگ deployment IDهای جدید رو نشون می‌ده و هشدار می‌ده که "این لینک ترافیک شما رو از طریق این deployment IDها مسیریابی می‌کنه — فقط از منابع قابل اعتماد import کنید." این مهمه چون کانفیگ شامل auth_key هست. ممنون از @yyoyoian-pixel +--- +• Config import/export on Android via QR code, clipboard, deep link, and share sheet ([#266](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/266)): one-tap config sharing between devices. **Export**: a unified dialog with QR code + compressed text string + copy button, or Share via any app (Telegram, WhatsApp, email). Device-specific fields (ports, VPN/proxy mode, splitMode) are not exported — only logical config (mode, script_ids, auth_key, sni_hosts, passthrough_hosts, upstream_socks5). DEFLATE compression + base64 encoding shrinks a typical config from ~800 to ~200 chars (see the sketch below). **Import**: a clipboard banner that auto-appears when mhrv-rs detects `mhrv-rs://...` or raw JSON in the clipboard, a QR scanner, or the `mhrv-rs://...` deep link (tap from any app). **Every import path requires explicit user confirmation** — before the current config is overwritten, a dialog displays the new deployment IDs and warns "this link routes your traffic through these deployment IDs — only import from sources you trust." Important because the config contains `auth_key`. Thanks @yyoyoian-pixel
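A minimal sketch of the export-side encoding described above, assuming the `flate2` and `base64` crates; the function name, the URL-safe alphabet choice, and the `mhrv-rs://` prefix handling are illustrative assumptions, not the actual mhrv-rs symbols:

```rust
use std::io::Write;

use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _};
use flate2::{write::DeflateEncoder, Compression};

// Hypothetical name — the real export path in mhrv-rs may differ.
fn encode_share_string(logical_config_json: &str) -> std::io::Result<String> {
    // DEFLATE first: JSON key names repeat heavily, which is where the
    // ~800 → ~200 character shrink comes from.
    let mut enc = DeflateEncoder::new(Vec::new(), Compression::best());
    enc.write_all(logical_config_json.as_bytes())?;
    let compressed = enc.finish()?;
    // Base64 so the result survives QR codes, chat apps, and the
    // deep-link path as a plain string.
    Ok(format!("mhrv-rs://{}", URL_SAFE_NO_PAD.encode(compressed)))
}
```

The import side is the mirror image — strip the scheme, base64-decode, inflate — followed by the confirmation dialog before anything overwrites the live config.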
diff --git a/docs/changelog/v1.7.3.md b/docs/changelog/v1.7.3.md new file mode 100644 index 00000000..d7ae76cf --- /dev/null +++ b/docs/changelog/v1.7.3.md @@ -0,0 +1,4 @@ + +• حذف نیاز به فورک tun2proxy ([#271](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/271)): v1.7.0 از یه فورک شخصی tun2proxy (با پارامتر `udpgw_server` در JNI) استفاده می‌کرد چون upstream هنوز feature flag `udpgw` رو منتشر نکرده بود. حالا که tun2proxy 0.7.21 رسماً روی crates.io با feature flag `udpgw` در دسترسه + maintainer upstream هم CLI API (سبک C) رو به‌عنوان مسیر توصیه‌شده برای Android معرفی کرده، فورک رو حذف می‌کنیم. روش جدید: mhrv-rs از طریق `dlsym` در زمان اجرا تابع `tun2proxy_run_with_cli_args` رو از `libtun2proxy.so` resolve می‌کنه و CLI args ساده می‌فرسته (`--proxy socks5://127.0.0.1:1081 --tun-fd <fd> --udpgw-server 198.18.0.1:7300 ...`). نه فورک، نه `[patch.crates-io]`، نه commit SHA. وقتی tun2proxy update می‌شه، فقط نسخهٔ crates.io رو bump می‌کنیم. ممنون از @yyoyoian-pixel +--- +• Drop the tun2proxy fork dependency ([#271](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/271)): v1.7.0 used a personal fork of tun2proxy (with a `udpgw_server` parameter added to the JNI signature) because upstream hadn't published the `udpgw` feature flag yet. With tun2proxy 0.7.21 now on crates.io with the `udpgw` feature flag, and the upstream maintainer pointing callers at the C-style CLI API as the recommended path for Android, we drop the fork. New approach: mhrv-rs resolves `tun2proxy_run_with_cli_args` from `libtun2proxy.so` at runtime via `dlsym` and passes a simple CLI string (`--proxy socks5://127.0.0.1:1081 --tun-fd <fd> --udpgw-server 198.18.0.1:7300 ...`). No fork, no `[patch.crates-io]`, no pinned SHA. Future tun2proxy upgrades are a single Cargo version bump. Thanks @yyoyoian-pixel diff --git a/docs/changelog/v1.7.4.md b/docs/changelog/v1.7.4.md new file mode 100644 index 00000000..869109b1 --- /dev/null +++ b/docs/changelog/v1.7.4.md @@ -0,0 +1,6 @@ + +• رفع باگ "video timeout با send YouTube through relay" ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275)): قبلاً وقتی `youtube_via_relay = true` بود، تمام دامنه‌های مرتبط با YouTube از طریق Apps Script رد می‌شدن، شامل `googlevideo.com` (chunkهای video) و `ytimg.com` (thumbnails). نتیجه: یک chunk timeout کل پخش video رو در Firefox abort می‌کرد، و video طولانی به ۶ دقیقه cap اجرای Apps Script می‌خورد. **Fix:** حالا `youtube_via_relay` فقط API/HTML رو از relay رد می‌کنه (`youtube.com`, `youtu.be`, `youtube-nocookie.com`, `youtubei.googleapis.com` — جایی که Restricted Mode enforce می‌شه)، در حالی که CDNهای video/image مستقیماً از Google edge می‌گذرن (`googlevideo.com` که در نسخه‌های قبل اصلاً در لیست SNI rewrite نبود اضافه شد، `ytimg.com`، `ggpht.com`). نتیجه: Restricted Mode بدون قطع شدن video. ممنون از @amirabbas117 برای تحلیل دقیق +• Negative-cache برای destinationهای unreachable + pre-warm بزرگ‌تر در startup ([#280](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/280)): وقتی گوشی‌های بدون IPv6 پروبی به یه host فقط-IPv6 (مثلاً `ds6.probe.whatismyipaddress.com`) می‌فرستادن، ۵+ batch Apps Script در ثانیه روی یه destination تضمین‌شده-fail هدر می‌رفت. حالا cache 30s × 256-entry در `TunnelMux` نگه می‌داره برای destinationهایی که tunnel-node با `Network is unreachable` یا `No route to host` پاسخ داده — short-circuit به `502 Bad Gateway` (HTTP CONNECT) یا `0x04 Host unreachable` (SOCKS5) برای هر retry بعدی. Pre-warm pool startup هم بزرگتر شد (۱۲ تا ۲۴ connection به‌جای ۸) برای کمتر شدن first-use latency.
ممنون از @dazzling-no-more +--- +• Fix "video timeout when 'Send YouTube through relay' is on" ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275)): previously, `youtube_via_relay = true` routed every YouTube-related domain through Apps Script — including `googlevideo.com` (video chunks) and `ytimg.com` (thumbnails). Result: a single chunk timeout aborted entire Firefox playbacks, and long videos hit Apps Script's 6-min execution cap mid-playback. **Fix:** `youtube_via_relay` now only relays the API/HTML hosts (`youtube.com`, `youtu.be`, `youtube-nocookie.com`, `youtubei.googleapis.com` — where Restricted Mode is enforced), while video/image CDNs go direct via Google edge (`googlevideo.com`, previously missing from the SNI rewrite list entirely, is now added; `ytimg.com` and `ggpht.com` stay on SNI rewrite). Restricted Mode bypass without breaking playback. Thanks @amirabbas117 for the detailed analysis +• Negative-cache for unreachable destinations + larger startup pre-warm pool ([#280](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/280)): on devices without IPv6, OS/app probes to IPv6-only hostnames (e.g. `ds6.probe.whatismyipaddress.com`) were burning 5+ Apps Script batches per second on a guaranteed-fail destination. `TunnelMux` now keeps a 30s × 256-entry cache of destinations the tunnel-node returned `Network is unreachable` / `No route to host` for, and short-circuits subsequent CONNECTs with `502 Bad Gateway` (HTTP CONNECT) or `0x04 Host unreachable` (SOCKS5). Startup pre-warm pool also grew (12 → 24 connections) to reduce first-use latency. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.5.md b/docs/changelog/v1.7.5.md new file mode 100644 index 00000000..5696630f --- /dev/null +++ b/docs/changelog/v1.7.5.md @@ -0,0 +1,6 @@ + +• گزینهٔ جدید `block_quic` در config برای رد کردن client-side QUIC ([#213](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/213)): با `"block_quic": true` در `config.json`، listener SOCKS5 UDP هر datagramی به مقصد port 443 (یعنی HTTP/3-over-UDP) رو silent drop می‌کنه. browser به TCP/HTTPS fallback می‌کنه (که از مسیر CONNECT معمولی رد می‌شه و از relay می‌گذره). برای کاربرهایی که QUIC TCP-meltdown رو در Full mode تجربه می‌کنن (پهنای باند < 1 Mbps در عوض > 50 Mbps با TCP/HTTPS) خوبه. به‌صورت opt-in (پیش‌فرض false). ممنون از @w0l4i +• release artifacts دوباره به پوشهٔ `releases/` در مخزن commit می‌شن (به درخواست کاربر تلگرام): پس از v1.1.0 این عادت متوقف شده بود — حالا بعد از هر release tag، workflow خودکار فایل‌های pre-built رو در پوشه `releases/` به‌روزرسانی می‌کنه. کاربرانی که به صفحه GitHub Releases دسترسی ندارن (به‌خاطر فیلتر در ایران) می‌تونن از طریق `Code → Download ZIP` به فایل‌های آخرین نسخه برسن. صفحه release رسمی همچنان artifact‌های versioned رو داره — این پوشه fallback هست +--- +• New `block_quic` config option for client-side QUIC drop ([#213](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/213)): set `"block_quic": true` in `config.json` and the SOCKS5 UDP relay silently drops any datagram destined for port 443 (HTTP/3-over-UDP). The client's QUIC stack retries a couple of times and then falls back to TCP/HTTPS, which goes through the regular CONNECT path and through the relay. Useful for users seeing QUIC TCP-meltdown in Full mode (sub-1 Mbps where TCP/HTTPS does 50+). Opt-in (default false) — see the sketch after this list. Thanks @w0l4i +• Release artifacts now committed back to the in-repo `releases/` folder (per Telegram channel request): the practice was stopped after v1.1.0 — now after every release tag, the workflow auto-refreshes `releases/` with the pre-built binaries. Users behind GitHub-Releases-page filtering can grab the latest version via `Code → Download ZIP`. The official release page still has versioned artifacts; the in-repo folder is the fallback path
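A minimal sketch of that drop decision, with a hypothetical function name (`should_relay_datagram`) standing in for wherever the real check lives in the SOCKS5 UDP relay loop:

```rust
use std::net::SocketAddr;

// QUIC is just UDP to port 443, so when block_quic is set the relay
// drops those datagrams silently — no ICMP error, no SOCKS5 reply.
// The browser's QUIC stack times out and retries over TCP/HTTPS,
// which rides the normal CONNECT → relay path.
fn should_relay_datagram(dest: SocketAddr, block_quic: bool) -> bool {
    !(block_quic && dest.port() == 443)
}
```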
diff --git a/docs/changelog/v1.7.6.md b/docs/changelog/v1.7.6.md new file mode 100644 index 00000000..58d20ef0 --- /dev/null +++ b/docs/changelog/v1.7.6.md @@ -0,0 +1,4 @@ + +• Revert تغییر اشتباه v1.7.4 روی `googlevideo.com` ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275)، [#281](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/281)): v1.7.4 تلاش کرد `googlevideo.com` رو به لیست SNI rewrite اضافه کنه به این تئوری که chunk‌های ویدیو باید از Apps Script relay دور بزنن. **چندین کاربر گزارش دادن که v1.7.4 YouTube رو کاملاً شکست داد** — علت: `googlevideo.com` توسط edge IP‌های جدا "EVA" گوگل serve می‌شه، نه GFE IP عادی که `google_ip` کاربر معمولاً به اون اشاره می‌کنه. SNI-rewrite کردن `googlevideo.com:443` به یه GFE IP باعث TLS handshake failure یا wrong-cert error برای اون کاربرها شد. **رفتار قبل از v1.7.4 برگشته** (chunk‌های ویدیو از مسیر Apps Script relay می‌رن — کندتر ولی روی هر GFE IP قابل اعتماد). تغییرات `youtube_via_relay` carve-out از v1.7.4 (که `ytimg.com` رو از relay پاک کرد) دست نخورده — اون regression جدا بود و درست شده باقی مونده. اگه کاربری در آینده routing مستقیم googlevideo.com بخواد، یه knob مجزا لازم می‌شه که بتونه EVA edge IP خودش رو جداگانه مشخص کنه. +--- +• Revert v1.7.4 `googlevideo.com` SNI rewrite ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275), [#281](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/281)): v1.7.4 added `googlevideo.com` to the SNI rewrite list on the theory that video chunks should bypass the Apps Script relay. **Multiple users reported v1.7.4 broke YouTube entirely** — root cause: `googlevideo.com` is served by Google's separate "EVA" edge IPs, not the regular GFE IPs that `google_ip` typically points at. SNI-rewriting `googlevideo.com:443` to a GFE IP got TLS handshake failures or wrong-cert errors for those users. **Pre-v1.7.4 behaviour is restored** (video chunks go via the Apps Script relay path — slower but reliable on every GFE IP). The other v1.7.4 `youtube_via_relay` carve-out changes (which removed `ytimg.com` from the carve-out) are intact — those were a separate fix that's still correct. If a user ever wants direct googlevideo.com routing, that needs a separate config knob letting them specify their EVA edge IP independently. diff --git a/docs/changelog/v1.7.7.md b/docs/changelog/v1.7.7.md new file mode 100644 index 00000000..13f3cf54 --- /dev/null +++ b/docs/changelog/v1.7.7.md @@ -0,0 +1,6 @@ + +• اضافه شدن build برای ویندوز ۳۲ بیتی (i686-pc-windows-msvc) به matrix release ([#272](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/272), [#288](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/288)): کاربری که سیستم قدیمی ویندوز ۳۲ بیتی داشت درخواست build اختصاصی کرد. حالا artifact `mhrv-rs-windows-i686.zip` هم در release page موجوده.
ممنون از @amiralishoja برای PR +• رفع باگ "یک deployment معیوب همه session‌ها رو روی cadence legacy گیر می‌اندازه" ([#290](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/290)): قبلاً وقتی یکی از deployment‌ها پاسخ fast-empty می‌داد (یعنی long-poll رو نمی‌شناخت)، flag global `server_no_longpoll` فعال می‌شد و کل session‌ها رو روی cadence ۳۰ ثانیه‌ای legacy گیر می‌انداخت — حتی اگه deployment‌های دیگه راحت long-poll می‌کردن. اون flag همچنین هیچ‌وقت reset نمی‌شد، پس یه tunnel-node که redeploy یا recover شده بود تا restart شدن process به مسیر سریع برنمی‌گشت. **Fix:** state per-deployment با TTL ۶۰ ثانیه. flag aggregate فقط وقتی فعال می‌شه که **همه** deployment‌های یکتا mark شده باشن، و خودش رو از روی expiry self-correct می‌کنه. tunnel-node ارتقایافته خودش بدون restart به مسیر سریع long-poll برمی‌گرده. ۴ تست جدید با `tokio::test(start_paused = true)` برای پوشش timing logic. ممنون از @dazzling-no-more +--- +• Add 32-bit Windows (i686-pc-windows-msvc) to the release matrix ([#272](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/272), [#288](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/288)): a user with a legacy 32-bit Windows machine asked for a dedicated build. `mhrv-rs-windows-i686.zip` now appears alongside the other artifacts on every release page. Thanks @amiralishoja for the PR +• Fix "one degraded deployment drags all sessions onto the legacy cadence" bug ([#290](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/290)): previously, a single fast-empty observation from any one deployment flipped the global `server_no_longpoll` flag, dragging every session onto the 30-second legacy cadence even when the other deployments were happily long-polling. The flag also never reset, so a redeployed/recovered tunnel-node didn't return to the fast path until the mhrv-rs process restarted. **Fix:** state is now per-deployment with a 60-second TTL. The aggregate flag flips only when **every** unique configured deployment is marked, and self-corrects on read when entries expire. An upgraded tunnel-node rejoins the long-poll fast path on its own. 4 new tests using `tokio::test(start_paused = true)` to cover the timing logic without burning real wall-clock seconds. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.8.md b/docs/changelog/v1.7.8.md new file mode 100644 index 00000000..6f7d9030 --- /dev/null +++ b/docs/changelog/v1.7.8.md @@ -0,0 +1,4 @@ + +• Blacklist خودکار deployment با timeout مکرر در batch ([#319](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/319)): قبلاً وقتی یک deployment hang می‌کرد (معمولاً به دلیل `TUNNEL_SERVER_URL` قدیمی که به host از کار افتاده اشاره می‌کرد، یا Apps Script که UrlFetchApp داخلش hang کرده بود)، round-robin مدام traffic رو به همون deployment می‌فرستاد و sessionها timeout می‌خوردند بدون recovery تا restart process. **Fix:** state per-deployment با window ۳۰ ثانیه‌ای — ۳ timeout در پنجره ۳۰ ثانیه‌ای منجر به blacklist با cooldown ۱۲۰ ثانیه می‌شه. هر batch موفق strikeها رو پاک می‌کنه. cooldown کوتاه (۲ دقیقه به‌جای ۱۰ دقیقه برای quota) تا deploymentای که سریع recover می‌شه به‌سرعت برگرده. مستقل از blacklist موجود برای quota-error (که هنوز ۱۰ دقیقه cooldown داره). برای scenario `5 از 8 deployment کهنه`: بعد از یک batch، ۳ deployment dropped می‌شن و session جدید با احتمال خیلی بیشتر روی deployment سالم می‌افته.
ممنون از @dazzling-no-more +--- +• Auto-blacklist deployments after sustained batch timeouts ([#319](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/319)): previously, when a single deployment hung (most commonly due to a stale `TUNNEL_SERVER_URL` pointing at a dead host, or Apps Script's internal `UrlFetchApp` stalling), round-robin kept dispatching real traffic to it. Sessions piled into the bad deployment and timed out without recovery until the user restarted mhrv-rs. **Fix:** per-deployment strike counter with a 30-second sliding window — 3 timeouts in 30 s trigger a 120-second cooldown blacklist (see the sketch after the v1.7.9 notes below). Any successful batch clears the strike counter, so unrelated transient blips can't accumulate across hours. Short cooldown (2 min vs. the 10 min permanent-blacklist for quota errors) so a deployment that recovers rejoins the round-robin quickly. For the "5 of 8 deployments stale" scenario: after one batch each, the 3 dead deployments drop out and new sessions land on healthy deployments with much higher probability. Distinct from the quota blacklist (still 600s cooldown). Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.9.md b/docs/changelog/v1.7.9.md new file mode 100644 index 00000000..15a7c657 --- /dev/null +++ b/docs/changelog/v1.7.9.md @@ -0,0 +1,4 @@ + +• رفع باگ "binary i686 ویندوز روی Windows 7 ۳۲ بیتی load نمی‌شه" ([#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318)، [#323](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/323)): از Rust 1.78 (می ۲۰۲۴)، std به‌جای `GetSystemTimeAsFileTime` (Win2k+) از `GetSystemTimePreciseAsFileTime` (Win8+) استفاده می‌کنه، و در نتیجه binary ویندوز ۳۲ بیتی به exportی از kernel32 نیاز پیدا کرد که تو Win7 وجود نداره. binary v1.7.7/v1.7.8 با خطای `the procedure entry point GetSystemTimePreciseAsFile could not be located in the dynamic link library kernel32.dll` روی Win7 SP1 بسته می‌شد. **Fix:** فقط target `i686-pc-windows-msvc` رو در workflow CI به Rust 1.77.2 (آخرین stable Win7-compatible) pin کردیم. سایر targets روی stable می‌مونن. این یعنی artifact `mhrv-rs-windows-i686.zip` در v1.7.9 روی Win7 SP1 ۳۲ بیتی load می‌شه. ممنون از @Im-P3dro برای گزارش +--- +• Fix "i686 Windows binary fails to load on Windows 7 32-bit" ([#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318), [#323](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/323)): Rust 1.78 (May 2024) raised std's Windows MSRV from Win7 to Win10 by switching `std::time` to `GetSystemTimePreciseAsFileTime` (Win8+ in kernel32) instead of the older `GetSystemTimeAsFileTime`. The v1.7.7 and v1.7.8 i686 Windows binaries failed to load on Win7 SP1 with `the procedure entry point GetSystemTimePreciseAsFile could not be located in the dynamic link library kernel32.dll`, defeating the entire reason that target ships (legacy Win7 32-bit boxes per #272). **Fix:** pin only the `i686-pc-windows-msvc` CI matrix entry to Rust 1.77.2 (the last stable that targets Win7); every other target stays on `@stable`. The `mhrv-rs-windows-i686.zip` artifact in v1.7.9 once again loads on Win7 SP1. Thanks @Im-P3dro for the report.
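A minimal sketch of the v1.7.8 strike/cooldown bookkeeping referenced above, with illustrative names (`DeploymentHealth` and the constants are not the actual mhrv-rs types):

```rust
use std::time::{Duration, Instant};

const WINDOW: Duration = Duration::from_secs(30);    // strike sliding window
const COOLDOWN: Duration = Duration::from_secs(120); // timeout blacklist
const MAX_STRIKES: usize = 3;

// Illustrative type — not the actual mhrv-rs struct.
struct DeploymentHealth {
    strikes: Vec<Instant>,           // timestamps of recent batch timeouts
    cooldown_until: Option<Instant>, // Some(..) while blacklisted
}

impl DeploymentHealth {
    fn on_timeout(&mut self, now: Instant) {
        // Keep only strikes still inside the 30 s window, then record this one.
        self.strikes.retain(|t| now.duration_since(*t) < WINDOW);
        self.strikes.push(now);
        if self.strikes.len() >= MAX_STRIKES {
            self.cooldown_until = Some(now + COOLDOWN);
            self.strikes.clear();
        }
    }

    fn on_success(&mut self) {
        // A good batch wipes the slate, so unrelated blips can't accumulate.
        self.strikes.clear();
        self.cooldown_until = None;
    }

    fn usable(&self, now: Instant) -> bool {
        self.cooldown_until.map_or(true, |until| now >= until)
    }
}
```

The mux would hold one of these per configured script_id and skip any deployment whose `usable()` returns false when picking the next round-robin target.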
diff --git a/docs/changelog/v1.8.0.md b/docs/changelog/v1.8.0.md new file mode 100644 index 00000000..77a79bd7 --- /dev/null +++ b/docs/changelog/v1.8.0.md @@ -0,0 +1,12 @@ + +• Padding random برای پایلود Apps Script ([#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313)، [#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 1): هر request به Apps Script حالا یک فیلد `_pad` با طول uniform-random بین ۰-۱۰۲۴ بایت اضافه می‌کنه — به‌صورت base64 encoded. بدون این، طول request body در هر mode تقریباً ثابت می‌مونه + DPI ایران می‌تونه بر اساس distribution طول fingerprint بزنه. حالا packet sizes uniformly distributed هستن + length-clustering match نمی‌کنه. تأثیر bandwidth: متوسط ۵۱۲ بایت اضافه به batch ~۲KB = +۲۵٪، negligible در برابر floor latency Apps Script. backward-compatible: Code.gs قدیم هم کار می‌کنه (unknown JSON fields ignore می‌شن). +• دفاع در برابر active probing: decoy 200 HTML در Code.gs / CodeFull.gs روی AUTH_KEY بد ([#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 3): قبلاً request بدون auth `{"e":"unauthorized"}` JSON برمی‌گردوند — fingerprint مشخص "این یه API endpoint هست". حالا یه HTML benign placeholder برمی‌گردونه که شبیه یه Apps Script web app forgotten-but-public هست. scanner فعالی که با AUTH_KEY ساختگی POST می‌کنه این رو به‌عنوان «non-tunnel، چیز جالبی نداره» دسته‌بندی می‌کنه. flag `DIAGNOSTIC_MODE` برای setup که response قدیمی JSON رو برمی‌گردونه — default `false` (حالت production) +• دفاع در برابر active probing: decoy 404 به سبک nginx در tunnel-node روی auth بد: tunnel-node قبلاً `{"e":"unauthorized"}` JSON برمی‌گردوند. حالا response 404 با body HTML شبیه nginx default error می‌فرسته (active scanners "static web server هست، tunnel نیست" تشخیص می‌دن). env var `MHRV_DIAGNOSTIC=1` برای setup behavior قدیمی رو فعال می‌کنه +• رفع باگ "Usage today (estimated) در Full mode همیشه ۰" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230)، [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): counter `today_calls` و `today_bytes` فقط روی apps_script-mode relay path در `domain_fronter::relay()` افزایش می‌یافت. Full mode از `tunnel_client::fire_batch` می‌گذره که هیچ‌وقت کانتر رو آپدیت نمی‌کرد. حالا fire_batch بعد از batch موفق `record_today(response_bytes)` رو صدا می‌زنه — bytes از sum طول `d` و `pkts` در BatchTunnelResponse تخمین زده می‌شه. Full mode users حالا "Usage today" واقعی می‌بینن +• رفع باگ "quota reset countdown با time UTC به‌جای PT نشون داده می‌شه" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230)، [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): quota مربوط به UrlFetchApp در Apps Script در 00:00 **Pacific Time** ریست می‌شه (PST/PDT با DST)، نه UTC. ما UTC midnight رو نشون می‌دادیم — ۷-۸ ساعت off. fix: helpers جدید `current_pt_day_key()` + `seconds_until_pacific_midnight()` با hand-rolled DST detection (بدون اضافه کردن chrono-tz / 3MB tzdb). UI label "UTC day" → "PT day" تغییر کرد. ۲ test جدید برای DST window boundaries (مارس ۲۰۲۴/۲۰۲۶/۲۰۲۷، نوامبر ۲۰۲۴/۲۰۲۶) + فرمول day-of-week ساکاموتو +--- +• Random payload padding for Apps Script requests ([#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313), [#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 1): every outbound request to Apps Script now carries a `_pad` field of uniform-random length 0–1024 bytes (base64 encoded).
Before this, request body sizes within each mode were tightly clustered, giving ISP DPI a clean length-distribution fingerprint to match against. Now packet sizes are spread uniformly across the range so length-clustering DPI heuristics can't match. Bandwidth cost: ~512 bytes added to a typical 2 KB tunnel batch = +25%, negligible against Apps Script's per-call latency floor. Backward-compatible: old Code.gs deployments ignore the unknown field. Applied at all three payload-build sites: single relay, single tunnel op, batch tunnel. +• Active-probing defense: decoy 200 HTML on bad AUTH_KEY in `Code.gs` and `CodeFull.gs` ([#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 3): previously a request with a missing/wrong AUTH_KEY got `{"e":"unauthorized"}` as a JSON body — a clear "this is some kind of API endpoint" signal that active scanners can fingerprint. Now bad-auth requests get a benign HTML placeholder page that looks like a forgotten-but-public Apps Script web app, indistinguishable from the millions of stale Apps Script projects on Google's infrastructure. New `DIAGNOSTIC_MODE` const (default `false`) restores the old JSON error response for setup/debugging — flip to `true` while diagnosing a misconfigured client, then back to `false` before sharing the deployment widely. +• Active-probing defense: decoy 404 nginx-style HTML on bad auth in `tunnel-node` ([#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 3): previously a bad-auth request got `{"e":"unauthorized"}`. Now it gets an HTTP 404 with an `nginx`-style error page body, looking like a vanilla static web server. Active scanners that POST malformed payloads to `/tunnel` to discover proxy endpoints categorize this host as "boring" and move on. New `MHRV_DIAGNOSTIC=1` env var restores the verbose JSON error during setup; default is the production decoy. +• Fix "Usage today (estimated) is always 0 in Full mode" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230), [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): the daily-usage counters (`today_calls` / `today_bytes`) were incremented only on the `apps_script`-mode relay path inside `domain_fronter::relay()`. Full-mode traffic goes through `tunnel_client::fire_batch`, which never wired the counter. Now `fire_batch` calls `record_today(response_bytes)` after each successful batch — bytes are estimated from the sum of per-session `d` (TCP payload) and `pkts` (UDP datagrams) lengths in the `BatchTunnelResponse`, which is a stable proxy for "how much did this batch move." Full mode users now see real usage numbers instead of stuck-at-zero. +• Fix "quota reset countdown shown in UTC instead of Pacific Time" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230), [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): Apps Script's `UrlFetchApp` quota actually resets at midnight Pacific Time (PST/PDT — observes DST), not midnight UTC. We were displaying the countdown to UTC midnight, which is 7–8 hours off depending on DST. Fix: new `current_pt_day_key()` + `seconds_until_pacific_midnight()` helpers using a hand-rolled US DST detector (2nd Sunday of March → 1st Sunday of November = PDT, otherwise PST) so we don't pull `chrono-tz` and a ~3 MB IANA tzdb just for one helper. UI label updated from "UTC day" to "PT day". Two new tests pin down the DST window boundaries (March 2024 / 2026 / 2027, November 2024 / 2026) and Sakamoto's day-of-week formula — a minimal sketch of the detector follows.
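A minimal sketch of that detector under the rule stated above (2nd Sunday of March → 1st Sunday of November = PDT); names are illustrative and this operates at day granularity only — the real helpers still have to pick an offset for the switchover days themselves:

```rust
// Sakamoto's day-of-week formula, 0 = Sunday.
fn day_of_week(y: i32, m: u32, d: u32) -> u32 {
    const T: [i32; 12] = [0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4];
    let y = if m < 3 { y - 1 } else { y };
    ((y + y / 4 - y / 100 + y / 400 + T[(m - 1) as usize] + d as i32) % 7) as u32
}

// Day of month of the nth (1-based) Sunday of the given month.
fn nth_sunday(y: i32, m: u32, n: u32) -> u32 {
    let first_dow = day_of_week(y, m, 1); // 0 = Sunday
    1 + (7 - first_dow) % 7 + 7 * (n - 1)
}

fn is_pdt(y: i32, m: u32, d: u32) -> bool {
    match m {
        4..=10 => true,                // Apr–Oct: always PDT
        3 => d >= nth_sunday(y, 3, 2), // from the 2nd Sunday of March
        11 => d < nth_sunday(y, 11, 1), // until the 1st Sunday of November
        _ => false,                    // Dec–Feb: PST
    }
}
```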
diff --git a/docs/changelog/v1.8.1.md b/docs/changelog/v1.8.1.md new file mode 100644 index 00000000..56bcadc2 --- /dev/null +++ b/docs/changelog/v1.8.1.md @@ -0,0 +1,8 @@ + +• تشخیص خطای decoy v1.8.0 در سمت کلاینت — پیغام واضح به‌جای cryptic ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)، [#310](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/310)): قبلاً وقتی deployment auth fail می‌گرفت + decoy HTML برمی‌گردوند، client پیغام `WARN batch failed: bad response: no json in batch response: ...` می‌داد. کاربر باید خودش متن decoy رو می‌شناخت تا تشخیص بده. حالا client decoy رو با string-match تشخیص می‌ده + پیغام explicit می‌ده: "got the v1.8.0 bad-auth decoy — your AUTH_KEY in mhrv-rs config does NOT match the AUTH_KEY in this deployment's Code.gs. Either fix the mismatch + redeploy as a NEW VERSION, or set DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy to see the explicit JSON `unauthorized` error during setup." — کاربر مستقیم می‌فهمه چی بکنه + ساعت‌ها وقت debug صرفه‌جویی می‌شه +• اضافه شدن `script_id` به همه log‌های batch-failure ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): قبلاً log `WARN batch failed: ...` نام deploymentی که fail کرد رو نشون نمی‌داد. در multi-deployment scenarios (۵-۱۰ deployment که یکی‌دوتاشون AUTH_KEY کهنه دارن)، کاربر بدون یه حلقهٔ curl probe برای تک‌تک deploymentها نمی‌تونست deployment معیوب رو پیدا کنه. حالا همه پیغام‌های failure (timeout، bad response، decoy، missing-response-in-batch) شامل short prefix script_id هستند: `batch failed (script AKfycbz4): ...`. این به‌همراه تشخیص decoy بالا، اولین diagnostic قابل‌اعتماد برای الگوی multi-deployment با یک AUTH_KEY خراب رو می‌سازه +• Flag config جدید `disable_padding: true` ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391)): پیش‌فرض `false` (padding فعال = DPI defense). برای کاربران روی ISP‌های heavily-throttled که هزینهٔ ~۲۵٪ bandwidth padding با throttle جمع می‌شه و batchهای borderline رو به timeout می‌کشونه، گذاشتن `"disable_padding": true` در config.json در ازای از دست دادن محافظت length-distribution در برابر DPI، headroom برمی‌گردونه. توصیه نمی‌شه speculatively فعال بشه — فقط بعد از اندازه‌گیری بهبود throughput. +--- +• Client-side decoy detection — clear hint instead of cryptic error ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404), [#310](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/310)): previously when a deployment had a stale/wrong AUTH_KEY, it returned the v1.8.0 bad-auth decoy HTML to mhrv-rs, and the client logged `WARN batch failed: bad response: no json in batch response: ...` — leaving the user to recognize the decoy body string and infer the cause. Now the client string-matches the decoy and emits an explicit error: "got the v1.8.0 bad-auth decoy — your AUTH_KEY in mhrv-rs config does NOT match the AUTH_KEY in this deployment's Code.gs. Either fix the mismatch + redeploy as a NEW VERSION (Apps Script doesn't auto-pick-up AUTH_KEY edits without an explicit redeploy), or set DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy to see the explicit JSON `unauthorized` error during setup." Saves users hours of staring at "no json in batch response" trying to figure out what's wrong. +• Add `script_id` to every batch-failure log line ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): previously `WARN batch failed: ...` didn't identify which deployment failed.
In multi-deployment setups (5-10 deployments where one or two have a stale AUTH_KEY), users couldn't identify the culprit without a per-deployment curl probe loop. Every failure log line now includes the short script_id prefix: `batch failed (script AKfycbz4): ...`, applied to all four failure paths (timeout, bad response, decoy, missing-response-in-batch). Together with the decoy detection above, this is the first reliable diagnostic for the multi-deployment-with-one-bad-AUTH_KEY user pattern. +• New `disable_padding: true` config flag ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391)): default `false` (padding active, full DPI defense). For users on heavily-throttled ISPs where the v1.8.0 random-padding cost (+~25% bandwidth per batch) compounds with the throttle to push borderline-working batches into timeouts, setting `"disable_padding": true` in `config.json` recovers headroom in exchange for losing length-distribution DPI defense. Don't flip it on speculatively — where the Apps Script path is uncongested, padding is free defense. Only enable it if you've measured a throughput improvement after the flip on your specific ISP path. diff --git a/docs/changelog/v1.8.2.md b/docs/changelog/v1.8.2.md new file mode 100644 index 00000000..9350c9c3 --- /dev/null +++ b/docs/changelog/v1.8.2.md @@ -0,0 +1,6 @@ + +• اصلاح log level در UI binary (Windows + Android) ([#401](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/401)): قبلاً `mhrv-rs-ui` (و Android) فیلتر tracing رو فقط از `RUST_LOG` env var یا default `info,hyper=warn` می‌خوند — مقدار `log_level` در `config.json` در عمل ignore می‌شد. فرم UI combobox `log_level` داشت ولی هیچ‌جا به subscriber اعمال نمی‌شد. حالا precedence اینه: `RUST_LOG` (اگر set باشه) > `config.log_level` > `info,hyper=warn`. علاوه بر این Save در UI الان log level رو live اعمال می‌کنه (بدون نیاز به restart) از طریق reload handle. CLI `mhrv-rs` از قبل درست کار می‌کرد — این فقط fix UI bin بود. +• پیغام تشخیص decoy ملایم‌تر — به‌جای assert کردن AUTH_KEY mismatch، چهار علت ممکن رو enumerate می‌کنه ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): @w0l4i گزارش داد همون `script_id` داخل یه پنجرهٔ یک‌دقیقه‌ای گاهی decoy و گاهی batch موفق برمی‌گردونه — یعنی علت AUTH_KEY mismatch نیست (اگر بود، ۱۰۰٪ batchها fail می‌شدن). تحقیق نشون داد body string `"The script completed but did not return anything"` اختصاصی به decoy v1.8.0 ما نیست — Apps Script همون body رو در ۴ سناریو برمی‌گردونه: (۱) AUTH_KEY mismatch (decoy ما، intentional)، (۲) Apps Script execution timeout یا quota tear، (۳) Google-side internal hiccup، (۴) ISP-side response truncation (#313 pattern). پیغام خطای v1.8.1 در سناریوهای ۲-۴ false positive می‌داد. حالا پیغام: "got the v1.8.0 decoy/placeholder body — could be (1) AUTH_KEY mismatch, (2) Apps Script execution timeout/quota tear, (3) Apps Script internal hiccup, (4) ISP-side response truncation. Set DIAGNOSTIC_MODE=true to disambiguate (1) — only AUTH_KEY mismatch returns this body in diagnostic mode." کاربر حالا یه روند مشخص برای محدود کردن علت داره. +--- +• Fix log level on the UI binary (Windows + Android) ([#401](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/401)): previously `mhrv-rs-ui` (and Android, which uses the same JNI path) installed its tracing filter from `RUST_LOG` only — falling back to `info,hyper=warn` when unset. The `log_level` field in `config.json` was effectively ignored, even though the UI form has a combobox that writes to it. The CLI binary (`mhrv-rs`) read `config.log_level` correctly via `init_logging()`; only the UI binary was broken. New precedence: `RUST_LOG` (explicit override) > `config.log_level` (what the user picked in the form) > `info,hyper=warn` (default) — see the sketch after this list. The Save button now also reinstalls the filter live via a `tracing_subscriber::reload::Handle`, so users don't need to restart for a level change to take effect. RUST_LOG still wins if set at boot — the explicit override beats config either way. +• Soften the v1.8.1 decoy detection error message — enumerate four candidate causes instead of asserting AUTH_KEY mismatch ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): @w0l4i reported the same `script_id` mixing decoy ERROR with successful batches inside a one-minute window — which rules out AUTH_KEY mismatch as the cause (a real mismatch fails 100% of batches against that deployment, never succeeds intermittently). Investigation showed the body string `"The script completed but did not return anything"` is **not** unique to our v1.8.0 bad-auth path — Apps Script itself returns the same body in three other unrelated cases: (2) Apps Script execution timeout or per-100s quota tear, (3) Google-side internal runtime hiccup, (4) ISP-side response truncation mid-flight (the #313 pattern). The v1.8.1 error message was a false positive in scenarios 2-4. The v1.8.2 message now reads: "got the v1.8.0 decoy/placeholder body — could be (1) AUTH_KEY mismatch (run a direct curl probe against the deployment to verify), (2) Apps Script execution timeout or per-100s quota tear (try lowering parallel_concurrency), (3) Apps Script internal hiccup (transient, retry next batch), or (4) ISP-side response truncation (#313 pattern, try a different google_ip). To distinguish (1) from the rest: set DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy as new version — only AUTH_KEY mismatch returns this body in diagnostic mode." Users now have an actionable narrowing procedure instead of a confidently-wrong assertion.
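A minimal sketch of the precedence and live-reload wiring, assuming `tracing-subscriber` with the `env-filter` feature; the function name and exact layering are illustrative, not the actual `install_ui_tracing` internals:

```rust
use tracing_subscriber::{layer::SubscriberExt, reload, util::SubscriberInitExt, EnvFilter, Registry};

// Hypothetical helper: returns the reload handle the UI's Save handler
// can use to swap the filter in place, without a process restart.
fn install_filter(config_log_level: Option<&str>) -> reload::Handle<EnvFilter, Registry> {
    // Precedence: RUST_LOG (explicit override) > config.log_level > default.
    let directive = std::env::var("RUST_LOG")
        .ok()
        .or_else(|| config_log_level.map(String::from))
        .unwrap_or_else(|| "info,hyper=warn".to_string());

    let (filter, handle) = reload::Layer::new(EnvFilter::new(directive));
    tracing_subscriber::registry().with(filter).init();
    handle
}

// Later, when the user hits Save with a new level:
//     let _ = handle.reload(EnvFilter::new("debug,hyper=warn"));
```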
diff --git a/docs/changelog/v1.8.3.md b/docs/changelog/v1.8.3.md new file mode 100644 index 00000000..f408ce94 --- /dev/null +++ b/docs/changelog/v1.8.3.md @@ -0,0 +1,12 @@ + +• cache spreadsheet اختیاری در Code.gs برای کاهش مصرف quota ([#400](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/400)، PR [#443](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/443) از @euvel): GET requestهای عمومی که Cache-Control header دارن می‌تونن از Google Sheet به‌جای UrlFetchApp خود Apps Script serve بشن. هزینه‌ی هر cache hit ~۵-۲۰ms (Sheet read) به‌جای ۲۵۰-۵۰۰ms (UrlFetchApp roundtrip). قابلیت‌ها: TTL-aware (رعایت max-age، no-cache، no-store، private)، header rewriting (Date/Age/X-Cache)، circular buffer برای O(1) writes، Vary-aware با Accept-Encoding/Accept-Language. opt-in via یک constant `CACHE_SPREADSHEET_ID` در Code.gs — default غیرفعال، بدون overhead برای کاربرانی که نمی‌خوان. setup: ساخت یک Google Sheet جدید + قرار دادن ID آن در `CACHE_SPREADSHEET_ID` + redeploy as new version +• دور زدن tunnel Apps Script برای endpointهای DoH ([#377](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/377)، PR [#439](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/439) از @dazzling-no-more): قبلاً در Full mode هر DNS-over-HTTPS lookup مرورگر از طریق Apps Script tunnel می‌رفت — `chrome.cloudflare-dns.com:443`، `dns.google:443` و بقیه هر کدوم ~۲ ثانیه roundtrip UrlFetchApp به ازای هر name هزینه داشتن.
ولی DoH از قبل encrypted هست + tunnel privacy اضافه‌ای نمی‌ده — فقط fact-of-DoH رو از local network مخفی می‌کنه که ناچیزه. حالا config `bypass_doh_hosts` (پیش‌فرض فعال) lookupهای DoH رو مستقیم از TCP/443 رد می‌کنه. لیست کامل bypass شامل: Cloudflare (incl. chrome./mozilla./1dot1dot1dot1.)، Google، Quad9، AdGuard، NextDNS، OpenDNS، CleanBrowsing، dns.sb، dns0.eu، AliDNS، doh.pub، Mullvad. کاربران می‌تونن با `tunnel_doh: true` در config opt-out کنن یا با `bypass_doh_hosts: ["custom1.com", "custom2.com"]` لیست رو extend کنن +• H1 container keepalive (~۲۴۰s) برای جلوگیری از Apps Script V8 cold-start stalls (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) از @dazzling-no-more): Apps Script container‌ها بعد از ~۵ دقیقه idle cold می‌شن + ۱-۳ ثانیه به wake-up زمان می‌برن. این stall به‌خصوص در playback یوتیوب بعد از یه pause طولانی به‌وضوح دیده می‌شد. با ping HEAD به example.com هر ۲۴۰ ثانیه از طریق relay، container warm نگه داشته می‌شه. cache + inflight coalescer bypass شده تا ping واقعاً به Apps Script برسه. در google_only mode غیرفعال +• 431 Request Header Fields Too Large به‌جای drop بی‌صدا (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) از @dazzling-no-more): قبلاً اگر header block > ۱ MB می‌شد، socket drop می‌شد + browser silently retry می‌کرد + loop ابدی. حالا cap به ۶۴ KB کاهش یافته (match با upstream Python) + explicit `HTTP/1.1 431 Request Header Fields Too Large` response برمی‌گرده + close می‌شه. browser ارور رو می‌بینه + دیگه loop نمی‌زنه +• پیام error config port-collision واضح‌تر شد (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438)): قبلاً پیام cryptic بود. حالا: `"both set to 8080 on 127.0.0.1. Change one of them in config.json."` — کاربر مستقیم می‌فهمه چی رو fix کنه +--- +• Optional spreadsheet-backed response cache in `Code.gs` to reduce UrlFetchApp quota consumption ([#400](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/400), PR [#443](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/443) by @euvel): public GET requests with `Cache-Control` headers can now be served from a Google Sheet instead of round-tripping through `UrlFetchApp`. Cache hit costs ~5-20ms (Sheet read) vs ~250-500ms (UrlFetchApp). Features: TTL-aware caching (respects `max-age`, `no-cache`, `no-store`, `private`), 35 KB body-size gate (under the Sheets cell limit), header rewriting (Date/Age/Cache-Control/X-Cache/X-Cached-At), circular buffer for O(1) writes, Vary-aware compound keys (Accept-Encoding + Accept-Language). Opt-in via a single `CACHE_SPREADSHEET_ID` constant — default off, zero overhead for users who don't want it. Setup: create a new Google Sheet, paste its ID into `CACHE_SPREADSHEET_ID`, redeploy as new version. Run `getCacheStats()` from the Apps Script editor to see hit/miss/eviction counts. +• Bypass Apps Script tunnel for DoH endpoints on TCP/443 ([#377](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/377), PR [#439](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/439) by @dazzling-no-more): previously every browser DNS-over-HTTPS lookup in Full mode rode through the Apps Script tunnel — `chrome.cloudflare-dns.com:443`, `dns.google:443`, etc. each paid the ~2-second UrlFetchApp round-trip per name. But DoH is already encrypted at the transport layer; tunneling it adds no real privacy (only hiding fact-of-DoH from the local network, which is marginal). Now `bypass_doh_hosts` config (default `true`) routes known DoH hosts around the tunnel via plain TCP.
Built-in list: Cloudflare (incl. `chrome.`/`mozilla.`/`1dot1dot1dot1.` browser-pinned variants), Google, Quad9, AdGuard, NextDNS, OpenDNS, CleanBrowsing, dns.sb, dns0.eu, AliDNS, doh.pub, Mullvad. Users can opt out with `tunnel_doh: true` or extend the list with `bypass_doh_hosts: ["custom1.com", "custom2.com"]`. Gated to TCP/443 only — private DoH endpoints on `:8443` should use `passthrough_hosts` instead. ProxyServer warns at startup if `tunnel_doh: true` is paired with non-empty `bypass_doh_hosts` (the otherwise-silent inert combo). 6 unit tests for `matches_doh_host` covering exact match, case insensitivity, trailing dots, suffix tenant subdomains, user extras extending the default list, and the asymmetric-matching footgun guard. +• H1 container keepalive (~240s) to prevent Apps Script V8 cold-start stalls (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) by @dazzling-no-more): Apps Script V8 containers go cold after ~5 minutes idle and cost 1-3s to wake. Most visible as YouTube player stalls after a quiet pause. Now sends a `HEAD http://example.com/` ping every 240s through the relay to keep the container warm. Bypasses the response cache and inflight coalescer (otherwise the second iteration would just hit the cache and never reach Apps Script). Skipped in `google_only` mode. The `JoinHandle` is captured so shutdown's `select!` arm can abort it cleanly — without that, hitting Stop in the UI would leave the keepalive holding an `Arc` on stale config (same class of bug as #99 hit for accept loops). +• 431 Request Header Fields Too Large instead of silent drop (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) by @dazzling-no-more): previously header blocks larger than 1 MB were silently dropped at the socket level, causing browsers to retry on connection-reset and loop indefinitely on the same oversized request. Now the cap is tightened to 64 KB (matching upstream Python's `MAX_HEADER_BYTES`) and oversized requests get an explicit `HTTP/1.1 431 Request Header Fields Too Large` reply followed by close. Both the plaintext HTTP frontend and the MITM HTTPS relay path now do this. Browsers see the error and don't loop. +• Clearer port-collision error message (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438)): the same-port validation already existed; only the message was vague. Now reads: `"both set to 8080 on 127.0.0.1. Change one of them in config.json."` matching upstream Python's clarity. diff --git a/releases/README.md b/releases/README.md index d96ede6c..ab492040 100644 --- a/releases/README.md +++ b/releases/README.md @@ -2,11 +2,11 @@ This folder contains the prebuilt binaries from the latest release, committed directly to the repository for users who cannot reach the GitHub Releases page. 
-Current version: **v1.1.0** +Current version: **v1.8.2** | File | Platform | Contents | |---|---|---| -| `mhrv-rs-android-universal-v1.1.0.apk` | Android 7.0+ (all ABIs) | Universal APK — arm64-v8a, armeabi-v7a, x86_64, x86 in one file | +| `mhrv-rs-android-universal-v1.8.2.apk` | Android 7.0+ (all ABIs) | Universal APK — arm64-v8a, armeabi-v7a, x86_64, x86 in one file | | `mhrv-rs-linux-amd64.tar.gz` | Linux x86_64 | `mhrv-rs`, `mhrv-rs-ui`, `run.sh` | | `mhrv-rs-linux-arm64.tar.gz` | Linux aarch64 | `mhrv-rs`, `run.sh` (CLI only) | | `mhrv-rs-raspbian-armhf.tar.gz` | Raspberry Pi / ARMv7 hardfloat | `mhrv-rs`, `run.sh` (CLI only) | @@ -45,7 +45,7 @@ Extract `mhrv-rs-windows-amd64.zip`, then double-click `run.bat` inside the extr ### Android -Copy `mhrv-rs-android-universal-v1.1.0.apk` to your phone, tap it from the Files app, and allow "Install unknown apps" for whichever app is opening the APK (Files, Chrome, etc.). See [the Android guide](../docs/android.md) for the full walk-through of the first-run steps (Apps Script deployment, MITM CA install, VPN permission, SNI tester). +Copy `mhrv-rs-android-universal-v1.8.2.apk` to your phone, tap it from the Files app, and allow "Install unknown apps" for whichever app is opening the APK (Files, Chrome, etc.). See [the Android guide](../docs/android.md) for the full walk-through of the first-run steps (Apps Script deployment, MITM CA install, VPN permission, SNI tester). See the [main README](../README.md) for desktop setup (Apps Script deployment, config, browser proxy settings). @@ -55,7 +55,7 @@ See the [main README](../README.md) for desktop setup (Apps Script deployment, c این پوشه شامل فایل‌های آخرین نسخه است و مستقیماً در ریپو قرار گرفته برای کاربرانی که به صفحهٔ GitHub Releases دسترسی ندارند. -نسخهٔ فعلی: **v1.1.0** +نسخهٔ فعلی: **v1.8.2** ### دانلود از طریق ZIP @@ -73,6 +73,6 @@ cd mhrv-rs-macos-arm64 **ویندوز:** فایل `mhrv-rs-windows-amd64.zip` را extract کنید و داخل پوشه روی `run.bat` دو بار کلیک کنید (UAC را قبول کنید تا گواهی MITM نصب شود). -**اندروید:** فایل `mhrv-rs-android-universal-v1.1.0.apk` را روی گوشی کپی کنید، از Files app روی آن tap کنید و اجازهٔ "نصب برنامه‌های ناشناس" را بدهید. راهنمای کامل شروع به کار (دیپلوی Apps Script، نصب CA، اجازهٔ VPN، تستر SNI) در [راهنمای اندروید](../docs/android.md) هست. +**اندروید:** فایل `mhrv-rs-android-universal-v1.8.2.apk` را روی گوشی کپی کنید، از Files app روی آن tap کنید و اجازهٔ "نصب برنامه‌های ناشناس" را بدهید. راهنمای کامل شروع به کار (دیپلوی Apps Script، نصب CA، اجازهٔ VPN، تستر SNI) در [راهنمای اندروید](../docs/android.md) هست. برای راه‌اندازی کامل دسکتاپ (دیپلوی Apps Script، config، تنظیم proxy مرورگر) به [README اصلی](../README.md) مراجعه کنید. 
diff --git a/releases/mhrv-rs-android-arm64-v8a-v1.8.2.apk b/releases/mhrv-rs-android-arm64-v8a-v1.8.2.apk new file mode 100644 index 00000000..407e5390 Binary files /dev/null and b/releases/mhrv-rs-android-arm64-v8a-v1.8.2.apk differ diff --git a/releases/mhrv-rs-android-armeabi-v7a-v1.8.2.apk b/releases/mhrv-rs-android-armeabi-v7a-v1.8.2.apk new file mode 100644 index 00000000..6493245d Binary files /dev/null and b/releases/mhrv-rs-android-armeabi-v7a-v1.8.2.apk differ diff --git a/releases/mhrv-rs-android-universal-v1.1.0.apk b/releases/mhrv-rs-android-universal-v1.8.2.apk similarity index 79% rename from releases/mhrv-rs-android-universal-v1.1.0.apk rename to releases/mhrv-rs-android-universal-v1.8.2.apk index 417022c1..d3f922aa 100644 Binary files a/releases/mhrv-rs-android-universal-v1.1.0.apk and b/releases/mhrv-rs-android-universal-v1.8.2.apk differ diff --git a/releases/mhrv-rs-android-x86-v1.8.2.apk b/releases/mhrv-rs-android-x86-v1.8.2.apk new file mode 100644 index 00000000..25f1af79 Binary files /dev/null and b/releases/mhrv-rs-android-x86-v1.8.2.apk differ diff --git a/releases/mhrv-rs-android-x86_64-v1.8.2.apk b/releases/mhrv-rs-android-x86_64-v1.8.2.apk new file mode 100644 index 00000000..bddbb203 Binary files /dev/null and b/releases/mhrv-rs-android-x86_64-v1.8.2.apk differ diff --git a/releases/mhrv-rs-linux-amd64.tar.gz b/releases/mhrv-rs-linux-amd64.tar.gz index 30e2a7ff..6a61fea3 100644 Binary files a/releases/mhrv-rs-linux-amd64.tar.gz and b/releases/mhrv-rs-linux-amd64.tar.gz differ diff --git a/releases/mhrv-rs-linux-arm64.tar.gz b/releases/mhrv-rs-linux-arm64.tar.gz index 121f5b1e..e697720f 100644 Binary files a/releases/mhrv-rs-linux-arm64.tar.gz and b/releases/mhrv-rs-linux-arm64.tar.gz differ diff --git a/releases/mhrv-rs-linux-musl-amd64.tar.gz b/releases/mhrv-rs-linux-musl-amd64.tar.gz index 87f3e8c5..e0a6e6fb 100644 Binary files a/releases/mhrv-rs-linux-musl-amd64.tar.gz and b/releases/mhrv-rs-linux-musl-amd64.tar.gz differ diff --git a/releases/mhrv-rs-linux-musl-arm64.tar.gz b/releases/mhrv-rs-linux-musl-arm64.tar.gz index 53a75366..6629496f 100644 Binary files a/releases/mhrv-rs-linux-musl-arm64.tar.gz and b/releases/mhrv-rs-linux-musl-arm64.tar.gz differ diff --git a/releases/mhrv-rs-macos-amd64-app.zip b/releases/mhrv-rs-macos-amd64-app.zip index 6d371b1f..75f640a8 100644 Binary files a/releases/mhrv-rs-macos-amd64-app.zip and b/releases/mhrv-rs-macos-amd64-app.zip differ diff --git a/releases/mhrv-rs-macos-amd64.tar.gz b/releases/mhrv-rs-macos-amd64.tar.gz index 46c62afb..d2517a67 100644 Binary files a/releases/mhrv-rs-macos-amd64.tar.gz and b/releases/mhrv-rs-macos-amd64.tar.gz differ diff --git a/releases/mhrv-rs-macos-arm64-app.zip b/releases/mhrv-rs-macos-arm64-app.zip index cf5cf66b..adbbb577 100644 Binary files a/releases/mhrv-rs-macos-arm64-app.zip and b/releases/mhrv-rs-macos-arm64-app.zip differ diff --git a/releases/mhrv-rs-macos-arm64.tar.gz b/releases/mhrv-rs-macos-arm64.tar.gz index c19d34d8..d4f728ae 100644 Binary files a/releases/mhrv-rs-macos-arm64.tar.gz and b/releases/mhrv-rs-macos-arm64.tar.gz differ diff --git a/releases/mhrv-rs-openwrt-mipsel-softfloat.tar.gz b/releases/mhrv-rs-openwrt-mipsel-softfloat.tar.gz new file mode 100644 index 00000000..63b0ad7f Binary files /dev/null and b/releases/mhrv-rs-openwrt-mipsel-softfloat.tar.gz differ diff --git a/releases/mhrv-rs-raspbian-armhf.tar.gz b/releases/mhrv-rs-raspbian-armhf.tar.gz index 72726890..65330c1e 100644 Binary files a/releases/mhrv-rs-raspbian-armhf.tar.gz and 
b/releases/mhrv-rs-raspbian-armhf.tar.gz differ diff --git a/releases/mhrv-rs-windows-amd64.zip b/releases/mhrv-rs-windows-amd64.zip index baed0917..1a1fc5ef 100644 Binary files a/releases/mhrv-rs-windows-amd64.zip and b/releases/mhrv-rs-windows-amd64.zip differ diff --git a/src/android_jni.rs b/src/android_jni.rs index 6f467bec..6bb5a975 100644 --- a/src/android_jni.rs +++ b/src/android_jni.rs @@ -482,3 +482,53 @@ pub extern "system" fn Java_com_therealaleph_mhrv_Native_statsJson<'a>( })); env.new_string(out).map(|s| s.into_raw()).unwrap_or(std::ptr::null_mut()) } + +// --------------------------------------------------------------------------- +// tun2proxy CLI API wrapper (dlsym — no fork or patch needed) +// --------------------------------------------------------------------------- + +/// `Native.runTun2proxy(cliArgs, tunMtu)` -> int +/// +/// Calls `tun2proxy_run_with_cli_args` from libtun2proxy.so via dlsym. +/// This is the C API the tun2proxy maintainer recommends for callers that +/// need full CLI flexibility (e.g. --udpgw-server). BLOCKS until shutdown. +#[no_mangle] +pub extern "system" fn Java_com_therealaleph_mhrv_Native_runTun2proxy<'a>( + mut env: JNIEnv<'a>, + _class: JClass, + cli_args: JString, + tun_mtu: jni::sys::jint, +) -> jni::sys::jint { + safe(-1, AssertUnwindSafe(|| { + let args_str = jstring_to_string(&mut env, &cli_args); + tracing::info!("runTun2proxy: cli={}", args_str); + + unsafe { + use std::ffi::{CStr, CString}; + + let lib = CString::new("libtun2proxy.so").unwrap(); + let handle = libc::dlopen(lib.as_ptr(), libc::RTLD_NOW); + if handle.is_null() { + let err = CStr::from_ptr(libc::dlerror()); + tracing::error!("dlopen libtun2proxy.so failed: {:?}", err); + return -10; + } + + let sym = CString::new("tun2proxy_run_with_cli_args").unwrap(); + let func = libc::dlsym(handle, sym.as_ptr()); + if func.is_null() { + let err = CStr::from_ptr(libc::dlerror()); + tracing::error!("dlsym tun2proxy_run_with_cli_args: {:?}", err); + libc::dlclose(handle); + return -11; + } + + type RunFn = unsafe extern "C" fn(*const std::ffi::c_char, u16, bool) -> i32; + let run: RunFn = std::mem::transmute(func); + let c_args = CString::new(args_str).unwrap(); + let rc = run(c_args.as_ptr(), tun_mtu as u16, false); + libc::dlclose(handle); + rc + } + })) +} diff --git a/src/bin/ui.rs b/src/bin/ui.rs index 8409863b..c174d76d 100644 --- a/src/bin/ui.rs +++ b/src/bin/ui.rs @@ -9,7 +9,7 @@ use tokio::runtime::Runtime; use tokio::sync::Mutex as AsyncMutex; use tokio::task::JoinHandle; -use mhrv_rs::cert_installer::install_ca; +use mhrv_rs::cert_installer::{install_ca, reconcile_sudo_environment, remove_ca}; use mhrv_rs::config::{Config, ScriptId}; use mhrv_rs::data_dir; use mhrv_rs::domain_fronter::{DomainFronter, DEFAULT_GOOGLE_SNI_POOL}; @@ -24,21 +24,38 @@ const LOG_MAX: usize = 200; fn main() -> eframe::Result<()> { let _ = rustls::crypto::ring::default_provider().install_default(); + // Re-point HOME at the invoking user if this binary was launched + // under sudo (see cert_installer::reconcile_sudo_environment). Must + // run before any data_dir / firefox_profile_dirs call. + reconcile_sudo_environment(); mhrv_rs::rlimit::raise_nofile_limit_best_effort(); let shared = Arc::new(Shared::default()); let (cmd_tx, cmd_rx) = std::sync::mpsc::channel::(); + // Load the user's saved form first so we can seed the tracing filter + // with their saved log level. 
Otherwise the form's log-level combobox + // would only ever take effect via env var or after Save → restart, and + // users on the UI binary (issue #401) reasonably expect the saved + // config.json `log_level` to apply at boot like it does for the CLI. + let (form, load_err) = load_form(); + let initial_toast = load_err.map(|e| (e, Instant::now())); + // Hook tracing events into the Recent log panel. Without this every // tracing::info! / debug! / trace! the proxy emits gets swallowed and // the panel only ever shows our manual push_log calls, making the log // level selector look useless (issue #12 bug 2). // - // The env-filter respects RUST_LOG if set, otherwise defaults to info - // so users see routing decisions immediately without any knob-turning. - // When they start the proxy and Save the config, the log level from the - // config is applied to the in-process filter (see on_start below). - install_ui_tracing(shared.clone()); + // Filter precedence (issue #401 fix in v1.8.2): + // 1. RUST_LOG env var if set — explicit override + // 2. Saved config's `log_level` (passed from form) — what users mean + // when they pick a level in the UI + // 3. "info,hyper=warn" — sensible default + // + // Save inside the running UI also installs the new filter via the + // reload handle (see `LOG_RELOAD` below), so users don't need to + // restart for a config change to take effect. + install_ui_tracing(shared.clone(), &form.log_level); let shared_bg = shared.clone(); std::thread::Builder::new() @@ -46,9 +63,6 @@ fn main() -> eframe::Result<()> { .spawn(move || background_thread(shared_bg, cmd_rx)) .expect("failed to spawn background thread"); - let (form, load_err) = load_form(); - let initial_toast = load_err.map(|e| (e, Instant::now())); - // Pick the renderer. Default is `glow` (OpenGL 2+) because that's // what we shipped through v1.0.x and it has the least binary-size // overhead. Users on older Windows boxes / RDP sessions / headless @@ -68,7 +82,11 @@ fn main() -> eframe::Result<()> { .with_inner_size([WIN_WIDTH, WIN_HEIGHT]) .with_min_inner_size([420.0, 400.0]) .with_title(format!("mhrv-rs {}", VERSION)), - renderer: if use_wgpu { eframe::Renderer::Wgpu } else { eframe::Renderer::Glow }, + renderer: if use_wgpu { + eframe::Renderer::Wgpu + } else { + eframe::Renderer::Glow + }, ..Default::default() }; @@ -116,6 +134,22 @@ struct UiState { /// Set while a download of a release asset is in flight. `None` when /// idle or after a completed download has been acknowledged. download_in_progress: bool, + /// Set while an install-or-remove cert op is in flight. Install and + /// Remove share this single flag so they can't race each other: + /// clicking Install → Remove back-to-back would otherwise leave the + /// final trust/file state dependent on thread scheduling — an + /// in-flight install could re-trust the CA after Remove had already + /// deleted it, or vice versa. Both UI buttons disable while this + /// is set, and both handlers gate-and-flip it. + cert_op_in_progress: bool, + /// Set synchronously when `Cmd::Start` is received by the background + /// thread, cleared synchronously when `Cmd::Stop` completes. Broader + /// than `running` (which only flips after the MITM manager has + /// finished loading). Used to block `Remove CA` during the window + /// between start-click and `running = true` — otherwise a queued + /// `Cmd::RemoveCa` could delete `ca/` while the server is partway + /// through loading the keypair into memory. 
+    proxy_active: bool,
     /// One-line status of the most recent download (Ok(path) or Err(msg)).
     last_download: Option<Result<PathBuf, String>>,
     last_download_at: Option<Instant>,
@@ -139,6 +173,7 @@ enum Cmd {
     Stop,
     Test(Config),
     InstallCa,
+    RemoveCa,
     CheckCaTrusted,
     PollStats,
     /// Probe a single SNI against the given google_ip. Result is written
@@ -209,11 +244,27 @@ struct FormState {
     show_log: bool,
     fetch_ips_from_api: bool,
     max_ips_to_scan: usize,
-    scan_batch_size:usize,
+    scan_batch_size: usize,
     google_ip_validation: bool,
     normalize_x_graphql: bool,
     youtube_via_relay: bool,
     passthrough_hosts: Vec<String>,
+    /// Round-tripped from config.json so the UI's save path doesn't
+    /// drop the user's setting. Not currently exposed as a UI control;
+    /// users edit `block_quic` directly in `config.json` (Issue #213).
+    block_quic: bool,
+    /// Round-tripped from config.json. Not exposed as a UI control —
+    /// users edit `disable_padding` directly when needed (Issue #391).
+    /// Default false (padding active).
+    disable_padding: bool,
+    /// Round-tripped from config.json. Not exposed in the UI form yet —
+    /// the bypass-DoH default is the right answer for almost everyone
+    /// (DoH already encrypts, the tunnel was just adding latency), so
+    /// this is a config-only opt-out. See config.rs `tunnel_doh`.
+    tunnel_doh: bool,
+    /// User-supplied DoH hostnames added to the built-in default list,
+    /// round-tripped from config.json. See config.rs `bypass_doh_hosts`.
+    bypass_doh_hosts: Vec<String>,
 }
 
 #[derive(Clone, Debug)]
@@ -254,7 +305,10 @@ fn load_form() -> (FormState, Option<String>) {
             }
         }
     } else {
-        tracing::info!("config: no config found at {} — starting with defaults", path.display());
+        tracing::info!(
+            "config: no config found at {} — starting with defaults",
+            path.display()
+        );
         (None, None)
     };
     let form = if let Some(c) = existing {
@@ -286,13 +340,17 @@ fn load_form() -> (FormState, Option<String>) {
             sni_custom_input: String::new(),
             sni_editor_open: false,
             show_log: true,
-            fetch_ips_from_api:c.fetch_ips_from_api,
-            max_ips_to_scan:c.max_ips_to_scan,
+            fetch_ips_from_api: c.fetch_ips_from_api,
+            max_ips_to_scan: c.max_ips_to_scan,
             google_ip_validation: c.google_ip_validation,
-            scan_batch_size:c.scan_batch_size,
+            scan_batch_size: c.scan_batch_size,
             normalize_x_graphql: c.normalize_x_graphql,
             youtube_via_relay: c.youtube_via_relay,
             passthrough_hosts: c.passthrough_hosts.clone(),
+            block_quic: c.block_quic,
+            disable_padding: c.disable_padding,
+            tunnel_doh: c.tunnel_doh,
+            bypass_doh_hosts: c.bypass_doh_hosts.clone(),
         }
     } else {
         FormState {
@@ -313,13 +371,17 @@ fn load_form() -> (FormState, Option<String>) {
             sni_custom_input: String::new(),
             sni_editor_open: false,
             show_log: true,
-            fetch_ips_from_api:false,
-            max_ips_to_scan:100,
-            google_ip_validation:true,
-            scan_batch_size:500,
+            fetch_ips_from_api: false,
+            max_ips_to_scan: 100,
+            google_ip_validation: true,
+            scan_batch_size: 500,
             normalize_x_graphql: false,
             youtube_via_relay: false,
             passthrough_hosts: Vec::new(),
+            block_quic: false,
+            disable_padding: false,
+            tunnel_doh: false,
+            bypass_doh_hosts: Vec::new(),
         }
     };
     (form, load_err)
@@ -450,10 +512,10 @@ impl FormState {
                     Some(active)
                 }
             },
-            fetch_ips_from_api:self.fetch_ips_from_api,
+            fetch_ips_from_api: self.fetch_ips_from_api,
             max_ips_to_scan: self.max_ips_to_scan,
-            google_ip_validation:self.google_ip_validation,
-            scan_batch_size:self.scan_batch_size,
+            google_ip_validation: self.google_ip_validation,
+            scan_batch_size: self.scan_batch_size,
             normalize_x_graphql: self.normalize_x_graphql,
             // UI form doesn't expose youtube_via_relay yet — it's a
             // config-only flag for now. Passed through from the loaded
@@ -462,6 +524,18 @@
             // Similarly config-only for now; round-trips through the
             // file so the UI doesn't drop the user's entries on save.
             passthrough_hosts: self.passthrough_hosts.clone(),
+            // Issue #213: block_quic is config-only for now (no UI
+            // control yet). Round-trip through the file so save
+            // doesn't drop a user-set true.
+            block_quic: self.block_quic,
+            // Issue #391: disable_padding is config-only for now.
+            // Round-trip preserves the user's choice.
+            disable_padding: self.disable_padding,
+            // DoH bypass is enabled-by-default with `tunnel_doh = false`.
+            // Round-trip the user's choice (and any extra hostnames they
+            // added) so save doesn't drop them.
+            tunnel_doh: self.tunnel_doh,
+            bypass_doh_hosts: self.bypass_doh_hosts.clone(),
         })
     }
 }
@@ -513,6 +587,12 @@ struct ConfigWire<'a> {
     max_ips_to_scan: usize,
     scan_batch_size: usize,
     google_ip_validation: bool,
+    /// Default false (= bypass DoH). Only emitted when explicitly true
+    /// so unchanged configs stay clean.
+    #[serde(skip_serializing_if = "is_false")]
+    tunnel_doh: bool,
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    bypass_doh_hosts: &'a Vec<String>,
 }
 
 fn is_false(b: &bool) -> bool {
@@ -561,6 +641,8 @@ impl<'a> From<&'a Config> for ConfigWire<'a> {
             max_ips_to_scan: c.max_ips_to_scan,
             scan_batch_size: c.scan_batch_size,
             google_ip_validation: c.google_ip_validation,
+            tunnel_doh: c.tunnel_doh,
+            bypass_doh_hosts: &c.bypass_doh_hosts,
         }
     }
 }
@@ -584,10 +666,7 @@ fn section(ui: &mut egui::Ui, title: &str, body: impl FnOnce(&mut egui::Ui)) {
     ui.add_space(2.0);
     let frame = egui::Frame::none()
         .fill(egui::Color32::from_rgb(28, 30, 34))
-        .stroke(egui::Stroke::new(
-            1.0,
-            egui::Color32::from_rgb(50, 54, 60),
-        ))
+        .stroke(egui::Stroke::new(1.0, egui::Color32::from_rgb(50, 54, 60)))
         .rounding(6.0)
         .inner_margin(egui::Margin::same(10.0));
     frame.show(ui, body);
@@ -596,10 +675,14 @@
 /// A primary accent-filled button. Used for the headline action in a row
 /// (Start / Stop / SNI pool).
 fn primary_button(text: &str) -> egui::Button<'_> {
-    egui::Button::new(egui::RichText::new(text).color(egui::Color32::WHITE).strong())
-        .fill(ACCENT)
-        .min_size(egui::vec2(120.0, 28.0))
-        .rounding(4.0)
+    egui::Button::new(
+        egui::RichText::new(text)
+            .color(egui::Color32::WHITE)
+            .strong(),
+    )
+    .fill(ACCENT)
+    .min_size(egui::vec2(120.0, 28.0))
+    .rounding(4.0)
 }
 
 /// A compact form row: label on the left (fixed width for vertical alignment),
@@ -945,7 +1028,12 @@ impl eframe::App for App {
         ui.horizontal(|ui| {
             if ui.add(primary_button("Save config")).clicked() {
                 match self.form.to_config().and_then(|c| save_config(&c)) {
-                    Ok(p) => self.toast = Some((format!("Saved to {}", p.display()), Instant::now())),
+                    Ok(p) => {
+                        // Apply the new log level live so users don't have to
+                        // restart for the combobox to take effect (#401).
+                        apply_log_level(&self.form.log_level);
+                        self.toast = Some((format!("Saved to {}", p.display()), Instant::now()));
+                    }
                     Err(e) => self.toast = Some((format!("Save failed: {}", e), Instant::now())),
                 }
             }
@@ -1074,7 +1162,7 @@
                 ),
             ),
             ("bytes today", fmt_bytes(s.today_bytes)),
-            ("UTC day", s.today_key.clone()),
+            ("PT day", s.today_key.clone()),
             ("resets in", reset_str),
         ];
         egui::Grid::new("usage_today")
@@ -1209,9 +1297,54 @@
             // Secondary actions — smaller, grouped together on their own line.
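Aside on the ConfigWire change above: the two `skip_serializing_if` attributes are what keep an untouched config.json byte-stable on save. A minimal standalone sketch of the same mechanism (the `Wire` struct and `main` are illustrative, assuming serde and serde_json as dependencies; the field names mirror the diff):

```rust
use serde::Serialize;

fn is_false(b: &bool) -> bool {
    !*b
}

#[derive(Serialize)]
struct Wire<'a> {
    // Emitted only when explicitly true, so untouched configs stay clean.
    #[serde(skip_serializing_if = "is_false")]
    tunnel_doh: bool,
    // Emitted only when the user actually added hostnames.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    bypass_doh_hosts: &'a Vec<String>,
}

fn main() {
    let none: Vec<String> = Vec::new();
    let defaults = Wire { tunnel_doh: false, bypass_doh_hosts: &none };
    // Both fields are at their defaults, so neither key is written out.
    assert_eq!(serde_json::to_string(&defaults).unwrap(), "{}");

    let hosts = vec!["dns.example".to_string()];
    let set = Wire { tunnel_doh: true, bypass_doh_hosts: &hosts };
    assert_eq!(
        serde_json::to_string(&set).unwrap(),
        r#"{"tunnel_doh":true,"bypass_doh_hosts":["dns.example"]}"#
    );
}
```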
ui.add_space(4.0); ui.horizontal(|ui| { - if ui.small_button("Install CA").clicked() { - let _ = self.cmd_tx.send(Cmd::InstallCa); - } + // Install CA and Remove CA share a single in-flight flag + // so back-to-back clicks can't race — an in-flight + // install would otherwise re-trust the CA after Remove + // deleted it (or vice versa). Both buttons disable when + // either op is running. + let (cert_op_in_flight, proxy_active) = { + let s = self.shared.state.lock().unwrap(); + (s.cert_op_in_progress, s.proxy_active) + }; + + let install_hover = if cert_op_in_flight { + "A cert install/remove is already in progress." + } else { + "Install the MITM CA into the OS trust store (and NSS if certutil \ + is available)." + }; + ui.add_enabled_ui(!cert_op_in_flight, |ui| { + if ui + .small_button("Install CA") + .on_hover_text(install_hover) + .clicked() + { + let _ = self.cmd_tx.send(Cmd::InstallCa); + } + }); + + let remove_hover = if proxy_active || running { + "Stop the proxy first — the CA keypair is held in memory by the \ + running MITM engine, and removing it now would break HTTPS for \ + every site until restart." + } else if cert_op_in_flight { + "A cert install/remove is already in progress." + } else { + "Remove the MITM CA from the OS trust store (verified by name) \ + and delete the on-disk ca/ directory. NSS cleanup (Firefox/Chrome) \ + is best-effort and logs a hint if certutil is missing or a browser \ + has the DB locked. A fresh CA is generated the next time you start \ + the proxy. Your config.json and the Apps Script deployment are NOT \ + touched — no need to redeploy Code.gs." + }; + ui.add_enabled_ui(!proxy_active && !running && !cert_op_in_flight, |ui| { + if ui.small_button("Remove CA") + .on_hover_text(remove_hover) + .clicked() + { + let _ = self.cmd_tx.send(Cmd::RemoveCa); + } + }); if ui.small_button("Check CA").clicked() { let _ = self.cmd_tx.send(Cmd::CheckCaTrusted); } @@ -1736,13 +1869,16 @@ fn background_thread(shared: Arc, rx: Receiver) { }); } } - // In background_thread function, modify the Cmd::Start handler: Ok(Cmd::Start(cfg)) => { if active.is_some() { push_log(&shared, "[ui] already running"); continue; } push_log(&shared, "[ui] starting proxy..."); + // Flip proxy_active synchronously so a `Remove CA` click + // queued in the same frame as Start is rejected before + // the MITM manager begins loading. + shared.state.lock().unwrap().proxy_active = true; let shared2 = shared.clone(); let fronter_slot: Arc>>> = Arc::new(AsyncMutex::new(None)); @@ -1756,7 +1892,9 @@ fn background_thread(shared: Arc, rx: Receiver) { Ok(m) => m, Err(e) => { push_log(&shared2, &format!("[ui] MITM init failed: {}", e)); - shared2.state.lock().unwrap().running = false; + let mut s = shared2.state.lock().unwrap(); + s.running = false; + s.proxy_active = false; return; } }; @@ -1765,7 +1903,9 @@ fn background_thread(shared: Arc, rx: Receiver) { Ok(s) => s, Err(e) => { push_log(&shared2, &format!("[ui] proxy build failed: {}", e)); - shared2.state.lock().unwrap().running = false; + let mut st = shared2.state.lock().unwrap(); + st.running = false; + st.proxy_active = false; return; } }; @@ -1792,8 +1932,15 @@ fn background_thread(shared: Arc, rx: Receiver) { push_log(&shared2, &format!("[ui] proxy error: {}", e)); } - shared2.state.lock().unwrap().running = false; - shared2.state.lock().unwrap().started_at = None; + { + let mut st = shared2.state.lock().unwrap(); + st.running = false; + st.started_at = None; + // Self-exit path (e.g. 
bind error after startup, + // or normal shutdown without Cmd::Stop). The + // Stop handler clears this too — either is fine. + st.proxy_active = false; + } push_log(&shared2, "[ui] proxy stopped"); }); @@ -1819,8 +1966,10 @@ fn background_thread(shared: Arc, rx: Receiver) { } }); - shared.state.lock().unwrap().running = false; - shared.state.lock().unwrap().started_at = None; + let mut st = shared.state.lock().unwrap(); + st.running = false; + st.started_at = None; + st.proxy_active = false; } } @@ -1848,29 +1997,106 @@ fn background_thread(shared: Arc, rx: Receiver) { }); } Ok(Cmd::InstallCa) => { + // Share the cert-op flag with Remove CA so the two + // can't race. Gate and flip before spawning; the worker + // clears on exit. + { + let mut st = shared.state.lock().unwrap(); + if st.cert_op_in_progress { + push_log( + &shared, + "[ui] cert op already in progress — ignoring duplicate install", + ); + continue; + } + st.cert_op_in_progress = true; + } let shared2 = shared.clone(); std::thread::spawn(move || { push_log(&shared2, "[ui] installing CA..."); let base = data_dir::data_dir(); - if let Err(e) = MitmCertManager::new_in(&base) { - push_log(&shared2, &format!("[ui] CA init failed: {}", e)); - return; - } - let ca = base.join(CA_CERT_FILE); - match install_ca(&ca) { - Ok(()) => { - push_log(&shared2, "[ui] CA install ok"); - let mut st = shared2.state.lock().unwrap(); + let result = (|| -> Result<(), String> { + if let Err(e) = MitmCertManager::new_in(&base) { + return Err(format!("CA init failed: {}", e)); + } + let ca = base.join(CA_CERT_FILE); + install_ca(&ca).map_err(|e| format!("CA install failed: {}", e)) + })(); + { + let mut st = shared2.state.lock().unwrap(); + st.cert_op_in_progress = false; + if result.is_ok() { st.ca_trusted = Some(true); st.ca_trusted_at = Some(Instant::now()); } - Err(e) => { - push_log(&shared2, &format!("[ui] CA install failed: {}", e)); + } + match result { + Ok(()) => push_log(&shared2, "[ui] CA install ok"), + Err(msg) => { + push_log(&shared2, &format!("[ui] {}", msg)); push_log(&shared2, "[ui] hint: run the terminal binary with sudo/admin: mhrv-rs --install-cert"); } } }); } + Ok(Cmd::RemoveCa) => { + // Authoritative proxy-active guard: the UI button is + // disabled when proxy_active/running is set, but a + // Cmd::RemoveCa may already be queued by the time the + // Start handler flips the flag. `active` is owned by + // this thread so its state is the real source of truth + // — reject removal any time a proxy handle is alive, + // whether it's still starting or fully running. + if active.is_some() { + push_log( + &shared, + "[ui] cannot remove CA: proxy is running or starting — stop it first", + ); + continue; + } + // Shared cert-op gate: covers Install CA too, so back- + // to-back Install → Remove clicks can't race. The + // button is already disabled while this is set, but a + // queued command can still arrive here. 
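Reduced to essentials, the revalidation pattern this handler applies: the worker thread owns the authoritative state, so every queued command is re-checked at execution time instead of trusting a button that was enabled a frame earlier. A standalone sketch (the names are illustrative, not this codebase's types):

```rust
use std::sync::mpsc;
use std::thread;

enum Cmd {
    Start,
    RemoveCa,
}

fn main() {
    let (tx, rx) = mpsc::channel::<Cmd>();
    // Both clicks can land in the same frame, queued before the worker
    // has processed either: exactly the window described above.
    tx.send(Cmd::Start).unwrap();
    tx.send(Cmd::RemoveCa).unwrap();
    drop(tx);

    thread::spawn(move || {
        // Worker-owned state: the authoritative answer, not the UI's
        // button enablement at the time of the click.
        let mut active = false;
        for cmd in rx {
            match cmd {
                Cmd::Start => active = true,
                Cmd::RemoveCa if active => {
                    println!("rejected: proxy is running or starting")
                }
                Cmd::RemoveCa => println!("removing CA"),
            }
        }
    })
    .join()
    .unwrap();
}
```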
+ { + let mut st = shared.state.lock().unwrap(); + if st.cert_op_in_progress { + push_log( + &shared, + "[ui] cert op already in progress — ignoring duplicate remove", + ); + continue; + } + st.cert_op_in_progress = true; + } + let shared2 = shared.clone(); + std::thread::spawn(move || { + push_log(&shared2, "[ui] removing CA (trust store + files)..."); + let base = data_dir::data_dir(); + let result = remove_ca(&base); + { + let mut st = shared2.state.lock().unwrap(); + st.cert_op_in_progress = false; + if result.is_ok() { + st.ca_trusted = Some(false); + st.ca_trusted_at = Some(Instant::now()); + } + } + match result { + Ok(outcome) => { + push_log(&shared2, &format!("[ui] {}", outcome.summary())); + push_log( + &shared2, + "[ui] config.json and Apps Script deployment untouched", + ); + } + Err(e) => { + push_log(&shared2, &format!("[ui] CA remove failed: {}", e)); + push_log(&shared2, "[ui] hint: run the terminal binary with sudo/admin: mhrv-rs --remove-cert"); + } + } + }); + } Ok(Cmd::TestSni { google_ip, sni }) => { let shared2 = shared.clone(); { @@ -1915,7 +2141,21 @@ fn background_thread(shared: Arc, rx: Receiver) { std::thread::spawn(move || { let base = data_dir::data_dir(); let ca = base.join(CA_CERT_FILE); - let trusted = mhrv_rs::cert_installer::is_ca_trusted(&ca); + let file_exists = ca.exists(); + // Probe the trust store by name — independent of + // whether the on-disk ca.crt happens to be there. + // The file and the trust-store entry can be out of + // sync (e.g. after a partial removal), and that + // mismatch is exactly what Check CA must surface. + let trusted = mhrv_rs::cert_installer::is_ca_trusted_by_name(); + push_log( + &shared2, + &format!( + "[ui] check CA: file={} trust_store={}", + if file_exists { "present" } else { "missing" }, + if trusted { "trusted" } else { "not trusted" }, + ), + ); let mut st = shared2.state.lock().unwrap(); st.ca_trusted = Some(trusted); st.ca_trusted_at = Some(Instant::now()); @@ -1930,7 +2170,10 @@ fn background_thread(shared: Arc, rx: Receiver) { } rt.spawn(async move { let result = mhrv_rs::update_check::check(route).await; - push_log(&shared2, &format!("[ui] update check: {}", result.summary())); + push_log( + &shared2, + &format!("[ui] update check: {}", result.summary()), + ); { let mut st = shared2.state.lock().unwrap(); st.last_update_check = Some(UpdateProbeState::Done(result)); @@ -1990,14 +2233,19 @@ fn background_thread(shared: Arc, rx: Receiver) { /// Install a tracing subscriber that mirrors every log event into the UI's /// Recent log panel. /// -/// Respects `RUST_LOG` if set. Otherwise defaults to `info` — which is what -/// users mean when they pick a non-default log level in the form. (trace / -/// debug flip too much noise for a local GUI, so the combo-box changes level -/// live via the `reload` handle that `with_env_filter` gives us but we keep -/// the default boot-time level at info so first-run behavior is sensible.) -fn install_ui_tracing(shared: Arc) { +/// Filter precedence (issue #401, v1.8.2): +/// 1. `RUST_LOG` env var, if set +/// 2. The saved form's `log_level` (passed in from the loaded config) +/// 3. `info,hyper=warn` as a sensible default +/// +/// The constructed filter is wrapped in a `reload::Layer` and the handle +/// is stashed in `LOG_RELOAD` so that a Save inside the running UI can +/// reinstall the filter without a restart. See `apply_log_level`. 
+fn install_ui_tracing(shared: Arc<Shared>, config_level: &str) {
     use tracing_subscriber::fmt::MakeWriter;
-    use tracing_subscriber::EnvFilter;
+    use tracing_subscriber::layer::SubscriberExt;
+    use tracing_subscriber::util::SubscriberInitExt;
+    use tracing_subscriber::{reload, EnvFilter};
 
     /// A MakeWriter that pushes each line into the shared log panel.
     struct UiLogWriter {
         shared: Arc<Shared>,
     }
@@ -2051,19 +2299,71 @@
         }
     }
 
-    let filter =
-        EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info,hyper=warn"));
+    // RUST_LOG > config.log_level > "info,hyper=warn"
+    let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| {
+        let trimmed = config_level.trim();
+        if trimmed.is_empty() {
+            EnvFilter::new("info,hyper=warn")
+        } else {
+            EnvFilter::try_new(trimmed).unwrap_or_else(|_| EnvFilter::new("info,hyper=warn"))
+        }
+    });
+
+    let (filter_layer, reload_handle) = reload::Layer::new(filter);
+    if LOG_RELOAD.set(reload_handle).is_err() {
+        // Already initialized — install_ui_tracing got called twice. Bail
+        // silently rather than panic; the existing subscriber stays live.
+        return;
+    }
 
     let writer = UiLogWriter { shared };
-    let _ = tracing_subscriber::fmt()
-        .with_env_filter(filter)
+    let fmt_layer = tracing_subscriber::fmt::layer()
         .with_target(false)
         .with_ansi(false)
-        .with_writer(writer)
+        .with_writer(writer);
+
+    let _ = tracing_subscriber::registry()
+        .with(filter_layer)
+        .with(fmt_layer)
         .try_init();
 }
 
+/// Reload handle for the UI's tracing EnvFilter — populated once at startup
+/// by `install_ui_tracing`. `apply_log_level` uses it to swap in a new
+/// filter when the user clicks Save with a different log level (#401).
+static LOG_RELOAD: std::sync::OnceLock<
+    tracing_subscriber::reload::Handle<
+        tracing_subscriber::EnvFilter,
+        tracing_subscriber::Registry,
+    >,
+> = std::sync::OnceLock::new();
+
+/// Reinstall the tracing filter at runtime. Called from the Save handler
+/// so the user's new `log_level` takes effect without a restart. RUST_LOG
+/// still wins if it was set at process start — explicit override beats
+/// config in both directions.
+fn apply_log_level(level: &str) {
+    use tracing_subscriber::EnvFilter;
+    let Some(handle) = LOG_RELOAD.get() else {
+        return;
+    };
+    if std::env::var_os("RUST_LOG").is_some() {
+        // RUST_LOG was set explicitly at boot — don't silently override.
+        return;
+    }
+    let trimmed = level.trim();
+    let new = if trimmed.is_empty() {
+        EnvFilter::new("info,hyper=warn")
+    } else {
+        match EnvFilter::try_new(trimmed) {
+            Ok(f) => f,
+            Err(_) => return,
+        }
+    };
+    let _ = handle.modify(|f| *f = new);
+}
+
 /// Where we drop downloaded release assets. Prefer the OS user Downloads
 /// dir (via the directories crate that's already in our tree), fall back
 /// to the user-data dir for platforms that don't expose one (edge case).
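Condensed to its skeleton before the next file, the reload machinery the ui.rs changes above install: build the EnvFilter with RUST_LOG-then-config-then-default precedence, wrap it in a reload layer, stash the handle, and swap filters later without re-initializing. A minimal standalone sketch using the same tracing-subscriber APIs (`init_tracing` and `set_level` are illustrative names, not functions from this diff):

```rust
use std::sync::OnceLock;

use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::{reload, EnvFilter, Registry};

// One-time home for the reload handle, mirroring the LOG_RELOAD static above.
static RELOAD: OnceLock<reload::Handle<EnvFilter, Registry>> = OnceLock::new();

fn init_tracing(config_level: &str) {
    // Precedence: RUST_LOG env var, then the saved config level, then a default.
    let filter = EnvFilter::try_from_default_env()
        .or_else(|_| EnvFilter::try_new(config_level))
        .unwrap_or_else(|_| EnvFilter::new("info"));
    let (filter_layer, handle) = reload::Layer::new(filter);
    let _ = RELOAD.set(handle);
    tracing_subscriber::registry()
        .with(filter_layer)
        .with(tracing_subscriber::fmt::layer())
        .init();
}

fn set_level(spec: &str) {
    // Swap the live filter in place; no subscriber re-initialization needed.
    if let (Some(handle), Ok(new)) = (RELOAD.get(), EnvFilter::try_new(spec)) {
        let _ = handle.modify(|filter| *filter = new);
    }
}

fn main() {
    init_tracing("debug");
    tracing::debug!("visible at debug");
    set_level("warn");
    tracing::debug!("now filtered out");
}
```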
diff --git a/src/cert_installer.rs b/src/cert_installer.rs index 0d6eb21e..caff2835 100644 --- a/src/cert_installer.rs +++ b/src/cert_installer.rs @@ -1,7 +1,7 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::Command; -use crate::mitm::CERT_NAME; +use crate::mitm::{CA_DIR, CERT_NAME}; #[derive(Debug, thiserror::Error)] pub enum InstallError { @@ -11,6 +11,180 @@ pub enum InstallError { Failed, #[error("unsupported platform: {0}")] Unsupported(String), + #[error("io {path}: {source}")] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + #[error("CA still trusted after removal — re-run with admin/sudo")] + RemovalIncomplete, +} + +/// Structured outcome of a successful `remove_ca` call. The OS trust +/// store is always fully clean when we return `Ok(_)` (that's verified +/// by `is_ca_trusted_by_name` before file deletion), but NSS cleanup is +/// best-effort — callers need the nuance to print accurate status. +/// +/// UI/CLI should treat `Clean` as "nothing more to do" and +/// `NssIncomplete` as a non-fatal warning ("OS CA removed, browser +/// cleanup partial — follow the logged hint"). +#[derive(Debug, Clone, Copy)] +pub enum RemovalOutcome { + Clean, + NssIncomplete(NssReport), +} + +impl RemovalOutcome { + /// One-line summary suitable for a log line or status banner. + pub fn summary(&self) -> String { + match self { + RemovalOutcome::Clean => "CA removed.".to_string(), + RemovalOutcome::NssIncomplete(r) if r.tool_missing_with_stores_present => { + "OS CA removed. NSS cleanup skipped — NSS certutil not found.".to_string() + } + RemovalOutcome::NssIncomplete(r) => format!( + "OS CA removed. NSS cleanup partial: {}/{} browser stores updated.", + r.ok, r.tried + ), + } + } +} + +/// When running as root via `sudo`, the process's `HOME` / `USER` +/// environment reflects **root**, not the user who invoked the command. +/// That breaks every user-scoped cert path this module touches — +/// `data_dir()` resolves to root's config dir, `firefox_profile_dirs()` +/// scans root's profiles, macOS `login.keychain-db` is root's. The +/// removal then operates on paths that probably don't exist, reports +/// success, and leaves the real user's CA trusted. +/// +/// This helper detects the real `sudo` case (`geteuid() == 0` AND +/// `SUDO_USER` set to a non-root user), resolves the invoking user's +/// home dir (SUDO_HOME, `getent passwd`, or `/Users/$SUDO_USER` / +/// `/home/$SUDO_USER` fallback), and rewrites `HOME` for the remainder +/// of the process. The EUID gate is load-bearing: `SUDO_USER` alone is +/// not proof of elevation (a user can export it, inherit it, or use +/// `sudo -E`), and blindly trusting it would let a non-root process +/// redirect config/CA/profile operations to another user's files. +/// Call once at the top of `main` in every binary (CLI + UI) before +/// anything else reads HOME. No-op on Windows (UAC keeps the user's +/// HOME intact) and on non-sudo Unix invocations. +pub fn reconcile_sudo_environment() { + #[cfg(unix)] + unix::reconcile_sudo_home(); +} + +#[cfg(unix)] +mod unix { + use super::{should_reconcile_for, sudo_parse_passwd_home}; + use std::path::Path; + use std::process::Command; + + pub(super) fn reconcile_sudo_home() { + // SAFETY: geteuid() is async-signal-safe and cannot fail. 
+        let euid = unsafe { libc::geteuid() };
+        let sudo_user_raw = std::env::var("SUDO_USER").ok();
+        let Some(sudo_user) = should_reconcile_for(euid, sudo_user_raw.as_deref()) else {
+            return;
+        };
+        let sudo_user = sudo_user.to_string();
+        match resolve_home(&sudo_user) {
+            Some(home) => {
+                tracing::info!(
+                    "Detected sudo invocation (SUDO_USER={}): re-rooting HOME to {} \
+                     so user-scoped cert paths target the real user.",
+                    sudo_user,
+                    home
+                );
+                // SAFETY: reconcile_sudo_environment runs at the top of
+                // main() before any other thread is spawned and before
+                // any code has cached HOME.
+                std::env::set_var("HOME", home);
+            }
+            None => {
+                tracing::warn!(
+                    "Running under sudo (SUDO_USER={}), but could not resolve \
+                     the user's home dir. Cert paths will operate on root's \
+                     HOME — which may NOT match where you installed the CA. \
+                     Prefer running without sudo; the app invokes sudo \
+                     internally for system-level steps.",
+                    sudo_user
+                );
+            }
+        }
+    }
+
+    fn resolve_home(sudo_user: &str) -> Option<String> {
+        // Some sudoers configs export SUDO_HOME; prefer it when present.
+        if let Ok(h) = std::env::var("SUDO_HOME") {
+            if !h.is_empty() {
+                return Some(h);
+            }
+        }
+        // Linux: `getent passwd <user>` returns the full passwd entry.
+        if let Ok(out) = Command::new("getent").args(["passwd", sudo_user]).output() {
+            if out.status.success() {
+                let line = String::from_utf8_lossy(&out.stdout);
+                if let Some(h) = sudo_parse_passwd_home(&line) {
+                    return Some(h);
+                }
+            }
+        }
+        // macOS has no getent. Fall back to the convention for both
+        // platforms — verify the dir actually exists before returning.
+        for root in ["/Users", "/home"] {
+            let candidate = format!("{}/{}", root, sudo_user);
+            if Path::new(&candidate).exists() {
+                return Some(candidate);
+            }
+        }
+        None
+    }
+}
+
+/// Decide whether to re-root HOME for a sudo-style invocation, given a
+/// process's effective UID and the value of the `SUDO_USER` env var.
+/// Returns `Some(user)` if and only if we should re-root HOME to that
+/// user's dir; `None` in every other case (normal user, real root
+/// login without sudo, SUDO_USER missing / empty / literally "root").
+///
+/// Extracted as a pure function so every branch — including the
+/// load-bearing `euid == 0 && SUDO_USER unset` path that must leave
+/// HOME as root's own /root — can be asserted with unit tests.
+/// Always compiled so the tests run on every host.
+fn should_reconcile_for<'a>(euid: u32, sudo_user: Option<&'a str>) -> Option<&'a str> {
+    // EUID gate: if we're not actually root, `SUDO_USER` could be
+    // anything (inherited from a shell init, explicitly exported,
+    // set via `sudo -E`) and rewriting HOME based on it would let a
+    // normal-user process redirect cert paths to someone else's files.
+    if euid != 0 {
+        return None;
+    }
+    // Real root login (no sudo) — SUDO_USER is simply unset. Do NOT
+    // re-root: root's own /root is the correct HOME for that process.
+    let user = sudo_user?;
+    // Empty string or literal "root" also mean "nothing to reconcile".
+    if user.is_empty() || user == "root" {
+        return None;
+    }
+    Some(user)
+}
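For reference, the passwd(5) record shape the parser below consumes: seven colon-separated fields with the home directory sixth. A tiny self-contained check (the user entry is hypothetical):

```rust
fn main() {
    // name:pw:uid:gid:gecos:home:shell, as printed by `getent passwd alice`.
    let entry = "alice:x:1000:1000:Alice Example:/home/alice:/bin/bash";
    let home = entry.split(':').nth(5).map(str::trim).unwrap_or("");
    assert_eq!(home, "/home/alice");
}
```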
+/// Pure parser for a single-line `getent passwd` entry.
+/// Always compiled so unit tests can run on every host.
+fn sudo_parse_passwd_home(content: &str) -> Option<String> {
+    let line = content.lines().next()?;
+    let fields: Vec<&str> = line.split(':').collect();
+    // passwd format: name:pw:uid:gid:gecos:home:shell
+    if fields.len() < 7 {
+        return None;
+    }
+    let home = fields[5].trim();
+    if home.is_empty() {
+        return None;
+    }
+    Some(home.to_string())
+}
 
 /// Install the CA certificate at `path` into the system trust store.
@@ -46,12 +220,108 @@ pub fn install_ca(path: &Path) -> Result<(), InstallError> {
     }
 }
 
+/// Remove the CA from the OS trust store, best-effort NSS stores (Firefox
+/// profiles + Chrome/Chromium on Linux), and delete the on-disk
+/// `ca/ca.crt` + `ca/ca.key`. A fresh CA will be regenerated the next
+/// time the proxy starts — and since the Apps Script deployment lives on
+/// Google's side and `config.json` is never touched here, the user does
+/// not have to redeploy `Code.gs` or re-enter their deployment ID.
+/// Platform-specific — may require admin/sudo for system stores.
+///
+/// Safety property: we verify the OS trust store with
+/// `is_ca_trusted_by_name` before deleting `ca/`. If the stale root is
+/// still trusted (e.g. because the system-store delete needed admin and
+/// we didn't have it), we return `RemovalIncomplete` and leave the
+/// on-disk files alone — a regenerated CA with a fresh keypair would
+/// otherwise mismatch the stale trusted root and silently break every
+/// HTTPS MITM leaf.
+pub fn remove_ca(base: &Path) -> Result<RemovalOutcome, InstallError> {
+    let os = std::env::consts::OS;
+    tracing::info!("Removing CA certificate on {}...", os);
+
+    // Platforms that merge anchor files into a bundle/database (Linux)
+    // must report whether the refresh step succeeded — the bundle may
+    // still contain the CA even after the anchor file is gone. macOS
+    // and Windows write directly to their stores, so there's nothing
+    // separate to refresh; they rely entirely on the by-name probe.
+    let platform_ok = match os {
+        "macos" => {
+            remove_macos();
+            true
+        }
+        "linux" => remove_linux(),
+        "windows" => {
+            remove_windows();
+            true
+        }
+        other => return Err(InstallError::Unsupported(other.to_string())),
+    };
+
+    // Verify OS trust store removal BEFORE touching browser state. If
+    // the OS removal didn't actually land (e.g. machine-store delete
+    // needed admin we don't have, or a Linux refresh cmd failed), we
+    // must not also strip NSS entries + the Firefox enterprise_roots
+    // pref — that leaves the system in an inconsistent "half-removed"
+    // state (OS still trusts, but Firefox is newly reconfigured) that
+    // only confuses the user. Returning RemovalIncomplete here keeps
+    // the install pristine so a retry is idempotent.
+    //
+    // Must be path-independent — the on-disk cert file may already be
+    // missing for unrelated reasons, and a file-gated check would then
+    // mask a still-trusted stale root.
+    if !platform_ok || is_ca_trusted_by_name() {
+        tracing::error!(
+            "MITM CA is still trusted after OS removal attempt \
+             (platform_ok={}) — refusing to touch browser state or \
+             delete on-disk files. Re-run with admin/sudo to complete \
+             revocation.",
+            platform_ok
+        );
+        return Err(InstallError::RemovalIncomplete);
+    }
+
+    // OS store is clean — only now mutate browser state.
+ let nss = remove_nss_stores(); + + let ca_dir = base.join(CA_DIR); + if ca_dir.exists() { + if let Err(e) = std::fs::remove_dir_all(&ca_dir) { + tracing::error!("failed to delete {}: {}", ca_dir.display(), e); + return Err(InstallError::Io { + path: ca_dir.clone(), + source: e, + }); + } + tracing::info!("Deleted CA files at {}", ca_dir.display()); + } + + if nss.is_clean() { + Ok(RemovalOutcome::Clean) + } else { + Ok(RemovalOutcome::NssIncomplete(nss)) + } +} + /// Heuristic check: is the CA already in the trust store? /// Best-effort — on unknown state we return false to always attempt install. +/// +/// The `path` guard skips the trust-store probe when the local CA file +/// is missing, because at install time "no file = nothing to trust" is a +/// useful shortcut. Revocation uses `is_ca_trusted_by_name` instead — +/// that path must verify the store regardless of whether the file still +/// exists, otherwise a pre-deleted `ca.crt` would mask a lingering +/// trusted root. pub fn is_ca_trusted(path: &Path) -> bool { if !path.exists() { return false; } + is_ca_trusted_by_name() +} + +/// Path-independent variant of `is_ca_trusted`: queries the OS trust +/// store by cert name (CERT_NAME) without requiring the on-disk cert +/// file. Used by `remove_ca` to verify revocation completed even if the +/// local `ca.crt` was already missing or deleted mid-flight. +pub fn is_ca_trusted_by_name() -> bool { match std::env::consts::OS { "macos" => is_trusted_macos(), "linux" => is_trusted_linux(), @@ -115,6 +385,73 @@ fn install_macos(cert_path: &str) -> bool { false } +/// Delete the CA from the login keychain (no sudo) and, only when a +/// probe confirms the cert actually lives there, the system keychain +/// (sudo). Probing first avoids prompting the user — or hanging the +/// UI's GUI-spawned `sudo` — for a password they don't need when the +/// cert was only ever installed in the login keychain (the default +/// path). Exit status is best-effort: `security delete-certificate` +/// exits non-zero for "not found", which is indistinguishable from +/// real failures, so the final trust state is verified by the caller +/// via `is_ca_trusted_by_name`. +fn remove_macos() { + let home = std::env::var("HOME").unwrap_or_default(); + let login_kc_db = format!("{}/Library/Keychains/login.keychain-db", home); + let login_kc = format!("{}/Library/Keychains/login.keychain", home); + let login_keychain = if Path::new(&login_kc_db).exists() { + login_kc_db + } else { + login_kc + }; + + let res = Command::new("security") + .args(["delete-certificate", "-c", CERT_NAME, &login_keychain]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from login keychain."); + } + + if macos_system_keychain_has() { + let res = Command::new("sudo") + .args([ + "security", + "delete-certificate", + "-c", + CERT_NAME, + "/Library/Keychains/System.keychain", + ]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from System keychain."); + } else { + tracing::warn!( + "System keychain still has the CA and the sudo delete did not \ + succeed — re-run with an admin password available." + ); + } + } +} + +/// Probe-without-sudo: does the System keychain currently contain our +/// cert? `security find-certificate` against the system keychain path +/// does not require admin; only `delete-certificate` does. Used to +/// decide whether to escalate at all. 
+fn macos_system_keychain_has() -> bool { + let out = Command::new("security") + .args([ + "find-certificate", + "-a", + "-c", + CERT_NAME, + "/Library/Keychains/System.keychain", + ]) + .output(); + match out { + Ok(o) => o.status.success() && !o.stdout.is_empty(), + Err(_) => false, + } +} + fn is_trusted_macos() -> bool { let out = Command::new("security") .args(["find-certificate", "-a", "-c", CERT_NAME]) @@ -142,7 +479,10 @@ fn install_linux(cert_path: &str) -> bool { try_copy_and_run(cert_path, &dest, &[&["update-ca-trust", "extract"]]) } "arch" => { - let dest = format!("/etc/ca-certificates/trust-source/anchors/{}.crt", safe_name); + let dest = format!( + "/etc/ca-certificates/trust-source/anchors/{}.crt", + safe_name + ); try_copy_and_run(cert_path, &dest, &[&["trust", "extract-compat"]]) } "openwrt" => { @@ -154,7 +494,8 @@ fn install_linux(cert_path: &str) -> bool { "OpenWRT detected: the router doesn't need to trust the MITM CA. \ Copy {} to each LAN client (browser / OS trust store) instead. \ Example: scp root@:{} ./ and import from there.", - cert_path, cert_path + cert_path, + cert_path ); true } @@ -253,7 +594,11 @@ fn classify_os_release(content: &str) -> String { Some(x) => x, None => continue, }; - let v = v.trim().trim_matches('"').trim_matches('\'').to_ascii_lowercase(); + let v = v + .trim() + .trim_matches('"') + .trim_matches('\'') + .to_ascii_lowercase(); match k.trim() { "ID" => id = v, "ID_LIKE" => id_like = v, @@ -281,13 +626,103 @@ fn classify_os_release(content: &str) -> String { "unknown".into() } +/// Mirror of `install_linux`: for each known anchor dir, delete our cert +/// file and run the corresponding refresh command. Tries without sudo +/// first, falls back to sudo. Missing files are silently skipped — +/// removal is idempotent. +/// +/// Key safety behavior: we refresh the trust bundle **regardless of +/// whether we found an anchor file to delete**. The concern is a retry +/// after a prior run that deleted the anchor but failed to refresh — +/// leaving the merged bundle still containing our PEM. On the next +/// invocation the anchor dir is empty, so a "delete file, then refresh" +/// contract would skip the refresh entirely and `remove_ca` would see +/// no anchor file left, declare success, and delete `ca/` while the +/// stale root is still trusted. Running the refresh unconditionally +/// catches this. +/// +/// Returns `false` if any refresh command failed — callers must then +/// abort file deletion so a regenerated CA with a fresh keypair can't +/// mismatch the stale root. +fn remove_linux() -> bool { + let safe_name = CERT_NAME.replace(' ', "_"); + let anchors: &[(&str, &[&str])] = &[ + ( + "/usr/local/share/ca-certificates", + &["update-ca-certificates"], + ), + ( + "/etc/pki/ca-trust/source/anchors", + &["update-ca-trust", "extract"], + ), + ( + "/etc/ca-certificates/trust-source/anchors", + &["trust", "extract-compat"], + ), + ]; + + let mut all_ok = true; + for (dir, refresh) in anchors { + // Skip distros whose anchor dir doesn't exist — running their + // refresh tool (e.g. `trust extract-compat` on a Debian host) + // would just error out and falsely mark the removal as failed. 
+ if !Path::new(dir).exists() { + continue; + } + + let path = format!("{}/{}.crt", dir, safe_name); + let anchor_present = Path::new(&path).exists(); + if anchor_present { + let deleted = + std::fs::remove_file(&path).is_ok() || run_cmd(&["sudo", "rm", "-f", &path]); + if !deleted { + tracing::warn!("failed to remove {}", path); + all_ok = false; + continue; + } + } + + // Always refresh — see doc comment for the retry-safety rationale. + let refreshed = run_cmd(refresh) || { + let mut full: Vec<&str> = vec!["sudo"]; + full.extend_from_slice(refresh); + run_cmd(&full) + }; + if !refreshed { + tracing::error!( + "refresh {:?} failed for {} — CA may still be trusted via the merged bundle", + refresh, + dir + ); + all_ok = false; + } else if anchor_present { + tracing::info!("Removed CA from {} (bundle refreshed).", dir); + } else { + tracing::debug!("Refreshed {} bundle (nothing to delete here).", dir); + } + } + all_ok +} + fn is_trusted_linux() -> bool { - let anchor_dirs = [ + // Check both the anchor dirs (what we write into on install) and + // the post-extract dirs (where update-ca-certificates / `trust + // extract-compat` etc. copy or symlink our PEM after refresh). + // Checking the post-extract side catches the "anchor file already + // removed but bundle not regenerated" case on a retry — if we only + // looked at anchor dirs, a `remove_ca` retry after a prior refresh + // failure could declare success while the merged bundle still + // contains our stale root. + let dirs = [ "/usr/local/share/ca-certificates", "/etc/pki/ca-trust/source/anchors", "/etc/ca-certificates/trust-source/anchors", + // Post-extract locations: + "/etc/ssl/certs", + "/etc/pki/ca-trust/extracted/pem/directory-hash", + "/etc/ca-certificates/extracted/cadir", ]; - for d in anchor_dirs { + for d in dirs { if let Ok(entries) = std::fs::read_dir(d) { for e in entries.flatten() { let name = e.file_name(); @@ -310,24 +745,33 @@ fn is_trusted_linux() -> bool { /// false on Windows, so the Check-CA button was misleading users into /// reinstalling a cert that was already trusted. fn is_trusted_windows() -> bool { - // `certutil -user -store Root ` prints the matching cert entries - // on success (stdout), and exits with a non-zero code plus a "Not - // found" message if nothing matches. We also check stdout for the - // cert name because certutil in some locales returns 0 even on no- - // match, just with empty output. - for args in [ - vec!["-user", "-store", "Root", CERT_NAME], - vec!["-store", "Root", CERT_NAME], - ] { - let out = Command::new("certutil").args(&args).output(); - if let Ok(o) = out { + windows_store_has(true) || windows_store_has(false) +} + +/// Query a single Windows Trusted Root store for our CA. +/// `user = true` hits the current-user store (no admin needed); +/// `user = false` hits the machine store. `certutil -store Root ` +/// prints the matching cert entries on success and exits non-zero with +/// "Not found" if nothing matches — we also check stdout for the cert +/// name because certutil in some locales returns 0 on no-match with +/// empty output. 
+fn windows_store_has(user: bool) -> bool { + let mut args: Vec<&str> = Vec::new(); + if user { + args.push("-user"); + } + args.extend(["-store", "Root", CERT_NAME]); + let out = Command::new("certutil").args(&args).output(); + match out { + Ok(o) => { let stdout = String::from_utf8_lossy(&o.stdout); - if o.status.success() && stdout.to_ascii_lowercase().contains(&CERT_NAME.to_ascii_lowercase()) { - return true; - } + o.status.success() + && stdout + .to_ascii_lowercase() + .contains(&CERT_NAME.to_ascii_lowercase()) } + Err(_) => false, } - false } fn install_windows(cert_path: &str) -> bool { @@ -355,6 +799,47 @@ fn install_windows(cert_path: &str) -> bool { false } +/// Delete from user and/or machine Trusted Root stores. We probe each +/// store first with `certutil -store` and only attempt the delete where +/// the cert actually lives — this avoids the confusing "needs elevation" +/// error that `-delstore Root` would print when the cert was only ever +/// installed in the per-user store (the default path for non-admin +/// runs). Final state is verified by the caller via `is_ca_trusted`. +fn remove_windows() { + let mut any = false; + + if windows_store_has(true) { + let res = Command::new("certutil") + .args(["-delstore", "-user", "Root", CERT_NAME]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from Windows user Trusted Root store."); + any = true; + } else { + tracing::warn!("failed to remove CA from Windows user Trusted Root store"); + } + } + + if windows_store_has(false) { + let res = Command::new("certutil") + .args(["-delstore", "Root", CERT_NAME]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from Windows machine Trusted Root store."); + any = true; + } else { + tracing::warn!( + "failed to remove CA from Windows machine Trusted Root store \ + (run as administrator to complete)" + ); + } + } + + if !any { + tracing::info!("No MITM CA found in Windows Trusted Root stores."); + } +} + // ---------- NSS (Firefox + Chrome/Chromium on Linux) ---------- /// Best-effort install of the CA into all discovered NSS stores: @@ -440,43 +925,36 @@ fn install_nss_stores(cert_path: &str) { /// certutil isn't typically installed so the certutil-based path doesn't /// fire there. /// -/// Existing user.js entries for other prefs are preserved by appending -/// rather than truncating. Idempotent. +/// We tag the block we write with a sentinel marker comment on the line +/// above the pref, so uninstall can prove ownership before removing it — +/// the user may have had `security.enterprise_roots.enabled = true` +/// before this app existed, and we must not silently revoke their +/// setting. Idempotent. fn enable_firefox_enterprise_roots() { - const PREF: &str = r#"user_pref("security.enterprise_roots.enabled", true);"#; let mut touched = 0; for profile in firefox_profile_dirs() { let user_js = profile.join("user.js"); let existing = std::fs::read_to_string(&user_js).unwrap_or_default(); - if existing.contains("security.enterprise_roots.enabled") { - // Already set by us or the user. Replace-or-keep: if they set it - // to false we leave their choice alone. If it's already our line - // verbatim, nothing to do. 
- if existing.contains(PREF) { - continue; + match add_enterprise_roots_block(&existing) { + EnterpriseRootsEdit::AddedBlock(new) => { + if let Err(e) = std::fs::write(&user_js, new) { + tracing::debug!( + "firefox profile {}: user.js write failed: {}", + profile.display(), + e + ); + continue; + } + touched += 1; + } + EnterpriseRootsEdit::AlreadyOurs => {} + EnterpriseRootsEdit::UserOwned => { + tracing::debug!( + "firefox profile {} already has a user-owned enterprise_roots pref; leaving alone", + profile.display() + ); } - // Different value present — don't overwrite. - tracing::debug!( - "firefox profile {} already has a different enterprise_roots pref; leaving alone", - profile.display() - ); - continue; - } - let mut out = existing; - if !out.is_empty() && !out.ends_with('\n') { - out.push('\n'); - } - out.push_str(PREF); - out.push('\n'); - if let Err(e) = std::fs::write(&user_js, out) { - tracing::debug!( - "firefox profile {}: user.js write failed: {}", - profile.display(), - e - ); - continue; } - touched += 1; } if touched > 0 { tracing::info!( @@ -486,16 +964,130 @@ fn enable_firefox_enterprise_roots() { } } +// ── Firefox enterprise_roots marker-block helpers (pure, testable) ── +// +// We write a two-line block into user.js — a sentinel comment followed +// by the pref itself. The marker proves we wrote it, so uninstall can +// distinguish our own line from a user-authored one with the same +// value. Any user-authored `security.enterprise_roots.enabled` line +// (with or without our marker above it) means "hands off". +const FX_MARKER: &str = "// mhrv-rs: auto-added, safe to strip with --remove-cert"; +const FX_PREF: &str = r#"user_pref("security.enterprise_roots.enabled", true);"#; + +#[derive(Debug, PartialEq, Eq)] +enum EnterpriseRootsEdit { + AddedBlock(String), + AlreadyOurs, + UserOwned, +} + +/// Append our marker+pref block to `existing` unless (a) it's already +/// there verbatim (idempotent no-op), or (b) the user has their own +/// `enterprise_roots` pref that we didn't write — in which case we +/// leave everything alone. +fn add_enterprise_roots_block(existing: &str) -> EnterpriseRootsEdit { + if contains_our_block(existing) { + return EnterpriseRootsEdit::AlreadyOurs; + } + if existing.contains("security.enterprise_roots.enabled") { + return EnterpriseRootsEdit::UserOwned; + } + let mut out = existing.to_string(); + if !out.is_empty() && !out.ends_with('\n') { + out.push('\n'); + } + out.push_str(FX_MARKER); + out.push('\n'); + out.push_str(FX_PREF); + out.push('\n'); + EnterpriseRootsEdit::AddedBlock(out) +} + +/// Strip our marker+pref block from `existing` if present. If the pref +/// exists without our marker directly above it, the user owns it — we +/// cannot prove otherwise and leave user.js untouched. +/// +/// Consequence for upgrades from pre-marker versions of this app: the +/// legacy bare pref line stays orphaned in user.js after uninstall. +/// That's cosmetic only (Firefox falls back to its built-in root store +/// the moment the CA leaves the OS trust store), and it's the +/// conservative tradeoff — a bare `enterprise_roots = true` line is +/// indistinguishable from a user- or enterprise-policy-authored one, +/// and silently revoking that would break unrelated Firefox trust +/// behavior. README documents the orphan. 
+fn strip_enterprise_roots_block(existing: &str) -> Option<String> {
+    if !contains_our_block(existing) {
+        return None;
+    }
+    let lines: Vec<&str> = existing.lines().collect();
+    let mut out: Vec<&str> = Vec::with_capacity(lines.len());
+    let mut i = 0;
+    while i < lines.len() {
+        let is_marker = lines[i].trim() == FX_MARKER;
+        let next_is_our_pref = lines.get(i + 1).map_or(false, |l| l.trim() == FX_PREF);
+        if is_marker && next_is_our_pref {
+            i += 2;
+            continue;
+        }
+        out.push(lines[i]);
+        i += 1;
+    }
+    let mut joined = out.join("\n");
+    if existing.ends_with('\n') && !joined.is_empty() {
+        joined.push('\n');
+    }
+    Some(joined)
+}
+
+/// True iff `existing` contains our sentinel directly above our pref.
+fn contains_our_block(existing: &str) -> bool {
+    let mut prev: Option<&str> = None;
+    for line in existing.lines() {
+        if prev.map(|p| p.trim()) == Some(FX_MARKER) && line.trim() == FX_PREF {
+            return true;
+        }
+        prev = Some(line);
+    }
+    false
+}
+
+/// True iff `existing` has our exact pref line but NOT inside our
+/// marker+pref block — i.e. an orphan `security.enterprise_roots.enabled
+/// = true` whose provenance we can't prove. Used by
+/// `disable_firefox_enterprise_roots` to surface a one-line hint on
+/// uninstall so users upgrading from pre-v1.2.13 installs know their
+/// Firefox user.js still has a cosmetic orphan pref from the old app
+/// (not broken, just left in place because we can't distinguish it
+/// from a user-authored line).
+fn has_bare_enterprise_roots(existing: &str) -> bool {
+    if contains_our_block(existing) {
+        return false;
+    }
+    existing.lines().any(|l| l.trim() == FX_PREF)
+}
+
 fn has_nss_certutil() -> bool {
+    // We want NSS's `certutil` (from libnss3-tools), not Windows's
+    // built-in `certutil.exe` which shares the binary name but has
+    // completely different semantics. The previous heuristic looked
+    // for "-d" in help output, which false-positived on Windows
+    // because `-dump` / `-dumpPFX` are in the Windows help text.
+    //
+    // "nickname" is an NSS-specific concept (single-letter batch verbs
+    // like `-A`/`-D`/`-n nickname`); the Windows and macOS built-in
+    // certutils don't use that term. Matching on it reliably
+    // discriminates.
     Command::new("certutil")
         .arg("--help")
         .output()
         .ok()
         .map(|o| {
-            // macOS has a different certutil built-in that doesn't support -d.
-            // NSS-specific help output mentions the -d / -n flags.
-            String::from_utf8_lossy(&o.stderr).contains("-d")
-                || String::from_utf8_lossy(&o.stdout).contains("-d")
+            let combined = format!(
+                "{}{}",
+                String::from_utf8_lossy(&o.stderr),
+                String::from_utf8_lossy(&o.stdout)
+            );
+            combined.to_ascii_lowercase().contains("nickname")
         })
         .unwrap_or(false)
 }
@@ -516,15 +1108,7 @@ fn install_nss_in_dir(dir_arg: &str, cert_path: &str) -> bool {
 
     let res = Command::new("certutil")
         .args([
-            "-A",
-            "-n",
-            CERT_NAME,
-            "-t",
-            "C,,",
-            "-d",
-            dir_arg,
-            "-i",
-            cert_path,
+            "-A", "-n", CERT_NAME, "-t", "C,,", "-d", dir_arg, "-i", cert_path,
         ])
         .output();
     match res {
@@ -559,6 +1143,226 @@
     install_nss_in_dir(&dir_arg, cert_path)
 }
 
+/// Best-effort reverse of `install_nss_stores`: delete our cert from
+/// every Firefox profile NSS DB we can find, plus the shared Chrome/
+/// Chromium NSS DB on Linux, and remove the user.js pref we added.
+/// +/// NSS cleanup is explicitly best-effort — `certutil` from libnss3-tools +/// may be missing, a DB may be locked by a running Firefox/Chrome, or +/// the delete may fail for reasons we can't distinguish. When that +/// happens we log a manual-cleanup hint but don't fail the whole +/// revocation. Callers of `remove_ca` should convey this to users so +/// the `--remove-cert` promise is "OS trust store + best-effort NSS", +/// not "guaranteed NSS". +/// Outcome of an NSS cleanup pass. `tried` / `ok` let callers render +/// accurate messages like "NSS cleanup partial: 1/3 stores updated". +/// `tool_missing_with_stores_present` flags the case where we found +/// Firefox/Chrome NSS DBs but NSS `certutil` isn't on PATH — surfaced +/// so the UI/CLI can tell the user why the cleanup is incomplete. +#[derive(Debug, Clone, Copy, Default)] +pub struct NssReport { + pub tried: usize, + pub ok: usize, + pub tool_missing_with_stores_present: bool, +} + +impl NssReport { + pub fn is_clean(&self) -> bool { + !self.tool_missing_with_stores_present && self.tried == self.ok + } +} + +fn remove_nss_stores() -> NssReport { + disable_firefox_enterprise_roots(); + + if !has_nss_certutil() { + // Only warn if there's actually an NSS store we can see — if the + // user never ran Firefox/Chrome on this machine there's nothing + // to clean up either way. + let profiles = firefox_profile_dirs(); + let chrome_present: bool; + #[cfg(target_os = "linux")] + { + chrome_present = chrome_nssdb_path() + .map(|p| p.join("cert9.db").exists() || p.join("cert8.db").exists()) + .unwrap_or(false); + } + #[cfg(not(target_os = "linux"))] + { + chrome_present = false; + } + let stores_present = !profiles.is_empty() || chrome_present; + if stores_present { + tracing::warn!( + "NSS certutil not found — cannot automatically remove CA from \ + Firefox/Chrome NSS stores. Remove `MasterHttpRelayVPN` manually \ + via each browser's certificate settings, or install NSS tools \ + (`libnss3-tools` on Debian/Ubuntu, `nss-tools` on Fedora/RHEL) \ + and re-run --remove-cert." + ); + } + return NssReport { + tried: 0, + ok: 0, + tool_missing_with_stores_present: stores_present, + }; + } + + let mut report = NssReport::default(); + + for p in firefox_profile_dirs() { + report.tried += 1; + if remove_nss_in_profile(&p) { + report.ok += 1; + } + } + + #[cfg(target_os = "linux")] + { + if let Some(nssdb) = chrome_nssdb_path() { + if nssdb.join("cert9.db").exists() || nssdb.join("cert8.db").exists() { + report.tried += 1; + let dir_arg = format!("sql:{}", nssdb.display()); + if remove_nss_in_dir(&dir_arg) { + report.ok += 1; + tracing::info!( + "Removed CA from Chrome/Chromium NSS DB: {}", + nssdb.display() + ); + } + } + } + } + + if report.tried > 0 { + if report.ok == report.tried { + tracing::info!("Removed CA from {} NSS store(s).", report.ok); + } else { + tracing::warn!( + "NSS cleanup partial: {}/{} stores updated. If Firefox/Chrome \ + was running, close it and re-run --remove-cert. Otherwise \ + remove `MasterHttpRelayVPN` manually via each browser's cert \ + settings.", + report.ok, + report.tried + ); + } + } + report +} + +/// Best-effort remove our cert from one NSS DB. +/// +/// Idempotent contract: "cert was never in this DB" is success. 
+/// Critical distinction from probe *failure*: if `certutil -L` fails +/// because the DB is locked by a running Firefox/Chrome, corrupt, or +/// inaccessible, we must NOT return `true` — that would silently mask +/// an incomplete revocation the user can't see, and NSS would keep +/// trusting the stale root. We parse stderr: only the specific +/// "could not find cert" message means absent. +fn remove_nss_in_dir(dir_arg: &str) -> bool { + let list = Command::new("certutil") + .args(["-L", "-n", CERT_NAME, "-d", dir_arg]) + .output(); + match list { + Ok(o) if o.status.success() => { + // Cert is present — fall through to delete. + } + Ok(o) => { + let stderr = String::from_utf8_lossy(&o.stderr); + if is_nss_not_found(&stderr) { + tracing::debug!("NSS {}: no `{}` entry — already clean", dir_arg, CERT_NAME); + return true; + } + tracing::warn!( + "NSS {}: probe failed (DB locked / inaccessible / other error): {}", + dir_arg, + stderr.trim() + ); + return false; + } + Err(e) => { + tracing::warn!("NSS {}: probe exec failed: {}", dir_arg, e); + return false; + } + } + + let res = Command::new("certutil") + .args(["-D", "-n", CERT_NAME, "-d", dir_arg]) + .output(); + match res { + Ok(o) if o.status.success() => true, + Ok(o) => { + tracing::warn!( + "NSS {}: delete failed: {}", + dir_arg, + String::from_utf8_lossy(&o.stderr).trim() + ); + false + } + Err(e) => { + tracing::warn!("NSS {}: delete exec failed: {}", dir_arg, e); + false + } + } +} + +/// Classify NSS `certutil` stderr as "nickname not present" (idempotent +/// success signal) vs any other failure mode (DB locked, DB corrupt, +/// permission, etc.). Exposed for unit testing. Matches only the +/// specific not-found messages NSS emits — anything else is treated as +/// a real failure so silent bugs can't hide behind false positives. +fn is_nss_not_found(stderr: &str) -> bool { + let s = stderr.to_ascii_lowercase(); + s.contains("could not find cert") || s.contains("could not find a certificate") +} + +fn remove_nss_in_profile(profile: &Path) -> bool { + let prefix = if profile.join("cert9.db").exists() { + "sql:" + } else if profile.join("cert8.db").exists() { + "" + } else { + return false; + }; + let dir_arg = format!("{}{}", prefix, profile.display()); + remove_nss_in_dir(&dir_arg) +} + +/// Undo `enable_firefox_enterprise_roots`: for each profile, strip the +/// marker+pref block if (and only if) we wrote it. If the user owns +/// their own `enterprise_roots` pref — indicated by the absence of our +/// marker line — leave user.js alone entirely. +fn disable_firefox_enterprise_roots() { + for profile in firefox_profile_dirs() { + let user_js = profile.join("user.js"); + let Ok(existing) = std::fs::read_to_string(&user_js) else { + continue; + }; + if let Some(new) = strip_enterprise_roots_block(&existing) { + let _ = std::fs::write(&user_js, new); + continue; + } + // No marker block to strip, but an orphan pref is present. + // Surface it so the user isn't left wondering why user.js + // still has an enterprise_roots line after --remove-cert. + // The orphan is harmless (Firefox falls back to its built-in + // root store once the CA leaves the OS store), but silent + // leftovers feel like half-done removals. + if has_bare_enterprise_roots(&existing) { + tracing::info!( + "Firefox profile {}: `security.enterprise_roots.enabled` pref \ + present without our marker — left in place. 
If it was written \ + by a pre-v1.2.13 install it's a cosmetic orphan (harmless, \ + Firefox falls back to its built-in root store); remove it \ + manually from user.js if it bothers you. If you set it \ + yourself, leave it.", + profile.display() + ); + } + } +} + fn firefox_profile_dirs() -> Vec<PathBuf> { use std::path::PathBuf; let mut roots: Vec<PathBuf> = Vec::new(); @@ -579,7 +1383,10 @@ fn firefox_profile_dirs() -> Vec<PathBuf> { } "windows" => { if let Ok(appdata) = std::env::var("APPDATA") { - roots.push(PathBuf::from(format!("{}\\Mozilla\\Firefox\\Profiles", appdata))); + roots.push(PathBuf::from(format!( + "{}\\Mozilla\\Firefox\\Profiles", + appdata + ))); } } _ => {} @@ -689,4 +1496,308 @@ ID_LIKE=debian let content = "SOMEFIELD=maybearchived\nFOO=bar\n"; assert_eq!(classify_os_release(content), "unknown"); } + + // ── Firefox user.js block install / uninstall ── + + #[test] + fn enterprise_roots_block_added_to_empty_userjs() { + let got = add_enterprise_roots_block(""); + let expected = format!("{}\n{}\n", FX_MARKER, FX_PREF); + assert_eq!(got, EnterpriseRootsEdit::AddedBlock(expected)); + } + + #[test] + fn enterprise_roots_block_appended_preserving_existing_prefs() { + let existing = "user_pref(\"some.other\", 1);\n"; + let got = add_enterprise_roots_block(existing); + let expected = format!( + "user_pref(\"some.other\", 1);\n{}\n{}\n", + FX_MARKER, FX_PREF + ); + assert_eq!(got, EnterpriseRootsEdit::AddedBlock(expected)); + } + + #[test] + fn enterprise_roots_block_is_idempotent_when_marker_present() { + let existing = format!( + "user_pref(\"a\", 1);\n{}\n{}\nuser_pref(\"b\", 2);\n", + FX_MARKER, FX_PREF + ); + assert_eq!( + add_enterprise_roots_block(&existing), + EnterpriseRootsEdit::AlreadyOurs + ); + } + + #[test] + fn enterprise_roots_block_respects_user_owned_pref_without_marker() { + // User has enterprise_roots set themselves — no marker above it. + // We must NOT write our line, and we must NOT claim ownership on + // uninstall (tested separately below). + let existing = "user_pref(\"security.enterprise_roots.enabled\", true);\n"; + assert_eq!( + add_enterprise_roots_block(existing), + EnterpriseRootsEdit::UserOwned + ); + } + + #[test] + fn enterprise_roots_block_respects_user_owned_pref_set_to_false() { + // User explicitly disabled it — also a user-owned pref, leave alone. + let existing = "user_pref(\"security.enterprise_roots.enabled\", false);\n"; + assert_eq!( + add_enterprise_roots_block(existing), + EnterpriseRootsEdit::UserOwned + ); + } + + #[test] + fn strip_enterprise_roots_removes_our_block_and_preserves_others() { + let before = format!( + "user_pref(\"a\", 1);\n{}\n{}\nuser_pref(\"b\", 2);\n", + FX_MARKER, FX_PREF + ); + let after = strip_enterprise_roots_block(&before).expect("should strip"); + assert_eq!(after, "user_pref(\"a\", 1);\nuser_pref(\"b\", 2);\n"); + } + + #[test] + fn strip_enterprise_roots_refuses_when_pref_is_bare() { + // No marker above — indistinguishable from a user- or + // enterprise-policy-authored line. Must return None so caller + // leaves user.js untouched. Legacy upgrade users get one + // cosmetic orphan line; revoking user-owned Firefox trust + // behavior silently is worse. + let before = "user_pref(\"security.enterprise_roots.enabled\", true);\n"; + assert_eq!(strip_enterprise_roots_block(before), None); + } + + #[test] + fn strip_enterprise_roots_refuses_when_marker_is_elsewhere() { + // Marker present but not directly above the pref — user may + // have copied our marker line as a comment somewhere else.
We + // still can't prove ownership of the pref itself, so leave + // alone. + let before = format!( + "{}\nuser_pref(\"unrelated\", 1);\n\ + user_pref(\"security.enterprise_roots.enabled\", true);\n", + FX_MARKER + ); + assert_eq!(strip_enterprise_roots_block(&before), None); + } + + #[test] + fn strip_enterprise_roots_leaves_user_false_pref_alone() { + let before = "user_pref(\"security.enterprise_roots.enabled\", false);\n"; + assert_eq!(strip_enterprise_roots_block(before), None); + } + + #[test] + fn strip_enterprise_roots_returns_none_when_pref_absent() { + let before = "user_pref(\"other\", 1);\nuser_pref(\"another\", 2);\n"; + assert_eq!(strip_enterprise_roots_block(before), None); + } + + #[test] + fn strip_enterprise_roots_roundtrip_from_empty() { + // add_block("") -> strip_block(added) -> "" (no trailing garbage). + let added = match add_enterprise_roots_block("") { + EnterpriseRootsEdit::AddedBlock(s) => s, + other => panic!("unexpected: {:?}", other), + }; + let stripped = strip_enterprise_roots_block(&added).expect("should strip"); + assert_eq!(stripped, ""); + } + + // ── has_bare_enterprise_roots ── + + #[test] + fn bare_enterprise_roots_detected_when_no_marker_present() { + let content = "user_pref(\"security.enterprise_roots.enabled\", true);\n"; + assert!(has_bare_enterprise_roots(content)); + } + + #[test] + fn bare_enterprise_roots_not_detected_when_marker_block_present() { + // Our marker+pref block — strip handles this; has_bare_ must + // return false so we don't double-warn about a line we own. + let content = format!("{}\n{}\n", FX_MARKER, FX_PREF); + assert!(!has_bare_enterprise_roots(&content)); + } + + #[test] + fn bare_enterprise_roots_not_detected_when_pref_absent() { + let content = "user_pref(\"other\", 1);\n"; + assert!(!has_bare_enterprise_roots(content)); + } + + #[test] + fn bare_enterprise_roots_ignores_false_variant() { + // User explicitly set enterprise_roots = false — not our line + // and not the pre-marker legacy write (which only ever wrote + // true). No orphan to warn about. + let content = "user_pref(\"security.enterprise_roots.enabled\", false);\n"; + assert!(!has_bare_enterprise_roots(content)); + } + + // ── should_reconcile_for ── + + #[test] + fn reconcile_skipped_for_normal_user() { + // euid != 0 — even with SUDO_USER set we must NOT re-root HOME. + // A non-root process that happened to inherit SUDO_USER (or + // used `sudo -E`) shouldn't get to redirect cert paths. + assert_eq!(should_reconcile_for(1000, Some("alice")), None); + assert_eq!(should_reconcile_for(1000, None), None); + } + + #[test] + fn reconcile_skipped_for_real_root_login_without_sudo() { + // Load-bearing case the maintainer asked to pin: euid == 0 + // AND no SUDO_USER means the process is a real root login, + // not a sudo elevation. HOME should stay as /root; we must + // not try to resolve some other user's home. + assert_eq!(should_reconcile_for(0, None), None); + } + + #[test] + fn reconcile_skipped_when_sudo_user_is_empty_or_root() { + assert_eq!(should_reconcile_for(0, Some("")), None); + assert_eq!(should_reconcile_for(0, Some("root")), None); + } + + #[test] + fn reconcile_triggers_for_real_sudo_invocation() { + // euid == 0 AND SUDO_USER points to a non-root user — this is + // the sudo case we do want to reconcile. 
+ assert_eq!(should_reconcile_for(0, Some("alice")), Some("alice")); + } + + // ── sudo_parse_passwd_home ── + + #[test] + fn parses_debian_passwd_entry() { + let line = "liyon:x:1000:1000:Liyon,,,:/home/liyon:/bin/bash\n"; + assert_eq!(sudo_parse_passwd_home(line), Some("/home/liyon".into())); + } + + #[test] + fn macos_passwd_format_does_not_parse_and_falls_back_to_convention() { + // macOS `dscl`-sourced passwd lines have extra fields + // (pw_class, chg, exp) before home, so index 5 lands on a + // non-home field. sudo_parse_passwd_home is intentionally + // Linux-shaped — the macOS path relies on the `/Users/` + // convention in `unix::resolve_home` rather than on this + // parser. This test pins that contract. + let line = "liyon:*:501:20::0:0:Liyon Bonakdar:/Users/liyon:/bin/zsh"; + assert_ne!(sudo_parse_passwd_home(line), Some("/Users/liyon".into())); + } + + #[test] + fn rejects_malformed_passwd_line_too_few_fields() { + let line = "liyon:x:1000:1000\n"; + assert_eq!(sudo_parse_passwd_home(line), None); + } + + #[test] + fn rejects_empty_home_field() { + let line = "svcacct:x:999:999:gecos::/bin/false\n"; + assert_eq!(sudo_parse_passwd_home(line), None); + } + + #[test] + fn returns_first_matching_line_when_multiple() { + // getent only prints one line, but guard against future change. + let content = "liyon:x:1000:1000::/home/liyon:/bin/bash\n\ + other:x:1001:1001::/home/other:/bin/bash\n"; + assert_eq!(sudo_parse_passwd_home(content), Some("/home/liyon".into())); + } + + // ── NssReport::is_clean ── + + #[test] + fn nss_report_is_clean_when_nothing_tried() { + let r = NssReport::default(); + assert!(r.is_clean()); + } + + #[test] + fn nss_report_is_clean_when_all_attempts_succeeded() { + let r = NssReport { + tried: 3, + ok: 3, + tool_missing_with_stores_present: false, + }; + assert!(r.is_clean()); + } + + #[test] + fn nss_report_not_clean_on_partial_failure() { + let r = NssReport { + tried: 3, + ok: 2, + tool_missing_with_stores_present: false, + }; + assert!(!r.is_clean()); + } + + #[test] + fn nss_report_not_clean_when_tool_missing_with_stores() { + // Even with tried=0 (we couldn't try anything), the presence + // of NSS stores plus a missing tool means cleanup is NOT + // complete — callers should flag this to the user. + let r = NssReport { + tried: 0, + ok: 0, + tool_missing_with_stores_present: true, + }; + assert!(!r.is_clean()); + } + + // ── is_nss_not_found ── + + #[test] + fn nss_not_found_classifies_standard_not_found_message() { + // Typical NSS certutil output when the nickname is absent. + let stderr = "certutil: Could not find cert: MasterHttpRelayVPN\n"; + assert!(is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_classifies_alt_wording_some_versions_emit() { + let stderr = "certutil: could not find a certificate named 'MasterHttpRelayVPN'\n"; + assert!(is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_locked_database_error() { + // Regression guard for the critical bug: DB locked (Firefox + // running) must NOT be treated as "cert absent" — that would + // silently report clean revocation while NSS keeps trusting + // the stale root. 
+ let stderr = "certutil: function failed: SEC_ERROR_LOCKED_DATABASE: \ + the certificate/key database is locked.\n"; + assert!(!is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_bad_database_error() { + let stderr = "certutil: function failed: SEC_ERROR_BAD_DATABASE: \ + security library: bad database.\n"; + assert!(!is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_permission_error() { + let stderr = "certutil: unable to open \"sql:/home/x/.mozilla/firefox/profile\" \ + (Permission denied)\n"; + assert!(!is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_empty_stderr() { + // An empty stderr with a non-zero exit is ambiguous — safer + // to classify as "not found is NOT proven", i.e. failure. + assert!(!is_nss_not_found("")); + } } diff --git a/src/config.rs b/src/config.rs index 74d08155..824ef76d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -96,6 +96,14 @@ pub struct Config { /// script IDs. #[serde(default)] pub parallel_relay: u8, + /// Adaptive batch coalesce: after each op arrives, wait this many ms + /// for more ops before firing the batch. Resets on every arrival. + /// 0 = use compiled default (40ms). + #[serde(default)] + pub coalesce_step_ms: u16, + /// Hard cap on total coalesce wait (ms). 0 = use compiled default (1000ms). + #[serde(default)] + pub coalesce_max_ms: u16, /// Optional explicit SNI rotation pool for outbound TLS to `google_ip`. /// Empty / missing = auto-expand from `front_domain` (current default of /// {www, mail, drive, docs, calendar}.google.com). Set to an explicit list @@ -163,6 +171,87 @@ pub struct Config { /// Issues #39, #127. #[serde(default)] pub passthrough_hosts: Vec, + + /// Block outbound QUIC (UDP/443) at the SOCKS5 listener. + /// + /// QUIC is HTTP/3-over-UDP. In `apps_script` mode it's hopeless — + /// Apps Script is HTTP-only, so QUIC datagrams either get refused + /// outright (UDP ASSOCIATE rejected) or silently fall through to + /// `raw-tcp direct` and fail in interesting ways. In `full` mode + /// the tunnel-node CAN carry UDP, but QUIC's congestion control + /// stacked on top of TCP-encapsulated transport produces TCP + /// meltdown for any non-trivial bandwidth — browsers see <1 Mbps + /// where the same site over plain HTTPS would do >50. + /// + /// With `block_quic = true`, the SOCKS5 UDP relay drops any + /// datagram destined for port 443 (silent UDP — caller's stack + /// retries a few times then falls back). Browsers then re-issue + /// the same request as TCP/HTTPS through the regular CONNECT + /// path, which goes through the relay normally. + /// + /// Why this is opt-in rather than always-on: for users on Full + /// mode + udpgw (a recent path; v1.7.0+) the QUIC TCP-meltdown + /// is partially mitigated by udpgw's persistent-socket reuse, + /// and a tiny minority of sites only support HTTP/3 (rare). The + /// flag lets users who care about consistency over peak speed + /// opt out of QUIC at the source rather than discovering its + /// failure modes later. Issue #213. + #[serde(default)] + pub block_quic: bool, + /// When true, suppress the random `_pad` field that v1.8.0+ adds + /// to outbound Apps Script requests for DPI evasion. Default off + /// (padding active). 
Some users on heavily-throttled ISPs find + /// the +25% bandwidth cost from padding compounds with the + /// throttle to push borderline-working batches into timeouts; + /// turning padding off recovers a bit of headroom at the cost of + /// length-distribution defense against DPI fingerprinting. Issue + /// #391 (EBRAHIM-AM). + /// + /// Don't flip this on speculatively — for users where Apps Script + /// outbound is uncongested, padding is free DPI defense. Only + /// turn off if you've measured throughput improvement after the + /// flip on your specific ISP path. + #[serde(default)] + pub disable_padding: bool, + + /// Opt-out for the DoH bypass. Default `false` (= bypass active): + /// CONNECTs to well-known DoH hostnames (Cloudflare, Google, Quad9, + /// AdGuard, NextDNS, OpenDNS, browser-pinned variants like + /// `chrome.cloudflare-dns.com` and `mozilla.cloudflare-dns.com`) + /// skip the Apps Script tunnel and exit via plain TCP (or + /// `upstream_socks5` if set). DoH already encrypts the queries + /// themselves, so the only privacy property the tunnel was adding + /// is hiding *the fact that you're doing DoH* from the local + /// network — a marginal gain not worth the ~2 s Apps Script + /// round-trip cost paid on every name lookup. In Full mode this + /// was the dominant DNS slowdown source. + /// + /// Set `tunnel_doh: true` to keep DoH inside the tunnel. With the + /// bypass off, browsers that find their pinned DoH host + /// unreachable already fall back to OS DNS on their own, so + /// failure modes are graceful in either direction. + /// + /// Port-gated to TCP/443 only. A private DoH on a non-standard port + /// (e.g. `doh.internal.example:8443`) won't take the bypass path — + /// list it in `passthrough_hosts` instead, which has no port gate. + #[serde(default)] + pub tunnel_doh: bool, + + /// Extra hostnames to treat as DoH endpoints in addition to the + /// built-in default list. Case-insensitive; entries match exactly + /// OR as a dot-anchored suffix unconditionally — `doh.acme.test` + /// covers both `doh.acme.test` and `tenant.doh.acme.test`. (Unlike + /// `passthrough_hosts`, no leading dot is required for suffix + /// matching: every legitimate subdomain of a DoH host is itself + /// a DoH endpoint, so the leading-dot convention would be a + /// footgun.) Use this to cover private/enterprise DoH resolvers + /// without waiting for a release. + /// + /// Inert when `tunnel_doh = true` — the bypass itself is off, so + /// the extras have nothing to feed. The proxy logs a warning at + /// startup if both are set together. + #[serde(default)] + pub bypass_doh_hosts: Vec<String>, } fn default_fetch_ips_from_api() -> bool { false } @@ -226,9 +315,11 @@ impl Config { )); } if self.socks5_port == Some(self.listen_port) { - return Err(ConfigError::Invalid( - "listen_port and socks5_port must be different".into(), - )); + return Err(ConfigError::Invalid(format!( + "listen_port and socks5_port must differ on the same host \ + (both set to {} on {}).
Change one of them in config.json.", + self.listen_port, self.listen_host + ))); } Ok(()) } diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index a18dd212..5c245e6a 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -21,6 +21,7 @@ use std::time::{Duration, Instant}; use base64::engine::general_purpose::STANDARD as B64; use base64::Engine; +use rand::{thread_rng, Rng, RngCore}; use serde::{Deserialize, Serialize}; use serde_json::Value; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -60,6 +61,11 @@ const POOL_TTL_SECS: u64 = 45; const POOL_MAX: usize = 80; const REQUEST_TIMEOUT_SECS: u64 = 25; const RANGE_PARALLEL_CHUNK_BYTES: u64 = 256 * 1024; +/// Cadence for Apps Script container keepalive pings. Apps Script +/// containers go cold after ~5min idle and cost 1-3s on the first +/// request to wake back up — most painful on YouTube / streaming where +/// the first chunk after a quiet pause stalls the player. +const H1_KEEPALIVE_INTERVAL_SECS: u64 = 240; // Keep synthetic range stitching bounded. Without this, a buggy or hostile // origin can advertise `Content-Range: bytes 0-1/<huge total>` and make us build a // massive range plan or preallocate an enormous response buffer. @@ -102,6 +108,13 @@ pub struct DomainFronter { inflight: Arc>>>>, coalesced: AtomicU64, blacklist: Arc<std::sync::Mutex<HashMap<String, Instant>>>, + /// Per-deployment rolling timeout counter. Maps `script_id` → + /// `(window_start, strike_count)`. Reset when the window expires + /// or when a batch succeeds. Triggers a short-cooldown blacklist + /// at `TIMEOUT_STRIKE_LIMIT`. Distinct from `blacklist` because + /// strike state is per-deployment health bookkeeping, not the + /// permanent ban list. + script_timeouts: Arc<std::sync::Mutex<HashMap<String, (Instant, u32)>>>, relay_calls: AtomicU64, relay_failures: AtomicU64, bytes_relayed: AtomicU64, @@ -123,6 +136,10 @@ pub struct DomainFronter { today_calls: AtomicU64, today_bytes: AtomicU64, today_key: std::sync::Mutex<String>, + /// Suppress the random `_pad` field that v1.8.0+ adds to outbound + /// payloads. Mirrors `Config::disable_padding` (#391). Default false + /// (padding active = stronger DPI defense at +25% bandwidth cost). + disable_padding: bool, } /// Aggregated stats for one remote host. @@ -146,6 +163,21 @@ impl HostStat { const BLACKLIST_COOLDOWN_SECS: u64 = 600; +/// Sliding window for the timeout-strike blacklist heuristic. Three +/// timeouts within this window on a single deployment trip the +/// blacklist. Tuned so a single cold-start stall plus one transient +/// network blip won't false-trigger, but a deployment that's actually +/// dead (stale `TUNNEL_SERVER_URL`, paused project, dropped script) +/// fails fast instead of poisoning round-robin until the user notices. +const TIMEOUT_STRIKE_WINDOW: Duration = Duration::from_secs(30); +const TIMEOUT_STRIKE_LIMIT: u32 = 3; + +/// Cooldown for a deployment blacklisted via the timeout-strike path. +/// Distinct from `BLACKLIST_COOLDOWN_SECS` (10 min) because timeouts +/// are a much noisier signal than quota errors — if the deployment +/// recovers, we want to rejoin in minutes, not after a 10-min penalty. +const TIMEOUT_BLACKLIST_COOLDOWN_SECS: u64 = 120; + +/// Request payload sent to Apps Script (single, non-batch).
#[derive(Serialize)] struct RelayRequest<'a> { @@ -258,21 +290,27 @@ impl DomainFronter { inflight: Arc::new(Mutex::new(HashMap::new())), coalesced: AtomicU64::new(0), blacklist: Arc::new(std::sync::Mutex::new(HashMap::new())), + script_timeouts: Arc::new(std::sync::Mutex::new(HashMap::new())), relay_calls: AtomicU64::new(0), relay_failures: AtomicU64::new(0), bytes_relayed: AtomicU64::new(0), per_site: Arc::new(std::sync::Mutex::new(HashMap::new())), today_calls: AtomicU64::new(0), today_bytes: AtomicU64::new(0), - today_key: std::sync::Mutex::new(current_utc_day_key()), + today_key: std::sync::Mutex::new(current_pt_day_key()), + disable_padding: config.disable_padding, }) } /// Record one relay call toward the daily budget. Called once per /// outbound Apps Script fetch. Rolls over both daily counters at - /// 00:00 UTC. - fn record_today(&self, bytes: u64) { - let today = current_utc_day_key(); + /// 00:00 Pacific Time, matching Apps Script's quota reset cadence + /// (#230, #362). Crate-public so the Full-mode batch path in + /// `tunnel_client::fire_batch` can wire into the same accounting + /// (Apps Script sees Full-mode batches as ordinary `UrlFetchApp` + /// calls and counts them against the same daily quota). + pub(crate) fn record_today(&self, bytes: u64) { + let today = current_pt_day_key(); // Fast path: same day as what we last saw. No lock. let mut guard = self.today_key.lock().unwrap(); if *guard != today { @@ -317,8 +355,8 @@ impl DomainFronter { // Read today_key under lock and cheaply check rollover so the // UI never sees stale "today_calls=1847" on a day where no // traffic has flowed yet (e.g. user left the app open past - // midnight UTC). - let today_now = current_utc_day_key(); + // midnight PT). + let today_now = current_pt_day_key(); let today_key = { let mut guard = self.today_key.lock().unwrap(); if *guard != today_now { @@ -341,7 +379,7 @@ impl DomainFronter { today_calls: self.today_calls.load(Ordering::Relaxed), today_bytes: self.today_bytes.load(Ordering::Relaxed), today_key, - today_reset_secs: seconds_until_utc_midnight(), + today_reset_secs: seconds_until_pacific_midnight(), } } @@ -414,17 +452,67 @@ impl DomainFronter { } fn blacklist_script(&self, script_id: &str, reason: &str) { - let until = Instant::now() + Duration::from_secs(BLACKLIST_COOLDOWN_SECS); + self.blacklist_script_for( + script_id, + Duration::from_secs(BLACKLIST_COOLDOWN_SECS), + reason, + ); + } + + fn blacklist_script_for(&self, script_id: &str, cooldown: Duration, reason: &str) { + let until = Instant::now() + cooldown; let mut bl = self.blacklist.lock().unwrap(); bl.insert(script_id.to_string(), until); tracing::warn!( "blacklisted script {} for {}s: {}", mask_script_id(script_id), - BLACKLIST_COOLDOWN_SECS, + cooldown.as_secs(), reason ); } + /// Record a batch timeout against `script_id`. After + /// `TIMEOUT_STRIKE_LIMIT` timeouts inside `TIMEOUT_STRIKE_WINDOW` + /// the deployment is blacklisted with a short cooldown so the + /// round-robin stops sending real traffic to a deployment that's + /// hung (most commonly: stale `TUNNEL_SERVER_URL` after the + /// tunnel-node moved hosts). 
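Before the implementation of `record_timeout_strike` (next), a worked timeline for the default constants — illustrative only, timestamps invented:

```rust
// t=0 s   batch timeout  → strike window opens, strikes = 1
// t=12 s  batch timeout  → strikes = 2
// t=29 s  batch timeout  → strikes = 3 ≥ TIMEOUT_STRIKE_LIMIT
//         → deployment blacklisted for TIMEOUT_BLACKLIST_COOLDOWN_SECS (120 s)
//
// Contrast: if the third timeout lands at t=35 s instead, the 30 s
// window has expired and the counter restarts at (t=35, strikes=1).
// A successful batch at any point wipes the entry via record_batch_success.
```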
+ pub(crate) fn record_timeout_strike(&self, script_id: &str) { + let now = Instant::now(); + let mut counts = self.script_timeouts.lock().unwrap(); + let entry = counts + .entry(script_id.to_string()) + .or_insert((now, 0)); + if now.duration_since(entry.0) > TIMEOUT_STRIKE_WINDOW { + *entry = (now, 1); + } else { + entry.1 += 1; + } + let strikes = entry.1; + if strikes >= TIMEOUT_STRIKE_LIMIT { + counts.remove(script_id); + drop(counts); + self.blacklist_script_for( + script_id, + Duration::from_secs(TIMEOUT_BLACKLIST_COOLDOWN_SECS), + &format!( + "{} timeouts in {}s", + strikes, + TIMEOUT_STRIKE_WINDOW.as_secs() + ), + ); + } + } + + /// Clear the timeout strike counter for `script_id`. Called after + /// a batch succeeds so a recovered deployment doesn't keep stale + /// strikes from hours ago — three strikes must occur within one + /// real failure burst, not accumulate across unrelated incidents. + pub(crate) fn record_batch_success(&self, script_id: &str) { + let mut counts = self.script_timeouts.lock().unwrap(); + counts.remove(script_id); + } + /// Log a relay failure with extra guidance on cert-validation cases. /// Rate-limited so a flood of identical "UnknownIssuer" errors doesn't /// fill the log. @@ -512,6 +600,45 @@ impl DomainFronter { } } + /// Keep the Apps Script container warm with a periodic HEAD ping. + /// + /// `acquire()` keeps the *TCP/TLS pool* warm but does nothing for the + /// V8 container Apps Script runs in: that goes cold ~5min after the + /// last UrlFetchApp call and costs 1-3s to spin back up. The symptom + /// is "first request after a quiet period stalls" — most visible on + /// YouTube where the player gives up on a 1.5s `googlevideo.com` + /// chunk that's actually waiting on a cold-start. + /// + /// Bypasses the response cache (`cache_key_opt = None`) and the + /// inflight coalescer — otherwise the second iteration would just + /// hit the cached response from the first and never reach Apps + /// Script. The relay payload itself is the cheapest non-error one + /// we can build: a HEAD against `http://example.com/` returns a few + /// hundred bytes, no body decode, no auth. + /// + /// Best-effort. Failures are debug-logged so a flaky network or + /// quota-exhausted account doesn't spam warnings every 4 minutes. + /// Loops forever — caller is expected to drop the JoinHandle on + /// shutdown (the task lives as long as the process). + pub async fn run_h1_keepalive(self: Arc<Self>) { + loop { + tokio::time::sleep(Duration::from_secs(H1_KEEPALIVE_INTERVAL_SECS)).await; + let t0 = Instant::now(); + // relay_uncoalesced returns Vec<u8> (always — errors are + // baked into 5xx responses), so just observe the duration + // for the debug line. We intentionally don't use relay() + // here because that path goes through the cache + coalesce + // layer, which would short-circuit subsequent pings. + let _ = self + .relay_uncoalesced("HEAD", "http://example.com/", &[], &[], None) + .await; + tracing::debug!( + "H1 container keepalive: {}ms", + t0.elapsed().as_millis() + ); + } + } + async fn acquire(&self) -> Result { { let mut pool = self.pool.lock().await; @@ -642,9 +769,9 @@ impl DomainFronter { /// by relay() already (we skip cache for it). /// 2. Probe with `Range: bytes=0-`. /// 3. 200 back (origin doesn't support ranges) → return as-is. - /// 4. 206 back → parse Content-Range total. If the body fits in - /// the first probe (total <= chunk or body >= total), rewrite - /// the 206 to a 200 so the client — which never asked for a + /// 4.
206 back → parse Content-Range total. If Content-Range says + /// the entity fits in the first probe, rewrite the 206 to a 200 + /// so the client — which never asked for a /// range — doesn't choke on a stray Partial Content. (x.com /// and Cloudflare turnstile in particular reject unsolicited /// 206 on XHR/fetch.) @@ -765,24 +892,39 @@ impl DomainFronter { match chunk { Ok(chunk) => full.extend_from_slice(&chunk), Err(reason) => { + // Issue #162: silently rewriting the probe to a 200 + // here truncates the response to whatever the probe + // saw (typically 256 KiB — the chunk size). Browsers + // see HTTP 200 + Content-Length=262144 and treat + // the download as complete; users reported "every + // file capped at 256 KB" because every download + // that hit this failure path landed there. Common + // triggers: Apps Script stripping Content-Range, + // origin returning 200-instead-of-206 on later + // chunks, total mismatch across chunks. Correct + // recovery is a fresh single GET — Apps Script + // fetches the full URL up to its 50 MiB cap. Slow + // for big files vs. the parallel path but produces + // a complete response, which is what matters. tracing::warn!( - "range-parallel: invalid chunk {}-{} for {} ({}); falling back to probe response", - start, - end, - url, - reason, + "range-parallel: invalid chunk {}-{} for {} ({}); falling back to single GET", + start, end, url, reason, ); - return rewrite_206_to_200(&first); + return self.relay(method, url, headers, body).await; } } } if (full.len() as u64) != total { + // Same fallback rationale as the chunk-validation case + // above: returning the probe truncates to 256 KiB. Single + // GET is the only way to give the user a complete file + // when the parallel stitch can't be trusted. tracing::warn!( - "range-parallel: stitched {}/{} bytes for {}; falling back to probe response", + "range-parallel: stitched {}/{} bytes for {}; falling back to single GET", full.len(), total, url, ); - return rewrite_206_to_200(&first); + return self.relay(method, url, headers, body).await; } // Build a 200 OK with Content-Length = full body length. Drop @@ -1060,7 +1202,18 @@ impl DomainFronter { ct, r: true, }; - Ok(serde_json::to_vec(&req)?) + // Serialize via Value so we can splice in the random `_pad` field + // without changing RelayRequest's wire schema. Apps Script ignores + // unknown JSON fields, so old Code.gs deployments stay compatible + // — the pad is just bytes-on-the-wire that the server sees and + // discards. + let mut v = serde_json::to_value(&req)?; + if let Value::Object(map) = &mut v { + if !self.disable_padding { + add_random_pad(map); + } + } + Ok(serde_json::to_vec(&v)?) } // ────── Full-mode tunnel protocol ────────────────────────────────── @@ -1188,6 +1341,9 @@ impl DomainFronter { if let Some(d) = data { map.insert("d".into(), Value::String(d)); } + if !self.disable_padding { + add_random_pad(&mut map); + } Ok(serde_json::to_vec(&Value::Object(map))?) 
} @@ -1215,6 +1371,9 @@ impl DomainFronter { map.insert("k".into(), Value::String(self.auth_key.clone())); map.insert("t".into(), Value::String("batch".into())); map.insert("ops".into(), serde_json::to_value(ops)?); + if !self.disable_padding { + add_random_pad(&mut map); + } let payload = serde_json::to_vec(&Value::Object(map))?; let path = format!("/macros/s/{}/exec", script_id); @@ -1393,10 +1552,26 @@ fn validate_probe_range( return None; } let range = parse_content_range(headers)?; - if range.start != 0 || range.end > requested_end || !content_range_matches_body(range, body.len()) { + if range.start != 0 || range.end > requested_end { return None; } - Some(range) + if content_range_matches_body(range, body.len()) + || probe_range_covers_complete_entity(range, requested_end) + { + return Some(range); + } + None +} + +fn probe_range_covers_complete_entity(range: ContentRange, requested_end: u64) -> bool { + // Apps Script may decode a gzip body while preserving the origin's + // compressed Content-Range. For the synthetic first probe only, a + // 0..total-1 range within the requested chunk is enough to prove we + // already have the complete entity; later chunks still require exact + // Content-Range/body length validation in extract_exact_range_body(). + range.start == 0 + && range.end.saturating_add(1) >= range.total + && range.total <= requested_end.saturating_add(1) } fn checked_stitched_range_capacity(total: u64) -> Option<usize> { @@ -1509,30 +1684,74 @@ fn normalize_x_graphql_url(url: &str) -> String { format!("{}{}{}?{}", scheme, host, path, new_query) } -/// "YYYY-MM-DD" of the current UTC date. Used as the daily-reset -/// boundary for `today_calls` / `today_bytes`. We format manually so -/// this stays std-only and doesn't pull `time` or `chrono` for a -/// ~20-line helper. -fn current_utc_day_key() -> String { +/// Maximum bytes of random padding appended to outbound Apps Script +/// JSON request bodies. Picked so the per-request padding distribution +/// (uniformly 0..MAX) shifts the body length enough to defeat naive +/// length-fingerprint DPI without bloating bandwidth — at the average +/// 512-byte add, on a typical 2 KB tunnel batch this is +25%, which is +/// negligible compared to Apps Script's per-call latency floor anyway. +/// (Issue #313, #365 Section 1 — DPI evasion.) +const MAX_RANDOM_PAD_BYTES: usize = 1024; + +/// Insert a `_pad` field of random length (0..MAX_RANDOM_PAD_BYTES) +/// into a request payload before serialization. Server-side ignores +/// unknown JSON fields, so this is fully backward-compatible with old +/// `Code.gs` / `CodeFull.gs` deployments — the pad is just along for +/// the ride. +/// +/// Random bytes are base64-encoded (NO inner JSON-escape worries) and +/// the pad LENGTH itself is uniformly distributed, so packet sizes +/// land all over the place rather than clustering at a few discrete +/// peaks. That's the property DPI's length-distribution clustering +/// fingerprints can't match. +fn add_random_pad(map: &mut serde_json::Map<String, Value>) { + let mut rng = thread_rng(); + let len = rng.gen_range(0..=MAX_RANDOM_PAD_BYTES); + if len == 0 { + // Skip the field entirely sometimes — adds another bit of + // distribution variance (presence-vs-absence of `_pad` itself). + return; + } + let mut buf = vec![0u8; len]; + rng.fill_bytes(&mut buf); + map.insert("_pad".into(), Value::String(B64.encode(&buf))); +} + +/// "YYYY-MM-DD" of the current Pacific Time date.
Used as the daily-reset +/// boundary for `today_calls` / `today_bytes` because **Apps Script's +/// quota counter resets at midnight Pacific Time, not UTC** — that's +/// where Google's quota bookkeeping lives. We format manually so this +/// stays std-only and doesn't pull `time-tz` or `chrono` plus a ~3 MB +/// IANA tzdb just for one ~50-line helper. (Issue #230, #362.) +/// +/// PT offset depends on DST: PST = UTC-8, PDT = UTC-7. We use the +/// stable US DST rule (2nd Sunday of March 02:00 → 1st Sunday of +/// November 02:00 = PDT, otherwise PST). The hour-of-day boundary on +/// transition days is approximated; this drifts by up to 1h for at +/// most 2h/year on the spring-forward / fall-back transitions, which +/// is fine for a daily countdown. +fn current_pt_day_key() -> String { let secs = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); - let (y, m, d) = unix_to_ymd_utc(secs); + let pt_secs = unix_to_pt_seconds(secs); + let (y, m, d) = unix_to_ymd_utc(pt_secs); format!("{:04}-{:02}-{:02}", y, m, d) } -/// Seconds until the next 00:00 UTC. Used by the UI to render a -/// "resets in Xh Ym" countdown without the UI having to import time -/// libraries. Conservative: if the system clock is broken we return -/// 0 instead of a huge negative-looking number. -fn seconds_until_utc_midnight() -> u64 { +/// Seconds until the next 00:00 Pacific Time. Used by the UI to render +/// a "resets in Xh Ym" countdown matching Apps Script's actual quota +/// reset cadence (#230, #362). Conservative: if the system clock is +/// broken we return 0 instead of a huge negative-looking number. +fn seconds_until_pacific_midnight() -> u64 { let secs = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); + let pt_secs = unix_to_pt_seconds(secs); let day = 86_400u64; - let rem = secs % day; + let rem = pt_secs % day; if rem == 0 { day } else { @@ -1540,6 +1759,65 @@ fn seconds_until_utc_midnight() -> u64 { } } +/// Convert Unix UTC seconds to "Pacific Time as if it were UTC" seconds, +/// i.e. add the PT-from-UTC offset (negative for the western hemisphere +/// becomes a subtraction). Result is suitable for feeding into +/// `unix_to_ymd_utc` to extract the PT calendar date, or for `% 86_400` +/// to find PT seconds-into-day. +fn unix_to_pt_seconds(utc_secs: u64) -> u64 { + // First-pass guess at PT date using PST (-8) — used to determine + // whether DST is currently in effect, which then settles the actual + // offset. The two-pass approach avoids the chicken-and-egg of + // "I need the PT date to know if it's DST, but I need the offset + // to compute the PT date." A 1-hour fudge in the guess is harmless + // because DST never starts within the first hour after midnight + // PST or ends within the first hour after midnight PDT. + let pst_guess = utc_secs.saturating_sub(8 * 3600); + let (y, m, d) = unix_to_ymd_utc(pst_guess); + let offset_secs = if pacific_is_dst(y, m, d) { + 7 * 3600 + } else { + 8 * 3600 + }; + utc_secs.saturating_sub(offset_secs) +} + +/// Whether Pacific Time is observing daylight saving on the given +/// calendar date (year, month=1..12, day=1..31). US DST window: +/// 2nd Sunday of March through 1st Sunday of November. The transition +/// hour itself (02:00 local) is approximated to whole-day boundaries — +/// good enough for a daily-quota countdown. 
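Before `pacific_is_dst` itself (next), a worked example of the two-pass conversion — test-shaped sketch; the epoch constants were derived by hand from the 1970 epoch, so treat them as illustrative:

```rust
// Both timestamps cross PT midnight, so the PT day key lags the UTC
// date by one. Uses the helpers defined in this file.
#[test]
fn pt_day_key_crosses_midnight_correctly() {
    // 2026-06-15T05:00:00Z = 1_781_499_600 → 2026-06-14 22:00 PDT (UTC-7).
    let (y, m, d) = unix_to_ymd_utc(unix_to_pt_seconds(1_781_499_600));
    assert_eq!((y, m, d), (2026, 6, 14));
    // 2026-01-15T06:00:00Z = 1_768_456_800 → 2026-01-14 22:00 PST (UTC-8).
    let (y, m, d) = unix_to_ymd_utc(unix_to_pt_seconds(1_768_456_800));
    assert_eq!((y, m, d), (2026, 1, 14));
}
```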
+fn pacific_is_dst(year: i64, month: u32, day: u32) -> bool { + if month < 3 || month > 11 { + return false; + } + if month > 3 && month < 11 { + return true; + } + if month == 3 { + let dst_start = nth_sunday_of_month(year, 3, 2); + day >= dst_start + } else { + // month == 11 + let dst_end = nth_sunday_of_month(year, 11, 1); + day < dst_end + } +} + +/// Day-of-month for the Nth Sunday (1-indexed) of (year, month). Uses +/// Sakamoto's method for the month's-1st day-of-week, then offsets to +/// the desired Sunday. Pure arithmetic, no calendar tables. +fn nth_sunday_of_month(year: i64, month: u32, nth: u32) -> u32 { + // Sakamoto's day-of-week. 0 = Sunday. + static T: [i64; 12] = [0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4]; + let y = if month < 3 { year - 1 } else { year }; + let m = month as i64; + let dow_of_1st = + ((y + y / 4 - y / 100 + y / 400 + T[(m - 1) as usize] + 1).rem_euclid(7)) as u32; + let first_sunday = if dow_of_1st == 0 { 1 } else { 8 - dow_of_1st }; + first_sunday + (nth - 1) * 7 +} + /// Convert a Unix timestamp (seconds since 1970-01-01 UTC) to a /// (year, month, day) tuple, UTC. Standalone so we can stay /// std-only — no chrono/time/jiff dependency pulled for one caller. @@ -2012,15 +2290,18 @@ pub struct StatsSnapshot { pub cache_bytes: usize, pub blacklisted_scripts: usize, pub total_scripts: usize, - /// Relay calls attributed to the current UTC day. Resets at 00:00 UTC. - /// This is what-this-process-has-done today, not the Google-side bucket. + /// Relay calls attributed to the current Pacific Time day. Resets + /// at 00:00 PT (midnight Pacific) — matches Apps Script's actual + /// quota reset cadence (#230, #362). This is what-this-process- + /// has-done today, not the Google-side bucket. pub today_calls: u64, - /// Response bytes from relay calls attributed to the current UTC day. + /// Response bytes from relay calls attributed to the current PT day. pub today_bytes: u64, - /// "YYYY-MM-DD" of the day `today_calls` / `today_bytes` refer to. - /// Useful for cross-referencing against Google's dashboard. + /// "YYYY-MM-DD" of the PT day `today_calls` / `today_bytes` refer + /// to. Useful for cross-referencing against Google's dashboard, + /// which is also PT-aligned. pub today_key: String, - /// Seconds until the next 00:00 UTC rollover. Convenient for the UI + /// Seconds until the next 00:00 PT rollover. Convenient for the UI /// to render "Resets in Xh Ym" without importing time libraries. pub today_reset_secs: u64, } @@ -2092,6 +2373,11 @@ fn looks_like_quota_error(msg: &str) -> bool { || lower.contains("rate limit") || lower.contains("too many times") || lower.contains("service invoked") + || lower.contains("bandwidth") + || lower.contains("bandbreitenkontingent") + || lower.contains("datenübertragungsrate") + || lower.contains("transfer rate") + || lower.contains("limit exceeded") } fn mask_script_id(id: &str) -> String { @@ -2227,12 +2513,47 @@ mod tests { } #[test] - fn seconds_until_utc_midnight_is_bounded() { - let n = seconds_until_utc_midnight(); + fn seconds_until_pacific_midnight_is_bounded() { + let n = seconds_until_pacific_midnight(); // Must be in (0, 86400] for any valid system clock. assert!(n > 0 && n <= 86_400); } + #[test] + fn nth_sunday_of_month_anchors() { + // Spot-check Sakamoto's day-of-week + offset arithmetic against + // a few known Sundays. Mistakes here would silently shift the + // DST transition by ±1 week. + // March 2026: 2nd Sunday is March 8 (Sun Mar 1, Sun Mar 8). 
+ assert_eq!(nth_sunday_of_month(2026, 3, 2), 8); + // November 2026: 1st Sunday is November 1 (Sun Nov 1). + assert_eq!(nth_sunday_of_month(2026, 11, 1), 1); + // March 2024: 2nd Sunday is March 10 (Sun Mar 3, Sun Mar 10). + assert_eq!(nth_sunday_of_month(2024, 3, 2), 10); + // November 2024: 1st Sunday is November 3. + assert_eq!(nth_sunday_of_month(2024, 11, 1), 3); + // March 2027: 2nd Sunday is March 14. + assert_eq!(nth_sunday_of_month(2027, 3, 2), 14); + } + + #[test] + fn pacific_dst_window_anchors() { + // Outside the DST window: PST. + assert!(!pacific_is_dst(2026, 1, 15)); + assert!(!pacific_is_dst(2026, 12, 25)); + assert!(!pacific_is_dst(2026, 2, 28)); + assert!(!pacific_is_dst(2026, 11, 5)); // first Sun of Nov 2026 = Nov 1; Nov 5 is past + // Inside: PDT. + assert!(pacific_is_dst(2026, 6, 1)); + assert!(pacific_is_dst(2026, 9, 30)); + // Boundary: March 8, 2026 (DST start day) and after = PDT. + assert!(!pacific_is_dst(2026, 3, 7)); + assert!(pacific_is_dst(2026, 3, 8)); + // Boundary: Oct 31 = PDT, Nov 1 = first Sunday = PST flips on. + assert!(pacific_is_dst(2026, 10, 31)); + assert!(!pacific_is_dst(2026, 11, 1)); + } + #[test] fn filter_forwarded_headers_strips_identity_revealing_headers() { // Issue #104: any proxy/extension that inserts these must not @@ -2428,6 +2749,59 @@ mod tests { assert_eq!(parse_content_range_total(&headers), None); } + #[test] + fn validate_probe_range_accepts_decoded_full_entity_body_mismatch() { + let mut raw = b"HTTP/1.1 206 Partial Content\r\n\ +Content-Range: bytes 0-11247/11248\r\n\ +Content-Type: text/javascript\r\n\ +Vary: Accept-Encoding\r\n\ +Content-Length: 45812\r\n\r\n" + .to_vec(); + raw.extend(std::iter::repeat(b'x').take(45_812)); + + let (status, headers, body) = split_response(&raw).unwrap(); + assert_eq!( + validate_probe_range(status, &headers, body, RANGE_PARALLEL_CHUNK_BYTES - 1), + Some(ContentRange { + start: 0, + end: 11_247, + total: 11_248, + }), + ); + + let rewritten = rewrite_206_to_200(&raw); + let (status, headers, body) = split_response(&rewritten).unwrap(); + assert_eq!(status, 200); + assert_eq!(body.len(), 45_812); + assert!(!headers + .iter() + .any(|(k, _)| k.eq_ignore_ascii_case("content-range"))); + assert_eq!( + headers + .iter() + .find(|(k, _)| k.eq_ignore_ascii_case("content-length")) + .map(|(_, v)| v.as_str()), + Some("45812"), + ); + } + + #[test] + fn validate_probe_range_rejects_missing_content_range() { + assert!(validate_probe_range(206, &[], b"hello", 4).is_none()); + } + + #[test] + fn validate_probe_range_rejects_nonzero_start() { + let headers = vec![("Content-Range".to_string(), "bytes 1-4/20".to_string())]; + assert!(validate_probe_range(206, &headers, b"hell", 4).is_none()); + } + + #[test] + fn validate_probe_range_rejects_end_past_requested_end() { + let headers = vec![("Content-Range".to_string(), "bytes 0-5/20".to_string())]; + assert!(validate_probe_range(206, &headers, b"hello!", 4).is_none()); + } + #[test] fn validate_probe_range_rejects_body_length_mismatch() { let headers = vec![("Content-Range".to_string(), "bytes 0-4/20".to_string())]; @@ -2444,6 +2818,16 @@ mod tests { assert_eq!(checked_stitched_range_capacity(u64::MAX), None); } + #[test] + fn extract_exact_range_body_rejects_body_length_mismatch() { + let raw = b"HTTP/1.1 206 Partial Content\r\n\ +Content-Range: bytes 5-9/20\r\n\ +Content-Length: 3\r\n\r\n\ +hey"; + let err = extract_exact_range_body(raw, 5, 9, 20).unwrap_err(); + assert_eq!(err, "Content-Range/body length mismatch"); + } + #[test] fn 
extract_exact_range_body_rejects_mismatched_content_range() { let raw = b"HTTP/1.1 206 Partial Content\r\n @@ -2476,6 +2860,9 @@ hello"; assert!(!should_blacklist(200, "")); assert!(!should_blacklist(502, "bad gateway")); assert!(looks_like_quota_error("Exception: Service invoked too many times per day")); + assert!(looks_like_quota_error( + "Exception: Bandbreitenkontingent überschritten: https://example.com. Verringern Sie die Datenübertragungsrate." + )); assert!(!looks_like_quota_error("bad url")); } diff --git a/src/main.rs b/src/main.rs index 92bf7f46..fe33d160 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use tokio::sync::Mutex; use tracing_subscriber::EnvFilter; -use mhrv_rs::cert_installer::{install_ca, is_ca_trusted}; +use mhrv_rs::cert_installer::{install_ca, is_ca_trusted, reconcile_sudo_environment, remove_ca}; use mhrv_rs::config::Config; use mhrv_rs::mitm::{MitmCertManager, CA_CERT_FILE}; use mhrv_rs::proxy_server::ProxyServer; @@ -18,6 +18,7 @@ const VERSION: &str = env!("CARGO_PKG_VERSION"); struct Args { config_path: Option<PathBuf>, install_cert: bool, + remove_cert: bool, no_cert_check: bool, command: Command, } @@ -44,6 +45,11 @@ USAGE: OPTIONS: -c, --config PATH Path to config.json (default: ./config.json) --install-cert Install the MITM CA certificate and exit + --remove-cert Remove the MITM CA from the OS trust store (verified by + name), then delete the on-disk ca/ directory and exit. + NSS cleanup (Firefox/Chrome) is best-effort. A fresh CA + is generated on next run. config.json and your Apps + Script deployment are untouched. --no-cert-check Skip the auto-install-if-untrusted check on startup -h, --help Show this message -V, --version Show version @@ -58,6 +64,7 @@ ENV: fn parse_args() -> Result<Args, String> { let mut config_path: Option<PathBuf> = None; let mut install_cert = false; + let mut remove_cert = false; let mut no_cert_check = false; let mut command = Command::Serve; @@ -102,13 +109,18 @@ fn parse_args() -> Result<Args, String> { config_path = Some(PathBuf::from(v)); } "--install-cert" => install_cert = true, + "--remove-cert" => remove_cert = true, "--no-cert-check" => no_cert_check = true, other => return Err(format!("unknown argument: {}", other)), } } + if install_cert && remove_cert { + return Err("--install-cert and --remove-cert cannot be combined".into()); + } Ok(Args { config_path, install_cert, + remove_cert, no_cert_check, command, }) @@ -127,6 +139,14 @@ async fn main() -> ExitCode { // Install default rustls crypto provider (ring). let _ = rustls::crypto::ring::default_provider().install_default(); + // Must run before anything else reads HOME / USER / data_dir — if + // the user ran `sudo ./mhrv-rs ...`, this re-points HOME at the + // invoking user's home so user-scoped cert paths (Firefox profiles, + // macOS login keychain, the mhrv-rs data dir) are not silently + // operated against root's home. No-op on Windows and for non-sudo + // invocations. + reconcile_sudo_environment(); + let args = match parse_args() { Ok(a) => a, Err(e) => { @@ -136,6 +156,29 @@ async fn main() -> ExitCode { } }; + // --remove-cert runs without a valid config — the CA files may be + // the only thing present in the data dir. `config.json` and the + // Apps Script deployment are intentionally untouched: the user does + // not have to redeploy Code.gs after regenerating the CA.
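Before the `--remove-cert` gate below, one aside on the sudo reconciliation wired in above: `reconcile_sudo_environment` bottoms out in the pure helper the `should_reconcile_for` tests earlier pin. The real body isn't in this diff; a sketch consistent with that test contract:

```rust
// Sketch only — the contract fixed by the tests, not the actual body:
// only euid == 0 plus a non-empty, non-root SUDO_USER triggers HOME
// reconciliation.
fn should_reconcile_for(euid: u32, sudo_user: Option<&str>) -> Option<&str> {
    if euid != 0 {
        return None; // plain user, even with a stray SUDO_USER (`sudo -E` leftovers)
    }
    match sudo_user {
        Some(u) if !u.is_empty() && u != "root" => Some(u),
        _ => None, // real root login, or sudo-as-root: HOME stays /root
    }
}
```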
+ if args.remove_cert { + init_logging("info"); + let base = mhrv_rs::data_dir::data_dir(); + match remove_ca(&base) { + Ok(outcome) => { + tracing::info!("{}", outcome.summary()); + tracing::info!( + "A fresh CA will be generated next time the proxy starts — \ + run --install-cert then to re-trust it." + ); + return ExitCode::SUCCESS; + } + Err(e) => { + eprintln!("remove failed: {}", e); + return ExitCode::FAILURE; + } + } + } + // --install-cert can run without a valid config — only needs the CA file. if args.install_cert { init_logging("info"); diff --git a/src/proxy_server.rs b/src/proxy_server.rs index 41063057..a3a232a0 100644 --- a/src/proxy_server.rs +++ b/src/proxy_server.rs @@ -47,6 +47,18 @@ const SNI_REWRITE_SUFFIXES: &[&str] = &[ "youtu.be", "youtube-nocookie.com", "ytimg.com", + // NOTE on `googlevideo.com`: v1.7.4 (#275) added this here on the + // theory that video chunks should bypass the Apps Script relay. + // **Reverted in v1.7.6** — multiple users (#275 amirabbas117, #281 + // mrerf) reported total YouTube breakage after v1.7.4. Root cause + // is that googlevideo.com is served by Google's separate "EVA" + // edge IPs, not the regular GFE IPs that the user's `google_ip` + // typically points at. SNI-rewriting `googlevideo.com:443` to a + // GFE IP got TLS handshake / wrong-cert errors for those users. + // Pre-v1.7.4 behaviour (chunks via the Apps Script relay path — + // slow but reliable on every GFE IP) is restored. If we ever want + // direct googlevideo.com routing, it needs a separate config knob + // that lets users specify their EVA edge IP independently. // Google Video Transport CDN — YouTube video chunks, Chrome // auto-updates, Google Play Store downloads. The single biggest // gap vs the upstream Python port: without these in the list @@ -72,27 +84,100 @@ const SNI_REWRITE_SUFFIXES: &[&str] = &[ "blogger.com", ]; -/// YouTube-family suffixes. Extracted so `youtube_via_relay` config can -/// pull them out of the SNI-rewrite dispatch at runtime. -const YOUTUBE_SNI_SUFFIXES: &[&str] = &[ +/// YouTube hosts that should be routed through the Apps Script relay +/// when `youtube_via_relay` is enabled — the API + HTML surfaces where +/// Restricted Mode is actually enforced (via the SNI=www.google.com +/// edge looking at the request). Issue #102 / #275. +/// +/// Deliberately narrower than the YouTube section of +/// `SNI_REWRITE_SUFFIXES`: +/// - `youtube.com` / `youtu.be` / `youtube-nocookie.com`: HTML pages +/// and player frames. These trigger Restricted Mode if served via +/// the SNI rewrite, so when the flag is on we relay them. +/// - `youtubei.googleapis.com`: the YouTube data API the player +/// queries for video metadata + manifest. Restricted Mode also +/// gates video availability here. Without this entry, the JSON +/// RPC layer would still hit the SNI-rewrite tunnel via the +/// broader `googleapis.com` suffix — the user-visible symptom of +/// that miss is "youtube_via_relay flips on but Restricted Mode +/// stays sticky on some videos." +/// +/// **NOT** in this list (intentional, was a regression in #275): +/// - `ytimg.com`: thumbnails. No Restricted Mode logic on a static +/// image CDN; routing through Apps Script makes thumbnails slow +/// for zero gain. +/// - `googlevideo.com`: video chunk CDN. Routing through Apps Script +/// means every chunk eats Apps Script quota *and* risks the 6-min +/// execution cap aborting long videos mid-playback. +/// - `ggpht.com`: channel/profile images, same reasoning as ytimg. 
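Before the `YOUTUBE_RELAY_HOSTS` list itself (next), a test-shaped sketch of the routing outcomes this carve-out is meant to produce — assertions inferred from the dispatch code shown just below, including the documented (but not excerpted here) presence of `googleapis.com` in `SNI_REWRITE_SUFFIXES`:

```rust
#[test]
fn youtube_relay_carveout_beats_broad_suffixes() {
    // API host is relayed (false = no SNI rewrite) when the flag is on…
    assert!(!matches_sni_rewrite("youtubei.googleapis.com", true));
    // …but still SNI-rewritten when the flag is off (broad googleapis.com suffix).
    assert!(matches_sni_rewrite("youtubei.googleapis.com", false));
    // Thumbnails stay on the SNI-rewrite path either way — ytimg.com is
    // deliberately NOT in YOUTUBE_RELAY_HOSTS.
    assert!(matches_sni_rewrite("i.ytimg.com", true));
}
```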
+const YOUTUBE_RELAY_HOSTS: &[&str] = &[ + "youtube.com", + "youtu.be", + "youtube-nocookie.com", + "youtubei.googleapis.com", +]; + +/// Built-in list of DNS-over-HTTPS endpoints. CONNECTs to these (when +/// `tunnel_doh` is left at the default of `false`, i.e. bypass enabled) +/// skip the Apps Script tunnel and exit via plain TCP. Mix of the +/// browser-pinned variants Chrome/Brave/Edge/Firefox/Safari use and the +/// well-known public DoH providers users wire up by hand. Suffix +/// matching means we don't need to enumerate every tenant subdomain +/// (e.g. `*.cloudflare-dns.com` covers Workers-hosted DoH too). +/// +/// Entries are matched case-insensitively. Both exact-match (`dns.google`) +/// and dot-anchored suffix-match (a host whose suffix is `.cloudflare-dns.com` +/// or which equals `cloudflare-dns.com`) are accepted — same shape as +/// `passthrough_hosts`'s `.foo` rule. +const DEFAULT_DOH_HOSTS: &[&str] = &[ + // The base SLD covers every tenant subdomain via suffix matching; + // the browser-pinned variants below are listed for grep/discovery + // (so a user searching "chrome.cloudflare-dns.com" finds this list) + // and are technically redundant under cloudflare-dns.com. + "cloudflare-dns.com", + "chrome.cloudflare-dns.com", + "mozilla.cloudflare-dns.com", + "1dot1dot1dot1.cloudflare-dns.com", + "dns.google", + "dns.google.com", + "dns.quad9.net", + "dns11.quad9.net", + "dns.adguard-dns.com", + "unfiltered.adguard-dns.com", + "family.adguard-dns.com", + "dns.nextdns.io", + "doh.opendns.com", + "doh.cleanbrowsing.org", + "doh.dns.sb", + "dns0.eu", + "dns.alidns.com", + "doh.pub", + "dns.mullvad.net", ]; fn matches_sni_rewrite(host: &str, youtube_via_relay: bool) -> bool { let h = host.to_ascii_lowercase(); let h = h.trim_end_matches('.'); + + // YouTube relay carve-out runs FIRST so it wins over the broad + // `googleapis.com` suffix that would otherwise pull + // `youtubei.googleapis.com` into the SNI-rewrite path. The earlier + // implementation iterated SNI_REWRITE_SUFFIXES with a filter, which + // works for sibling entries (e.g. `youtube.com` in both lists) but + // not for nested ones (`youtubei.googleapis.com` matches the broad + // `googleapis.com` even when its specific entry is filtered out). + // The short-circuit here is unconditional — we don't need to check + // SNI rewrite once we've decided this host goes to the relay. + if youtube_via_relay { + for s in YOUTUBE_RELAY_HOSTS { + if h == *s || h.ends_with(&format!(".{}", s)) { + return false; + } + } + } + SNI_REWRITE_SUFFIXES .iter() - .filter(|s| { - // If the user opted into youtube_via_relay, skip YouTube - // suffixes so they fall through to the Apps Script relay - // path. See config.rs `youtube_via_relay` docs for the - // trade-off. Issue #102. - !(youtube_via_relay && YOUTUBE_SNI_SUFFIXES.contains(s)) - }) .any(|s| h == *s || h.ends_with(&format!(".{}", s))) } @@ -131,6 +216,8 @@ pub struct ProxyServer { mitm: Arc<Mutex<MitmCertManager>>, rewrite_ctx: Arc<RewriteCtx>, tunnel_mux: Option<Arc<TunnelMux>>, + coalesce_step_ms: u64, + coalesce_max_ms: u64, } pub struct RewriteCtx { @@ -148,6 +235,51 @@ pub struct RewriteCtx { /// and pass through as plain TCP (optionally via upstream_socks5). /// See config.rs `passthrough_hosts` for matching rules. Issues #39, #127. pub passthrough_hosts: Vec<String>, + /// If true, drop SOCKS5 UDP datagrams destined for port 443 so + /// callers fall back to TCP/HTTPS. See config.rs `block_quic` for + /// the trade-off. Issue #213.
+ pub block_quic: bool, + /// If true, route DoH CONNECTs around the Apps Script tunnel via + /// plain TCP. Default true via `Config::tunnel_doh = false`. See + /// `DEFAULT_DOH_HOSTS` and `matches_doh_host` for matching, and + /// config.rs `tunnel_doh` for the trade-off. + pub bypass_doh: bool, + /// User-supplied DoH hostnames added to the built-in default list. + /// Same matching semantics as `passthrough_hosts`. + pub bypass_doh_hosts: Vec<String>, +} + +/// True if `host` matches a known DoH endpoint — either the built-in +/// `DEFAULT_DOH_HOSTS` list or a user-supplied entry in `extra`. Match +/// is case-insensitive, and entries match either exactly OR as a +/// dot-anchored suffix unconditionally (no leading-dot requirement, +/// unlike `passthrough_hosts`). The DoH list is *always* about a +/// service — every legitimate tenant subdomain of `cloudflare-dns.com` +/// or a user's private `doh.acme.test` is a DoH endpoint, so requiring +/// users to remember to write `.doh.acme.test` would be a footgun +/// without an obvious benefit. +fn host_matches_doh_entry(h: &str, entry: &str) -> bool { + let e = entry.trim().trim_end_matches('.').to_ascii_lowercase(); + let e = e.strip_prefix('.').unwrap_or(&e); + if e.is_empty() { + return false; + } + h == e || h.ends_with(&format!(".{}", e)) +} + +pub fn matches_doh_host(host: &str, extra: &[String]) -> bool { + let h = host.to_ascii_lowercase(); + let h = h.trim_end_matches('.'); + if h.is_empty() { + return false; + } + if DEFAULT_DOH_HOSTS + .iter() + .any(|s| host_matches_doh_entry(h, s)) + { + return true; + } + extra.iter().any(|s| host_matches_doh_entry(h, s)) } /// True if `host` matches any entry in the user's passthrough list. @@ -207,6 +339,20 @@ impl ProxyServer { }; let tls_connector = TlsConnector::from(Arc::new(tls_config)); + // Surface a config combo that is otherwise silently inert: extras + // listed under `bypass_doh_hosts` only take effect when the bypass + // itself is on. A user who set `tunnel_doh: true` *and* populated + // the extras list almost certainly didn't mean to disable the + // feature their custom hosts feed into. + if config.tunnel_doh && !config.bypass_doh_hosts.is_empty() { + tracing::warn!( + "config: bypass_doh_hosts has {} entries but tunnel_doh=true — \ + the bypass is off, so the extras have no effect. Set \ + tunnel_doh=false (or omit it) to use them.", + config.bypass_doh_hosts.len() + ); + } + let rewrite_ctx = Arc::new(RewriteCtx { google_ip: config.google_ip.clone(), front_domain: config.front_domain.clone(), @@ -216,6 +362,9 @@ impl ProxyServer { mode, youtube_via_relay: config.youtube_via_relay, passthrough_hosts: config.passthrough_hosts.clone(), + block_quic: config.block_quic, + bypass_doh: !config.tunnel_doh, + bypass_doh_hosts: config.bypass_doh_hosts.clone(), }); let socks5_port = config.socks5_port.unwrap_or(config.listen_port + 1); @@ -228,6 +377,8 @@ impl ProxyServer { mitm, rewrite_ctx, tunnel_mux: None, // initialized in run() inside the tokio runtime + coalesce_step_ms: if config.coalesce_step_ms > 0 { config.coalesce_step_ms as u64 } else { 40 }, + coalesce_max_ms: if config.coalesce_max_ms > 0 { config.coalesce_max_ms as u64 } else { 1000 }, }) } @@ -241,7 +392,7 @@ impl ProxyServer { // Initialize TunnelMux inside the runtime (tokio::spawn requires it).
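A test-shaped sketch of the `matches_doh_host` semantics documented above (all assertions follow directly from the code in this hunk):

```rust
#[test]
fn doh_matching_is_suffix_anchored_and_case_insensitive() {
    // Built-in list: the browser-pinned variant is covered by the base SLD.
    assert!(matches_doh_host("Chrome.Cloudflare-DNS.com", &[]));
    // Dot-anchored: a host that merely *ends with* the string must not match.
    assert!(!matches_doh_host("evil-cloudflare-dns.com", &[]));
    // User extras match themselves and their subdomains, no leading dot needed.
    let extra = vec!["doh.acme.test".to_string()];
    assert!(matches_doh_host("doh.acme.test", &extra));
    assert!(matches_doh_host("tenant.doh.acme.test", &extra));
}
```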
if self.rewrite_ctx.mode == Mode::Full { if let Some(f) = self.fronter.as_ref() { - self.tunnel_mux = Some(TunnelMux::start(f.clone())); + self.tunnel_mux = Some(TunnelMux::start(f.clone(), self.coalesce_step_ms, self.coalesce_max_ms)); } } @@ -261,12 +412,42 @@ // doesn't pay a fresh TLS handshake to Google edge. Best-effort; // failures are logged and ignored. Skipped in `google_only` — there // is no fronter to warm. + // + // Sized to roughly match a browser's parallel-connection burst at + // startup. The previous fixed `3` was fine for a single deployment + // but left requests 4-10 of the opening burst paying a cold TLS + // handshake each (~300ms). Scaling with deployment count gives + // multi-account configs a proportionally warmer pool, capped so + // single-deployment users don't hammer Google edge unnecessarily. if let Some(warm_fronter) = self.fronter.clone() { + let n = warm_fronter.num_scripts().clamp(6, 16); tokio::spawn(async move { - warm_fronter.warm(3).await; + warm_fronter.warm(n).await; }); } + // Apps Script container keepalive. `warm()` above keeps the TCP + // pool warm at startup, but the V8 container behind UrlFetchApp + // goes cold after ~5min idle and costs 1-3s to wake. A periodic + // HEAD ping prevents the cold-start lag on the first request + // after a quiet pause (most visible as YouTube player stalls). + // Skipped in google_only mode for the same reason as warm — + // there's no fronter to ping. + // + // The handle is captured (not fire-and-forget) so the shutdown + // arm of the select! below can abort it. Without that, hitting + // Stop in the UI would leave the keepalive holding an + // Arc<DomainFronter> on stale config and pinging Apps Script + // every 240s — same class of bug that issue #99 hit for the + // accept loops. + let keepalive_task = if let Some(keepalive_fronter) = self.fronter.clone() { + tokio::spawn(async move { + keepalive_fronter.run_h1_keepalive().await; + }) + } else { + tokio::spawn(async move { std::future::pending::<()>().await }) + }; + let stats_task = if let Some(stats_fronter) = self.fronter.clone() { tokio::spawn(async move { let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60)); @@ -374,6 +555,7 @@ _ = &mut shutdown_rx => { tracing::info!("Shutdown signal received, stopping listeners"); stats_task.abort(); + keepalive_task.abort(); http_task.abort(); socks_task.abort(); } @@ -447,8 +629,26 @@ async fn handle_http_client( tunnel_mux: Option<Arc<TunnelMux>>, ) -> std::io::Result<()> { let (head, leftover) = match read_http_head(&mut sock).await? { - Some(v) => v, - None => return Ok(()), + HeadReadResult::Got { head, leftover } => (head, leftover), + HeadReadResult::Closed => return Ok(()), + HeadReadResult::Oversized => { + // Reply with 431 instead of just dropping the socket so the + // browser shows a real error rather than retrying the same + // oversized request in a loop.
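Stepping back to the coalesce knobs wired into `TunnelMux::start` above: the mux internals are not in this diff, but the behavior documented in config.rs (re-arm a step timer on each arrival, hard-cap the total wait) amounts to the following loop — a sketch under assumed channel plumbing, not the real implementation:

```rust
// Hypothetical op source and function name; only the step/max semantics
// are taken from the config docs.
async fn coalesce_ops<Op>(
    rx: &mut tokio::sync::mpsc::Receiver<Op>,
    step: std::time::Duration, // coalesce_step_ms
    max: std::time::Duration,  // coalesce_max_ms
) -> Vec<Op> {
    let mut batch = Vec::new();
    let Some(first) = rx.recv().await else {
        return batch; // channel closed, nothing to batch
    };
    batch.push(first);
    let deadline = tokio::time::Instant::now() + max; // hard cap from first op
    loop {
        tokio::select! {
            // Quiet for one full step: fire the batch. Recreating the
            // sleep each iteration is exactly the reset-on-arrival rule.
            _ = tokio::time::sleep(step) => break,
            // Total wait exhausted: fire even under a steady op stream.
            _ = tokio::time::sleep_until(deadline) => break,
            op = rx.recv() => match op {
                Some(op) => batch.push(op), // arrival re-arms the step timer
                None => break,
            },
        }
    }
    batch
}
```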
@@ -447,8 +629,26 @@ async fn handle_http_client(
     tunnel_mux: Option<Arc<TunnelMux>>,
 ) -> std::io::Result<()> {
     let (head, leftover) = match read_http_head(&mut sock).await? {
-        Some(v) => v,
-        None => return Ok(()),
+        HeadReadResult::Got { head, leftover } => (head, leftover),
+        HeadReadResult::Closed => return Ok(()),
+        HeadReadResult::Oversized => {
+            // Reply with 431 instead of just dropping the socket so the
+            // browser shows a real error rather than retrying the same
+            // oversized request in a loop.
+            tracing::warn!(
+                "request head exceeds {} bytes — refusing with 431",
+                MAX_HEADER_BYTES
+            );
+            let _ = sock
+                .write_all(
+                    b"HTTP/1.1 431 Request Header Fields Too Large\r\n\
+                      Connection: close\r\n\
+                      Content-Length: 0\r\n\r\n",
+                )
+                .await;
+            let _ = sock.flush().await;
+            return Ok(());
+        }
     };
 
     let (method, target, _version, _headers) = parse_request_head(&head)
@@ -456,30 +656,40 @@ async fn handle_http_client(
 
     if method.eq_ignore_ascii_case("CONNECT") {
         let (host, port) = parse_host_port(&target);
+        // Mirror the SOCKS5 short-circuit: if the tunnel-node just failed
+        // this (host, port) as unreachable, return 502 immediately rather
+        // than acknowledging the CONNECT and blowing tunnel quota on a
+        // guaranteed retry. See `TunnelMux::is_unreachable` for context.
+        if let Some(ref mux) = tunnel_mux {
+            if mux.is_unreachable(&host, port) {
+                tracing::info!("CONNECT {}:{} (negative-cached, refusing)", host, port);
+                let _ = sock
+                    .write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\nConnection: close\r\n\r\n")
+                    .await;
+                let _ = sock.flush().await;
+                return Ok(());
+            }
+        }
         sock.write_all(b"HTTP/1.1 200 Connection Established\r\n\r\n")
             .await?;
         sock.flush().await?;
 
         dispatch_tunnel(sock, host, port, fronter, mitm, rewrite_ctx, tunnel_mux).await
     } else {
-        // Plain HTTP proxy request (e.g. `GET http://…`). The Apps Script
-        // relay is the only code path that can fulfil this, so in google_only
-        // bootstrap mode we return a clear 502 instead.
+        // Plain HTTP proxy request (e.g. `GET http://…`).
+        //
+        // apps_script mode: relay through the Apps Script fronter (which
+        // is the whole point of the relay).
+        //
+        // google_only bootstrap mode: no fronter exists, so the request is
+        // passed through as direct TCP. Same contract as `dispatch_tunnel`
+        // honors for CONNECT in google_only — anything not on the Google
+        // edge is forwarded direct (or via `upstream_socks5`) so the
+        // user's browser still works while they finish setting up Apps
+        // Script. Issue: typing a bare `http://example.com` URL used to
+        // return a 502 here even though `https://example.com` (CONNECT)
+        // worked fine.
        match fronter {
            Some(f) => do_plain_http(sock, &head, &leftover, f).await,
-            None => {
-                let _ = sock
-                    .write_all(
-                        b"HTTP/1.1 502 Bad Gateway\r\n\
-                          Content-Type: text/plain; charset=utf-8\r\n\
-                          Content-Length: 120\r\n\
-                          Connection: close\r\n\r\n\
-                          google_only mode: plain HTTP proxy requests are not supported. \
-                          Browse https over CONNECT, or switch to apps_script mode.",
-                    )
-                    .await;
-                let _ = sock.flush().await;
-                Ok(())
-            }
+            None => do_plain_http_passthrough(sock, &head, &leftover, &rewrite_ctx).await,
        }
    }
 }
@@ -557,6 +767,21 @@ async fn handle_socks5_client(
         return handle_socks5_udp_associate(sock, rewrite_ctx, tunnel_mux).await;
     }
 
+    // Negative-cache short-circuit: if the tunnel-node just failed to reach
+    // this exact (host, port) with `Network is unreachable` / `No route to
+    // host`, reply 0x04 (Host unreachable) immediately. Saves a 1.5–2s tunnel
+    // round-trip on guaranteed-failing targets — the IPv6 probe retry loop
+    // is the main offender on devices without IPv6.
+    if let Some(ref mux) = tunnel_mux {
+        if mux.is_unreachable(&host, port) {
+            tracing::info!("SOCKS5 CONNECT -> {}:{} (negative-cached, refusing)", host, port);
+            sock.write_all(&[0x05, 0x04, 0x00, 0x01, 0, 0, 0, 0, 0, 0])
+                .await?;
+            sock.flush().await?;
+            return Ok(());
+        }
+    }
+
     tracing::info!("SOCKS5 CONNECT -> {}:{}", host, port);
 
     // Success reply with zeroed BND.
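For reference, the refusal written above is a fixed 10-byte RFC 1928 §6 reply. A sketch annotating each field (the helper name is hypothetical; the byte values match the array in the code):

```rust
// RFC 1928 §6 reply layout: VER REP RSV ATYP BND.ADDR BND.PORT.
// REP = 0x04 is "Host unreachable", matching the refusal above.
fn socks5_host_unreachable_reply() -> [u8; 10] {
    [
        0x05, // VER: SOCKS5
        0x04, // REP: host unreachable
        0x00, // RSV: reserved, always zero
        0x01, // ATYP: IPv4 address follows
        0, 0, 0, 0, // BND.ADDR: zeroed, no meaningful bound address
        0, 0, // BND.PORT: zeroed
    ]
}

fn main() {
    assert_eq!(
        socks5_host_unreachable_reply(),
        [0x05, 0x04, 0x00, 0x01, 0, 0, 0, 0, 0, 0]
    );
}
```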
@@ -784,6 +1009,30 @@ async fn handle_socks5_udp_associate(
                     continue;
                 };
 
+                // Issue #213: client-side QUIC block. UDP/443 is
+                // HTTP/3 — drop the datagram silently so the client
+                // stack retries a couple of times and then falls back
+                // to TCP/HTTPS, which goes through the regular CONNECT
+                // path. Skipping this at the SOCKS5 layer (rather than
+                // letting it hit the tunnel-node) avoids paying the
+                // 200–500 ms tunnel-node round-trip per dropped QUIC
+                // datagram, which would otherwise compound during the
+                // 1–3 retries before the browser falls back.
+                //
+                // Silent drop instead of an explicit error reply: the
+                // SOCKS5 UDP wire has no "destination unreachable"
+                // datagram — `0x04` only exists in TCP CONNECT replies
+                // (RFC 1928 §6). The browser's QUIC stack already has
+                // a "no response → fall back" timeout, so silent drop
+                // is the contractually correct shape.
+                if rewrite_ctx.block_quic && target.port == 443 {
+                    tracing::debug!(
+                        "udp dropped: block_quic=true, target {}:443",
+                        target.host
+                    );
+                    continue;
+                }
+
                 // RFC 1928 §6: lock to the first VALID datagram's source
                 // port. Subsequent datagrams must come from the same
                 // (ip, port) pair.
@@ -1190,6 +1439,28 @@ async fn dispatch_tunnel(
         return Ok(());
     }
 
+    // 0.5. DoH bypass. DNS-over-HTTPS is the dominant per-flow DNS cost
+    //      in Full mode (every browser name lookup costs a ~2 s Apps
+    //      Script round-trip), and the tunnel adds no privacy beyond
+    //      what DoH already provides. Route known DoH hosts directly.
+    //      Port-gated to 443 so a non-TLS CONNECT to e.g. `dns.google:80`
+    //      doesn't get diverted off-tunnel by accident.
+    //      See `DEFAULT_DOH_HOSTS` and config.rs `tunnel_doh`.
+    if rewrite_ctx.bypass_doh
+        && port == 443
+        && matches_doh_host(&host, &rewrite_ctx.bypass_doh_hosts)
+    {
+        let via = rewrite_ctx.upstream_socks5.as_deref();
+        tracing::info!(
+            "dispatch {}:{} -> raw-tcp ({}) (doh bypass)",
+            host,
+            port,
+            via.unwrap_or("direct")
+        );
+        plain_tcp_passthrough(sock, &host, port, via).await;
+        return Ok(());
+    }
+
     // 1. Full tunnel mode: ALL traffic goes through the batch multiplexer
     //    (Apps Script → tunnel node → real TCP). No MITM, no cert.
     if rewrite_ctx.mode == Mode::Full {
@@ -1499,14 +1770,35 @@ fn looks_like_http(first_bytes: &[u8]) -> bool {
 
 /// Read an HTTP head (request line + headers) up to the first \r\n\r\n.
 /// Returns (head_bytes, leftover_after_head). The leftover may contain part
 /// of the request body already received.
-async fn read_http_head(sock: &mut TcpStream) -> std::io::Result<Option<(Vec<u8>, Vec<u8>)>> {
+/// Maximum size of an HTTP request head (request line + all headers).
+///
+/// Set to match upstream Python's `MAX_HEADER_BYTES` (64 KB,
+/// masterking32/MasterHttpRelayVPN constants.py). Real browsers
+/// virtually never exceed ~16 KB; anything past 64 KB is either a
+/// buggy client or a deliberate slowloris-style header bomb.
+/// Previously 1 MB, which let a misbehaving client allocate a lot
+/// of memory before failing.
+const MAX_HEADER_BYTES: usize = 64 * 1024;
+
+/// Result of `read_http_head` / `read_http_head_io`.
+/// `Oversized` is distinct from other I/O errors so the caller can
+/// reply with `431 Request Header Fields Too Large` instead of just
+/// dropping the connection (which a browser would silently retry,
+/// reproducing the same problem).
+enum HeadReadResult {
+    Got { head: Vec<u8>, leftover: Vec<u8> },
+    Closed,
+    Oversized,
+}
+
+async fn read_http_head(sock: &mut TcpStream) -> std::io::Result<HeadReadResult> {
     let mut buf = Vec::with_capacity(4096);
     let mut tmp = [0u8; 4096];
     loop {
         let n = sock.read(&mut tmp).await?;
         if n == 0 {
             return if buf.is_empty() {
-                Ok(None)
+                Ok(HeadReadResult::Closed)
             } else {
                 Err(std::io::Error::new(
                     std::io::ErrorKind::UnexpectedEof,
@@ -1518,13 +1810,10 @@ async fn read_http_head(sock: &mut TcpStream) -> std::io::Result<Option<(Vec<u8>, Vec<u8>)>>
         if let Some(pos) = find_headers_end(&buf) {
             let head = buf[..pos].to_vec();
             let leftover = buf[pos..].to_vec();
-            return Ok(Some((head, leftover)));
+            return Ok(HeadReadResult::Got { head, leftover });
         }
-        if buf.len() > 1024 * 1024 {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                "headers too large",
-            ));
+        if buf.len() > MAX_HEADER_BYTES {
+            return Ok(HeadReadResult::Oversized);
         }
     }
 }
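The head-reading change reduces to a three-way decision per read iteration. The sketch below isolates that decision over an in-memory buffer, keeping the same check order as `read_http_head` (terminator first, cap second, so a head that completes in the same read as body bytes is never misclassified as oversized). The inline `\r\n\r\n` scan stands in for `find_headers_end`, whose exact offset convention isn't visible in this diff, so the `pos + 4` split here is an assumption:

```rust
const MAX_HEADER_BYTES: usize = 64 * 1024;

enum HeadScan {
    Complete(usize), // offset just past the \r\n\r\n terminator (this sketch's convention)
    NeedMore,        // keep reading from the socket
    Oversized,       // caller replies 431 and closes
}

fn scan_head(buf: &[u8]) -> HeadScan {
    // Terminator check first: a complete head that arrived alongside
    // body bytes must win over the size cap.
    if let Some(pos) = buf.windows(4).position(|w| w == b"\r\n\r\n") {
        return HeadScan::Complete(pos + 4);
    }
    if buf.len() > MAX_HEADER_BYTES {
        return HeadScan::Oversized;
    }
    HeadScan::NeedMore
}

fn main() {
    assert!(matches!(
        scan_head(b"GET / HTTP/1.1\r\nHost: a\r\n\r\n"),
        HeadScan::Complete(_)
    ));
    assert!(matches!(scan_head(b"GET / HTTP/1.1\r\nHost:"), HeadScan::NeedMore));
    assert!(matches!(
        scan_head(&vec![b'x'; MAX_HEADER_BYTES + 1]),
        HeadScan::Oversized
    ));
}
```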
@@ -1833,8 +2122,31 @@ where
     S: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
 {
     let (head, leftover) = match read_http_head_io(stream).await? {
-        Some(v) => v,
-        None => return Ok(false),
+        HeadReadResult::Got { head, leftover } => (head, leftover),
+        HeadReadResult::Closed => return Ok(false),
+        HeadReadResult::Oversized => {
+            // Inside MITM: same reasoning as the plaintext path. Return
+            // 431 over the decrypted stream so the browser surfaces a
+            // real error to the user instead of looping a connection
+            // reset, which was the symptom upstream caught (Apps Script
+            // ate malformed JSON when truncated header blocks were
+            // forwarded blindly).
+            tracing::warn!(
+                "MITM header block exceeds {} bytes — closing ({}:{})",
+                MAX_HEADER_BYTES,
+                host,
+                port
+            );
+            let _ = stream
+                .write_all(
+                    b"HTTP/1.1 431 Request Header Fields Too Large\r\n\
+                      Connection: close\r\n\
+                      Content-Length: 0\r\n\r\n",
+                )
+                .await;
+            let _ = stream.flush().await;
+            return Ok(false);
+        }
     };
 
     let (method, path, _version, headers) = match parse_request_head(&head) {
@@ -1862,7 +2174,7 @@ where
     // pourya-p's log in #64 showed the real Host header. Match every
     // subdomain of x.com here.
     let host_lower = host.to_ascii_lowercase();
-    let is_x_com = host_lower == "x.com" || host_lower.ends_with(".x.com");
+    let is_x_com = host_lower == "x.com"
+        || host_lower.ends_with(".x.com")
+        || host_lower == "twitter.com"
+        || host_lower.ends_with(".twitter.com");
     let path = if is_x_com && path.starts_with("/i/api/graphql/") && path.contains("?variables=") {
         match path.split_once('&') {
             Some((short, _)) => {
@@ -1955,7 +2267,7 @@
     Ok(!connection_close)
 }
 
-async fn read_http_head_io<S>(stream: &mut S) -> std::io::Result<Option<(Vec<u8>, Vec<u8>)>>
+async fn read_http_head_io<S>(stream: &mut S) -> std::io::Result<HeadReadResult>
 where
     S: tokio::io::AsyncRead + Unpin,
 {
@@ -1965,7 +2277,7 @@ where
         let n = stream.read(&mut tmp).await?;
         if n == 0 {
             return if buf.is_empty() {
-                Ok(None)
+                Ok(HeadReadResult::Closed)
             } else {
                 Err(std::io::Error::new(
                     std::io::ErrorKind::UnexpectedEof,
@@ -1977,13 +2289,10 @@ where
         if let Some(pos) = find_headers_end(&buf) {
             let head = buf[..pos].to_vec();
             let leftover = buf[pos..].to_vec();
-            return Ok(Some((head, leftover)));
+            return Ok(HeadReadResult::Got { head, leftover });
         }
-        if buf.len() > 1024 * 1024 {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                "headers too large",
-            ));
+        if buf.len() > MAX_HEADER_BYTES {
+            return Ok(HeadReadResult::Oversized);
         }
     }
 }
@@ -2203,6 +2512,174 @@ async fn do_plain_http(
     Ok(())
 }
 
+/// google_only mode plain-HTTP passthrough.
The CONNECT path already +/// falls through to direct TCP for non-Google-edge hosts in google_only; +/// this is the same idea for the `GET http://…` proxy form so a bare +/// `http://example.com` typed in the address bar doesn't 502. +/// +/// We rewrite the absolute-form request URI (`GET http://host/path`) to +/// origin form (`GET /path`), strip hop-by-hop headers, force +/// `Connection: close` so a keep-alive client can't pipeline a request +/// to a different host onto our spliced socket, then dial the origin +/// (honoring `upstream_socks5` if set) and splice both directions. +async fn do_plain_http_passthrough( + mut sock: TcpStream, + head: &[u8], + leftover: &[u8], + rewrite_ctx: &RewriteCtx, +) -> std::io::Result<()> { + let (method, target, version, headers) = match parse_request_head(head) { + Some(v) => v, + None => return Ok(()), + }; + + let (host, port, path) = match resolve_plain_http_target(&target, &headers) { + Some(v) => v, + None => { + tracing::debug!("plain-http passthrough: cannot parse target {}", target); + return Ok(()); + } + }; + + tracing::info!( + "dispatch http {}:{} -> raw-tcp ({}) (google_only: no relay)", + host, + port, + rewrite_ctx.upstream_socks5.as_deref().unwrap_or("direct"), + ); + + // Rewrite request line to origin form and drop hop-by-hop headers. + let mut rewritten = Vec::with_capacity(head.len()); + rewritten.extend_from_slice(method.as_bytes()); + rewritten.push(b' '); + rewritten.extend_from_slice(path.as_bytes()); + rewritten.push(b' '); + rewritten.extend_from_slice(version.as_bytes()); + rewritten.extend_from_slice(b"\r\n"); + for (k, v) in &headers { + let kl = k.to_ascii_lowercase(); + if kl == "proxy-connection" || kl == "connection" || kl == "keep-alive" { + continue; + } + rewritten.extend_from_slice(k.as_bytes()); + rewritten.extend_from_slice(b": "); + rewritten.extend_from_slice(v.as_bytes()); + rewritten.extend_from_slice(b"\r\n"); + } + rewritten.extend_from_slice(b"Connection: close\r\n\r\n"); + + let target_host = host.trim_start_matches('[').trim_end_matches(']'); + let connect_timeout = if looks_like_ip(target_host) { + std::time::Duration::from_secs(4) + } else { + std::time::Duration::from_secs(10) + }; + let upstream = if let Some(proxy) = rewrite_ctx.upstream_socks5.as_deref() { + match socks5_connect_via(proxy, target_host, port).await { + Ok(s) => s, + Err(e) => { + tracing::warn!( + "upstream-socks5 {} -> {}:{} failed: {} (falling back to direct)", + proxy, + host, + port, + e + ); + match tokio::time::timeout( + connect_timeout, + TcpStream::connect((target_host, port)), + ) + .await + { + Ok(Ok(s)) => s, + _ => return Ok(()), + } + } + } + } else { + match tokio::time::timeout(connect_timeout, TcpStream::connect((target_host, port))).await { + Ok(Ok(s)) => s, + Ok(Err(e)) => { + tracing::debug!("plain-http connect {}:{} failed: {}", host, port, e); + return Ok(()); + } + Err(_) => { + tracing::debug!("plain-http connect {}:{} timeout", host, port); + return Ok(()); + } + } + }; + let _ = upstream.set_nodelay(true); + + let (mut ar, mut aw) = sock.split(); + let (mut br, mut bw) = upstream.into_split(); + bw.write_all(&rewritten).await?; + if !leftover.is_empty() { + bw.write_all(leftover).await?; + } + let t1 = tokio::io::copy(&mut ar, &mut bw); + let t2 = tokio::io::copy(&mut br, &mut aw); + tokio::select! { + _ = t1 => {} + _ = t2 => {} + } + Ok(()) +} + +/// Parse the target of a plain-HTTP proxy request line into +/// `(host, port, origin-form-path)`. 
Browsers send absolute form
+/// (`http://host[:port]/path`); we also accept the origin-form
+/// fallback (`/path` with a `Host:` header) for transparent-proxy
+/// clients. `https://` is accepted defensively, though browsers route
+/// HTTPS through CONNECT and shouldn't hit this path.
+fn resolve_plain_http_target(
+    target: &str,
+    headers: &[(String, String)],
+) -> Option<(String, u16, String)> {
+    let (rest, default_port) = if let Some(r) = target.strip_prefix("http://") {
+        (r, 80u16)
+    } else if let Some(r) = target.strip_prefix("https://") {
+        (r, 443u16)
+    } else if target.starts_with('/') {
+        let host_header = headers
+            .iter()
+            .find(|(k, _)| k.eq_ignore_ascii_case("host"))
+            .map(|(_, v)| v.as_str())?;
+        let (host, port) = split_authority(host_header, 80);
+        return Some((host, port, target.to_string()));
+    } else {
+        return None;
+    };
+
+    let (authority, path) = match rest.find('/') {
+        Some(i) => (&rest[..i], &rest[i..]),
+        None => (rest, "/"),
+    };
+    if authority.is_empty() {
+        return None;
+    }
+    let (host, port) = split_authority(authority, default_port);
+    Some((host, port, path.to_string()))
+}
+
+/// Split an `authority` (`host[:port]`, with optional IPv6 brackets)
+/// into a `(host, port)` pair, defaulting the port when absent.
+fn split_authority(authority: &str, default_port: u16) -> (String, u16) {
+    // Bare IPv6 (multiple colons, no brackets) — `rsplit_once(':')`
+    // would otherwise mangle `::1` into `(":", 1)`. Take the whole
+    // string as the host and use the default port.
+    let colons = authority.bytes().filter(|&b| b == b':').count();
+    if colons > 1 && !authority.starts_with('[') {
+        return (authority.to_string(), default_port);
+    }
+    if let Some((h, p)) = authority.rsplit_once(':') {
+        if let Ok(port) = p.parse::<u16>() {
+            return (h.to_string(), port);
+        }
+    }
+    (authority.to_string(), default_port)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -2215,6 +2692,63 @@ mod tests {
             .collect()
     }
 
+    #[test]
+    fn resolve_plain_http_target_parses_absolute_form() {
+        let h = headers(&[]);
+        let (host, port, path) =
+            resolve_plain_http_target("http://example.com/", &h).unwrap();
+        assert_eq!(host, "example.com");
+        assert_eq!(port, 80);
+        assert_eq!(path, "/");
+
+        let (host, port, path) =
+            resolve_plain_http_target("http://example.com:8080/foo?x=1", &h).unwrap();
+        assert_eq!(host, "example.com");
+        assert_eq!(port, 8080);
+        assert_eq!(path, "/foo?x=1");
+
+        let (host, port, path) =
+            resolve_plain_http_target("http://example.com", &h).unwrap();
+        assert_eq!(host, "example.com");
+        assert_eq!(port, 80);
+        assert_eq!(path, "/");
+    }
+
+    #[test]
+    fn resolve_plain_http_target_falls_back_to_host_header() {
+        let h = headers(&[("Host", "example.com:8080")]);
+        let (host, port, path) = resolve_plain_http_target("/foo", &h).unwrap();
+        assert_eq!(host, "example.com");
+        assert_eq!(port, 8080);
+        assert_eq!(path, "/foo");
+    }
+
+    #[test]
+    fn resolve_plain_http_target_rejects_bare_authority() {
+        // No scheme, doesn't start with `/` — not something we can route.
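A compact end-to-end of the absolute-form handling that the asserts just below exercise: parse out the authority, default the port, and emit an origin-form request line. This composite is illustrative (`to_origin_form` is my name, and it deliberately omits the IPv6 and Host-header branches shown above); it assumes well-formed `http://` input.

```rust
// Simplified composite of `resolve_plain_http_target` plus the
// request-line rewrite: absolute form in, origin form out.
fn to_origin_form(method: &str, target: &str) -> Option<(String, u16, String)> {
    let rest = target.strip_prefix("http://")?;
    let (authority, path) = match rest.find('/') {
        Some(i) => (&rest[..i], &rest[i..]),
        None => (rest, "/"),
    };
    // Same default-port / rsplit logic as `split_authority`, minus IPv6.
    let (host, port) = match authority.rsplit_once(':') {
        Some((h, p)) => (h, p.parse::<u16>().ok()?),
        None => (authority, 80),
    };
    Some((host.to_string(), port, format!("{} {} HTTP/1.1", method, path)))
}

fn main() {
    let (host, port, line) = to_origin_form("GET", "http://example.com:8080/foo?x=1").unwrap();
    assert_eq!((host.as_str(), port), ("example.com", 8080));
    assert_eq!(line, "GET /foo?x=1 HTTP/1.1"); // origin form: no scheme, no authority
}
```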
+ assert!(resolve_plain_http_target("example.com", &headers(&[])).is_none()); + assert!(resolve_plain_http_target("http://", &headers(&[])).is_none()); + } + + #[test] + fn split_authority_handles_ports_and_ipv6() { + assert_eq!( + split_authority("example.com", 80), + ("example.com".to_string(), 80) + ); + assert_eq!( + split_authority("example.com:8080", 80), + ("example.com".to_string(), 8080) + ); + assert_eq!( + split_authority("[::1]:8080", 80), + ("[::1]".to_string(), 8080) + ); + // Bare IPv6 without brackets — keep the whole string as the host + // and use the default port instead of mis-splitting on a colon. + assert_eq!(split_authority("::1", 80), ("::1".to_string(), 80)); + } + #[test] fn socks5_udp_domain_packet_round_trips() { let mut raw = vec![0, 0, 0, 0x03, 11]; @@ -2366,36 +2900,77 @@ mod tests { #[test] fn youtube_via_relay_routes_youtube_through_relay_path() { - // Issue #102. When youtube_via_relay=true, YouTube suffixes - // must NOT match the SNI-rewrite path, so traffic falls - // through to Apps Script relay. Other Google suffixes are - // unaffected. + // Issue #102 + #275. When youtube_via_relay=true: + // - YouTube API + HTML hosts (where Restricted Mode lives) + // opt out of SNI rewrite so they go through the relay. + // - YouTube image / video / channel-asset CDNs STAY on SNI + // rewrite — Restricted Mode isn't enforced on those, and + // routing video chunks through Apps Script burns quota + // and risks the 6-min execution cap. Pre-#275 ytimg.com + // was incorrectly carved out alongside the API surfaces. + // - Non-YouTube Google suffixes are unaffected by the flag. let hosts = std::collections::HashMap::new(); - // Default behaviour: everything in the pool rewrites. + // Default behaviour (flag off): everything in the SNI pool + // rewrites including all YouTube assets. + assert!(should_use_sni_rewrite(&hosts, "www.youtube.com", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "youtu.be", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, false)); assert!(should_use_sni_rewrite( &hosts, - "www.youtube.com", + "youtubei.googleapis.com", 443, false )); - assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "youtu.be", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, false)); - // With the toggle on: YouTube opts out, Google stays. + // googlevideo.com is INTENTIONALLY NOT in SNI_REWRITE_SUFFIXES + // — see the long note at the top of the SNI list. v1.7.4 tried + // adding it; reverted in v1.7.6 after user reports of total + // YouTube breakage. If the project ever ships an EVA-edge-IP + // config knob, this assertion can flip. Until then, video + // chunks correctly fall through to the Apps Script relay path + // and this assertion guards against a regression. assert!(!should_use_sni_rewrite( &hosts, - "www.youtube.com", + "rr1---sn-abc.googlevideo.com", 443, - true + false )); - assert!(!should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, true)); + + // Flag on: only the API + HTML hosts opt out. 
+        assert!(!should_use_sni_rewrite(&hosts, "www.youtube.com", 443, true));
+        assert!(!should_use_sni_rewrite(&hosts, "youtu.be", 443, true));
+        assert!(!should_use_sni_rewrite(
+            &hosts,
+            "www.youtube-nocookie.com",
+            443,
+            true
+        ));
+        assert!(!should_use_sni_rewrite(
+            &hosts,
+            "youtubei.googleapis.com",
+            443,
+            true
+        ));
+
+        // Flag on: image / channel-asset CDNs STAY on SNI rewrite. Pre-#275
+        // ytimg.com was incorrectly carved out alongside the API surfaces.
+        // googlevideo.com still goes through the relay path (not in the
+        // SNI list at all — see note above the SNI_REWRITE_SUFFIXES
+        // entries) so the same flag-on assertion isn't applicable to it.
+        assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, true));
+        assert!(should_use_sni_rewrite(&hosts, "yt3.ggpht.com", 443, true));
+
+        // Flag on: non-YouTube Google suffixes are unaffected. Note
+        // youtubei.googleapis.com (above) is the *carve-out* — the
+        // broader googleapis.com suffix is NOT carved out, so e.g.
+        // Drive / Calendar / etc. continue to SNI-rewrite.
         assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, true));
+        assert!(should_use_sni_rewrite(&hosts, "fonts.gstatic.com", 443, true));
         assert!(should_use_sni_rewrite(
             &hosts,
-            "fonts.gstatic.com",
+            "drive.googleapis.com",
             443,
             true
         ));
@@ -2459,4 +3034,64 @@ mod tests {
         assert!(matches_passthrough("example.com", &list));
         assert!(matches_passthrough("example.com.", &list));
     }
+
+    #[test]
+    fn doh_default_list_exact_matches() {
+        let extra: Vec<String> = vec![];
+        assert!(matches_doh_host("chrome.cloudflare-dns.com", &extra));
+        assert!(matches_doh_host("dns.google", &extra));
+        assert!(matches_doh_host("dns.quad9.net", &extra));
+        assert!(matches_doh_host("doh.opendns.com", &extra));
+    }
+
+    #[test]
+    fn doh_default_list_case_insensitive_and_trailing_dot() {
+        let extra: Vec<String> = vec![];
+        assert!(matches_doh_host("DNS.GOOGLE", &extra));
+        assert!(matches_doh_host("dns.google.", &extra));
+    }
+
+    #[test]
+    fn doh_default_list_suffix_match_for_tenant_subdomains() {
+        // `cloudflare-dns.com` is in the default list — Workers-hosted
+        // tenant DoH endpoints sit under it and should match too.
+        let extra: Vec<String> = vec![];
+        assert!(matches_doh_host("tenant.cloudflare-dns.com", &extra));
+        // But a substring match must NOT pass: `xcloudflare-dns.com` is
+        // a different domain.
+        assert!(!matches_doh_host("xcloudflare-dns.com", &extra));
+    }
+
+    #[test]
+    fn doh_default_list_unrelated_hosts_do_not_match() {
+        let extra: Vec<String> = vec![];
+        assert!(!matches_doh_host("example.com", &extra));
+        assert!(!matches_doh_host("googlevideo.com", &extra));
+        assert!(!matches_doh_host("", &extra));
+    }
+
+    #[test]
+    fn doh_extra_list_extends_default() {
+        let extra = vec![".internal-doh.example".to_string(), "doh.acme.test".to_string()];
+        // Defaults still match.
+        assert!(matches_doh_host("dns.google", &extra));
+        // User additions match.
+        assert!(matches_doh_host("doh.acme.test", &extra));
+        assert!(matches_doh_host("a.b.internal-doh.example", &extra));
+        // Unrelated still doesn't match.
+        assert!(!matches_doh_host("example.com", &extra));
+    }
+
+    #[test]
+    fn doh_extra_entries_match_subdomains_without_leading_dot() {
+        // Asymmetry footgun guard: user adds `doh.acme.test` and expects
+        // `tenant.doh.acme.test` to match too — same as `dns.google`
+        // matching `tenant.dns.google` from the default list. Unlike
+        // `passthrough_hosts`, DoH extras don't require a leading dot.
+ let extra = vec!["doh.acme.test".to_string()]; + assert!(matches_doh_host("doh.acme.test", &extra)); + assert!(matches_doh_host("tenant.doh.acme.test", &extra)); + // But substring overlap must still be rejected. + assert!(!matches_doh_host("xdoh.acme.test", &extra)); + } } diff --git a/src/tunnel_client.rs b/src/tunnel_client.rs index 72444e60..a1e8a69c 100644 --- a/src/tunnel_client.rs +++ b/src/tunnel_client.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; // reason; reuse it here. `AtomicBool` works fine in std on every target. use portable_atomic::AtomicU64; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use base64::engine::general_purpose::STANDARD as B64; @@ -23,7 +23,7 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot, Semaphore}; -use crate::domain_fronter::{BatchOp, DomainFronter, TunnelResponse}; +use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse}; /// Apps Script allows 30 concurrent executions per account / deployment. const CONCURRENCY_PER_DEPLOYMENT: usize = 30; @@ -55,6 +55,12 @@ const REPLY_TIMEOUT: Duration = Duration::from_secs(35); /// connect saves one Apps Script round-trip per new flow. const CLIENT_FIRST_DATA_WAIT: Duration = Duration::from_millis(50); +/// Adaptive coalesce defaults: after each new op arrives, wait another +/// step for more ops. Resets on every arrival, up to max from the first +/// op. Overridable via config `coalesce_step_ms` / `coalesce_max_ms`. +const DEFAULT_COALESCE_STEP_MS: u64 = 40; +const DEFAULT_COALESCE_MAX_MS: u64 = 1000; + /// Structured error code the tunnel-node returns when it doesn't know the /// op (version mismatch). Must match `tunnel-node/src/main.rs`. const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; @@ -68,6 +74,28 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; /// floor, so network jitter on either side won't false-trigger. const LEGACY_DETECT_THRESHOLD: Duration = Duration::from_millis(1500); +/// How long a deployment stays in "legacy / no long-poll" mode after the +/// last detection. Must be much longer than `LEGACY_DETECT_THRESHOLD` so a +/// freshly-marked deployment doesn't immediately self-recover, but short +/// enough that a redeployed / recovered tunnel-node gets re-probed without +/// requiring a process restart. 60 s lets one stuck deployment widen its +/// own poll cadence without poisoning the others, and self-resets so an +/// upgraded tunnel-node returns to the long-poll fast path on its own. +const LEGACY_RECOVER_AFTER: Duration = Duration::from_secs(60); + +/// How long to remember a `Network is unreachable` / `No route to host` +/// failure for a given `(host, port)`. While cached, the proxy short-circuits +/// repeat CONNECTs with an immediate "host unreachable" reply instead of +/// burning a 1.5–2s tunnel batch round-trip on a target that just failed. +/// Real motivator: IPv6-only probe hostnames (e.g. `ds6.probe.*`) on devices +/// without IPv6 — the OS retries the probe every ~1.5s for 10s+, generating +/// 5–10 wasted tunnel sessions per probe. +const UNREACHABLE_CACHE_TTL: Duration = Duration::from_secs(30); + +/// Hard cap on negative-cache size. Browsing pulls in dozens of distinct +/// hosts; we don't want a runaway map. Pruned opportunistically on insert. 
+const UNREACHABLE_CACHE_MAX: usize = 256;
+
 /// Ports where the *server* speaks first (SMTP banner, SSH identification,
 /// POP3/IMAP greeting, FTP banner). On these, waiting for client bytes
 /// gains nothing and just adds handshake latency — skip the pre-read.
@@ -77,10 +105,44 @@ fn is_server_speaks_first(port: u16) -> bool {
     matches!(port, 21 | 22 | 25 | 80 | 110 | 143 | 587)
 }
 
+/// Recognize the tunnel-node's connect-error strings that mean
+/// "this destination is fundamentally unreachable from the tunnel-node's
+/// network right now" — distinct from refused/reset/timeout, which can be
+/// transient. These come through as the inner `e` of a `TunnelResponse`
+/// after the tunnel-node's std::io::Error is stringified, so we match on
+/// substrings rather than `ErrorKind`. Linux: errno 101 (ENETUNREACH),
+/// errno 113 (EHOSTUNREACH). The format varies a bit across libc/Tokio
+/// versions, so cover both the human text and the os-error tag.
+fn is_unreachable_error_str(s: &str) -> bool {
+    let lc = s.to_ascii_lowercase();
+    lc.contains("network is unreachable")
+        || lc.contains("no route to host")
+        || lc.contains("os error 101")
+        || lc.contains("os error 113")
+}
+
+/// Canonicalize a host string for use as a negative-cache key. DNS names
+/// are case-insensitive and may carry a trailing root-label dot, so
+/// `Example.COM:443`, `example.com:443`, and `example.com.:443` are all the
+/// same destination. IPv4 / IPv6 literals are unaffected — IPv4 has no
+/// letters, and `Ipv6Addr::to_string()` already emits lowercase.
+fn normalize_cache_host(host: &str) -> String {
+    let trimmed = host.strip_suffix('.').unwrap_or(host);
+    trimmed.to_ascii_lowercase()
+}
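Before the multiplexer section, a minimal standalone model of the cache these two helpers feed may help: the key is `(normalized host, port)`, the value is an expiry `Instant`, and reads prune lazily. Names and the `TTL` constant here are illustrative; the real logic lives in `TunnelMux::is_unreachable` below.

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

const TTL: Duration = Duration::from_secs(30);

type NegCache = HashMap<(String, u16), Instant>;

fn normalize(host: &str) -> String {
    host.strip_suffix('.').unwrap_or(host).to_ascii_lowercase()
}

// Lookup with lazy expiry: expired entries are removed on the read path.
fn is_cached(cache: &mut NegCache, host: &str, port: u16) -> bool {
    let key = (normalize(host), port);
    match cache.get(&key).copied() {
        Some(expiry) if expiry > Instant::now() => true,
        Some(_) => {
            cache.remove(&key);
            false
        }
        None => false,
    }
}

fn main() {
    let mut cache = NegCache::new();
    cache.insert((normalize("ds6.probe.example."), 443), Instant::now() + TTL);
    assert!(is_cached(&mut cache, "DS6.PROBE.EXAMPLE", 443)); // case + trailing dot folded
    assert!(!is_cached(&mut cache, "ds6.probe.example", 80)); // the port is part of the key
}
```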
 
 // ---------------------------------------------------------------------------
 // Multiplexer
 // ---------------------------------------------------------------------------
 
+/// Reply payload for ops that go through `fire_batch`. The `String` is the
+/// `script_id` of the deployment that processed the batch — needed by
+/// `tunnel_loop`'s legacy-detection and per-deployment skip-when-idle
+/// decisions, which can't reach `fire_batch`'s local `script_id` any
+/// other way. Plain `Connect` doesn't go through `fire_batch` and keeps
+/// the simpler reply type.
+type BatchedReply = oneshot::Sender<Result<(TunnelResponse, String), String>>;
+
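The payload type above is reconstructed from its uses in `fire_batch` below (`reply.send(Ok((resp.clone(), script_id.clone())))` on success, `Err(String)` otherwise). A toy end-to-end of the plumbing, with `TunnelResponse` stubbed to a single field and assuming tokio with the `rt` and `macros` features:

```rust
use tokio::sync::oneshot;

// Stub: the real struct lives in domain_fronter and has more fields.
#[derive(Debug)]
struct TunnelResponse {
    d: Option<String>,
}

// The batch worker sends back the response AND the deployment id that
// served it, so the session loop can attribute legacy detection.
type BatchedReply = oneshot::Sender<Result<(TunnelResponse, String), String>>;

#[tokio::main]
async fn main() {
    let (tx, rx): (BatchedReply, _) = oneshot::channel();
    tx.send(Ok((TunnelResponse { d: None }, "script-A".into())))
        .unwrap();
    let (resp, script_id) = rx.await.unwrap().unwrap();
    assert_eq!(script_id, "script-A");
    assert!(resp.d.is_none());
}
```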
 enum MuxMsg {
     Connect {
         host: String,
@@ -93,23 +155,23 @@ enum MuxMsg {
         // Arc so the caller can hand the buffer to the mux AND keep a ref
         // for the fallback path without an extra 64 KB copy per session.
         data: Arc<Vec<u8>>,
-        reply: oneshot::Sender<Result<TunnelResponse, String>>,
+        reply: BatchedReply,
     },
     Data {
         sid: String,
         data: Vec<u8>,
-        reply: oneshot::Sender<Result<TunnelResponse, String>>,
+        reply: BatchedReply,
     },
     UdpOpen {
         host: String,
         port: u16,
         data: Vec<u8>,
-        reply: oneshot::Sender<Result<TunnelResponse, String>>,
+        reply: BatchedReply,
     },
     UdpData {
         sid: String,
         data: Vec<u8>,
-        reply: oneshot::Sender<Result<TunnelResponse, String>>,
+        reply: BatchedReply,
     },
     Close {
         sid: String,
@@ -122,16 +184,48 @@ pub struct TunnelMux {
     /// `connect_data` as unsupported. Subsequent sessions skip the
     /// optimistic path entirely and go straight to plain connect + data.
     connect_data_unsupported: Arc<AtomicBool>,
-    /// Set to `true` after we observe an empty poll round-trip that
-    /// returned in less than `LEGACY_DETECT_THRESHOLD` with no data.
-    /// On a long-poll-capable tunnel-node, an empty poll either returns
-    /// quickly *with data* (push arrived) or holds open until the
-    /// server's `LONGPOLL_DEADLINE`. A fast empty reply means the server
-    /// is doing the legacy fixed-sleep drain — in that mode, hammering
-    /// idle sessions at the new 500 ms cadence wastes Apps Script quota
-    /// for no benefit, so the loop reverts to the pre-long-poll
-    /// "skip empty polls when idle" behavior.
-    server_no_longpoll: Arc<AtomicBool>,
+    /// Per-deployment legacy state: `script_id` → time it was last
+    /// observed serving an empty poll faster than `LEGACY_DETECT_THRESHOLD`.
+    /// Absence means "long-poll capable, or untested." Entries expire after
+    /// `LEGACY_RECOVER_AFTER` so a redeployed / recovered tunnel-node
+    /// rejoins the long-poll fast path without requiring a process restart.
+    ///
+    /// Note: the per-deployment marks here do *not* drive a per-deployment
+    /// poll cadence — the `tunnel_loop` cadence (read-timeout backoff and
+    /// skip-empty-when-idle) is gated on the aggregate `all_legacy`,
+    /// because the next op's deployment is chosen later by
+    /// `next_script_id()` round-robin and the loop can't pre-select. What
+    /// the per-deployment design *does* fix vs the old single AtomicBool:
+    ///   * one slow / legacy deployment can no longer flip the aggregate
+    ///     true on its own — every deployment has to be marked first;
+    ///   * deployments recover individually on the TTL, so an upgraded
+    ///     tunnel-node lifts the aggregate without needing the others to
+    ///     also recover or the process to restart;
+    ///   * the warn log fires once per (deployment, recovery cycle), so
+    ///     re-detection after recovery is a real signal in the logs.
+    /// The cost: legacy deployments still receive fast empty polls in
+    /// mixed mode (round-robin doesn't know to avoid them). Worth it to
+    /// keep pushed bytes flowing through the long-poll-capable peers.
+    legacy_deployments: Mutex<HashMap<String, Instant>>,
+    /// Lock-free hot-path snapshot of "every known deployment is currently
+    /// in legacy mode." Recomputed under `legacy_deployments`'s mutex on
+    /// every mark/expire and read with a relaxed load from `tunnel_loop`.
+    /// True only when this process has fast-empty observations for *all*
+    /// `num_scripts` deployments simultaneously — that's when the per-
+    /// session 30 s read-timeout backoff (the only setting where there is
+    /// no per-deployment alternative) is still appropriate. Invariant: the
+    /// atomic is always written *after* the map insert, under the same
+    /// lock, so any reader that sees `true` was preceded by a complete
+    /// map update.
+    all_legacy: Arc<AtomicBool>,
+    /// Count of *unique* configured deployment IDs at start time.
+    /// Snapshotted from `fronter.script_id_list()` deduped, since the
+    /// aggregate gate compares this against `legacy_deployments.len()`
+    /// (a HashMap, so unique-keyed) — using the raw configured count
+    /// would make the gate unreachable whenever a user lists the same
+    /// script_id twice. Blacklisted-but-configured deployments still
+    /// count here; see `all_servers_legacy` for why.
+    num_scripts: usize,
     /// Pre-read observability. Lets an operator see whether the 50 ms
     /// wait-for-first-bytes is pulling its weight:
     ///   * `preread_win` — client sent bytes in time, bundled with connect
@@ -149,28 +243,57 @@ pub struct TunnelMux {
     /// Separate monotonic counter used only to trigger the summary log
     /// (avoids a race where two threads both see `total % 100 == 0`).
     preread_total_events: AtomicU64,
+    /// Short-lived negative cache for targets the tunnel-node reported as
+    /// unreachable (`Network is unreachable` / `No route to host`). Keyed by
+    /// `(host, port)`, value is the expiry instant. Plain Mutex is
+    /// fine: it's touched once per CONNECT (cheap) and once per failure.
+    unreachable_cache: Mutex<HashMap<(String, u16), Instant>>,
 }
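The `legacy_deployments` / `all_legacy` pairing above is a classic expiring-marks-plus-atomic-snapshot shape. A minimal standalone sketch (the `LegacyGate` struct and names are mine, not the crate's) showing the gate flipping only once every deployment is marked, with the snapshot recomputed under the same lock that guards the map:

```rust
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::time::{Duration, Instant};

const RECOVER_AFTER: Duration = Duration::from_secs(60);

struct LegacyGate {
    marks: Mutex<HashMap<String, Instant>>,
    all: AtomicBool,
    total: usize, // unique configured deployments
}

impl LegacyGate {
    fn mark(&self, id: &str) {
        let mut m = self.marks.lock().unwrap();
        let now = Instant::now();
        // Inline expiry sweep before recomputing the aggregate.
        m.retain(|_, t| now.duration_since(*t) < RECOVER_AFTER);
        m.insert(id.to_string(), now);
        // Written after the insert, under the lock: a reader seeing
        // `true` is guaranteed a fully-marked map existed.
        self.all.store(m.len() == self.total, Ordering::Relaxed);
    }
}

fn main() {
    let gate = LegacyGate {
        marks: Mutex::new(HashMap::new()),
        all: AtomicBool::new(false),
        total: 2,
    };
    gate.mark("script-A");
    assert!(!gate.all.load(Ordering::Relaxed)); // one of two marked
    gate.mark("script-B");
    assert!(gate.all.load(Ordering::Relaxed)); // both marked: gate flips
}
```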
 
 impl TunnelMux {
-    pub fn start(fronter: Arc<DomainFronter>) -> Arc<Self> {
-        let n = fronter.num_scripts();
+    pub fn start(fronter: Arc<DomainFronter>, coalesce_step_ms: u64, coalesce_max_ms: u64) -> Arc<Self> {
+        // Dedupe before snapshotting: the aggregate `all_legacy` gate
+        // compares `legacy_deployments.len()` (a HashMap, so unique
+        // keys) against this count, so using the raw `num_scripts()`
+        // would make the gate unreachable whenever a user lists the
+        // same script_id twice in config.
+        let unique: std::collections::HashSet<&str> = fronter
+            .script_id_list()
+            .iter()
+            .map(String::as_str)
+            .collect();
+        let unique_n = unique.len();
+        let raw_n = fronter.num_scripts();
+        if unique_n != raw_n {
+            tracing::warn!(
+                "tunnel mux: {} deployments configured but only {} unique script_id(s) — duplicate entries ignored for legacy detection",
+                raw_n,
+                unique_n,
+            );
+        }
         tracing::info!(
             "tunnel mux: {} deployment(s), {} concurrent per deployment",
-            n,
+            unique_n,
             CONCURRENCY_PER_DEPLOYMENT
         );
+        let step = if coalesce_step_ms > 0 { coalesce_step_ms } else { DEFAULT_COALESCE_STEP_MS };
+        let max = if coalesce_max_ms > 0 { coalesce_max_ms } else { DEFAULT_COALESCE_MAX_MS };
+        tracing::info!("batch coalesce: step={}ms max={}ms", step, max);
         let (tx, rx) = mpsc::channel(512);
-        tokio::spawn(mux_loop(rx, fronter));
+        tokio::spawn(mux_loop(rx, fronter, step, max));
         Arc::new(Self {
             tx,
             connect_data_unsupported: Arc::new(AtomicBool::new(false)),
-            server_no_longpoll: Arc::new(AtomicBool::new(false)),
+            legacy_deployments: Mutex::new(HashMap::new()),
+            all_legacy: Arc::new(AtomicBool::new(false)),
+            num_scripts: unique_n,
             preread_win: AtomicU64::new(0),
             preread_loss: AtomicU64::new(0),
             preread_skip_port: AtomicU64::new(0),
             preread_skip_unsupported: AtomicU64::new(0),
             preread_win_total_us: AtomicU64::new(0),
             preread_total_events: AtomicU64::new(0),
+            unreachable_cache: Mutex::new(HashMap::new()),
         })
     }
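The dedupe step above matters because the aggregate gate compares a `HashMap` length (unique keys) to this count. A tiny sketch of just that snapshot (`unique_script_count` is an illustrative helper):

```rust
use std::collections::HashSet;

// Duplicates in config must not inflate the count the gate compares
// against, or `all_legacy` could never become true.
fn unique_script_count(ids: &[String]) -> usize {
    ids.iter().map(String::as_str).collect::<HashSet<&str>>().len()
}

fn main() {
    let ids = vec!["A".to_string(), "B".to_string(), "A".to_string()];
    assert_eq!(unique_script_count(&ids), 2); // raw count 3, unique 2
}
```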
@@ -193,7 +316,8 @@ impl TunnelMux {
             })
             .await;
         match reply_rx.await {
-            Ok(r) => r,
+            Ok(Ok((resp, _script_id))) => Ok(resp),
+            Ok(Err(e)) => Err(e),
             Err(_) => Err("mux channel closed".into()),
         }
     }
@@ -207,7 +331,8 @@ impl TunnelMux {
             })
             .await;
         match reply_rx.await {
-            Ok(r) => r,
+            Ok(Ok((resp, _script_id))) => Ok(resp),
+            Ok(Err(e)) => Err(e),
             Err(_) => Err("mux channel closed".into()),
         }
     }
@@ -231,19 +356,147 @@ impl TunnelMux {
         }
     }
 
-    fn server_no_longpoll(&self) -> bool {
-        self.server_no_longpoll.load(Ordering::Relaxed)
+    /// True only when *every* known deployment is currently in legacy
+    /// mode. Both per-session decisions in `tunnel_loop` (the 30 s
+    /// read-timeout backoff and the skip-empty-when-idle short-circuit)
+    /// gate on this aggregate — they can't pick a per-deployment answer
+    /// ahead of time because the next op's deployment is chosen by
+    /// `next_script_id()` only when the batch fires. With one
+    /// long-poll-capable peer still around, the loop must keep emitting
+    /// empty polls so round-robin lands some on that peer (where the
+    /// server can hold them open and deliver pushed bytes).
+    ///
+    /// Known limitation: the comparison is against *all configured*
+    /// deployments (`num_scripts`), not currently-selectable ones. A
+    /// fleet where most deployments are blacklisted in `DomainFronter`
+    /// (10 min cooldown) and the only selectable deployment(s) are
+    /// legacy will keep the fast cadence for up to that cooldown, even
+    /// though every reachable peer is legacy. Accepted because
+    /// integrating the blacklist would require a hot-path query on the
+    /// fronter's mutex once per `tunnel_loop` iteration; a heavily-
+    /// blacklisted fleet has bigger problems than quota optimization,
+    /// and the worst-case quota cost is bounded by the cooldown.
+    ///
+    /// Hot path: lock-free relaxed load. If the cached value is `true`,
+    /// double-check under the mutex with a sweep for expired entries —
+    /// otherwise stale legacy marks would keep us in the slow path forever
+    /// after every deployment recovers (the `mark_server_no_longpoll` sweep
+    /// only fires on the next mark, which may never come).
+    fn all_servers_legacy(&self) -> bool {
+        if !self.all_legacy.load(Ordering::Relaxed) {
+            return false;
+        }
+        let now = Instant::now();
+        let mut deps = match self.legacy_deployments.lock() {
+            Ok(g) => g,
+            Err(p) => p.into_inner(),
+        };
+        deps.retain(|_, marked_at| now.duration_since(*marked_at) < LEGACY_RECOVER_AFTER);
+        let still_all = deps.len() == self.num_scripts;
+        if !still_all {
+            self.all_legacy.store(false, Ordering::Relaxed);
+        }
+        still_all
     }
 
-    fn mark_server_no_longpoll(&self) {
-        if !self.server_no_longpoll.swap(true, Ordering::Relaxed) {
+    fn mark_server_no_longpoll(&self, script_id: &str) {
+        let now = Instant::now();
+        let mut deps = match self.legacy_deployments.lock() {
+            Ok(g) => g,
+            Err(p) => p.into_inner(),
+        };
+        // Inline expiry sweep: if any entry has aged past
+        // LEGACY_RECOVER_AFTER, drop it before recomputing `all_legacy`.
+        // Without this, an entry that should have recovered would still
+        // count toward the aggregate.
+        deps.retain(|_, marked_at| now.duration_since(*marked_at) < LEGACY_RECOVER_AFTER);
+        let was_present = deps.contains_key(script_id);
+        deps.insert(script_id.to_string(), now);
+        let all = deps.len() == self.num_scripts;
+        // Atomic written under the lock and *after* the map insert. Any
+        // reader that observes `all_legacy = true` has seen a complete
+        // map state where every deployment is marked.
+        self.all_legacy.store(all, Ordering::Relaxed);
+        drop(deps);
+        // Only log on first-mark-for-this-cycle: after `LEGACY_RECOVER_AFTER`
+        // expiry + re-detection we re-log, which is intentional — that's
+        // a real signal that the deployment regressed back to legacy mode.
+        if !was_present {
+            let short = &script_id[..script_id.len().min(8)];
             tracing::warn!(
-                "tunnel-node returned an empty poll faster than {:?}; assuming legacy (no long-poll) drain — falling back to skip-empty-when-idle to avoid quota waste",
+                "tunnel-node deployment {}... returned an empty poll faster than {:?}; assuming legacy (no long-poll) drain — this deployment will skip empty polls when idle for the next {:?}",
+                short,
                 LEGACY_DETECT_THRESHOLD,
+                LEGACY_RECOVER_AFTER,
             );
         }
     }
 
+    /// Returns true if `(host, port)` has a non-expired unreachable entry.
+    /// The proxy front-end uses this to skip the tunnel and reply
+    /// "host unreachable" immediately on follow-up CONNECTs.
+    pub fn is_unreachable(&self, host: &str, port: u16) -> bool {
+        let now = Instant::now();
+        let mut cache = match self.unreachable_cache.lock() {
+            Ok(g) => g,
+            Err(p) => p.into_inner(),
+        };
+        let key = (normalize_cache_host(host), port);
+        match cache.get(&key) {
+            Some(expiry) if *expiry > now => true,
+            Some(_) => {
+                cache.remove(&key);
+                false
+            }
+            None => false,
+        }
+    }
+
+    /// If `err` looks like a network-unreachable / no-route-to-host error
+    /// from the tunnel-node, remember the target for `UNREACHABLE_CACHE_TTL`.
+    /// No-op for any other error (timeouts, refused, EOF, etc.) — those can
+    /// be transient and we don't want to lock out a host on a flaky moment.
+    fn record_unreachable_if_match(&self, host: &str, port: u16, err: &str) {
+        if !is_unreachable_error_str(err) {
+            return;
+        }
+        let mut cache = match self.unreachable_cache.lock() {
+            Ok(g) => g,
+            Err(p) => p.into_inner(),
+        };
+        // Cap enforcement is two-stage: first drop anything already expired,
+        // then if we're STILL at/above the cap (i.e. an unbounded burst of
+        // unique unreachable hosts within the TTL), evict the entry that
+        // would expire soonest. This bounds the map size at all times — a
+        // pure `retain` on expiry alone would let the map grow unbounded
+        // until the first entry's TTL elapses.
+        if cache.len() >= UNREACHABLE_CACHE_MAX {
+            let now = Instant::now();
+            cache.retain(|_, expiry| *expiry > now);
+            while cache.len() >= UNREACHABLE_CACHE_MAX {
+                let victim = cache
+                    .iter()
+                    .min_by_key(|(_, expiry)| **expiry)
+                    .map(|(k, _)| k.clone());
+                match victim {
+                    Some(k) => {
+                        cache.remove(&k);
+                    }
+                    None => break,
+                }
+            }
+        }
+        let key = (normalize_cache_host(host), port);
+        cache.insert(key, Instant::now() + UNREACHABLE_CACHE_TTL);
+        tracing::debug!(
+            "negative-cached {}:{} for {:?} ({})",
+            host,
+            port,
+            UNREACHABLE_CACHE_TTL,
+            err
+        );
+    }
+
     fn record_preread_win(&self, port: u16, elapsed: Duration) {
         self.preread_win.fetch_add(1, Ordering::Relaxed);
         self.preread_win_total_us
@@ -302,7 +555,9 @@ impl TunnelMux {
     }
 }
 
-async fn mux_loop(mut rx: mpsc::Receiver<MuxMsg>, fronter: Arc<DomainFronter>) {
+async fn mux_loop(mut rx: mpsc::Receiver<MuxMsg>, fronter: Arc<DomainFronter>, coalesce_step_ms: u64, coalesce_max_ms: u64) {
+    let coalesce_step = Duration::from_millis(coalesce_step_ms);
+    let coalesce_max = Duration::from_millis(coalesce_max_ms);
     // One semaphore per deployment ID, each allowing 30 concurrent requests.
     let sems: Arc<HashMap<String, Arc<Semaphore>>> = Arc::new(
         fronter
@@ -319,19 +574,42 @@ async fn mux_loop(mut rx: mpsc::Receiver<MuxMsg>, fronter: Arc<DomainFronter>) {
     loop {
         let mut msgs = Vec::new();
-        match tokio::time::timeout(Duration::from_millis(30), rx.recv()).await {
-            Ok(Some(msg)) => msgs.push(msg),
-            Ok(None) => break,
-            Err(_) => continue,
+        // Block on the first message — no point waking up to find an empty
+        // queue. Once the first op lands, the adaptive coalesce loop waits
+        // in `coalesce_step` increments (resetting on each new arrival, up
+        // to `coalesce_max`) so concurrent ops land in the same batch.
+        match rx.recv().await {
+            Some(msg) => msgs.push(msg),
+            None => break,
         }
-        while let Ok(msg) = rx.try_recv() {
-            msgs.push(msg);
+        let hard_deadline = tokio::time::Instant::now() + coalesce_max;
+        let mut soft_deadline = tokio::time::Instant::now() + coalesce_step;
+        loop {
+            // Drain anything that's already queued without waiting.
+            while let Ok(msg) = rx.try_recv() {
+                msgs.push(msg);
+                // Reset the soft deadline — more ops are arriving.
+                soft_deadline = tokio::time::Instant::now() + coalesce_step;
+            }
+            let now = tokio::time::Instant::now();
+            let wait_until = soft_deadline.min(hard_deadline);
+            if now >= wait_until {
+                break;
+            }
+            match tokio::time::timeout(wait_until - now, rx.recv()).await {
+                Ok(Some(msg)) => {
+                    msgs.push(msg);
+                    // New op arrived — extend the soft deadline.
+                    soft_deadline = tokio::time::Instant::now() + coalesce_step;
+                }
+                Ok(None) => return,
+                Err(_) => break, // soft or hard deadline hit, no more ops
            }
        }
 
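The adaptive coalesce above is worth isolating: block for the first op, then keep extending a soft deadline on each arrival, bounded by a hard deadline measured from the first op. A self-contained sketch assuming tokio (names are mine; the real loop also splits the collected ops by kind afterward):

```rust
use tokio::sync::mpsc;
use tokio::time::{timeout, Duration, Instant};

// Same shape as the mux loop above: the soft deadline resets on every
// arrival, the hard deadline bounds total latency for the first op.
async fn coalesce<T>(rx: &mut mpsc::Receiver<T>, step: Duration, max: Duration) -> Vec<T> {
    let mut batch = Vec::new();
    match rx.recv().await {
        Some(item) => batch.push(item),
        None => return batch, // channel closed
    }
    let hard = Instant::now() + max;
    let mut soft = Instant::now() + step;
    loop {
        // Drain whatever is already queued, extending the soft deadline.
        while let Ok(item) = rx.try_recv() {
            batch.push(item);
            soft = Instant::now() + step;
        }
        let now = Instant::now();
        let wait_until = soft.min(hard);
        if now >= wait_until {
            break;
        }
        match timeout(wait_until - now, rx.recv()).await {
            Ok(Some(item)) => {
                batch.push(item);
                soft = Instant::now() + step;
            }
            Ok(None) | Err(_) => break, // closed, or a deadline hit
        }
    }
    batch
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(16);
    tx.send(1).await.unwrap();
    tx.send(2).await.unwrap();
    // Both ops land in one batch: the second arrives within `step`.
    let batch = coalesce(&mut rx, Duration::from_millis(40), Duration::from_secs(1)).await;
    assert_eq!(batch, vec![1, 2]);
}
```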
         // Split: plain connects go parallel, data-bearing ops get batched.
         let mut data_ops: Vec<BatchOp> = Vec::new();
-        let mut data_replies: Vec<(usize, oneshot::Sender<Result<TunnelResponse, String>>)> =
-            Vec::new();
+        let mut data_replies: Vec<(usize, BatchedReply)> = Vec::new();
         let mut close_sids: Vec<String> = Vec::new();
         let mut batch_payload_bytes: usize = 0;
@@ -527,7 +805,7 @@ async fn fire_batch(
     sems: &Arc<HashMap<String, Arc<Semaphore>>>,
     fronter: &Arc<DomainFronter>,
     data_ops: Vec<BatchOp>,
-    data_replies: Vec<(usize, oneshot::Sender<Result<TunnelResponse, String>>)>,
+    data_replies: Vec<(usize, BatchedReply)>,
 ) {
     let script_id = fronter.next_script_id();
     let sem = sems
@@ -558,23 +836,111 @@ async fn fire_batch(
     match result {
         Ok(Ok(batch_resp)) => {
+            f.record_batch_success(&script_id);
+            // Wire the Full-mode usage counter that #230 / #362 flagged
+            // as stuck-at-zero. Each successful batch is one
+            // `UrlFetchApp.fetch()` call against the deploying Google
+            // account's daily quota — bytes-counted is the inbound JSON
+            // response, which is the closest analogue to the apps_script
+            // path's `record_today(bytes_received)` (we don't have the
+            // exact response byte count post-deserialize, so we use a
+            // proxy: sum of per-session response payload bytes the
+            // batch carried back). Underestimates by JSON envelope
+            // overhead but is in the right order of magnitude.
+            let response_bytes: u64 = batch_resp
+                .r
+                .iter()
+                .map(|r| {
+                    // `d` carries TCP payload (base64 string len ≈
+                    // 4/3 of decoded bytes; close enough); `pkts`
+                    // carries UDP datagrams (each base64); plus any
+                    // error string. Sum gives a stable proxy for
+                    // "how much did this batch move."
+                    let d = r.d.as_ref().map(|s| s.len() as u64).unwrap_or(0);
+                    let pkts = r
+                        .pkts
+                        .as_ref()
+                        .map(|v| v.iter().map(|p| p.len() as u64).sum::<u64>())
+                        .unwrap_or(0);
+                    d + pkts
+                })
+                .sum();
+            f.record_today(response_bytes);
+            let sid_short = &script_id[..script_id.len().min(8)];
             for (idx, reply) in data_replies {
                 if let Some(resp) = batch_resp.r.get(idx) {
-                    let _ = reply.send(Ok(resp.clone()));
+                    let _ = reply.send(Ok((resp.clone(), script_id.clone())));
                 } else {
-                    let _ = reply.send(Err("missing response in batch".into()));
+                    let _ = reply.send(Err(format!(
+                        "missing response in batch from script {}",
+                        sid_short
+                    )));
                 }
             }
         }
         Ok(Err(e)) => {
+            // Read-side timeout from `domain_fronter`: Apps Script didn't
+            // start streaming response bytes within the per-read deadline.
+            // Common cause: deployment's `TUNNEL_SERVER_URL` points at a
+            // dead host, so UrlFetchApp inside Apps Script hangs until its
+            // own internal connect timeout. Strike-counter blacklists the
+            // deployment after a sustained pattern.
+            if matches!(e, FronterError::Timeout) {
+                f.record_timeout_strike(&script_id);
+            }
             let err_msg = format!("{}", e);
-            tracing::warn!("batch failed: {}", err_msg);
+            let sid_short = &script_id[..script_id.len().min(8)];
+            // Detect the body string we ship as the v1.8.0 bad-auth
+            // decoy. v1.8.1 asserted "AUTH_KEY mismatch" outright, but
+            // #404 (w0l4i) found the same body comes back from Apps
+            // Script in 3 other unrelated cases too:
+            //
+            //   1. AUTH_KEY mismatch — our intentional decoy
+            //   2. Apps Script execution timeout — the runtime hit the
+            //      6-min execution cap mid-call, or the per-100s quota
+            //   3. Apps Script internal hiccup — Google-side flake,
+            //      serves a placeholder
+            //   4. ISP-side response truncation — #313 pattern: the
+            //      response was assembled, but the connection ate an
+            //      RST mid-flight
+            //
+            // So we surface all four candidates instead of asserting #1.
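The quota proxy earlier in this hunk is a plain sum over per-session payload strings. A standalone sketch with the response reduced to the two fields the sum touches (the `Resp` struct and field shapes here are assumptions based on the usage above, not the crate's real type):

```rust
struct Resp {
    d: Option<String>,         // TCP payload, base64
    pkts: Option<Vec<String>>, // UDP datagrams, each base64
}

// Sum of base64 payload lengths a batch carried back: a stable proxy
// for inbound response size, short of the exact wire byte count.
fn response_bytes(rs: &[Resp]) -> u64 {
    rs.iter()
        .map(|r| {
            let d = r.d.as_ref().map(|s| s.len() as u64).unwrap_or(0);
            let pkts = r
                .pkts
                .as_ref()
                .map(|v| v.iter().map(|p| p.len() as u64).sum::<u64>())
                .unwrap_or(0);
            d + pkts
        })
        .sum()
}

fn main() {
    let rs = vec![
        Resp { d: Some("AAAA".into()), pkts: None },                      // 4
        Resp { d: None, pkts: Some(vec!["BBBB".into(), "CC".into()]) },   // 4 + 2
    ];
    assert_eq!(response_bytes(&rs), 10);
}
```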
+ // Users can flip DIAGNOSTIC_MODE=true in Code.gs to disambiguate: + // only #1 still returns the decoy in diagnostic mode; the + // others return real JSON or different errors. + if err_msg.contains("The script completed but did not return anything") { + tracing::error!( + "batch failed (script {}): got the v1.8.0 decoy/placeholder body — \ + could be (1) AUTH_KEY mismatch between mhrv-rs config and Code.gs \ + (run a direct curl probe against the deployment to verify), \ + (2) Apps Script execution timeout or per-100s quota tear (try \ + lowering parallel_concurrency in config), (3) Apps Script \ + internal hiccup (transient, retry next batch), or (4) ISP-side \ + response truncation (#313 pattern, try a different google_ip). \ + To distinguish (1) from the rest: set DIAGNOSTIC_MODE=true at \ + the top of Code.gs + redeploy as new version — only AUTH_KEY \ + mismatch returns this body in diagnostic mode.", + sid_short + ); + } else { + tracing::warn!("batch failed (script {}): {}", sid_short, err_msg); + } for (_, reply) in data_replies { let _ = reply.send(Err(err_msg.clone())); } } Err(_) => { - tracing::warn!("batch timed out after {:?} ({} ops)", BATCH_TIMEOUT, n_ops); + // Whole-batch budget (`BATCH_TIMEOUT`, 30 s) elapsed. Even + // stronger signal than a per-read timeout — count it the same + // way so a truly-stuck deployment exits round-robin fast. + f.record_timeout_strike(&script_id); + let sid_short = &script_id[..script_id.len().min(8)]; + tracing::warn!( + "batch timed out after {:?} (script {}, {} ops)", + BATCH_TIMEOUT, + sid_short, + n_ops + ); for (_, reply) in data_replies { let _ = reply.send(Err("batch timed out".into())); } @@ -697,6 +1063,11 @@ async fn connect_plain(host: &str, port: u16, mux: &Arc) -> std::io:: Ok(Ok(resp)) => { if let Some(ref e) = resp.e { tracing::error!("tunnel connect error for {}:{}: {}", host, port, e); + // Only cache here: `resp.e` is the tunnel-node's own connect() + // result against the target. The outer `Ok(Err(_))` arm below + // is a transport-level failure (relay → Apps Script → tunnel- + // node never reached) and tells us nothing about the target. + mux.record_unreachable_if_match(host, port, e); return Err(std::io::Error::new( std::io::ErrorKind::ConnectionRefused, e.clone(), @@ -736,13 +1107,16 @@ async fn connect_with_initial_data( .await; let resp = match reply_rx.await { - Ok(Ok(resp)) => resp, + Ok(Ok((resp, _script_id))) => resp, Ok(Err(e)) => { if is_connect_data_unsupported_error_str(&e) { tracing::debug!("connect_data unsupported for {}:{}: {}", host, port, e); return Ok(ConnectDataOutcome::Unsupported); } tracing::error!("tunnel connect_data error for {}:{}: {}", host, port, e); + // Outer transport failure (relay/Apps Script never reached the + // tunnel-node). Don't poison the destination cache from here — + // see `connect_plain` for the same reasoning. return Err(std::io::Error::new( std::io::ErrorKind::ConnectionRefused, e, @@ -768,6 +1142,8 @@ async fn connect_with_initial_data( if let Some(ref e) = resp.e { tracing::error!("tunnel connect_data error for {}:{}: {}", host, port, e); + // `resp.e` is the tunnel-node's own connect result — cache it. + mux.record_unreachable_if_match(host, port, e); return Err(std::io::Error::new( std::io::ErrorKind::ConnectionRefused, e.clone(), @@ -834,18 +1210,30 @@ async fn tunnel_loop( // drains. 
With long-poll, the server holds empty polls open up // to its `LONGPOLL_DEADLINE` (~5 s currently), so the client // can keep this read timeout short — the wait is on the wire, - // not here. Against a *legacy* tunnel-node (no long-poll, fast + // not here. Against *legacy* tunnel-nodes (no long-poll, fast // empty replies), the same short cadence + always-poll behavior // would generate continuous round-trips on idle sessions and - // burn Apps Script quota. The `server_no_longpoll` flag detects - // the legacy case from reply latency below and reverts to the - // pre-long-poll cadence: long sleep on local read, skip empty - // polls when sustained-idle. - let legacy_mode = mux.server_no_longpoll(); + // burn Apps Script quota. + // + // Both the read timeout and the skip-empty-when-idle decision + // are gated on `all_legacy` — i.e. *every known deployment is + // currently legacy*. Per-deployment "skip when this script is + // legacy" sounds appealing but is unsafe: the next op's + // deployment is chosen by `next_script_id()` only when the + // batch fires, so the loop can't predict where the empty poll + // will land. Suppressing polls based on the *previous* reply's + // script would stall remote→client data on mixed setups — + // round-robin would never reach the long-poll-capable peer for + // this session if every iteration short-circuits before + // sending. Cost of the conservative gate: legacy peers see + // some wasted empty polls when at least one peer is healthy, + // bounded by round-robin fan-out. Worth it to keep pushed + // bytes flowing. + let all_legacy = mux.all_servers_legacy(); let client_data = if let Some(data) = pending_client_data.take() { Some(data) } else { - let read_timeout = match (legacy_mode, consecutive_empty) { + let read_timeout = match (all_legacy, consecutive_empty) { (_, 0) => Duration::from_millis(20), (_, 1) => Duration::from_millis(80), (_, 2) => Duration::from_millis(200), @@ -864,13 +1252,13 @@ async fn tunnel_loop( } }; - // Legacy-server skip: against a non-long-polling tunnel-node, - // an empty poll is wasted work — fast-empty reply, no push - // delivery benefit. Preserve the pre-long-poll behavior of - // going quiet after a few empties. Long-poll-capable servers - // skip this branch and always send the empty op so the server - // can hold it open. - if legacy_mode && client_data.is_none() && consecutive_empty > 3 { + // Skip empty polls only when *every* deployment is legacy. With + // even one long-poll-capable peer, round-robin will land some + // empty polls there where the server holds them open and can + // deliver pushed bytes — that's the whole point of long-poll, + // so we must keep emitting. See the `all_legacy` comment above + // for why a per-deployment gate here would stall mixed setups. + if all_legacy && client_data.is_none() && consecutive_empty > 3 { continue; } @@ -889,8 +1277,8 @@ async fn tunnel_loop( // Bounded-wait on reply: if the batch this op landed in is slow // (dead target on the tunnel-node side), don't block this session // forever — timeout and let it retry on the next tick. 
- let resp = match tokio::time::timeout(REPLY_TIMEOUT, reply_rx).await { - Ok(Ok(Ok(r))) => r, + let (resp, script_id) = match tokio::time::timeout(REPLY_TIMEOUT, reply_rx).await { + Ok(Ok(Ok((r, sid_used)))) => (r, sid_used), Ok(Ok(Err(e))) => { tracing::debug!("tunnel data error: {}", e); break; @@ -903,18 +1291,18 @@ async fn tunnel_loop( } }; - // Legacy-server detection: an empty-in/empty-out round trip - // that finishes well under `LEGACY_DETECT_THRESHOLD` is + // Per-deployment legacy detection: an empty-in/empty-out round + // trip that finishes well under `LEGACY_DETECT_THRESHOLD` is // structurally impossible on a long-poll-capable tunnel-node // (the server holds the response either until data arrives or - // until its long-poll deadline). One observation flips the - // sticky flag for the rest of this process. Skip the check - // once already in legacy mode — the comparison is cheap, but - // calling `mark_server_no_longpoll` repeatedly muddies logs. - if !legacy_mode && was_empty_poll { + // until its long-poll deadline). One observation marks *this + // specific* deployment as legacy for `LEGACY_RECOVER_AFTER`; + // peers stay on the fast path. The aggregate `all_legacy` gate + // only flips once *every* deployment has been so marked. + if was_empty_poll { let reply_was_empty = resp.d.as_deref().map(str::is_empty).unwrap_or(true); if reply_was_empty && send_at.elapsed() < LEGACY_DETECT_THRESHOLD { - mux.mark_server_no_longpoll(); + mux.mark_server_no_longpoll(&script_id); } } @@ -1069,6 +1457,133 @@ mod tests { ))); } + #[test] + fn unreachable_error_str_matches_expected_variants() { + assert!(is_unreachable_error_str( + "connect failed: Network is unreachable (os error 101)" + )); + assert!(is_unreachable_error_str("No route to host")); + assert!(is_unreachable_error_str("os error 113")); + // Case-insensitive. + assert!(is_unreachable_error_str( + "CONNECT FAILED: NETWORK IS UNREACHABLE" + )); + } + + #[test] + fn unreachable_error_str_rejects_unrelated() { + assert!(!is_unreachable_error_str("connection refused")); + assert!(!is_unreachable_error_str("connect timed out")); + assert!(!is_unreachable_error_str("connection reset by peer")); + assert!(!is_unreachable_error_str("")); + } + + #[test] + fn negative_cache_records_and_short_circuits() { + let (mux, _rx) = mux_for_test(); + // Initially nothing is cached. + assert!(!mux.is_unreachable("ds6.probe.example", 443)); + // Record a matching error. + mux.record_unreachable_if_match( + "ds6.probe.example", + 443, + "connect failed: Network is unreachable (os error 101)", + ); + assert!(mux.is_unreachable("ds6.probe.example", 443)); + // A different port for the same host is its own entry. + assert!(!mux.is_unreachable("ds6.probe.example", 80)); + } + + #[test] + fn negative_cache_ignores_non_unreachable_errors() { + let (mux, _rx) = mux_for_test(); + mux.record_unreachable_if_match( + "example.com", + 443, + "connect failed: connection refused", + ); + assert!(!mux.is_unreachable("example.com", 443)); + } + + #[test] + fn negative_cache_normalizes_host_keys() { + let (mux, _rx) = mux_for_test(); + // Cache under one casing/format... + mux.record_unreachable_if_match( + "Example.COM.", + 443, + "Network is unreachable (os error 101)", + ); + // ...and look up under several equivalent forms. + assert!(mux.is_unreachable("example.com", 443)); + assert!(mux.is_unreachable("EXAMPLE.com", 443)); + assert!(mux.is_unreachable("example.com.", 443)); + // Different host should still miss. 
+ assert!(!mux.is_unreachable("other.com", 443)); + } + + /// Outer `Ok(Err(_))` from the mux channel means "the relay never + /// reached the tunnel-node" (HTTP/TLS to Apps Script failed, batch + /// timed out, etc.) — the destination wasn't even attempted. Even if + /// that error string contains "Network is unreachable" (e.g. the + /// client device's WAN was momentarily down), it must NOT poison the + /// destination cache, or every host the user touched during a + /// connectivity blip stays refused for 30s. + #[tokio::test] + async fn negative_cache_skips_outer_relay_errors() { + let (mux, mut rx) = mux_for_test(); + let mux_for_task = mux.clone(); + let task = tokio::spawn(async move { + connect_plain("real.target.example", 443, &mux_for_task).await + }); + + // Receive the Connect msg and reply with an outer Err whose string + // would otherwise match `is_unreachable_error_str`. + let msg = rx.recv().await.expect("connect msg"); + let reply = match msg { + MuxMsg::Connect { reply, .. } => reply, + other => panic!("expected Connect, got {:?}", std::mem::discriminant(&other)), + }; + let _ = reply.send(Err( + "relay failed: Network is unreachable (os error 101)".into(), + )); + + let res = task.await.expect("task"); + assert!(res.is_err(), "connect_plain should surface the error"); + assert!( + !mux.is_unreachable("real.target.example", 443), + "outer relay error must not negative-cache the destination" + ); + } + + #[test] + fn negative_cache_enforces_hard_cap_under_unique_burst() { + let (mux, _rx) = mux_for_test(); + // Insert enough unique still-live entries to exceed the cap. The + // map size must never exceed UNREACHABLE_CACHE_MAX, even though + // every entry is fresh and `retain(expired)` prunes nothing. + let burst = UNREACHABLE_CACHE_MAX + 50; + for i in 0..burst { + let host = format!("h{}.example", i); + mux.record_unreachable_if_match( + &host, + 443, + "connect failed: Network is unreachable (os error 101)", + ); + } + let len = mux + .unreachable_cache + .lock() + .map(|g| g.len()) + .unwrap_or(0); + assert!( + len <= UNREACHABLE_CACHE_MAX, + "cache size {} exceeded cap {}", + len, + UNREACHABLE_CACHE_MAX + ); + } + #[test] fn server_speaks_first_covers_common_protocols() { for p in [21u16, 22, 25, 80, 110, 143, 587] { @@ -1091,17 +1606,28 @@ mod tests { /// than wired to a real DomainFronter. Lets tests assert what messages /// the client would emit without needing network or apps_script. fn mux_for_test() -> (Arc, mpsc::Receiver) { + mux_for_test_with(2) + } + + /// Build a TunnelMux for tests with a specific deployment count. The + /// per-deployment legacy state's aggregate gate (`all_servers_legacy`) + /// requires `legacy_deployments.len() == num_scripts`, so tests that + /// exercise that gate need to control how many "deployments" exist. 
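+    /// A minimal sketch of the gate that count feeds, assuming the
+    /// sweep-on-read behavior the tests below pin down (hypothetical
+    /// body; the real impl lives on `TunnelMux`):
+    ///
+    ///     fn all_servers_legacy(&self) -> bool {
+    ///         let mut deps = self.legacy_deployments.lock().unwrap();
+    ///         deps.retain(|_, marked| marked.elapsed() < LEGACY_RECOVER_AFTER);
+    ///         deps.len() == self.num_scripts
+    ///     }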
+ fn mux_for_test_with(num_scripts: usize) -> (Arc, mpsc::Receiver) { let (tx, rx) = mpsc::channel(16); let mux = Arc::new(TunnelMux { tx, connect_data_unsupported: Arc::new(AtomicBool::new(false)), - server_no_longpoll: Arc::new(AtomicBool::new(false)), + legacy_deployments: Mutex::new(HashMap::new()), + all_legacy: Arc::new(AtomicBool::new(false)), + num_scripts, preread_win: AtomicU64::new(0), preread_loss: AtomicU64::new(0), preread_skip_port: AtomicU64::new(0), preread_skip_unsupported: AtomicU64::new(0), preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), + unreachable_cache: Mutex::new(HashMap::new()), }); (mux, rx) } @@ -1144,14 +1670,17 @@ mod tests { assert_eq!(sid, "sid-under-test"); assert_eq!(&data[..], b"CLIENTHELLO"); // Reply with eof so tunnel_loop unwinds cleanly. - let _ = reply.send(Ok(TunnelResponse { - sid: Some("sid-under-test".into()), - d: None, - pkts: None, - eof: Some(true), - e: None, - code: None, - })); + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-under-test".into()), + d: None, + pkts: None, + eof: Some(true), + e: None, + code: None, + }, + "test-script".to_string(), + ))); } other => panic!( "first mux message was not Data (expected replay); got {:?}", @@ -1171,6 +1700,81 @@ mod tests { .expect("tunnel_loop did not exit after eof"); } + /// Regression for the mixed-mode stall: A is legacy, B is long-poll + /// capable, the session's last reply came from A. A naive per- + /// deployment skip (gated on the *previous* reply's `script_id`) + /// would short-circuit every empty poll on this session — so B + /// never gets a chance to long-poll for us, and remote→client data + /// stalls until either the local client sends bytes or A's TTL + /// expires. The fix gates skip-when-idle on the aggregate + /// `all_servers_legacy()` instead, so the loop keeps emitting empty + /// polls whenever at least one peer can still hold the request open. + /// Replies are paced via `start_paused` time auto-advance — without + /// it the test would take ~2 s of real wall-clock time per session. + #[tokio::test(start_paused = true)] + async fn tunnel_loop_keeps_polling_when_only_some_deployments_legacy() { + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let _client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + // 2 deployments, only A marked legacy → all_servers_legacy = false. + let (mux, mut rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + assert!(!mux.all_servers_legacy()); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop(&mut server_side, "sid-mixed", &mux, None).await } + }); + + // Reply to 6 empty polls, all from A. With the regression + // (per-deployment skip on `last_script_id == A`), the loop would + // stop emitting at iteration 4 — `consecutive_empty > 3` plus + // `last_was_legacy` would short-circuit the send. With the fix, + // the aggregate gate stays false and the loop keeps polling. + // The 60 s timeout below is paused-time, so it only "elapses" + // if rx.recv() truly never resolves (i.e. the loop has stalled). 
+ for i in 0..6u32 { + let msg = tokio::time::timeout(Duration::from_secs(60), rx.recv()) + .await + .unwrap_or_else(|_| panic!( + "loop stopped emitting at iteration {} — regression: per-deployment skip-when-idle stalled session even though long-poll-capable peer was available", + i + )) + .expect("mux channel closed unexpectedly"); + match msg { + MuxMsg::Data { sid, data, reply } => { + assert_eq!(sid, "sid-mixed"); + assert!(data.is_empty(), "expected empty poll, got {} bytes", data.len()); + let last = i == 5; + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-mixed".into()), + d: None, + pkts: None, + eof: if last { Some(true) } else { None }, + e: None, + code: None, + }, + "script-A".to_string(), + ))); + } + _ => panic!( + "iteration {}: expected Data poll, got a different MuxMsg variant", + i + ), + } + } + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle) + .await + .expect("tunnel_loop did not exit after eof"); + } + /// Once `mark_connect_data_unsupported` is called, future sessions /// must see the flag — no per-session repeat of the detect-and-fallback /// cost. If this regresses, every new flow pays an extra round trip @@ -1185,19 +1789,109 @@ mod tests { assert!(mux.connect_data_unsupported()); } - /// `server_no_longpoll` must be sticky too: once we see a legacy - /// fast-empty reply, every subsequent session uses the legacy idle - /// cadence (long read timeout + skip-empty) for the rest of the - /// process. Flipping it back per-session would either thrash the - /// cadence or double the detection cost. + /// Marking deployment A as legacy must NOT make B look legacy. This + /// is the central guarantee of the per-deployment design: with the + /// old global AtomicBool, one slow / legacy deployment dragged every + /// session onto the 30 s legacy cadence even when the other 7 were + /// long-polling fine. #[test] - fn no_longpoll_cache_is_sticky() { - let (mux, _rx) = mux_for_test(); - assert!(!mux.server_no_longpoll()); - mux.mark_server_no_longpoll(); - assert!(mux.server_no_longpoll()); - mux.mark_server_no_longpoll(); // idempotent - assert!(mux.server_no_longpoll()); + fn legacy_state_is_per_deployment() { + let (mux, _rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + + let deps = mux.legacy_deployments.lock().unwrap(); + assert!(deps.contains_key("script-A")); + assert!( + !deps.contains_key("script-B"), + "marking A must not insert an entry for B" + ); + } + + /// `all_servers_legacy` (the per-session 30 s read-timeout gate) flips + /// to true *only* when every known deployment has been marked. With + /// 2 deployments, marking one keeps the gate false; marking both + /// flips it true. + #[test] + fn all_servers_legacy_requires_every_deployment() { + let (mux, _rx) = mux_for_test_with(2); + assert!(!mux.all_servers_legacy()); + + mux.mark_server_no_longpoll("script-A"); + assert!( + !mux.all_servers_legacy(), + "1 of 2 marked: aggregate must stay false" + ); + + mux.mark_server_no_longpoll("script-B"); + assert!( + mux.all_servers_legacy(), + "all deployments marked: aggregate flips true" + ); + + // Idempotent re-mark of an already-legacy deployment doesn't + // disturb the aggregate. + mux.mark_server_no_longpoll("script-A"); + assert!(mux.all_servers_legacy()); + } + + /// After `LEGACY_RECOVER_AFTER`, an entry is treated as expired and + /// the deployment rejoins the long-poll fast path. 
The next mark + /// (against any deployment) sweeps stale entries before recomputing + /// the aggregate gate, so a recovered peer doesn't keep counting + /// toward `all_legacy`. Backdating the mark time avoids a real 60 s + /// sleep in the test — same effect as the wall-clock moving forward. + #[test] + fn legacy_state_recovers_after_ttl() { + let (mux, _rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + + // Backdate A past LEGACY_RECOVER_AFTER, then mark B. B's mark + // must trigger a sweep that drops the stale A entry. + { + let mut deps = mux.legacy_deployments.lock().unwrap(); + let stale = Instant::now() + .checked_sub(LEGACY_RECOVER_AFTER + Duration::from_secs(1)) + .expect("test environment should have a non-trivial monotonic clock"); + deps.insert("script-A".to_string(), stale); + } + mux.mark_server_no_longpoll("script-B"); + + let deps = mux.legacy_deployments.lock().unwrap(); + assert!( + !deps.contains_key("script-A"), + "expired entry must be swept on the next mark — otherwise stale legacy state never clears" + ); + assert!(deps.contains_key("script-B")); + } + + /// If every deployment is legacy and then time passes past + /// `LEGACY_RECOVER_AFTER` *without any new mark*, the aggregate gate + /// must self-correct on the next `all_servers_legacy()` call. + /// Without the in-place sweep on read, stale legacy marks would keep + /// the 30 s read-timeout active forever after every deployment + /// recovers. + #[test] + fn all_servers_legacy_self_corrects_when_entries_expire() { + let (mux, _rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + mux.mark_server_no_longpoll("script-B"); + assert!(mux.all_servers_legacy()); + + // Backdate every entry past TTL. + { + let mut deps = mux.legacy_deployments.lock().unwrap(); + let stale = Instant::now() + .checked_sub(LEGACY_RECOVER_AFTER + Duration::from_secs(1)) + .expect("monotonic clock should be far enough along"); + for (_, t) in deps.iter_mut() { + *t = stale; + } + } + + assert!( + !mux.all_servers_legacy(), + "aggregate must self-correct when all entries expire — otherwise the 30 s read timeout sticks forever" + ); } #[test] diff --git a/tunnel-node/Dockerfile b/tunnel-node/Dockerfile index a31f31e8..801a0ac9 100644 --- a/tunnel-node/Dockerfile +++ b/tunnel-node/Dockerfile @@ -35,9 +35,21 @@ COPY src/ ./src/ # BuildKit cache mounts: cargo's registry/git caches and the target/ # directory persist across builds, dramatically speeding up rebuilds when # only application code changes. -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/usr/local/cargo/git \ - --mount=type=cache,target=/app/target \ +# +# `id=...-$TARGETPLATFORM` is load-bearing on multi-arch builds. Without +# it, BuildKit defaults to a single shared cache across architectures +# and the `linux/amd64` + `linux/arm64` jobs race on the same on-disk +# `/usr/local/cargo/registry/src/...//.cargo-ok` extraction. The +# second-arriving arch hits `File exists (os error 17)` mid-unpack and +# the whole multi-arch build fails. Per-platform cache id keeps each +# arch's cache isolated; warm-build speedup is preserved per-arch. +# `target` cache is also platform-scoped because target/ holds object +# files for one ABI and sharing them across arches would just produce +# misses or, worse, invalid linking. 
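+# Concretely: BuildKit substitutes the platform string into each id, so on
+# the linux/arm64 job the first mount below resolves to
+#   id=cargo-registry-linux/arm64
+# while the amd64 job gets id=cargo-registry-linux/amd64: two disjoint
+# caches that can never race on the same extraction path.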
+ARG TARGETPLATFORM +RUN --mount=type=cache,target=/usr/local/cargo/registry,id=cargo-registry-${TARGETPLATFORM} \ + --mount=type=cache,target=/usr/local/cargo/git,id=cargo-git-${TARGETPLATFORM} \ + --mount=type=cache,target=/app/target,id=app-target-${TARGETPLATFORM} \ cargo build --release --bin tunnel-node && \ cp /app/target/release/tunnel-node /usr/local/bin/tunnel-node diff --git a/tunnel-node/README.fa.md b/tunnel-node/README.fa.md new file mode 100644 index 00000000..47f891c8 --- /dev/null +++ b/tunnel-node/README.fa.md @@ -0,0 +1,188 @@ +# Tunnel Node — راهنمای فارسی + +> *English: [README.md](./README.md)* + +سرور پل HTTP-tunnel برای حالت `full` در MasterHttpRelayVPN. درخواست‌های HTTP-tunnel رو که از Apps Script می‌رسن، به اتصال‌های واقعی TCP/UDP تبدیل می‌کنه. + +این `tunnel-node` همون قطعه‌ای از مسیر Full mode هست که روی **VPS شما** اجرا می‌شه. جواب کوتاه به سؤال «آیا VPS لازمه؟» = **بله، برای حالت Full بدون VPS کار نمی‌کنه**. + +## معماری + +``` +موبایل/PC → mhrv-rs → [TLS با domain-fronting روی Google] → Apps Script → [HTTP] → Tunnel Node (روی VPS شما) → [TCP/UDP واقعی] → اینترنت +``` + +Tunnel-node session‌های پایدار TCP و UDP رو نگه می‌داره. session‌های TCP اتصال‌های واقعی به سرور مقصد هستن؛ session‌های UDP، socketهای connected-UDP به یک `host:port` مقصد هستن. data از طریق پروتکل JSON جریان داره: + +- **connect** — باز کردن TCP به `host:port` + برگرداندن session ID +- **data** — نوشتن data کلاینت + خوندن جواب سرور +- **udp_open** — باز کردن UDP به `host:port`، اختیاری اولین datagram رو همزمان می‌فرسته +- **udp_data** — یک datagram UDP می‌فرسته، یا اگه `d` ست نشه برای datagram‌های برگشتی poll می‌کنه +- **close** — تخریب session +- **batch** — پردازش چند op در یک request HTTP (تعداد روند-تریپ کمتر) + +## استقرار + +### Cloud Run (پیشنهاد برای کاربران ایرانی متأثر از فیلتر #313) + +اجرای tunnel-node روی **Google Cloud Run** یعنی destination IP خود Google هست — احتمال filter شدن مسیر Apps Script → tunnel-node توسط ISP ایران بسیار پایین‌تر از Hetzner/DigitalOcean. ([کانتکست در #313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313)) + +```bash +cd tunnel-node +gcloud run deploy tunnel-node \ + --source . \ + --region us-central1 \ + --allow-unauthenticated \ + --set-env-vars TUNNEL_AUTH_KEY=$(openssl rand -hex 24) \ + --memory 256Mi \ + --cpu 1 \ + --max-instances 1 +``` + +### Docker — image آماده (هر VPS) + +سریع‌ترین مسیر. image آماده pull کن و اجرا کن؛ نیاز به Rust toolchain روی VPS نیست. + +```bash +# secret قوی بساز. ذخیره‌اش کن — همین مقدار رو بعداً تو CodeFull.gs paste می‌کنی. +SECRET=$(openssl rand -hex 24) +echo "TUNNEL_AUTH_KEY شما: $SECRET" + +# Pull + run. +docker run -d \ + --name mhrv-tunnel \ + --restart unless-stopped \ + -p 8080:8080 \ + -e TUNNEL_AUTH_KEY="$SECRET" \ + ghcr.io/therealaleph/mhrv-tunnel-node:latest +``` + +تگ `:latest` آخرین release رو دنبال می‌کنه. برای production توصیه می‌شه روی version مشخص pin بزنی: `ghcr.io/therealaleph/mhrv-tunnel-node:v1.8.0` (یا هر نسخه‌ای که داری). image روی `linux/amd64` و `linux/arm64` موجوده. + +**docker-compose.yml** اگه این رو ترجیح می‌دی: + +```yaml +services: + tunnel: + image: ghcr.io/therealaleph/mhrv-tunnel-node:latest + restart: unless-stopped + ports: + - "8080:8080" + environment: + TUNNEL_AUTH_KEY: ${TUNNEL_AUTH_KEY} +``` + +سپس `TUNNEL_AUTH_KEY=your-secret docker compose up -d`. + +### Docker — build از source + +اگه می‌خوای خودت image رو build کنی (یا custom تغییر بدی): + +```bash +cd tunnel-node +docker build -t tunnel-node . 
+docker run -p 8080:8080 -e TUNNEL_AUTH_KEY=your-secret tunnel-node +``` + +### Binary مستقیم + +```bash +cd tunnel-node +cargo build --release +TUNNEL_AUTH_KEY=your-secret PORT=8080 ./target/release/tunnel-node +``` + +## متغیرهای محیطی + +| متغیر | الزامی | پیش‌فرض | توضیح | +|-------|--------|---------|-------| +| `TUNNEL_AUTH_KEY` | بله | `changeme` | secret مشترک — باید با `TUNNEL_AUTH_KEY` در CodeFull.gs match کنه | +| `PORT` | خیر | `8080` | پورت listen (Cloud Run خودش این رو ست می‌کنه) | +| `MHRV_DIAGNOSTIC` | خیر | (off) | اگه `1` باشه، روی auth بد به‌جای decoy 404 nginx، JSON `{"e":"unauthorized"}` صریح برمی‌گردونه. **فقط برای setup/debug** — قبل از public کردن tunnel-node خاموشش کن. (v1.8.0+) | + +## پروتکل + +### تک op: `POST /tunnel` + +```json +{"k":"auth","op":"connect","host":"example.com","port":443} +{"k":"auth","op":"data","sid":"uuid","data":"base64"} +{"k":"auth","op":"close","sid":"uuid"} +``` + +### Batch: `POST /tunnel/batch` + +```json +{ + "k": "auth", + "ops": [ + {"op":"data","sid":"uuid1","d":"base64"}, + {"op":"udp_data","sid":"uuid2","d":"base64"}, + {"op":"close","sid":"uuid3"} + ] +} +→ {"r": [{...}, {...}, {...}]} +``` + +### Health check: `GET /health` → `ok` + +## Performance: تعداد deployment و عمق pipeline + +کلاینت mhrv-rs در حالت Full یک batch-multiplexer pipelined اجرا می‌کنه. هر روند-تریپ Apps Script حدود ۲ ثانیه طول می‌کشه، پس کلاینت چندین request batch همزمان شلیک می‌کنه — عمق pipeline برابر تعداد deployment ID‌های Apps Script هست (حداقل ۲، بدون سقف بالا). + +تعداد deployment بیشتر = batchهای همزمان بیشتر روی tunnel-node = latency پایین‌تر برای session. با ۶ deployment، هر ۰.۳ ثانیه یه batch جدید می‌رسه (به‌جای هر ۲ ثانیه). + +خود tunnel-node per-request stateless هست (session‌ها بر اساس UUID key می‌شن)، پس batchهای همزمان رو طبیعی handle می‌کنه. برای بهترین نتیجه، ۳–۱۲ Apps Script روی account‌های Google جداگانه deploy کن و همهٔ deployment ID‌ها رو در config کلاینت لیست کن. + +--- + +## سؤالات رایج + +### حجم مصرف چقدره؟ + +سه لایه overhead هست در حالت Full: + +1. **Base64 encoding** برای data ها در JSON envelope = ~۳۳٪ overhead روی payload (4 byte per 3 byte raw) +2. **JSON envelope + headers** = ~۵-۱۵٪ overhead بسته به اندازه payload +3. **Random padding (v1.8.0+)** برای DPI defense = متوسط ۵۱۲ بایت اضافه به هر batch + +تخمین کلی: اگه ۱ GB دانلود می‌کنی، ~۱.۲۵-۱.۳ GB روی پهنای باند VPS مصرف می‌کنه. + +برای ۲۰ GB ماهانه استفاده روزمره (browsing + Telegram + ویدیو متوسط)، ~۲۵-۲۷ GB پهنای باند VPS لازم داری. Hetzner CX11 (€۴/ماه) ۲۰ TB ماهانه می‌ده — یعنی به سقف نمی‌رسی مگه streaming سنگین. + +### روی موبایل کل برنامه‌ها رو بالا میاره؟ + +**بستگی به Mode داره:** + +- **mhrv-rs Android در Tunnel mode (Operating Mode → Tunnel)** + Full + tunnel-node = ✅ کل ترافیک Android (شامل YouTube، Telegram MTProto، Instagram، Snapchat، هر چیزی) capture می‌شه. این از VpnService استفاده می‌کنه. +- **mhrv-rs Android در Proxy mode** + Full + tunnel-node = فقط app‌هایی که proxy رو صریحاً respect می‌کنن (Chrome، Firefox، برخی app‌های Telegram-فارسی). YouTube/Insta/Telegram اصلی proxy رو نادیده می‌گیرن + از mhrv-rs رد نمی‌شن. + +برای اینکه «همهٔ app‌ها بیان» = حتماً **Tunnel mode** فعال کن. + +### سرعت چقدر خوبه؟ + +برای یک flow (یک download، یک ویدیو، یک TCP connection) معمولاً **۵۰–۲۰۰ KB/s** هست. 
علت: + +- Apps Script روند-تریپ floor ~۲۰۰-۵۰۰ ms داره (غیر قابل پایین آوردن، Google-side limit) +- هر batch به یک deployment باند می‌شه + هر flow به یک batch +- در نتیجه per-flow throughput = batch_size / batch_round_trip = (~۶۴-۲۵۶ KB) / (~۲۵۰-۵۰۰ ms) ≈ ۱۲۸-۵۰۰ KB/s ceiling + +برای **چند flow همزمان** (browsing با چند تب، Telegram + YouTube همزمان)، throughput جمعی به sum از همه flow‌ها مقیاس می‌خوره — با ۶ deployment روی ۶ Google account می‌تونی ۶ flow همزمان داشته باشی. + +**توصیه واقع‌بینانه:** برای browsing عادی + chat + ویدیو متوسط = کافیه. برای دانلود فایل‌های بزرگ سریع، **Wireguard مستقیم روی همان VPS** ابزار درست‌تره (۵-۱۰x سریع‌تر، چون Apps Script رو دور می‌زنه). mhrv-rs ارزش اصلیش لایه «دور زدن censorship با domain-fronting» هست، نه سرعت raw — وقتی به اون لایه نیاز نداری (مسیر مستقیم به VPS باز هست)، ابزار ساده‌تر بهتره. + +### آیا VPS لازمه؟ + +برای **حالت Full** (شامل Telegram، YouTube بدون 60s SABR cliff، WebSockets، MTProto و هر چیزی غیر-HTTPS-ساده): **بله، VPS الزامی هست**. + +برای **حالت `apps_script`** (browsing فقط HTTPS): **خیر، نیاز به VPS نیست** — فقط نیاز به Apps Script setup روی Google account داری. + +برای **حالت `google_only`** (فقط Google services مثل Search/Gmail/YouTube ساده): **نه VPS لازمه نه Apps Script** — بوت‌استرپ ساده. + +### چه VPS‌ای پیشنهاد می‌شه؟ + +- **Hetzner CX11** (Falkenstein/Helsinki، €۴/ماه) — best value، ۲۰ TB ماهانه، خوب برای کاربران اروپا/خاورمیانه +- **DigitalOcean basic droplet** ($۶/ماه، NYC/SFO) — برای کاربران آمریکا +- **Google Cloud Run** (free tier تا ۲ میلیون request/ماه + ۵ GB egress) — تنها provider که destination IP اصلاً Google هست، پس مسیر Iran→Apps Script→Cloud-Run-tunnel-node کاملاً درون شبکه Google می‌مونه و ISP filter نمی‌بینه. **بهترین گزینه برای کاربران ایرانی متأثر از [#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313)**. + +برای راهنمای قدم‌به‌قدم setup: [#310 reply (راهنمای فارسی)](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/310#issuecomment-4326086988). diff --git a/tunnel-node/README.md b/tunnel-node/README.md index 88d884ba..529c15e3 100644 --- a/tunnel-node/README.md +++ b/tunnel-node/README.md @@ -1,5 +1,7 @@ # Tunnel Node +> *Persian / فارسی: [README.fa.md](./README.fa.md)* + HTTP tunnel bridge server for MasterHttpRelayVPN "full" mode. Bridges HTTP tunnel requests (from Apps Script) to real TCP connections. ## Architecture diff --git a/tunnel-node/src/main.rs b/tunnel-node/src/main.rs index e03ff5e8..9ad9d7fd 100644 --- a/tunnel-node/src/main.rs +++ b/tunnel-node/src/main.rs @@ -22,12 +22,14 @@ use axum::{routing::post, Json, Router}; use base64::engine::general_purpose::STANDARD as B64; use base64::Engine; use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; +use tokio::net::tcp::OwnedWriteHalf; use tokio::net::{lookup_host, TcpStream, UdpSocket}; use tokio::sync::{mpsc, Mutex, Notify}; use tokio::task::JoinSet; +mod udpgw; + /// Structured error code returned when the tunnel-node receives an op it /// doesn't recognize. Clients use this (rather than string-matching `e`) to /// detect a version mismatch and gracefully fall back. @@ -95,8 +97,30 @@ const UDP_QUEUE_DROP_LOG_STRIDE: u64 = 100; // Session // --------------------------------------------------------------------------- +/// Writer half — either a real TCP socket or an in-process duplex channel +/// (used for virtual sessions like udpgw). 
+enum SessionWriter { + Tcp(OwnedWriteHalf), + Duplex(tokio::io::WriteHalf), +} + +impl SessionWriter { + async fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { + match self { + SessionWriter::Tcp(w) => w.write_all(buf).await, + SessionWriter::Duplex(w) => w.write_all(buf).await, + } + } + async fn flush(&mut self) -> std::io::Result<()> { + match self { + SessionWriter::Tcp(w) => w.flush().await, + SessionWriter::Duplex(w) => w.flush().await, + } + } +} + struct SessionInner { - writer: Mutex, + writer: Mutex, read_buf: Mutex>, eof: AtomicBool, last_active: Mutex, @@ -110,6 +134,17 @@ struct SessionInner { struct ManagedSession { inner: Arc, reader_handle: tokio::task::JoinHandle<()>, + /// For udpgw sessions, the server task handle (so we can abort on close). + udpgw_handle: Option>, +} + +impl ManagedSession { + fn abort_all(&self) { + self.reader_handle.abort(); + if let Some(ref h) = self.udpgw_handle { + h.abort(); + } + } } /// UDP equivalent of `SessionInner`. Holds a *connected* `UdpSocket` @@ -148,7 +183,7 @@ async fn create_session(host: &str, port: u16) -> std::io::Result std::io::Result ManagedSession { + let (client_half, server_half) = tokio::io::duplex(65536); + let (read_half, write_half) = tokio::io::split(client_half); + + let inner = Arc::new(SessionInner { + writer: Mutex::new(SessionWriter::Duplex(write_half)), + read_buf: Mutex::new(Vec::with_capacity(32768)), + eof: AtomicBool::new(false), + last_active: Mutex::new(Instant::now()), + notify: Notify::new(), + }); + + let inner_ref = inner.clone(); + let reader_handle = tokio::spawn(reader_task(read_half, inner_ref)); + let udpgw_handle = Some(tokio::spawn(udpgw::udpgw_server_task(server_half))); + + ManagedSession { inner, reader_handle, udpgw_handle } } -async fn reader_task(mut reader: OwnedReadHalf, session: Arc) { +async fn reader_task(mut reader: impl AsyncRead + Unpin, session: Arc) { let mut buf = vec![0u8; 65536]; loop { match reader.read(&mut buf).await { @@ -480,6 +535,16 @@ struct AppState { sessions: Arc>>, udp_sessions: Arc>>, auth_key: String, + /// Active probing defense: when false (default, production), bad + /// AUTH_KEY responses are a generic-looking 404 with no JSON-shaped + /// "unauthorized" body — same as a static nginx 404. Active scanners + /// that POST malformed payloads to `/tunnel` to discover proxy + /// endpoints categorize this as a non-tunnel host and move on. + /// Enable via `MHRV_DIAGNOSTIC=1` for setup/debugging — restores the + /// previous JSON `{"e":"unauthorized"}` body so it's clear *which* + /// of "wrong key", "wrong URL path", or "wrong tunnel-node" you've + /// hit. (Inspired by #365 Section 3.) 
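+    /// A quick way to see the difference during setup (illustrative
+    /// host/port and payload; adjust to your deployment):
+    ///
+    ///     $ curl -si -X POST localhost:8080/tunnel \
+    ///         -H 'content-type: application/json' \
+    ///         -d '{"k":"wrong-key","op":"close"}'
+    ///     # default:            HTTP/1.1 404 Not Found   (nginx-style decoy)
+    ///     # MHRV_DIAGNOSTIC=1:  HTTP/1.1 200 OK          {"e":"unauthorized"}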
+    diagnostic_mode: bool,
 }
 
 // ---------------------------------------------------------------------------
 
@@ -553,19 +618,41 @@ struct BatchResponse {
 async fn handle_tunnel(
     State(state): State<AppState>,
     Json(req): Json<TunnelRequest>,
-) -> Json<TunnelResponse> {
+) -> axum::response::Response {
     if req.k != state.auth_key {
-        return Json(TunnelResponse::error("unauthorized"));
+        return decoy_or_unauthorized(state.diagnostic_mode);
     }
-    match req.op.as_str() {
-        "connect" => Json(handle_connect(&state, req.host, req.port).await),
+    let resp: TunnelResponse = match req.op.as_str() {
+        "connect" => handle_connect(&state, req.host, req.port).await,
         "connect_data" => {
-            Json(handle_connect_data_single(&state, req.host, req.port, req.data).await)
+            handle_connect_data_single(&state, req.host, req.port, req.data).await
        }
-        "data" => Json(handle_data_single(&state, req.sid, req.data).await),
-        "close" => Json(handle_close(&state, req.sid).await),
-        other => Json(TunnelResponse::unsupported_op(other)),
-    }
+        "data" => handle_data_single(&state, req.sid, req.data).await,
+        "close" => handle_close(&state, req.sid).await,
+        other => TunnelResponse::unsupported_op(other),
+    };
+    Json(resp).into_response()
+}
+
+/// Active-probing defense for the bad-auth path. Production default is
+/// a 404 with a generic "Not Found" HTML body that mimics a vanilla
+/// nginx/apache static error page — active scanners categorize this
+/// as a regular web server with nothing interesting and move on.
+/// `MHRV_DIAGNOSTIC=1` restores the previous JSON `{"e":"unauthorized"}`
+/// body so misconfigured clients get a clear error during setup.
+fn decoy_or_unauthorized(diagnostic_mode: bool) -> axum::response::Response {
+    if diagnostic_mode {
+        return Json(TunnelResponse::error("unauthorized")).into_response();
+    }
+    let body = "<html>\r\n<head><title>404 Not Found</title></head>\r\n\
+        <body>\r\n<center><h1>404 Not Found</h1></center>\r\n\
+        <hr><center>nginx</center>\r\n</body>\r\n</html>\r\n";
+    (
+        StatusCode::NOT_FOUND,
+        [(header::CONTENT_TYPE, "text/html")],
+        body,
+    )
+        .into_response()
+}
 
 // ---------------------------------------------------------------------------
 
@@ -602,10 +689,20 @@ async fn handle_batch(
     };
 
     if req.k != state.auth_key {
-        let resp = serde_json::to_vec(&BatchResponse {
-            r: vec![TunnelResponse::error("unauthorized")],
-        }).unwrap_or_default();
-        return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp);
+        if state.diagnostic_mode {
+            let resp = serde_json::to_vec(&BatchResponse {
+                r: vec![TunnelResponse::error("unauthorized")],
+            }).unwrap_or_default();
+            return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp);
+        }
+        // Production: same nginx-404 decoy as the single-op path. See
+        // `decoy_or_unauthorized` for rationale.
+        let body = "<html>\r\n<head><title>404 Not Found</title></head>\r\n\
+            <body>\r\n<center><h1>404 Not Found</h1></center>\r\n\
+            <hr><center>nginx</center>\r\n</body>\r\n</html>
\r\n\r\n\r\n" + .as_bytes() + .to_vec(); + return (StatusCode::NOT_FOUND, [(header::CONTENT_TYPE, "text/html")], body); } // Process all ops in two phases. @@ -971,9 +1068,13 @@ async fn handle_connect(state: &AppState, host: Option, port: Option v, Err(r) => return r, }; - let session = match create_session(&host, port).await { - Ok(s) => s, - Err(e) => return TunnelResponse::error(format!("connect failed: {}", e)), + let session = if udpgw::is_udpgw_dest(&host, port) { + create_udpgw_session() + } else { + match create_session(&host, port).await { + Ok(s) => s, + Err(e) => return TunnelResponse::error(format!("connect failed: {}", e)), + } }; let sid = uuid::Uuid::new_v4().to_string(); tracing::info!("session {} -> {}:{}", sid, host, port); @@ -995,9 +1096,13 @@ async fn handle_connect_data_phase1( ) -> Result<(String, Arc), TunnelResponse> { let (host, port) = validate_host_port(host, port)?; - let session = create_session(&host, port) - .await - .map_err(|e| TunnelResponse::error(format!("connect failed: {}", e)))?; + let session = if udpgw::is_udpgw_dest(&host, port) { + create_udpgw_session() + } else { + create_session(&host, port) + .await + .map_err(|e| TunnelResponse::error(format!("connect failed: {}", e)))? + }; // Any failure below this point must abort the reader task, otherwise // the newly-opened upstream TCP connection would leak. Keep the @@ -1146,7 +1251,7 @@ async fn handle_close(state: &AppState, sid: Option) -> TunnelResponse { _ => return TunnelResponse::error("missing sid"), }; if let Some(s) = state.sessions.lock().await.remove(&sid) { - s.reader_handle.abort(); + s.abort_all(); tracing::info!("session {} closed by client", sid); } if let Some(s) = state.udp_sessions.lock().await.remove(&sid) { @@ -1248,7 +1353,20 @@ async fn main() { Arc::new(Mutex::new(HashMap::new())); tokio::spawn(cleanup_task(sessions.clone(), udp_sessions.clone())); - let state = AppState { sessions, udp_sessions, auth_key }; + // MHRV_DIAGNOSTIC=1 in env restores verbose JSON error responses on + // bad auth (instead of the nginx-404 decoy). Use during setup so + // misconfigured clients see "unauthorized"; flip back off in prod. + let diagnostic_mode = std::env::var("MHRV_DIAGNOSTIC") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + if diagnostic_mode { + tracing::warn!( + "MHRV_DIAGNOSTIC=1 — bad-auth responses are verbose JSON \ + errors instead of the production nginx-404 decoy. Disable \ + before exposing this tunnel-node to the public internet." + ); + } + let state = AppState { sessions, udp_sessions, auth_key, diagnostic_mode }; let app = Router::new() .route("/tunnel", post(handle_tunnel)) @@ -1283,6 +1401,10 @@ mod tests { sessions: Arc::new(Mutex::new(HashMap::new())), udp_sessions: Arc::new(Mutex::new(HashMap::new())), auth_key: "test-key".into(), + // Tests assert against the JSON `unauthorized` body shape + // (see e.g. `bad_auth_returns_unauthorized`), so they need + // diagnostic_mode enabled. Production default is false. 
+ diagnostic_mode: true, } } @@ -1430,7 +1552,7 @@ mod tests { let (_reader, writer) = client.into_split(); Arc::new(SessionInner { - writer: Mutex::new(writer), + writer: Mutex::new(SessionWriter::Tcp(writer)), read_buf: Mutex::new(Vec::new()), eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), @@ -1597,7 +1719,7 @@ mod tests { let stream = TcpStream::connect(addr).await.unwrap(); let (reader, writer) = stream.into_split(); let inner = Arc::new(SessionInner { - writer: Mutex::new(writer), + writer: Mutex::new(SessionWriter::Tcp(writer)), read_buf: Mutex::new(Vec::new()), eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), diff --git a/tunnel-node/src/udpgw.rs b/tunnel-node/src/udpgw.rs new file mode 100644 index 00000000..3c6e1800 --- /dev/null +++ b/tunnel-node/src/udpgw.rs @@ -0,0 +1,512 @@ +//! Native implementation of the tun2proxy udpgw wire protocol. +//! +//! Wire format (all fields big-endian): +//! ```text +//! +-----+-------+---------+------+----------+----------+----------+ +//! | LEN | FLAGS | CONN_ID | ATYP | DST.ADDR | DST.PORT | DATA | +//! +-----+-------+---------+------+----------+----------+----------+ +//! | 2 | 1 | 2 | 1 | Variable | 2 | Variable | +//! +-----+-------+---------+------+----------+----------+----------+ +//! ``` +//! +//! Flags: KEEPALIVE=0x01, DATA=0x02, ERR=0x20 +//! ATYP: 0x01=IPv4(4B), 0x03=Domain(1B len + name), 0x04=IPv6(16B) + +use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6}; +use std::sync::Arc; + +use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream}; +use tokio::net::UdpSocket; + +/// Magic address that the client connects to via the tunnel protocol. +/// `198.18.0.0/15` is reserved for benchmarking (RFC 2544) and will +/// never be a real destination. +pub const UDPGW_MAGIC_IP: [u8; 4] = [198, 18, 0, 1]; +pub const UDPGW_MAGIC_PORT: u16 = 7300; + +const FLAG_KEEPALIVE: u8 = 0x01; +const FLAG_DATA: u8 = 0x02; +const FLAG_ERR: u8 = 0x20; + +const ATYP_IPV4: u8 = 0x01; +const ATYP_DOMAIN: u8 = 0x03; +const ATYP_IPV6: u8 = 0x04; + +/// Maximum UDP payload we'll handle. +const UDP_MTU: usize = 10240; + +// ------------------------------------------------------------------------- +// Frame types +// ------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +enum DstAddr { + V4(Ipv4Addr, u16), + V6(Ipv6Addr, u16), + Domain(String, u16), +} + +impl DstAddr { + fn to_socket_addr(&self) -> std::io::Result { + match self { + DstAddr::V4(ip, port) => Ok(SocketAddr::V4(SocketAddrV4::new(*ip, *port))), + DstAddr::V6(ip, port) => Ok(SocketAddr::V6(SocketAddrV6::new(*ip, *port, 0, 0))), + DstAddr::Domain(name, port) => { + use std::net::ToSocketAddrs; + (name.as_str(), *port) + .to_socket_addrs()? + .next() + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::AddrNotAvailable, "DNS resolution failed")) + } + } + } + + /// Serialise into SOCKS5 address format: ATYP + addr + port. 
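+    /// e.g. `8.8.8.8:53` serialises as `01 08 08 08 08 00 35`
+    /// (ATYP_IPV4, four address octets, big-endian port).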
+ fn write_to(&self, buf: &mut Vec) { + match self { + DstAddr::V4(ip, port) => { + buf.push(ATYP_IPV4); + buf.extend_from_slice(&ip.octets()); + buf.extend_from_slice(&port.to_be_bytes()); + } + DstAddr::V6(ip, port) => { + buf.push(ATYP_IPV6); + buf.extend_from_slice(&ip.octets()); + buf.extend_from_slice(&port.to_be_bytes()); + } + DstAddr::Domain(name, port) => { + buf.push(ATYP_DOMAIN); + buf.push(name.len() as u8); + buf.extend_from_slice(name.as_bytes()); + buf.extend_from_slice(&port.to_be_bytes()); + } + } + } + + fn serialised_len(&self) -> usize { + match self { + DstAddr::V4(..) => 1 + 4 + 2, // ATYP + IPv4 + port + DstAddr::V6(..) => 1 + 16 + 2, // ATYP + IPv6 + port + DstAddr::Domain(n, _) => 1 + 1 + n.len() + 2, // ATYP + len + name + port + } + } +} + +#[derive(Debug)] +struct Frame { + flags: u8, + conn_id: u16, + addr: Option, + payload: Vec, +} + +// ------------------------------------------------------------------------- +// Parse / serialise +// ------------------------------------------------------------------------- + +/// Try to parse one frame from `buf`. Returns `(frame, bytes_consumed)` or +/// `None` if the buffer doesn't contain a complete frame yet. +fn try_parse_frame(buf: &[u8]) -> Result, std::io::Error> { + if buf.len() < 2 { + return Ok(None); + } + let body_len = u16::from_be_bytes([buf[0], buf[1]]) as usize; + let total = 2 + body_len; + if buf.len() < total { + return Ok(None); + } + + let body = &buf[2..total]; + if body.len() < 3 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "frame too short")); + } + let flags = body[0]; + let conn_id = u16::from_be_bytes([body[1], body[2]]); + let rest = &body[3..]; + + let (addr, payload_start) = if flags & FLAG_DATA != 0 { + // Parse SOCKS5-style address. 
+ if rest.is_empty() { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "missing ATYP")); + } + let atyp = rest[0]; + match atyp { + ATYP_IPV4 => { + if rest.len() < 1 + 4 + 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short IPv4 addr")); + } + let ip = Ipv4Addr::new(rest[1], rest[2], rest[3], rest[4]); + let port = u16::from_be_bytes([rest[5], rest[6]]); + (Some(DstAddr::V4(ip, port)), 7) + } + ATYP_IPV6 => { + if rest.len() < 1 + 16 + 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short IPv6 addr")); + } + let mut octets = [0u8; 16]; + octets.copy_from_slice(&rest[1..17]); + let ip = Ipv6Addr::from(octets); + let port = u16::from_be_bytes([rest[17], rest[18]]); + (Some(DstAddr::V6(ip, port)), 19) + } + ATYP_DOMAIN => { + if rest.len() < 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short domain addr")); + } + let dlen = rest[1] as usize; + if rest.len() < 2 + dlen + 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short domain addr")); + } + let name = String::from_utf8_lossy(&rest[2..2 + dlen]).into_owned(); + let port = u16::from_be_bytes([rest[2 + dlen], rest[3 + dlen]]); + (Some(DstAddr::Domain(name, port)), 2 + dlen + 2) + } + _ => { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, format!("unknown ATYP 0x{:02x}", atyp))); + } + } + } else { + (None, 0) + }; + + let payload = rest[payload_start..].to_vec(); + + Ok(Some((Frame { flags, conn_id, addr, payload }, total))) +} + +fn serialise_frame(frame: &Frame) -> Vec { + // Body = flags(1) + conn_id(2) + [addr] + payload + let addr_len = frame.addr.as_ref().map_or(0, |a| a.serialised_len()); + let body_len = 1 + 2 + addr_len + frame.payload.len(); + + let mut buf = Vec::with_capacity(2 + body_len); + buf.extend_from_slice(&(body_len as u16).to_be_bytes()); + buf.push(frame.flags); + buf.extend_from_slice(&frame.conn_id.to_be_bytes()); + if let Some(ref addr) = frame.addr { + addr.write_to(&mut buf); + } + buf.extend_from_slice(&frame.payload); + buf +} + +// ------------------------------------------------------------------------- +// Public API +// ------------------------------------------------------------------------- + +/// Returns `true` if the connect destination is the magic udpgw address. +pub fn is_udpgw_dest(host: &str, port: u16) -> bool { + port == UDPGW_MAGIC_PORT && host == format!("{}.{}.{}.{}", UDPGW_MAGIC_IP[0], UDPGW_MAGIC_IP[1], UDPGW_MAGIC_IP[2], UDPGW_MAGIC_IP[3]) +} + +/// Per-conn_id persistent UDP socket with a background reader that +/// continuously receives datagrams and queues response frames. +struct ConnSocket { + sock: Arc, + _reader: tokio::task::AbortHandle, +} + +/// Run the udpgw server over a duplex stream. Reads udpgw frames from the +/// client half, sends real UDP datagrams, and writes response frames back. +/// Maintains persistent sockets per conn_id so Telegram VoIP (which expects +/// a stable source port) works correctly. +pub async fn udpgw_server_task(stream: DuplexStream) { + let (tx, mut rx) = tokio::sync::mpsc::channel::>(256); + + // Writer task: drains response channel → duplex stream. + let mut read_half = { + let (read_half, write_half) = tokio::io::split(stream); + tokio::spawn(async move { + let mut w = write_half; + while let Some(data) = rx.recv().await { + if w.write_all(&data).await.is_err() { + break; + } + let _ = w.flush().await; + } + }); + read_half + }; + + // Persistent sockets keyed by (conn_id, dest_addr). 
+ let mut sockets: std::collections::HashMap<(u16, SocketAddr), ConnSocket> = std::collections::HashMap::new(); + + let mut buf = Vec::with_capacity(65536); + let mut tmp = [0u8; 65536]; + + loop { + let n = match read_half.read(&mut tmp).await { + Ok(0) | Err(_) => break, + Ok(n) => n, + }; + buf.extend_from_slice(&tmp[..n]); + + loop { + match try_parse_frame(&buf) { + Ok(Some((frame, consumed))) => { + buf.drain(..consumed); + handle_frame(&frame, &tx, &mut sockets).await; + } + Ok(None) => break, + Err(e) => { + tracing::warn!("udpgw frame parse error: {}", e); + if buf.len() >= 2 { + let skip = 2 + u16::from_be_bytes([buf[0], buf[1]]) as usize; + buf.drain(..skip.min(buf.len())); + } else { + buf.clear(); + } + break; + } + } + } + } + + // AbortHandle::drop aborts each reader task automatically. + drop(sockets); + tracing::debug!("udpgw session ended"); +} + +/// Get or create a persistent UDP socket for this (conn_id, dest_addr) pair. +/// A background reader task continuously receives datagrams and queues +/// response frames — no per-packet timeout needed. +async fn get_or_create_socket( + conn_id: u16, + dst: &SocketAddr, + addr: &DstAddr, + tx: &tokio::sync::mpsc::Sender>, + sockets: &mut std::collections::HashMap<(u16, SocketAddr), ConnSocket>, +) -> Option> { + let key = (conn_id, *dst); + if let Some(cs) = sockets.get(&key) { + return Some(cs.sock.clone()); + } + + let bind_addr: SocketAddr = if dst.is_ipv6() { + "[::]:0".parse().unwrap() + } else { + "0.0.0.0:0".parse().unwrap() + }; + let sock = match UdpSocket::bind(bind_addr).await { + Ok(s) => Arc::new(s), + Err(e) => { + tracing::debug!("udpgw bind failed: {}", e); + return None; + } + }; + if let Err(e) = sock.connect(dst).await { + tracing::debug!("udpgw connect {} failed: {}", dst, e); + return None; + } + + // Spawn continuous reader for this socket. + let sock_clone = sock.clone(); + let tx_clone = tx.clone(); + let addr_clone = addr.clone(); + let reader = tokio::spawn(async move { + let mut recv_buf = vec![0u8; UDP_MTU]; + loop { + match sock_clone.recv(&mut recv_buf).await { + Ok(n) => { + let resp = serialise_frame(&Frame { + flags: FLAG_DATA, + conn_id, + addr: Some(addr_clone.clone()), + payload: recv_buf[..n].to_vec(), + }); + if tx_clone.send(resp).await.is_err() { + break; + } + } + Err(_) => break, + } + } + }); + + sockets.insert(key, ConnSocket { sock: sock.clone(), _reader: reader.abort_handle() }); + Some(sock) +} + +async fn handle_frame( + frame: &Frame, + tx: &tokio::sync::mpsc::Sender>, + sockets: &mut std::collections::HashMap<(u16, SocketAddr), ConnSocket>, +) { + if frame.flags & FLAG_KEEPALIVE != 0 { + let resp = serialise_frame(&Frame { + flags: FLAG_KEEPALIVE, + conn_id: frame.conn_id, + addr: None, + payload: vec![], + }); + let _ = tx.send(resp).await; + return; + } + + if frame.flags & FLAG_DATA == 0 { + return; + } + + let Some(ref dst) = frame.addr else { + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + }; + + // Block QUIC (UDP 443) and DNS (UDP 53) from udpgw: + // - QUIC: forces browsers to fall back to TCP/HTTP2 which is much + // faster through the batch tunnel pipeline. + // - DNS: let tun2proxy's virtual DNS / SOCKS5 UDP associate handle + // it instead — more reliable on the per-session path. + // VoIP (Telegram, Meet) still flows through udpgw normally. 
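+    // Concretely: a QUIC initial to any host:443 or a DNS query to
+    // 8.8.8.8:53 gets an immediate ERR frame back; the browser then
+    // falls back to TCP, and DNS is answered on the virtual-DNS path.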
+ let dst_port = match dst { + DstAddr::V4(_, p) | DstAddr::V6(_, p) | DstAddr::Domain(_, p) => *p, + }; + if dst_port == 443 || dst_port == 53 { + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + } + + let dst_addr = match dst.to_socket_addr() { + Ok(a) => a, + Err(e) => { + tracing::debug!("udpgw resolve failed: {}", e); + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + } + }; + + let Some(sock) = get_or_create_socket(frame.conn_id, &dst_addr, dst, tx, sockets).await else { + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + }; + + // Send the datagram. Response comes asynchronously via the reader task. + if let Err(e) = sock.send(&frame.payload).await { + tracing::debug!("udpgw send to {} failed: {}", dst_addr, e); + let _ = tx.send(serialise_err(frame.conn_id)).await; + } +} + +fn serialise_err(conn_id: u16) -> Vec { + serialise_frame(&Frame { + flags: FLAG_ERR, + conn_id, + addr: None, + payload: vec![], + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn keepalive_round_trip() { + let frame = Frame { flags: FLAG_KEEPALIVE, conn_id: 42, addr: None, payload: vec![] }; + let bytes = serialise_frame(&frame); + let (parsed, consumed) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(consumed, bytes.len()); + assert_eq!(parsed.flags, FLAG_KEEPALIVE); + assert_eq!(parsed.conn_id, 42); + assert!(parsed.addr.is_none()); + assert!(parsed.payload.is_empty()); + } + + #[test] + fn data_ipv4_round_trip() { + let frame = Frame { + flags: FLAG_DATA, + conn_id: 7, + addr: Some(DstAddr::V4(Ipv4Addr::new(8, 8, 8, 8), 53)), + payload: vec![1, 2, 3, 4], + }; + let bytes = serialise_frame(&frame); + let (parsed, consumed) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(consumed, bytes.len()); + assert_eq!(parsed.flags, FLAG_DATA); + assert_eq!(parsed.conn_id, 7); + assert_eq!(parsed.payload, vec![1, 2, 3, 4]); + match parsed.addr.unwrap() { + DstAddr::V4(ip, port) => { + assert_eq!(ip, Ipv4Addr::new(8, 8, 8, 8)); + assert_eq!(port, 53); + } + _ => panic!("expected IPv4"), + } + } + + #[test] + fn data_ipv6_round_trip() { + let frame = Frame { + flags: FLAG_DATA, + conn_id: 100, + addr: Some(DstAddr::V6(Ipv6Addr::LOCALHOST, 443)), + payload: b"hello".to_vec(), + }; + let bytes = serialise_frame(&frame); + let (parsed, _) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(parsed.conn_id, 100); + match parsed.addr.unwrap() { + DstAddr::V6(ip, port) => { + assert_eq!(ip, Ipv6Addr::LOCALHOST); + assert_eq!(port, 443); + } + _ => panic!("expected IPv6"), + } + } + + #[test] + fn data_domain_round_trip() { + let frame = Frame { + flags: FLAG_DATA, + conn_id: 5, + addr: Some(DstAddr::Domain("example.com".into(), 80)), + payload: b"GET /".to_vec(), + }; + let bytes = serialise_frame(&frame); + let (parsed, _) = try_parse_frame(&bytes).unwrap().unwrap(); + match parsed.addr.unwrap() { + DstAddr::Domain(name, port) => { + assert_eq!(name, "example.com"); + assert_eq!(port, 80); + } + _ => panic!("expected Domain"), + } + } + + #[test] + fn err_frame_round_trip() { + let bytes = serialise_err(99); + let (parsed, _) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(parsed.flags, FLAG_ERR); + assert_eq!(parsed.conn_id, 99); + } + + #[test] + fn partial_frame_returns_none() { + let frame = Frame { flags: FLAG_KEEPALIVE, conn_id: 1, addr: None, payload: vec![] }; + let bytes = serialise_frame(&frame); + // Give it only half the bytes. 
+ assert!(try_parse_frame(&bytes[..bytes.len() / 2]).unwrap().is_none()); + } + + #[test] + fn two_frames_in_buffer() { + let f1 = serialise_frame(&Frame { flags: FLAG_KEEPALIVE, conn_id: 1, addr: None, payload: vec![] }); + let f2 = serialise_frame(&Frame { flags: FLAG_KEEPALIVE, conn_id: 2, addr: None, payload: vec![] }); + let mut buf = f1.clone(); + buf.extend_from_slice(&f2); + + let (p1, c1) = try_parse_frame(&buf).unwrap().unwrap(); + assert_eq!(p1.conn_id, 1); + let (p2, _) = try_parse_frame(&buf[c1..]).unwrap().unwrap(); + assert_eq!(p2.conn_id, 2); + } + + #[test] + fn is_udpgw_dest_works() { + assert!(is_udpgw_dest("198.18.0.1", 7300)); + assert!(!is_udpgw_dest("198.18.0.1", 80)); + assert!(!is_udpgw_dest("8.8.8.8", 7300)); + } +}
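For reference, the wire layout the udpgw tests above exercise, written out byte by byte. A self-contained sketch that mirrors `serialise_frame`; all values are local to the example:

```rust
// DATA frame: 4-byte payload to 8.8.8.8:53 on conn_id 7.
// LEN counts everything after itself:
// flags(1) + conn_id(2) + ATYP(1) + IPv4(4) + port(2) + payload(4) = 14.
fn main() {
    let data_frame: &[u8] = &[
        0x00, 0x0E, // LEN = 14, big-endian
        0x02,       // FLAGS = DATA
        0x00, 0x07, // CONN_ID = 7
        0x01,       // ATYP = IPv4
        8, 8, 8, 8, // DST.ADDR
        0x00, 0x35, // DST.PORT = 53
        1, 2, 3, 4, // DATA
    ];
    // A keepalive is just LEN=3 + FLAGS + CONN_ID, with no address and
    // no payload; conn_id 42 = 0x002A.
    let keepalive: &[u8] = &[0x00, 0x03, 0x01, 0x00, 0x2A];
    assert_eq!(data_frame.len(), 2 + 14);
    assert_eq!(keepalive.len(), 2 + 3);
}
```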