From 2187840f2a40a8b32bb8049c7cfec3704c2730fc Mon Sep 17 00:00:00 2001 From: carlosabadia Date: Mon, 4 May 2026 13:35:23 +0200 Subject: [PATCH 1/4] Add checks for broken docs urls --- .github/workflows/check_doc_links.yml | 39 ++++++++ docs/app/scripts/check_doc_links.py | 136 ++++++++++++++++++++++++++ docs/app/tests/test_doc_links.py | 128 ++++++++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 .github/workflows/check_doc_links.yml create mode 100644 docs/app/scripts/check_doc_links.py create mode 100644 docs/app/tests/test_doc_links.py diff --git a/.github/workflows/check_doc_links.yml b/.github/workflows/check_doc_links.yml new file mode 100644 index 00000000000..37c959b44b4 --- /dev/null +++ b/.github/workflows/check_doc_links.yml @@ -0,0 +1,39 @@ +name: check-doc-links +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.id || github.ref }} + cancel-in-progress: true + +on: + pull_request: + branches: ["main"] + paths: + - "docs/**/*.md" + - "docs/app/scripts/check_doc_links.py" + - ".github/workflows/check_doc_links.yml" + push: + branches: ["main"] + paths: + - "docs/**/*.md" + - "docs/app/scripts/check_doc_links.py" + - ".github/workflows/check_doc_links.yml" + +jobs: + check-doc-links: + timeout-minutes: 20 + runs-on: ubuntu-latest + defaults: + run: + working-directory: docs/app + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup_build_env + with: + python-version: 3.14 + run-uv-sync: true + - name: Build frontend to generate sitemap.xml + run: uv run reflex export --frontend-only --no-zip + - name: Validate /docs links against sitemap.xml + run: uv run python scripts/check_doc_links.py diff --git a/docs/app/scripts/check_doc_links.py b/docs/app/scripts/check_doc_links.py new file mode 100644 index 00000000000..c4d744d1e52 --- /dev/null +++ b/docs/app/scripts/check_doc_links.py @@ -0,0 +1,136 @@ +"""Validate /docs/* markdown links against the generated sitemap.xml. + +For every .md file under the docs tree, find markdown links of the form +`[text](/docs/...)` and verify: + +1. The URL path contains no underscores (URLs use hyphens). +2. After stripping the `/docs` prefix, the path exists in sitemap.xml. + +Run after building the frontend so .web/public/sitemap.xml is present, e.g.: + + cd docs/app + uv run reflex export --frontend-only --no-zip + uv run python scripts/check_doc_links.py +""" + +from __future__ import annotations + +import argparse +import re +import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from urllib.parse import urlparse + +LINK_RE = re.compile(r"\]\(\s*(/docs(?=[/)#?\s])[^)]*?)(?:\s+\"[^\"]*\")?\s*\)") +SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"} +SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"} + + +def _normalize(path: str) -> str: + path = path.split("#", 1)[0].split("?", 1)[0] + if not path.startswith("/"): + path = "/" + path + return path.rstrip("/") or "/" + + +def _strip_docs_prefix(path: str) -> str: + """Drop a leading `/docs` segment so both deployment styles compare equal.""" + if path == "/docs": + return "/" + if path.startswith("/docs/"): + return path[len("/docs") :] + return path + + +def load_sitemap_paths(sitemap_path: Path) -> set[str]: + """Return the set of normalized URL paths declared in sitemap.xml.""" + tree = ET.parse(sitemap_path) + paths: set[str] = set() + for loc in tree.getroot().findall("sm:url/sm:loc", SITEMAP_NS): + if loc.text is None: + continue + path = urlparse(loc.text.strip()).path + paths.add(_strip_docs_prefix(_normalize(path))) + return paths + + +def iter_md_files(md_root: Path): + """Yield .md files under md_root, skipping build/vendor directories.""" + for path in md_root.rglob("*.md"): + if any(part in SKIP_DIRS for part in path.relative_to(md_root).parts): + continue + yield path + + +def iter_md_links(md_root: Path): + """Yield (file, line_no, raw_url) for every /docs/* markdown link.""" + for md_file in iter_md_files(md_root): + try: + text = md_file.read_text(encoding="utf-8") + except OSError: + continue + for line_no, line in enumerate(text.splitlines(), start=1): + for match in LINK_RE.finditer(line): + yield md_file, line_no, match.group(1) + + +def check(md_root: Path, sitemap_path: Path) -> list[str]: + """Return a list of human-readable error strings.""" + if not sitemap_path.is_file(): + return [ + f"sitemap.xml not found at {sitemap_path}. " + "Build the frontend first (e.g. `uv run reflex export --frontend-only --no-zip`)." + ] + + valid_paths = load_sitemap_paths(sitemap_path) + errors: list[str] = [] + + for md_file, line_no, raw in iter_md_links(md_root): + location = f"{md_file}:{line_no}" + + if "_" in raw: + errors.append( + f"{location}: link contains an underscore (use hyphens): {raw!r}" + ) + + # Compare in /docs-stripped form so the check works whether the + # sitemap entries include the /docs prefix or not. + sitemap_key = _strip_docs_prefix(_normalize(raw)) + if sitemap_key not in valid_paths: + errors.append( + f"{location}: {raw!r} -> {sitemap_key!r} not found in sitemap" + ) + + return errors + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + here = Path(__file__).resolve().parent + parser.add_argument( + "--md-root", + type=Path, + default=here.parent.parent, + help="Root directory containing .md docs (default: ../..).", + ) + parser.add_argument( + "--sitemap", + type=Path, + default=here.parent / ".web" / "public" / "sitemap.xml", + help="Path to sitemap.xml (default: ../.web/public/sitemap.xml).", + ) + args = parser.parse_args() + + errors = check(args.md_root.resolve(), args.sitemap.resolve()) + if errors: + print(f"Found {len(errors)} broken /docs link(s):", file=sys.stderr) + for err in errors: + print(f" {err}", file=sys.stderr) + return 1 + print("All /docs links resolve against sitemap.xml.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/app/tests/test_doc_links.py b/docs/app/tests/test_doc_links.py new file mode 100644 index 00000000000..227bff03f65 --- /dev/null +++ b/docs/app/tests/test_doc_links.py @@ -0,0 +1,128 @@ +"""Unit tests for scripts/check_doc_links.py.""" + +import sys +from pathlib import Path + +import pytest + +sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts")) + +from check_doc_links import LINK_RE, _normalize, check + +SITEMAP_XML = """ + + http://localhost:3000/getting-started/basics/ + http://localhost:3000/library/disclosure/ + +""" + +SITEMAP_XML_WITH_DOCS_PREFIX = """ + + http://localhost:3000/docs/getting-started/basics/ + http://localhost:3000/docs/library/disclosure/ + +""" + + +@pytest.fixture +def docs_tree(tmp_path: Path) -> tuple[Path, Path]: + """Create a tmp docs root + sitemap.xml and return their paths.""" + sitemap = tmp_path / "sitemap.xml" + sitemap.write_text(SITEMAP_XML) + md_root = tmp_path / "docs" + md_root.mkdir() + return md_root, sitemap + + +def test_normalize_strips_fragment_query_and_trailing_slash(): + assert _normalize("/foo/bar/") == "/foo/bar" + assert _normalize("/foo/bar#section") == "/foo/bar" + assert _normalize("/foo/bar?x=1") == "/foo/bar" + assert _normalize("/") == "/" + + +def test_link_re_matches_basic_link(): + matches = LINK_RE.findall("see [basics](/docs/getting-started/basics) here") + assert matches == ["/docs/getting-started/basics"] + + +def test_link_re_does_not_match_docs_prefix_without_separator(): + """`/docsfoo` and `/docs-foo` must not be treated as /docs links.""" + assert LINK_RE.findall("[x](/docsfoo/bar)") == [] + assert LINK_RE.findall("[x](/docs-foo/bar)") == [] + + +def test_link_re_keeps_fragment_and_query(): + assert LINK_RE.findall("[x](/docs/foo#anchor)") == ["/docs/foo#anchor"] + assert LINK_RE.findall("[x](/docs/foo?q=1)") == ["/docs/foo?q=1"] + + +def test_check_passes_for_valid_link(docs_tree): + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[ok](/docs/getting-started/basics)\n") + assert check(md_root, sitemap) == [] + + +def test_check_flags_missing_link(docs_tree): + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[bad](/docs/no-such-page)\n") + errors = check(md_root, sitemap) + assert len(errors) == 1 + assert "not found in sitemap" in errors[0] + + +def test_check_flags_underscore_and_missing(docs_tree): + """Underscore link is reported twice: once for the underscore, once for missing.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[under](/docs/getting_started/basics)\n") + errors = check(md_root, sitemap) + assert len(errors) == 2 + assert any("underscore" in e for e in errors) + assert any("not found in sitemap" in e for e in errors) + + +def test_check_ignores_fragment_for_sitemap_lookup(docs_tree): + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[anchor](/docs/getting-started/basics#section)\n") + assert check(md_root, sitemap) == [] + + +def test_check_ignores_query_for_sitemap_lookup(docs_tree): + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[q](/docs/library/disclosure?x=1)\n") + assert check(md_root, sitemap) == [] + + +def test_check_ignores_docs_prefix_lookalikes(docs_tree): + """`/docsfoo` should not even be treated as a /docs link.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[x](/docsfoo/bar)\n") + assert check(md_root, sitemap) == [] + + +def test_check_skips_build_dirs(docs_tree): + md_root, sitemap = docs_tree + skipped = md_root / "node_modules" / "vendor" + skipped.mkdir(parents=True) + (skipped / "README.md").write_text("[bad](/docs/no-such-page)\n") + assert check(md_root, sitemap) == [] + + +def test_check_returns_helpful_message_when_sitemap_missing(tmp_path): + errors = check(tmp_path, tmp_path / "missing.xml") + assert len(errors) == 1 + assert "sitemap.xml not found" in errors[0] + + +def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path): + """Both deployment styles (with or without /docs prefix in sitemap) work.""" + sitemap = tmp_path / "sitemap.xml" + sitemap.write_text(SITEMAP_XML_WITH_DOCS_PREFIX) + md_root = tmp_path / "docs" + md_root.mkdir() + (md_root / "page.md").write_text( + "[ok](/docs/getting-started/basics)\n[bad](/docs/no-such-page)\n" + ) + errors = check(md_root, sitemap) + assert len(errors) == 1 + assert "no-such-page" in errors[0] From 873b5925d1de98e5e1411b422e97c03dae65f836 Mon Sep 17 00:00:00 2001 From: carlosabadia Date: Mon, 4 May 2026 14:24:44 +0200 Subject: [PATCH 2/4] updates --- docs/app/scripts/check_doc_links.py | 3 ++- docs/app/tests/test_doc_links.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/app/scripts/check_doc_links.py b/docs/app/scripts/check_doc_links.py index c4d744d1e52..a94fc683893 100644 --- a/docs/app/scripts/check_doc_links.py +++ b/docs/app/scripts/check_doc_links.py @@ -88,8 +88,9 @@ def check(md_root: Path, sitemap_path: Path) -> list[str]: for md_file, line_no, raw in iter_md_links(md_root): location = f"{md_file}:{line_no}" + path_only = raw.split("#", 1)[0].split("?", 1)[0] - if "_" in raw: + if "_" in path_only: errors.append( f"{location}: link contains an underscore (use hyphens): {raw!r}" ) diff --git a/docs/app/tests/test_doc_links.py b/docs/app/tests/test_doc_links.py index 227bff03f65..2ecb3cfa24d 100644 --- a/docs/app/tests/test_doc_links.py +++ b/docs/app/tests/test_doc_links.py @@ -87,6 +87,13 @@ def test_check_ignores_fragment_for_sitemap_lookup(docs_tree): assert check(md_root, sitemap) == [] +def test_check_allows_underscores_in_fragment(docs_tree): + """Heading anchors like `#python_code` legitimately contain underscores.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[x](/docs/getting-started/basics#python_code)\n") + assert check(md_root, sitemap) == [] + + def test_check_ignores_query_for_sitemap_lookup(docs_tree): md_root, sitemap = docs_tree (md_root / "page.md").write_text("[q](/docs/library/disclosure?x=1)\n") From 5b405dd9bf7c4244af792c5f002a524f13961e51 Mon Sep 17 00:00:00 2001 From: carlosabadia Date: Tue, 5 May 2026 19:02:05 +0200 Subject: [PATCH 3/4] combine ci and be more verbose --- .github/workflows/check_doc_links.yml | 39 ------------------- .../.github/workflows/integration_tests.yml | 3 ++ docs/app/scripts/check_doc_links.py | 18 ++++++--- docs/app/tests/test_doc_links.py | 9 +++++ 4 files changed, 25 insertions(+), 44 deletions(-) delete mode 100644 .github/workflows/check_doc_links.yml diff --git a/.github/workflows/check_doc_links.yml b/.github/workflows/check_doc_links.yml deleted file mode 100644 index 37c959b44b4..00000000000 --- a/.github/workflows/check_doc_links.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: check-doc-links -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.id || github.ref }} - cancel-in-progress: true - -on: - pull_request: - branches: ["main"] - paths: - - "docs/**/*.md" - - "docs/app/scripts/check_doc_links.py" - - ".github/workflows/check_doc_links.yml" - push: - branches: ["main"] - paths: - - "docs/**/*.md" - - "docs/app/scripts/check_doc_links.py" - - ".github/workflows/check_doc_links.yml" - -jobs: - check-doc-links: - timeout-minutes: 20 - runs-on: ubuntu-latest - defaults: - run: - working-directory: docs/app - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/setup_build_env - with: - python-version: 3.14 - run-uv-sync: true - - name: Build frontend to generate sitemap.xml - run: uv run reflex export --frontend-only --no-zip - - name: Validate /docs links against sitemap.xml - run: uv run python scripts/check_doc_links.py diff --git a/docs/app/.github/workflows/integration_tests.yml b/docs/app/.github/workflows/integration_tests.yml index 1a03be2b533..399c2207439 100644 --- a/docs/app/.github/workflows/integration_tests.yml +++ b/docs/app/.github/workflows/integration_tests.yml @@ -63,3 +63,6 @@ jobs: - name: Export the website run: reflex export + + - name: Validate /docs links against generated sitemap + run: uv run python scripts/check_doc_links.py diff --git a/docs/app/scripts/check_doc_links.py b/docs/app/scripts/check_doc_links.py index a94fc683893..3a96cee4e0c 100644 --- a/docs/app/scripts/check_doc_links.py +++ b/docs/app/scripts/check_doc_links.py @@ -84,25 +84,33 @@ def check(md_root: Path, sitemap_path: Path) -> list[str]: ] valid_paths = load_sitemap_paths(sitemap_path) - errors: list[str] = [] + print(f"Loaded {len(valid_paths)} URLs from sitemap {sitemap_path}") + + md_files = list(iter_md_files(md_root)) + if not md_files: + return [f"No .md files found under {md_root}. Check --md-root."] + print(f"Scanning {len(md_files)} markdown file(s) under {md_root}") + errors: list[str] = [] + links_checked = 0 for md_file, line_no, raw in iter_md_links(md_root): + links_checked += 1 location = f"{md_file}:{line_no}" path_only = raw.split("#", 1)[0].split("?", 1)[0] + sitemap_key = _strip_docs_prefix(_normalize(raw)) + ok = sitemap_key in valid_paths and "_" not in path_only + print(f" [{'OK ' if ok else 'FAIL'}] {location} -> {raw}") if "_" in path_only: errors.append( f"{location}: link contains an underscore (use hyphens): {raw!r}" ) - - # Compare in /docs-stripped form so the check works whether the - # sitemap entries include the /docs prefix or not. - sitemap_key = _strip_docs_prefix(_normalize(raw)) if sitemap_key not in valid_paths: errors.append( f"{location}: {raw!r} -> {sitemap_key!r} not found in sitemap" ) + print(f"Checked {links_checked} /docs link(s) across {len(md_files)} file(s).") return errors diff --git a/docs/app/tests/test_doc_links.py b/docs/app/tests/test_doc_links.py index 2ecb3cfa24d..546b237a68f 100644 --- a/docs/app/tests/test_doc_links.py +++ b/docs/app/tests/test_doc_links.py @@ -109,6 +109,7 @@ def test_check_ignores_docs_prefix_lookalikes(docs_tree): def test_check_skips_build_dirs(docs_tree): md_root, sitemap = docs_tree + (md_root / "page.md").write_text("[ok](/docs/getting-started/basics)\n") skipped = md_root / "node_modules" / "vendor" skipped.mkdir(parents=True) (skipped / "README.md").write_text("[bad](/docs/no-such-page)\n") @@ -121,6 +122,14 @@ def test_check_returns_helpful_message_when_sitemap_missing(tmp_path): assert "sitemap.xml not found" in errors[0] +def test_check_errors_when_md_root_has_no_markdown(docs_tree): + """If the docs tree is empty, fail loudly instead of silently passing.""" + md_root, sitemap = docs_tree + errors = check(md_root, sitemap) + assert len(errors) == 1 + assert "No .md files found" in errors[0] + + def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path): """Both deployment styles (with or without /docs prefix in sitemap) work.""" sitemap = tmp_path / "sitemap.xml" From c7ed33947134d94fda7b46eaa7473238d79dbe52 Mon Sep 17 00:00:00 2001 From: carlosabadia Date: Tue, 5 May 2026 19:21:36 +0200 Subject: [PATCH 4/4] move from regex --- docs/app/scripts/check_doc_links.py | 142 +++++++++++++++++++++------- docs/app/tests/test_doc_links.py | 66 +++++++++---- 2 files changed, 158 insertions(+), 50 deletions(-) diff --git a/docs/app/scripts/check_doc_links.py b/docs/app/scripts/check_doc_links.py index 3a96cee4e0c..238ec4bbdbd 100644 --- a/docs/app/scripts/check_doc_links.py +++ b/docs/app/scripts/check_doc_links.py @@ -1,12 +1,16 @@ """Validate /docs/* markdown links against the generated sitemap.xml. -For every .md file under the docs tree, find markdown links of the form -`[text](/docs/...)` and verify: +For every .md file under the docs tree, parse it with reflex-docgen's +markdown parser and verify every `[text](/docs/...)` link: 1. The URL path contains no underscores (URLs use hyphens). 2. After stripping the `/docs` prefix, the path exists in sitemap.xml. -Run after building the frontend so .web/public/sitemap.xml is present, e.g.: +Using the real markdown AST means links inside fenced code blocks are +correctly ignored, reference-style and multi-line links are caught, and +escapes/edge cases are handled the same way the docs site renders them. + +Run after building the frontend so .web/public/sitemap.xml is present: cd docs/app uv run reflex export --frontend-only --no-zip @@ -16,13 +20,29 @@ from __future__ import annotations import argparse -import re import sys import xml.etree.ElementTree as ET +from collections.abc import Iterator from pathlib import Path from urllib.parse import urlparse -LINK_RE = re.compile(r"\]\(\s*(/docs(?=[/)#?\s])[^)]*?)(?:\s+\"[^\"]*\")?\s*\)") +from reflex_docgen.markdown import ( + Block, + BoldSpan, + DirectiveBlock, + HeadingBlock, + ImageSpan, + ItalicSpan, + LinkSpan, + ListBlock, + QuoteBlock, + Span, + StrikethroughSpan, + TableBlock, + TextBlock, + parse_document, +) + SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"} SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"} @@ -55,7 +75,7 @@ def load_sitemap_paths(sitemap_path: Path) -> set[str]: return paths -def iter_md_files(md_root: Path): +def iter_md_files(md_root: Path) -> Iterator[Path]: """Yield .md files under md_root, skipping build/vendor directories.""" for path in md_root.rglob("*.md"): if any(part in SKIP_DIRS for part in path.relative_to(md_root).parts): @@ -63,20 +83,56 @@ def iter_md_files(md_root: Path): yield path -def iter_md_links(md_root: Path): - """Yield (file, line_no, raw_url) for every /docs/* markdown link.""" - for md_file in iter_md_files(md_root): - try: - text = md_file.read_text(encoding="utf-8") - except OSError: - continue - for line_no, line in enumerate(text.splitlines(), start=1): - for match in LINK_RE.finditer(line): - yield md_file, line_no, match.group(1) +def _walk_spans(spans: tuple[Span, ...]) -> Iterator[LinkSpan]: + """Recursively yield every LinkSpan inside a span tree.""" + for span in spans: + if isinstance(span, LinkSpan): + yield span + yield from _walk_spans(span.children) + elif isinstance(span, (BoldSpan, ItalicSpan, StrikethroughSpan, ImageSpan)): + yield from _walk_spans(span.children) + + +def _walk_blocks(blocks: tuple[Block, ...]) -> Iterator[LinkSpan]: + """Recursively yield every LinkSpan in a block tree, skipping CodeBlock.""" + for block in blocks: + if isinstance(block, (HeadingBlock, TextBlock)): + yield from _walk_spans(block.children) + elif isinstance(block, ListBlock): + for item in block.items: + yield from _walk_blocks(item.children) + elif isinstance(block, (QuoteBlock, DirectiveBlock)): + yield from _walk_blocks(block.children) + elif isinstance(block, TableBlock): + for row in (block.header, *block.rows): + for cell in row.cells: + yield from _walk_spans(cell.children) + + +def _line_for(text: str, target: str, cursor: int) -> tuple[int, int]: + """Locate the next occurrence of `](target)` after cursor. + + Returns ``(line_number, new_cursor)``. If the link is reference-style + (no `](target)` in source), falls back to scanning for `]: target`. + Returns ``line_number == 0`` if the target can't be located. + """ + needle = "](" + target + pos = text.find(needle, cursor) + if pos == -1: + # Reference-style links resolve to the same target but live in + # a `[label]: target` definition further down the file. + pos = text.find("]: " + target, cursor) + if pos == -1: + return 0, cursor + return text.count("\n", 0, pos) + 1, pos + len(needle) def check(md_root: Path, sitemap_path: Path) -> list[str]: - """Return a list of human-readable error strings.""" + """Return a list of human-readable error strings. + + Prints a per-link trail and a summary so CI logs make it obvious which + files were scanned and which links were validated. + """ if not sitemap_path.is_file(): return [ f"sitemap.xml not found at {sitemap_path}. " @@ -93,22 +149,42 @@ def check(md_root: Path, sitemap_path: Path) -> list[str]: errors: list[str] = [] links_checked = 0 - for md_file, line_no, raw in iter_md_links(md_root): - links_checked += 1 - location = f"{md_file}:{line_no}" - path_only = raw.split("#", 1)[0].split("?", 1)[0] - sitemap_key = _strip_docs_prefix(_normalize(raw)) - ok = sitemap_key in valid_paths and "_" not in path_only - print(f" [{'OK ' if ok else 'FAIL'}] {location} -> {raw}") - - if "_" in path_only: - errors.append( - f"{location}: link contains an underscore (use hyphens): {raw!r}" - ) - if sitemap_key not in valid_paths: - errors.append( - f"{location}: {raw!r} -> {sitemap_key!r} not found in sitemap" - ) + for md_file in md_files: + try: + text = md_file.read_text(encoding="utf-8") + except OSError: + continue + try: + doc = parse_document(text) + except Exception as exc: + errors.append(f"{md_file}: failed to parse markdown ({exc})") + continue + + cursor = 0 + for link in _walk_blocks(doc.blocks): + target = link.target + if not (target == "/docs" or target.startswith("/docs/")): + continue + + line_no, cursor = _line_for(text, target, cursor) + location = f"{md_file}:{line_no}" if line_no else str(md_file) + links_checked += 1 + + path_only = _normalize(target) + sitemap_key = _strip_docs_prefix(path_only) + has_underscore = "_" in path_only + in_sitemap = sitemap_key in valid_paths + status = "OK" if (in_sitemap and not has_underscore) else "FAIL" + print(f" [{status:<4}] {location} -> {target}") + + if has_underscore: + errors.append( + f"{location}: link contains an underscore (use hyphens): {target!r}" + ) + if not in_sitemap: + errors.append( + f"{location}: {target!r} -> {sitemap_key!r} not found in sitemap" + ) print(f"Checked {links_checked} /docs link(s) across {len(md_files)} file(s).") return errors diff --git a/docs/app/tests/test_doc_links.py b/docs/app/tests/test_doc_links.py index 546b237a68f..d6ac011f6ab 100644 --- a/docs/app/tests/test_doc_links.py +++ b/docs/app/tests/test_doc_links.py @@ -7,7 +7,7 @@ sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts")) -from check_doc_links import LINK_RE, _normalize, check +from check_doc_links import _normalize, check SITEMAP_XML = """ @@ -41,22 +41,6 @@ def test_normalize_strips_fragment_query_and_trailing_slash(): assert _normalize("/") == "/" -def test_link_re_matches_basic_link(): - matches = LINK_RE.findall("see [basics](/docs/getting-started/basics) here") - assert matches == ["/docs/getting-started/basics"] - - -def test_link_re_does_not_match_docs_prefix_without_separator(): - """`/docsfoo` and `/docs-foo` must not be treated as /docs links.""" - assert LINK_RE.findall("[x](/docsfoo/bar)") == [] - assert LINK_RE.findall("[x](/docs-foo/bar)") == [] - - -def test_link_re_keeps_fragment_and_query(): - assert LINK_RE.findall("[x](/docs/foo#anchor)") == ["/docs/foo#anchor"] - assert LINK_RE.findall("[x](/docs/foo?q=1)") == ["/docs/foo?q=1"] - - def test_check_passes_for_valid_link(docs_tree): md_root, sitemap = docs_tree (md_root / "page.md").write_text("[ok](/docs/getting-started/basics)\n") @@ -130,6 +114,54 @@ def test_check_errors_when_md_root_has_no_markdown(docs_tree): assert "No .md files found" in errors[0] +def test_check_ignores_links_in_fenced_code_blocks(docs_tree): + """Links inside ``` fences are not real links and must be skipped.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text( + "Some text.\n\n```python\n# See [doc](/docs/no-such-page) for details\n```\n" + ) + assert check(md_root, sitemap) == [] + + +def test_check_resolves_reference_style_links(docs_tree): + """`[label][ref]` + `[ref]: /docs/foo` should resolve and be checked.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text( + "See [the basics][b] for details.\n\n[b]: /docs/no-such-page\n" + ) + errors = check(md_root, sitemap) + assert len(errors) == 1 + assert "no-such-page" in errors[0] + + +def test_check_reports_distinct_lines_for_repeated_target(docs_tree): + """Two links to the same /docs target on different lines must report distinct line numbers.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text( + "First [x](/docs/no-such-page) here.\n" + "Some other text.\n" + "Second [y](/docs/no-such-page) here.\n" + ) + errors = check(md_root, sitemap) + assert len(errors) == 2 + line_numbers = {err.split(":", 2)[1] for err in errors} + assert line_numbers == {"1", "3"} + + +def test_check_finds_links_inside_lists_and_tables(docs_tree): + """Links inside list items and table cells must still be checked.""" + md_root, sitemap = docs_tree + (md_root / "page.md").write_text( + "- bullet [bad](/docs/no-such-list-page)\n\n" + "| col |\n|-----|\n| [bad](/docs/no-such-table-page) |\n" + ) + errors = check(md_root, sitemap) + assert len(errors) == 2 + joined = "\n".join(errors) + assert "no-such-list-page" in joined + assert "no-such-table-page" in joined + + def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path): """Both deployment styles (with or without /docs prefix in sitemap) work.""" sitemap = tmp_path / "sitemap.xml"