diff --git a/docs/app/.github/workflows/integration_tests.yml b/docs/app/.github/workflows/integration_tests.yml
index 1a03be2b533..399c2207439 100644
--- a/docs/app/.github/workflows/integration_tests.yml
+++ b/docs/app/.github/workflows/integration_tests.yml
@@ -63,3 +63,6 @@ jobs:
- name: Export the website
run: reflex export
+
+ - name: Validate /docs links against generated sitemap
+ run: uv run python scripts/check_doc_links.py
diff --git a/docs/app/scripts/check_doc_links.py b/docs/app/scripts/check_doc_links.py
new file mode 100644
index 00000000000..238ec4bbdbd
--- /dev/null
+++ b/docs/app/scripts/check_doc_links.py
@@ -0,0 +1,221 @@
+"""Validate /docs/* markdown links against the generated sitemap.xml.
+
+For every .md file under the docs tree, parse it with reflex-docgen's
+markdown parser and verify every `[text](/docs/...)` link:
+
+1. The URL path contains no underscores (URLs use hyphens).
+2. After stripping the `/docs` prefix, the path exists in sitemap.xml.
+
+Using the real markdown AST means links inside fenced code blocks are
+correctly ignored, reference-style and multi-line links are caught, and
+escapes/edge cases are handled the same way the docs site renders them.
+
+Run after building the frontend so .web/public/sitemap.xml is present:
+
+ cd docs/app
+ uv run reflex export --frontend-only --no-zip
+ uv run python scripts/check_doc_links.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import xml.etree.ElementTree as ET
+from collections.abc import Iterator
+from pathlib import Path
+from urllib.parse import urlparse
+
+from reflex_docgen.markdown import (
+ Block,
+ BoldSpan,
+ DirectiveBlock,
+ HeadingBlock,
+ ImageSpan,
+ ItalicSpan,
+ LinkSpan,
+ ListBlock,
+ QuoteBlock,
+ Span,
+ StrikethroughSpan,
+ TableBlock,
+ TextBlock,
+ parse_document,
+)
+
+SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"}
+SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"}
+
+
+def _normalize(path: str) -> str:
+ path = path.split("#", 1)[0].split("?", 1)[0]
+ if not path.startswith("/"):
+ path = "/" + path
+ return path.rstrip("/") or "/"
+
+
+def _strip_docs_prefix(path: str) -> str:
+ """Drop a leading `/docs` segment so both deployment styles compare equal."""
+ if path == "/docs":
+ return "/"
+ if path.startswith("/docs/"):
+ return path[len("/docs") :]
+ return path
+
+
+def load_sitemap_paths(sitemap_path: Path) -> set[str]:
+ """Return the set of normalized URL paths declared in sitemap.xml."""
+ tree = ET.parse(sitemap_path)
+ paths: set[str] = set()
+ for loc in tree.getroot().findall("sm:url/sm:loc", SITEMAP_NS):
+ if loc.text is None:
+ continue
+ path = urlparse(loc.text.strip()).path
+ paths.add(_strip_docs_prefix(_normalize(path)))
+ return paths
+
+
+def iter_md_files(md_root: Path) -> Iterator[Path]:
+ """Yield .md files under md_root, skipping build/vendor directories."""
+ for path in md_root.rglob("*.md"):
+ if any(part in SKIP_DIRS for part in path.relative_to(md_root).parts):
+ continue
+ yield path
+
+
+def _walk_spans(spans: tuple[Span, ...]) -> Iterator[LinkSpan]:
+ """Recursively yield every LinkSpan inside a span tree."""
+ for span in spans:
+ if isinstance(span, LinkSpan):
+ yield span
+ yield from _walk_spans(span.children)
+ elif isinstance(span, (BoldSpan, ItalicSpan, StrikethroughSpan, ImageSpan)):
+ yield from _walk_spans(span.children)
+
+
+def _walk_blocks(blocks: tuple[Block, ...]) -> Iterator[LinkSpan]:
+ """Recursively yield every LinkSpan in a block tree, skipping CodeBlock."""
+ for block in blocks:
+ if isinstance(block, (HeadingBlock, TextBlock)):
+ yield from _walk_spans(block.children)
+ elif isinstance(block, ListBlock):
+ for item in block.items:
+ yield from _walk_blocks(item.children)
+ elif isinstance(block, (QuoteBlock, DirectiveBlock)):
+ yield from _walk_blocks(block.children)
+ elif isinstance(block, TableBlock):
+ for row in (block.header, *block.rows):
+ for cell in row.cells:
+ yield from _walk_spans(cell.children)
+
+
+def _line_for(text: str, target: str, cursor: int) -> tuple[int, int]:
+ """Locate the next occurrence of `](target)` after cursor.
+
+ Returns ``(line_number, new_cursor)``. If the link is reference-style
+ (no `](target)` in source), falls back to scanning for `]: target`.
+ Returns ``line_number == 0`` if the target can't be located.
+ """
+ needle = "](" + target
+ pos = text.find(needle, cursor)
+ if pos == -1:
+ # Reference-style links resolve to the same target but live in
+ # a `[label]: target` definition further down the file.
+ pos = text.find("]: " + target, cursor)
+ if pos == -1:
+ return 0, cursor
+ return text.count("\n", 0, pos) + 1, pos + len(needle)
+
+
+def check(md_root: Path, sitemap_path: Path) -> list[str]:
+ """Return a list of human-readable error strings.
+
+ Prints a per-link trail and a summary so CI logs make it obvious which
+ files were scanned and which links were validated.
+ """
+ if not sitemap_path.is_file():
+ return [
+ f"sitemap.xml not found at {sitemap_path}. "
+ "Build the frontend first (e.g. `uv run reflex export --frontend-only --no-zip`)."
+ ]
+
+ valid_paths = load_sitemap_paths(sitemap_path)
+ print(f"Loaded {len(valid_paths)} URLs from sitemap {sitemap_path}")
+
+ md_files = list(iter_md_files(md_root))
+ if not md_files:
+ return [f"No .md files found under {md_root}. Check --md-root."]
+ print(f"Scanning {len(md_files)} markdown file(s) under {md_root}")
+
+ errors: list[str] = []
+ links_checked = 0
+ for md_file in md_files:
+ try:
+ text = md_file.read_text(encoding="utf-8")
+ except OSError:
+ continue
+ try:
+ doc = parse_document(text)
+ except Exception as exc:
+ errors.append(f"{md_file}: failed to parse markdown ({exc})")
+ continue
+
+ cursor = 0
+ for link in _walk_blocks(doc.blocks):
+ target = link.target
+ if not (target == "/docs" or target.startswith("/docs/")):
+ continue
+
+ line_no, cursor = _line_for(text, target, cursor)
+ location = f"{md_file}:{line_no}" if line_no else str(md_file)
+ links_checked += 1
+
+ path_only = _normalize(target)
+ sitemap_key = _strip_docs_prefix(path_only)
+ has_underscore = "_" in path_only
+ in_sitemap = sitemap_key in valid_paths
+ status = "OK" if (in_sitemap and not has_underscore) else "FAIL"
+ print(f" [{status:<4}] {location} -> {target}")
+
+ if has_underscore:
+ errors.append(
+ f"{location}: link contains an underscore (use hyphens): {target!r}"
+ )
+ if not in_sitemap:
+ errors.append(
+ f"{location}: {target!r} -> {sitemap_key!r} not found in sitemap"
+ )
+
+ print(f"Checked {links_checked} /docs link(s) across {len(md_files)} file(s).")
+ return errors
+
+
+def main() -> int:
+ parser = argparse.ArgumentParser(description=__doc__)
+ here = Path(__file__).resolve().parent
+ parser.add_argument(
+ "--md-root",
+ type=Path,
+ default=here.parent.parent,
+ help="Root directory containing .md docs (default: ../..).",
+ )
+ parser.add_argument(
+ "--sitemap",
+ type=Path,
+ default=here.parent / ".web" / "public" / "sitemap.xml",
+ help="Path to sitemap.xml (default: ../.web/public/sitemap.xml).",
+ )
+ args = parser.parse_args()
+
+ errors = check(args.md_root.resolve(), args.sitemap.resolve())
+ if errors:
+ print(f"Found {len(errors)} broken /docs link(s):", file=sys.stderr)
+ for err in errors:
+ print(f" {err}", file=sys.stderr)
+ return 1
+ print("All /docs links resolve against sitemap.xml.")
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/docs/app/tests/test_doc_links.py b/docs/app/tests/test_doc_links.py
new file mode 100644
index 00000000000..d6ac011f6ab
--- /dev/null
+++ b/docs/app/tests/test_doc_links.py
@@ -0,0 +1,176 @@
+"""Unit tests for scripts/check_doc_links.py."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts"))
+
+from check_doc_links import _normalize, check
+
+SITEMAP_XML = """
+
+ http://localhost:3000/getting-started/basics/
+ http://localhost:3000/library/disclosure/
+
+"""
+
+SITEMAP_XML_WITH_DOCS_PREFIX = """
+
+ http://localhost:3000/docs/getting-started/basics/
+ http://localhost:3000/docs/library/disclosure/
+
+"""
+
+
+@pytest.fixture
+def docs_tree(tmp_path: Path) -> tuple[Path, Path]:
+ """Create a tmp docs root + sitemap.xml and return their paths."""
+ sitemap = tmp_path / "sitemap.xml"
+ sitemap.write_text(SITEMAP_XML)
+ md_root = tmp_path / "docs"
+ md_root.mkdir()
+ return md_root, sitemap
+
+
+def test_normalize_strips_fragment_query_and_trailing_slash():
+ assert _normalize("/foo/bar/") == "/foo/bar"
+ assert _normalize("/foo/bar#section") == "/foo/bar"
+ assert _normalize("/foo/bar?x=1") == "/foo/bar"
+ assert _normalize("/") == "/"
+
+
+def test_check_passes_for_valid_link(docs_tree):
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[ok](/docs/getting-started/basics)\n")
+ assert check(md_root, sitemap) == []
+
+
+def test_check_flags_missing_link(docs_tree):
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[bad](/docs/no-such-page)\n")
+ errors = check(md_root, sitemap)
+ assert len(errors) == 1
+ assert "not found in sitemap" in errors[0]
+
+
+def test_check_flags_underscore_and_missing(docs_tree):
+ """Underscore link is reported twice: once for the underscore, once for missing."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[under](/docs/getting_started/basics)\n")
+ errors = check(md_root, sitemap)
+ assert len(errors) == 2
+ assert any("underscore" in e for e in errors)
+ assert any("not found in sitemap" in e for e in errors)
+
+
+def test_check_ignores_fragment_for_sitemap_lookup(docs_tree):
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[anchor](/docs/getting-started/basics#section)\n")
+ assert check(md_root, sitemap) == []
+
+
+def test_check_allows_underscores_in_fragment(docs_tree):
+ """Heading anchors like `#python_code` legitimately contain underscores."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[x](/docs/getting-started/basics#python_code)\n")
+ assert check(md_root, sitemap) == []
+
+
+def test_check_ignores_query_for_sitemap_lookup(docs_tree):
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[q](/docs/library/disclosure?x=1)\n")
+ assert check(md_root, sitemap) == []
+
+
+def test_check_ignores_docs_prefix_lookalikes(docs_tree):
+ """`/docsfoo` should not even be treated as a /docs link."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[x](/docsfoo/bar)\n")
+ assert check(md_root, sitemap) == []
+
+
+def test_check_skips_build_dirs(docs_tree):
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text("[ok](/docs/getting-started/basics)\n")
+ skipped = md_root / "node_modules" / "vendor"
+ skipped.mkdir(parents=True)
+ (skipped / "README.md").write_text("[bad](/docs/no-such-page)\n")
+ assert check(md_root, sitemap) == []
+
+
+def test_check_returns_helpful_message_when_sitemap_missing(tmp_path):
+ errors = check(tmp_path, tmp_path / "missing.xml")
+ assert len(errors) == 1
+ assert "sitemap.xml not found" in errors[0]
+
+
+def test_check_errors_when_md_root_has_no_markdown(docs_tree):
+ """If the docs tree is empty, fail loudly instead of silently passing."""
+ md_root, sitemap = docs_tree
+ errors = check(md_root, sitemap)
+ assert len(errors) == 1
+ assert "No .md files found" in errors[0]
+
+
+def test_check_ignores_links_in_fenced_code_blocks(docs_tree):
+ """Links inside ``` fences are not real links and must be skipped."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text(
+ "Some text.\n\n```python\n# See [doc](/docs/no-such-page) for details\n```\n"
+ )
+ assert check(md_root, sitemap) == []
+
+
+def test_check_resolves_reference_style_links(docs_tree):
+ """`[label][ref]` + `[ref]: /docs/foo` should resolve and be checked."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text(
+ "See [the basics][b] for details.\n\n[b]: /docs/no-such-page\n"
+ )
+ errors = check(md_root, sitemap)
+ assert len(errors) == 1
+ assert "no-such-page" in errors[0]
+
+
+def test_check_reports_distinct_lines_for_repeated_target(docs_tree):
+ """Two links to the same /docs target on different lines must report distinct line numbers."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text(
+ "First [x](/docs/no-such-page) here.\n"
+ "Some other text.\n"
+ "Second [y](/docs/no-such-page) here.\n"
+ )
+ errors = check(md_root, sitemap)
+ assert len(errors) == 2
+ line_numbers = {err.split(":", 2)[1] for err in errors}
+ assert line_numbers == {"1", "3"}
+
+
+def test_check_finds_links_inside_lists_and_tables(docs_tree):
+ """Links inside list items and table cells must still be checked."""
+ md_root, sitemap = docs_tree
+ (md_root / "page.md").write_text(
+ "- bullet [bad](/docs/no-such-list-page)\n\n"
+ "| col |\n|-----|\n| [bad](/docs/no-such-table-page) |\n"
+ )
+ errors = check(md_root, sitemap)
+ assert len(errors) == 2
+ joined = "\n".join(errors)
+ assert "no-such-list-page" in joined
+ assert "no-such-table-page" in joined
+
+
+def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path):
+ """Both deployment styles (with or without /docs prefix in sitemap) work."""
+ sitemap = tmp_path / "sitemap.xml"
+ sitemap.write_text(SITEMAP_XML_WITH_DOCS_PREFIX)
+ md_root = tmp_path / "docs"
+ md_root.mkdir()
+ (md_root / "page.md").write_text(
+ "[ok](/docs/getting-started/basics)\n[bad](/docs/no-such-page)\n"
+ )
+ errors = check(md_root, sitemap)
+ assert len(errors) == 1
+ assert "no-such-page" in errors[0]