From d9f26a86357b4b9fe2d498f8ef3fac8b68005931 Mon Sep 17 00:00:00 2001 From: Vinta Chen Date: Sat, 2 May 2026 01:53:19 +0800 Subject: [PATCH 1/2] Improve SEO/AEO discovery surface for awesome-python.com (#3103) * update gitignore * feat: tighten homepage metadata * fix: trim generated HTML whitespace * feat(website): add discovery files and markdown alternate * feat(website): add sitemap lastmod * feat(seo): add Content-Signal directive to robots.txt Signals search, ai-input, and ai-train to crawlers via the experimental Content-Signal header in robots.txt. Co-Authored-By: Claude --------- Co-authored-by: Claude --- .gitignore | 12 +-- README.md | 2 +- website/build.py | 69 ++++++++++++++- website/templates/base.html | 35 ++++---- website/tests/test_build.py | 167 ++++++++++++++++++++++++++++++++++++ 5 files changed, 259 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index ca26a6e8fc..0d9f410bb5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,12 @@ __pycache__/ website/output/ website/data/ -# claude code +# planning docs +docs/ + +# agents +.agents/ .claude/skills/ -.gstack/ -.playwright-cli/ .superpowers/ +.playwright-cli/ skills-lock.json - -# codex -.agents/ diff --git a/README.md b/README.md index 51ae9d16aa..107b6859f7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Awesome Python -An opinionated list of Python frameworks, libraries, tools, and resources. +An opinionated guide to the best Python frameworks, libraries, tools, and resources. 
# **Sponsors** diff --git a/website/build.py b/website/build.py index c223ef18ae..8fb5f38420 100644 --- a/website/build.py +++ b/website/build.py @@ -4,6 +4,8 @@ import json import re import shutil +import xml.etree.ElementTree as ET +from collections.abc import Sequence from datetime import UTC, datetime from pathlib import Path from typing import Any @@ -12,6 +14,9 @@ from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$") +SITE_URL = "https://awesome-python.com/" +SITEMAP_URL = f"{SITE_URL}sitemap.xml" +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" SOURCE_TYPE_DOMAINS = { "docs.python.org": "Built-in", @@ -67,6 +72,59 @@ def sort_key(entry: dict) -> tuple[int, int, int, str]: return sorted(entries, key=sort_key) +def build_robots_txt() -> str: + return ( + "User-agent: *\n" + "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n" + "Allow: /\n" + "\n" + f"Sitemap: {SITEMAP_URL}\n" + ) + + +def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None: + ET.register_namespace("", SITEMAP_NS) + urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset") + for url, lastmod in urls: + url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url") + loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc") + loc_el.text = url + lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod") + lastmod_el.text = lastmod + + ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True) + with path.open("ab") as f: + f.write(b"\n") + + +def top_level_heading_text(line: str) -> str | None: + stripped = line.strip() + if not stripped.startswith("# "): + return None + return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip() + + +def remove_sponsors_section(markdown: str) -> str: + lines = markdown.splitlines(keepends=True) + start_idx = None + for i, line in enumerate(lines): + heading = top_level_heading_text(line) + if 
heading and heading.lower() == "sponsors": + start_idx = i + break + + if start_idx is None: + return markdown + + end_idx = len(lines) + for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1): + if top_level_heading_text(line): + end_idx = i + break + + return "".join(lines[:start_idx] + lines[end_idx:]) + + def extract_entries( categories: list[ParsedSection], groups: list[ParsedGroup], @@ -131,6 +189,7 @@ def build(repo_root: Path) -> None: categories = [cat for g in parsed_groups for cat in g["categories"]] total_entries = sum(c["entry_count"] for c in categories) entries = extract_entries(categories, parsed_groups) + build_date = datetime.now(UTC) stars_data = load_stars(website / "data" / "github_stars.json") @@ -155,6 +214,8 @@ def build(repo_root: Path) -> None: env = Environment( loader=FileSystemLoader(website / "templates"), autoescape=True, + trim_blocks=True, + lstrip_blocks=True, ) site_dir = website / "output" @@ -171,7 +232,7 @@ def build(repo_root: Path) -> None: total_entries=total_entries, total_categories=len(categories), repo_stars=repo_stars, - build_date=datetime.now(UTC).strftime("%B %d, %Y"), + build_date=build_date.strftime("%B %d, %Y"), sponsors=sponsors, ), encoding="utf-8", @@ -182,7 +243,11 @@ def build(repo_root: Path) -> None: if static_src.exists(): shutil.copytree(static_src, static_dst, dirs_exist_ok=True) - (site_dir / "llms.txt").write_text(readme_text, encoding="utf-8") + markdown_index = remove_sponsors_section(readme_text) + (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8") + write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())]) + (site_dir / "index.md").write_text(markdown_index, encoding="utf-8") + (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8") print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories") print(f"Total entries: {total_entries}") diff --git a/website/templates/base.html 
b/website/templates/base.html index 34546e73c3..af112095e7 100644 --- a/website/templates/base.html +++ b/website/templates/base.html @@ -1,26 +1,27 @@ + {% set default_meta_title = "Awesome Python" %} + {% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." %} + {% set canonical_url = "https://awesome-python.com/" %} + {% set social_image_url = "https://awesome-python.com/static/og-image.png" %} + {% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %} + {% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %} - {% block title %}Awesome Python{% endblock %} - - + {{ meta_title | trim }} + + + - - - - - + + + + + + + + diff --git a/website/tests/test_build.py b/website/tests/test_build.py index 0b22609a84..1feab77d9c 100644 --- a/website/tests/test_build.py +++ b/website/tests/test_build.py @@ -3,6 +3,9 @@ import json import shutil import textwrap +import xml.etree.ElementTree as ET +from datetime import UTC, date, datetime +from html.parser import HTMLParser from pathlib import Path from build import ( @@ -15,6 +18,40 @@ ) from readme_parser import parse_readme, slugify + +class HeadMetadataParser(HTMLParser): + def __init__(self): + super().__init__() + self.title_count = 0 + self.title = "" + self.meta_by_name = {} + self.meta_by_property = {} + self.links_by_rel = {} + self._in_title = False + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if tag == "title": + self.title_count += 1 + self._in_title = True + elif tag == "meta": + if "name" in attrs: + self.meta_by_name[attrs["name"]] = attrs.get("content", "") + if "property" in attrs: + self.meta_by_property[attrs["property"]] = attrs.get("content", "") + elif tag == "link" and attrs.get("rel"): + for rel in 
attrs["rel"].split(): + self.links_by_rel[rel] = attrs.get("href", "") + + def handle_endtag(self, tag): + if tag == "title": + self._in_title = False + + def handle_data(self, data): + if self._in_title: + self.title += data + + # --------------------------------------------------------------------------- # slugify # --------------------------------------------------------------------------- @@ -72,6 +109,11 @@ def _make_repo(self, tmp_path, readme): encoding="utf-8", ) + def _copy_real_templates(self, tmp_path): + real_tpl = Path(__file__).parent / ".." / "templates" + tpl_dir = tmp_path / "website" / "templates" + shutil.copytree(real_tpl, tpl_dir) + def test_build_creates_single_page(self, tmp_path): readme = textwrap.dedent("""\ # Awesome Python @@ -114,6 +156,97 @@ def test_build_creates_single_page(self, tmp_path): # No category sub-pages assert not (site / "categories").exists() + def test_build_creates_root_discovery_files(self, tmp_path): + readme = textwrap.dedent("""\ + # Awesome Python + + Intro. + + --- + + ## Widgets + + - [w1](https://example.com) - A widget. + + # Contributing + + Help! 
+ """) + self._make_repo(tmp_path, readme) + start_date = datetime.now(UTC).date() + build(tmp_path) + end_date = datetime.now(UTC).date() + + site = tmp_path / "website" / "output" + robots = (site / "robots.txt").read_text(encoding="utf-8") + assert robots == ( + "User-agent: *\n" + "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n" + "Allow: /\n" + "\n" + "Sitemap: https://awesome-python.com/sitemap.xml\n" + ) + + sitemap = ET.parse(site / "sitemap.xml") + root = sitemap.getroot() + ns = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"} + locs = [loc.text for loc in root.findall("sitemap:url/sitemap:loc", ns)] + lastmods = [lastmod.text for lastmod in root.findall("sitemap:url/sitemap:lastmod", ns)] + + assert root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset" + assert locs == ["https://awesome-python.com/"] + assert len(lastmods) == 1 + assert start_date <= date.fromisoformat(lastmods[0]) <= end_date + assert all(loc.startswith("https://awesome-python.com/") for loc in locs) + assert all("?" not in loc for loc in locs) + + def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path): + readme = textwrap.dedent("""\ + # Awesome Python + + Intro. + + # **Sponsors** + + - **[Sponsor](https://sponsor.example.com)**: Sponsored tool. + + > Become a sponsor: [Sponsor us](SPONSORSHIP.md). + + # Categories + + **Tools** + + - [Widgets](#widgets) + + --- + + ## Widgets + + - [w1](https://example.com) - A widget. + + # Contributing + + Help! 
+ """) + (tmp_path / "README.md").write_text(readme, encoding="utf-8") + self._copy_real_templates(tmp_path) + + build(tmp_path) + + site = tmp_path / "website" / "output" + index_html = (site / "index.html").read_text(encoding="utf-8") + index_md = (site / "index.md").read_text(encoding="utf-8") + llms_txt = (site / "llms.txt").read_text(encoding="utf-8") + + assert '' in index_html + assert index_md == llms_txt + assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories") + assert "# **Sponsors**" not in index_md + assert "Sponsor" not in index_md + assert "SPONSORSHIP.md" not in index_md + assert "## Widgets" in index_md + assert "- [w1](https://example.com) - A widget." in index_md + def test_build_cleans_stale_output(self, tmp_path): readme = textwrap.dedent("""\ # T @@ -235,6 +368,40 @@ def test_build_with_stars_sorts_by_stars(self, tmp_path): # Expand content present assert "expand-content" in html + def test_index_contains_aligned_homepage_metadata(self, tmp_path): + readme = (Path(__file__).parents[2] / "README.md").read_text(encoding="utf-8") + (tmp_path / "README.md").write_text(readme, encoding="utf-8") + self._copy_real_templates(tmp_path) + + build(tmp_path) + + parsed_groups = parse_readme(readme) + categories = [cat for group in parsed_groups for cat in group["categories"]] + entries = extract_entries(categories, parsed_groups) + html = (tmp_path / "website" / "output" / "index.html").read_text(encoding="utf-8") + parser = HeadMetadataParser() + parser.feed(html) + + expected_title = "Awesome Python" + expected_description = f"An opinionated guide to the best Python frameworks, libraries, and tools. Explore {len(entries)} curated projects across {len(categories)} categories, from AI and agents to data science and web development." 
+ expected_url = "https://awesome-python.com/" + expected_image = "https://awesome-python.com/static/og-image.png" + + assert parser.title_count == 1 + assert parser.title.strip() == expected_title + assert parser.meta_by_name["description"] == expected_description + assert parser.links_by_rel["canonical"] == expected_url + assert parser.meta_by_property["og:type"] == "website" + assert parser.meta_by_property["og:title"] == expected_title + assert parser.meta_by_property["og:description"] == expected_description + assert parser.meta_by_property["og:image"] == expected_image + assert parser.meta_by_property["og:url"] == expected_url + assert parser.meta_by_name["twitter:card"] == "summary_large_image" + assert parser.meta_by_name["twitter:title"] == expected_title + assert parser.meta_by_name["twitter:description"] == expected_description + assert parser.meta_by_name["twitter:image"] == expected_image + assert "\n Date: Sat, 2 May 2026 02:32:18 +0800 Subject: [PATCH 2/2] feat: generate llms.txt from template and annotate entries with star counts - Add llms.txt Jinja2 template with a categories_md placeholder - Extract categories body from README and inject it into the template - Annotate bullet-entry lines with GitHub star counts (N GitHub stars) for the main index.md and bare numbers for llms.txt - Add TestAnnotateEntriesWithStars unit tests Co-Authored-By: Claude --- website/build.py | 76 +++++++++++++++++++++++++++++++- website/templates/llms.txt | 9 ++++ website/tests/test_build.py | 87 ++++++++++++++++++++++++++++++++++++- 3 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 website/templates/llms.txt diff --git a/website/build.py b/website/build.py index 8fb5f38420..f9e3aa55d1 100644 --- a/website/build.py +++ b/website/build.py @@ -14,6 +14,8 @@ from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$") +MARKDOWN_LINK_RE = 
re.compile(r"\[[^\]]+\]\(([^)\s]+)\)") +BULLET_LINE_RE = re.compile(r"^\s*-\s") SITE_URL = "https://awesome-python.com/" SITEMAP_URL = f"{SITE_URL}sitemap.xml" SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" @@ -104,6 +106,72 @@ def top_level_heading_text(line: str) -> str | None: return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip() +LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}" + + +def extract_categories_body(markdown: str) -> str: + """Return content under the `# Categories` heading, excluding the heading line itself.""" + lines = markdown.splitlines(keepends=True) + start_idx = None + end_idx = len(lines) + for i, line in enumerate(lines): + heading = top_level_heading_text(line) + if heading is None: + continue + if start_idx is None and heading.lower() == "categories": + start_idx = i + 1 + while start_idx < len(lines) and lines[start_idx].strip() == "": + start_idx += 1 + elif start_idx is not None and i >= start_idx: + end_idx = i + break + if start_idx is None: + return "" + return "".join(lines[start_idx:end_idx]).rstrip() + "\n" + + +def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str: + """Render the llms.txt template by injecting the README's Categories body, then annotate stars.""" + body = extract_categories_body(readme_text).rstrip() + rendered = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, body) + return annotate_entries_with_stars(rendered, stars_data, format_stars=str) + + +def annotate_entries_with_stars( + markdown: str, + stars_data: dict[str, dict], + *, + format_stars=None, +) -> str: + """Append the star count to bullet entry lines whose first GitHub link has known star data. + + `format_stars` controls the parenthesized text. Defaults to "{N} GitHub stars". + Pass `str` for a bare number. 
+ """ + if format_stars is None: + format_stars = lambda n: f"{n} GitHub stars" # noqa: E731 lambda-assignment + lines = markdown.splitlines(keepends=True) + out: list[str] = [] + for line in lines: + if not BULLET_LINE_RE.match(line): + out.append(line) + continue + annotated = line + for match in MARKDOWN_LINK_RE.finditer(line): + repo_key = extract_github_repo(match.group(1)) + if not repo_key: + continue + entry = stars_data.get(repo_key) + if not entry or "stars" not in entry: + continue + stripped = line.rstrip("\n") + ending = line[len(stripped):] + annotated = f"{stripped} ({format_stars(entry['stars'])}){ending}" + break + out.append(annotated) + return "".join(out) + + def remove_sponsors_section(markdown: str) -> str: lines = markdown.splitlines(keepends=True) start_idx = None @@ -243,11 +311,15 @@ def build(repo_root: Path) -> None: if static_src.exists(): shutil.copytree(static_src, static_dst, dirs_exist_ok=True) - markdown_index = remove_sponsors_section(readme_text) + markdown_index = annotate_entries_with_stars( + remove_sponsors_section(readme_text), stars_data + ) + llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8") + llms_txt = build_llms_txt(llms_template, readme_text, stars_data) (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8") write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())]) (site_dir / "index.md").write_text(markdown_index, encoding="utf-8") - (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8") + (site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8") print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories") print(f"Total entries: {total_entries}") diff --git a/website/templates/llms.txt b/website/templates/llms.txt new file mode 100644 index 0000000000..1db05c3bb1 --- /dev/null +++ b/website/templates/llms.txt @@ -0,0 +1,9 @@ +# Awesome Python + +An opinionated guide to the best Python 
frameworks, libraries, tools, and resources. + +Use this curated list when you need to find a high-quality Python library or tool for tasks such as web development, data science, machine learning, AI agents, automation, testing, or DevOps. A trailing number in parentheses on an entry is its star count on GitHub; entries without one have no recorded star data. + +# Categories + +{{ categories_md }} diff --git a/website/tests/test_build.py b/website/tests/test_build.py index 1feab77d9c..32b01917d5 100644 --- a/website/tests/test_build.py +++ b/website/tests/test_build.py @@ -9,6 +9,7 @@ from pathlib import Path from build import ( + annotate_entries_with_stars, build, detect_source_type, extract_entries, @@ -108,6 +109,16 @@ def _make_repo(self, tmp_path, readme): "{% endblock %}", encoding="utf-8", ) + (tpl_dir / "llms.txt").write_text( + "# Awesome Python\n" + "\n" + "Use this list to find Python tools.\n" + "\n" + "# Categories\n" + "\n" + "{{ categories_md }}\n", + encoding="utf-8", + ) def _copy_real_templates(self, tmp_path): real_tpl = Path(__file__).parent / ".." / "templates" tpl_dir = tmp_path / "website" / "templates" @@ -223,6 +234,7 @@ def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path): ## Widgets - [w1](https://example.com) - A widget. + - [w2](https://github.com/owner/w2) - A starred widget. 
# Contributing @@ -231,6 +243,13 @@ def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path): (tmp_path / "README.md").write_text(readme, encoding="utf-8") self._copy_real_templates(tmp_path) + data_dir = tmp_path / "website" / "data" + data_dir.mkdir(parents=True) + stars = { + "owner/w2": {"stars": 42, "owner": "owner", "fetched_at": "2026-01-01T00:00:00+00:00"}, + } + (data_dir / "github_stars.json").write_text(json.dumps(stars), encoding="utf-8") + build(tmp_path) site = tmp_path / "website" / "output" @@ -239,13 +258,23 @@ def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path): llms_txt = (site / "llms.txt").read_text(encoding="utf-8") assert '' in index_html - assert index_md == llms_txt assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories") assert "# **Sponsors**" not in index_md assert "Sponsor" not in index_md assert "SPONSORSHIP.md" not in index_md assert "## Widgets" in index_md assert "- [w1](https://example.com) - A widget." in index_md + assert "- [w2](https://github.com/owner/w2) - A starred widget. (42 GitHub stars)" in index_md + + assert llms_txt.startswith("# Awesome Python\n") + assert "# Categories" in llms_txt + assert "Use this curated list" in llms_txt + assert "## Widgets" in llms_txt + assert "- [w1](https://example.com) - A widget." in llms_txt + assert "- [w2](https://github.com/owner/w2) - A starred widget. (42)" in llms_txt + assert "{{ categories_md }}" not in llms_txt + assert "# Contributing" not in llms_txt + assert "Help!" 
not in llms_txt def test_build_cleans_stale_output(self, tmp_path): readme = textwrap.dedent("""\ @@ -604,3 +633,59 @@ def test_source_type_detected(self): categories = [c for g in groups for c in g["categories"]] entries = extract_entries(categories, groups) assert entries[0]["source_type"] == "Built-in" + + +# --------------------------------------------------------------------------- +# annotate_entries_with_stars +# --------------------------------------------------------------------------- + + +class TestAnnotateEntriesWithStars: + def test_appends_star_count_to_bullet(self): + markdown = "- [foo](https://github.com/owner/foo) - A foo.\n" + stars = {"owner/foo": {"stars": 123, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == ( + "- [foo](https://github.com/owner/foo) - A foo. (123 GitHub stars)\n" + ) + + def test_uses_first_github_link(self): + markdown = ( + "- [foo](https://github.com/owner/foo) - A foo. " + "Also [bar](https://github.com/owner/bar).\n" + ) + stars = { + "owner/foo": {"stars": 10, "owner": "owner"}, + "owner/bar": {"stars": 99, "owner": "owner"}, + } + assert annotate_entries_with_stars(markdown, stars) == ( + "- [foo](https://github.com/owner/foo) - A foo. " + "Also [bar](https://github.com/owner/bar). 
(10 GitHub stars)\n" + ) + + def test_skips_entries_without_star_data(self): + markdown = "- [foo](https://github.com/owner/foo) - A foo.\n" + assert annotate_entries_with_stars(markdown, {}) == markdown + + def test_skips_non_github_links(self): + markdown = "- [foo](https://example.com) - A foo.\n" + stars = {"owner/foo": {"stars": 1, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == markdown + + def test_skips_non_bullet_lines(self): + markdown = "See [foo](https://github.com/owner/foo) for details.\n" + stars = {"owner/foo": {"stars": 1, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == markdown + + def test_handles_indented_bullets(self): + markdown = " - [foo](https://github.com/owner/foo)\n" + stars = {"owner/foo": {"stars": 7, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == ( + " - [foo](https://github.com/owner/foo) (7 GitHub stars)\n" + ) + + def test_preserves_lines_without_trailing_newline(self): + markdown = "- [foo](https://github.com/owner/foo) - A foo." + stars = {"owner/foo": {"stars": 5, "owner": "owner"}} + assert annotate_entries_with_stars(markdown, stars) == ( + "- [foo](https://github.com/owner/foo) - A foo. (5 GitHub stars)" + )