diff --git a/.gitignore b/.gitignore index ca26a6e8fc..0d9f410bb5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,12 +10,12 @@ __pycache__/ website/output/ website/data/ -# claude code +# planning docs +docs/ + +# agents +.agents/ .claude/skills/ -.gstack/ -.playwright-cli/ .superpowers/ +.playwright-cli/ skills-lock.json - -# codex -.agents/ diff --git a/README.md b/README.md index 51ae9d16aa..107b6859f7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Awesome Python -An opinionated list of Python frameworks, libraries, tools, and resources. +An opinionated guide to the best Python frameworks, libraries, tools, and resources. # **Sponsors** diff --git a/website/build.py b/website/build.py index c223ef18ae..f9e3aa55d1 100644 --- a/website/build.py +++ b/website/build.py @@ -4,6 +4,8 @@ import json import re import shutil +import xml.etree.ElementTree as ET +from collections.abc import Sequence from datetime import UTC, datetime from pathlib import Path from typing import Any @@ -12,6 +14,11 @@ from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$") +MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)") +BULLET_LINE_RE = re.compile(r"^\s*-\s") +SITE_URL = "https://awesome-python.com/" +SITEMAP_URL = f"{SITE_URL}sitemap.xml" +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" SOURCE_TYPE_DOMAINS = { "docs.python.org": "Built-in", @@ -67,6 +74,125 @@ def sort_key(entry: dict) -> tuple[int, int, int, str]: return sorted(entries, key=sort_key) +def build_robots_txt() -> str: + return ( + "User-agent: *\n" + "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n" + "Allow: /\n" + "\n" + f"Sitemap: {SITEMAP_URL}\n" + ) + + +def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None: + ET.register_namespace("", SITEMAP_NS) + urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset") + for url, lastmod in urls: + url_el = 
ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url") + loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc") + loc_el.text = url + lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod") + lastmod_el.text = lastmod + + ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True) + with path.open("ab") as f: + f.write(b"\n") + + +def top_level_heading_text(line: str) -> str | None: + stripped = line.strip() + if not stripped.startswith("# "): + return None + return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip() + + +LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}" + + +def extract_categories_body(markdown: str) -> str: + """Return content under the `# Categories` heading, excluding the heading line itself.""" + lines = markdown.splitlines(keepends=True) + start_idx = None + end_idx = len(lines) + for i, line in enumerate(lines): + heading = top_level_heading_text(line) + if heading is None: + continue + if start_idx is None and heading.lower() == "categories": + start_idx = i + 1 + while start_idx < len(lines) and lines[start_idx].strip() == "": + start_idx += 1 + elif start_idx is not None and i >= start_idx: + end_idx = i + break + if start_idx is None: + return "" + return "".join(lines[start_idx:end_idx]).rstrip() + "\n" + + +def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str: + """Render the llms.txt template by injecting the README's Categories body, then annotate stars.""" + body = extract_categories_body(readme_text).rstrip() + rendered = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, body) + return annotate_entries_with_stars(rendered, stars_data, format_stars=str) + + +def annotate_entries_with_stars( + markdown: str, + stars_data: dict[str, dict], + *, + format_stars=None, +) -> str: + """Append the star count to bullet entry lines whose first GitHub link has known star data. + + `format_stars` controls the parenthesized text. Defaults to "{N} GitHub stars". 
+ Pass `str` for a bare number. + """ + if format_stars is None: + format_stars = lambda n: f"{n} GitHub stars" # noqa: E731 lambda-assignment + lines = markdown.splitlines(keepends=True) + out: list[str] = [] + for line in lines: + if not BULLET_LINE_RE.match(line): + out.append(line) + continue + annotated = line + for match in MARKDOWN_LINK_RE.finditer(line): + repo_key = extract_github_repo(match.group(1)) + if not repo_key: + continue + entry = stars_data.get(repo_key) + if not entry or "stars" not in entry: + continue + stripped = line.rstrip("\n") + ending = line[len(stripped):] + annotated = f"{stripped} ({format_stars(entry['stars'])}){ending}" + break + out.append(annotated) + return "".join(out) + + +def remove_sponsors_section(markdown: str) -> str: + lines = markdown.splitlines(keepends=True) + start_idx = None + for i, line in enumerate(lines): + heading = top_level_heading_text(line) + if heading and heading.lower() == "sponsors": + start_idx = i + break + + if start_idx is None: + return markdown + + end_idx = len(lines) + for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1): + if top_level_heading_text(line): + end_idx = i + break + + return "".join(lines[:start_idx] + lines[end_idx:]) + + def extract_entries( categories: list[ParsedSection], groups: list[ParsedGroup], @@ -131,6 +257,7 @@ def build(repo_root: Path) -> None: categories = [cat for g in parsed_groups for cat in g["categories"]] total_entries = sum(c["entry_count"] for c in categories) entries = extract_entries(categories, parsed_groups) + build_date = datetime.now(UTC) stars_data = load_stars(website / "data" / "github_stars.json") @@ -155,6 +282,8 @@ def build(repo_root: Path) -> None: env = Environment( loader=FileSystemLoader(website / "templates"), autoescape=True, + trim_blocks=True, + lstrip_blocks=True, ) site_dir = website / "output" @@ -171,7 +300,7 @@ def build(repo_root: Path) -> None: total_entries=total_entries, total_categories=len(categories), 
repo_stars=repo_stars, - build_date=datetime.now(UTC).strftime("%B %d, %Y"), + build_date=build_date.strftime("%B %d, %Y"), sponsors=sponsors, ), encoding="utf-8", @@ -182,7 +311,15 @@ def build(repo_root: Path) -> None: if static_src.exists(): shutil.copytree(static_src, static_dst, dirs_exist_ok=True) - (site_dir / "llms.txt").write_text(readme_text, encoding="utf-8") + markdown_index = annotate_entries_with_stars( + remove_sponsors_section(readme_text), stars_data + ) + llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8") + llms_txt = build_llms_txt(llms_template, readme_text, stars_data) + (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8") + write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())]) + (site_dir / "index.md").write_text(markdown_index, encoding="utf-8") + (site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8") print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories") print(f"Total entries: {total_entries}") diff --git a/website/templates/base.html b/website/templates/base.html index 34546e73c3..af112095e7 100644 --- a/website/templates/base.html +++ b/website/templates/base.html @@ -1,26 +1,27 @@ + {% set default_meta_title = "Awesome Python" %} + {% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." 
%} + {% set canonical_url = "https://awesome-python.com/" %} + {% set social_image_url = "https://awesome-python.com/static/og-image.png" %} + {% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %} + {% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %} - {% block title %}Awesome Python{% endblock %} - - + {{ meta_title | trim }} + + + - - - - - + + + + + + + + diff --git a/website/templates/llms.txt b/website/templates/llms.txt new file mode 100644 index 0000000000..1db05c3bb1 --- /dev/null +++ b/website/templates/llms.txt @@ -0,0 +1,9 @@ +# Awesome Python + +An opinionated guide to the best Python frameworks, libraries, tools, and resources. + +Use this curated list when you need to find a high-quality Python library or tool for tasks such as web development, data science, machine learning, AI agents, automation, testing, or DevOps. The trailing number on each entry is its star count on GitHub. + +# Categories + +{{ categories_md }} diff --git a/website/tests/test_build.py b/website/tests/test_build.py index 0b22609a84..32b01917d5 100644 --- a/website/tests/test_build.py +++ b/website/tests/test_build.py @@ -3,9 +3,13 @@ import json import shutil import textwrap +import xml.etree.ElementTree as ET +from datetime import UTC, date, datetime +from html.parser import HTMLParser from pathlib import Path from build import ( + annotate_entries_with_stars, build, detect_source_type, extract_entries, @@ -15,6 +19,40 @@ ) from readme_parser import parse_readme, slugify + +class HeadMetadataParser(HTMLParser): + def __init__(self): + super().__init__() + self.title_count = 0 + self.title = "" + self.meta_by_name = {} + self.meta_by_property = {} + self.links_by_rel = {} + self._in_title = False + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if tag == "title": + self.title_count += 1 + self._in_title = True + elif tag == "meta": + if "name" in attrs: + 
self.meta_by_name[attrs["name"]] = attrs.get("content", "") + if "property" in attrs: + self.meta_by_property[attrs["property"]] = attrs.get("content", "") + elif tag == "link" and attrs.get("rel"): + for rel in attrs["rel"].split(): + self.links_by_rel[rel] = attrs.get("href", "") + + def handle_endtag(self, tag): + if tag == "title": + self._in_title = False + + def handle_data(self, data): + if self._in_title: + self.title += data + + # --------------------------------------------------------------------------- # slugify # --------------------------------------------------------------------------- @@ -71,6 +109,21 @@ def _make_repo(self, tmp_path, readme): "{% endblock %}", encoding="utf-8", ) + (tpl_dir / "llms.txt").write_text( + "# Awesome Python\n" + "\n" + "Use this list to find Python tools.\n" + "\n" + "# Categories\n" + "\n" + "{{ categories_md }}\n", + encoding="utf-8", + ) + + def _copy_real_templates(self, tmp_path): + real_tpl = Path(__file__).parent / ".." / "templates" + tpl_dir = tmp_path / "website" / "templates" + shutil.copytree(real_tpl, tpl_dir) def test_build_creates_single_page(self, tmp_path): readme = textwrap.dedent("""\ @@ -114,6 +167,115 @@ def test_build_creates_single_page(self, tmp_path): # No category sub-pages assert not (site / "categories").exists() + def test_build_creates_root_discovery_files(self, tmp_path): + readme = textwrap.dedent("""\ + # Awesome Python + + Intro. + + --- + + ## Widgets + + - [w1](https://example.com) - A widget. + + # Contributing + + Help! 
def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
    # Build a site from a README that contains a Sponsors section and one
    # GitHub-hosted entry with known star data, then check the generated
    # index.md and llms.txt outputs.
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        # **Sponsors**

        - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.

        > Become a sponsor: [Sponsor us](SPONSORSHIP.md).

        # Categories

        **Tools**

        - [Widgets](#widgets)

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.
        - [w2](https://github.com/owner/w2) - A starred widget.

        # Contributing

        Help!
    """)
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    # Use the repository's real templates rather than the minimal fixtures.
    self._copy_real_templates(tmp_path)

    # Star data for the one GitHub-hosted entry (w2); w1 has none.
    data_dir = tmp_path / "website" / "data"
    data_dir.mkdir(parents=True)
    stars = {
        "owner/w2": {"stars": 42, "owner": "owner", "fetched_at": "2026-01-01T00:00:00+00:00"},
    }
    (data_dir / "github_stars.json").write_text(json.dumps(stars), encoding="utf-8")

    build(tmp_path)

    site = tmp_path / "website" / "output"
    index_html = (site / "index.html").read_text(encoding="utf-8")
    index_md = (site / "index.md").read_text(encoding="utf-8")
    llms_txt = (site / "llms.txt").read_text(encoding="utf-8")

    # NOTE(review): membership of the empty string is always true, so this
    # assertion verifies nothing. The expected literal appears to be missing;
    # going by the test name it was presumably the markdown-alternate <link>
    # tag emitted by base.html. Confirm against the template and restore it.
    assert '' in index_html
    # index.md: the Sponsors section is gone, everything else is kept, and
    # only the entry with star data gains a "(N GitHub stars)" suffix.
    assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
    assert "# **Sponsors**" not in index_md
    assert "Sponsor" not in index_md
    assert "SPONSORSHIP.md" not in index_md
    assert "## Widgets" in index_md
    assert "- [w1](https://example.com) - A widget." in index_md
    assert "- [w2](https://github.com/owner/w2) - A starred widget. (42 GitHub stars)" in index_md

    # llms.txt: template prose plus the Categories body, with a bare star
    # count suffix and nothing from the sections after Categories.
    assert llms_txt.startswith("# Awesome Python\n")
    assert "# Categories" in llms_txt
    assert "Use this curated list" in llms_txt
    assert "## Widgets" in llms_txt
    assert "- [w1](https://example.com) - A widget." in llms_txt
    assert "- [w2](https://github.com/owner/w2) - A starred widget. (42)" in llms_txt
    assert "{{ categories_md }}" not in llms_txt
    assert "# Contributing" not in llms_txt
    assert "Help!" not in llms_txt
+ expected_url = "https://awesome-python.com/" + expected_image = "https://awesome-python.com/static/og-image.png" + + assert parser.title_count == 1 + assert parser.title.strip() == expected_title + assert parser.meta_by_name["description"] == expected_description + assert parser.links_by_rel["canonical"] == expected_url + assert parser.meta_by_property["og:type"] == "website" + assert parser.meta_by_property["og:title"] == expected_title + assert parser.meta_by_property["og:description"] == expected_description + assert parser.meta_by_property["og:image"] == expected_image + assert parser.meta_by_property["og:url"] == expected_url + assert parser.meta_by_name["twitter:card"] == "summary_large_image" + assert parser.meta_by_name["twitter:title"] == expected_title + assert parser.meta_by_name["twitter:description"] == expected_description + assert parser.meta_by_name["twitter:image"] == expected_image + assert "\n