Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 79 additions & 79 deletions README.md

Large diffs are not rendered by default.

53 changes: 38 additions & 15 deletions website/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,16 +243,14 @@ def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:

def top_level_heading_text(line: str) -> str | None:
stripped = line.strip()
if not stripped.startswith("# "):
match = re.match(r"^(#{1,2})\s+(.+)$", stripped)
if match is None:
return None
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()


LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}"
return match.group(2).strip().strip("#").strip().strip("*").strip()


def extract_categories_body(markdown: str) -> str:
"""Return content under the `# Categories` heading, excluding the heading line itself."""
"""Return content from `Categories` through `Projects`, excluding later sections."""
lines = markdown.splitlines(keepends=True)
start_idx = None
end_idx = len(lines)
Expand All @@ -264,19 +262,40 @@ def extract_categories_body(markdown: str) -> str:
start_idx = i + 1
while start_idx < len(lines) and lines[start_idx].strip() == "":
start_idx += 1
elif start_idx is not None and i >= start_idx:
elif start_idx is not None and heading.lower() in ("resources", "contributing"):
end_idx = i
break
if start_idx is None:
return ""
return "".join(lines[start_idx:end_idx]).rstrip() + "\n"


def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str:
"""Render the llms.txt template by injecting the README's Categories body, then annotate stars."""
body = extract_categories_body(readme_text).rstrip()
rendered = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, body)
return annotate_entries_with_stars(rendered, stars_data, format_stars=str)
def build_llms_txt(
    template_text: str,
    *,
    readme_text: str,
    stars_data: dict[str, dict],
    categories: Sequence[ParsedSection],
    total_entries: int,
) -> str:
    """Render the llms.txt template into the final file content.

    The README's categories body (from `extract_categories_body`) is passed
    through `annotate_entries_with_stars` with a `GitHub stars: N` formatter
    and handed to the template as `categories_md`, together with the site
    links and the entry/category totals. Only the number of `categories` is
    used. The returned text ends with exactly one newline.
    """

    def star_note(n):
        return f"GitHub stars: {n}"

    catalog_body = extract_categories_body(readme_text).rstrip()
    context = {
        "site_url": SITE_URL,
        "github_repo_url": "https://github.com/vinta/awesome-python",
        "contributing_url": "https://github.com/vinta/awesome-python/blob/master/CONTRIBUTING.md",
        "sponsorship_url": SPONSORSHIP_PUBLIC_URL,
        "sitemap_url": SITEMAP_URL,
        "categories_md": annotate_entries_with_stars(
            catalog_body,
            stars_data,
            format_stars=star_note,
        ),
        "total_entries": total_entries,
        "total_categories": len(categories),
    }

    # Plain-text output, so HTML autoescaping is switched off.
    env = Environment(autoescape=False, trim_blocks=True, lstrip_blocks=True)
    template = env.from_string(template_text)
    return template.render(**context).rstrip() + "\n"


def annotate_entries_with_stars(
Expand Down Expand Up @@ -588,11 +607,16 @@ def render_category(
if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

markdown_index = annotate_entries_with_stars(remove_sponsors_section(readme_text), stars_data)
sponsorship_md = repo_root / "SPONSORSHIP.md"
sponsorship_md_mtime = datetime.fromtimestamp(sponsorship_md.stat().st_mtime, tz=UTC).date().isoformat()
llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8")
llms_txt = build_llms_txt(llms_template, readme_text, stars_data)
llms_txt = build_llms_txt(
llms_template,
readme_text=readme_text,
stars_data=stars_data,
categories=categories,
total_entries=total_entries,
)
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
sitemap_date = build_date.date().isoformat()
sitemap_urls = [(SITE_URL, sitemap_date)]
Expand All @@ -604,7 +628,6 @@ def render_category(
sitemap_urls.append((subcategory_public_url(cat_slug, sub_slug), sitemap_date))
sitemap_urls.append((SPONSORSHIP_PUBLIC_URL, sponsorship_md_mtime))
write_sitemap_xml(site_dir / "sitemap.xml", sitemap_urls)
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")

print(f"Built site with {len(parsed_groups)} groups, {len(categories)} categories")
Expand Down
51 changes: 31 additions & 20 deletions website/readme_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ def _heading_text(node: SyntaxTreeNode) -> str:
return ""


def _heading_level(node: SyntaxTreeNode) -> int | None:
"""Return the numeric level for a heading node."""
if node.type != "heading" or not node.tag.startswith("h"):
return None
return int(node.tag[1:])


def _extract_description_children(nodes: list[SyntaxTreeNode]) -> list[SyntaxTreeNode]:
"""Extract description children from the first paragraph if it's a single <em> block.

Expand Down Expand Up @@ -303,7 +310,7 @@ def _parse_grouped_sections(
) -> list[ParsedGroup]:
"""Parse nodes into groups of categories using bold markers as group boundaries.

Bold-only paragraphs (**Group Name**) delimit groups. H2 headings under each
Bold-only paragraphs (**Group Name**) delimit groups. H3 headings under each
bold marker become categories within that group. Categories appearing before
any bold marker go into an "Other" group.
"""
Expand Down Expand Up @@ -341,7 +348,7 @@ def flush_group() -> None:
flush_group()
current_group_name = bold_name
current_cat_body = []
elif node.type == "heading" and node.tag == "h2":
elif node.type == "heading" and node.tag in ("h2", "h3"):
flush_cat()
current_cat_name = _heading_text(node)
current_cat_body = []
Expand Down Expand Up @@ -383,7 +390,7 @@ def _parse_sponsor_item(inline: SyntaxTreeNode) -> ParsedSponsor | None:


def parse_sponsors(text: str) -> list[ParsedSponsor]:
"""Parse the `# Sponsors` section of README.md into a list of sponsors.
"""Parse the `Sponsors` section of README.md into a list of sponsors.

Expects bullets in the form `**[name](url)**: description`.
Returns [] if no Sponsors section exists.
Expand All @@ -395,14 +402,18 @@ def parse_sponsors(text: str) -> list[ParsedSponsor]:

start_idx = None
end_idx = len(children)
start_level = None
for i, node in enumerate(children):
if node.type == "heading" and node.tag == "h1":
title = _heading_text(node).strip().lower()
if start_idx is None and title == "sponsors":
start_idx = i + 1
elif start_idx is not None:
end_idx = i
break
level = _heading_level(node)
if level is None:
continue
title = _heading_text(node).strip().lower()
if start_idx is None and title == "sponsors":
start_idx = i + 1
start_level = level
elif start_idx is not None and start_level is not None and level <= start_level:
end_idx = i
break
if start_idx is None:
return []

Expand All @@ -426,26 +437,26 @@ def parse_readme(text: str) -> list[ParsedGroup]:
"""Parse README.md text into grouped categories.

Returns a list of ParsedGroup dicts containing nested categories.
Content between the thematic break (---) and # Resources or # Contributing
is parsed as categories grouped by bold markers (**Group Name**).
Content between the Projects heading and Resources or Contributing is parsed
as categories grouped by bold markers (**Group Name**).
"""
md = MarkdownIt("commonmark")
tokens = md.parse(text)
root = SyntaxTreeNode(tokens)
children = root.children

# Find thematic break (---) and section boundaries in one pass
hr_idx = None
# Find Projects and section boundaries in one pass.
projects_idx = None
cat_end_idx = None
for i, node in enumerate(children):
if hr_idx is None and node.type == "hr":
hr_idx = i
elif node.type == "heading" and node.tag == "h1":
if _heading_level(node) in (1, 2):
text_content = _heading_text(node)
if cat_end_idx is None and text_content in ("Resources", "Contributing"):
if projects_idx is None and text_content == "Projects":
projects_idx = i
elif cat_end_idx is None and text_content in ("Resources", "Contributing"):
cat_end_idx = i
if hr_idx is None:
if projects_idx is None:
return []

cat_nodes = children[hr_idx + 1 : cat_end_idx or len(children)]
cat_nodes = children[projects_idx + 1 : cat_end_idx or len(children)]
return _parse_grouped_sections(cat_nodes)
2 changes: 1 addition & 1 deletion website/templates/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<meta name="description" content="{{ meta_description | trim }}" />
<link rel="canonical" href="{{ canonical_url | trim }}" />
{% block alternate_links %}
<link rel="alternate" type="text/markdown" href="/index.md" />
<link rel="alternate" type="text/plain" href="/llms.txt" title="LLMs text entry point" />
{% endblock %}
<meta property="og:type" content="website" />
<meta property="og:title" content="{{ meta_title | trim }}" />
Expand Down
14 changes: 11 additions & 3 deletions website/templates/llms.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
# Awesome Python

An opinionated guide to the best Python frameworks, libraries, tools, and resources.
Awesome Python is an opinionated catalog of {{ total_entries }} Python frameworks, libraries, tools, and resources across {{ total_categories }} {% if total_categories == 1 %}category{% else %}categories{% endif %}.

Use this curated list when you need to find a high-quality Python library or tool for tasks such as web development, data science, machine learning, AI agents, automation, testing, or DevOps. The trailing number on each entry is its star count on GitHub.
Scan the category index, then jump to the matching section for direct project links and short descriptions. GitHub entries with known star data end with a `GitHub stars: N` note in parentheses; treat it as popularity context, not a quality guarantee. Use the homepage for project context outside the catalog.

# Categories
## Primary Links

- Homepage: {{ site_url }}
- GitHub repository: {{ github_repo_url }}
- Contributing guide: {{ contributing_url }}
- Sponsorship: {{ sponsorship_url }}
- Sitemap: {{ sitemap_url }}

## Categories

{{ categories_md }}
Loading