From 5a8629ca28a91a2d61afa59d10a183827d89a7e8 Mon Sep 17 00:00:00 2001
From: watanabe-kohei-jp <283722319+watanabe-kohei-jp@users.noreply.github.com>
Date: Mon, 25 May 2026 10:33:26 +0900
Subject: [PATCH] fix(automation): strip HTML comments before metadata
 extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TITLE_RE / META_DESC_RE / SECTION_RE は src の HTML 全体に対して走るので、
HTML コメント内に literal の <title> や <meta name="description"> 文字列が
書かれていると、コメントの開きタグから本物の </title> までを誤キャプチャ
してしまっていた。

実例: _template/index.html のコメントは「<title> と <meta name=description>
は自動転記される」と書いていたため、テンプレートからコピーした全 lecture が
validate_metadata の改行検査で SystemExit する地雷を踏んでいた。

修正:
- strip_html_comments() を抽出前に通し、コメント内のタグ風文字列を無視する
- _template/index.html のコメントから literal の <title>/<meta> 記述を除去
  (defense in depth: regex 修正と併用)

E2E 検証: 修正前 fail → 修正後 5 ブロック正常検知。コメント内に攻撃的な
<title>FAKE</title> を仕込んでも本物の title/desc が正しく抽出される。
---
 _template/index.html     | 6 +++---
 scripts/sync_listings.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/_template/index.html b/_template/index.html
index 3e32081..828c199 100644
--- a/_template/index.html
+++ b/_template/index.html
@@ -2,9 +2,9 @@
 <html lang="ja">
 <head>
 <meta charset="UTF-8">
-<!-- 注意: <title> と <meta name="description"> はルートの sitemap.xml / llms.txt /
-     index.html / README.md の講義一覧に自動転記されます（scripts/sync_listings.py）。
-     listing にそのまま載る前提で、短く・的確に書いてください。 -->
+<!-- 注意: 直下の title 要素と meta description はルートの sitemap.xml /
+     llms.txt / index.html / README.md の講義一覧に自動転記されます
+     (scripts/sync_listings.py)。listing にそのまま載る前提で、短く・的確に。 -->
 <title>{{ 講義タイトル }} — Lectures</title>
 <meta name="description" content="{{ 1〜2 文の概要。検索・GEO 用。160 字以内で、この回が扱う機能と読者の得るものを書く }}">
 <meta name="author" content="@watanabe-kohei-jp">
diff --git a/scripts/sync_listings.py b/scripts/sync_listings.py
index cfce89b..7e9da1d 100644
--- a/scripts/sync_listings.py
+++ b/scripts/sync_listings.py
@@ -36,6 +36,13 @@
     re.DOTALL,
 )
 SECTION_RE = re.compile(r"<section[\s>]")
+HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
+
+
+def strip_html_comments(page: str) -> str:
+    """Return page with each closed <!-- ... --> span (multi-line too) removed, so
+    TITLE_RE / META_DESC_RE / SECTION_RE do not misfire on tag-like comment text."""
+    return HTML_COMMENT_RE.sub("", page)
 
 
 def repo_root() -> Path:
@@ -119,7 +126,7 @@ def discover_lectures(root: Path) -> list[Lecture]:
                 f"ERROR: {child.name}/ matches the lecture naming pattern "
                 f"but has no index.html"
             )
-        page = idx.read_text(encoding="utf-8")
+        page = strip_html_comments(idx.read_text(encoding="utf-8"))
         lectures.append(Lecture(
             num=m.group(1),
             slug=m.group(2),