StarTrail-org · andylizf · Jun 23, 2026
diff --git a/embed/src/pixelrag_embed/embed.py b/embed/src/pixelrag_embed/embed.py
@@ -264,13 +264,14 @@ def scan_shard_tiles(
         if not meta.get("complete", False):
             continue
 
-        # Extract article_id from directory name: "3104240.png.tiles" -> 3104240
-        dir_name = tiles_dir.name  # e.g. "3104240.png.tiles"
-        try:
-            article_id = int(dir_name.split(".")[0])
-        except (ValueError, IndexError):
-            logger.warning("Cannot parse article_id from %s", dir_name)
-            continue
+        article_id = meta.get("article_id")
+        if article_id is None:
+            dir_name = tiles_dir.name
+            try:
+                article_id = int(dir_name.split(".")[0])
+            except (ValueError, IndexError):
+                logger.warning("Cannot parse article_id from %s", dir_name)
+                continue
 
         if article_id in skip:
             continue
@@ -343,12 +344,14 @@ def scan_shard_chunks(
             logger.warning("Skipping %s: %s", chunks_json, e)
             continue
 
-        dir_name = tiles_dir.name
-        try:
-            article_id = int(dir_name.split(".")[0])
-        except (ValueError, IndexError):
-            logger.warning("Cannot parse article_id from %s", dir_name)
-            continue
+        article_id = meta.get("article_id")
+        if article_id is None:
+            dir_name = tiles_dir.name
+            try:
+                article_id = int(dir_name.split(".")[0])
+            except (ValueError, IndexError):
+                logger.warning("Cannot parse article_id from %s", dir_name)
+                continue
 
         if article_id in skip:
             continue

diff --git a/embed/src/pixelrag_embed/embed_cpu.py b/embed/src/pixelrag_embed/embed_cpu.py
@@ -79,16 +79,26 @@ def scan_chunks(shard_dir: str) -> list[dict]:
             )
 
         for td in tile_dirs:
-            dir_name = td.name
-            article_id_str = dir_name.replace(".png.tiles", "")
-            try:
-                article_id = int(article_id_str)
-            except ValueError:
-                article_id = hash(article_id_str) % (2**31)
-
             chunks_json = td / "chunks.json"
             tiles_json = td / "tiles.json"
 
+            # Read article_id from the manifest (written by the pipeline).
+            # Fall back to parsing the directory name for backward compat
+            # with indexes built before this change.
+            article_id = None
+            for mf in (chunks_json, tiles_json):
+                if mf.exists() and article_id is None:
+                    try:
+                        article_id = json.loads(mf.read_text()).get("article_id")
+                    except (json.JSONDecodeError, OSError):
+                        pass
+            if article_id is None:
+                article_id_str = td.name.replace(".png.tiles", "")
+                try:
+                    article_id = int(article_id_str)
+                except ValueError:
+                    article_id = hash(article_id_str) % (2**31)
+
             if chunks_json.exists():
                 with open(chunks_json) as f:
                     manifest = json.load(f)

diff --git a/index/src/pixelrag_index/pipelines.py b/index/src/pixelrag_index/pipelines.py
@@ -89,28 +89,45 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path:
             "  Rendered %d URLs (%d skipped, already exist)", len(new_url_docs), skipped
         )
 
-    # Render PDFs
+    # Render PDFs — use idx as tile directory name (like URLs) so directory
+    # names are always the numeric article_id.
     for idx, doc in pdf_docs:
         try:
-            render_pdf(doc.path, str(tiles_dir))
+            render_pdf(doc.path, str(tiles_dir), stem=str(idx))
         except Exception as e:
             logger.warning("  FAILED PDF %s: %s", doc.id, e)
     if pdf_docs:
         logger.info("  Rendered %d PDFs", len(pdf_docs))
 
-    # Save articles.json for serve API — title + URL per article.
-    # Use the pipeline's sequential *position index* (0, 1, 2, …) rather than
-    # int(a["id"]), because local sources use filename stems (e.g. "art_alice")
-    # as doc IDs, which are not numeric. int() on a filename stem raises ValueError
-    # and crashes the entire index build step.
+    # Write article_id into each tile directory's manifest so the embed
+    # pipeline can read it explicitly instead of guessing from the directory
+    # name. This is the authoritative source of article_id — directory names
+    # are for humans, manifests are for the pipeline.
+    tile_dir_map: dict[int, Path] = {}
+    for idx, _ in url_docs + pdf_docs + image_docs:
+        tile_path = tiles_dir / f"{idx}.png.tiles"
+        if tile_path.is_dir():
+            tile_dir_map[idx] = tile_path
+    for idx, tile_path in tile_dir_map.items():
+        for manifest_name in ("tiles.json", "chunks.json"):
+            manifest_path = tile_path / manifest_name
+            if manifest_path.exists():
+                try:
+                    manifest = json.loads(manifest_path.read_text())
+                    manifest["article_id"] = idx
+                    manifest_path.write_text(json.dumps(manifest))
+                except (json.JSONDecodeError, OSError):
+                    pass
+
+    # Save articles.json for serve API — maps article_id (array index) to
+    # human-readable title + URL.
     articles_path = output / "articles.json"
     article_entries = []
     for enum_idx, a in enumerate(articles):
         title = a.get("metadata", {}).get("title", "")
         if not title and a.get("url"):
             title = a["url"].split("/")[-1].replace("_", " ").replace("%20", " ")
         if not title:
-            # Fall back to original doc id (e.g. filename stem) as display title
             title = a.get("id", str(enum_idx))
         url = a.get("url", "") or a.get("path", "")
         article_entries.append({"title": title, "url": url})

diff --git a/render/src/pixelrag_render/backends/pdf.py b/render/src/pixelrag_render/backends/pdf.py
@@ -20,6 +20,7 @@ def render_pdf(
     dpi: int = 200,
     pages: Optional[list[int]] = None,
     quality: int = 85,
+    stem: str | None = None,
 ) -> list[Path]:
     """Render a PDF to JPEG tiles.
 
@@ -32,6 +33,9 @@ def render_pdf(
         dpi: Resolution for rendering (default 200 gives ~1650×2200px for A4).
         pages: 1-based list of page numbers to render. ``None`` renders all pages.
         quality: JPEG quality 1-100 (default 85).
+        stem: Override for the tile directory name. Defaults to the PDF filename
+            stem. The pipeline passes the article_id here so directory names
+            are always numeric and consistent with articles.json.
 
     Returns:
         List containing the single tile directory Path on success.
@@ -55,7 +59,8 @@ def render_pdf(
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    stem = path.stem
+    if stem is None:
+        stem = path.stem
     tile_dir = output_dir / f"{stem}.png.tiles"
     tile_dir.mkdir(parents=True, exist_ok=True)
 

diff --git a/render/src/pixelrag_render/render.py b/render/src/pixelrag_render/render.py
@@ -115,6 +115,7 @@ def render_pdf(
     dpi: int = 200,
     pages: Optional[list[int]] = None,
     quality: int = 85,
+    stem: str | None = None,
 ) -> list[Path]:
     """Render a PDF file to tiled JPEG images.
 
@@ -124,13 +125,14 @@ def render_pdf(
         dpi: Rendering resolution (default 200 ≈ 1650×2200 for A4).
         pages: 1-based list of page numbers to render. ``None`` renders all.
         quality: JPEG quality 1-100 (default 85).
+        stem: Override for the tile directory name (default: PDF filename stem).
 
     Returns:
         List containing the tile directory Path on success.
     """
     from .backends.pdf import render_pdf as _render_pdf
 
-    return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality)
+    return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality, stem=stem)
 
 
 def render_file(

diff --git a/tests/test_article_id.py b/tests/test_article_id.py
@@ -0,0 +1,86 @@
+"""Tests for the article_id manifest contract.
+
+The pipeline writes article_id into tiles.json/chunks.json so the embed
+pipeline reads it explicitly instead of guessing from directory names.
+"""
+
+import json
+from pathlib import Path
+
+from pixelrag_embed.embed_cpu import scan_chunks
+
+
+def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None):
+    """Build a minimal ``<dir_name>.png.tiles`` fixture directory under ``base``.
+
+    The directory gets one 875x500 RGB tile (``tile_0000.png``) plus
+    ``tiles.json`` and ``chunks.json`` describing that single tile/chunk.
+    ``article_id`` is stored in both manifests only when it is not ``None``.
+    Returns the created directory.
+    """
+    tile_dir = base / f"{dir_name}.png.tiles"
+    tile_dir.mkdir(parents=True)
+
+    from PIL import Image
+
+    Image.new("RGB", (875, 500)).save(tile_dir / "tile_0000.png")
+
+    # Omitted entirely (not written as null) when no article_id is given.
+    id_field = {} if article_id is None else {"article_id": article_id}
+    only_chunk = {
+        "tile": "tile_0000.png",
+        "tile_index": 0,
+        "chunk_index": 0,
+        "file": "tile_0000.png",
+        "x_offset": 0,
+        "y_offset": 0,
+        "height": 500,
+        "width": 875,
+    }
+    tiles_meta = {"tiles": ["tile_0000.png"], "complete": True, **id_field}
+    chunks_meta = {"chunks": [only_chunk], **id_field}
+
+    (tile_dir / "tiles.json").write_text(json.dumps(tiles_meta))
+    (tile_dir / "chunks.json").write_text(json.dumps(chunks_meta))
+    return tile_dir
+
+
+def test_article_id_from_manifest(tmp_path):
+    """A manifest-supplied article_id is used even for a non-numeric dir name."""
+    _make_tile_dir(tmp_path, "report", article_id=0)
+    scanned = scan_chunks(str(tmp_path))
+    # One chunk in the fixture -> exactly one item, carrying the manifest id.
+    assert [entry["article_id"] for entry in scanned] == [0]
+
+
+def test_article_id_fallback_to_dir_name(tmp_path):
+    """With no article_id in the manifests, the numeric directory name is used."""
+    _make_tile_dir(tmp_path, "42", article_id=None)
+    scanned = scan_chunks(str(tmp_path))
+    # Single-chunk fixture: one item whose id was parsed from "42.png.tiles".
+    assert [entry["article_id"] for entry in scanned] == [42]
+
+
+def test_non_numeric_dir_without_manifest_id(tmp_path):
+    """Non-numeric dir name + no manifest id → hash fallback, an int in [0, 2**31)."""
+    _make_tile_dir(tmp_path, "my_report", article_id=None)
+    fallback_ids = [it["article_id"] for it in scan_chunks(str(tmp_path))]
+    assert len(fallback_ids) == 1 and isinstance(fallback_ids[0], int)
+    assert 0 <= fallback_ids[0] < 2**31  # hash(...) % (2**31) is never negative
+
+
+def test_manifest_id_overrides_dir_name(tmp_path):
+    """A manifest article_id takes precedence over a numeric directory name."""
+    _make_tile_dir(tmp_path, "999", article_id=7)
+    scanned = scan_chunks(str(tmp_path))
+    # The directory says 999 but the manifests say 7; the manifests must win.
+    assert [entry["article_id"] for entry in scanned] == [7]
+
+
+def test_multiple_articles_distinct_ids(tmp_path):
+    """Two tile dirs yield exactly one item each, carrying their manifest ids."""
+    _make_tile_dir(tmp_path, "report", article_id=0)
+    _make_tile_dir(tmp_path, "slides", article_id=1)
+    items = scan_chunks(str(tmp_path))
+    # A sorted list (not a set) so duplicated or extra items fail the test.
+    assert sorted(it["article_id"] for it in items) == [0, 1]