From 28c9b5c42004d5810360bdec7de56f742e320460 Mon Sep 17 00:00:00 2001
From: Zhifei Li <andylizf@outlook.com>
Date: Tue, 23 Jun 2026 06:00:25 -0700
Subject: [PATCH 1/2] fix(index): write article_id into tile manifests, stop
 guessing from dir names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The embed pipeline extracted article_id by parsing the tile directory name
(e.g. "3104240.png.tiles" → int("3104240")). This broke for PDFs (directory
named after the filename stem, e.g. "report.png.tiles" → int("report") fails)
and for any non-numeric directory name. GPU embed logged a warning and skipped
the whole tile directory; CPU embed used a hash fallback that produced IDs
misaligned with articles.json.

Root cause: article_id was never explicitly communicated from the pipeline to
the embed stage — embed had to reverse-engineer it from the filesystem.

Fix: the pipeline now writes article_id into tiles.json and chunks.json after
rendering. Embed reads it from the manifest first, falling back to directory
name parsing for backward compatibility with existing large-scale indexes
(e.g. the Wikipedia corpus where dir names are already numeric).

Also: render_pdf gains a stem parameter so the pipeline can name PDF tile
directories by position index (like URLs), making directory names consistent
across all source types.

Follows the same sidecar-metadata pattern used by ColPali (JSONL manifest
mapping FAISS IDs to doc metadata) and LEANN (passage_id_scheme +
ids.txt/offset map). See also Rulin Shao's MassiveDS (shared shard IDs).
---
 embed/src/pixelrag_embed/embed.py          | 29 ++++----
 embed/src/pixelrag_embed/embed_cpu.py      | 24 ++++--
 index/src/pixelrag_index/pipelines.py      | 33 +++++++--
 render/src/pixelrag_render/backends/pdf.py |  7 +-
 render/src/pixelrag_render/render.py       |  4 +-
 tests/test_article_id.py                   | 86 ++++++++++++++++++++++
 6 files changed, 153 insertions(+), 30 deletions(-)
 create mode 100644 tests/test_article_id.py

diff --git a/embed/src/pixelrag_embed/embed.py b/embed/src/pixelrag_embed/embed.py
index e210b9a..af3afc7 100644
--- a/embed/src/pixelrag_embed/embed.py
+++ b/embed/src/pixelrag_embed/embed.py
@@ -264,13 +264,14 @@ def scan_shard_tiles(
         if not meta.get("complete", False):
             continue
 
-        # Extract article_id from directory name: "3104240.png.tiles" -> 3104240
-        dir_name = tiles_dir.name  # e.g. "3104240.png.tiles"
-        try:
-            article_id = int(dir_name.split(".")[0])
-        except (ValueError, IndexError):
-            logger.warning("Cannot parse article_id from %s", dir_name)
-            continue
+        article_id = meta.get("article_id")
+        if article_id is None:
+            dir_name = tiles_dir.name
+            try:
+                article_id = int(dir_name.split(".")[0])
+            except (ValueError, IndexError):
+                logger.warning("Cannot parse article_id from %s", dir_name)
+                continue
 
         if article_id in skip:
             continue
@@ -343,12 +344,14 @@ def scan_shard_chunks(
             logger.warning("Skipping %s: %s", chunks_json, e)
             continue
 
-        dir_name = tiles_dir.name
-        try:
-            article_id = int(dir_name.split(".")[0])
-        except (ValueError, IndexError):
-            logger.warning("Cannot parse article_id from %s", dir_name)
-            continue
+        article_id = meta.get("article_id")
+        if article_id is None:
+            dir_name = tiles_dir.name
+            try:
+                article_id = int(dir_name.split(".")[0])
+            except (ValueError, IndexError):
+                logger.warning("Cannot parse article_id from %s", dir_name)
+                continue
 
         if article_id in skip:
             continue
diff --git a/embed/src/pixelrag_embed/embed_cpu.py b/embed/src/pixelrag_embed/embed_cpu.py
index 3b717af..2cba3bf 100644
--- a/embed/src/pixelrag_embed/embed_cpu.py
+++ b/embed/src/pixelrag_embed/embed_cpu.py
@@ -79,16 +79,26 @@ def scan_chunks(shard_dir: str) -> list[dict]:
             )
 
         for td in tile_dirs:
-            dir_name = td.name
-            article_id_str = dir_name.replace(".png.tiles", "")
-            try:
-                article_id = int(article_id_str)
-            except ValueError:
-                article_id = hash(article_id_str) % (2**31)
-
             chunks_json = td / "chunks.json"
             tiles_json = td / "tiles.json"
 
+            # Read article_id from the manifest (written by the pipeline).
+            # Fall back to parsing the directory name for backward compat
+            # with indexes built before this change.
+            article_id = None
+            for mf in (chunks_json, tiles_json):
+                if mf.exists() and article_id is None:
+                    try:
+                        article_id = json.loads(mf.read_text()).get("article_id")
+                    except (json.JSONDecodeError, OSError):
+                        pass
+            if article_id is None:
+                article_id_str = td.name.replace(".png.tiles", "")
+                try:
+                    article_id = int(article_id_str)
+                except ValueError:
+                    article_id = hash(article_id_str) % (2**31)
+
             if chunks_json.exists():
                 with open(chunks_json) as f:
                     manifest = json.load(f)
diff --git a/index/src/pixelrag_index/pipelines.py b/index/src/pixelrag_index/pipelines.py
index f943ec8..1ab7593 100644
--- a/index/src/pixelrag_index/pipelines.py
+++ b/index/src/pixelrag_index/pipelines.py
@@ -89,20 +89,38 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path:
             "  Rendered %d URLs (%d skipped, already exist)", len(new_url_docs), skipped
         )
 
-    # Render PDFs
+    # Render PDFs — use idx as tile directory name (like URLs) so directory
+    # names are always the numeric article_id.
     for idx, doc in pdf_docs:
         try:
-            render_pdf(doc.path, str(tiles_dir))
+            render_pdf(doc.path, str(tiles_dir), stem=str(idx))
         except Exception as e:
             logger.warning("  FAILED PDF %s: %s", doc.id, e)
     if pdf_docs:
         logger.info("  Rendered %d PDFs", len(pdf_docs))
 
-    # Save articles.json for serve API — title + URL per article.
-    # Use the pipeline's sequential *position index* (0, 1, 2, …) rather than
-    # int(a["id"]), because local sources use filename stems (e.g. "art_alice")
-    # as doc IDs, which are not numeric. int() on a filename stem raises ValueError
-    # and crashes the entire index build step.
+    # Write article_id into each tile directory's manifest so the embed
+    # pipeline can read it explicitly instead of guessing from the directory
+    # name. This is the authoritative source of article_id — directory names
+    # are for humans, manifests are for the pipeline.
+    tile_dir_map: dict[int, Path] = {}
+    for idx, _ in url_docs + pdf_docs + image_docs:
+        tile_path = tiles_dir / f"{idx}.png.tiles"
+        if tile_path.is_dir():
+            tile_dir_map[idx] = tile_path
+    for idx, tile_path in tile_dir_map.items():
+        for manifest_name in ("tiles.json", "chunks.json"):
+            manifest_path = tile_path / manifest_name
+            if manifest_path.exists():
+                try:
+                    manifest = json.loads(manifest_path.read_text())
+                    manifest["article_id"] = idx
+                    manifest_path.write_text(json.dumps(manifest))
+                except (json.JSONDecodeError, OSError):
+                    pass
+
+    # Save articles.json for serve API — maps article_id (array index) to
+    # human-readable title + URL.
     articles_path = output / "articles.json"
     article_entries = []
     for enum_idx, a in enumerate(articles):
@@ -110,7 +128,6 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path:
         if not title and a.get("url"):
             title = a["url"].split("/")[-1].replace("_", " ").replace("%20", " ")
         if not title:
-            # Fall back to original doc id (e.g. filename stem) as display title
             title = a.get("id", str(enum_idx))
         url = a.get("url", "") or a.get("path", "")
         article_entries.append({"title": title, "url": url})
diff --git a/render/src/pixelrag_render/backends/pdf.py b/render/src/pixelrag_render/backends/pdf.py
index 17f306c..1c3d04e 100644
--- a/render/src/pixelrag_render/backends/pdf.py
+++ b/render/src/pixelrag_render/backends/pdf.py
@@ -20,6 +20,7 @@ def render_pdf(
     dpi: int = 200,
     pages: Optional[list[int]] = None,
     quality: int = 85,
+    stem: str | None = None,
 ) -> list[Path]:
     """Render a PDF to JPEG tiles.
 
@@ -32,6 +33,9 @@ def render_pdf(
         dpi: Resolution for rendering (default 200 gives ~1650×2200px for A4).
         pages: 1-based list of page numbers to render. ``None`` renders all pages.
         quality: JPEG quality 1-100 (default 85).
+        stem: Override for the tile directory name. Defaults to the PDF filename
+            stem. The pipeline passes the article_id here so directory names
+            are always numeric and consistent with articles.json.
 
     Returns:
         List containing the single tile directory Path on success.
@@ -55,7 +59,8 @@ def render_pdf(
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    stem = path.stem
+    if stem is None:
+        stem = path.stem
     tile_dir = output_dir / f"{stem}.png.tiles"
     tile_dir.mkdir(parents=True, exist_ok=True)
 
diff --git a/render/src/pixelrag_render/render.py b/render/src/pixelrag_render/render.py
index 554765d..0680ee8 100644
--- a/render/src/pixelrag_render/render.py
+++ b/render/src/pixelrag_render/render.py
@@ -115,6 +115,7 @@ def render_pdf(
     dpi: int = 200,
     pages: Optional[list[int]] = None,
     quality: int = 85,
+    stem: str | None = None,
 ) -> list[Path]:
     """Render a PDF file to tiled JPEG images.
 
@@ -124,13 +125,14 @@ def render_pdf(
         dpi: Rendering resolution (default 200 ≈ 1650×2200 for A4).
         pages: 1-based list of page numbers to render. ``None`` renders all.
         quality: JPEG quality 1-100 (default 85).
+        stem: Override for the tile directory name (default: PDF filename stem).
 
     Returns:
         List containing the tile directory Path on success.
     """
     from .backends.pdf import render_pdf as _render_pdf
 
-    return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality)
+    return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality, stem=stem)
 
 
 def render_file(
diff --git a/tests/test_article_id.py b/tests/test_article_id.py
new file mode 100644
index 0000000..f242b86
--- /dev/null
+++ b/tests/test_article_id.py
@@ -0,0 +1,86 @@
+"""Tests for the article_id manifest contract.
+
+The pipeline writes article_id into tiles.json/chunks.json so the embed
+pipeline reads it explicitly instead of guessing from directory names.
+"""
+
+import json
+from pathlib import Path
+
+from pixelrag_embed.embed_cpu import scan_chunks
+
+
+def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None):
+    """Create a minimal tile directory with tiles.json and chunks.json."""
+    td = base / f"{dir_name}.png.tiles"
+    td.mkdir(parents=True)
+
+    from PIL import Image
+
+    img = Image.new("RGB", (875, 500))
+    img.save(td / "tile_0000.png")
+
+    tiles_meta = {"tiles": ["tile_0000.png"], "complete": True}
+    if article_id is not None:
+        tiles_meta["article_id"] = article_id
+    (td / "tiles.json").write_text(json.dumps(tiles_meta))
+
+    chunks_meta = {
+        "chunks": [
+            {
+                "tile": "tile_0000.png",
+                "tile_index": 0,
+                "chunk_index": 0,
+                "file": "tile_0000.png",
+                "x_offset": 0,
+                "y_offset": 0,
+                "height": 500,
+                "width": 875,
+            }
+        ],
+    }
+    if article_id is not None:
+        chunks_meta["article_id"] = article_id
+    (td / "chunks.json").write_text(json.dumps(chunks_meta))
+    return td
+
+
+def test_article_id_from_manifest(tmp_path):
+    """When article_id is in the manifest, embed reads it (not the dir name)."""
+    _make_tile_dir(tmp_path, "report", article_id=0)
+    items = scan_chunks(str(tmp_path))
+    assert len(items) == 1
+    assert items[0]["article_id"] == 0
+
+
+def test_article_id_fallback_to_dir_name(tmp_path):
+    """Without article_id in manifest, fall back to parsing directory name."""
+    _make_tile_dir(tmp_path, "42", article_id=None)
+    items = scan_chunks(str(tmp_path))
+    assert len(items) == 1
+    assert items[0]["article_id"] == 42
+
+
+def test_non_numeric_dir_without_manifest_id(tmp_path):
+    """Non-numeric dir name + no manifest article_id → hash fallback."""
+    _make_tile_dir(tmp_path, "my_report", article_id=None)
+    items = scan_chunks(str(tmp_path))
+    assert len(items) == 1
+    assert isinstance(items[0]["article_id"], int)
+
+
+def test_manifest_id_overrides_dir_name(tmp_path):
+    """Manifest article_id wins over directory name even if dir name is numeric."""
+    _make_tile_dir(tmp_path, "999", article_id=7)
+    items = scan_chunks(str(tmp_path))
+    assert len(items) == 1
+    assert items[0]["article_id"] == 7
+
+
+def test_multiple_articles_distinct_ids(tmp_path):
+    """Multiple tile dirs get distinct article_ids from manifests."""
+    _make_tile_dir(tmp_path, "report", article_id=0)
+    _make_tile_dir(tmp_path, "slides", article_id=1)
+    items = scan_chunks(str(tmp_path))
+    ids = {it["article_id"] for it in items}
+    assert ids == {0, 1}

From ddd2589d1bc6202615ea948215274bc5f36e1fbb Mon Sep 17 00:00:00 2001
From: Zhifei Li <andylizf@outlook.com>
Date: Tue, 23 Jun 2026 20:36:25 -0700
Subject: [PATCH 2/2] fix(index): make article_id manifest contract actually
 reach the GPU path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses review of #83 — the manifest contract was a leaky abstraction on the
default GPU chunks path; it only worked because dir names happened to be numeric.

Three fixes:

1. chunk.py now propagates article_id from tiles.json into chunks.json. Before,
   the pipeline wrote article_id into chunks.json at Stage 1 — but for URLs that
   file doesn't exist until Stage 2 (chunk.py), so the write was a silent no-op,
   and chunk.py rebuilt chunks.json without article_id. The GPU embedder
   (scan_shard_chunks, reads only chunks.json) therefore never saw it and always
   fell back to dir-name parsing → non-numeric dirs (PDF/local rendered
   standalone) were skipped with only a log warning. chunk.py is the right
   place: it already reads tiles.json, so it carries article_id forward.

2. scan_shard_chunks (GPU) now falls back to the sibling tiles.json before the
   dir name, matching the CPU embedder — defense in depth for chunks.json built
   before this change.

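3. embed_cpu non-numeric fallback uses a stable sha1-based id instead of the
   builtin hash(), which is salted per-process (PYTHONHASHSEED) and produced a
   different article_id every build → index misaligned with articles.json and
   non-reproducible. Note the id range changes: the new id is the first 8 hex
   digits of the digest (32 bits), whereas the old fallback was reduced
   mod 2**31.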

Tests rewritten to exercise the real flow (render tiles.json -> real chunk ->
real GPU/CPU scan) instead of a hand-built chunks.json the pipeline never emits,
and to assert the fallback id is the exact stable value.
---
 embed/src/pixelrag_embed/chunk.py     |   3 +
 embed/src/pixelrag_embed/embed.py     |   9 ++
 embed/src/pixelrag_embed/embed_cpu.py |   8 +-
 index/src/pixelrag_index/pipelines.py |  17 ++--
 tests/test_article_id.py              | 135 ++++++++++++++------------
 5 files changed, 101 insertions(+), 71 deletions(-)

diff --git a/embed/src/pixelrag_embed/chunk.py b/embed/src/pixelrag_embed/chunk.py
index c8f9f01..95ac0c0 100644
--- a/embed/src/pixelrag_embed/chunk.py
+++ b/embed/src/pixelrag_embed/chunk.py
@@ -127,6 +127,7 @@ def chunk_article(article_dir: str, dry_run: bool = False, force: bool = False)
     page_height = meta.get("page_height", 0)
     viewport_width = meta.get("viewport_width", 875)
     tile_height = meta.get("tile_height", 8192)
+    article_id = meta.get("article_id")  # propagate from tiles.json into chunks.json
 
     chunks_info = []  # list of {tile, chunk_index, file, y_offset, height}
     files_written = 0
@@ -229,6 +230,8 @@ def chunk_article(article_dir: str, dry_run: bool = False, force: bool = False)
         "tile_hashes": tile_hashes,
         "chunks": chunks_info,
     }
+    if article_id is not None:
+        manifest["article_id"] = article_id
 
     if not dry_run:
         with open(chunks_json, "w") as f:
diff --git a/embed/src/pixelrag_embed/embed.py b/embed/src/pixelrag_embed/embed.py
index af3afc7..9c86315 100644
--- a/embed/src/pixelrag_embed/embed.py
+++ b/embed/src/pixelrag_embed/embed.py
@@ -345,6 +345,15 @@ def scan_shard_chunks(
             continue
 
         article_id = meta.get("article_id")
+        if article_id is None:
+            # chunks.json predates the article_id contract — try the sibling
+            # tiles.json (CPU embedder does the same), then the directory name.
+            tiles_json = tiles_dir / "tiles.json"
+            if tiles_json.exists():
+                try:
+                    article_id = json.loads(tiles_json.read_text()).get("article_id")
+                except (json.JSONDecodeError, OSError):
+                    pass
         if article_id is None:
             dir_name = tiles_dir.name
             try:
diff --git a/embed/src/pixelrag_embed/embed_cpu.py b/embed/src/pixelrag_embed/embed_cpu.py
index 2cba3bf..21e7050 100644
--- a/embed/src/pixelrag_embed/embed_cpu.py
+++ b/embed/src/pixelrag_embed/embed_cpu.py
@@ -15,6 +15,7 @@
 """
 
 import argparse
+import hashlib
 import json
 import logging
 import os
@@ -97,7 +98,12 @@ def scan_chunks(shard_dir: str) -> list[dict]:
                 try:
                     article_id = int(article_id_str)
                 except ValueError:
-                    article_id = hash(article_id_str) % (2**31)
+                    # Non-numeric dir name with no manifest article_id. Use a
+                    # stable hash (builtin hash() is salted by PYTHONHASHSEED and
+                    # would give a different id every build -> non-reproducible
+                    # index). sha1 keeps the same id for the same dir name.
+                    digest = hashlib.sha1(article_id_str.encode()).hexdigest()
+                    article_id = int(digest[:8], 16)
 
             if chunks_json.exists():
                 with open(chunks_json) as f:
diff --git a/index/src/pixelrag_index/pipelines.py b/index/src/pixelrag_index/pipelines.py
index 1ab7593..6feb7ed 100644
--- a/index/src/pixelrag_index/pipelines.py
+++ b/index/src/pixelrag_index/pipelines.py
@@ -99,18 +99,15 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path:
     if pdf_docs:
         logger.info("  Rendered %d PDFs", len(pdf_docs))
 
-    # Write article_id into each tile directory's manifest so the embed
-    # pipeline can read it explicitly instead of guessing from the directory
-    # name. This is the authoritative source of article_id — directory names
-    # are for humans, manifests are for the pipeline.
-    tile_dir_map: dict[int, Path] = {}
+    # Write article_id into each tile directory's manifests so the embed
+    # pipeline reads it explicitly instead of guessing from the directory name.
+    # tiles.json always exists here. chunks.json exists only for PDFs (pdf.py
+    # writes it at render time, and chunk.py then skips those dirs); for URLs it
+    # is created by Stage 2's chunk.py, which propagates article_id from
+    # tiles.json. So write whichever manifests exist now.
     for idx, _ in url_docs + pdf_docs + image_docs:
-        tile_path = tiles_dir / f"{idx}.png.tiles"
-        if tile_path.is_dir():
-            tile_dir_map[idx] = tile_path
-    for idx, tile_path in tile_dir_map.items():
         for manifest_name in ("tiles.json", "chunks.json"):
-            manifest_path = tile_path / manifest_name
+            manifest_path = tiles_dir / f"{idx}.png.tiles" / manifest_name
             if manifest_path.exists():
                 try:
                     manifest = json.loads(manifest_path.read_text())
diff --git a/tests/test_article_id.py b/tests/test_article_id.py
index f242b86..55ce805 100644
--- a/tests/test_article_id.py
+++ b/tests/test_article_id.py
@@ -1,86 +1,101 @@
 """Tests for the article_id manifest contract.
 
-The pipeline writes article_id into tiles.json/chunks.json so the embed
-pipeline reads it explicitly instead of guessing from directory names.
+The pipeline writes article_id into tiles.json. chunk.py propagates it into
+chunks.json. Both embedders (GPU scan_shard_chunks, CPU scan_chunks) read it
+from the manifest, falling back to the directory name only for legacy indexes.
 """
 
 import json
 from pathlib import Path
 
+from PIL import Image
+
+from pixelrag_embed.chunk import chunk_article
+from pixelrag_embed.embed import scan_shard_chunks
 from pixelrag_embed.embed_cpu import scan_chunks
 
 
-def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None):
-    """Create a minimal tile directory with tiles.json and chunks.json."""
+def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None) -> Path:
+    """A tile dir with only tiles.json (as it exists right after rendering)."""
     td = base / f"{dir_name}.png.tiles"
     td.mkdir(parents=True)
+    Image.new("RGB", (875, 500)).save(td / "tile_0000.png")
+    meta = {"tiles": ["tile_0000.png"], "tile_height": 8192, "complete": True}
+    if article_id is not None:
+        meta["article_id"] = article_id
+    (td / "tiles.json").write_text(json.dumps(meta))
+    return td
 
-    from PIL import Image
 
-    img = Image.new("RGB", (875, 500))
-    img.save(td / "tile_0000.png")
+def _read_chunks_article_id(td: Path):
+    return json.loads((td / "chunks.json").read_text()).get("article_id")
 
-    tiles_meta = {"tiles": ["tile_0000.png"], "complete": True}
-    if article_id is not None:
-        tiles_meta["article_id"] = article_id
-    (td / "tiles.json").write_text(json.dumps(tiles_meta))
-
-    chunks_meta = {
-        "chunks": [
-            {
-                "tile": "tile_0000.png",
-                "tile_index": 0,
-                "chunk_index": 0,
-                "file": "tile_0000.png",
-                "x_offset": 0,
-                "y_offset": 0,
-                "height": 500,
-                "width": 875,
-            }
-        ],
-    }
-    if article_id is not None:
-        chunks_meta["article_id"] = article_id
-    (td / "chunks.json").write_text(json.dumps(chunks_meta))
-    return td
 
+# --- chunk.py propagates article_id from tiles.json into chunks.json ----------
 
-def test_article_id_from_manifest(tmp_path):
-    """When article_id is in the manifest, embed reads it (not the dir name)."""
-    _make_tile_dir(tmp_path, "report", article_id=0)
-    items = scan_chunks(str(tmp_path))
-    assert len(items) == 1
-    assert items[0]["article_id"] == 0
 
+def test_chunk_propagates_article_id_to_chunks_json(tmp_path):
+    td = _make_tile_dir(tmp_path, "report", article_id=0)
+    chunk_article(str(td))
+    # This is the real data flow the GPU embedder depends on.
+    assert _read_chunks_article_id(td) == 0
 
-def test_article_id_fallback_to_dir_name(tmp_path):
-    """Without article_id in manifest, fall back to parsing directory name."""
-    _make_tile_dir(tmp_path, "42", article_id=None)
-    items = scan_chunks(str(tmp_path))
-    assert len(items) == 1
-    assert items[0]["article_id"] == 42
 
+def test_chunk_without_article_id_omits_it(tmp_path):
+    td = _make_tile_dir(tmp_path, "5", article_id=None)
+    chunk_article(str(td))
+    assert "article_id" not in json.loads((td / "chunks.json").read_text())
 
-def test_non_numeric_dir_without_manifest_id(tmp_path):
-    """Non-numeric dir name + no manifest article_id → hash fallback."""
-    _make_tile_dir(tmp_path, "my_report", article_id=None)
-    items = scan_chunks(str(tmp_path))
-    assert len(items) == 1
-    assert isinstance(items[0]["article_id"], int)
 
+# --- GPU embedder (scan_shard_chunks) reads it end-to-end --------------------
 
-def test_manifest_id_overrides_dir_name(tmp_path):
-    """Manifest article_id wins over directory name even if dir name is numeric."""
-    _make_tile_dir(tmp_path, "999", article_id=7)
-    items = scan_chunks(str(tmp_path))
-    assert len(items) == 1
-    assert items[0]["article_id"] == 7
 
+def test_gpu_scan_reads_propagated_article_id(tmp_path):
+    # Non-numeric dir name: only the manifest can supply the right id.
+    td = _make_tile_dir(tmp_path, "report", article_id=3)
+    chunk_article(str(td))
+    chunks = scan_shard_chunks(str(tmp_path))
+    assert chunks and all(c.article_id == 3 for c in chunks)
 
-def test_multiple_articles_distinct_ids(tmp_path):
-    """Multiple tile dirs get distinct article_ids from manifests."""
-    _make_tile_dir(tmp_path, "report", article_id=0)
-    _make_tile_dir(tmp_path, "slides", article_id=1)
+
+def test_gpu_scan_falls_back_to_tiles_json(tmp_path):
+    # chunks.json lacks article_id (legacy chunker) but tiles.json has it.
+    td = _make_tile_dir(tmp_path, "report", article_id=4)
+    chunk_article(str(td))
+    chunks_json = td / "chunks.json"
+    meta = json.loads(chunks_json.read_text())
+    meta.pop("article_id")
+    chunks_json.write_text(json.dumps(meta))
+    chunks = scan_shard_chunks(str(tmp_path))
+    assert chunks and all(c.article_id == 4 for c in chunks)
+
+
+def test_gpu_scan_falls_back_to_numeric_dir_name(tmp_path):
+    td = _make_tile_dir(tmp_path, "42", article_id=None)
+    chunk_article(str(td))
+    chunks = scan_shard_chunks(str(tmp_path))
+    assert chunks and all(c.article_id == 42 for c in chunks)
+
+
+# --- CPU embedder (scan_chunks) ---------------------------------------------
+
+
+def test_cpu_scan_reads_article_id_from_manifest(tmp_path):
+    td = _make_tile_dir(tmp_path, "report", article_id=7)
+    chunk_article(str(td))
     items = scan_chunks(str(tmp_path))
-    ids = {it["article_id"] for it in items}
-    assert ids == {0, 1}
+    assert items and all(it["article_id"] == 7 for it in items)
+
+
+def test_cpu_non_numeric_fallback_is_reproducible(tmp_path):
+    # No manifest id, non-numeric dir → must be a *stable* hash, not the salted
+    # builtin hash() (which changes per process via PYTHONHASHSEED and would make
+    # the index non-reproducible). Assert the exact sha1-derived value so a
+    # regression back to builtin hash() fails here.
+    import hashlib
+
+    td = _make_tile_dir(tmp_path, "my_report", article_id=None)
+    chunk_article(str(td))
+    got = scan_chunks(str(tmp_path))[0]["article_id"]
+    expected = int(hashlib.sha1(b"my_report").hexdigest()[:8], 16)
+    assert got == expected