From 28c9b5c42004d5810360bdec7de56f742e320460 Mon Sep 17 00:00:00 2001 From: Zhifei Li Date: Tue, 23 Jun 2026 06:00:25 -0700 Subject: [PATCH 1/2] fix(index): write article_id into tile manifests, stop guessing from dir names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The embed pipeline extracted article_id by parsing the tile directory name (e.g. "3104240.png.tiles" → int("3104240")). This broke for PDFs (directory named after the filename stem, e.g. "report.png.tiles" → int("report") fails) and for any non-numeric directory name. GPU embed logged a warning and skipped the tile; CPU embed used a hash fallback that produced IDs misaligned with articles.json. Root cause: article_id was never explicitly communicated from the pipeline to the embed stage — embed had to reverse-engineer it from the filesystem. Fix: the pipeline now writes article_id into tiles.json and chunks.json after rendering. Embed reads it from the manifest first, falling back to directory name parsing for backward compatibility with existing large-scale indexes (e.g. the Wikipedia corpus where dir names are already numeric). Also: render_pdf gains a stem parameter so the pipeline can name PDF tile directories by position index (like URLs), making directory names consistent across all source types. Follows the same sidecar-metadata pattern used by ColPali (JSONL manifest mapping FAISS IDs to doc metadata) and LEANN (passage_id_scheme + ids.txt/offset map). See also Rulin Shao's MassiveDS (shared shard IDs). 
--- embed/src/pixelrag_embed/embed.py | 29 ++++---- embed/src/pixelrag_embed/embed_cpu.py | 24 ++++-- index/src/pixelrag_index/pipelines.py | 33 +++++++-- render/src/pixelrag_render/backends/pdf.py | 7 +- render/src/pixelrag_render/render.py | 4 +- tests/test_article_id.py | 86 ++++++++++++++++++++++ 6 files changed, 153 insertions(+), 30 deletions(-) create mode 100644 tests/test_article_id.py diff --git a/embed/src/pixelrag_embed/embed.py b/embed/src/pixelrag_embed/embed.py index e210b9a..af3afc7 100644 --- a/embed/src/pixelrag_embed/embed.py +++ b/embed/src/pixelrag_embed/embed.py @@ -264,13 +264,14 @@ def scan_shard_tiles( if not meta.get("complete", False): continue - # Extract article_id from directory name: "3104240.png.tiles" -> 3104240 - dir_name = tiles_dir.name # e.g. "3104240.png.tiles" - try: - article_id = int(dir_name.split(".")[0]) - except (ValueError, IndexError): - logger.warning("Cannot parse article_id from %s", dir_name) - continue + article_id = meta.get("article_id") + if article_id is None: + dir_name = tiles_dir.name + try: + article_id = int(dir_name.split(".")[0]) + except (ValueError, IndexError): + logger.warning("Cannot parse article_id from %s", dir_name) + continue if article_id in skip: continue @@ -343,12 +344,14 @@ def scan_shard_chunks( logger.warning("Skipping %s: %s", chunks_json, e) continue - dir_name = tiles_dir.name - try: - article_id = int(dir_name.split(".")[0]) - except (ValueError, IndexError): - logger.warning("Cannot parse article_id from %s", dir_name) - continue + article_id = meta.get("article_id") + if article_id is None: + dir_name = tiles_dir.name + try: + article_id = int(dir_name.split(".")[0]) + except (ValueError, IndexError): + logger.warning("Cannot parse article_id from %s", dir_name) + continue if article_id in skip: continue diff --git a/embed/src/pixelrag_embed/embed_cpu.py b/embed/src/pixelrag_embed/embed_cpu.py index 3b717af..2cba3bf 100644 --- a/embed/src/pixelrag_embed/embed_cpu.py +++ 
b/embed/src/pixelrag_embed/embed_cpu.py @@ -79,16 +79,26 @@ def scan_chunks(shard_dir: str) -> list[dict]: ) for td in tile_dirs: - dir_name = td.name - article_id_str = dir_name.replace(".png.tiles", "") - try: - article_id = int(article_id_str) - except ValueError: - article_id = hash(article_id_str) % (2**31) - chunks_json = td / "chunks.json" tiles_json = td / "tiles.json" + # Read article_id from the manifest (written by the pipeline). + # Fall back to parsing the directory name for backward compat + # with indexes built before this change. + article_id = None + for mf in (chunks_json, tiles_json): + if mf.exists() and article_id is None: + try: + article_id = json.loads(mf.read_text()).get("article_id") + except (json.JSONDecodeError, OSError): + pass + if article_id is None: + article_id_str = td.name.replace(".png.tiles", "") + try: + article_id = int(article_id_str) + except ValueError: + article_id = hash(article_id_str) % (2**31) + if chunks_json.exists(): with open(chunks_json) as f: manifest = json.load(f) diff --git a/index/src/pixelrag_index/pipelines.py b/index/src/pixelrag_index/pipelines.py index f943ec8..1ab7593 100644 --- a/index/src/pixelrag_index/pipelines.py +++ b/index/src/pixelrag_index/pipelines.py @@ -89,20 +89,38 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path: " Rendered %d URLs (%d skipped, already exist)", len(new_url_docs), skipped ) - # Render PDFs + # Render PDFs — use idx as tile directory name (like URLs) so directory + # names are always the numeric article_id. for idx, doc in pdf_docs: try: - render_pdf(doc.path, str(tiles_dir)) + render_pdf(doc.path, str(tiles_dir), stem=str(idx)) except Exception as e: logger.warning(" FAILED PDF %s: %s", doc.id, e) if pdf_docs: logger.info(" Rendered %d PDFs", len(pdf_docs)) - # Save articles.json for serve API — title + URL per article. 
- # Use the pipeline's sequential *position index* (0, 1, 2, …) rather than - # int(a["id"]), because local sources use filename stems (e.g. "art_alice") - # as doc IDs, which are not numeric. int() on a filename stem raises ValueError - # and crashes the entire index build step. + # Write article_id into each tile directory's manifest so the embed + # pipeline can read it explicitly instead of guessing from the directory + # name. This is the authoritative source of article_id — directory names + # are for humans, manifests are for the pipeline. + tile_dir_map: dict[int, Path] = {} + for idx, _ in url_docs + pdf_docs + image_docs: + tile_path = tiles_dir / f"{idx}.png.tiles" + if tile_path.is_dir(): + tile_dir_map[idx] = tile_path + for idx, tile_path in tile_dir_map.items(): + for manifest_name in ("tiles.json", "chunks.json"): + manifest_path = tile_path / manifest_name + if manifest_path.exists(): + try: + manifest = json.loads(manifest_path.read_text()) + manifest["article_id"] = idx + manifest_path.write_text(json.dumps(manifest)) + except (json.JSONDecodeError, OSError): + pass + + # Save articles.json for serve API — maps article_id (array index) to + # human-readable title + URL. articles_path = output / "articles.json" article_entries = [] for enum_idx, a in enumerate(articles): @@ -110,7 +128,6 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path: if not title and a.get("url"): title = a["url"].split("/")[-1].replace("_", " ").replace("%20", " ") if not title: - # Fall back to original doc id (e.g. 
filename stem) as display title title = a.get("id", str(enum_idx)) url = a.get("url", "") or a.get("path", "") article_entries.append({"title": title, "url": url}) diff --git a/render/src/pixelrag_render/backends/pdf.py b/render/src/pixelrag_render/backends/pdf.py index 17f306c..1c3d04e 100644 --- a/render/src/pixelrag_render/backends/pdf.py +++ b/render/src/pixelrag_render/backends/pdf.py @@ -20,6 +20,7 @@ def render_pdf( dpi: int = 200, pages: Optional[list[int]] = None, quality: int = 85, + stem: str | None = None, ) -> list[Path]: """Render a PDF to JPEG tiles. @@ -32,6 +33,9 @@ def render_pdf( dpi: Resolution for rendering (default 200 gives ~1650×2200px for A4). pages: 1-based list of page numbers to render. ``None`` renders all pages. quality: JPEG quality 1-100 (default 85). + stem: Override for the tile directory name. Defaults to the PDF filename + stem. The pipeline passes the article_id here so directory names + are always numeric and consistent with articles.json. Returns: List containing the single tile directory Path on success. @@ -55,7 +59,8 @@ def render_pdf( output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - stem = path.stem + if stem is None: + stem = path.stem tile_dir = output_dir / f"{stem}.png.tiles" tile_dir.mkdir(parents=True, exist_ok=True) diff --git a/render/src/pixelrag_render/render.py b/render/src/pixelrag_render/render.py index 554765d..0680ee8 100644 --- a/render/src/pixelrag_render/render.py +++ b/render/src/pixelrag_render/render.py @@ -115,6 +115,7 @@ def render_pdf( dpi: int = 200, pages: Optional[list[int]] = None, quality: int = 85, + stem: str | None = None, ) -> list[Path]: """Render a PDF file to tiled JPEG images. @@ -124,13 +125,14 @@ def render_pdf( dpi: Rendering resolution (default 200 ≈ 1650×2200 for A4). pages: 1-based list of page numbers to render. ``None`` renders all. quality: JPEG quality 1-100 (default 85). + stem: Override for the tile directory name (default: PDF filename stem). 
Returns: List containing the tile directory Path on success. """ from .backends.pdf import render_pdf as _render_pdf - return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality) + return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality, stem=stem) def render_file( diff --git a/tests/test_article_id.py b/tests/test_article_id.py new file mode 100644 index 0000000..f242b86 --- /dev/null +++ b/tests/test_article_id.py @@ -0,0 +1,86 @@ +"""Tests for the article_id manifest contract. + +The pipeline writes article_id into tiles.json/chunks.json so the embed +pipeline reads it explicitly instead of guessing from directory names. +""" + +import json +from pathlib import Path + +from pixelrag_embed.embed_cpu import scan_chunks + + +def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None): + """Create a minimal tile directory with tiles.json and chunks.json.""" + td = base / f"{dir_name}.png.tiles" + td.mkdir(parents=True) + + from PIL import Image + + img = Image.new("RGB", (875, 500)) + img.save(td / "tile_0000.png") + + tiles_meta = {"tiles": ["tile_0000.png"], "complete": True} + if article_id is not None: + tiles_meta["article_id"] = article_id + (td / "tiles.json").write_text(json.dumps(tiles_meta)) + + chunks_meta = { + "chunks": [ + { + "tile": "tile_0000.png", + "tile_index": 0, + "chunk_index": 0, + "file": "tile_0000.png", + "x_offset": 0, + "y_offset": 0, + "height": 500, + "width": 875, + } + ], + } + if article_id is not None: + chunks_meta["article_id"] = article_id + (td / "chunks.json").write_text(json.dumps(chunks_meta)) + return td + + +def test_article_id_from_manifest(tmp_path): + """When article_id is in the manifest, embed reads it (not the dir name).""" + _make_tile_dir(tmp_path, "report", article_id=0) + items = scan_chunks(str(tmp_path)) + assert len(items) == 1 + assert items[0]["article_id"] == 0 + + +def test_article_id_fallback_to_dir_name(tmp_path): + """Without article_id in manifest, 
fall back to parsing directory name.""" + _make_tile_dir(tmp_path, "42", article_id=None) + items = scan_chunks(str(tmp_path)) + assert len(items) == 1 + assert items[0]["article_id"] == 42 + + +def test_non_numeric_dir_without_manifest_id(tmp_path): + """Non-numeric dir name + no manifest article_id → hash fallback.""" + _make_tile_dir(tmp_path, "my_report", article_id=None) + items = scan_chunks(str(tmp_path)) + assert len(items) == 1 + assert isinstance(items[0]["article_id"], int) + + +def test_manifest_id_overrides_dir_name(tmp_path): + """Manifest article_id wins over directory name even if dir name is numeric.""" + _make_tile_dir(tmp_path, "999", article_id=7) + items = scan_chunks(str(tmp_path)) + assert len(items) == 1 + assert items[0]["article_id"] == 7 + + +def test_multiple_articles_distinct_ids(tmp_path): + """Multiple tile dirs get distinct article_ids from manifests.""" + _make_tile_dir(tmp_path, "report", article_id=0) + _make_tile_dir(tmp_path, "slides", article_id=1) + items = scan_chunks(str(tmp_path)) + ids = {it["article_id"] for it in items} + assert ids == {0, 1} From ddd2589d1bc6202615ea948215274bc5f36e1fbb Mon Sep 17 00:00:00 2001 From: Zhifei Li Date: Tue, 23 Jun 2026 20:36:25 -0700 Subject: [PATCH 2/2] fix(index): make article_id manifest contract actually reach the GPU path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review of #83 — the manifest contract was a leaky abstraction on the default GPU chunks path; it only worked because dir names happened to be numeric. Three fixes: 1. chunk.py now propagates article_id from tiles.json into chunks.json. Before, the pipeline wrote article_id into chunks.json at Stage 1 — but for URLs that file doesn't exist until Stage 2 (chunk.py), so the write was a silent no-op, and chunk.py rebuilt chunks.json without article_id. 
The GPU embedder (scan_shard_chunks, reads only chunks.json) therefore never saw it and always fell back to dir-name parsing → non-numeric dirs (PDF/local rendered standalone) were silently skipped. chunk.py is the right place: it already reads tiles.json, so it carries article_id forward. 2. scan_shard_chunks (GPU) now falls back to the sibling tiles.json before the dir name, matching the CPU embedder — defense in depth for chunks.json built before this change. 3. embed_cpu non-numeric fallback uses a stable sha1-based id instead of the builtin hash(), which is salted per-process (PYTHONHASHSEED) and produced a different article_id every build → index misaligned with articles.json and non-reproducible. Tests rewritten to exercise the real flow (render tiles.json -> real chunk -> real GPU/CPU scan) instead of a hand-built chunks.json the pipeline never emits, and to assert the fallback id is the exact stable value. --- embed/src/pixelrag_embed/chunk.py | 3 + embed/src/pixelrag_embed/embed.py | 9 ++ embed/src/pixelrag_embed/embed_cpu.py | 8 +- index/src/pixelrag_index/pipelines.py | 17 ++-- tests/test_article_id.py | 135 ++++++++++++++------------ 5 files changed, 101 insertions(+), 71 deletions(-) diff --git a/embed/src/pixelrag_embed/chunk.py b/embed/src/pixelrag_embed/chunk.py index c8f9f01..95ac0c0 100644 --- a/embed/src/pixelrag_embed/chunk.py +++ b/embed/src/pixelrag_embed/chunk.py @@ -127,6 +127,7 @@ def chunk_article(article_dir: str, dry_run: bool = False, force: bool = False) page_height = meta.get("page_height", 0) viewport_width = meta.get("viewport_width", 875) tile_height = meta.get("tile_height", 8192) + article_id = meta.get("article_id") # propagate from tiles.json into chunks.json chunks_info = [] # list of {tile, chunk_index, file, y_offset, height} files_written = 0 @@ -229,6 +230,8 @@ def chunk_article(article_dir: str, dry_run: bool = False, force: bool = False) "tile_hashes": tile_hashes, "chunks": chunks_info, } + if article_id is not None: + 
manifest["article_id"] = article_id if not dry_run: with open(chunks_json, "w") as f: diff --git a/embed/src/pixelrag_embed/embed.py b/embed/src/pixelrag_embed/embed.py index af3afc7..9c86315 100644 --- a/embed/src/pixelrag_embed/embed.py +++ b/embed/src/pixelrag_embed/embed.py @@ -345,6 +345,15 @@ def scan_shard_chunks( continue article_id = meta.get("article_id") + if article_id is None: + # chunks.json predates the article_id contract — try the sibling + # tiles.json (CPU embedder does the same), then the directory name. + tiles_json = tiles_dir / "tiles.json" + if tiles_json.exists(): + try: + article_id = json.loads(tiles_json.read_text()).get("article_id") + except (json.JSONDecodeError, OSError): + pass if article_id is None: dir_name = tiles_dir.name try: diff --git a/embed/src/pixelrag_embed/embed_cpu.py b/embed/src/pixelrag_embed/embed_cpu.py index 2cba3bf..21e7050 100644 --- a/embed/src/pixelrag_embed/embed_cpu.py +++ b/embed/src/pixelrag_embed/embed_cpu.py @@ -15,6 +15,7 @@ """ import argparse +import hashlib import json import logging import os @@ -97,7 +98,12 @@ def scan_chunks(shard_dir: str) -> list[dict]: try: article_id = int(article_id_str) except ValueError: - article_id = hash(article_id_str) % (2**31) + # Non-numeric dir name with no manifest article_id. Use a + # stable hash (builtin hash() is salted by PYTHONHASHSEED and + # would give a different id every build -> non-reproducible + # index). sha1 keeps the same id for the same dir name. 
+ digest = hashlib.sha1(article_id_str.encode()).hexdigest() + article_id = int(digest[:8], 16) if chunks_json.exists(): with open(chunks_json) as f: diff --git a/index/src/pixelrag_index/pipelines.py b/index/src/pixelrag_index/pipelines.py index 1ab7593..6feb7ed 100644 --- a/index/src/pixelrag_index/pipelines.py +++ b/index/src/pixelrag_index/pipelines.py @@ -99,18 +99,15 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path: if pdf_docs: logger.info(" Rendered %d PDFs", len(pdf_docs)) - # Write article_id into each tile directory's manifest so the embed - # pipeline can read it explicitly instead of guessing from the directory - # name. This is the authoritative source of article_id — directory names - # are for humans, manifests are for the pipeline. - tile_dir_map: dict[int, Path] = {} + # Write article_id into each tile directory's manifests so the embed + # pipeline reads it explicitly instead of guessing from the directory name. + # tiles.json always exists here. chunks.json exists only for PDFs (pdf.py + # writes it at render time, and chunk.py then skips those dirs); for URLs it + # is created by Stage 2's chunk.py, which propagates article_id from + # tiles.json. So write whichever manifests exist now. for idx, _ in url_docs + pdf_docs + image_docs: - tile_path = tiles_dir / f"{idx}.png.tiles" - if tile_path.is_dir(): - tile_dir_map[idx] = tile_path - for idx, tile_path in tile_dir_map.items(): for manifest_name in ("tiles.json", "chunks.json"): - manifest_path = tile_path / manifest_name + manifest_path = tiles_dir / f"{idx}.png.tiles" / manifest_name if manifest_path.exists(): try: manifest = json.loads(manifest_path.read_text()) diff --git a/tests/test_article_id.py b/tests/test_article_id.py index f242b86..55ce805 100644 --- a/tests/test_article_id.py +++ b/tests/test_article_id.py @@ -1,86 +1,101 @@ """Tests for the article_id manifest contract. 
-The pipeline writes article_id into tiles.json/chunks.json so the embed -pipeline reads it explicitly instead of guessing from directory names. +The pipeline writes article_id into tiles.json. chunk.py propagates it into +chunks.json. Both embedders (GPU scan_shard_chunks, CPU scan_chunks) read it +from the manifest, falling back to the directory name only for legacy indexes. """ import json from pathlib import Path +from PIL import Image + +from pixelrag_embed.chunk import chunk_article +from pixelrag_embed.embed import scan_shard_chunks from pixelrag_embed.embed_cpu import scan_chunks -def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None): - """Create a minimal tile directory with tiles.json and chunks.json.""" +def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None) -> Path: + """A tile dir with only tiles.json (as it exists right after rendering).""" td = base / f"{dir_name}.png.tiles" td.mkdir(parents=True) + Image.new("RGB", (875, 500)).save(td / "tile_0000.png") + meta = {"tiles": ["tile_0000.png"], "tile_height": 8192, "complete": True} + if article_id is not None: + meta["article_id"] = article_id + (td / "tiles.json").write_text(json.dumps(meta)) + return td - from PIL import Image - img = Image.new("RGB", (875, 500)) - img.save(td / "tile_0000.png") +def _read_chunks_article_id(td: Path): + return json.loads((td / "chunks.json").read_text()).get("article_id") - tiles_meta = {"tiles": ["tile_0000.png"], "complete": True} - if article_id is not None: - tiles_meta["article_id"] = article_id - (td / "tiles.json").write_text(json.dumps(tiles_meta)) - - chunks_meta = { - "chunks": [ - { - "tile": "tile_0000.png", - "tile_index": 0, - "chunk_index": 0, - "file": "tile_0000.png", - "x_offset": 0, - "y_offset": 0, - "height": 500, - "width": 875, - } - ], - } - if article_id is not None: - chunks_meta["article_id"] = article_id - (td / "chunks.json").write_text(json.dumps(chunks_meta)) - return td +# --- chunk.py 
propagates article_id from tiles.json into chunks.json ---------- -def test_article_id_from_manifest(tmp_path): - """When article_id is in the manifest, embed reads it (not the dir name).""" - _make_tile_dir(tmp_path, "report", article_id=0) - items = scan_chunks(str(tmp_path)) - assert len(items) == 1 - assert items[0]["article_id"] == 0 +def test_chunk_propagates_article_id_to_chunks_json(tmp_path): + td = _make_tile_dir(tmp_path, "report", article_id=0) + chunk_article(str(td)) + # This is the real data flow the GPU embedder depends on. + assert _read_chunks_article_id(td) == 0 -def test_article_id_fallback_to_dir_name(tmp_path): - """Without article_id in manifest, fall back to parsing directory name.""" - _make_tile_dir(tmp_path, "42", article_id=None) - items = scan_chunks(str(tmp_path)) - assert len(items) == 1 - assert items[0]["article_id"] == 42 +def test_chunk_without_article_id_omits_it(tmp_path): + td = _make_tile_dir(tmp_path, "5", article_id=None) + chunk_article(str(td)) + assert "article_id" not in json.loads((td / "chunks.json").read_text()) -def test_non_numeric_dir_without_manifest_id(tmp_path): - """Non-numeric dir name + no manifest article_id → hash fallback.""" - _make_tile_dir(tmp_path, "my_report", article_id=None) - items = scan_chunks(str(tmp_path)) - assert len(items) == 1 - assert isinstance(items[0]["article_id"], int) +# --- GPU embedder (scan_shard_chunks) reads it end-to-end -------------------- -def test_manifest_id_overrides_dir_name(tmp_path): - """Manifest article_id wins over directory name even if dir name is numeric.""" - _make_tile_dir(tmp_path, "999", article_id=7) - items = scan_chunks(str(tmp_path)) - assert len(items) == 1 - assert items[0]["article_id"] == 7 +def test_gpu_scan_reads_propagated_article_id(tmp_path): + # Non-numeric dir name: only the manifest can supply the right id. 
+ td = _make_tile_dir(tmp_path, "report", article_id=3) + chunk_article(str(td)) + chunks = scan_shard_chunks(str(tmp_path)) + assert chunks and all(c.article_id == 3 for c in chunks) -def test_multiple_articles_distinct_ids(tmp_path): - """Multiple tile dirs get distinct article_ids from manifests.""" - _make_tile_dir(tmp_path, "report", article_id=0) - _make_tile_dir(tmp_path, "slides", article_id=1) + +def test_gpu_scan_falls_back_to_tiles_json(tmp_path): + # chunks.json lacks article_id (legacy chunker) but tiles.json has it. + td = _make_tile_dir(tmp_path, "report", article_id=4) + chunk_article(str(td)) + chunks_json = td / "chunks.json" + meta = json.loads(chunks_json.read_text()) + meta.pop("article_id") + chunks_json.write_text(json.dumps(meta)) + chunks = scan_shard_chunks(str(tmp_path)) + assert chunks and all(c.article_id == 4 for c in chunks) + + +def test_gpu_scan_falls_back_to_numeric_dir_name(tmp_path): + td = _make_tile_dir(tmp_path, "42", article_id=None) + chunk_article(str(td)) + chunks = scan_shard_chunks(str(tmp_path)) + assert chunks and all(c.article_id == 42 for c in chunks) + + +# --- CPU embedder (scan_chunks) --------------------------------------------- + + +def test_cpu_scan_reads_article_id_from_manifest(tmp_path): + td = _make_tile_dir(tmp_path, "report", article_id=7) + chunk_article(str(td)) items = scan_chunks(str(tmp_path)) - ids = {it["article_id"] for it in items} - assert ids == {0, 1} + assert items and all(it["article_id"] == 7 for it in items) + + +def test_cpu_non_numeric_fallback_is_reproducible(tmp_path): + # No manifest id, non-numeric dir → must be a *stable* hash, not the salted + # builtin hash() (which changes per process via PYTHONHASHSEED and would make + # the index non-reproducible). Assert the exact sha1-derived value so a + # regression back to builtin hash() fails here. 
+ import hashlib + + td = _make_tile_dir(tmp_path, "my_report", article_id=None) + chunk_article(str(td)) + got = scan_chunks(str(tmp_path))[0]["article_id"] + expected = int(hashlib.sha1(b"my_report").hexdigest()[:8], 16) + assert got == expected