Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 16 additions & 13 deletions embed/src/pixelrag_embed/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,13 +264,14 @@ def scan_shard_tiles(
if not meta.get("complete", False):
continue

# Extract article_id from directory name: "3104240.png.tiles" -> 3104240
dir_name = tiles_dir.name # e.g. "3104240.png.tiles"
try:
article_id = int(dir_name.split(".")[0])
except (ValueError, IndexError):
logger.warning("Cannot parse article_id from %s", dir_name)
continue
article_id = meta.get("article_id")
if article_id is None:
dir_name = tiles_dir.name
try:
article_id = int(dir_name.split(".")[0])
except (ValueError, IndexError):
logger.warning("Cannot parse article_id from %s", dir_name)
continue

if article_id in skip:
continue
Expand Down Expand Up @@ -343,12 +344,14 @@ def scan_shard_chunks(
logger.warning("Skipping %s: %s", chunks_json, e)
continue

dir_name = tiles_dir.name
try:
article_id = int(dir_name.split(".")[0])
except (ValueError, IndexError):
logger.warning("Cannot parse article_id from %s", dir_name)
continue
article_id = meta.get("article_id")
if article_id is None:
dir_name = tiles_dir.name
try:
article_id = int(dir_name.split(".")[0])
except (ValueError, IndexError):
logger.warning("Cannot parse article_id from %s", dir_name)
continue

if article_id in skip:
continue
Expand Down
24 changes: 17 additions & 7 deletions embed/src/pixelrag_embed/embed_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,16 +79,26 @@ def scan_chunks(shard_dir: str) -> list[dict]:
)

for td in tile_dirs:
dir_name = td.name
article_id_str = dir_name.replace(".png.tiles", "")
try:
article_id = int(article_id_str)
except ValueError:
article_id = hash(article_id_str) % (2**31)

chunks_json = td / "chunks.json"
tiles_json = td / "tiles.json"

# Read article_id from the manifest (written by the pipeline).
# Fall back to parsing the directory name for backward compat
# with indexes built before this change.
article_id = None
for mf in (chunks_json, tiles_json):
if mf.exists() and article_id is None:
try:
article_id = json.loads(mf.read_text()).get("article_id")
except (json.JSONDecodeError, OSError):
pass
if article_id is None:
article_id_str = td.name.replace(".png.tiles", "")
try:
article_id = int(article_id_str)
except ValueError:
article_id = hash(article_id_str) % (2**31)

if chunks_json.exists():
with open(chunks_json) as f:
manifest = json.load(f)
Expand Down
33 changes: 25 additions & 8 deletions index/src/pixelrag_index/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,28 +89,45 @@ def build(config: dict, limit: int | None = None, force: bool = False) -> Path:
" Rendered %d URLs (%d skipped, already exist)", len(new_url_docs), skipped
)

# Render PDFs
# Render PDFs — use idx as tile directory name (like URLs) so directory
# names are always the numeric article_id.
for idx, doc in pdf_docs:
try:
render_pdf(doc.path, str(tiles_dir))
render_pdf(doc.path, str(tiles_dir), stem=str(idx))
except Exception as e:
logger.warning(" FAILED PDF %s: %s", doc.id, e)
if pdf_docs:
logger.info(" Rendered %d PDFs", len(pdf_docs))

# Save articles.json for serve API — title + URL per article.
# Use the pipeline's sequential *position index* (0, 1, 2, …) rather than
# int(a["id"]), because local sources use filename stems (e.g. "art_alice")
# as doc IDs, which are not numeric. int() on a filename stem raises ValueError
# and crashes the entire index build step.
# Write article_id into each tile directory's manifest so the embed
# pipeline can read it explicitly instead of guessing from the directory
# name. This is the authoritative source of article_id — directory names
# are for humans, manifests are for the pipeline.
tile_dir_map: dict[int, Path] = {}
for idx, _ in url_docs + pdf_docs + image_docs:
tile_path = tiles_dir / f"{idx}.png.tiles"
if tile_path.is_dir():
tile_dir_map[idx] = tile_path
for idx, tile_path in tile_dir_map.items():
for manifest_name in ("tiles.json", "chunks.json"):
manifest_path = tile_path / manifest_name
if manifest_path.exists():
try:
manifest = json.loads(manifest_path.read_text())
manifest["article_id"] = idx
manifest_path.write_text(json.dumps(manifest))
except (json.JSONDecodeError, OSError):
pass

# Save articles.json for serve API — maps article_id (array index) to
# human-readable title + URL.
articles_path = output / "articles.json"
article_entries = []
for enum_idx, a in enumerate(articles):
title = a.get("metadata", {}).get("title", "")
if not title and a.get("url"):
title = a["url"].split("/")[-1].replace("_", " ").replace("%20", " ")
if not title:
# Fall back to original doc id (e.g. filename stem) as display title
title = a.get("id", str(enum_idx))
url = a.get("url", "") or a.get("path", "")
article_entries.append({"title": title, "url": url})
Expand Down
7 changes: 6 additions & 1 deletion render/src/pixelrag_render/backends/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def render_pdf(
dpi: int = 200,
pages: Optional[list[int]] = None,
quality: int = 85,
stem: str | None = None,
) -> list[Path]:
"""Render a PDF to JPEG tiles.

Expand All @@ -32,6 +33,9 @@ def render_pdf(
dpi: Resolution for rendering (default 200 gives ~1650×2200px for A4).
pages: 1-based list of page numbers to render. ``None`` renders all pages.
quality: JPEG quality 1-100 (default 85).
stem: Override for the tile directory name. Defaults to the PDF filename
stem. The pipeline passes the article_id here so directory names
are always numeric and consistent with articles.json.

Returns:
List containing the single tile directory Path on success.
Expand All @@ -55,7 +59,8 @@ def render_pdf(
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

stem = path.stem
if stem is None:
stem = path.stem
tile_dir = output_dir / f"{stem}.png.tiles"
tile_dir.mkdir(parents=True, exist_ok=True)

Expand Down
4 changes: 3 additions & 1 deletion render/src/pixelrag_render/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def render_pdf(
dpi: int = 200,
pages: Optional[list[int]] = None,
quality: int = 85,
stem: str | None = None,
) -> list[Path]:
"""Render a PDF file to tiled JPEG images.

Expand All @@ -124,13 +125,14 @@ def render_pdf(
dpi: Rendering resolution (default 200 ≈ 1650×2200 for A4).
pages: 1-based list of page numbers to render. ``None`` renders all.
quality: JPEG quality 1-100 (default 85).
stem: Override for the tile directory name (default: PDF filename stem).

Returns:
List containing the tile directory Path on success.
"""
from .backends.pdf import render_pdf as _render_pdf

return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality)
return _render_pdf(path, output_dir, dpi=dpi, pages=pages, quality=quality, stem=stem)


def render_file(
Expand Down
86 changes: 86 additions & 0 deletions tests/test_article_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Tests for the article_id manifest contract.

The pipeline writes article_id into tiles.json/chunks.json so the embed
pipeline reads it explicitly instead of guessing from directory names.
"""

import json
from pathlib import Path

from pixelrag_embed.embed_cpu import scan_chunks


def _make_tile_dir(base: Path, dir_name: str, article_id: int | None = None):
    """Build a one-tile fixture directory named ``<dir_name>.png.tiles``.

    The directory receives a blank 875x500 ``tile_0000.png`` plus a
    ``tiles.json`` and a ``chunks.json`` manifest. When *article_id* is
    given it is recorded in both manifests; otherwise the key is omitted.

    Returns:
        Path of the created tile directory.
    """
    tile_name = "tile_0000.png"
    width, height = 875, 500

    tile_dir = base / f"{dir_name}.png.tiles"
    tile_dir.mkdir(parents=True)

    from PIL import Image

    Image.new("RGB", (width, height)).save(tile_dir / tile_name)

    # Appended last to each manifest so the serialized key order is stable.
    id_entry = {} if article_id is None else {"article_id": article_id}

    tiles_manifest = {"tiles": [tile_name], "complete": True, **id_entry}
    (tile_dir / "tiles.json").write_text(json.dumps(tiles_manifest))

    single_chunk = {
        "tile": tile_name,
        "tile_index": 0,
        "chunk_index": 0,
        "file": tile_name,
        "x_offset": 0,
        "y_offset": 0,
        "height": height,
        "width": width,
    }
    chunks_manifest = {"chunks": [single_chunk], **id_entry}
    (tile_dir / "chunks.json").write_text(json.dumps(chunks_manifest))

    return tile_dir


def test_article_id_from_manifest(tmp_path):
    """A manifest-supplied article_id is used, not the directory name."""
    _make_tile_dir(tmp_path, "report", article_id=0)

    scanned = scan_chunks(str(tmp_path))

    assert len(scanned) == 1
    first = scanned[0]
    assert first["article_id"] == 0


def test_article_id_fallback_to_dir_name(tmp_path):
    """With no article_id in the manifests, a numeric directory name is parsed."""
    _make_tile_dir(tmp_path, "42", article_id=None)

    scanned = scan_chunks(str(tmp_path))

    assert len(scanned) == 1
    first = scanned[0]
    assert first["article_id"] == 42


def test_non_numeric_dir_without_manifest_id(tmp_path):
    """Non-numeric dir name + no manifest article_id → hash fallback.

    The fallback value comes from ``hash(name) % (2**31)``, and string
    hashing can differ between interpreter runs, so the exact id cannot be
    pinned. What can be pinned is that it is an int in ``[0, 2**31)``; a
    bare type check would also accept negative or oversized ids.
    """
    _make_tile_dir(tmp_path, "my_report", article_id=None)
    items = scan_chunks(str(tmp_path))
    assert len(items) == 1

    article_id = items[0]["article_id"]
    assert isinstance(article_id, int)
    # Modulo by a positive number is non-negative in Python, so the
    # fallback must land inside the signed 32-bit positive range.
    assert 0 <= article_id < 2**31


def test_manifest_id_overrides_dir_name(tmp_path):
    """A manifest article_id takes precedence over a numeric directory name."""
    _make_tile_dir(tmp_path, "999", article_id=7)

    scanned = scan_chunks(str(tmp_path))

    assert len(scanned) == 1
    first = scanned[0]
    assert first["article_id"] == 7


def test_multiple_articles_distinct_ids(tmp_path):
    """Each tile directory reports the article_id stored in its own manifest."""
    for expected_id, name in enumerate(("report", "slides")):
        _make_tile_dir(tmp_path, name, article_id=expected_id)

    scanned = scan_chunks(str(tmp_path))

    seen_ids = set()
    for entry in scanned:
        seen_ids.add(entry["article_id"])
    assert seen_ids == {0, 1}
Loading