From 29015c7a70490e8c5b48a5cb7f2c2966d0c3a1ca Mon Sep 17 00:00:00 2001
From: gas2own <gas2own@allenai.org>
Date: Tue, 9 Jun 2026 21:39:35 +0000
Subject: [PATCH 1/2] feat(local-paper-index): index raw markdown corpora, not
 just PDFs

The chunk-and-index script already ingests flat `.md` files, but it baked in
a PDF-only contract: it synthesized a `.pdf` source name, tagged every chunk
`pdf-index`, and stored only `extra.source_pdf`. That makes it awkward to index
a corpus that was authored as markdown (notes, wikis, an investigation record)
and never had a PDF.

Add `--source-ext` (default `pdf`, fully backward compatible) so callers can
index raw markdown/text directly by skipping PDF extraction entirely:

  chunk-and-index.py my-notes /data/notes --index-path ... --source-ext md

- canonical `extra.source_file` is always written; legacy `extra.source_pdf`
  is preserved for `--source-ext pdf` so existing consumers/indexes are unaffected
- secondary tag becomes `<ext>-index` (`md-index`, `txt-index`, ...; `pdf-index`
  unchanged for the default)
- resumability now keys on source_file with a source_pdf fallback
- SKILL.md documents the raw-markdown path (skip Steps 1-2)

Verified: ruff check + format clean, validate-skills 15/15, smoke-indexed a
markdown corpus, and confirmed the PDF default still emits source_pdf/pdf-index.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 skills/local-paper-index/SKILL.md             | 29 ++++++
 .../assets/chunk-and-index.py                 | 88 ++++++++++++-------
 2 files changed, 87 insertions(+), 30 deletions(-)
diff --git a/skills/local-paper-index/SKILL.md b/skills/local-paper-index/SKILL.md
index 9e9f08f..a464753 100644
--- a/skills/local-paper-index/SKILL.md
+++ b/skills/local-paper-index/SKILL.md
@@ -186,6 +186,35 @@ asta documents --root "$DATASET_ROOT" search --extra=".source_pdf contains some-
 asta documents --root "$DATASET_ROOT" list --tags="my-papers"
 ```
 
+## Indexing raw markdown (skipping PDF extraction)
+
+If your corpus is **already markdown** (authored `.md` docs, exported notes, an
+investigation record, a wiki), there is nothing to extract — skip Steps 1–2 and
+point the chunker straight at the markdown directory. Pass `--source-ext` so the
+stored source filename and the secondary tag reflect the real format (`md-index`
+instead of `pdf-index`):
+
+```bash
+COLLECTION="my-notes"
+MARKDOWN_DIR="/data/notes"            # a tree of .md files (rglob, nested OK)
+INDEX_PATH="/data/notes/index.yaml"
+
+uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py \
+  "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH" --source-ext md
+
+bash /path/to/assets/warm-cache.sh "$(dirname "$INDEX_PATH")"
+asta documents --root "$(dirname "$INDEX_PATH")" search \
+  --summary="your query" --tags="$COLLECTION" --show-scores
+```
+
+Notes:
+- `--source-ext` only affects the synthesized `extra.source_file` value and the
+  `<ext>-index` tag; chunking, relative-path URLs, and resumability are identical
+  to the PDF path. The default remains `pdf` for backward compatibility.
+- The chunker always writes `extra.source_file` (canonical). The legacy
+  `extra.source_pdf` key is additionally written only for `--source-ext pdf`, so
+  older consumers and indexes keep working.
+
 ## Storage Estimates
 
 | Collection size | Approx. index size | Approx. markdown size |
diff --git a/skills/local-paper-index/assets/chunk-and-index.py b/skills/local-paper-index/assets/chunk-and-index.py
index 93bc93e..c935cc6 100644
--- a/skills/local-paper-index/assets/chunk-and-index.py
+++ b/skills/local-paper-index/assets/chunk-and-index.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Chunk extracted markdown files and write an asta-documents YAML index.
+"""Chunk markdown files and write an asta-documents YAML index.
 
 Writes the index YAML directly (no per-chunk CLI calls), following the same
 schema as asta-documents: version 1.0, documents list with uuid/name/url/
@@ -8,6 +8,13 @@
 Usage:
     python3 chunk-and-index.py <collection-name> <markdown-dir> --index-path <path>
 
+The input is always a directory of markdown files. Those markdown files may be
+extraction output from PDFs (the default `--source-ext pdf` case) OR raw,
+authored markdown that was never a PDF (`--source-ext md`, or any extension).
+Use the latter to index a corpus of `.md` documents directly, skipping the
+PDF-extraction step entirely. `--source-ext` only controls the synthesized
+source filename and the secondary index tag; chunking is identical.
+
 The --index-path is required. The script computes relative URLs for markdown
 files relative to the directory containing the index file. Files outside that
 directory get absolute file:// URLs.
@@ -104,15 +111,19 @@ def load_existing_index(index_path: Path) -> dict:
     return {"version": "1.0", "documents": []}
 
 
-def find_existing_pdfs(documents: list[dict], collection: str) -> set[str]:
-    """Find source_pdf values already in the index for this collection."""
+def find_existing_sources(documents: list[dict], collection: str) -> set[str]:
+    """Find source filenames already in the index for this collection.
+
+    Reads the canonical ``source_file`` key and falls back to the legacy
+    ``source_pdf`` key so indexes written by older runs stay resumable.
+    """
     seen = set()
     for doc in documents:
         extra = doc.get("extra", {})
         if extra.get("collection") == collection:
-            pdf = extra.get("source_pdf", "")
-            if pdf:
-                seen.add(pdf)
+            source = extra.get("source_file") or extra.get("source_pdf", "")
+            if source:
+                seen.add(source)
     return seen
 
 
@@ -133,6 +144,16 @@ def main():
         default=CHUNK_SIZE,
         help="Chunk size in characters (default: 2000)",
     )
+    parser.add_argument(
+        "--source-ext",
+        default="pdf",
+        help=(
+            "Extension of the original source documents, used to synthesize the "
+            "stored source filename and the secondary index tag '<ext>-index'. "
+            "Defaults to 'pdf' for PDF-extraction output; pass 'md' (or 'txt', "
+            "etc.) to index a corpus of raw markdown/text documents directly."
+        ),
+    )
     parser.add_argument(
         "--index-path",
         required=True,
@@ -144,6 +165,8 @@ def main():
     index_path = Path(args.index_path)
     chunk_size = args.chunk_size
     collection = args.collection
+    source_ext = args.source_ext.lstrip(".")
+    index_tag = f"{source_ext}-index"
 
     if not md_dir.exists():
         print(f"Error: markdown directory not found: {md_dir}", file=sys.stderr)
@@ -169,33 +192,33 @@ def main():
 
     # Load existing index (preserves previously indexed documents)
     index_data = load_existing_index(index_path)
-    existing_pdfs = find_existing_pdfs(index_data["documents"], collection)
+    existing_sources = find_existing_sources(index_data["documents"], collection)
 
     now = datetime.now(UTC).isoformat()
     new_docs = 0
-    pdfs_processed = 0
-    pdfs_skipped_empty = 0
-    pdfs_skipped_existing = 0
+    docs_processed = 0
+    docs_skipped_empty = 0
+    docs_skipped_existing = 0
 
     for md_file in md_files:
         text = md_file.read_text(encoding="utf-8")
-        # Derive the PDF name: if the .md is in a subdirectory of markdown_dir,
-        # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1.pdf).
+        # Derive the source name: if the .md is in a subdirectory of markdown_dir,
+        # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1.<ext>).
         # If flat in markdown_dir, use the file stem.
         if md_file.parent != md_dir:
             basename = md_file.parent.name
         else:
             basename = md_file.stem
-        source_pdf = f"{basename}.pdf"
+        source_file = f"{basename}.{source_ext}"
 
         if not text.strip():
             print(f"  [skip] {basename} (empty)")
-            pdfs_skipped_empty += 1
+            docs_skipped_empty += 1
             continue
 
-        if source_pdf in existing_pdfs:
+        if source_file in existing_sources:
             print(f"  [skip] {basename} (already indexed)")
-            pdfs_skipped_existing += 1
+            docs_skipped_existing += 1
             continue
 
         url = make_url(md_file, index_dir)
@@ -203,29 +226,34 @@ def main():
         chunks = chunk_text(text, chunk_size)
 
         for i, (chunk, offset) in enumerate(chunks, 1):
+            extra = {
+                "source_file": source_file,
+                "chunk_index": i,
+                "total_chunks": len(chunks),
+                "chunk_chars": len(chunk),
+                "chunk_offset": offset,
+                "file_chars": file_size,
+                "collection": collection,
+            }
+            # Preserve the legacy key for PDF collections so existing consumers
+            # and indexes that filter on `source_pdf` keep working unchanged.
+            if source_ext == "pdf":
+                extra["source_pdf"] = source_file
             doc_entry = {
                 "uuid": generate_uuid(),
                 "name": f"{basename} [chunk {i}/{len(chunks)}]",
                 "mime_type": "text/markdown",
                 "url": url,
                 "summary": chunk,
-                "tags": [collection, "pdf-index"],
+                "tags": [collection, index_tag],
                 "created_at": now,
                 "modified_at": now,
-                "extra": {
-                    "source_pdf": source_pdf,
-                    "chunk_index": i,
-                    "total_chunks": len(chunks),
-                    "chunk_chars": len(chunk),
-                    "chunk_offset": offset,
-                    "file_chars": file_size,
-                    "collection": collection,
-                },
+                "extra": extra,
             }
             index_data["documents"].append(doc_entry)
             new_docs += 1
 
-        pdfs_processed += 1
+        docs_processed += 1
         print(f"  [index] {basename} ({len(chunks)} chunks) -> {url}")
 
     # Write index
@@ -236,9 +264,9 @@ def main():
         )
 
     print()
-    print(f"PDFs processed:         {pdfs_processed}")
-    print(f"PDFs skipped (empty):   {pdfs_skipped_empty}")
-    print(f"PDFs skipped (exists):  {pdfs_skipped_existing}")
+    print(f"Sources processed:      {docs_processed}")
+    print(f"Sources skipped (empty):{docs_skipped_empty}")
+    print(f"Sources skipped (exists):{docs_skipped_existing}")
     print(f"New documents added:    {new_docs}")
     print(f"Total documents in idx: {len(index_data['documents'])}")
     print(f"Index written to:       {index_path}")

From 2e6f08714692b503f44cc9c2d46397edb16394de Mon Sep 17 00:00:00 2001
From: gas2own agent <agent@gas2own>
Date: Wed, 10 Jun 2026 18:11:55 +0000
Subject: [PATCH 2/2] Address review: model PDF as optional upstream, drop
 --source-ext
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rodney's feedback on PR #74:

- Drop extra.source_file — the url already points to the source .md, so it
  was redundant. The indexed document *is* the markdown.
- extra.source_pdf is now present only when the .md is downstream of a PDF;
  its absence means there is no upstream document and url is the original
  source. It now holds a real pointer (relative/file:// URL) to the PDF.
- Replace the --source-ext flag with --pdf-dir, pointing at the upstream PDF
  directory. The script iterates that directory and matches each .md to the
  PDF actually on disk (by basename) instead of synthesizing '<stem>.pdf' —
  an improvement over the old pdf-to-md logic. Unmatched .md files are
  indexed without source_pdf and warned about.
- Secondary tag is pdf-index for PDF-derived markdown (unchanged for the
  existing workflow) and md-index for raw markdown. Resumability now keys on
  url. SKILL.md Step 3 and the raw-markdown section updated to match.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../skills/local-paper-index/SKILL.md         |  37 +++-
 .../assets/chunk-and-index.py                 | 168 +++++++++++++-----
 2 files changed, 152 insertions(+), 53 deletions(-)

diff --git a/plugins/asta-preview/skills/local-paper-index/SKILL.md b/plugins/asta-preview/skills/local-paper-index/SKILL.md
index 17b052e..f74a465 100644
--- a/plugins/asta-preview/skills/local-paper-index/SKILL.md
+++ b/plugins/asta-preview/skills/local-paper-index/SKILL.md
@@ -111,22 +111,24 @@ The script:
 ### Step 3: Chunk and build index
 
 ```bash
-uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH"
+uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH" --pdf-dir "$PDF_DIR"
 ```
 
 The `--index-path` argument is **required**. The script:
-- Computes paths relative to the index file's directory, storing **relative paths** in the `url` field — making the index portable across machines
+- Computes paths relative to the index file's directory, storing **relative paths** in the `url` field — making the index portable across machines. The indexed document *is* the markdown: its `url` points at the `.md` file.
 - Reads each markdown file, splits into ~2000-char chunks at paragraph/sentence boundaries
 - Writes all documents to the index YAML in a single pass
 - Preserves any existing documents in the index (appends, does not overwrite)
-- Skips PDFs already indexed for this collection (safe to re-run)
+- Skips markdown files already indexed for this collection (resumability keys on `url`; safe to re-run)
+- Resolves the upstream PDF for each `.md` by iterating `--pdf-dir` and matching on basename (the per-PDF subdirectory name, or the flat file stem) — it finds the PDF actually on disk rather than assuming a filename. A `.md` with no matching PDF is indexed without a `source_pdf` and warned about.
 - Each document gets:
-  - **Shared PDF metadata:** `source_pdf`, `collection` (in `extra`)
+  - **Shared metadata (in `extra`):** `collection`, plus `source_pdf` (a relative/`file://` pointer to the upstream PDF) **only when** `--pdf-dir` is given and a matching PDF is found
   - **Per-chunk metadata:** `chunk_index`, `total_chunks`, `chunk_chars`, `chunk_offset`, `file_chars` (in `extra`)
-  - **Tags:** `<collection-name>`, `pdf-index`
+  - **Tags:** `<collection-name>`, plus `pdf-index` for PDF-derived markdown or `md-index` for raw markdown
 
 Options:
 - `--chunk-size 2000` — adjust chunk size (default 2000 chars)
+- `--pdf-dir "$PDF_DIR"` — directory of upstream source PDFs. Omit it when indexing authored markdown (see [Indexing raw markdown](#indexing-raw-markdown-no-pdfs) below).
 
 ### Step 4: Warm the search cache
 
@@ -171,6 +173,31 @@ asta documents --root "$DATASET_ROOT" search --extra=".source_pdf contains some-
 asta documents --root "$DATASET_ROOT" list --tags="my-papers"
 ```
 
+## Indexing raw markdown (no PDFs)
+
+If your corpus is **already markdown** (authored `.md` docs, exported notes, an
+investigation record, a wiki), there is nothing to extract — skip Steps 1–2 and
+point the chunker straight at the markdown directory. Just omit `--pdf-dir`:
+
+```bash
+COLLECTION="my-notes"
+MARKDOWN_DIR="/data/notes"            # a tree of .md files (rglob, nested OK)
+INDEX_PATH="/data/notes/index.yaml"
+
+uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py \
+  "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH"
+
+bash /path/to/assets/warm-cache.sh "$(dirname "$INDEX_PATH")"
+asta documents --root "$(dirname "$INDEX_PATH")" search \
+  --summary="your query" --tags="$COLLECTION" --show-scores
+```
+
+The markdown is the source: each document's `url` points at the `.md`, the
+secondary tag is `md-index`, and `extra.source_pdf` is absent (there is no
+upstream PDF). Chunking, relative-path URLs, and resumability are identical to
+the PDF path — the only differences are the secondary tag (`md-index` vs.
+`pdf-index`) and whether `extra.source_pdf` is present.
+
 ## Storage Estimates
 
 | Collection size | Approx. index size | Approx. markdown size |
diff --git a/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py b/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py
index 93bc93e..aaa6e44 100644
--- a/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py
+++ b/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Chunk extracted markdown files and write an asta-documents YAML index.
+"""Chunk markdown files and write an asta-documents YAML index.
 
 Writes the index YAML directly (no per-chunk CLI calls), following the same
 schema as asta-documents: version 1.0, documents list with uuid/name/url/
@@ -8,8 +8,20 @@
 Usage:
     python3 chunk-and-index.py <collection-name> <markdown-dir> --index-path <path>
 
-The --index-path is required. The script computes relative URLs for markdown
-files relative to the directory containing the index file. Files outside that
+The input is always a directory of markdown files, and the indexed document
+*is* that markdown: its ``url`` points at the ``.md`` file. The markdown may be
+authored directly (a corpus of notes, a wiki, an investigation record) or it
+may be extraction output from PDFs.
+
+When the markdown was extracted from PDFs, pass ``--pdf-dir`` pointing at the
+directory of source PDFs. The script then iterates that directory to find the
+PDF that actually corresponds to each ``.md`` file and records a pointer to it
+in ``extra.source_pdf``. When ``--pdf-dir`` is omitted (or no matching PDF is
+found), there is no upstream document and ``extra.source_pdf`` is simply absent
+— ``url`` is the original source.
+
+The --index-path is required. The script computes relative URLs for files
+relative to the directory containing the index file. Files outside that
 directory get absolute file:// URLs.
 
 The markdown-dir can contain either:
@@ -18,12 +30,9 @@
   - Flat .md files:
       markdown/paper1.md, markdown/paper2.md, ...
 
-The source PDF name is derived from the subdirectory name (if nested) or
-the .md file stem (if flat).
-
-Each PDF is represented by multiple documents in the index. They share
-PDF-level metadata (source_pdf, collection) with per-chunk identifiers
-(chunk_index, total_chunks).
+Each markdown file is represented by multiple documents in the index. They
+share file-level metadata (collection, and source_pdf when applicable) with
+per-chunk identifiers (chunk_index, total_chunks).
 """
 
 import argparse
@@ -80,13 +89,13 @@ def chunk_text(text: str, size: int = CHUNK_SIZE) -> list[tuple[str, int]]:
     return chunks
 
 
-def make_url(md_file: Path, index_dir: Path) -> str:
-    """Compute a URL for a markdown file, relative to the index directory.
+def make_url(path: Path, index_dir: Path) -> str:
+    """Compute a URL for a file, relative to the index directory.
 
     If the file is under the index directory, returns a relative path
     (portable, git-friendly). Otherwise returns an absolute file:// URL.
     """
-    resolved = md_file.resolve()
+    resolved = path.resolve()
     try:
         rel = resolved.relative_to(index_dir)
         return str(rel)
@@ -104,18 +113,41 @@ def load_existing_index(index_path: Path) -> dict:
     return {"version": "1.0", "documents": []}
 
 
-def find_existing_pdfs(documents: list[dict], collection: str) -> set[str]:
-    """Find source_pdf values already in the index for this collection."""
+def find_existing_urls(documents: list[dict], collection: str) -> set[str]:
+    """Find the `url`s already indexed for this collection.
+
+    The url (the markdown file itself) is the canonical identity of an indexed
+    document, so resumability keys on it.
+    """
     seen = set()
     for doc in documents:
-        extra = doc.get("extra", {})
-        if extra.get("collection") == collection:
-            pdf = extra.get("source_pdf", "")
-            if pdf:
-                seen.add(pdf)
+        if doc.get("extra", {}).get("collection") == collection:
+            url = doc.get("url")
+            if url:
+                seen.add(url)
     return seen
 
 
+def build_pdf_index(pdf_dir: Path) -> dict[str, Path]:
+    """Map each PDF's stem to its path, for matching markdown files to sources.
+
+    Iterates the actual PDFs under `pdf_dir` (recursively) rather than
+    synthesizing a filename, so the match reflects what is really on disk.
+    """
+    pdf_index: dict[str, Path] = {}
+    for pdf in sorted(pdf_dir.rglob("*.pdf")):
+        # First writer wins; warn on an ambiguous stem collision.
+        if pdf.stem in pdf_index:
+            print(
+                f"WARNING: multiple PDFs share the stem '{pdf.stem}'; "
+                f"using {pdf_index[pdf.stem]}, ignoring {pdf}",
+                file=sys.stderr,
+            )
+            continue
+        pdf_index[pdf.stem] = pdf
+    return pdf_index
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Chunk markdown files and write asta-documents YAML index"
@@ -125,7 +157,7 @@ def main():
     )
     parser.add_argument(
         "markdown_dir",
-        help="Directory containing PDF extraction output (subdirectories with .md + images, or flat .md files)",
+        help="Directory of markdown files (per-PDF subdirectories with .md + images, or flat .md files)",
     )
     parser.add_argument(
         "--chunk-size",
@@ -133,6 +165,15 @@ def main():
         default=CHUNK_SIZE,
         help="Chunk size in characters (default: 2000)",
     )
+    parser.add_argument(
+        "--pdf-dir",
+        help=(
+            "Directory of upstream source PDFs (when the markdown was extracted "
+            "from PDFs). The script iterates this directory to find the PDF "
+            "matching each .md file and records a pointer to it in "
+            "extra.source_pdf. Omit it when indexing authored markdown."
+        ),
+    )
     parser.add_argument(
         "--index-path",
         required=True,
@@ -149,6 +190,15 @@ def main():
         print(f"Error: markdown directory not found: {md_dir}", file=sys.stderr)
         sys.exit(1)
 
+    pdf_index: dict[str, Path] = {}
+    if args.pdf_dir:
+        pdf_dir = Path(args.pdf_dir)
+        if not pdf_dir.exists():
+            print(f"Error: PDF directory not found: {pdf_dir}", file=sys.stderr)
+            sys.exit(1)
+        pdf_index = build_pdf_index(pdf_dir)
+        print(f"Found {len(pdf_index)} source PDF(s) in {pdf_dir}")
+
     # Find .md files: supports both per-PDF subdirectories (with images) and
     # flat .md files directly in markdown_dir.
     md_files = sorted(md_dir.rglob("*.md"))
@@ -169,63 +219,85 @@ def main():
 
     # Load existing index (preserves previously indexed documents)
     index_data = load_existing_index(index_path)
-    existing_pdfs = find_existing_pdfs(index_data["documents"], collection)
+    existing_urls = find_existing_urls(index_data["documents"], collection)
 
     now = datetime.now(UTC).isoformat()
     new_docs = 0
-    pdfs_processed = 0
-    pdfs_skipped_empty = 0
-    pdfs_skipped_existing = 0
+    docs_processed = 0
+    docs_skipped_empty = 0
+    docs_skipped_existing = 0
 
     for md_file in md_files:
         text = md_file.read_text(encoding="utf-8")
-        # Derive the PDF name: if the .md is in a subdirectory of markdown_dir,
-        # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1.pdf).
+        # Derive the basename: if the .md is in a subdirectory of markdown_dir,
+        # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1).
         # If flat in markdown_dir, use the file stem.
         if md_file.parent != md_dir:
             basename = md_file.parent.name
         else:
             basename = md_file.stem
-        source_pdf = f"{basename}.pdf"
 
         if not text.strip():
             print(f"  [skip] {basename} (empty)")
-            pdfs_skipped_empty += 1
+            docs_skipped_empty += 1
             continue
 
-        if source_pdf in existing_pdfs:
+        url = make_url(md_file, index_dir)
+
+        if url in existing_urls:
             print(f"  [skip] {basename} (already indexed)")
-            pdfs_skipped_existing += 1
+            docs_skipped_existing += 1
             continue
 
-        url = make_url(md_file, index_dir)
+        # Resolve the upstream PDF, if any, by matching the basename against the
+        # PDFs actually present in --pdf-dir.
+        source_pdf_url = None
+        if args.pdf_dir:
+            pdf = pdf_index.get(basename)
+            if pdf is not None:
+                source_pdf_url = make_url(pdf, index_dir)
+            else:
+                print(
+                    f"  [warn] {basename}: no matching PDF in --pdf-dir; "
+                    "indexing without source_pdf",
+                    file=sys.stderr,
+                )
+
+        # Documents derived from a PDF keep the legacy `pdf-index` tag so
+        # existing consumers that filter on it still work; raw markdown gets
+        # `md-index`.
+        secondary_tag = "pdf-index" if source_pdf_url else "md-index"
+
         file_size = len(text)
         chunks = chunk_text(text, chunk_size)
 
         for i, (chunk, offset) in enumerate(chunks, 1):
+            extra = {
+                "chunk_index": i,
+                "total_chunks": len(chunks),
+                "chunk_chars": len(chunk),
+                "chunk_offset": offset,
+                "file_chars": file_size,
+                "collection": collection,
+            }
+            # Present only when there is a real upstream PDF for this markdown.
+            if source_pdf_url:
+                extra["source_pdf"] = source_pdf_url
             doc_entry = {
                 "uuid": generate_uuid(),
                 "name": f"{basename} [chunk {i}/{len(chunks)}]",
                 "mime_type": "text/markdown",
                 "url": url,
                 "summary": chunk,
-                "tags": [collection, "pdf-index"],
+                "tags": [collection, secondary_tag],
                 "created_at": now,
                 "modified_at": now,
-                "extra": {
-                    "source_pdf": source_pdf,
-                    "chunk_index": i,
-                    "total_chunks": len(chunks),
-                    "chunk_chars": len(chunk),
-                    "chunk_offset": offset,
-                    "file_chars": file_size,
-                    "collection": collection,
-                },
+                "extra": extra,
             }
             index_data["documents"].append(doc_entry)
             new_docs += 1
 
-        pdfs_processed += 1
+        docs_processed += 1
         print(f"  [index] {basename} ({len(chunks)} chunks) -> {url}")
 
     # Write index
@@ -236,12 +308,12 @@ def main():
         )
 
     print()
-    print(f"PDFs processed:         {pdfs_processed}")
-    print(f"PDFs skipped (empty):   {pdfs_skipped_empty}")
-    print(f"PDFs skipped (exists):  {pdfs_skipped_existing}")
-    print(f"New documents added:    {new_docs}")
-    print(f"Total documents in idx: {len(index_data['documents'])}")
-    print(f"Index written to:       {index_path}")
+    print(f"Sources processed:       {docs_processed}")
+    print(f"Sources skipped (empty): {docs_skipped_empty}")
+    print(f"Sources skipped (exists):{docs_skipped_existing}")
+    print(f"New documents added:     {new_docs}")
+    print(f"Total documents in idx:  {len(index_data['documents'])}")
+    print(f"Index written to:        {index_path}")
 
 
 if __name__ == "__main__":