From 29015c7a70490e8c5b48a5cb7f2c2966d0c3a1ca Mon Sep 17 00:00:00 2001 From: gas2own Date: Tue, 9 Jun 2026 21:39:35 +0000 Subject: [PATCH 1/2] feat(local-paper-index): index raw markdown corpora, not just PDFs The chunk-and-index script already ingests flat `.md` files, but it baked in a PDF-only contract: it synthesized a `.pdf` source name, tagged every chunk `pdf-index`, and stored only `extra.source_pdf`. That makes it awkward to index a corpus that was authored as markdown (notes, wikis, an investigation record) and never had a PDF. Add `--source-ext` (default `pdf`, fully backward compatible) so callers can index raw markdown/text directly by skipping PDF extraction entirely: chunk-and-index.py my-notes /data/notes --index-path ... --source-ext md - canonical `extra.source_file` is always written; legacy `extra.source_pdf` is preserved for `--source-ext pdf` so existing consumers/indexes are unaffected - secondary tag becomes `<ext>-index` (`md-index`, `txt-index`, ...; `pdf-index` unchanged for the default) - resumability now keys on source_file with a source_pdf fallback - SKILL.md documents the raw-markdown path (skip Steps 1-2) Verified: ruff check + format clean, validate-skills 15/15, smoke-indexed a markdown corpus, and confirmed the PDF default still emits source_pdf/pdf-index. 
Co-Authored-By: Claude Opus 4.8 --- skills/local-paper-index/SKILL.md | 29 ++++++ .../assets/chunk-and-index.py | 88 ++++++++++++------- 2 files changed, 87 insertions(+), 30 deletions(-) diff --git a/skills/local-paper-index/SKILL.md b/skills/local-paper-index/SKILL.md index 9e9f08f..a464753 100644 --- a/skills/local-paper-index/SKILL.md +++ b/skills/local-paper-index/SKILL.md @@ -186,6 +186,35 @@ asta documents --root "$DATASET_ROOT" search --extra=".source_pdf contains some- asta documents --root "$DATASET_ROOT" list --tags="my-papers" ``` +## Indexing raw markdown (skipping PDF extraction) + +If your corpus is **already markdown** (authored `.md` docs, exported notes, an +investigation record, a wiki), there is nothing to extract — skip Steps 1–2 and +point the chunker straight at the markdown directory. Pass `--source-ext` so the +stored source filename and the secondary tag reflect the real format (`md-index` +instead of `pdf-index`): + +```bash +COLLECTION="my-notes" +MARKDOWN_DIR="/data/notes" # a tree of .md files (rglob, nested OK) +INDEX_PATH="/data/notes/index.yaml" + +uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py \ + "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH" --source-ext md + +bash /path/to/assets/warm-cache.sh "$(dirname "$INDEX_PATH")" +asta documents --root "$(dirname "$INDEX_PATH")" search \ + --summary="your query" --tags="$COLLECTION" --show-scores +``` + +Notes: +- `--source-ext` only affects the synthesized `extra.source_file` value and the + `<ext>-index` tag; chunking, relative-path URLs, and resumability are identical + to the PDF path. The default remains `pdf` for backward compatibility. +- For every collection (PDF or not) the chunker writes `extra.source_file` (canonical). The + legacy `extra.source_pdf` key is still written for `--source-ext pdf` so older + consumers and indexes keep working. + ## Storage Estimates | Collection size | Approx. index size | Approx. 
markdown size | diff --git a/skills/local-paper-index/assets/chunk-and-index.py b/skills/local-paper-index/assets/chunk-and-index.py index 93bc93e..c935cc6 100644 --- a/skills/local-paper-index/assets/chunk-and-index.py +++ b/skills/local-paper-index/assets/chunk-and-index.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Chunk extracted markdown files and write an asta-documents YAML index. +"""Chunk markdown files and write an asta-documents YAML index. Writes the index YAML directly (no per-chunk CLI calls), following the same schema as asta-documents: version 1.0, documents list with uuid/name/url/ @@ -8,6 +8,13 @@ Usage: python3 chunk-and-index.py --index-path +The input is always a directory of markdown files. Those markdown files may be +extraction output from PDFs (the default `--source-ext pdf` case) OR raw, +authored markdown that was never a PDF (`--source-ext md`, or any extension). +Use the latter to index a corpus of `.md` documents directly, skipping the +PDF-extraction step entirely. `--source-ext` only controls the synthesized +source filename and the secondary index tag; chunking is identical. + The --index-path is required. The script computes relative URLs for markdown files relative to the directory containing the index file. Files outside that directory get absolute file:// URLs. @@ -104,15 +111,19 @@ def load_existing_index(index_path: Path) -> dict: return {"version": "1.0", "documents": []} -def find_existing_pdfs(documents: list[dict], collection: str) -> set[str]: - """Find source_pdf values already in the index for this collection.""" +def find_existing_sources(documents: list[dict], collection: str) -> set[str]: + """Find source filenames already in the index for this collection. + + Reads the canonical ``source_file`` key and falls back to the legacy + ``source_pdf`` key so indexes written by older runs stay resumable. 
+ """ seen = set() for doc in documents: extra = doc.get("extra", {}) if extra.get("collection") == collection: - pdf = extra.get("source_pdf", "") - if pdf: - seen.add(pdf) + source = extra.get("source_file") or extra.get("source_pdf", "") + if source: + seen.add(source) return seen @@ -133,6 +144,16 @@ def main(): default=CHUNK_SIZE, help="Chunk size in characters (default: 2000)", ) + parser.add_argument( + "--source-ext", + default="pdf", + help=( + "Extension of the original source documents, used to synthesize the " + "stored source filename and the secondary index tag '-index'. " + "Defaults to 'pdf' for PDF-extraction output; pass 'md' (or 'txt', " + "etc.) to index a corpus of raw markdown/text documents directly." + ), + ) parser.add_argument( "--index-path", required=True, @@ -144,6 +165,8 @@ def main(): index_path = Path(args.index_path) chunk_size = args.chunk_size collection = args.collection + source_ext = args.source_ext.lstrip(".") + index_tag = f"{source_ext}-index" if not md_dir.exists(): print(f"Error: markdown directory not found: {md_dir}", file=sys.stderr) @@ -169,33 +192,33 @@ def main(): # Load existing index (preserves previously indexed documents) index_data = load_existing_index(index_path) - existing_pdfs = find_existing_pdfs(index_data["documents"], collection) + existing_sources = find_existing_sources(index_data["documents"], collection) now = datetime.now(UTC).isoformat() new_docs = 0 - pdfs_processed = 0 - pdfs_skipped_empty = 0 - pdfs_skipped_existing = 0 + docs_processed = 0 + docs_skipped_empty = 0 + docs_skipped_existing = 0 for md_file in md_files: text = md_file.read_text(encoding="utf-8") - # Derive the PDF name: if the .md is in a subdirectory of markdown_dir, - # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1.pdf). + # Derive the source name: if the .md is in a subdirectory of markdown_dir, + # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1.). 
# If flat in markdown_dir, use the file stem. if md_file.parent != md_dir: basename = md_file.parent.name else: basename = md_file.stem - source_pdf = f"{basename}.pdf" + source_file = f"{basename}.{source_ext}" if not text.strip(): print(f" [skip] {basename} (empty)") - pdfs_skipped_empty += 1 + docs_skipped_empty += 1 continue - if source_pdf in existing_pdfs: + if source_file in existing_sources: print(f" [skip] {basename} (already indexed)") - pdfs_skipped_existing += 1 + docs_skipped_existing += 1 continue url = make_url(md_file, index_dir) @@ -203,29 +226,34 @@ def main(): chunks = chunk_text(text, chunk_size) for i, (chunk, offset) in enumerate(chunks, 1): + extra = { + "source_file": source_file, + "chunk_index": i, + "total_chunks": len(chunks), + "chunk_chars": len(chunk), + "chunk_offset": offset, + "file_chars": file_size, + "collection": collection, + } + # Preserve the legacy key for PDF collections so existing consumers + # and indexes that filter on `source_pdf` keep working unchanged. 
+ if source_ext == "pdf": + extra["source_pdf"] = source_file doc_entry = { "uuid": generate_uuid(), "name": f"{basename} [chunk {i}/{len(chunks)}]", "mime_type": "text/markdown", "url": url, "summary": chunk, - "tags": [collection, "pdf-index"], + "tags": [collection, index_tag], "created_at": now, "modified_at": now, - "extra": { - "source_pdf": source_pdf, - "chunk_index": i, - "total_chunks": len(chunks), - "chunk_chars": len(chunk), - "chunk_offset": offset, - "file_chars": file_size, - "collection": collection, - }, + "extra": extra, } index_data["documents"].append(doc_entry) new_docs += 1 - pdfs_processed += 1 + docs_processed += 1 print(f" [index] {basename} ({len(chunks)} chunks) -> {url}") # Write index @@ -236,9 +264,9 @@ def main(): ) print() - print(f"PDFs processed: {pdfs_processed}") - print(f"PDFs skipped (empty): {pdfs_skipped_empty}") - print(f"PDFs skipped (exists): {pdfs_skipped_existing}") + print(f"Sources processed: {docs_processed}") + print(f"Sources skipped (empty):{docs_skipped_empty}") + print(f"Sources skipped (exists):{docs_skipped_existing}") print(f"New documents added: {new_docs}") print(f"Total documents in idx: {len(index_data['documents'])}") print(f"Index written to: {index_path}") From 2e6f08714692b503f44cc9c2d46397edb16394de Mon Sep 17 00:00:00 2001 From: gas2own agent Date: Wed, 10 Jun 2026 18:11:55 +0000 Subject: [PATCH 2/2] Address review: model PDF as optional upstream, drop --source-ext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rodney's feedback on PR #74: - Drop extra.source_file — the url already points to the source .md, so it was redundant. The indexed document *is* the markdown. - extra.source_pdf is now present only when the .md is downstream of a PDF; its absence means there is no upstream document and url is the original source. It now holds a real pointer (relative/file:// URL) to the PDF. 
- Replace the --source-ext flag with --pdf-dir, pointing at the upstream PDF directory. The script iterates that directory and matches each .md to the PDF actually on disk (by basename) instead of synthesizing '.pdf' — an improvement over the old pdf-to-md logic. Unmatched .md files are indexed without source_pdf and warned about. - Secondary tag is pdf-index for PDF-derived markdown (unchanged for the existing workflow) and md-index for raw markdown. Resumability now keys on url. SKILL.md Step 3 and the raw-markdown section updated to match. Co-Authored-By: Claude Opus 4.8 --- .../skills/local-paper-index/SKILL.md | 37 +++- .../assets/chunk-and-index.py | 168 +++++++++++++----- 2 files changed, 152 insertions(+), 53 deletions(-) diff --git a/plugins/asta-preview/skills/local-paper-index/SKILL.md b/plugins/asta-preview/skills/local-paper-index/SKILL.md index 17b052e..f74a465 100644 --- a/plugins/asta-preview/skills/local-paper-index/SKILL.md +++ b/plugins/asta-preview/skills/local-paper-index/SKILL.md @@ -111,22 +111,24 @@ The script: ### Step 3: Chunk and build index ```bash -uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH" +uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH" --pdf-dir "$PDF_DIR" ``` The `--index-path` argument is **required**. The script: -- Computes paths relative to the index file's directory, storing **relative paths** in the `url` field — making the index portable across machines +- Computes paths relative to the index file's directory, storing **relative paths** in the `url` field — making the index portable across machines. The indexed document *is* the markdown: its `url` points at the `.md` file. 
- Reads each markdown file, splits into ~2000-char chunks at paragraph/sentence boundaries - Writes all documents to the index YAML in a single pass - Preserves any existing documents in the index (appends, does not overwrite) -- Skips PDFs already indexed for this collection (safe to re-run) +- Skips markdown files already indexed for this collection (resumability keys on `url`; safe to re-run) +- Resolves the upstream PDF for each `.md` by iterating `--pdf-dir` and matching on basename (the per-PDF subdirectory name, or the flat file stem) — it finds the PDF actually on disk rather than assuming a filename. A `.md` with no matching PDF is indexed without a `source_pdf` and warned about. - Each document gets: - - **Shared PDF metadata:** `source_pdf`, `collection` (in `extra`) + - **Shared metadata (in `extra`):** `collection`, plus `source_pdf` (a relative/`file://` pointer to the upstream PDF) **only when** `--pdf-dir` is given and a matching PDF is found - **Per-chunk metadata:** `chunk_index`, `total_chunks`, `chunk_chars`, `chunk_offset`, `file_chars` (in `extra`) - - **Tags:** ``, `pdf-index` + - **Tags:** ``, plus `pdf-index` for PDF-derived markdown or `md-index` for raw markdown Options: - `--chunk-size 2000` — adjust chunk size (default 2000 chars) +- `--pdf-dir "$PDF_DIR"` — directory of upstream source PDFs. Omit it when indexing authored markdown (see [Indexing raw markdown](#indexing-raw-markdown-no-pdfs) below). ### Step 4: Warm the search cache @@ -171,6 +173,31 @@ asta documents --root "$DATASET_ROOT" search --extra=".source_pdf contains some- asta documents --root "$DATASET_ROOT" list --tags="my-papers" ``` +## Indexing raw markdown (no PDFs) + +If your corpus is **already markdown** (authored `.md` docs, exported notes, an +investigation record, a wiki), there is nothing to extract — skip Steps 1–2 and +point the chunker straight at the markdown directory. 
Just omit `--pdf-dir`: + +```bash +COLLECTION="my-notes" +MARKDOWN_DIR="/data/notes" # a tree of .md files (rglob, nested OK) +INDEX_PATH="/data/notes/index.yaml" + +uv run --with pyyaml python3 /path/to/assets/chunk-and-index.py \ + "$COLLECTION" "$MARKDOWN_DIR" --index-path "$INDEX_PATH" + +bash /path/to/assets/warm-cache.sh "$(dirname "$INDEX_PATH")" +asta documents --root "$(dirname "$INDEX_PATH")" search \ + --summary="your query" --tags="$COLLECTION" --show-scores +``` + +The markdown is the source: each document's `url` points at the `.md`, the +secondary tag is `md-index`, and `extra.source_pdf` is absent (there is no +upstream PDF). Chunking, relative-path URLs, and resumability are identical to +the PDF path — the only difference between the two is whether `extra.source_pdf` +is present. + ## Storage Estimates | Collection size | Approx. index size | Approx. markdown size | diff --git a/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py b/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py index 93bc93e..aaa6e44 100644 --- a/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py +++ b/plugins/asta-preview/skills/local-paper-index/assets/chunk-and-index.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Chunk extracted markdown files and write an asta-documents YAML index. +"""Chunk markdown files and write an asta-documents YAML index. Writes the index YAML directly (no per-chunk CLI calls), following the same schema as asta-documents: version 1.0, documents list with uuid/name/url/ @@ -8,8 +8,20 @@ Usage: python3 chunk-and-index.py --index-path -The --index-path is required. The script computes relative URLs for markdown -files relative to the directory containing the index file. Files outside that +The input is always a directory of markdown files, and the indexed document +*is* that markdown: its ``url`` points at the ``.md`` file. 
The markdown may be +authored directly (a corpus of notes, a wiki, an investigation record) or it +may be extraction output from PDFs. + +When the markdown was extracted from PDFs, pass ``--pdf-dir`` pointing at the +directory of source PDFs. The script then iterates that directory to find the +PDF that actually corresponds to each ``.md`` file and records a pointer to it +in ``extra.source_pdf``. When ``--pdf-dir`` is omitted (or no matching PDF is +found), there is no upstream document and ``extra.source_pdf`` is simply absent +— ``url`` is the original source. + +The --index-path is required. The script computes relative URLs for files +relative to the directory containing the index file. Files outside that directory get absolute file:// URLs. The markdown-dir can contain either: @@ -18,12 +30,9 @@ - Flat .md files: markdown/paper1.md, markdown/paper2.md, ... -The source PDF name is derived from the subdirectory name (if nested) or -the .md file stem (if flat). - -Each PDF is represented by multiple documents in the index. They share -PDF-level metadata (source_pdf, collection) with per-chunk identifiers -(chunk_index, total_chunks). +Each markdown file is represented by multiple documents in the index. They +share file-level metadata (collection, and source_pdf when applicable) with +per-chunk identifiers (chunk_index, total_chunks). """ import argparse @@ -80,13 +89,13 @@ def chunk_text(text: str, size: int = CHUNK_SIZE) -> list[tuple[str, int]]: return chunks -def make_url(md_file: Path, index_dir: Path) -> str: - """Compute a URL for a markdown file, relative to the index directory. +def make_url(path: Path, index_dir: Path) -> str: + """Compute a URL for a file, relative to the index directory. If the file is under the index directory, returns a relative path (portable, git-friendly). Otherwise returns an absolute file:// URL. 
""" - resolved = md_file.resolve() + resolved = path.resolve() try: rel = resolved.relative_to(index_dir) return str(rel) @@ -104,18 +113,41 @@ def load_existing_index(index_path: Path) -> dict: return {"version": "1.0", "documents": []} -def find_existing_pdfs(documents: list[dict], collection: str) -> set[str]: - """Find source_pdf values already in the index for this collection.""" +def find_existing_urls(documents: list[dict], collection: str) -> set[str]: + """Find the `url`s already indexed for this collection. + + The url (the markdown file itself) is the canonical identity of an indexed + document, so resumability keys on it. + """ seen = set() for doc in documents: - extra = doc.get("extra", {}) - if extra.get("collection") == collection: - pdf = extra.get("source_pdf", "") - if pdf: - seen.add(pdf) + if doc.get("extra", {}).get("collection") == collection: + url = doc.get("url") + if url: + seen.add(url) return seen +def build_pdf_index(pdf_dir: Path) -> dict[str, Path]: + """Map each PDF's stem to its path, for matching markdown files to sources. + + Iterates the actual PDFs under `pdf_dir` (recursively) rather than + synthesizing a filename, so the match reflects what is really on disk. + """ + pdf_index: dict[str, Path] = {} + for pdf in sorted(pdf_dir.rglob("*.pdf")): + # First writer wins; warn on an ambiguous stem collision. 
+ if pdf.stem in pdf_index: + print( + f"WARNING: multiple PDFs share the stem '{pdf.stem}'; " + f"using {pdf_index[pdf.stem]}, ignoring {pdf}", + file=sys.stderr, + ) + continue + pdf_index[pdf.stem] = pdf + return pdf_index + + def main(): parser = argparse.ArgumentParser( description="Chunk markdown files and write asta-documents YAML index" @@ -125,7 +157,7 @@ def main(): ) parser.add_argument( "markdown_dir", - help="Directory containing PDF extraction output (subdirectories with .md + images, or flat .md files)", + help="Directory of markdown files (per-PDF subdirectories with .md + images, or flat .md files)", ) parser.add_argument( "--chunk-size", @@ -133,6 +165,15 @@ def main(): default=CHUNK_SIZE, help="Chunk size in characters (default: 2000)", ) + parser.add_argument( + "--pdf-dir", + help=( + "Directory of upstream source PDFs (when the markdown was extracted " + "from PDFs). The script iterates this directory to find the PDF " + "matching each .md file and records a pointer to it in " + "extra.source_pdf. Omit it when indexing authored markdown." + ), + ) parser.add_argument( "--index-path", required=True, @@ -149,6 +190,15 @@ def main(): print(f"Error: markdown directory not found: {md_dir}", file=sys.stderr) sys.exit(1) + pdf_index: dict[str, Path] = {} + if args.pdf_dir: + pdf_dir = Path(args.pdf_dir) + if not pdf_dir.exists(): + print(f"Error: PDF directory not found: {pdf_dir}", file=sys.stderr) + sys.exit(1) + pdf_index = build_pdf_index(pdf_dir) + print(f"Found {len(pdf_index)} source PDF(s) in {pdf_dir}") + # Find .md files: supports both per-PDF subdirectories (with images) and # flat .md files directly in markdown_dir. 
md_files = sorted(md_dir.rglob("*.md")) @@ -169,63 +219,85 @@ def main(): # Load existing index (preserves previously indexed documents) index_data = load_existing_index(index_path) - existing_pdfs = find_existing_pdfs(index_data["documents"], collection) + existing_urls = find_existing_urls(index_data["documents"], collection) now = datetime.now(UTC).isoformat() new_docs = 0 - pdfs_processed = 0 - pdfs_skipped_empty = 0 - pdfs_skipped_existing = 0 + docs_processed = 0 + docs_skipped_empty = 0 + docs_skipped_existing = 0 for md_file in md_files: text = md_file.read_text(encoding="utf-8") - # Derive the PDF name: if the .md is in a subdirectory of markdown_dir, - # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1.pdf). + # Derive the basename: if the .md is in a subdirectory of markdown_dir, + # use the subdirectory name (e.g. markdown/paper1/paper1.md -> paper1). # If flat in markdown_dir, use the file stem. if md_file.parent != md_dir: basename = md_file.parent.name else: basename = md_file.stem - source_pdf = f"{basename}.pdf" if not text.strip(): print(f" [skip] {basename} (empty)") - pdfs_skipped_empty += 1 + docs_skipped_empty += 1 continue - if source_pdf in existing_pdfs: + url = make_url(md_file, index_dir) + + if url in existing_urls: print(f" [skip] {basename} (already indexed)") - pdfs_skipped_existing += 1 + docs_skipped_existing += 1 continue - url = make_url(md_file, index_dir) + # Resolve the upstream PDF, if any, by matching the basename against the + # PDFs actually present in --pdf-dir. + source_pdf_url = None + if args.pdf_dir: + pdf = pdf_index.get(basename) + if pdf is not None: + source_pdf_url = make_url(pdf, index_dir) + else: + print( + f" [warn] {basename}: no matching PDF in --pdf-dir; " + "indexing without source_pdf", + file=sys.stderr, + ) + + # Documents derived from a PDF keep the legacy `pdf-index` tag so + # existing consumers that filter on it still work; raw markdown gets + # `md-index`. 
+ secondary_tag = "pdf-index" if source_pdf_url else "md-index" + file_size = len(text) chunks = chunk_text(text, chunk_size) for i, (chunk, offset) in enumerate(chunks, 1): + extra = { + "chunk_index": i, + "total_chunks": len(chunks), + "chunk_chars": len(chunk), + "chunk_offset": offset, + "file_chars": file_size, + "collection": collection, + } + # Present only when there is a real upstream PDF for this markdown. + if source_pdf_url: + extra["source_pdf"] = source_pdf_url doc_entry = { "uuid": generate_uuid(), "name": f"{basename} [chunk {i}/{len(chunks)}]", "mime_type": "text/markdown", "url": url, "summary": chunk, - "tags": [collection, "pdf-index"], + "tags": [collection, secondary_tag], "created_at": now, "modified_at": now, - "extra": { - "source_pdf": source_pdf, - "chunk_index": i, - "total_chunks": len(chunks), - "chunk_chars": len(chunk), - "chunk_offset": offset, - "file_chars": file_size, - "collection": collection, - }, + "extra": extra, } index_data["documents"].append(doc_entry) new_docs += 1 - pdfs_processed += 1 + docs_processed += 1 print(f" [index] {basename} ({len(chunks)} chunks) -> {url}") # Write index @@ -236,12 +308,12 @@ def main(): ) print() - print(f"PDFs processed: {pdfs_processed}") - print(f"PDFs skipped (empty): {pdfs_skipped_empty}") - print(f"PDFs skipped (exists): {pdfs_skipped_existing}") - print(f"New documents added: {new_docs}") - print(f"Total documents in idx: {len(index_data['documents'])}") - print(f"Index written to: {index_path}") + print(f"Sources processed: {docs_processed}") + print(f"Sources skipped (empty): {docs_skipped_empty}") + print(f"Sources skipped (exists):{docs_skipped_existing}") + print(f"New documents added: {new_docs}") + print(f"Total documents in idx: {len(index_data['documents'])}") + print(f"Index written to: {index_path}") if __name__ == "__main__":