From e4e2ff0d57c3d2ea480721a566ed068e9d2393ce Mon Sep 17 00:00:00 2001
From: Azis <azuolas.krusna@yahoo.com>
Date: Wed, 18 Feb 2026 00:17:36 +0100
Subject: [PATCH] Add support for pdf reading

---
 .gitignore     |   2 +
 README.md      |   8 ++++
 pyproject.toml |   1 +
 reed.py        | 110 ++++++++++++++++++++++++++++++++++++++++++++++--
 test_reed.py   | 111 +++++++++++++++++++++++++++++++++++++++++++++++++
 uv.lock        |  11 +++++
 6 files changed, 240 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 50f3892..cac9146 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@ __pycache__/
 .pytest_cache/
 .venv/
 reedy.egg-info/
+build/
+dist/
 .DS_Store
 *.wav
 *.onnx
diff --git a/README.md b/README.md
index f8c8ead..3c70969 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ A CLI that reads text aloud using [piper-tts](https://github.com/rhasspy/piper).
 ## Features
 
 - **Multiple input sources** — text argument, file (`-f`), clipboard (`-c`), or stdin
+- **PDF support** — read full PDFs or selected pages with `--pages`
 - **Pipe-friendly** — reads from stdin, works anywhere in a shell pipeline
 - **Interactive mode** — conversational TTS with `/replay`, `/help`, `/clear`, tab completion, and history
 - **Adjustable speech** — control speed (`-s`), volume (`-v`), and sentence silence (`--silence`)
@@ -72,6 +73,12 @@ reed 'Hello, I will read this for you'
 # Read from a file
 reed -f article.txt
 
+# Read from a PDF
+reed -f book.pdf
+
+# Read selected pages from a PDF (1-based)
+reed -f book.pdf --pages 1,3-5
+
 # Read from clipboard
 reed -c
 
@@ -176,6 +183,7 @@ All voice models are hosted on Hugging Face: [https://huggingface.co/rhasspy/pip
 | Flag | Description | Default |
 |------|-------------|---------|
 | `-f`, `--file` | Read text from a file | — |
+| `--pages` | PDF pages to read (1-based), e.g. `1,3-5` | — |
 | `-c`, `--clipboard` | Read text from clipboard | — |
 | `-m`, `--model` | Voice name or path to voice model | `en_US-kristin-medium` |
 | `-s`, `--speed` | Speech speed (lower = slower) | `1.0` |
diff --git a/pyproject.toml b/pyproject.toml
index 7f826df..f164c85 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ requires-python = ">=3.14"
 dependencies = [
     "piper-tts",
     "pathvalidate",
+    "pypdf",
     "prompt-toolkit",
     "rich",
 ]
diff --git a/reed.py b/reed.py
index c526b31..4cc7c44 100755
--- a/reed.py
+++ b/reed.py
@@ -12,11 +12,16 @@
 import urllib.request
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Optional, TextIO
+from typing import TYPE_CHECKING, Callable, Iterator, Optional, TextIO
 
 if TYPE_CHECKING:
     from prompt_toolkit import PromptSession
 
+try:
+    from pypdf import PdfReader
+except ImportError:  # pragma: no cover - validated in runtime error path
+    PdfReader = None
+
 from rich.console import Console
 from rich.markup import escape
 from rich.panel import Panel
@@ -155,7 +160,10 @@ def get_text(
         return result.stdout.strip()
 
     if args.file:
-        return Path(args.file).read_text()
+        file_path = Path(args.file)
+        if args.pages:
+            raise ReedError("--pages can only be used with PDF files")
+        return file_path.read_text()
 
     if not stdin.isatty():
         return stdin.read().strip()
@@ -166,6 +174,83 @@ def get_text(
     raise ReedError("No input provided. Use --help for usage.")
 
 
+def _parse_pdf_pages(page_selection: str, total_pages: int) -> list[int]:
+    """Parse a 1-based page selection such as ``1,3-5`` into page indices.
+
+    Args:
+        page_selection: Comma-separated page numbers and ``start-end`` ranges.
+        total_pages: Number of pages in the PDF; larger numbers are rejected.
+
+    Returns:
+        0-based page indices in first-seen order, without duplicates.
+
+    Raises:
+        ReedError: If a token is malformed or a page is out of range.
+    """
+    selected: list[int] = []
+    seen: set[int] = set()
+
+    # An empty selection splits to [""] and is rejected by the token check.
+    for part in page_selection.strip().split(","):
+        token = part.strip()
+        first, dash, last = token.partition("-")
+        # isdigit() also accepts characters such as "²" that int() cannot
+        # parse; isdecimal() keeps those on the ReedError path.
+        if not first.isdecimal() or (dash and not last.isdecimal()):
+            raise ReedError("Invalid page selection")
+        start = int(first)
+        end = int(last) if dash else start
+        if start < 1 or end < start:
+            raise ReedError("Invalid page selection")
+
+        for page in range(start, end + 1):
+            if page > total_pages:
+                raise ReedError(
+                    f"Page {page} is out of range (PDF has {total_pages} pages)"
+                )
+            index = page - 1
+            if index not in seen:
+                seen.add(index)
+                selected.append(index)
+
+    # Every accepted token contributes at least one page, so the result is
+    # never empty at this point.
+    return selected
+
+
+def _iter_pdf_pages(
+    path: Path, page_selection: Optional[str]
+) -> Iterator[tuple[int, int, str]]:
+    """Yield ``(page_number, total_pages, text)`` for each selected PDF page.
+
+    Pages with blank text are skipped; ``ReedError`` is raised during iteration.
+    """
+    if PdfReader is None:
+        raise ReedError("PDF support requires pypdf. Reinstall reed with dependencies.")
+    try:
+        reader = PdfReader(str(path))
+        # Count pages inside the try so a failure on first page access is wrapped.
+        total_pages = len(reader.pages)
+    except Exception as e:  # pragma: no cover - depends on third-party parser internals
+        raise ReedError(f"Failed to read PDF: {e}") from e
+    if total_pages == 0:
+        raise ReedError("PDF has no pages")
+
+    page_indices = list(range(total_pages))
+    if page_selection:
+        page_indices = _parse_pdf_pages(page_selection, total_pages)
+
+    found_any = False
+    for index in page_indices:
+        page_text = (reader.pages[index].extract_text() or "").strip()
+        if page_text:
+            found_any = True
+            yield (index + 1, total_pages, page_text)
+
+    if not found_any:
+        raise ReedError("No extractable text found in PDF")
+
+
 def build_piper_cmd(
     model: Path,
     speed: float,
@@ -357,7 +442,7 @@ def interactive_loop(
 def _should_enter_interactive(
     args: argparse.Namespace, stdin: Optional[TextIO]
 ) -> bool:
-    if args.text or args.file or args.clipboard:
+    if args.text or args.file or args.clipboard or args.pages:
         return False
     if stdin is not None and hasattr(stdin, "isatty") and stdin.isatty():
         return True
@@ -380,6 +465,11 @@ def main(
     )
     parser.add_argument("text", nargs="*", help="Text to read aloud")
     parser.add_argument("-f", "--file", help="Read text from a file")
+    parser.add_argument(
+        "--pages",
+        default=None,
+        help="PDF pages to read (1-based), e.g. 1,3-5",
+    )
     parser.add_argument(
         "-c", "--clipboard", action="store_true", help="Read text from clipboard"
     )
@@ -410,6 +500,9 @@ def main(
         help="Seconds of silence between sentences",
     )
     args = parser.parse_args(argv)
+    if args.pages and not args.file:
+        print_error("--pages requires --file <PDF>", print_fn)
+        return 1
 
     # Resolve model: None → default, short name → data dir path
     if args.model is None:
@@ -489,6 +582,17 @@ def main(
 
     try:
         assert stdin is not None
+
+        # PDF: generate and play one page at a time
+        if args.file and Path(args.file).suffix.lower() == ".pdf":
+            ensure_model(config, print_fn)
+            for page_num, total, page_text in _iter_pdf_pages(
+                Path(args.file), args.pages
+            ):
+                print_fn(f"\n[bold cyan]📄 Page {page_num}/{total}[/bold cyan]")
+                speak_text(page_text, config, run=run, print_fn=print_fn)
+            return 0
+
         text = get_text(args, stdin, run=run)
 
         if not text:
diff --git a/test_reed.py b/test_reed.py
index 08e3fb5..2cfd8a6 100644
--- a/test_reed.py
+++ b/test_reed.py
@@ -16,6 +16,7 @@ def _make_args(**overrides):
     defaults = dict(
         text=[],
         file=None,
+        pages=None,
         clipboard=False,
         model=Path(__file__).parent / "en_US-kristin-medium.onnx",
         speed=1.0,
@@ -382,6 +383,16 @@ def test_clipboard(self):
         args = _make_args(clipboard=True)
         assert _should_enter_interactive(args, io.StringIO()) is False
 
+    def test_pages_provided(self):  # --pages alone must not start interactive mode
+        from reed import _should_enter_interactive
+
+        class FakeTty:  # minimal stdin stand-in that reports itself as a terminal
+            def isatty(self):
+                return True
+
+        args = _make_args(pages="1-2")
+        assert _should_enter_interactive(args, FakeTty()) is False
+
     def test_tty_stdin_no_args(self):
         from reed import _should_enter_interactive
 
@@ -620,6 +631,97 @@ def isatty(self):
         assert result == "hello world"
 
 
+class TestIterPdfPages:  # PDF page iteration with reed.PdfReader replaced by a stub
+    def test_pdf_reads_all_pages_when_no_pages_flag(self, monkeypatch):
+        from reed import _iter_pdf_pages
+
+        class FakePage:  # stands in for a pypdf page; returns canned text
+            def __init__(self, text):
+                self._text = text
+
+            def extract_text(self):
+                return self._text
+
+        class FakeReader:  # stands in for PdfReader; ignores the path argument
+            def __init__(self, path):
+                self.pages = [FakePage("page one"), FakePage("page two")]
+
+        monkeypatch.setattr("reed.PdfReader", FakeReader)
+
+        result = list(_iter_pdf_pages(Path("book.pdf"), None))
+        assert result == [(1, 2, "page one"), (2, 2, "page two")]
+
+    def test_pdf_reads_selected_pages(self, monkeypatch):  # "2,4" yields pages 2 and 4
+        from reed import _iter_pdf_pages
+
+        class FakePage:
+            def __init__(self, text):
+                self._text = text
+
+            def extract_text(self):
+                return self._text
+
+        class FakeReader:
+            def __init__(self, path):
+                self.pages = [
+                    FakePage("page one"),
+                    FakePage("page two"),
+                    FakePage("page three"),
+                    FakePage("page four"),
+                ]
+
+        monkeypatch.setattr("reed.PdfReader", FakeReader)
+
+        result = list(_iter_pdf_pages(Path("book.pdf"), "2,4"))
+        assert result == [(2, 4, "page two"), (4, 4, "page four")]
+
+    def test_pdf_page_out_of_bounds_raises(self, monkeypatch):  # page 3 of a 2-page PDF
+        from reed import ReedError, _iter_pdf_pages
+
+        class FakePage:
+            def __init__(self, text):
+                self._text = text
+
+            def extract_text(self):
+                return self._text
+
+        class FakeReader:
+            def __init__(self, path):
+                self.pages = [FakePage("page one"), FakePage("page two")]
+
+        monkeypatch.setattr("reed.PdfReader", FakeReader)
+
+        with pytest.raises(ReedError, match="out of range"):
+            list(_iter_pdf_pages(Path("book.pdf"), "3"))  # list() drives the generator
+
+    def test_pdf_invalid_pages_format_raises(self, monkeypatch):  # non-numeric token "a"
+        from reed import ReedError, _iter_pdf_pages
+
+        class FakePage:
+            def __init__(self, text):
+                self._text = text
+
+            def extract_text(self):
+                return self._text
+
+        class FakeReader:
+            def __init__(self, path):
+                self.pages = [FakePage("page one"), FakePage("page two")]
+
+        monkeypatch.setattr("reed.PdfReader", FakeReader)
+
+        with pytest.raises(ReedError, match="Invalid page selection"):
+            list(_iter_pdf_pages(Path("book.pdf"), "1,a"))
+
+    def test_pages_flag_with_non_pdf_file_raises(self):  # exercises get_text, not the iterator
+        from reed import ReedError, get_text
+
+        txt = io.StringIO("file content")
+        args = _make_args(file="notes.txt", pages="1")
+        with pytest.raises(ReedError, match="only be used with PDF files"):
+            get_text(args, stdin=txt)
+
+
 # ─── main error path tests ───────────────────────────────────────────
 
 
@@ -661,6 +763,15 @@ def failing_run(cmd, **kwargs):
         assert code == 1
         assert "piper exploded" in output
 
+    def test_pages_without_file_returns_1(self):  # --pages without --file is an error
+        code, output = self._capture_main(
+            argv=["--pages", "1"],
+            run=lambda *a, **k: types.SimpleNamespace(returncode=0, stderr=""),
+            stdin=io.StringIO(""),
+        )
+        assert code == 1
+        assert "--pages requires --file <PDF>" in output
+
 
 # ─── _data_dir tests ─────────────────────────────────────────────────
 
diff --git a/uv.lock b/uv.lock
index b026121..7d91391 100644
--- a/uv.lock
+++ b/uv.lock
@@ -325,6 +325,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
 ]
 
+[[package]]
+name = "pypdf"
+version = "6.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/63/3437c4363483f2a04000a48f1cd48c40097f69d580363712fa8b0b4afe45/pypdf-6.7.1.tar.gz", hash = "sha256:6b7a63be5563a0a35d54c6d6b550d75c00b8ccf36384be96365355e296e6b3b0", size = 5302208, upload-time = "2026-02-17T17:00:48.88Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/77/38bd7744bb9e06d465b0c23879e6d2c187d93a383f8fa485c862822bb8a3/pypdf-6.7.1-py3-none-any.whl", hash = "sha256:a02ccbb06463f7c334ce1612e91b3e68a8e827f3cee100b9941771e6066b094e", size = 331048, upload-time = "2026-02-17T17:00:46.991Z" },
+]
+
 [[package]]
 name = "pytest"
 version = "9.0.2"
@@ -375,6 +384,7 @@ dependencies = [
     { name = "pathvalidate" },
     { name = "piper-tts" },
     { name = "prompt-toolkit" },
+    { name = "pypdf" },
     { name = "rich" },
 ]
 
@@ -398,6 +408,7 @@ requires-dist = [
     { name = "pathvalidate" },
     { name = "piper-tts" },
     { name = "prompt-toolkit" },
+    { name = "pypdf" },
     { name = "rich" },
 ]