README.md (112 additions, 1 deletion)

@@ -11,7 +11,12 @@ A Python library for downloading PDF files from webpages with support for recurs
- Download PDF files from a specified webpage
- Recursive crawling with configurable depth (up to 5 levels)
- Merge downloaded PDFs into a single file or save separately
- **Smart merge ordering**: Sort PDFs numerically, alphabetically, or with custom sort keys
- **Automatic deduplication**: Remove duplicate PDF URLs across pages
- **Custom output filenames**: Name your merged PDF files
- **Rich result reporting**: Get detailed download statistics with `ProcessResult`
- **Command-line interface** for quick downloads
- **Quiet/verbose modes**: Control output verbosity with `-q` and `-v` flags
- **robots.txt compliance** for ethical web crawling
- **Custom User-Agent** support
- **Dry-run mode** to preview downloads
@@ -64,6 +69,9 @@ fetcharoo https://example.com
# Download with recursion and merge into one file
fetcharoo https://example.com -d 2 -m

# Merge with custom output filename and numeric sorting
fetcharoo https://example.com -m --output-name "textbook.pdf" --sort-by numeric

# List PDFs without downloading (dry run)
fetcharoo https://example.com --dry-run

@@ -72,6 +80,12 @@ fetcharoo https://example.com -o my_pdfs --delay 1.0 --progress

# Filter PDFs by pattern
fetcharoo https://example.com --include "report*.pdf" --exclude "*draft*"

# Quiet mode (less output) or verbose mode (more output)
fetcharoo https://example.com -q # Quieter
fetcharoo https://example.com -qq # Even quieter
fetcharoo https://example.com -v # More verbose
fetcharoo https://example.com -vv # Debug level
```

### CLI Options
@@ -81,12 +95,16 @@ fetcharoo https://example.com --include "report*.pdf" --exclude "*draft*"
| `-o, --output DIR` | Output directory (default: output) |
| `-d, --depth N` | Recursion depth (default: 0) |
| `-m, --merge` | Merge all PDFs into a single file |
| `--output-name FILENAME` | Custom filename for merged PDF (with `--merge`) |
| `--sort-by STRATEGY` | Sort PDFs before merging: `numeric`, `alpha`, `alpha_desc`, `none` |
| `--dry-run` | List PDFs without downloading |
| `--delay SECONDS` | Delay between requests (default: 0.5) |
| `--timeout SECONDS` | Request timeout (default: 30) |
| `--user-agent STRING` | Custom User-Agent string |
| `--respect-robots` | Respect robots.txt rules |
| `--progress` | Show progress bars |
| `-q, --quiet` | Reduce output verbosity (use `-qq` for even quieter) |
| `-v, --verbose` | Increase output verbosity (use `-vv` for debug) |
| `--include PATTERN` | Include PDFs matching pattern |
| `--exclude PATTERN` | Exclude PDFs matching pattern |
| `--min-size BYTES` | Minimum PDF size |
@@ -207,12 +225,62 @@ download_pdfs_from_webpage(
)
```

### Sorting and Merging

```python
from fetcharoo import download_pdfs_from_webpage

# Merge chapters in numeric order (chapter_1.pdf, chapter_2.pdf, chapter_10.pdf)
download_pdfs_from_webpage(
    url='https://example.com/book',
    mode='merge',
    write_dir='output',
    sort_by='numeric',
    output_name='complete_book.pdf'
)

# Custom sort key function
from fetcharoo import process_pdfs, find_pdfs_from_webpage

pdf_urls = find_pdfs_from_webpage('https://example.com')
process_pdfs(
    pdf_urls,
    write_dir='output',
    mode='merge',
    sort_key=lambda url: url.split('/')[-1]  # Sort by filename
)
```

### Using ProcessResult

```python
from fetcharoo import download_pdfs_from_webpage

# Get detailed results from download operation
result = download_pdfs_from_webpage(
    url='https://example.com',
    mode='separate',
    write_dir='output'
)

# ProcessResult provides detailed information
print(f"Success: {result.success}")
print(f"Downloaded: {result.downloaded_count}")
print(f"Failed: {result.failed_count}")
print(f"Files created: {result.files_created}")
print(f"Errors: {result.errors}")

# ProcessResult is truthy when successful
if result:
    print("Download completed!")
```

### Finding PDFs Without Downloading

```python
from fetcharoo import find_pdfs_from_webpage

# Just get the list of PDF URLs (deduplicated by default)
pdf_urls = find_pdfs_from_webpage(
    url='https://example.com',
    recursion_depth=1
@@ -257,15 +325,58 @@ Main function to find and download PDFs from a webpage.
| `dry_run` | bool | False | Preview URLs without downloading |
| `show_progress` | bool | False | Show progress bars |
| `filter_config` | FilterConfig | None | PDF filtering configuration |
| `sort_by` | str | None | Sort strategy: 'numeric', 'alpha', 'alpha_desc', 'none' |
| `sort_key` | callable | None | Custom sort key function |
| `output_name` | str | None | Custom filename for merged PDF |

**Returns:** `ProcessResult` object with download statistics, or dict in dry-run mode.

### `find_pdfs_from_webpage()`

Find PDF URLs without downloading.

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `url` | str | required | The webpage URL to search |
| `recursion_depth` | int | 0 | How many levels of links to follow |
| `deduplicate` | bool | True | Remove duplicate PDF URLs |
| ... | | | (plus other parameters from above) |
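
A short sketch of turning deduplication off; the target URL and depth here are illustrative:

```python
from fetcharoo import find_pdfs_from_webpage

# Keep every occurrence of each PDF link, even if it appears on multiple pages
pdf_urls = find_pdfs_from_webpage(
    url='https://example.com',
    recursion_depth=2,
    deduplicate=False
)
print(f"Found {len(pdf_urls)} PDF links (including duplicates)")
```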

### `process_pdfs()`

Download and save a list of PDF URLs.

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `pdf_links` | list | required | List of PDF URLs to download |
| `write_dir` | str | required | Output directory |
| `mode` | str | 'separate' | 'merge' or 'separate' |
| `sort_by` | str | None | Sort strategy for merging |
| `sort_key` | callable | None | Custom sort key function |
| `output_name` | str | None | Custom merged filename |

**Returns:** `ProcessResult` object with download statistics.
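
A minimal sketch combining `find_pdfs_from_webpage()` with `process_pdfs()`; the URL and the chosen sort settings are illustrative:

```python
from fetcharoo import find_pdfs_from_webpage, process_pdfs

# Collect PDF links first, then merge them alphabetically into a single file
pdf_urls = find_pdfs_from_webpage('https://example.com/reports', recursion_depth=1)
result = process_pdfs(
    pdf_urls,
    write_dir='output',
    mode='merge',
    sort_by='alpha',
    output_name='all_reports.pdf'
)
print(f"Merged {result.downloaded_count} PDFs into {result.files_created}")
```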

### `ProcessResult`

Dataclass returned by download operations:

```python
from fetcharoo import ProcessResult

# Attributes:
result.success # bool: True if any PDFs were processed
result.files_created # List[str]: Paths to created files
result.downloaded_count # int: Number of successful downloads
result.filtered_count # int: Number of PDFs filtered out
result.failed_count # int: Number of failed downloads
result.errors # List[str]: Error messages

# ProcessResult is truthy when successful:
if result:
    print("Success!")
```

### `FilterConfig`

Configuration for PDF filtering:

fetcharoo/__init__.py (6 additions, 0 deletions)

@@ -15,6 +15,8 @@
    check_robots_txt,
    set_default_user_agent,
    get_default_user_agent,
    SORT_BY_OPTIONS,
    ProcessResult,
)
from fetcharoo.pdf_utils import merge_pdfs, save_pdf_to_file
from fetcharoo.downloader import download_pdf
@@ -50,6 +52,10 @@
    # User-Agent customization
    "set_default_user_agent",
    "get_default_user_agent",
    # Sorting
    "SORT_BY_OPTIONS",
    # Result types
    "ProcessResult",
    # Filtering
    "FilterConfig",
    "matches_filename_pattern",

fetcharoo/cli.py (75 additions, 1 deletion)

@@ -6,6 +6,7 @@
"""

import argparse
import logging
import sys
from typing import Optional

@@ -17,6 +18,41 @@
from fetcharoo.filtering import FilterConfig


def configure_logging(quiet: int, verbose: int) -> None:
    """
    Configure logging level based on quiet/verbose flags.

    Args:
        quiet: Number of -q flags (0, 1, or 2+)
        verbose: Number of -v flags (0, 1, or 2+)
    """
    # Get the fetcharoo logger
    logger = logging.getLogger('fetcharoo')

    # Calculate effective verbosity level
    # Default is WARNING, -q moves toward ERROR/CRITICAL, -v moves toward INFO/DEBUG
    verbosity = verbose - quiet

    if verbosity <= -2:
        level = logging.CRITICAL
    elif verbosity == -1:
        level = logging.ERROR
    elif verbosity == 0:
        level = logging.WARNING
    elif verbosity == 1:
        level = logging.INFO
    else:  # verbosity >= 2
        level = logging.DEBUG

    # Configure handler if needed
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        logger.addHandler(handler)

    logger.setLevel(level)


def create_parser() -> argparse.ArgumentParser:
"""
Create and configure the argument parser for the CLI.
@@ -74,6 +110,13 @@ def create_parser() -> argparse.ArgumentParser:
        help='merge all PDFs into a single file'
    )

    parser.add_argument(
        '--output-name',
        type=str,
        metavar='FILENAME',
        help='custom filename for merged PDF (only used with --merge)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
@@ -115,6 +158,32 @@ def create_parser() -> argparse.ArgumentParser:
        help='show progress bars during download'
    )

    # Verbosity options
    parser.add_argument(
        '-q', '--quiet',
        action='count',
        default=0,
        help='reduce output verbosity (use -qq for even quieter)'
    )

    parser.add_argument(
        '-v', '--verbose',
        action='count',
        default=0,
        help='increase output verbosity (use -vv for debug level)'
    )

    # Sorting options
    parser.add_argument(
        '--sort-by',
        type=str,
        choices=['none', 'numeric', 'alpha', 'alpha_desc'],
        default=None,
        metavar='STRATEGY',
        help='sort PDFs before merging: numeric (by numbers in filename), alpha (alphabetical), '
             'alpha_desc (reverse alphabetical), none (default, preserves discovery order)'
    )

    # Filtering options
    parser.add_argument(
        '--include',
@@ -164,6 +233,9 @@ def main(argv: Optional[list] = None) -> int:
    # Parse arguments
    args = parser.parse_args(argv)

    # Configure logging based on verbosity flags
    configure_logging(args.quiet, args.verbose)

    # Set custom user agent if provided
    if args.user_agent:
        set_default_user_agent(args.user_agent)
@@ -231,7 +303,9 @@ def main(argv: Optional[list] = None) -> int:
        user_agent=args.user_agent,
        dry_run=False,
        show_progress=args.progress,
        filter_config=filter_config,
        sort_by=args.sort_by,
        output_name=args.output_name
    )

    if success: