#!/usr/bin/env python3
# Source: HTMLtoPDF/html2pdf.py at main · dscisci/HTMLtoPDF · GitHub
"""
HTML to PDF Converter
Converts all HTML files in a directory to PDF using WeasyPrint.
"""

import argparse
import sys
from pathlib import Path
from weasyprint import HTML


def convert_html_to_pdf(html_file, output_dir):
    """
    Convert a single HTML file to PDF.

    The PDF is written into ``output_dir`` and named after the source
    file's stem (``page.html`` -> ``page.pdf``). Progress and error
    messages are printed to stdout.

    Args:
        html_file: Path to the HTML file (``str`` or ``pathlib.Path``)
        output_dir: Directory to save the PDF (``str`` or ``pathlib.Path``)

    Returns:
        True if successful, False otherwise
    """
    # Normalise the arguments first: the error handler below reads
    # ``html_file.name``, so a plain string path would otherwise make the
    # handler itself raise instead of returning False.
    html_file = Path(html_file)
    output_dir = Path(output_dir)

    pdf_filename = html_file.stem + '.pdf'
    pdf_path = output_dir / pdf_filename

    print(f"Converting: {html_file.name} -> {pdf_filename}")

    try:
        HTML(filename=str(html_file)).write_pdf(str(pdf_path))
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job,
        # so any rendering/IO failure is reported and the batch continues.
        print(f"  ✗ Error converting {html_file.name}: {e}")
        return False

    print(f"  ✓ Success: {pdf_path}")
    return True


def convert_directory(input_dir, output_dir):
    """
    Convert all HTML files in a directory to PDF.

    Only regular files directly inside ``input_dir`` whose names end in
    ``.html`` or ``.htm`` (compared case-insensitively) are converted;
    sub-directories are not searched. The output directory is created if
    needed, and a summary of successes and failures is printed at the end.

    Exits the process with status 1 if ``input_dir`` does not exist or is
    not a directory.

    Args:
        input_dir: Path to directory containing HTML files
        output_dir: Path to directory for PDF output
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    if not input_path.exists():
        print(f"Error: Input directory '{input_dir}' does not exist.")
        sys.exit(1)

    if not input_path.is_dir():
        print(f"Error: '{input_dir}' is not a directory.")
        sys.exit(1)

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Find all HTML files. The suffix is matched case-insensitively so that
    # e.g. 'REPORT.HTML' is picked up on case-sensitive filesystems too, and
    # directories that merely carry an HTML-looking name are skipped rather
    # than being reported as failed conversions.
    html_files = sorted(
        entry
        for entry in input_path.iterdir()
        if entry.is_file() and entry.name.lower().endswith(('.html', '.htm'))
    )

    if not html_files:
        print(f"No HTML files found in '{input_dir}'")
        return

    print(f"\nFound {len(html_files)} HTML file(s) in '{input_dir}'")
    print(f"Output directory: '{output_dir}'\n")

    # Convert each file
    successful = 0
    failed = 0

    for html_file in html_files:
        if convert_html_to_pdf(html_file, output_path):
            successful += 1
        else:
            failed += 1

    # Summary
    print(f"\n{'='*50}")
    print("Conversion complete!")
    print(f"  Successful: {successful}")
    print(f"  Failed: {failed}")
    print(f"  Total: {len(html_files)}")
    print(f"{'='*50}\n")


def main():
    """
    Command-line entry point.

    Parses the input directory (positional) and the optional output
    directory (``-o`` / ``--output``, default ``output``) from the command
    line, then hands both to ``convert_directory``.
    """
    # Usage examples shown verbatim below the option list in --help output.
    usage_examples = (
        "\n"
        "Examples:\n"
        "  python html2pdf.py input_folder\n"
        "  python html2pdf.py input_folder -o my_pdfs\n"
        "  python html2pdf.py ./html_files --output ./pdfs\n"
        "        "
    )

    cli = argparse.ArgumentParser(
        description='Convert HTML files to PDF using WeasyPrint',
        # Raw formatter keeps the line breaks of the examples intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument('input_dir', help='Directory containing HTML files to convert')
    cli.add_argument(
        '-o',
        '--output',
        default='output',
        help='Output directory for PDF files (default: output)',
    )

    options = cli.parse_args()
    convert_directory(options.input_dir, options.output)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()