conversion_utility/batch_convert.py at master · 8i-labs/conversion_utility · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!/usr/bin/env python3
"""
Batch converter for .8ij files to 10-bit HEVC MP4
Preserves directory structure from source to destination
"""

import sys
import argparse
from pathlib import Path
import subprocess
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

def find_8ij_files(input_dir: Path):
    """Return every ``*.8ij`` path below *input_dir* (recursive), in sorted order"""
    matches = list(input_dir.rglob("*.8ij"))
    matches.sort()
    return matches

def get_output_path(input_file: Path, input_base: Path, output_base: Path) -> Path:
    """Map *input_file* to its mirrored ``.mp4`` location under *output_base*

    The location of *input_file* relative to *input_base* is kept, so the
    source directory layout is reproduced below *output_base*; only the
    file extension is swapped for ``.mp4``.
    """
    mirrored = input_file.relative_to(input_base).with_suffix('.mp4')
    return output_base.joinpath(mirrored)

def convert_file(input_file: Path, output_file: Path, converter_script: Path,
                 python_executable: Path, log_file: Path = None,
                 ffmpeg_path: str = '/opt/8i/bin/ffmpeg',
                 workers: int = 32, batch_size: int = 16, gpu_id: int = 0,
                 job_id: int = 1, total_jobs: int = 1) -> tuple[bool, Path]:
    """
    Convert a single .8ij file to 10-bit HEVC MP4

    Runs ``converter_script`` with ``python_executable`` as a child process
    and waits for it to finish. Errors raised while creating the output
    directory or running the converter are printed and reported through the
    return value rather than propagated to the caller.

    Args:
        input_file: Source .8ij file, passed to the converter as first argument.
        output_file: Destination file; its parent directory is created if needed.
        converter_script: Script to execute for the conversion.
        python_executable: Interpreter used to run ``converter_script``.
        log_file: If given, the child's stdout and stderr are written to this
            file (overwriting it); otherwise the child inherits the console.
        ffmpeg_path: Value forwarded as ``--ffmpeg``.
        workers: Value forwarded as ``--workers``.
        batch_size: Value forwarded as ``--batch``.
        gpu_id: Value forwarded as ``--gpu``.
        job_id: 1-based job number, used only in printed messages.
        total_jobs: Total number of jobs, used only in printed messages.

    Returns:
        Tuple of (success: bool, input_file: Path)
    """
    # Build command
    cmd = [
        str(python_executable),
        str(converter_script),
        str(input_file),
        str(output_file),
        '--bit-depth', '10',
        '--workers', str(workers),
        '--gpu', str(gpu_id),
        '--batch', str(batch_size),
        '--ffmpeg', ffmpeg_path
    ]

    # Show at most the last two directory levels plus the file name. Paths
    # with fewer than three ancestors (e.g. "out/x.mp4") are shown unchanged
    # instead of indexing past the end of ``parents``.
    ancestors = output_file.parents
    if len(ancestors) > 2:
        shown_output = output_file.relative_to(ancestors[2])
    else:
        shown_output = output_file

    print(f"\n{'='*80}")
    print(f"[Job {job_id}/{total_jobs}] Converting: {input_file.name}")
    print(f"Output: {shown_output}")
    print(f"GPU: {gpu_id}")
    if log_file:
        print(f"Log: {log_file.name}")
    print(f"{'='*80}")

    try:
        # Create output directory if needed. Done inside the try so that a
        # filesystem error is reported as a failed job, not an uncaught crash.
        output_file.parent.mkdir(parents=True, exist_ok=True)

        if log_file:
            # Redirect output to log file for parallel jobs
            # start_new_session=True creates a new process group, preventing Ctrl+C propagation
            with open(log_file, 'w') as log:
                result = subprocess.run(cmd, check=True, stdout=log, stderr=subprocess.STDOUT,
                                        start_new_session=True)
        else:
            # Display output directly for sequential jobs
            result = subprocess.run(cmd, check=True, capture_output=False)
        return (result.returncode == 0, input_file)
    except subprocess.CalledProcessError as e:
        # check=True turns a non-zero exit status into this exception
        error_msg = f"\n[Job {job_id}] ERROR: Conversion failed with return code {e.returncode}"
        if log_file:
            error_msg += f"\nSee log: {log_file}"
        print(error_msg)
        return (False, input_file)
    except Exception as e:
        # e.g. interpreter not found, log file or output directory not writable
        error_msg = f"\n[Job {job_id}] ERROR: {e}"
        if log_file:
            error_msg += f"\nSee log: {log_file}"
        print(error_msg)
        return (False, input_file)

def load_progress(progress_file: Path):
    """Load conversion progress from JSON file

    Returns a dict that always contains the list-valued keys "completed" and
    "failed". A missing file yields empty lists. A file that cannot be parsed
    as JSON (for example one left empty by an interrupted write) or that does
    not hold a JSON object is ignored with a warning instead of aborting the
    run, and a missing or non-list key is replaced by an empty list.
    """
    fresh = {"completed": [], "failed": []}
    if not progress_file.exists():
        return fresh

    try:
        with open(progress_file, 'r') as f:
            loaded = json.load(f)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
        print(f"WARNING: Ignoring unreadable progress file {progress_file}: {e}")
        return fresh

    if not isinstance(loaded, dict):
        print(f"WARNING: Ignoring progress file with unexpected content: {progress_file}")
        return fresh

    # Callers index both keys directly, so make sure they are present
    for key in ("completed", "failed"):
        if not isinstance(loaded.get(key), list):
            loaded[key] = []
    return loaded

def save_progress(progress_file: Path, progress):
    """Save conversion progress to JSON file

    The data is first written to a sibling ``<name>.tmp`` file which is then
    renamed over ``progress_file``. If serialisation or writing fails, the
    previously saved progress file is left untouched and the temporary file
    is removed; the original exception is re-raised.
    """
    tmp_file = progress_file.with_name(progress_file.name + '.tmp')
    try:
        with open(tmp_file, 'w') as f:
            json.dump(progress, f, indent=2)
        # Only replace the real file once the new content is completely written
        tmp_file.replace(progress_file)
    finally:
        # No-op after a successful rename; cleans up the partial file on error
        tmp_file.unlink(missing_ok=True)

def _parse_gpu_ids(gpus: str) -> list[int]:
    """Turn a comma-separated GPU list such as "0,1" into integer IDs.

    Blank entries (for example from a trailing comma) are skipped.

    Raises:
        ValueError: If a non-blank entry is not an integer.
    """
    tokens = (part.strip() for part in gpus.split(','))
    return [int(token) for token in tokens if token]

def _log_path_for(input_file: Path, input_base: Path, logs_dir: Path) -> Path:
    """Return the log path for one job, named after the input's relative path.

    The sub-directory components are joined into the file name with "__" so
    that same-named inputs in different folders do not share a log file. An
    input directly inside ``input_base`` still gets ``<stem>.log``.
    """
    relative = input_file.relative_to(input_base).with_suffix('')
    return logs_dir / ('__'.join(relative.parts) + '.log')

def _record_result(progress, input_file: Path, success: bool) -> None:
    """Update the progress dict in place with the outcome of one job.

    Each path is listed at most once, and a file that now succeeded is
    removed from the "failed" list left over from an earlier attempt.
    """
    key = str(input_file)
    if success:
        if key not in progress['completed']:
            progress['completed'].append(key)
        while key in progress['failed']:
            progress['failed'].remove(key)
    elif key not in progress['failed']:
        progress['failed'].append(key)

def main():
    """Command-line entry point: plan and run the batch conversion.

    Parses the arguments, validates them, scans the input directory for
    .8ij files, skips files recorded as completed when --resume is given,
    then converts the remaining files either one at a time or through a
    thread pool of --parallel workers. Progress is saved to
    ``.conversion_progress.json`` in the output directory after every job.

    Returns:
        0 if every planned file was converted, nothing needed converting, or
        --dry-run was given; 1 on a setup error or if any conversion failed.
    """
    parser = argparse.ArgumentParser(
        description='Batch convert .8ij files to 10-bit HEVC MP4 while preserving directory structure',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Sequential conversion (1 file at a time)
  python3 batch_convert.py \\
    /mnt/embla/storage/shows/digitalThings/20250326-Elkan/Plates/S18T/T01/8ij \\
    ~/Desktop/elkon_takes

  # Parallel conversion (2 files at once on same GPU)
  python3 batch_convert.py \\
    /mnt/embla/storage/shows/digitalThings/20250326-Elkan/Plates/S18T/T01/8ij \\
    ~/Desktop/elkon_takes \\
    --parallel 2

  # Parallel conversion using 2 different GPUs
  python3 batch_convert.py \\
    /mnt/embla/storage/shows/digitalThings/20250326-Elkan/Plates/S18T/T01/8ij \\
    ~/Desktop/elkon_takes \\
    --parallel 2 --gpus 0,1

  # Convert to network storage
  python3 batch_convert.py \\
    /mnt/embla/storage/shows/digitalThings/20250326-Elkan/Plates/S18T/T01/8ij \\
    /mnt/storage/alex/elkon/S18T_T01 \\
    --parallel 2
        """
    )

    parser.add_argument('input_dir', type=Path,
                        help='Input directory containing .8ij files')
    parser.add_argument('output_dir', type=Path,
                        help='Output directory for MP4 files')
    parser.add_argument('--converter', type=Path,
                        default=Path(__file__).parent / 'converter_8ij_to_mp4.py',
                        help='Path to converter_8ij_to_mp4.py (default: same directory as this script)')
    parser.add_argument('--workers', type=int, default=32,
                        help='Number of CPU workers for JPEG decoding (default: 32)')
    parser.add_argument('--batch', type=int, default=16,
                        help='Batch size for processing (default: 16)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU device ID, used when --gpus is not given (default: 0)')
    parser.add_argument('--ffmpeg', type=str, default='/opt/8i/bin/ffmpeg',
                        help='Path to ffmpeg binary (default: /opt/8i/bin/ffmpeg)')
    parser.add_argument('--parallel', type=int, default=1,
                        help='Number of files to convert in parallel (default: 1)')
    # Default is None (not '0') so that an omitted --gpus can fall back to --gpu
    parser.add_argument('--gpus', type=str, default=None,
                        help='Comma-separated GPU IDs to use (e.g., "0,1" for 2 GPUs). '
                             'Cycles through list for parallel jobs. (default: the --gpu value)')
    parser.add_argument('--resume', action='store_true',
                        help='Resume from previous run (skip already converted files)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be converted without actually converting')

    args = parser.parse_args()

    # Parse GPU list; an explicit --gpus wins, otherwise use the single --gpu value
    gpu_spec = args.gpus if args.gpus is not None else str(args.gpu)
    try:
        gpu_list = _parse_gpu_ids(gpu_spec)
    except ValueError:
        print(f"ERROR: Invalid GPU list {gpu_spec!r} (expected comma-separated integers)")
        return 1
    if not gpu_list:
        print("ERROR: No GPUs specified")
        return 1

    # Validate parallel setting
    if args.parallel < 1:
        print("ERROR: --parallel must be at least 1")
        return 1

    # Expand ~ and validate input directory
    input_dir = args.input_dir.expanduser()
    if not input_dir.is_dir():
        print(f"ERROR: Input directory does not exist or is not a directory: {input_dir}")
        return 1

    # Validate converter script
    if not args.converter.exists():
        print(f"ERROR: Converter script not found: {args.converter}")
        return 1

    # Find Python executable in venv (same directory as converter)
    venv_dir = args.converter.parent / 'venv'
    if venv_dir.exists():
        python_executable = venv_dir / 'bin' / 'python3'
        if not python_executable.exists():
            python_executable = venv_dir / 'bin' / 'python'
        if not python_executable.exists():
            print(f"ERROR: venv found at {venv_dir} but no python executable")
            return 1
        print(f"Using venv Python: {python_executable}")
    else:
        print("WARNING: No venv found, using system python3")
        python_executable = Path('/usr/bin/python3')

    # Expand ~ in output path
    output_dir = args.output_dir.expanduser()

    # Find all .8ij files
    print(f"Scanning for .8ij files in {input_dir}...")
    input_files = find_8ij_files(input_dir)

    if not input_files:
        print("ERROR: No .8ij files found")
        return 1

    print(f"Found {len(input_files)} .8ij files")

    # Load progress if resuming
    progress_file = output_dir / '.conversion_progress.json'
    progress = load_progress(progress_file) if args.resume else {"completed": [], "failed": []}

    # Calculate what needs to be converted
    files_to_convert = []
    for input_file in input_files:
        output_file = get_output_path(input_file, input_dir, output_dir)

        # Check if already completed
        if str(input_file) in progress['completed']:
            if output_file.exists():
                print(f"SKIP (already converted): {input_file.name}")
                continue
            # File was marked complete but doesn't exist - reconvert
            progress['completed'].remove(str(input_file))

        files_to_convert.append((input_file, output_file))

    if not files_to_convert:
        print("\nAll files already converted!")
        return 0

    total = len(files_to_convert)

    # Per-job logs are only written for parallel runs
    logs_dir = output_dir / '.conversion_logs' if args.parallel > 1 else None

    print(f"\n{'='*80}")
    print("Conversion Plan:")
    print(f"  Input directory:  {input_dir}")
    print(f"  Output directory: {output_dir}")
    print(f"  Files to convert: {total}")
    print(f"  Already complete: {len(progress['completed'])}")
    print(f"  Parallel jobs:    {args.parallel}")
    print(f"  GPUs:             {gpu_list}")
    if logs_dir:
        print(f"  Logs directory:   {logs_dir}")
    print("  Output format:    10-bit HEVC")
    print(f"{'='*80}\n")

    if args.dry_run:
        print("DRY RUN - would convert:")
        for input_file, output_file in files_to_convert:
            print(f"  {input_file.name}")
            print(f"    → {output_file.relative_to(output_dir)}")
        return 0

    # Create directories only now, so that --dry-run leaves the disk untouched
    output_dir.mkdir(parents=True, exist_ok=True)
    if logs_dir:
        logs_dir.mkdir(exist_ok=True)

    # Start conversion
    start_time = datetime.now()
    successful = 0
    failed = 0
    failed_jobs = []  # (input_file, log_file or None) for failures of THIS run
    progress_lock = Lock()

    if args.parallel == 1:
        # Sequential processing (original behavior)
        for idx, (input_file, output_file) in enumerate(files_to_convert, 1):
            print(f"\n[{idx}/{total}] ", end='')

            success, _ = convert_file(
                input_file,
                output_file,
                args.converter,
                python_executable,
                log_file=None,  # No log file for sequential - output goes to console
                ffmpeg_path=args.ffmpeg,
                workers=args.workers,
                batch_size=args.batch,
                gpu_id=gpu_list[0],
                job_id=idx,
                total_jobs=total
            )

            _record_result(progress, input_file, success)
            save_progress(progress_file, progress)

            if success:
                successful += 1
                continue

            failed += 1
            failed_jobs.append((input_file, None))
            print(f"\nWARNING: Failed to convert {input_file.name}")

            # Ask if user wants to continue
            try:
                response = input("Continue with remaining files? (y/n): ")
            except EOFError:
                # stdin is closed (non-interactive run): nobody can answer
                print("\nNo interactive input available - stopping.")
                break
            if response.lower() != 'y':
                break
    else:
        # Parallel processing
        print(f"Starting parallel conversion with {args.parallel} workers...")
        print(f"Individual job outputs will be saved to: {logs_dir}")
        print(f"To monitor a job in another terminal: tail -f {logs_dir}/<filename>.log\n")

        with ThreadPoolExecutor(max_workers=args.parallel) as executor:
            # Submit all jobs
            future_to_job = {}
            for idx, (input_file, output_file) in enumerate(files_to_convert):
                # Cycle through GPUs
                gpu_id = gpu_list[idx % len(gpu_list)]

                # Create unique log file for this job
                log_file = _log_path_for(input_file, input_dir, logs_dir)

                future = executor.submit(
                    convert_file,
                    input_file,
                    output_file,
                    args.converter,
                    python_executable,
                    log_file,
                    args.ffmpeg,
                    args.workers,
                    args.batch,
                    gpu_id,
                    idx + 1,
                    total
                )
                future_to_job[future] = (input_file, log_file)

            # Process completed jobs
            for future in as_completed(future_to_job):
                input_file, log_file = future_to_job[future]

                # Only the job result is guarded; bookkeeping errors must not
                # be mistaken for (and double-counted as) a failed conversion.
                error = None
                try:
                    success, _ = future.result()
                except Exception as e:
                    success, error = False, e

                with progress_lock:
                    _record_result(progress, input_file, success)
                    if success:
                        successful += 1
                        print(f"\n✓ SUCCESS ({successful}/{total}): {input_file.name}")
                    else:
                        failed += 1
                        failed_jobs.append((input_file, log_file))
                        if error is None:
                            print(f"\n✗ FAILED: {input_file.name} (see {log_file.name})")
                        else:
                            print(f"\n✗ EXCEPTION: {input_file.name} - {error} (see {log_file.name})")
                    save_progress(progress_file, progress)

    # Summary
    elapsed = datetime.now() - start_time
    print(f"\n{'='*80}")
    print("Conversion Summary:")
    print(f"  Successful: {successful}")
    print(f"  Failed:     {failed}")
    print(f"  Total time: {elapsed}")
    print(f"  Output dir: {output_dir}")
    if logs_dir:
        print(f"  Logs dir:   {logs_dir}")
    print(f"{'='*80}\n")

    if failed > 0:
        # List only this run's failures, each with the log it actually wrote
        print("Failed files:")
        for input_file, log_file in failed_jobs:
            if log_file is not None and log_file.exists():
                print(f"  - {input_file.name} (see {log_file})")
            else:
                print(f"  - {input_file.name}")

    return 0 if failed == 0 else 1

if __name__ == '__main__':
    # Run the CLI and hand main()'s return value to the shell as exit status
    exit_status = main()
    sys.exit(exit_status)