translate/process_gui.py at main · crayxt/translate · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3

from __future__ import annotations

from datetime import datetime
import os
import queue
import re
import subprocess
import sys
import threading
import tkinter as tk
from dataclasses import dataclass
from tkinter import filedialog, messagebox, ttk
from tkinter.scrolledtext import ScrolledText
from types import SimpleNamespace
from typing import TextIO

from core.formats import FileKind, detect_file_kind
from core.providers import (
    DEFAULT_PROVIDER as DEFAULT_PROVIDER_SPEC,
    DEFAULT_PROVIDER_NAME,
    SUPPORTED_TRANSLATION_PROVIDERS,
    get_translation_provider,
)
from core.resources import detect_default_text_resource
from core.runtime import DEFAULT_BATCH_SIZE, DEFAULT_PARALLEL_REQUESTS
from core.task_cli import (
    DEFAULT_GEMINI_BACKEND,
    GEMINI_BACKEND_CHOICES,
    apply_provider_environment_from_args,
    resolve_provider_model,
)
from core.term_extraction import validate_max_length
from tasks import check_translations as check_task
from tasks import extract_terms as extract_task
from tasks import extract_terms_local as extract_local_task
from tasks import revise_translations as revise_task
from tasks import translate as translate_task


# Default values used by the GUI configuration dataclasses defined in this module.
DEFAULT_SOURCE_LANG = "en"
DEFAULT_TARGET_LANG = "kk"
DEFAULT_PROVIDER = DEFAULT_PROVIDER_NAME
SUPPORTED_PROVIDER_CHOICES = tuple(sorted(SUPPORTED_TRANSLATION_PROVIDERS))
DEFAULT_MODEL = DEFAULT_PROVIDER_SPEC.default_model
# Numeric defaults are kept as text because the config dataclasses store them as strings.
DEFAULT_BATCH_SIZE_TEXT = str(DEFAULT_BATCH_SIZE)
DEFAULT_PARALLEL_REQUESTS_TEXT = str(DEFAULT_PARALLEL_REQUESTS)
# The empty string is an accepted thinking level (the value is optional).
THINKING_LEVEL_CHOICES = ("", "minimal", "low", "medium", "high")
EXTRACT_MODE_CHOICES = ("missing", "all")
EXTRACT_OUTPUT_CHOICES = ("po", "json")
LOCAL_EXTRACT_MAX_LENGTH_CHOICES = ("1", "2", "3")
# Patterns consumed by parse_progress_percent():
#   "Progress: 42%" / "Progress: 42.5%"
PROGRESS_PERCENT_RE = re.compile(r"Progress:\s*(?P<pct>\d+(?:\.\d+)?)%")
#   "Progress: completed batches 3/10"
BATCH_PROGRESS_RE = re.compile(
    r"Progress:\s*completed batches\s+(?P<done>\d+)/(?P<total>\d+)"
)
# The *_FILETYPES lists below are (label, glob patterns) pairs.
# NOTE(review): presumably passed as ``filetypes`` to tkinter file dialogs — confirm at call sites.
TRANSLATABLE_FILETYPES = [
    ("Translatable files", "*.po *.xlf *.xliff *.ts *.resx *.strings *.txt *.xml"),
    ("PO files", "*.po"),
    ("XLIFF files", "*.xlf *.xliff"),
    ("Qt TS files", "*.ts"),
    ("RESX files", "*.resx"),
    ("Apple strings files", "*.strings"),
    ("Plain text files", "*.txt"),
    ("Android XML files", "*.xml"),
    ("All files", "*.*"),
]
VOCAB_FILETYPES = [
    ("Vocabulary files", "*.txt *.po *.tbx"),
    ("Text files", "*.txt"),
    ("PO files", "*.po"),
    ("TBX files", "*.tbx"),
    ("All files", "*.*"),
]
RULES_FILETYPES = [
    ("Markdown files", "*.md"),
    ("Text files", "*.txt"),
    ("All files", "*.*"),
]
CHECK_FILETYPES = [
    ("Checkable files", "*.po *.xlf *.xliff *.ts"),
    ("PO files", "*.po"),
    ("XLIFF files", "*.xlf *.xliff"),
    ("Qt TS files", "*.ts"),
    ("All files", "*.*"),
]
# Same as the source-only list below, plus *.json.
LOCAL_EXTRACT_FILETYPES = [
    ("Supported local extract files", "*.po *.xlf *.xliff *.ts *.resx *.strings *.txt *.xml *.json"),
    ("Translatable files", "*.po *.xlf *.xliff *.ts *.resx *.strings *.txt *.xml"),
    ("XLIFF files", "*.xlf *.xliff"),
    ("Android XML files", "*.xml"),
    ("JSON files", "*.json"),
    ("All files", "*.*"),
]
# Returned by get_local_extract_file_dialog_config() when not in JSON-to-PO mode.
LOCAL_EXTRACT_SOURCE_FILETYPES = [
    ("Supported local extract files", "*.po *.xlf *.xliff *.ts *.resx *.strings *.txt *.xml"),
    ("Translatable files", "*.po *.xlf *.xliff *.ts *.resx *.strings *.txt *.xml"),
    ("XLIFF files", "*.xlf *.xliff"),
    ("Android XML files", "*.xml"),
    ("All files", "*.*"),
]
# Returned by get_local_extract_file_dialog_config() in JSON-to-PO mode.
JSON_FILETYPES = [
    ("JSON files", "*.json"),
    ("All files", "*.*"),
]
# Sub-directory of the resource root where build_log_dir() places run logs.
LOG_DIR_NAME = "logs"
# Widget class names for which widget_supports_clipboard() returns True.
CLIPBOARD_WIDGET_CLASSES = frozenset({"Entry", "TEntry", "Text", "Combobox", "TCombobox"})
# Widget states that widget_is_editable() treats as non-editable.
READONLY_STATES = frozenset({"disabled", "readonly"})


@dataclass(slots=True)
class ProcessGuiConfig:
    """Settings for a translation ("process") run.

    Instances are checked by ``validate_process_gui_config``. Numeric settings
    (``batch_size``, ``parallel_requests``) are stored as text and parsed
    during validation.
    """

    # Single input path; used by resolve_process_input_files() only when
    # ``input_files`` yields no non-blank entries.
    input_file: str = ""
    # Explicit multi-selection; takes precedence over ``input_file``.
    input_files: tuple[str, ...] = ()
    # Required by validation when the single input is detected as Android XML.
    source_file: str = ""
    source_lang: str = DEFAULT_SOURCE_LANG
    target_lang: str = DEFAULT_TARGET_LANG
    provider: str = DEFAULT_PROVIDER
    gemini_backend: str = DEFAULT_GEMINI_BACKEND
    google_cloud_location: str = "global"
    model: str = DEFAULT_MODEL
    # Empty string means "not set"; otherwise one of THINKING_LEVEL_CHOICES.
    thinking_level: str = ""
    batch_size: str = DEFAULT_BATCH_SIZE_TEXT
    parallel_requests: str = DEFAULT_PARALLEL_REQUESTS_TEXT
    # May point at a file or a directory (see _validate_base_config).
    vocab_path: str = ""
    rules_path: str = ""
    rules_str: str = ""
    # May stay empty when the provider's API-key environment variable is set.
    api_key: str = ""
    flex_mode: bool = False
    retranslate_all: bool = False
    warnings_report: bool = False

@dataclass(slots=True)
class ExtractGuiConfig:
    """Settings for a model-based term extraction run.

    Instances are checked by ``validate_extract_gui_config``. Numeric settings
    are stored as text and parsed during validation.
    """

    input_file: str = ""
    source_lang: str = DEFAULT_SOURCE_LANG
    target_lang: str = DEFAULT_TARGET_LANG
    provider: str = DEFAULT_PROVIDER
    gemini_backend: str = DEFAULT_GEMINI_BACKEND
    google_cloud_location: str = "global"
    model: str = DEFAULT_MODEL
    # Empty string means "not set"; otherwise one of THINKING_LEVEL_CHOICES.
    thinking_level: str = ""
    batch_size: str = DEFAULT_BATCH_SIZE_TEXT
    parallel_requests: str = DEFAULT_PARALLEL_REQUESTS_TEXT
    vocab_path: str = ""
    # May stay empty when the provider's API-key environment variable is set.
    api_key: str = ""
    flex_mode: bool = False
    # Validated against EXTRACT_MODE_CHOICES.
    mode: str = "missing"
    # Validated against EXTRACT_OUTPUT_CHOICES.
    out_format: str = "po"
    out_path: str = ""
    # Optional positive integers, kept as text.
    max_terms_per_batch: str = "80"
    max_attempts: str = "5"


@dataclass(slots=True)
class CheckGuiConfig:
    """Settings for a translation check run.

    Instances are checked by ``validate_check_gui_config``. Numeric settings
    are stored as text and parsed during validation.
    """

    input_file: str = ""
    source_lang: str = DEFAULT_SOURCE_LANG
    target_lang: str = DEFAULT_TARGET_LANG
    provider: str = DEFAULT_PROVIDER
    gemini_backend: str = DEFAULT_GEMINI_BACKEND
    google_cloud_location: str = "global"
    model: str = DEFAULT_MODEL
    # Empty string means "not set"; otherwise one of THINKING_LEVEL_CHOICES.
    thinking_level: str = ""
    batch_size: str = DEFAULT_BATCH_SIZE_TEXT
    parallel_requests: str = DEFAULT_PARALLEL_REQUESTS_TEXT
    vocab_path: str = ""
    rules_path: str = ""
    rules_str: str = ""
    # May stay empty when the provider's API-key environment variable is set.
    api_key: str = ""
    flex_mode: bool = False
    # Optional positive integer ("Probe / num messages" in validation errors).
    num_messages: str = ""
    out_path: str = ""
    include_ok: bool = False
    # Optional positive integer, kept as text.
    max_attempts: str = "5"


@dataclass(slots=True)
class LocalExtractGuiConfig:
    """Settings for local (non-model) term extraction or JSON-to-PO conversion.

    Instances are checked by ``validate_local_extract_gui_config``; the
    ``to_po`` flag selects which set of rules applies.
    """

    # Extraction mode: a file or directory. JSON-to-PO mode: a .json file.
    input_file: str = ""
    source_lang: str = DEFAULT_SOURCE_LANG
    target_lang: str = DEFAULT_TARGET_LANG
    vocab_path: str = ""
    # Validated against EXTRACT_MODE_CHOICES in extraction mode.
    mode: str = "missing"
    # Kept as text; validation requires 1, 2, or 3.
    max_length: str = "1"
    # Validation expects a .json suffix in extraction mode and .po in JSON-to-PO mode.
    out_path: str = ""
    include_rejected: bool = False
    # Switches validation to the JSON-to-PO rules.
    to_po: bool = False
    # Rejected by validation when combined with ``to_po``.
    also_po: bool = False
    include_borderline: bool = False


@dataclass(slots=True)
class ReviseGuiConfig:
    """Settings for a translation revision run.

    Instances are checked by ``validate_revise_gui_config``. Numeric settings
    are stored as text and parsed during validation.
    """

    input_file: str = ""
    # Required by validation for Android .xml, .strings, .resx, and .txt inputs.
    source_file: str = ""
    source_lang: str = DEFAULT_SOURCE_LANG
    target_lang: str = DEFAULT_TARGET_LANG
    provider: str = DEFAULT_PROVIDER
    gemini_backend: str = DEFAULT_GEMINI_BACKEND
    google_cloud_location: str = "global"
    model: str = DEFAULT_MODEL
    # Empty string means "not set"; otherwise one of THINKING_LEVEL_CHOICES.
    thinking_level: str = ""
    batch_size: str = DEFAULT_BATCH_SIZE_TEXT
    parallel_requests: str = DEFAULT_PARALLEL_REQUESTS_TEXT
    vocab_path: str = ""
    rules_path: str = ""
    rules_str: str = ""
    # May stay empty when the provider's API-key environment variable is set.
    api_key: str = ""
    flex_mode: bool = False
    # Must be non-blank to pass validation.
    instruction: str = ""
    # Optional positive integer ("Probe / num messages" in validation errors).
    num_messages: str = ""
    # Cannot be combined with ``in_place``.
    out_path: str = ""
    max_attempts: str = "5"
    in_place: bool = False
    dry_run: bool = False


def _clean(value: str) -> str:
    """Return *value* as a whitespace-stripped string; falsy input becomes ``""``."""
    text = value if value else ""
    return str(text).strip()


def _clamp_percent(value: float) -> float:
    """Coerce *value* to ``float`` and clamp it into the range 0.0 to 100.0."""
    capped = min(100.0, float(value))
    return max(0.0, capped)


def widget_supports_clipboard(widget_class: str) -> bool:
    """Tell whether *widget_class* is listed in ``CLIPBOARD_WIDGET_CLASSES``."""
    class_name = str(widget_class or "")
    return class_name in CLIPBOARD_WIDGET_CLASSES


def widget_is_editable(widget_class: str, state: str) -> bool:
    """Tell whether a clipboard-capable widget is in a state that accepts edits.

    Returns ``False`` for widget classes without clipboard support and for
    states listed in ``READONLY_STATES``.
    """
    if not widget_supports_clipboard(widget_class):
        return False
    state_name = str(state or "")
    return state_name not in READONLY_STATES


def path_exists_as_file_or_dir(path: str) -> bool:
    """Tell whether the stripped *path* names an existing file or directory.

    A blank path is never considered to exist.
    """
    candidate = _clean(path)
    if not candidate:
        return False
    return os.path.isfile(candidate) or os.path.isdir(candidate)


def _validate_optional_positive_int(value: str, flag_name: str) -> str | None:
    """Validate an optional positive integer supplied as text.

    Returns ``None`` when *value* is blank, otherwise the integer rendered
    back to a string.

    Raises:
        ValueError: if the text is not an integer or is not greater than 0;
            the message is prefixed with *flag_name*.
    """
    text = _clean(value)
    if text == "":
        return None

    try:
        number = int(text)
    except ValueError as exc:
        raise ValueError(f"{flag_name} must be a whole number.") from exc

    if number < 1:
        raise ValueError(f"{flag_name} must be greater than 0.")
    return str(number)


def _validate_choice(value: str, choices: tuple[str, ...], flag_name: str) -> None:
    """Ensure a non-blank *value* is one of *choices*.

    Blank values are accepted. The error message lists only the non-empty
    choices.

    Raises:
        ValueError: if the stripped value is non-blank and not in *choices*.
    """
    candidate = _clean(value)
    if not candidate or candidate in choices:
        return
    allowed = ", ".join(filter(None, choices))
    raise ValueError(f"{flag_name} must be one of: {allowed}.")


def build_resource_root(base_dir: str | None = None) -> str:
    return os.path.abspath(base_dir or os.path.dirname(__file__))


def build_script_path(script_name: str, base_dir: str | None = None) -> str:
    """Return the path of *script_name* inside the resource root."""
    root_dir = build_resource_root(base_dir)
    return os.path.join(root_dir, script_name)


def build_cli_script_path(base_dir: str | None = None) -> str:
    """Return the path of ``translate_cli.py`` inside the resource root."""
    cli_script = "translate_cli.py"
    return build_script_path(cli_script, base_dir=base_dir)


def _sanitize_log_name(value: str) -> str:
    """Turn *value* into a token that is safe to embed in a log file name.

    Runs of characters outside ``A-Za-z0-9._-`` collapse to a single ``-``,
    and leading/trailing ``.``, ``_`` and ``-`` are trimmed. Falls back to
    ``"run"`` when nothing is left.
    """
    raw = str(value or "").strip()
    collapsed = re.sub(r"[^A-Za-z0-9._-]+", "-", raw)
    trimmed = collapsed.strip("._-")
    if trimmed:
        return trimmed
    return "run"


def build_log_dir(base_dir: str | None = None) -> str:
    """Return the ``LOG_DIR_NAME`` directory path inside the resource root."""
    resource_root = build_resource_root(base_dir)
    return os.path.join(resource_root, LOG_DIR_NAME)


def build_run_log_path(
    tool_key: str,
    input_file: str,
    base_dir: str | None = None,
    now: datetime | None = None,
) -> str:
    """Build the log file path for one tool run.

    The file name has the form ``<tool>-<input stem>-<YYYYmmdd-HHMMSS>.log``,
    with the tool key and input stem sanitized, and it lives in the log
    directory. *now* defaults to the current local time.
    """
    moment = now if now else datetime.now()
    stamp = moment.strftime("%Y%m%d-%H%M%S")
    base_name = os.path.basename(_clean(input_file))
    input_stem, _extension = os.path.splitext(base_name)
    tool_part = _sanitize_log_name(tool_key)
    stem_part = _sanitize_log_name(input_stem)
    log_name = f"{tool_part}-{stem_part}-{stamp}.log"
    return os.path.join(build_log_dir(base_dir), log_name)


def detect_default_resource_path(
    prefix: str,
    extension: str,
    target_lang: str,
    base_dir: str | None = None,
) -> str:
    """Look up a default resource via ``detect_default_text_resource``.

    The lookup runs against the resource root, and directories are allowed
    only for the ``"vocab"`` prefix. Returns ``""`` when the lookup yields a
    falsy result.
    """
    resource_root = build_resource_root(base_dir)
    detected = detect_default_text_resource(
        prefix,
        extension,
        target_lang,
        base_dir=resource_root,
        allow_directory=(prefix == "vocab"),
    )
    return detected or ""


def detect_default_resource_paths(
    target_lang: str,
    base_dir: str | None = None,
) -> tuple[str, str]:
    """Return the default ``(vocab, rules)`` resource paths for *target_lang*.

    The vocabulary lookup uses the ``"vocab"``/``"txt"`` pair and the rules
    lookup uses ``"rules"``/``"md"``.
    """
    vocab_default = detect_default_resource_path("vocab", "txt", target_lang, base_dir=base_dir)
    rules_default = detect_default_resource_path("rules", "md", target_lang, base_dir=base_dir)
    return vocab_default, rules_default


def build_system_prompt_preview(tool_key: str, target_lang: str) -> str:
    normalized_tool = _clean(tool_key).lower()
    resolved_target_lang = _clean(target_lang) or DEFAULT_TARGET_LANG

    if normalized_tool in {"process", "translate"}:
        return translate_task.SYSTEM_INSTRUCTION.strip()
    if normalized_tool == "extract":
        return extract_task.build_term_system_instruction(resolved_target_lang)
    if normalized_tool == "extract_local":
        return (
            "No model system prompt is used for this task.\n\n"
            "Local term discovery runs deterministic source-side extraction using:\n"
            "- core/term_extraction.py for normalization, tokenization, filtering, evidence collection, and scoring\n"
            "- core/term_handoff.py for JSON report shaping and JSON-to-PO conversion\n"
            "- data/extract/... resource files for stopwords, low-value words, allowlists, and excluded terms\n"
            "- the approved vocabulary to filter already-known terms in missing mode\n\n"
            "The task can scan one source file or a whole source directory tree.\n"
            "It can also convert a local extraction JSON report into a translation-ready PO glossary handoff."
        )
    if normalized_tool == "check":
        return check_task.build_check_system_instruction(resolved_target_lang)
    if normalized_tool == "revise":
        return revise_task.build_revision_system_instruction(resolved_target_lang)
    return "System prompt preview unavailable."


def choose_resource_field_value(
    current_value: str,
    previous_auto_value: str,
    new_auto_value: str,
    force: bool = False,
) -> tuple[str, str]:
    """Decide whether an auto-detected value should replace a field's content.

    Returns ``(field value, remembered auto value)``. The new auto value is
    adopted when *force* is set, when the field is blank, or when the field
    still holds the previous auto value. Otherwise the current content and
    the previous auto value are kept.
    """
    current = _clean(current_value)
    previous_auto = _clean(previous_auto_value)
    fresh_auto = _clean(new_auto_value)

    # The field counts as user-edited when it is non-blank and differs from
    # what was auto-filled last time.
    user_edited = bool(current) and current != previous_auto
    if user_edited and not force:
        return current, previous_auto

    return fresh_auto, fresh_auto


def read_text_file_or_empty(path: str) -> str:
    """Read *path* as UTF-8 text, or return ``""`` for a blank or non-file path.

    Undecodable bytes are replaced rather than raising.
    """
    file_path = _clean(path)
    if file_path and os.path.isfile(file_path):
        with open(file_path, "r", encoding="utf-8", errors="replace") as stream:
            return stream.read()
    return ""


def parse_progress_percent(line: str) -> float | None:
    """Extract a completion percentage from one line of tool output.

    Recognized forms, checked in this order:
      * ``Progress: <number>%``: the number, clamped to 0-100;
      * ``Progress: completed batches <done>/<total>``: the ratio as a
        percentage, or ``None`` when the total is not positive;
      * a line ending in ``" complete."``: 100.0.

    Returns ``None`` for blank or unrecognized lines.
    """
    stripped = str(line or "").strip()
    if not stripped:
        return None

    if (percent_hit := PROGRESS_PERCENT_RE.search(stripped)) is not None:
        return _clamp_percent(float(percent_hit.group("pct")))

    if (batch_hit := BATCH_PROGRESS_RE.search(stripped)) is not None:
        batch_total = int(batch_hit["total"])
        if batch_total <= 0:
            return None
        batch_done = int(batch_hit["done"])
        return _clamp_percent(batch_done / batch_total * 100.0)

    return 100.0 if stripped.endswith(" complete.") else None


def summarize_input_files(file_paths: list[str] | tuple[str, ...]) -> str:
    """Summarize a file selection as a short display string.

    Blank entries are ignored. Returns ``""`` for no files, the path itself
    for one file, and ``"<first> (+N more)"`` for several.
    """
    kept = [cleaned for cleaned in map(_clean, file_paths) if cleaned]
    if not kept:
        return ""
    first, *others = kept
    if not others:
        return first
    return f"{first} (+{len(others)} more)"


def summarize_recursive_input_folder(folder_path: str, file_count: int) -> str:
    """Describe a folder selection as ``"<folder> (<count> recursive file(s))"``.

    Returns ``""`` when the folder path is blank.
    """
    folder = _clean(folder_path)
    if folder == "":
        return ""
    noun = "files" if file_count != 1 else "file"
    return f"{folder} ({file_count} recursive {noun})"


def get_local_extract_file_dialog_config(to_po_mode: bool) -> tuple[str, list[tuple[str, str]]]:
    """Return the ``(title, file type list)`` pair for the local-extract input picker.

    JSON-to-PO mode selects a JSON report; otherwise a source file is selected.
    """
    if not to_po_mode:
        return "Select source file for local extraction", LOCAL_EXTRACT_SOURCE_FILETYPES
    return "Select local extraction JSON file", JSON_FILETYPES


def resolve_process_input_files(config: ProcessGuiConfig) -> list[str]:
    """Return the stripped input paths a process run should operate on.

    Non-blank entries of ``config.input_files`` win. Otherwise the single
    ``config.input_file`` is used, and the result is empty when that is
    blank too.
    """
    selected = [cleaned for cleaned in map(_clean, config.input_files) if cleaned]
    if selected:
        return selected
    single = _clean(config.input_file)
    if single:
        return [single]
    return []


def _validate_base_config(
    *,
    input_file: str,
    input_files: list[str] | None = None,
    source_lang: str,
    target_lang: str,
    provider: str,
    gemini_backend: str,
    google_cloud_location: str,
    model: str,
    thinking_level: str,
    batch_size: str,
    parallel_requests: str,
    vocab_path: str,
    api_key: str,
    environ: dict[str, str] | None = None,
    rules_path: str = "",
    allow_input_directories: bool = False,
) -> list[str]:
    """Run the checks shared by every model-backed tool and collect the errors.

    Nothing is raised for invalid settings; each problem becomes one message
    in the returned list (empty when everything is valid).

    Args:
        input_file: Single input path; used only when *input_files* has no
            non-blank entries.
        input_files: Optional list of input paths; takes precedence.
        source_lang, target_lang: Must be non-blank.
        provider: Must be non-blank and accepted by
            ``get_translation_provider``.
        gemini_backend, google_cloud_location: Checked only when the provider
            is ``"gemini"``.
        model: Accepted for signature parity with the config objects; it is
            not validated here.
        thinking_level: Blank, or one of ``THINKING_LEVEL_CHOICES``.
        batch_size, parallel_requests: Blank, or a positive integer as text.
        vocab_path: Blank, or an existing file or directory.
        api_key: Key typed into the GUI; may be blank when the provider's
            environment variable is set.
        environ: Mapping searched for the API-key variable; defaults to
            ``os.environ``.
        rules_path: Blank, or an existing file.
        allow_input_directories: When true, inputs may be directories.
    """
    env = environ if environ is not None else os.environ
    errors: list[str] = []

    cleaned_input = _clean(input_file)
    cleaned_input_files = [cleaned for cleaned in map(_clean, input_files or []) if cleaned]
    cleaned_source = _clean(source_lang)
    cleaned_target = _clean(target_lang)
    cleaned_provider = _clean(provider)
    cleaned_gemini_backend = _clean(gemini_backend).lower()
    cleaned_google_cloud_location = _clean(google_cloud_location)
    cleaned_vocab = _clean(vocab_path)
    cleaned_rules = _clean(rules_path)
    cleaned_api_key = _clean(api_key)

    # The explicit list wins; the single path is only a fallback. Both go
    # through the same existence check.
    candidate_inputs = cleaned_input_files or ([cleaned_input] if cleaned_input else [])
    if not candidate_inputs:
        errors.append("Input file is required.")
    for candidate in candidate_inputs:
        if allow_input_directories:
            if not path_exists_as_file_or_dir(candidate):
                errors.append(f"Input file or directory does not exist: {candidate}")
        elif not os.path.isfile(candidate):
            errors.append(f"Input file does not exist: {candidate}")

    if not cleaned_source:
        errors.append("Source language is required.")

    if not cleaned_target:
        errors.append("Target language is required.")

    # Resolve the provider once; the spec is reused for the API-key check below.
    provider_spec = None
    if not cleaned_provider:
        errors.append("Provider is required.")
    else:
        try:
            provider_spec = get_translation_provider(cleaned_provider)
        except ValueError as exc:
            errors.append(str(exc))

    if cleaned_provider == "gemini":
        try:
            _validate_choice(cleaned_gemini_backend, GEMINI_BACKEND_CHOICES, "Gemini backend")
        except ValueError as exc:
            errors.append(str(exc))
        has_custom_location = (
            bool(cleaned_google_cloud_location)
            and cleaned_google_cloud_location.lower() != "global"
        )
        if has_custom_location:
            if cleaned_gemini_backend == "vertex":
                errors.append(
                    "Gemini Vertex API-key mode currently supports only the global endpoint."
                )
            else:
                errors.append("Set Gemini backend to 'vertex' to use a custom Google Cloud location.")

    try:
        _validate_choice(thinking_level, THINKING_LEVEL_CHOICES, "Thinking level")
    except ValueError as exc:
        errors.append(str(exc))

    for label, value in (
        ("Batch size", batch_size),
        ("Parallel requests", parallel_requests),
    ):
        try:
            _validate_optional_positive_int(value, label)
        except ValueError as exc:
            errors.append(str(exc))

    if cleaned_vocab and not path_exists_as_file_or_dir(cleaned_vocab):
        errors.append(f"Vocabulary file or directory does not exist: {cleaned_vocab}")

    if cleaned_rules and not os.path.isfile(cleaned_rules):
        errors.append(f"Rules file does not exist: {cleaned_rules}")

    # An API key must come from the GUI field or from the provider's
    # environment variable. The rule is the same for every backend.
    if provider_spec is not None:
        api_key_env = provider_spec.api_key_env
        if api_key_env and not cleaned_api_key and not env.get(api_key_env):
            errors.append(
                f"{api_key_env} is not set. Provide an API key in the GUI or the environment."
            )

    return errors


def validate_process_gui_config(
    config: ProcessGuiConfig,
    environ: dict[str, str] | None = None,
) -> list[str]:
    """Validate a translation run configuration and return the error messages.

    Runs the shared base checks (directories are allowed as inputs), then the
    source-file rules, and finally ``translate_task.validate_translation_files``
    unless a source-file error was already recorded.
    """
    input_files = resolve_process_input_files(config)
    primary_input = input_files[0] if input_files else config.input_file
    errors = _validate_base_config(
        input_file=primary_input,
        input_files=input_files,
        source_lang=config.source_lang,
        target_lang=config.target_lang,
        provider=config.provider,
        gemini_backend=config.gemini_backend,
        google_cloud_location=config.google_cloud_location,
        model=config.model,
        thinking_level=config.thinking_level,
        batch_size=config.batch_size,
        parallel_requests=config.parallel_requests,
        vocab_path=config.vocab_path,
        rules_path=config.rules_path,
        api_key=config.api_key,
        environ=environ,
        allow_input_directories=True,
    )

    source_path = _clean(config.source_file)
    source_needed = translation_requires_source_file(input_files)
    if source_path:
        # A given source file must exist whether or not it is required.
        if not os.path.isfile(source_path):
            errors.append(f"Source file does not exist: {source_path}")
    elif source_needed:
        errors.append("Source file is required for Android .xml translation runs.")

    # Skip the cross-file validation when a source-file problem is already reported.
    if any(message.startswith("Source file ") for message in errors):
        return errors

    try:
        translate_task.validate_translation_files(
            input_files,
            source_file=source_path or None,
        )
    except ValueError as exc:
        errors.append(str(exc))
    return errors


def validate_extract_gui_config(
    config: ExtractGuiConfig,
    environ: dict[str, str] | None = None,
) -> list[str]:
    """Validate a term extraction configuration and return the error messages.

    Runs the shared base checks, then checks the mode, the output format, and
    the optional positive-integer settings.
    """
    errors = _validate_base_config(
        input_file=config.input_file,
        source_lang=config.source_lang,
        target_lang=config.target_lang,
        provider=config.provider,
        gemini_backend=config.gemini_backend,
        google_cloud_location=config.google_cloud_location,
        model=config.model,
        thinking_level=config.thinking_level,
        batch_size=config.batch_size,
        parallel_requests=config.parallel_requests,
        vocab_path=config.vocab_path,
        api_key=config.api_key,
        environ=environ,
    )

    choice_checks = (
        (config.mode, EXTRACT_MODE_CHOICES, "Mode"),
        (config.out_format, EXTRACT_OUTPUT_CHOICES, "Output format"),
    )
    for raw_choice, allowed, label in choice_checks:
        try:
            _validate_choice(raw_choice, allowed, label)
        except ValueError as exc:
            errors.append(str(exc))

    number_checks = (
        ("Max terms per batch", config.max_terms_per_batch),
        ("Max attempts", config.max_attempts),
    )
    for label, raw_number in number_checks:
        try:
            _validate_optional_positive_int(raw_number, label)
        except ValueError as exc:
            errors.append(str(exc))

    return errors


def validate_local_extract_gui_config(config: LocalExtractGuiConfig) -> list[str]:
    """Validate a local extraction configuration and return the error messages.

    ``config.to_po`` selects the rule set. JSON-to-PO mode needs an existing
    ``.json`` file and checks the output suffix. Extraction mode accepts a
    file or directory and also checks languages, mode, max length,
    vocabulary, and output suffix.
    """
    problems: list[str] = []

    input_path = _clean(config.input_file)
    vocab = _clean(config.vocab_path)
    output_path = _clean(config.out_path)
    source = _clean(config.source_lang)
    target = _clean(config.target_lang)
    mode = _clean(config.mode)
    max_length_text = _clean(config.max_length)

    if not input_path:
        problems.append("Input file is required.")
    elif config.to_po:
        if not os.path.isfile(input_path):
            problems.append(f"Input file does not exist: {input_path}")
    elif not path_exists_as_file_or_dir(input_path):
        problems.append(f"Input file or directory does not exist: {input_path}")

    if config.to_po:
        # JSON-to-PO conversion: only the input and output shapes matter.
        if config.also_po:
            problems.append("JSON to PO mode cannot also request one-shot PO output.")
        if input_path and not input_path.lower().endswith(".json"):
            problems.append("JSON to PO mode requires a .json input file.")
        if output_path and not output_path.lower().endswith(".po"):
            problems.append("PO output path should end with .po in JSON to PO mode.")
        return problems

    if not source:
        problems.append("Source language is required.")
    if not target:
        problems.append("Target language is required.")
    try:
        _validate_choice(mode, EXTRACT_MODE_CHOICES, "Mode")
    except ValueError as exc:
        problems.append(str(exc))
    try:
        # Both a non-integer text and an out-of-range value raise ValueError
        # and yield the same message.
        validate_max_length(int(max_length_text))
    except ValueError:
        problems.append("Max length must be 1, 2, or 3.")
    if vocab and not path_exists_as_file_or_dir(vocab):
        problems.append(f"Vocabulary file or directory does not exist: {vocab}")
    if output_path and not output_path.lower().endswith(".json"):
        problems.append("Local extraction output path should end with .json.")

    return problems


def validate_check_gui_config(
    config: CheckGuiConfig,
    environ: dict[str, str] | None = None,
) -> list[str]:
    """Validate a translation check configuration and return the error messages.

    Runs the shared base checks, then checks the optional positive-integer
    settings.
    """
    errors = _validate_base_config(
        input_file=config.input_file,
        source_lang=config.source_lang,
        target_lang=config.target_lang,
        provider=config.provider,
        gemini_backend=config.gemini_backend,
        google_cloud_location=config.google_cloud_location,
        model=config.model,
        thinking_level=config.thinking_level,
        batch_size=config.batch_size,
        parallel_requests=config.parallel_requests,
        vocab_path=config.vocab_path,
        rules_path=config.rules_path,
        api_key=config.api_key,
        environ=environ,
    )

    numeric_fields = {
        "Probe / num messages": config.num_messages,
        "Max attempts": config.max_attempts,
    }
    for label, raw_number in numeric_fields.items():
        try:
            _validate_optional_positive_int(raw_number, label)
        except ValueError as exc:
            errors.append(str(exc))

    return errors


def _detect_revision_file_kind(input_file: str) -> FileKind | None:
    """Return the detected ``FileKind`` of *input_file*.

    Returns ``None`` when the path is blank or ``detect_file_kind`` rejects
    it with ``ValueError``.
    """
    path = _clean(input_file)
    if not path:
        return None

    try:
        kind = detect_file_kind(path)
    except ValueError:
        kind = None
    return kind


def translation_requires_source_file(input_files: list[str] | tuple[str, ...]) -> bool:
    """Tell whether a translation run needs a separate source file.

    True only for a selection of exactly one file that is detected as
    Android XML.
    """
    if len(input_files) == 1:
        only_file = input_files[0]
        return _detect_revision_file_kind(only_file) == FileKind.ANDROID_XML
    return False


def revision_requires_source_file(input_file: str) -> bool:
    """Tell whether a revision run on *input_file* needs a separate source file.

    True when the file is detected as Android XML, ``.strings``, RESX, or
    plain text.
    """
    detected = _detect_revision_file_kind(input_file)
    kinds_needing_source = (
        FileKind.ANDROID_XML,
        FileKind.STRINGS,
        FileKind.RESX,
        FileKind.TXT,
    )
    return detected in kinds_needing_source


def validate_revise_gui_config(
    config: ReviseGuiConfig,
    environ: dict[str, str] | None = None,
) -> list[str]:
    """Validate a revision run configuration and return the error messages.

    Runs the shared base checks, then checks the input file kind, the source
    file rules, the instruction, the optional positive-integer settings, and
    that an output path is not combined with in-place mode.
    """
    errors = _validate_base_config(
        input_file=config.input_file,
        source_lang=config.source_lang,
        target_lang=config.target_lang,
        provider=config.provider,
        gemini_backend=config.gemini_backend,
        google_cloud_location=config.google_cloud_location,
        model=config.model,
        thinking_level=config.thinking_level,
        batch_size=config.batch_size,
        parallel_requests=config.parallel_requests,
        vocab_path=config.vocab_path,
        rules_path=config.rules_path,
        api_key=config.api_key,
        environ=environ,
    )

    detected_kind = _detect_revision_file_kind(config.input_file)
    if detected_kind is None and _clean(config.input_file):
        errors.append("Input file must be a supported .po, .xlf/.xliff, .ts, .resx, .strings, .txt, or Android .xml file.")

    source_path = _clean(config.source_file)
    source_needed = revision_requires_source_file(config.input_file)
    if source_path:
        # A given source file must exist whether or not it is required.
        if not os.path.isfile(source_path):
            errors.append(f"Source file does not exist: {source_path}")
    elif source_needed:
        errors.append("Source file is required for Android .xml, .strings, .resx, and .txt revision runs.")

    if not _clean(config.instruction):
        errors.append("Instruction is required.")

    numeric_fields = {
        "Probe / num messages": config.num_messages,
        "Max attempts": config.max_attempts,
    }
    for label, raw_number in numeric_fields.items():
        try:
            _validate_optional_positive_int(raw_number, label)
        except ValueError as exc:
            errors.append(str(exc))

    if _clean(config.out_path) and config.in_place:
        errors.append("Output path and in-place mode cannot be used together.")

    return errors


def _append_common_cli_args(
    command: list[str],
    *,
    source_lang: str,
    target_lang: str,
    provider: str,
    gemini_backend: str,
    google_cloud_location: str,
    model: str,
    thinking_level: str,
    flex_mode: bool,
    batch_size: str,
    parallel_requests: str,
    vocab_path: str = "",
) -> None:
    """Append the CLI options shared by the command builders to ``command``.

    ``command`` is mutated in place; nothing is returned.

    Always appended: ``--source-lang``, ``--target-lang``, ``--provider``
    (falling back to ``DEFAULT_PROVIDER`` when the cleaned value is empty) and
    ``--model`` (as resolved by ``resolve_provider_model``).

    Appended only when applicable: ``--thinking-level``, ``--gemini-backend
    vertex`` (optionally followed by ``--google-cloud-location``), ``--flex``,
    ``--batch-size``, ``--parallel-requests`` and ``--vocab``.

    Any error raised by ``_validate_optional_positive_int`` for the batch size
    or parallel-request values propagates to the caller.
    """
    chosen_provider = _clean(provider) or DEFAULT_PROVIDER
    chosen_model = resolve_provider_model(chosen_provider, _clean(model) or None)
    source = _clean(source_lang)
    target = _clean(target_lang)
    command.extend(
        [
            "--source-lang",
            source,
            "--target-lang",
            target,
            "--provider",
            chosen_provider,
            "--model",
            chosen_model,
        ]
    )

    level = _clean(thinking_level)
    if level:
        command.extend(["--thinking-level", level])

    # An unknown provider name is tolerated here; it only disables --flex.
    try:
        spec = get_translation_provider(chosen_provider)
    except ValueError:
        spec = None

    if chosen_provider == "gemini":
        backend = _clean(gemini_backend).lower()
        location = _clean(google_cloud_location)
        # A non-"global" location selects the vertex backend even when the
        # backend field itself was not set to "vertex".
        non_global_location = bool(location) and location.lower() != "global"
        if backend == "vertex" or non_global_location:
            vertex_args = ["--gemini-backend", "vertex"]
            if location:
                vertex_args += ["--google-cloud-location", location]
            command.extend(vertex_args)

    if flex_mode and spec is not None:
        if getattr(spec, "supports_flex_mode", False):
            command.append("--flex")

    # Validate and emit one option at a time so that a failure on the second
    # value still leaves the first option appended.
    for flag, raw_value, label in (
        ("--batch-size", batch_size, "Batch size"),
        ("--parallel-requests", parallel_requests, "Parallel requests"),
    ):
        checked = _validate_optional_positive_int(raw_value, label)
        if checked:
            command.extend([flag, checked])

    vocab = _clean(vocab_path)
    if vocab:
        command.extend(["--vocab", vocab])


def build_process_command(
    config: ProcessGuiConfig,
    python_executable: str | None = None,
    script_path: str | None = None,
) -> list[str]:
    """Build the argv list that runs the CLI ``translate`` subcommand.

    Args:
        config: GUI settings for the translation run.
        python_executable: Interpreter to launch; ``sys.executable`` is used
            when this is empty or ``None``.
        script_path: Path to the CLI script; ``build_cli_script_path()`` is
            used when this is empty or ``None``.

    Returns:
        The command as a list of arguments: interpreter, ``-u``, the absolute
        script path, ``translate``, the resolved input files, then options.

    Raises:
        ValueError: If ``validate_process_gui_config`` reports any errors
            (joined with newlines), or if the resolved script is not a file.
    """
    problems = validate_process_gui_config(config)
    if problems:
        raise ValueError("\n".join(problems))

    cli_script = os.path.abspath(script_path or build_cli_script_path())
    if not os.path.isfile(cli_script):
        raise ValueError(f"translate_cli.py not found at: {cli_script}")

    targets = resolve_process_input_files(config)
    argv = [python_executable or sys.executable, "-u", cli_script, "translate"]
    argv.extend(targets)

    _append_common_cli_args(
        argv,
        source_lang=config.source_lang,
        target_lang=config.target_lang,
        provider=config.provider,
        gemini_backend=config.gemini_backend,
        google_cloud_location=config.google_cloud_location,
        model=config.model,
        thinking_level=config.thinking_level,
        flex_mode=config.flex_mode,
        batch_size=config.batch_size,
        parallel_requests=config.parallel_requests,
        vocab_path=config.vocab_path,
    )

    # Options that take a value are emitted only when the cleaned value is
    # non-empty.
    for flag, raw_value in (
        ("--rules", config.rules_path),
        ("--rules-str", config.rules_str),
        ("--source-file", config.source_file),
    ):
        value = _clean(raw_value)
        if value:
            argv.extend([flag, value])

    # Plain on/off switches.
    for flag, enabled in (
        ("--retranslate-all", config.retranslate_all),
        ("--warnings-report", config.warnings_report),
    ):
        if enabled:
            argv.append(flag)

    return argv


def build_extract_command(
    config: ExtractGuiConfig,
    python_executable: str | None = None,
    script_path: str | None = None,
) -> list[str]:
    """Build the argv list that runs the CLI ``extract-terms`` subcommand.

    Args:
        config: GUI settings for the term-extraction run.
        python_executable: Interpreter to launch; ``sys.executable`` is used
            when this is empty or ``None``.
        script_path: Path to the CLI script; ``build_cli_script_path()`` is
            used when this is empty or ``None``.

    Returns:
        The command as a list of arguments. ``--mode`` falls back to
        ``"missing"``, ``--out-format`` to ``"po"``, ``--max-terms-per-batch``
        to ``"80"`` and ``--max-attempts`` to ``"5"`` when the corresponding
        setting is empty; ``--out`` is included only when an output path is
        set.

    Raises:
        ValueError: If ``validate_extract_gui_config`` reports any errors
            (joined with newlines), or if the resolved script is not a file.
    """
    problems = validate_extract_gui_config(config)
    if problems:
        raise ValueError("\n".join(problems))

    cli_script = os.path.abspath(script_path or build_cli_script_path())
    if not os.path.isfile(cli_script):
        raise ValueError(f"translate_cli.py not found at: {cli_script}")

    interpreter = python_executable or sys.executable
    argv = [interpreter, "-u", cli_script, "extract-terms", _clean(config.input_file)]

    _append_common_cli_args(
        argv,
        source_lang=config.source_lang,
        target_lang=config.target_lang,
        provider=config.provider,
        gemini_backend=config.gemini_backend,
        google_cloud_location=config.google_cloud_location,
        model=config.model,
        thinking_level=config.thinking_level,
        flex_mode=config.flex_mode,
        batch_size=config.batch_size,
        parallel_requests=config.parallel_requests,
        vocab_path=config.vocab_path,
    )

    mode = _clean(config.mode) or "missing"
    out_format = _clean(config.out_format) or "po"
    argv.extend(["--mode", mode, "--out-format", out_format])

    destination = _clean(config.out_path)
    if destination:
        argv.extend(["--out", destination])

    # Terms-per-batch is validated before max-attempts, so the first invalid
    # value is the one reported.
    terms_per_batch = (
        _validate_optional_positive_int(
            config.max_terms_per_batch,
            "Max terms per batch",
        )
        or "80"
    )
    attempts = (
        _validate_optional_positive_int(config.max_attempts, "Max attempts")
        or "5"
    )
    argv.extend(
        [
            "--max-terms-per-batch",
            terms_per_batch,
            "--max-attempts",
            attempts,
        ]
    )
    return argv


def build_local_extract_command(
    config: LocalExtractGuiConfig,
    python_executable: str | None = None,
    script_path: str | None = None,
) -> list[str]:
    errors = validate_local_extract_gui_config(config)
    if errors:
        raise ValueError("\n".join(errors))

    resolved_script = os.path.abspath(script_path or build_cli_script_path())
    if not os.path.isfile(resolved_script):
        raise ValueError(f"translate_cli.py not found at: {resolved_script}")

    command = [
        python_executable or sys.executable,
        "-u",
        resolved_script,
        "extract-terms-local",
        _clean(config.input_file),
    ]

    if config.to_po:
        command.append("--to-po")
        if config.include_borderline:
            command.append("--include-borderline")
    else:
        command.extend(
            [
                "--source-lang",