Protify/src/protify/main.py at main · Gleghorn-Lab/Protify · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import argparse
import os
import sys

import yaml
from types import SimpleNamespace

# Ensure src is in sys.path
# The parent of this file's directory is put at the front of sys.path so that
# `import protify` resolves when this file is executed directly as a script.
_current_dir = os.path.dirname(os.path.abspath(__file__))
_src_dir = os.path.abspath(os.path.join(_current_dir, ".."))
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

# Docker/Mount fallback: If the 'src/protify' folder was mounted directly
# as the root (e.g. /workspace) and lost its 'protify' wrapper name.
# In that case the normal import fails, so the package is loaded from the
# __init__.py sitting next to this file and registered under the name "protify".
try:
    import protify
except ImportError:
    # Only attempt the fallback when this directory actually looks like a package.
    if os.path.exists(os.path.join(_current_dir, "__init__.py")):
        import importlib.util
        _spec = importlib.util.spec_from_file_location("protify", os.path.join(_current_dir, "__init__.py"))
        if _spec is not None:
            _protify_mod = importlib.util.module_from_spec(_spec)
            # Register before executing so imports of "protify" triggered while
            # __init__.py runs find this module object in sys.modules.
            sys.modules["protify"] = _protify_mod
            _spec.loader.exec_module(_protify_mod)
    # NOTE(review): if no __init__.py is found (or the spec is None) the ImportError
    # is swallowed here; the `from protify...` import below would then raise instead.

from protify.cloud_cli import _run_on_cloud, _should_auto_run_cloud


def parse_arguments():
    """Parse command-line options and, if ``--yaml_path`` is given, merge them into the YAML settings.

    Without ``--yaml_path`` the plain ``argparse.Namespace`` is returned.

    With ``--yaml_path`` the YAML mapping is loaded into a ``SimpleNamespace`` and
    only a fixed set of options is merged in from the command line (credentials,
    cloud settings, W&B sweep settings, sequence-translation flags, dtype /
    embedding / model-selection fallbacks). Every other command-line option is
    ignored in that mode; the YAML file is the source of truth.

    Side effects:
        * ``--hf_token`` sets ``os.environ["HF_TOKEN"]`` and logs in to the
          Hugging Face Hub.
        * ``--wandb_api_key`` attempts a Weights & Biases login (failures are
          printed, not raised).
        * ``--n_heads`` emits a ``DeprecationWarning``.

    Returns:
        ``argparse.Namespace`` (CLI only) or ``types.SimpleNamespace`` (YAML mode).

    Raises:
        AssertionError: on inconsistent ``--n_heads`` / ``--head_size`` /
            ``--hidden_size``, on inconsistent ``--model_names`` /
            ``--model_paths`` / ``--model_types``, or when a non-linear probe is
            requested without ``--matrix_embed``.
        SystemExit: raised by argparse on malformed command lines.
    """
    raw_argv = sys.argv[1:]

    def _given_on_cli(flag):
        """Return True if ``flag`` appears in argv as ``--flag VALUE`` or ``--flag=VALUE``."""
        prefix = flag + "="
        return any(token == flag or token.startswith(prefix) for token in raw_argv)

    parser = argparse.ArgumentParser(description="Script with arguments mirroring the provided YAML settings.")
    # ----------------- ID ----------------- #
    parser.add_argument("--hf_username", default="Synthyra", help="Hugging Face username.")
    parser.add_argument("--hf_token", default=None, help="Hugging Face token.")
    parser.add_argument("--synthyra_api_key", default=None, help="Synthyra API key.")
    parser.add_argument("--wandb_api_key", default=None, help="Wandb API key.")
    parser.add_argument("--cloud_api_key", default=None, help="Cloud backend API key. When provided, jobs are dispatched to the remote cloud backend.")
    parser.add_argument("--cloud_url", default=None, help="Cloud backend URL (default: https://api.synthyra.com).")
    parser.add_argument("--cloud_gpu_type", default=None, help="GPU type for cloud execution (e.g. A10, A100, H100).")
    parser.add_argument("--cloud_timeout_seconds", type=int, default=None, help="Timeout in seconds for cloud jobs (default: 86400).")
    parser.add_argument("--cloud_poll_interval", type=int, default=None, help="Poll interval in seconds for cloud job status (default: 5).")
    parser.add_argument("--cloud_artifacts_dir", default=None, help="Local directory to save cloud job artifacts (default: cloud_artifacts).")

    # ----------------- Paths ----------------- #
    parser.add_argument("--hf_home", type=str, default=None, help="Customize the HF cache directory.")
    parser.add_argument("--yaml_path", type=str, default=None, help="Path to the YAML file.")
    parser.add_argument("--log_dir", type=str, default="logs", help="Path to the log directory.")
    parser.add_argument("--results_dir", type=str, default="results", help="Path to the results directory.")
    parser.add_argument("--model_save_dir", default="weights", help="Directory to save models.")
    parser.add_argument("--embedding_save_dir", default="embeddings", help="Directory to save embeddings.")
    parser.add_argument("--download_dir", default="Synthyra/vector_embeddings", help="Directory to download embeddings to.")
    parser.add_argument("--plots_dir", default="plots", help="Directory to save plots.")
    parser.add_argument("--replay_path", type=str, default=None, help="Path to the replay file.")
    parser.add_argument("--pretrained_probe_path", type=str, default=None) # TODO not used right now

    # ----------------- DataArguments ----------------- #
    parser.add_argument("--delimiter", default=",", help="Delimiter for data.")
    parser.add_argument("--col_names", nargs="+", default=["seqs", "labels"], help="Column names.") # DEPRECATED, found automatically now
    parser.add_argument("--max_length", type=int, default=2048, help="Maximum sequence length.")
    parser.add_argument(
        "--padding",
        choices=["max_length", "longest"],
        default="max_length",
        help="Padding strategy. 'max_length' pads all sequences to --max_length (recommended for torch.compile + flex attention). 'longest' pads to the longest sequence in each batch.",
    )
    parser.add_argument("--trim", action="store_true",
                        help="Truncate sequences longer than --max_length instead of dropping them from the dataset.")
    parser.add_argument("--data_names", nargs="+", default=[], help="List of HF dataset names.") # TODO rename to data_names
    parser.add_argument("--data_dirs", nargs="+", default=[], help="List of local data directories.")
    parser.add_argument("--aa_to_dna", action="store_true", help="Translate amino-acid sequences to DNA codon sequences using common human synonymous codons.")
    parser.add_argument("--aa_to_rna", action="store_true", help="Translate amino-acid sequences to RNA codon sequences using common human synonymous codons.")
    parser.add_argument("--dna_to_aa", action="store_true", help="Translate DNA codon sequences to amino-acid sequences and drop stop codons.")
    parser.add_argument("--rna_to_aa", action="store_true", help="Translate RNA codon sequences to amino-acid sequences and drop stop codons.")
    parser.add_argument("--codon_to_aa", action="store_true", help="Translate codon-token sequences to amino-acid sequences and drop stop codons.")
    parser.add_argument("--aa_to_codon", action="store_true", help="Translate amino-acid sequences to codon-token sequences.")
    parser.add_argument("--random_pair_flipping", action="store_true", help="Randomly swap paired inputs during training.")

    # ----------------- BaseModelArguments ----------------- #
    parser.add_argument("--model_names", nargs="+", default=None, help="List of preset model names to use (e.g. ESM2-8). Mutually exclusive with --model_paths/--model_types.")
    parser.add_argument("--model_paths", nargs="+", default=None, help="List of model paths (HuggingFace or local). Must be paired with --model_types. Mutually exclusive with --model_names.")
    parser.add_argument("--model_types", nargs="+", default=None, help="List of model type keywords paired with --model_paths (e.g. esm2, esmc, protbert, prott5, ankh, glm, dplm, dplm2, protclm, onehot, amplify, e1, vec2vec, calm, custom, random).")
    parser.add_argument("--model_dtype", type=str, choices=["fp32", "fp16", "bf16", "float32", "float16", "bfloat16"], default="bf16", help="Data type for loading base models.")
    parser.add_argument("--use_xformers", action="store_true", help="Use xformers memory-efficient attention for AMPLIFY models.")

    # ----------------- ProbeArguments ----------------- #
    parser.add_argument("--probe_type", choices=["linear", "transformer", "lyra"], default="linear", help="Type of probe.")
    parser.add_argument("--tokenwise", action="store_true", help="Use a tokenwise probe (per-token outputs) instead of a sequence-level probe.")
    parser.add_argument("--hidden_size", type=int, default=8192, help="Hidden dimension size for probe.")
    parser.add_argument("--dropout", type=float, default=0.2, help="Dropout rate.")
    parser.add_argument("--n_layers", type=int, default=1, help="Number of layers.")
    parser.add_argument("--pre_ln", action="store_false",
                        help="Disable pre-layernorm in the transformer probe (pre-LN enabled by default).")
    parser.add_argument("--classifier_size", type=int, default=4096, help="Feed-forward dimension.")
    parser.add_argument("--transformer_dropout", type=float, default=0.1, help="Dropout rate for the transformer layers.")
    parser.add_argument("--classifier_dropout", type=float, default=0.2, help="Dropout rate for the classifier.")
    parser.add_argument("--head_size", type=int, default=128, help="Attention head dimension. n_heads is derived as hidden_size // head_size.")
    parser.add_argument("--n_heads", type=int, default=None, help="[DEPRECATED] Use --head_size. If provided, head_size is derived as hidden_size // n_heads.")
    parser.add_argument("--rotary", action="store_false",
                        help="Disable rotary embeddings in the transformer probe (rotary enabled by default).")
    parser.add_argument("--attention_backend", choices=["kernels", "flex", "sdpa"], default="flex", help="Attention backend for transformer-style probes.")
    parser.add_argument("--output_s_max", action="store_true", help="Return s_max bounds from transformer-style probe attention layers.")
    parser.add_argument("--probe_pooling_types", nargs="+", default=["mean", "var"], help="Pooling types to use.")
    parser.add_argument("--bom_k", type=int, default=60, help="K-mer window size for 'bom' pooling in the transformer probe. Default 60 is a cross-task compromise from Hoang & Singh 2025: peak for DPI (Section 4.3), mid-range of the {20,40,60,80,100} sweep on FLUO/BLAC, and close to the reported k=100 optimum for remote homology. Only used when 'bom' is in --probe_pooling_types.")
    parser.add_argument("--use_bias", action="store_true", help="Use bias terms in Linear layers.")
    parser.add_argument("--expansion_ratio", type=float, default=8/3, help="FFN expansion ratio for transformer probes.")
    parser.add_argument("--save_model", action="store_true", help="Save the trained model/probe to disk.")
    parser.add_argument("--push_raw_probe", action="store_true", help="With --save_model, push the raw probe class to the Hub instead of the packaged AutoModel.")
    parser.add_argument("--push_raw_probe_repo", type=str, default=None, help="Custom HF repo id for --push_raw_probe. If omitted, auto-generated.")
    parser.add_argument("--production_model", action="store_true", help="Train a production-grade scikit model (used with --use_scikit).")
    parser.add_argument("--lora", action="store_true", help="Wrap the base model in LoRA adapters during full-finetuning / hybrid training.")
    parser.add_argument("--lora_r", type=int, default=8, help="Number of trainable parameters in the LoRA model.")
    parser.add_argument("--lora_alpha", type=float, default=32.0, help="Alpha for the LoRA model.")
    parser.add_argument("--lora_dropout", type=float, default=0.01, help="Dropout rate for the LoRA model.")
    parser.add_argument("--sim_type", choices=["dot", "euclidean", "cosine"], default="dot", help="Cross-attention mechanism for token-parameter-attention")
    parser.add_argument("--add_token_ids", action="store_true", help="Add learned token-type embeddings to distinguish protein A vs B in PPI tasks.")

    # ----------------- ScikitArguments ----------------- #
    parser.add_argument("--scikit_n_iter", type=int, default=10, help="Number of iterations for scikit model.")
    parser.add_argument("--scikit_cv", type=int, default=3, help="Number of cross-validation folds for scikit model.")
    parser.add_argument("--scikit_random_state", type=int, default=None, help="Random state for scikit model (if None, uses global seed).")
    parser.add_argument("--scikit_model_name", type=str, default=None, help="Name of the scikit model to use.")
    parser.add_argument("--scikit_model_args", type=str, default=None, help="JSON string of hyperparameters to use (skips tuning). E.g. '{\"n_estimators\": 500, \"max_depth\": 7}'")
    parser.add_argument("--use_scikit", action="store_true", help="Use a scikit-learn model instead of a neural probe.")
    parser.add_argument("--n_jobs", type=int, default=1, help="Number of processes to use in scikit.") # TODO integrate with GUI and main

    # ----------------- EmbeddingArguments ----------------- #
    parser.add_argument("--embedding_batch_size", type=int, default=16, help="Batch size for embedding generation.")
    parser.add_argument("--embedding_num_workers", type=int, default=0, help="Number of worker processes for embedding generation.")
    parser.add_argument("--num_workers", type=int, default=0, help="Number of worker processes for data loading.")
    parser.add_argument("--download_embeddings", action="store_true", help="Download pre-computed embeddings from the Hub instead of computing them locally.")
    parser.add_argument("--matrix_embed", action="store_true", help="Store per-token (matrix) embeddings instead of pooled vector embeddings.")
    parser.add_argument("--embedding_pooling_types", nargs="+", default=["mean", "var"], help="Pooling types for embeddings.")
    parser.add_argument("--embedding_hidden_state_index", type=int, default=-1, help="Hidden-state tuple index to embed from. -1 uses the final hidden state.")
    parser.add_argument("--save_embeddings", action="store_true", help="Save computed embeddings to disk.")
    parser.add_argument("--embed_dtype", type=str, choices=["fp32", "fp16", "bf16", "float32", "float16", "bfloat16"], default=None, help="Data type for embeddings. If omitted, uses --model_dtype.")
    parser.add_argument("--no_embedding_scaler", dest="embedding_scaler", action="store_false", default=True,
                        help="Disable StandardScaler for pooled vector embeddings during probe/scikit training.")
    parser.add_argument("--sql", action="store_true", help="Store embeddings in a SQLite backend (streamed at train time) instead of in-RAM .pth.")
    parser.add_argument("--read_scaler", type=int, default=100, help="Read scaler for SQL storage.")

    # ----------------- Multi-Column Sequences ----------------- #
    parser.add_argument("--multi_column", nargs="+", default=None, help="If set, list of sequence column names to combine per sample.")

    # ----------------- TrainerArguments ----------------- #
    parser.add_argument("--num_epochs", type=int, default=200, help="Number of epochs to train for.")
    parser.add_argument("--probe_batch_size", type=int, default=64, help="Batch size for probe training.")
    parser.add_argument("--base_batch_size", type=int, default=4, help="Batch size for base model training.")
    parser.add_argument("--probe_grad_accum", type=int, default=1, help='Gradient accumulation steps for probe training.')
    parser.add_argument("--base_grad_accum", type=int, default=8, help='Gradient accumulation steps for base model training.')
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate.")
    ### TODO integrate
    #parser.add_argument("--probe_lr", type=float, default=1e-4, help="Learning rate for probe training.")
    #parser.add_argument("--base_lr", type=float, default=1e-5, help="Learning rate for base model training.")
    #parser.add_argument("--lr_scheduler", type=str, default='cosine', help='Learning rate scheduler.')
    #parser.add_argument("--optimizer", type=str, default='adamw', help='Optimizer.')
    parser.add_argument("--weight_decay", type=float, default=0.00, help="Weight decay.")
    parser.add_argument("--patience", type=int, default=1, help="Patience for early stopping (probe phase, and base phase unless --base_patience is set).")
    parser.add_argument("--base_num_epochs", type=int, default=None,
                        help="Epoch count for the base-model phase of hybrid / full-finetuning training. If omitted, falls back to --num_epochs.")
    parser.add_argument("--base_patience", type=int, default=None,
                        help="Early-stopping patience for the base-model phase of hybrid / full-finetuning training. If omitted, falls back to --patience.")
    parser.add_argument("--base_lr", type=float, default=None,
                        help="Learning rate for the base-model phase of hybrid / full-finetuning training (useful when LoRA/full-FT wants a different LR than the probe). If omitted, falls back to --lr.")
    parser.add_argument("--seed", type=int, default=None, help="Seed for reproducibility (if omitted, current time is used).")
    parser.add_argument("--deterministic", action="store_true",
                        help="Enable deterministic behavior for reproducibility (slightly slower training).")
    parser.add_argument("--full_finetuning", action="store_true", help="Fully fine-tune the base model end-to-end.")
    parser.add_argument("--hybrid_probe", action="store_true", help="Train probe first, then fine-tune base model + probe jointly.")
    parser.add_argument("--num_runs", type=int, default=1, help="Number of training runs with different seeds. Results will show mean±std across runs.")
    parser.add_argument("--no_compile", action="store_true", help="Disable torch.compile on probes during training (compiled by default).")

    # ----------------- Balanced Regression Metrics (EpHod-style) ----------------- #
    parser.add_argument("--balanced_regression_metrics", action="store_true",
                        help="Enable EpHod-style balanced regression metrics on valid/test (enabled by default).")
    parser.add_argument("--no_balanced_regression_metrics", dest="balanced_regression_metrics", action="store_false",
                        help="Disable EpHod-style balanced regression metrics.")
    parser.set_defaults(balanced_regression_metrics=True)
    parser.add_argument("--balanced_weight_method", type=str, default='bin_inv',
                        choices=['none', 'bin_inv', 'bin_inv_sqrt', 'LDS_inv', 'LDS_inv_sqrt', 'LDS_extreme'],
                        help="Weighting scheme for balanced regression metrics.")
    parser.add_argument("--balanced_bin_borders", type=float, nargs='+', default=None,
                        help="Explicit bin borders for balanced metrics (e.g., 5 9 for pH). Default: 1/3 and 2/3 quantiles of training labels.")
    parser.add_argument("--balanced_n_resamples", type=int, default=100,
                        help="Number of resamples for balanced Pearson/Spearman (default: 100).")
    parser.add_argument("--balanced_lds_bins", type=int, default=100,
                        help="Number of bins for LDS density estimation.")
    parser.add_argument("--balanced_lds_ks", type=int, default=5,
                        help="Kernel size for LDS Gaussian smoothing.")
    parser.add_argument("--balanced_lds_sigma", type=float, default=2.0,
                        help="Sigma for LDS Gaussian smoothing.")

    # ----------------- ProteinGym Arguments ----------------- #
    parser.add_argument("--dms_ids", nargs="+", default=["all"],
                        help="ProteinGym DMS assay IDs to evaluate (space-separated), or 'all' to run all assays.")
    parser.add_argument("--proteingym", action="store_true", help="Run a ProteinGym zero-shot experiment.")
    parser.add_argument("--mode", type=str, default='benchmark',
                        help="ProteinGym zero-shot mode: 'benchmark', 'indels', 'multiples', 'singles'")
    parser.add_argument("--scoring_method", choices=["masked_marginal", "mutant_marginal", "wildtype_marginal", "pll", "global_log_prob"], default="masked_marginal",
                        help="Select a scoring method for ProteinGym zero-shot.")
    parser.add_argument("--scoring_window", choices=["optimal", "sliding"], default="optimal",
                        help="Select how to slice the sequence for ProteinGym zero-shot.")
    parser.add_argument("--pg_batch_size", type=int, default=32,
                        help="Batch size for ProteinGym zero-shot scoring (default: 32).")
    parser.add_argument("--compare_scoring_methods", action="store_true",
                        help="Compare different scoring methods across models and DMS assays.")
    parser.add_argument("--score_only", action="store_true",
                        help="Run only the ProteinGym benchmarking script on existing CSV files; skip zero-shot scoring.")

    # ----------------- W&B Arguments ----------------- #
    parser.add_argument("--use_wandb_hyperopt", action="store_true", help="Run a Weights & Biases hyperparameter sweep instead of a single training run.")
    parser.add_argument("--wandb_project", type=str, default="Protify", help="W&B project name for sweeps.")
    parser.add_argument("--wandb_entity", type=str, default=None, help="W&B entity (team/user) for sweeps.")
    parser.add_argument("--sweep_config_path", type=str, default="yamls/sweep.yaml", help="Path to W&B sweep config YAML.")
    parser.add_argument("--sweep_count", type=int, default=10, help="Number of hyperparameter trials to run in the sweep.")
    parser.add_argument("--sweep_method", type=str, default="bayes", choices=["bayes", "grid", "random"], help="Sweep method for hyperparameter optimization.")
    parser.add_argument("--sweep_metric_cls", type=str, default="eval_loss", help="Classification metric to optimize during sweep (e.g., eval_f1, eval_accuracy, eval_mcc)")
    parser.add_argument("--sweep_metric_reg", type=str, default="eval_loss", help="Regression metric to optimize during sweep (e.g., eval_r_squared, eval_spearman_rho, eval_pearson_rho)")
    parser.add_argument("--sweep_goal", type=str, default='minimize', choices=['maximize', 'minimize'], help="Goal for the sweep metric (maximize/minimize)")
    parser.add_argument("--sweep_name", type=str, default=None, help="Display name for the W&B sweep. Overrides 'name' in the sweep YAML.")
    args = parser.parse_args()

    # ----------------- Deprecated --n_heads -> --head_size ----------------- #
    if args.n_heads is not None:
        import warnings
        warnings.warn(
            "--n_heads is deprecated and will be removed in a future release. "
            "Use --head_size instead (n_heads = hidden_size // head_size).",
            DeprecationWarning,
            stacklevel=2,
        )
        assert args.hidden_size % args.n_heads == 0, (
            f"--hidden_size {args.hidden_size} not divisible by deprecated --n_heads {args.n_heads}"
        )
        derived_head_size = args.hidden_size // args.n_heads
        # Detect both "--head_size N" and "--head_size=N"; previously the "=" form
        # slipped past this check and the user's value was silently overwritten.
        if _given_on_cli("--head_size"):
            assert derived_head_size == args.head_size, (
                f"--head_size {args.head_size} conflicts with deprecated --n_heads {args.n_heads} "
                f"(derived head_size={derived_head_size})"
            )
        args.head_size = derived_head_size
        args.n_heads = None

    # Validate model_names vs model_paths/model_types mutual exclusivity
    if args.model_paths is not None:
        assert args.model_types is not None, "--model_types is required when --model_paths is provided."
        assert len(args.model_paths) == len(args.model_types), f"--model_paths ({len(args.model_paths)}) and --model_types ({len(args.model_types)}) must have the same length."
        assert args.model_names is None, "--model_names cannot be used together with --model_paths/--model_types."
    elif args.model_types is not None:
        assert args.model_paths is not None, "--model_paths is required when --model_types is provided."
    if args.model_names is None and args.model_paths is None:
        args.model_names = ["ESM2-8"]

    assert args.probe_type == "linear" or args.matrix_embed, "When probe_type is not linear, --matrix_embed must be True."

    # ----------------- Credentials ----------------- #
    if args.hf_token is not None:
        from huggingface_hub import login
        # Override environment variable to ensure this token is used
        os.environ["HF_TOKEN"] = args.hf_token
        login(args.hf_token)
        print("Logged in to HuggingFace Hub with token from arguments")
    elif os.environ.get("HF_TOKEN"):
        # No CLI token, but one is already present in the environment
        print("Note: HF_TOKEN found in environment")
        print("Note: This token will be used for read operations only unless overridden")
    if args.wandb_api_key is not None:
        # Best effort: a failed W&B login is reported but does not abort the run.
        try:
            import wandb
            wandb.login(key=args.wandb_api_key)
            print('Logged into Weights & Biases')
        except Exception as e:
            print(f'W&B login failed: {e}')
    if args.synthyra_api_key is not None:
        print('Synthyra API key provided')

    if args.yaml_path is None:
        return args

    # ----------------- YAML mode: YAML is the base, selected CLI options are merged in ----------------- #
    with open(args.yaml_path, 'r') as file:
        settings = yaml.safe_load(file)
    yaml_args = SimpleNamespace(**settings)
    yaml_settings = vars(yaml_args)  # live view of the namespace's attributes

    def _merge_value(key):
        """CLI wins when it differs from the argparse default; otherwise only fill a missing YAML key."""
        cli_value = getattr(args, key)
        if cli_value != parser.get_default(key) or key not in yaml_settings:
            setattr(yaml_args, key, cli_value)

    def _merge_store_true(key):
        """A store_true flag is on if set on the CLI or truthy in the YAML; off otherwise."""
        setattr(yaml_args, key, bool(getattr(args, key) or yaml_settings.get(key, False)))

    # Credentials / cloud options: argparse default is None for all of these.
    for key in (
        "hf_token",
        "hf_home",
        "synthyra_api_key",
        "wandb_api_key",
        "cloud_api_key",
        "cloud_url",
        "cloud_gpu_type",
        "cloud_timeout_seconds",
        "cloud_poll_interval",
        "cloud_artifacts_dir",
    ):
        _merge_value(key)

    _merge_store_true("use_wandb_hyperopt")

    # W&B sweep options. Defaults are read from the parser so they cannot drift
    # from the add_argument() declarations above.
    for key in (
        "wandb_project",
        "wandb_entity",
        "sweep_config_path",
        "sweep_count",
        "sweep_method",
        "sweep_metric_cls",
        "sweep_metric_reg",
        "sweep_goal",
    ):
        _merge_value(key)

    yaml_args.yaml_path = args.yaml_path

    for key in (
        "aa_to_dna",
        "aa_to_rna",
        "dna_to_aa",
        "rna_to_aa",
        "codon_to_aa",
        "aa_to_codon",
        "random_pair_flipping",
        "push_raw_probe",
    ):
        _merge_store_true(key)

    # Ensure ProteinGym and num_runs defaults exist when using YAML configs.
    # Note that the YAML-mode fallback for "mode" is None, not the CLI default.
    for key, fallback in (
        ("proteingym", False),
        ("dms_ids", ["all"]),
        ("mode", None),
        ("scoring_method", "masked_marginal"),
        ("num_runs", 1),
    ):
        yaml_settings.setdefault(key, fallback)

    # A missing or explicit-null model_dtype in YAML falls back to the CLI value.
    if yaml_settings.get("model_dtype") is None:
        yaml_args.model_dtype = args.model_dtype
    yaml_settings.setdefault("embed_dtype", args.embed_dtype)

    if _given_on_cli("--embedding_hidden_state_index") or "embedding_hidden_state_index" not in yaml_settings:
        yaml_args.embedding_hidden_state_index = args.embedding_hidden_state_index

    if "--no_embedding_scaler" in raw_argv:
        yaml_args.embedding_scaler = False
    elif "embedding_scaler" not in yaml_settings:
        yaml_args.embedding_scaler = args.embedding_scaler

    # NOTE(review): when the YAML supplies model_paths/model_types but no model_names,
    # model_names is filled with the CLI-side fallback (["ESM2-8"] if nothing was passed).
    # Confirm downstream code tolerates both being set.
    for key in ("model_paths", "model_types", "model_names"):
        yaml_settings.setdefault(key, getattr(args, key))

    return yaml_args


if __name__ == "__main__":
    # Script-only setup that has to run before the heavy imports further down
    # (environment variables, determinism). `args` stays a module-level global.
    # Settings that need to happen pre-imports
    args = parse_arguments()

    # Require that either datasets are specified or a ProteinGym experiment is chosen
    # (the check is skipped when a YAML config is supplied).
    has_datasets = bool(args.data_names or args.data_dirs)
    has_proteingym = bool(args.proteingym)
    if not has_datasets and not has_proteingym and args.yaml_path is None:
        raise AssertionError("No datasets specified. Provide --data_names or --data_dirs, or run a ProteinGym experiment.")

    if args.use_xformers:
        # Communicated through the environment rather than passed as an argument.
        os.environ["_USE_XFORMERS"] = "1"
        print("xformers memory efficient attention enabled for AMPLIFY models")

    if args.hf_home is not None:
        # Needs to happen before any HF imports
        import pathlib
        base_path = args.hf_home
        cache_root = f"{base_path}/hf_cache"
        tmp_root   = f"{base_path}/tmp"
        pathlib.Path(cache_root).mkdir(parents=True, exist_ok=True)
        # NOTE(review): tmp_root is created, but no environment variable is pointed at it in this block.
        pathlib.Path(tmp_root).mkdir(parents=True, exist_ok=True)

        # Redirect all Hugging Face caches underneath <hf_home>/hf_cache.
        os.environ["HF_HOME"]            = cache_root
        os.environ["HF_DATASETS_CACHE"]  = f"{cache_root}/datasets"
        os.environ["TRANSFORMERS_CACHE"] = f"{cache_root}/transformers" # this is deprecated, but does not hurt anything
        os.environ["HF_HUB_CACHE"]       = f"{cache_root}/hub"
        print(f"HF_HOME: {os.environ['HF_HOME']}")
        print(f"HF_DATASETS_CACHE: {os.environ['HF_DATASETS_CACHE']}")
        print(f"TRANSFORMERS_CACHE: {os.environ['TRANSFORMERS_CACHE']}")
        print(f"HF_HUB_CACHE: {os.environ['HF_HUB_CACHE']}")

    # Enable deterministic behavior (only when requested) before the entrypoint setup import.
    # NOTE(review): the global seed is not set in this block; only set_determinism() is called here.
    if args.deterministic:
        from protify.seed_utils import set_determinism
        set_determinism()

    import protify.entrypoint_setup # needs to happen after set_determinism()


import torch
from torchinfo import summary

from protify.base_models.get_base_models import BaseModelArguments, get_base_model_for_training, get_tokenizer
from protify.base_models.utils import wrap_lora
from protify.benchmarks.proteingym.compare_scoring_methods import compare_scoring_methods
from protify.benchmarks.proteingym.scorer import ProteinGymRunner
from protify.data.data_mixin import DataArguments, DataMixin
from protify.embedder import Embedder, EmbeddingArguments, get_embedding_filename
from protify.hyperopt_utils import HyperoptModule
from protify.logger import MetricsLogger, log_method_calls
from protify.probes.get_probe import ProbeArguments, get_probe
from protify.probes.scikit_classes import ScikitArguments, ScikitProbe
from protify.probes.trainers import TrainerArguments, TrainerMixin
from protify.seed_utils import set_global_seed
from protify.utils import expand_dms_ids_all, print_message, torch_load
from protify.visualization.plot_result import create_plots


class MainProcess(MetricsLogger, DataMixin, TrainerMixin):
    def __init__(self, full_args, GUI=False):
        """Initialise the base classes and build the dtype lookup table.

        Args:
            full_args: Settings object (its ``__dict__`` is read elsewhere in
                this class). Forwarded to the first base initialiser and
                stored on ``self.full_args``.
            GUI: When falsy (the default), ``start_log_main()`` is invoked
                right after the base initialisers have run.
        """
        # The three explicit two-argument super() calls are kept in their
        # original order; each resumes the MRO lookup *after* the named class.
        super(MainProcess, self).__init__(full_args)
        super(DataMixin, self).__init__()
        super(TrainerMixin, self).__init__()
        self.full_args = full_args
        if not GUI:
            self.start_log_main()

        # Short ("fp16") and long ("float16") spellings map to the same torch
        # dtype. No "int8" entry is offered.
        single, half, brain = torch.float32, torch.float16, torch.bfloat16
        self.dtype_map = dict(
            fp32=single,
            fp16=half,
            bf16=brain,
            float32=single,
            float16=half,
            bfloat16=brain,
            float8_e4m3fn=torch.float8_e4m3fn,
            float8_e5m2=torch.float8_e5m2,
        )

    def _build_scikit_args(self):
        """Assemble a ``ScikitArguments`` object from ``self.full_args``.

        Only names present in the settings object's instance ``__dict__`` are
        honoured; any missing name falls back to the default listed below.

        Returns:
            ScikitArguments built with ``n_iter``, ``cv``, ``random_state``,
            ``model_name`` and ``production_model``.
        """
        provided = vars(self.full_args)
        # (ScikitArguments keyword, name on full_args, fallback value)
        spec = (
            ("n_iter", "scikit_n_iter", 10),
            ("cv", "scikit_cv", 3),
            ("random_state", "scikit_random_state", None),
            ("model_name", "scikit_model_name", None),
            ("production_model", "production_model", False),
        )
        resolved = {
            keyword: provided.get(source_name, fallback)
            for keyword, source_name, fallback in spec
        }
        return ScikitArguments(**resolved)

    @log_method_calls
    def apply_current_settings(self):
        """Normalise dtype/compile settings and rebuild the argument bundles.

        Steps, in order:
          1. Fill in ``model_dtype`` ("bf16") and ``embed_dtype`` (None) when
             they are absent from ``full_args``.
          2. Translate string dtypes through ``self.dtype_map``; an
             ``embed_dtype`` of None inherits the model dtype, any other
             non-string value is left untouched.
          3. Derive ``torch_compile`` from ``no_compile`` when it is absent.
          4. Rebuild the data/embedding/model/probe/trainer/logger/scikit
             argument objects from the full settings dict.
          5. Mirror a handful of settings onto private ``self._*`` attributes.

        Raises:
            KeyError: if a dtype string is not a key of ``self.dtype_map``.
        """
        args = self.full_args
        settings = vars(args)  # live view of args.__dict__

        if "model_dtype" not in settings:
            args.model_dtype = "bf16"
        if "embed_dtype" not in settings:
            args.embed_dtype = None

        if isinstance(args.model_dtype, str):
            args.model_dtype = self.dtype_map[args.model_dtype]

        requested_embed_dtype = args.embed_dtype
        if requested_embed_dtype is None:
            args.embed_dtype = args.model_dtype
        elif isinstance(requested_embed_dtype, str):
            args.embed_dtype = self.dtype_map[requested_embed_dtype]
        # Anything else is kept exactly as supplied.

        if "torch_compile" not in settings:
            args.torch_compile = not getattr(args, "no_compile", False)

        # Every bundle receives the complete settings dict as keyword arguments.
        self.data_args = DataArguments(**settings)
        self.embedding_args = EmbeddingArguments(**settings)
        self.model_args = BaseModelArguments(**settings)
        self.probe_args = ProbeArguments(**settings)
        self.trainer_args = TrainerArguments(**settings)
        self.logger_args = SimpleNamespace(**settings)
        self.scikit_args = self._build_scikit_args()

        # (private attribute on self, source attribute on full_args)
        mirrored = (
            ("_sql", "sql"),
            ("_full", "matrix_embed"),
            ("_max_length", "max_length"),
            ("_trim", "trim"),
            ("_delimiter", "delimiter"),
            ("_col_names", "col_names"),
            ("_aa_to_dna", "aa_to_dna"),
            ("_aa_to_rna", "aa_to_rna"),
            ("_dna_to_aa", "dna_to_aa"),
            ("_rna_to_aa", "rna_to_aa"),
            ("_codon_to_aa", "codon_to_aa"),
            ("_aa_to_codon", "aa_to_codon"),
        )
        for private_name, source_name in mirrored:
            setattr(self, private_name, getattr(args, source_name))
        # multi_column is optional and defaults to None when missing.
        self._multi_column = getattr(args, 'multi_column', None)

    @log_method_calls
    def get_datasets(self):
        """Call ``get_data()`` and store its two results on the instance.

        Sets ``self.datasets`` and ``self.all_seqs``.
        """
        loaded = self.get_data()
        self.datasets, self.all_seqs = loaded

    @log_method_calls
    def save_embeddings_to_disk(self):
        """Invoke the embedder once per configured model entry.

        ``save_embeddings`` is forced to True on ``self.embedding_args``
        before the ``Embedder`` is constructed over ``self.all_seqs``.
        """
        self.embedding_args.save_embeddings = True
        embed = Embedder(self.embedding_args, self.all_seqs)
        for shown_name, model_kind, weights_path in self.model_args.model_entries():
            # The embedder's return value is not used here.
            embed(shown_name, model_type=model_kind, model_path=weights_path)

    def _create_model_factory(self, model_name, tokenwise, num_labels, hybrid, model_path=None):
        """Return a zero-argument callable that builds a fresh model (multi-run mode).

        The dtype and the LoRA settings are read from ``self.model_args`` /
        ``self.probe_args`` each time the callable is invoked, not when it is
        created.
        """
        def factory():
            fresh_model, _unused_tokenizer = get_base_model_for_training(
                model_name,
                tokenwise=tokenwise,
                num_labels=num_labels,
                hybrid=hybrid,
                dtype=self.model_args.model_dtype,
                model_path=model_path,
            )
            if not self.probe_args.lora:
                return fresh_model
            lora_cfg = self.probe_args
            return wrap_lora(fresh_model, lora_cfg.lora_r, lora_cfg.lora_alpha, lora_cfg.lora_dropout)

        return factory

    def _create_probe_factory(self):
        """Return a zero-argument callable that builds a fresh probe (multi-run mode).

        ``self.probe_args`` is looked up at call time, so later changes to it
        are picked up by subsequent invocations.
        """
        return lambda: get_probe(self.probe_args)

    def _run_nn_probe(
            self,
            model_name,
            data_name,
            train_set,
            valid_set,
            test_set,
            tokenizer,
            emb_dict=None,
            ppi=False,
            source_model_name=None,
            sweep_mode: bool = False,
        ):
        """Build a probe from ``self.probe_args``, train it and optionally log metrics.

        Args:
            model_name: Name used for training and for metric logging.
            data_name: Dataset name used for training and for metric logging.
            train_set, valid_set, test_set: Dataset splits forwarded to the trainer.
            tokenizer: Tokenizer forwarded to the trainer.
            emb_dict: Optional embeddings forwarded to the trainer.
            ppi: Forwarded to the trainer unchanged.
            source_model_name: Defaults to ``model_name`` when None.
            sweep_mode: When true, validation/test metrics are not logged.

        Returns:
            ``(probe, valid_metrics, test_metrics)`` as returned by the trainer.
        """
        source_model_name = model_name if source_model_name is None else source_model_name

        # One probe is created here; trainer_probe is expected to take care of
        # repeated runs itself when more than one run is configured.
        probe = get_probe(self.probe_args)
        summary(probe)

        trainer_kwargs = dict(
            model=probe,
            tokenizer=tokenizer,
            model_name=model_name,
            data_name=data_name,
            train_dataset=train_set,
            valid_dataset=valid_set,
            test_dataset=test_set,
            emb_dict=emb_dict,
            ppi=ppi,
            log_id=self.random_id,
            source_model_name=source_model_name,
        )
        probe, valid_metrics, test_metrics, _, _ = self.trainer_probe(**trainer_kwargs)

        if sweep_mode:
            return probe, valid_metrics, test_metrics

        for split_name, split_metrics in (('valid', valid_metrics), ('test', test_metrics)):
            self.log_metrics(data_name, model_name, split_metrics, split_name=split_name)
        return probe, valid_metrics, test_metrics

    def _train_nn_probe_fold(self, model_name, dms_id, subtrain_seqs, subtrain_labels,
                            valid_seqs, valid_labels, test_seqs, test_labels,
                            emb_dict, fold_info):
        """Trains a neural network probe on a ProteinGym DMS assay CV fold.

        Args:
            model_name: Model whose tokenizer and saved embeddings are used.
            dms_id: Assay identifier; combined with ``fold_info`` for the data name.
            subtrain_seqs, subtrain_labels: Training sequences and labels.
            valid_seqs, valid_labels: Validation data; if either is None no
                validation set is passed to the trainer.
            test_seqs, test_labels: Test sequences and labels.
            emb_dict: In-memory embeddings, used in non-SQL mode only when no
                saved ``.pth`` file exists at the expected path.
            fold_info: Fold tag appended to the data name and the log id.

        Returns:
            ``(rho, mse)`` taken from the test metrics; each is None when the
            corresponding key is missing.

        Side effects:
            Overwrites ``input_size``, ``task_type`` and ``num_labels`` on
            ``self.probe_args`` and ``task_type`` on ``self.trainer_args``.
        """

        train_set = {'seqs': subtrain_seqs, 'labels': subtrain_labels}
        valid_set = None if (valid_seqs is None or valid_labels is None) else {'seqs': valid_seqs, 'labels': valid_labels}
        test_set = {'seqs': test_seqs, 'labels': test_labels}

        # Get tokenizer and determine input dimensions
        tokenizer = get_tokenizer(model_name)
        pooling_types = self.embedding_args.pooling_types
        hidden_state_index = self.embedding_args.hidden_state_index

        if self._sql:
            # SQL mode: only the embedding dimension is read here; no embedding
            # dict is handed to the trainer.
            filename = get_embedding_filename(model_name, self._full, pooling_types, 'db', hidden_state_index)
            save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
            input_dim = self.get_embedding_dim_sql(save_path, subtrain_seqs[0], tokenizer)
            emb_for_training = None
        else:
            # .pth mode: prefer the file on disk, otherwise fall back to emb_dict.
            filename = get_embedding_filename(model_name, self._full, pooling_types, 'pth', hidden_state_index)
            save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
            emb_for_training = torch_load(save_path) if os.path.exists(save_path) else emb_dict
            input_dim = self.get_embedding_dim_pth(emb_for_training, subtrain_seqs[0], tokenizer)

        # Configure probe for regression
        self.probe_args.input_size = input_dim
        self.probe_args.task_type = 'regression'
        self.probe_args.num_labels = 1
        self.trainer_args.task_type = 'regression'

        probe = get_probe(self.probe_args)
        # Other call sites in this class unpack five values from trainer_probe.
        # The starred target accepts that as well as a plain three-value return,
        # so only the leading (model, valid_metrics, test_metrics) are relied on.
        _, _, test_metrics, *_extra = self.trainer_probe(
            model=probe,
            tokenizer=tokenizer,
            model_name=model_name,
            data_name=f"{dms_id}_{fold_info}",
            train_dataset=train_set,
            valid_dataset=valid_set,
            test_dataset=test_set,
            emb_dict=emb_for_training,
            ppi=False,
            log_id=f"{self.random_id}_{fold_info}",
            source_model_name=model_name,
        )

        # Handle both plain and test-prefixed metric keys returned by HF Trainer
        rho = test_metrics.get('spearman_rho', test_metrics.get('test_spearman_rho', None))
        mse = test_metrics.get('mse', test_metrics.get('test_mse', None))
        return rho, mse

    def _run_full_finetuning(
            self,
            model_name,
            data_name,
            train_set,
            valid_set,
            test_set,
            ppi=False,
            source_model_name=None,
            sweep_mode: bool = False,
            model_path: str = None,
        ):
        """Load a base model for training, fine-tune it and optionally log metrics.

        Args:
            model_name: Passed to ``get_base_model_for_training`` and used for logging.
            data_name: Dataset name used for training and for metric logging.
            train_set, valid_set, test_set: Dataset splits forwarded to the trainer.
            ppi: Forwarded to the trainer unchanged.
            source_model_name: Defaults to ``model_name`` when None.
            sweep_mode: When true, validation/test metrics are not logged.
            model_path: Optional path forwarded to the model loader.

        Returns:
            ``(model, valid_metrics, test_metrics)`` as returned by the trainer.
        """
        source_model_name = model_name if source_model_name is None else source_model_name

        tokenwise = self.probe_args.tokenwise
        num_labels = self.probe_args.num_labels
        num_runs = getattr(self.trainer_args, 'num_runs', 1)

        # A factory for fresh models is only supplied when several runs are requested.
        model_factory = None
        if num_runs > 1:
            model_factory = self._create_model_factory(
                model_name, tokenwise, num_labels, hybrid=False, model_path=model_path
            )

        model, tokenizer = get_base_model_for_training(
            model_name,
            tokenwise=tokenwise,
            num_labels=num_labels,
            hybrid=False,
            dtype=self.model_args.model_dtype,
            model_path=model_path,
        )
        if self.probe_args.lora:
            lora_cfg = self.probe_args
            model = wrap_lora(model, lora_cfg.lora_r, lora_cfg.lora_alpha, lora_cfg.lora_dropout)
        summary(model)

        trainer_kwargs = dict(
            model=model,
            tokenizer=tokenizer,
            model_name=model_name,
            data_name=data_name,
            train_dataset=train_set,
            valid_dataset=valid_set,
            test_dataset=test_set,
            ppi=ppi,
            log_id=self.random_id,
            source_model_name=source_model_name,
            model_factory=model_factory,
        )
        model, valid_metrics, test_metrics, _, _ = self.trainer_base_model(**trainer_kwargs)

        if sweep_mode:
            return model, valid_metrics, test_metrics

        for split_name, split_metrics in (('valid', valid_metrics), ('test', test_metrics)):
            self.log_metrics(data_name, model_name, split_metrics, split_name=split_name)
        return model, valid_metrics, test_metrics

    def _run_hybrid_probe(
            self,
            model_name,
            data_name,
            train_set,
            valid_set,
            test_set,
            tokenizer,
            emb_dict=None,
            ppi=False,
            source_model_name=None,
            sweep_mode: bool = False,
            model_path: str = None,
        ):
        """Train a base model together with a probe, or a probe alone for "random" models.

        Args:
            model_name: Passed to the model loader and used for logging. If it
                contains "random" (case-insensitive) only a probe is trained.
            data_name: Dataset name used for training and for metric logging.
            train_set, valid_set, test_set: Dataset splits forwarded to the trainer.
            tokenizer: Used for the probe-only fallback; on the hybrid path it
                is replaced by the tokenizer returned with the base model.
            emb_dict: Optional embeddings forwarded to the trainer.
            ppi: Forwarded to the trainer unchanged.
            source_model_name: Defaults to ``model_name`` when None.
            sweep_mode: When true, validation/test metrics are not logged.
            model_path: Optional path forwarded to the model loader.

        Returns:
            ``(model_or_probe, valid_metrics, test_metrics)``.
        """
        if source_model_name is None:
            source_model_name = model_name
        # Random models don't have a trainable base model, so fall back to regular probe
        if "random" in model_name.lower():
            print_message(f"Model {model_name} does not support hybrid training. Training a linear probe instead.")
            probe = get_probe(self.probe_args)
            summary(probe)
            # _run_nn_probe unpacks five values from trainer_probe; this branch
            # previously unpacked exactly three. The starred target keeps only
            # the leading three and works with either return arity.
            probe, valid_metrics, test_metrics, *_extra = self.trainer_probe(
                model=probe,
                tokenizer=tokenizer,
                model_name=model_name,
                data_name=data_name,
                train_dataset=train_set,
                valid_dataset=valid_set,
                test_dataset=test_set,
                emb_dict=emb_dict,
                ppi=ppi,
                log_id=self.random_id,
                source_model_name=source_model_name,
            )
            if not sweep_mode:
                self.log_metrics(data_name, model_name, valid_metrics, split_name='valid')
                self.log_metrics(data_name, model_name, test_metrics, split_name='test')
            return probe, valid_metrics, test_metrics

        tokenwise = self.probe_args.tokenwise
        num_labels = self.probe_args.num_labels
        num_runs = getattr(self.trainer_args, 'num_runs', 1)

        # Factories for fresh models/probes are only supplied for multi-run training.
        model_factory = self._create_model_factory(model_name, tokenwise, num_labels, hybrid=True, model_path=model_path) if num_runs > 1 else None
        probe_factory = self._create_probe_factory() if num_runs > 1 else None
        model, tokenizer = get_base_model_for_training(
            model_name,
            tokenwise=tokenwise,
            num_labels=num_labels,
            hybrid=True,
            dtype=self.model_args.model_dtype,
            model_path=model_path,
        )
        if self.probe_args.lora:
            model = wrap_lora(model, self.probe_args.lora_r, self.probe_args.lora_alpha, self.probe_args.lora_dropout)
        probe = get_probe(self.probe_args)
        summary(model)
        summary(probe)
        model, valid_metrics, test_metrics, _, _ = self.trainer_hybrid_model(
            model=model,
            tokenizer=tokenizer,
            probe=probe,
            model_name=model_name,
            data_name=data_name,
            train_dataset=train_set,
            valid_dataset=valid_set,
            test_dataset=test_set,
            emb_dict=emb_dict,
            ppi=ppi,
            log_id=self.random_id,
            source_model_name=source_model_name,
            model_factory=model_factory,
            probe_factory=probe_factory,
        )
        if not sweep_mode:
            self.log_metrics(data_name, model_name, valid_metrics, split_name='valid')
            self.log_metrics(data_name, model_name, test_metrics, split_name='test')
        return model, valid_metrics, test_metrics

    @log_method_calls
    def run_full_finetuning(self):
        """Fine-tune every configured model on every loaded dataset.

        For each pair, the dataset's label count and label type are written to
        ``self.probe_args`` / ``self.trainer_args`` before training, and the
        CUDA cache is emptied afterwards.
        """
        model_count = len(self.model_args.model_names)
        dataset_count = len(self.datasets)
        total_combinations = model_count * dataset_count
        self.logger.info(f"Processing {total_combinations} model/dataset combinations")

        for display_name, dispatch_type, model_path in self.model_args.model_entries():
            for data_name, dataset in self.datasets.items():
                self.logger.info(f"Processing dataset: {data_name}")
                train_set, valid_set, test_set, num_labels, label_type, ppi = dataset

                self.probe_args.num_labels = num_labels
                # The task type is recorded on both argument objects.
                self.probe_args.task_type = label_type
                self.trainer_args.task_type = label_type

                self.logger.info(f'Training probe for {data_name} with {display_name}')
                self._run_full_finetuning(
                    dispatch_type,
                    data_name,
                    train_set,
                    valid_set,
                    test_set,
                    ppi,
                    model_path=model_path,
                )
                torch.cuda.empty_cache()

    @log_method_calls
    def run_hybrid_probes(self):
        """Run hybrid (base model + probe) training for every model/dataset pair.

        Assumes ``save_embeddings_to_disk`` has already been called: the
        embedding dimension is read from the saved ``.db`` (SQL mode) or
        ``.pth`` file of each model before the datasets are processed.
        """
        probe_args = self.probe_args
        test_seq = self.all_seqs[0]

        # Announce how many combinations will be processed.
        total_combinations = len(self.model_args.model_names) * len(self.datasets)
        self.logger.info(f"Processing {total_combinations} model/dataset combinations")

        for display_name, dispatch_type, model_path in self.model_args.model_entries():
            self.logger.info(f"Processing model: {display_name}")

            tokenizer = get_tokenizer(dispatch_type, model_path=model_path)

            # Locate the saved embeddings and read the embedding size.
            use_sql = self._sql
            filename = get_embedding_filename(
                display_name,
                self._full,
                self.embedding_args.pooling_types,
                'db' if use_sql else 'pth',
                self.embedding_args.hidden_state_index,
            )
            save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
            if use_sql:
                # SQL: embeddings are gathered in real time during training.
                emb_dict = None
                input_size = self.get_embedding_dim_sql(save_path, test_seq, tokenizer)
            else:
                # pth: embeddings are loaded entirely into RAM.
                emb_dict = torch_load(save_path)
                input_size = self.get_embedding_dim_pth(emb_dict, test_seq, tokenizer)

            # Multi-column vector embeddings widen the input by the column count.
            multi_column = getattr(self.full_args, 'multi_column', None)
            if (not self._full) and multi_column:
                input_size = input_size * len(self.full_args.multi_column)

            for data_name, dataset in self.datasets.items():
                self.logger.info(f"Processing dataset: {data_name}")
                train_set, valid_set, test_set, num_labels, label_type, ppi = dataset

                # Vector-mode PPI doubles the feature width; matrix-mode PPI
                # concatenates two proteins along the sequence dimension, so
                # sequences can reach up to 2 * max_length.
                probe_args.input_size = input_size * 2 if (ppi and not self._full) else input_size
                probe_args.max_seq_len = self._max_length * 2 if (ppi and self._full) else self._max_length

                self.probe_args.num_labels = num_labels
                self.probe_args.task_type = label_type
                # TODO: both objects currently need the task type; settings
                # should probably be consolidated.
                self.trainer_args.task_type = label_type
                self.logger.info(f'Training probe for {data_name} with {display_name}')
                # TODO: eventually add options for optimizers and schedulers.
                # TODO: this is probably where different training schemes can
                # be differentiated.
                self._run_hybrid_probe(
                    model_name=dispatch_type,
                    data_name=data_name,
                    train_set=train_set,
                    valid_set=valid_set,
                    test_set=test_set,
                    tokenizer=tokenizer,
                    emb_dict=emb_dict,
                    ppi=ppi,
                    source_model_name=display_name,
                    model_path=model_path,
                )
                torch.cuda.empty_cache()
                # TODO: may link from the probe here to running inference on
                # input csv or HF datasets.

    @log_method_calls
    def run_nn_probes(self):
        """Train a neural-network probe for every model/dataset pair.

        Assumes ``save_embeddings_to_disk`` has already been called: the
        embedding dimension is read from the saved ``.db`` (SQL mode) or
        ``.pth`` file of each model before the datasets are processed.
        """
        probe_args = self.probe_args
        test_seq = self.all_seqs[0]

        # Announce how many combinations will be processed.
        total_combinations = len(self.model_args.model_names) * len(self.datasets)
        self.logger.info(f"Processing {total_combinations} model/dataset combinations")

        for display_name, dispatch_type, model_path in self.model_args.model_entries():
            self.logger.info(f"Processing model: {display_name}")

            tokenizer = get_tokenizer(dispatch_type, model_path=model_path)

            # Locate the saved embeddings and read the embedding size.
            use_sql = self._sql
            filename = get_embedding_filename(
                display_name,
                self._full,
                self.embedding_args.pooling_types,
                'db' if use_sql else 'pth',
                self.embedding_args.hidden_state_index,
            )
            save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
            if use_sql:
                # SQL: embeddings are gathered in real time during training.
                emb_dict = None
                input_size = self.get_embedding_dim_sql(save_path, test_seq, tokenizer)
            else:
                # pth: embeddings are loaded entirely into RAM.
                emb_dict = torch_load(save_path)
                input_size = self.get_embedding_dim_pth(emb_dict, test_seq, tokenizer)

            # Multi-column vector embeddings widen the input by the column count.
            multi_column = getattr(self.full_args, 'multi_column', None)
            if (not self._full) and multi_column:
                input_size = input_size * len(self.full_args.multi_column)

            print(f'Input dim: {input_size}')

            for data_name, dataset in self.datasets.items():
                self.logger.info(f"Processing dataset: {data_name}")
                train_set, valid_set, test_set, num_labels, label_type, ppi = dataset

                # Vector-mode PPI doubles the feature width; matrix-mode PPI
                # concatenates two proteins along the sequence dimension, so
                # sequences can reach up to 2 * max_length.
                probe_args.input_size = input_size * 2 if (ppi and not self._full) else input_size
                probe_args.max_seq_len = self._max_length * 2 if (ppi and self._full) else self._max_length

                self.probe_args.num_labels = num_labels
                self.probe_args.task_type = label_type
                # TODO: both objects currently need the task type; settings
                # should probably be consolidated.
                self.trainer_args.task_type = label_type
                self.logger.info(f'Training probe for {data_name} with {display_name}')
                # TODO: eventually add options for optimizers and schedulers.
                # TODO: this is probably where different training schemes can
                # be differentiated.
                self._run_nn_probe(
                    model_name=display_name,
                    data_name=data_name,
                    train_set=train_set,
                    valid_set=valid_set,
                    test_set=test_set,
                    tokenizer=tokenizer,
                    emb_dict=emb_dict,
                    ppi=ppi,
                    source_model_name=display_name,
                )
                torch.cuda.empty_cache()
                # TODO: may link from the probe here to running inference on
                # input csv or HF datasets.

    @log_method_calls
    def run_scikit_scheme(self):
        """Fit and evaluate scikit-learn probes for every model/dataset pair.

        When ``scikit_args.model_name`` is set, that model is run directly.
        Otherwise a LazyPredict search picks the best classifier
        ('singlelabel') or regressor ('regression') first, and its result is
        passed on to the final run. Test metrics are logged per pair.

        Raises:
            ValueError: for any other label type when no model name is given.
        """
        self.scikit_args = self._build_scikit_args()
        scikit_probe = ScikitProbe(self.scikit_args)
        # n_jobs falls back to 1 when it is not present on full_args.
        scikit_probe.n_jobs = vars(self.full_args).get("n_jobs", 1)

        for display_name, _dispatch_type, _model_path in self.model_args.model_entries():
            for data_name, dataset in self.datasets.items():
                X_train, y_train, X_valid, y_valid, X_test, y_test, label_type = self.prepare_scikit_dataset(display_name, dataset)
                all_splits = (X_train, y_train, X_valid, y_valid, X_test, y_test)

                if self.scikit_args.model_name is not None:
                    # An explicit model bypasses the LazyPredict search.
                    print_message(f"Skipping LazyPredict, using specified model: {self.scikit_args.model_name}")
                    results = scikit_probe.run_specific_model(*all_splits, model_results=None)
                else:
                    # Search for the best model on the train/valid splits first.
                    if label_type == 'singlelabel':
                        search_results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
                    elif label_type == 'regression':
                        search_results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
                    else:
                        raise ValueError(f'Label type {label_type} not supported')
                    # Then train and evaluate the selected model.
                    results = scikit_probe.run_specific_model(*all_splits, search_results)

                # A bare number is wrapped under 'test_mcc'; anything else is
                # logged as returned.
                final_scores = results.final_scores
                if isinstance(final_scores, (int, float)):
                    metrics_dict = {'test_mcc': final_scores}
                else:
                    metrics_dict = final_scores
                self.log_metrics(data_name, display_name, metrics_dict, split_name='test')

    @log_method_calls
    def generate_plots(self):
        print_message("Generating visualization plots...")
        # Determine which results file to use