-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathmain.py
More file actions
1205 lines (1080 loc) · 65.7 KB
/
main.py
File metadata and controls
1205 lines (1080 loc) · 65.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import argparse
import os
import sys
import yaml
from types import SimpleNamespace
# Ensure src is in sys.path
# The parent of this file's directory is put at the front of sys.path so that
# `import protify` below can resolve (presumably this file lives inside the
# `protify` package directory -- confirm against the repo layout).
_current_dir = os.path.dirname(os.path.abspath(__file__))
_src_dir = os.path.abspath(os.path.join(_current_dir, ".."))
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)
# Docker/Mount fallback: If the 'src/protify' folder was mounted directly
# as the root (e.g. /workspace) and lost its 'protify' wrapper name.
try:
    import protify
except ImportError:
    # Only attempt the fallback when this directory itself looks like a package.
    if os.path.exists(os.path.join(_current_dir, "__init__.py")):
        import importlib.util
        # Load this directory's __init__.py under the module name "protify".
        _spec = importlib.util.spec_from_file_location("protify", os.path.join(_current_dir, "__init__.py"))
        if _spec is not None:
            _protify_mod = importlib.util.module_from_spec(_spec)
            # Registered in sys.modules before execution, so any `import protify`
            # triggered while __init__.py runs finds this module object.
            sys.modules["protify"] = _protify_mod
            _spec.loader.exec_module(_protify_mod)
from protify.cloud_cli import _run_on_cloud, _should_auto_run_cloud
def _cli_flag_given(raw_argv, flag):
    """Return True when ``flag`` appears in ``raw_argv`` as ``flag`` or ``flag=value``.

    Abbreviated option prefixes that argparse would also accept are not detected.
    """
    assignment_prefix = f"{flag}="
    return any(token == flag or token.startswith(assignment_prefix) for token in raw_argv)


def _build_argument_parser():
    """Create the argparse parser that declares every command-line option of this script."""
    parser = argparse.ArgumentParser(description="Script with arguments mirroring the provided YAML settings.")

    # ----------------- ID ----------------- #
    parser.add_argument("--hf_username", default="Synthyra", help="Hugging Face username.")
    parser.add_argument("--hf_token", default=None, help="Hugging Face token.")
    parser.add_argument("--synthyra_api_key", default=None, help="Synthyra API key.")
    parser.add_argument("--wandb_api_key", default=None, help="Wandb API key.")
    parser.add_argument("--cloud_api_key", default=None, help="Cloud backend API key. When provided, jobs are dispatched to the remote cloud backend.")
    parser.add_argument("--cloud_url", default=None, help="Cloud backend URL (default: https://api.synthyra.com).")
    parser.add_argument("--cloud_gpu_type", default=None, help="GPU type for cloud execution (e.g. A10, A100, H100).")
    parser.add_argument("--cloud_timeout_seconds", type=int, default=None, help="Timeout in seconds for cloud jobs (default: 86400).")
    parser.add_argument("--cloud_poll_interval", type=int, default=None, help="Poll interval in seconds for cloud job status (default: 5).")
    parser.add_argument("--cloud_artifacts_dir", default=None, help="Local directory to save cloud job artifacts (default: cloud_artifacts).")

    # ----------------- Paths ----------------- #
    parser.add_argument("--hf_home", type=str, default=None, help="Customize the HF cache directory.")
    parser.add_argument("--yaml_path", type=str, default=None, help="Path to the YAML file.")
    parser.add_argument("--log_dir", type=str, default="logs", help="Path to the log directory.")
    parser.add_argument("--results_dir", type=str, default="results", help="Path to the results directory.")
    parser.add_argument("--model_save_dir", default="weights", help="Directory to save models.")
    parser.add_argument("--embedding_save_dir", default="embeddings", help="Directory to save embeddings.")
    parser.add_argument("--download_dir", default="Synthyra/vector_embeddings", help="Directory to download embeddings to.")
    parser.add_argument("--plots_dir", default="plots", help="Directory to save plots.")
    parser.add_argument("--replay_path", type=str, default=None, help="Path to the replay file.")
    parser.add_argument("--pretrained_probe_path", type=str, default=None)  # TODO not used right now

    # ----------------- DataArguments ----------------- #
    parser.add_argument("--delimiter", default=",", help="Delimiter for data.")
    parser.add_argument("--col_names", nargs="+", default=["seqs", "labels"], help="Column names.")  # DEPRECATED, found automatically now
    parser.add_argument("--max_length", type=int, default=2048, help="Maximum sequence length.")
    parser.add_argument(
        "--padding",
        choices=["max_length", "longest"],
        default="max_length",
        help="Padding strategy. 'max_length' pads all sequences to --max_length (recommended for torch.compile + flex attention). 'longest' pads to the longest sequence in each batch.",
    )
    parser.add_argument("--trim", action="store_true",
                        help="Truncate sequences longer than --max_length instead of dropping them from the dataset.")
    parser.add_argument("--data_names", nargs="+", default=[], help="List of HF dataset names.")
    parser.add_argument("--data_dirs", nargs="+", default=[], help="List of local data directories.")
    parser.add_argument("--aa_to_dna", action="store_true", help="Translate amino-acid sequences to DNA codon sequences using common human synonymous codons.")
    parser.add_argument("--aa_to_rna", action="store_true", help="Translate amino-acid sequences to RNA codon sequences using common human synonymous codons.")
    parser.add_argument("--dna_to_aa", action="store_true", help="Translate DNA codon sequences to amino-acid sequences and drop stop codons.")
    parser.add_argument("--rna_to_aa", action="store_true", help="Translate RNA codon sequences to amino-acid sequences and drop stop codons.")
    parser.add_argument("--codon_to_aa", action="store_true", help="Translate codon-token sequences to amino-acid sequences and drop stop codons.")
    parser.add_argument("--aa_to_codon", action="store_true", help="Translate amino-acid sequences to codon-token sequences.")
    parser.add_argument("--random_pair_flipping", action="store_true", help="Randomly swap paired inputs during training.")

    # ----------------- BaseModelArguments ----------------- #
    parser.add_argument("--model_names", nargs="+", default=None, help="List of preset model names to use (e.g. ESM2-8). Mutually exclusive with --model_paths/--model_types.")
    parser.add_argument("--model_paths", nargs="+", default=None, help="List of model paths (HuggingFace or local). Must be paired with --model_types. Mutually exclusive with --model_names.")
    parser.add_argument("--model_types", nargs="+", default=None, help="List of model type keywords paired with --model_paths (e.g. esm2, esmc, protbert, prott5, ankh, glm, dplm, dplm2, protclm, onehot, amplify, e1, vec2vec, calm, custom, random).")
    parser.add_argument("--model_dtype", type=str, choices=["fp32", "fp16", "bf16", "float32", "float16", "bfloat16"], default="bf16", help="Data type for loading base models.")
    parser.add_argument("--use_xformers", action="store_true", help="Use xformers memory-efficient attention for AMPLIFY models.")

    # ----------------- ProbeArguments ----------------- #
    parser.add_argument("--probe_type", choices=["linear", "transformer", "lyra"], default="linear", help="Type of probe.")
    parser.add_argument("--tokenwise", action="store_true", help="Use a tokenwise probe (per-token outputs) instead of a sequence-level probe.")
    parser.add_argument("--hidden_size", type=int, default=8192, help="Hidden dimension size for probe.")
    parser.add_argument("--dropout", type=float, default=0.2, help="Dropout rate.")
    parser.add_argument("--n_layers", type=int, default=1, help="Number of layers.")
    # store_false: the attribute defaults to True and passing the flag turns it off.
    parser.add_argument("--pre_ln", action="store_false",
                        help="Disable pre-layernorm in the transformer probe (pre-LN enabled by default).")
    parser.add_argument("--classifier_size", type=int, default=4096, help="Feed-forward dimension.")
    parser.add_argument("--transformer_dropout", type=float, default=0.1, help="Dropout rate for the transformer layers.")
    parser.add_argument("--classifier_dropout", type=float, default=0.2, help="Dropout rate for the classifier.")
    parser.add_argument("--head_size", type=int, default=128, help="Attention head dimension. n_heads is derived as hidden_size // head_size.")
    parser.add_argument("--n_heads", type=int, default=None, help="[DEPRECATED] Use --head_size. If provided, head_size is derived as hidden_size // n_heads.")
    # store_false: the attribute defaults to True and passing the flag turns it off.
    parser.add_argument("--rotary", action="store_false",
                        help="Disable rotary embeddings in the transformer probe (rotary enabled by default).")
    parser.add_argument("--attention_backend", choices=["kernels", "flex", "sdpa"], default="flex", help="Attention backend for transformer-style probes.")
    parser.add_argument("--output_s_max", action="store_true", help="Return s_max bounds from transformer-style probe attention layers.")
    parser.add_argument("--probe_pooling_types", nargs="+", default=["mean", "var"], help="Pooling types to use.")
    parser.add_argument("--bom_k", type=int, default=60, help="K-mer window size for 'bom' pooling in the transformer probe. Default 60 is a cross-task compromise from Hoang & Singh 2025: peak for DPI (Section 4.3), mid-range of the {20,40,60,80,100} sweep on FLUO/BLAC, and close to the reported k=100 optimum for remote homology. Only used when 'bom' is in --probe_pooling_types.")
    parser.add_argument("--use_bias", action="store_true", help="Use bias terms in Linear layers.")
    parser.add_argument("--expansion_ratio", type=float, default=8/3, help="FFN expansion ratio for transformer probes.")
    parser.add_argument("--save_model", action="store_true", help="Save the trained model/probe to disk.")
    parser.add_argument("--push_raw_probe", action="store_true", help="With --save_model, push the raw probe class to the Hub instead of the packaged AutoModel.")
    parser.add_argument("--push_raw_probe_repo", type=str, default=None, help="Custom HF repo id for --push_raw_probe. If omitted, auto-generated.")
    parser.add_argument("--production_model", action="store_true", help="Train a production-grade scikit model (used with --use_scikit).")
    parser.add_argument("--lora", action="store_true", help="Wrap the base model in LoRA adapters during full-finetuning / hybrid training.")
    parser.add_argument("--lora_r", type=int, default=8, help="Number of trainable parameters in the LoRA model.")
    parser.add_argument("--lora_alpha", type=float, default=32.0, help="Alpha for the LoRA model.")
    parser.add_argument("--lora_dropout", type=float, default=0.01, help="Dropout rate for the LoRA model.")
    parser.add_argument("--sim_type", choices=["dot", "euclidean", "cosine"], default="dot", help="Cross-attention mechanism for token-parameter-attention")
    parser.add_argument("--add_token_ids", action="store_true", help="Add learned token-type embeddings to distinguish protein A vs B in PPI tasks.")

    # ----------------- ScikitArguments ----------------- #
    parser.add_argument("--scikit_n_iter", type=int, default=10, help="Number of iterations for scikit model.")
    parser.add_argument("--scikit_cv", type=int, default=3, help="Number of cross-validation folds for scikit model.")
    parser.add_argument("--scikit_random_state", type=int, default=None, help="Random state for scikit model (if None, uses global seed).")
    parser.add_argument("--scikit_model_name", type=str, default=None, help="Name of the scikit model to use.")
    parser.add_argument("--scikit_model_args", type=str, default=None, help="JSON string of hyperparameters to use (skips tuning). E.g. '{\"n_estimators\": 500, \"max_depth\": 7}'")
    parser.add_argument("--use_scikit", action="store_true", help="Use a scikit-learn model instead of a neural probe.")
    parser.add_argument("--n_jobs", type=int, default=1, help="Number of processes to use in scikit.")  # TODO integrate with GUI and main

    # ----------------- EmbeddingArguments ----------------- #
    parser.add_argument("--embedding_batch_size", type=int, default=16, help="Batch size for embedding generation.")
    parser.add_argument("--embedding_num_workers", type=int, default=0, help="Number of worker processes for embedding generation.")
    parser.add_argument("--num_workers", type=int, default=0, help="Number of worker processes for data loading.")
    parser.add_argument("--download_embeddings", action="store_true", help="Download pre-computed embeddings from the Hub instead of computing them locally.")
    parser.add_argument("--matrix_embed", action="store_true", help="Store per-token (matrix) embeddings instead of pooled vector embeddings.")
    parser.add_argument("--embedding_pooling_types", nargs="+", default=["mean", "var"], help="Pooling types for embeddings.")
    parser.add_argument("--embedding_hidden_state_index", type=int, default=-1, help="Hidden-state tuple index to embed from. -1 uses the final hidden state.")
    parser.add_argument("--save_embeddings", action="store_true", help="Save computed embeddings to disk.")
    parser.add_argument("--embed_dtype", type=str, choices=["fp32", "fp16", "bf16", "float32", "float16", "bfloat16"], default=None, help="Data type for embeddings. If omitted, uses --model_dtype.")
    parser.add_argument("--no_embedding_scaler", dest="embedding_scaler", action="store_false", default=True,
                        help="Disable StandardScaler for pooled vector embeddings during probe/scikit training.")
    parser.add_argument("--sql", action="store_true", help="Store embeddings in a SQLite backend (streamed at train time) instead of in-RAM .pth.")
    parser.add_argument("--read_scaler", type=int, default=100, help="Read scaler for SQL storage.")

    # ----------------- Multi-Column Sequences ----------------- #
    parser.add_argument("--multi_column", nargs="+", default=None, help="If set, list of sequence column names to combine per sample.")

    # ----------------- TrainerArguments ----------------- #
    parser.add_argument("--num_epochs", type=int, default=200, help="Number of epochs to train for.")
    parser.add_argument("--probe_batch_size", type=int, default=64, help="Batch size for probe training.")
    parser.add_argument("--base_batch_size", type=int, default=4, help="Batch size for base model training.")
    parser.add_argument("--probe_grad_accum", type=int, default=1, help='Gradient accumulation steps for probe training.')
    parser.add_argument("--base_grad_accum", type=int, default=8, help='Gradient accumulation steps for base model training.')
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate.")
    ### TODO integrate
    #parser.add_argument("--probe_lr", type=float, default=1e-4, help="Learning rate for probe training.")
    #parser.add_argument("--base_lr", type=float, default=1e-5, help="Learning rate for base model training.")
    #parser.add_argument("--lr_scheduler", type=str, default='cosine', help='Learning rate scheduler.')
    #parser.add_argument("--optimizer", type=str, default='adamw', help='Optimizer.')
    parser.add_argument("--weight_decay", type=float, default=0.00, help="Weight decay.")
    parser.add_argument("--patience", type=int, default=1, help="Patience for early stopping (probe phase, and base phase unless --base_patience is set).")
    parser.add_argument("--base_num_epochs", type=int, default=None,
                        help="Epoch count for the base-model phase of hybrid / full-finetuning training. If omitted, falls back to --num_epochs.")
    parser.add_argument("--base_patience", type=int, default=None,
                        help="Early-stopping patience for the base-model phase of hybrid / full-finetuning training. If omitted, falls back to --patience.")
    parser.add_argument("--base_lr", type=float, default=None,
                        help="Learning rate for the base-model phase of hybrid / full-finetuning training (useful when LoRA/full-FT wants a different LR than the probe). If omitted, falls back to --lr.")
    parser.add_argument("--seed", type=int, default=None, help="Seed for reproducibility (if omitted, current time is used).")
    parser.add_argument("--deterministic", action="store_true",
                        help="Enable deterministic behavior for reproducibility (slightly slower training).")
    parser.add_argument("--full_finetuning", action="store_true", help="Fully fine-tune the base model end-to-end.")
    parser.add_argument("--hybrid_probe", action="store_true", help="Train probe first, then fine-tune base model + probe jointly.")
    parser.add_argument("--num_runs", type=int, default=1, help="Number of training runs with different seeds. Results will show mean±std across runs.")
    parser.add_argument("--no_compile", action="store_true", help="Disable torch.compile on probes during training (compiled by default).")

    # ----------------- Balanced Regression Metrics (EpHod-style) ----------------- #
    parser.add_argument("--balanced_regression_metrics", action="store_true",
                        help="Enable EpHod-style balanced regression metrics on valid/test (enabled by default).")
    parser.add_argument("--no_balanced_regression_metrics", dest="balanced_regression_metrics", action="store_false",
                        help="Disable EpHod-style balanced regression metrics.")
    # Both flags above write to the same dest; this makes "enabled" the default.
    parser.set_defaults(balanced_regression_metrics=True)
    parser.add_argument("--balanced_weight_method", type=str, default='bin_inv',
                        choices=['none', 'bin_inv', 'bin_inv_sqrt', 'LDS_inv', 'LDS_inv_sqrt', 'LDS_extreme'],
                        help="Weighting scheme for balanced regression metrics.")
    parser.add_argument("--balanced_bin_borders", type=float, nargs='+', default=None,
                        help="Explicit bin borders for balanced metrics (e.g., 5 9 for pH). Default: 1/3 and 2/3 quantiles of training labels.")
    parser.add_argument("--balanced_n_resamples", type=int, default=100,
                        help="Number of resamples for balanced Pearson/Spearman (default: 100).")
    parser.add_argument("--balanced_lds_bins", type=int, default=100,
                        help="Number of bins for LDS density estimation.")
    parser.add_argument("--balanced_lds_ks", type=int, default=5,
                        help="Kernel size for LDS Gaussian smoothing.")
    parser.add_argument("--balanced_lds_sigma", type=float, default=2.0,
                        help="Sigma for LDS Gaussian smoothing.")

    # ----------------- ProteinGym Arguments ----------------- #
    parser.add_argument("--dms_ids", nargs="+", default=["all"],
                        help="ProteinGym DMS assay IDs to evaluate (space-separated), or 'all' to run all assays.")
    parser.add_argument("--proteingym", action="store_true", help="Run a ProteinGym zero-shot experiment.")
    parser.add_argument("--mode", type=str, default='benchmark',
                        help="ProteinGym zero-shot mode: 'benchmark', 'indels', 'multiples', 'singles'")
    parser.add_argument("--scoring_method", choices=["masked_marginal", "mutant_marginal", "wildtype_marginal", "pll", "global_log_prob"], default="masked_marginal",
                        help="Select a scoring method for ProteinGym zero-shot.")
    parser.add_argument("--scoring_window", choices=["optimal", "sliding"], default="optimal",
                        help="Select how to slice the sequence for ProteinGym zero-shot.")
    parser.add_argument("--pg_batch_size", type=int, default=32,
                        help="Batch size for ProteinGym zero-shot scoring (default: 32).")
    parser.add_argument("--compare_scoring_methods", action="store_true",
                        help="Compare different scoring methods across models and DMS assays.")
    parser.add_argument("--score_only", action="store_true",
                        help="Run only the ProteinGym benchmarking script on existing CSV files; skip zero-shot scoring.")

    # ----------------- W&B Arguments ----------------- #
    parser.add_argument("--use_wandb_hyperopt", action="store_true", help="Run a Weights & Biases hyperparameter sweep instead of a single training run.")
    parser.add_argument("--wandb_project", type=str, default="Protify", help="W&B project name for sweeps.")
    parser.add_argument("--wandb_entity", type=str, default=None, help="W&B entity (team/user) for sweeps.")
    parser.add_argument("--sweep_config_path", type=str, default="yamls/sweep.yaml", help="Path to W&B sweep config YAML.")
    parser.add_argument("--sweep_count", type=int, default=10, help="Number of hyperparameter trials to run in the sweep.")
    parser.add_argument("--sweep_method", type=str, default="bayes", choices=["bayes", "grid", "random"], help="Sweep method for hyperparameter optimization.")
    parser.add_argument("--sweep_metric_cls",type=str,default="eval_loss", help="Classification metric to optimize during sweep (e.g., eval_f1, eval_accuracy, eval_mcc)")
    parser.add_argument("--sweep_metric_reg",type=str,default="eval_loss", help="Regression metric to optimize during sweep (e.g., eval_r_squared, eval_spearman_rho, eval_pearson_rho)")
    parser.add_argument("--sweep_goal", type=str, default='minimize', choices=['maximize', 'minimize'], help="Goal for the sweep metric (maximize/minimize)")
    parser.add_argument("--sweep_name", type=str, default=None, help="Display name for the W&B sweep. Overrides 'name' in the sweep YAML.")
    return parser


def _merge_yaml_settings(args, raw_argv, parser):
    """Load ``args.yaml_path`` and overlay selected command-line values onto it.

    Returns a SimpleNamespace built from the YAML mapping. Only the keys handled
    below are merged; every other setting comes from the YAML file alone, so a
    key missing from the YAML stays missing on the returned namespace.
    """
    with open(args.yaml_path, 'r') as file:
        # NOTE(review): an empty YAML file makes safe_load return None and the
        # SimpleNamespace construction below raise TypeError.
        settings = yaml.safe_load(file)
    yaml_args = SimpleNamespace(**settings)
    yaml_values = vars(yaml_args)  # live view of the namespace's attribute dict

    def take_cli_unless_default(key):
        # The CLI value wins when it differs from the argparse default;
        # otherwise it only fills in a key the YAML does not define.
        cli_value = getattr(args, key)
        if cli_value != parser.get_default(key) or key not in yaml_values:
            setattr(yaml_args, key, cli_value)

    for key in (
        "hf_token",
        "hf_home",
        "synthyra_api_key",
        "wandb_api_key",
        "cloud_api_key",
        "cloud_url",
        "cloud_gpu_type",
        "cloud_timeout_seconds",
        "cloud_poll_interval",
        "cloud_artifacts_dir",
    ):
        take_cli_unless_default(key)

    # store_true flags: set on the CLI -> True, otherwise the YAML value (False if absent).
    yaml_args.use_wandb_hyperopt = args.use_wandb_hyperopt or bool(yaml_values.get("use_wandb_hyperopt", False))

    for key in (
        "wandb_project",
        "wandb_entity",
        "sweep_config_path",
        "sweep_count",
        "sweep_method",
        "sweep_metric_cls",
        "sweep_metric_reg",
        "sweep_goal",
    ):
        take_cli_unless_default(key)

    yaml_args.yaml_path = args.yaml_path

    for key in (
        "aa_to_dna",
        "aa_to_rna",
        "dna_to_aa",
        "rna_to_aa",
        "codon_to_aa",
        "aa_to_codon",
        "random_pair_flipping",
        "push_raw_probe",
    ):
        setattr(yaml_args, key, getattr(args, key) or bool(yaml_values.get(key, False)))

    # Ensure ProteinGym defaults exist when using YAML configs
    yaml_values.setdefault("proteingym", False)
    yaml_values.setdefault("dms_ids", ["all"])
    yaml_values.setdefault("mode", None)
    yaml_values.setdefault("scoring_method", "masked_marginal")
    # Ensure num_runs default exists
    yaml_values.setdefault("num_runs", 1)

    # A missing or null model_dtype in the YAML falls back to the CLI value.
    if yaml_values.get("model_dtype") is None:
        yaml_args.model_dtype = args.model_dtype
    yaml_values.setdefault("embed_dtype", args.embed_dtype)

    # The default (-1) is a meaningful value, so presence on the command line
    # is what decides whether the CLI overrides the YAML here.
    if _cli_flag_given(raw_argv, "--embedding_hidden_state_index") or "embedding_hidden_state_index" not in yaml_values:
        yaml_args.embedding_hidden_state_index = args.embedding_hidden_state_index

    if "--no_embedding_scaler" in raw_argv:
        yaml_args.embedding_scaler = False
    elif "embedding_scaler" not in yaml_values:
        yaml_args.embedding_scaler = args.embedding_scaler

    yaml_values.setdefault("model_paths", args.model_paths)
    yaml_values.setdefault("model_types", args.model_types)
    # NOTE(review): args.model_names may already hold the ["ESM2-8"] fallback, so a
    # YAML that sets model_paths but not model_names ends up with both populated.
    # Confirm how downstream code resolves that.
    yaml_values.setdefault("model_names", args.model_names)
    return yaml_args


def parse_arguments():
    """Parse command-line options, validate them, and optionally merge a YAML config.

    Side effects: logs in to the Hugging Face Hub when --hf_token is given (also
    exporting it as HF_TOKEN), attempts a Weights & Biases login when
    --wandb_api_key is given, and prints status messages.

    Returns:
        The argparse Namespace when --yaml_path is not given; otherwise a
        SimpleNamespace built from the YAML file with selected CLI overrides.

    Raises:
        AssertionError: on conflicting --n_heads/--head_size values, an invalid
            --model_names/--model_paths/--model_types combination, or a
            non-linear probe without --matrix_embed.
    """
    raw_argv = sys.argv[1:]
    parser = _build_argument_parser()
    args = parser.parse_args(raw_argv)

    if args.n_heads is not None:
        import warnings
        warnings.warn(
            "--n_heads is deprecated and will be removed in a future release. "
            "Use --head_size instead (n_heads = hidden_size // head_size).",
            DeprecationWarning,
            stacklevel=2,
        )
        if args.hidden_size % args.n_heads != 0:
            raise AssertionError(
                f"--hidden_size {args.hidden_size} not divisible by deprecated --n_heads {args.n_heads}"
            )
        derived_head_size = args.hidden_size // args.n_heads
        # Detect both "--head_size N" and "--head_size=N"; otherwise an explicit
        # head size would be silently replaced by the derived one.
        if _cli_flag_given(raw_argv, "--head_size") and derived_head_size != args.head_size:
            raise AssertionError(
                f"--head_size {args.head_size} conflicts with deprecated --n_heads {args.n_heads} "
                f"(derived head_size={derived_head_size})"
            )
        args.head_size = derived_head_size
        args.n_heads = None

    # Validate model_names vs model_paths/model_types mutual exclusivity
    if args.model_paths is not None:
        if args.model_types is None:
            raise AssertionError("--model_types is required when --model_paths is provided.")
        if len(args.model_paths) != len(args.model_types):
            raise AssertionError(f"--model_paths ({len(args.model_paths)}) and --model_types ({len(args.model_types)}) must have the same length.")
        if args.model_names is not None:
            raise AssertionError("--model_names cannot be used together with --model_paths/--model_types.")
    elif args.model_types is not None:
        raise AssertionError("--model_paths is required when --model_types is provided.")
    if args.model_names is None and args.model_paths is None:
        args.model_names = ["ESM2-8"]

    if not (args.probe_type == "linear" or args.matrix_embed):
        raise AssertionError("When probe_type is not linear, --matrix_embed must be True.")

    if args.hf_token is not None:
        from huggingface_hub import login
        # Override environment variable to ensure this token is used
        os.environ["HF_TOKEN"] = args.hf_token
        login(args.hf_token)
        print("Logged in to HuggingFace Hub with token from arguments")
    else:
        # Check if token exists in environment
        hf_token_env = os.environ.get("HF_TOKEN")
        if hf_token_env:
            print("Note: HF_TOKEN found in environment")
            print("Note: This token will be used for read operations only unless overridden")

    if args.wandb_api_key is not None:
        # Best effort: a failed W&B login is reported but does not abort parsing.
        try:
            import wandb
            wandb.login(key=args.wandb_api_key)
            print('Logged into Weights & Biases')
        except Exception as e:
            print(f'W&B login failed: {e}')

    if args.synthyra_api_key is not None:
        print('Synthyra API key provided')

    if args.yaml_path is not None:
        return _merge_yaml_settings(args, raw_argv, parser)
    return args
if __name__ == "__main__":
    # Environment-level configuration that has to be in place before the heavy imports run.
    args = parse_arguments()

    # Without a YAML config, a run needs either datasets or a ProteinGym experiment.
    has_datasets = bool(args.data_names or args.data_dirs)
    has_proteingym = bool(args.proteingym)
    if not (has_datasets or has_proteingym) and args.yaml_path is None:
        raise AssertionError("No datasets specified. Provide --data_names or --data_dirs, or run a ProteinGym experiment.")

    if args.use_xformers:
        os.environ["_USE_XFORMERS"] = "1"
        print("xformers memory efficient attention enabled for AMPLIFY models")

    if args.hf_home is not None:
        # The cache locations must be exported before any HF library is imported.
        import pathlib

        base_path = args.hf_home
        cache_root = f"{base_path}/hf_cache"
        tmp_root = f"{base_path}/tmp"
        pathlib.Path(cache_root).mkdir(parents=True, exist_ok=True)
        pathlib.Path(tmp_root).mkdir(parents=True, exist_ok=True)
        os.environ.update(
            {
                "HF_HOME": cache_root,
                "HF_DATASETS_CACHE": f"{cache_root}/datasets",
                # TRANSFORMERS_CACHE is deprecated, but setting it does not hurt anything.
                "TRANSFORMERS_CACHE": f"{cache_root}/transformers",
                "HF_HUB_CACHE": f"{cache_root}/hub",
            }
        )
        print(f"HF_HOME: {os.environ['HF_HOME']}")
        print(f"HF_DATASETS_CACHE: {os.environ['HF_DATASETS_CACHE']}")
        print(f"TRANSFORMERS_CACHE: {os.environ['TRANSFORMERS_CACHE']}")
        print(f"HF_HUB_CACHE: {os.environ['HF_HUB_CACHE']}")

    # Set global seed before doing anything else
    # If seed is None, set_global_seed will derive it from current time
    if args.deterministic:
        from protify.seed_utils import set_determinism

        set_determinism()
import protify.entrypoint_setup # needs to happen after set_determinism()
import torch
from torchinfo import summary
from protify.base_models.get_base_models import BaseModelArguments, get_base_model_for_training, get_tokenizer
from protify.base_models.utils import wrap_lora
from protify.benchmarks.proteingym.compare_scoring_methods import compare_scoring_methods
from protify.benchmarks.proteingym.scorer import ProteinGymRunner
from protify.data.data_mixin import DataArguments, DataMixin
from protify.embedder import Embedder, EmbeddingArguments, get_embedding_filename
from protify.hyperopt_utils import HyperoptModule
from protify.logger import MetricsLogger, log_method_calls
from protify.probes.get_probe import ProbeArguments, get_probe
from protify.probes.scikit_classes import ScikitArguments, ScikitProbe
from protify.probes.trainers import TrainerArguments, TrainerMixin
from protify.seed_utils import set_global_seed
from protify.utils import expand_dms_ids_all, print_message, torch_load
from protify.visualization.plot_result import create_plots
class MainProcess(MetricsLogger, DataMixin, TrainerMixin):
def __init__(self, full_args, GUI=False):
    """Initialise the base classes, keep ``full_args`` and build the dtype table.

    Args:
        full_args: namespace-like object holding every run setting.
        GUI: when truthy, ``start_log_main()`` is not called here.
    """
    super(MainProcess, self).__init__(full_args)
    # NOTE(review): super(X, self) starts the MRO lookup *after* X, so these
    # two calls reach whatever follows DataMixin / TrainerMixin in the MRO,
    # not those classes' own __init__ -- confirm this is intended.
    super(DataMixin, self).__init__()
    super(TrainerMixin, self).__init__()
    self.full_args = full_args
    if not GUI:
        self.start_log_main()

    # Short aliases first, then names that match the torch attribute exactly.
    # int8 is currently not offered.
    dtype_table = {
        "fp32": torch.float32,
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
    }
    for torch_name in ("float32", "float16", "bfloat16", "float8_e4m3fn", "float8_e5m2"):
        dtype_table[torch_name] = getattr(torch, torch_name)
    self.dtype_map = dtype_table
def _build_scikit_args(self):
    """Build ``ScikitArguments`` from ``full_args``, defaulting any missing field.

    Defaults used when the attribute is absent: ``n_iter=10``, ``cv=3``,
    ``random_state=None``, ``model_name=None``, ``production_model=False``.
    """
    settings = self.full_args.__dict__
    return ScikitArguments(
        n_iter=settings.get("scikit_n_iter", 10),
        cv=settings.get("scikit_cv", 3),
        random_state=settings.get("scikit_random_state", None),
        model_name=settings.get("scikit_model_name", None),
        production_model=settings.get("production_model", False),
    )
@log_method_calls
def apply_current_settings(self):
    """Normalise ``full_args`` and rebuild every per-component argument object.

    Steps, in order:
      1. Fill in ``model_dtype`` ("bf16") and ``embed_dtype`` (None) if absent.
      2. Convert dtype strings to torch dtypes through ``self.dtype_map``;
         an ``embed_dtype`` of None inherits the resolved ``model_dtype``.
         Values that are already non-string are left untouched.
      3. Derive ``torch_compile`` from ``no_compile`` when not set explicitly.
      4. Rebuild the data / embedding / model / probe / trainer / logger /
         scikit argument objects from the full settings dict.
      5. Mirror the data-handling settings onto private ``self._*`` attributes.

    Raises:
        KeyError: a dtype string is not a key of ``self.dtype_map``.
    """
    settings = self.full_args.__dict__

    def lookup_dtype(name):
        # Same exception type as a plain dict lookup, but the message names
        # the offending value and the accepted spellings.
        try:
            return self.dtype_map[name]
        except KeyError:
            raise KeyError(
                f"Unsupported dtype {name!r}; expected one of {sorted(self.dtype_map)}"
            ) from None

    if "model_dtype" not in settings:
        self.full_args.model_dtype = "bf16"
    if "embed_dtype" not in settings:
        self.full_args.embed_dtype = None

    if isinstance(self.full_args.model_dtype, str):
        self.full_args.model_dtype = lookup_dtype(self.full_args.model_dtype)
    if self.full_args.embed_dtype is None:
        self.full_args.embed_dtype = self.full_args.model_dtype
    elif isinstance(self.full_args.embed_dtype, str):
        self.full_args.embed_dtype = lookup_dtype(self.full_args.embed_dtype)

    if "torch_compile" not in settings:
        self.full_args.torch_compile = not getattr(self.full_args, "no_compile", False)

    # Each argument class receives the complete settings dict as keywords.
    self.data_args = DataArguments(**self.full_args.__dict__)
    self.embedding_args = EmbeddingArguments(**self.full_args.__dict__)
    self.model_args = BaseModelArguments(**self.full_args.__dict__)
    self.probe_args = ProbeArguments(**self.full_args.__dict__)
    self.trainer_args = TrainerArguments(**self.full_args.__dict__)
    self.logger_args = SimpleNamespace(**self.full_args.__dict__)
    self.scikit_args = self._build_scikit_args()

    # Private mirrors of the data-handling settings.
    self._sql = self.full_args.sql
    self._full = self.full_args.matrix_embed
    self._max_length = self.full_args.max_length
    self._trim = self.full_args.trim
    self._delimiter = self.full_args.delimiter
    self._col_names = self.full_args.col_names
    self._aa_to_dna = self.full_args.aa_to_dna
    self._aa_to_rna = self.full_args.aa_to_rna
    self._dna_to_aa = self.full_args.dna_to_aa
    self._rna_to_aa = self.full_args.rna_to_aa
    self._codon_to_aa = self.full_args.codon_to_aa
    self._aa_to_codon = self.full_args.aa_to_codon
    self._multi_column = getattr(self.full_args, 'multi_column', None)
@log_method_calls
def get_datasets(self):
    """Call ``self.get_data()`` and store its two results on the instance."""
    loaded = self.get_data()
    # get_data() must yield exactly two items: the datasets and the sequences.
    self.datasets, self.all_seqs = loaded
@log_method_calls
def save_embeddings_to_disk(self):
    """Run the embedder once per configured model with saving switched on."""
    self.embedding_args.save_embeddings = True
    embed = Embedder(self.embedding_args, self.all_seqs)
    for entry in self.model_args.model_entries():
        name, kind, path = entry
        # The return value is not needed here; it is discarded.
        embed(name, model_type=kind, model_path=path)
def _create_model_factory(self, model_name, tokenwise, num_labels, hybrid, model_path=None):
    """Return a zero-argument callable that builds a fresh base model.

    Used for multi-run mode. ``self.model_args`` and ``self.probe_args`` are
    read when the callable is invoked, not when it is created.
    """
    def build_model():
        fresh_model, _tokenizer = get_base_model_for_training(
            model_name,
            tokenwise=tokenwise,
            num_labels=num_labels,
            hybrid=hybrid,
            dtype=self.model_args.model_dtype,
            model_path=model_path,
        )
        if not self.probe_args.lora:
            return fresh_model
        return wrap_lora(
            fresh_model,
            self.probe_args.lora_r,
            self.probe_args.lora_alpha,
            self.probe_args.lora_dropout,
        )

    return build_model
def _create_probe_factory(self):
    """Return a zero-argument callable that builds a fresh probe (multi-run mode).

    ``self.probe_args`` is read each time the callable is invoked.
    """
    def build_probe():
        fresh_probe = get_probe(self.probe_args)
        return fresh_probe

    return build_probe
def _run_nn_probe(
    self,
    model_name,
    data_name,
    train_set,
    valid_set,
    test_set,
    tokenizer,
    emb_dict=None,
    ppi=False,
    source_model_name=None,
    sweep_mode: bool = False,
):
    """Train one probe through ``self.trainer_probe`` and optionally log metrics.

    ``source_model_name`` falls back to ``model_name``. Metrics are logged for
    the 'valid' and 'test' splits unless ``sweep_mode`` is set.

    Returns:
        ``(probe, valid_metrics, test_metrics)``.
    """
    source_model_name = model_name if source_model_name is None else source_model_name

    # Initial probe (single run, or the template for multi-run).
    initial_probe = get_probe(self.probe_args)
    summary(initial_probe)

    # trainer_probe handles multi-run internally if num_runs > 1
    training_kwargs = dict(
        model=initial_probe,
        tokenizer=tokenizer,
        model_name=model_name,
        data_name=data_name,
        train_dataset=train_set,
        valid_dataset=valid_set,
        test_dataset=test_set,
        emb_dict=emb_dict,
        ppi=ppi,
        log_id=self.random_id,
        source_model_name=source_model_name,
    )
    probe, valid_metrics, test_metrics, _, _ = self.trainer_probe(**training_kwargs)

    if sweep_mode:
        return probe, valid_metrics, test_metrics

    for split_name, split_metrics in (('valid', valid_metrics), ('test', test_metrics)):
        self.log_metrics(data_name, model_name, split_metrics, split_name=split_name)
    return probe, valid_metrics, test_metrics
def _train_nn_probe_fold(self, model_name, dms_id, subtrain_seqs, subtrain_labels,
                         valid_seqs, valid_labels, test_seqs, test_labels,
                         emb_dict, fold_info):
    """Trains a neural network probe on a ProteinGym DMS assay CV fold.

    Args:
        model_name: name used for the tokenizer, the embedding filename and logging.
        dms_id: assay identifier; combined with ``fold_info`` to form the data name.
        subtrain_seqs, subtrain_labels: training split.
        valid_seqs, valid_labels: validation split; if either is None no
            validation set is passed to the trainer.
        test_seqs, test_labels: test split.
        emb_dict: embeddings used in non-SQL mode when the saved ``.pth``
            file does not exist on disk.
        fold_info: fold label appended to the data name and the log id.

    Returns:
        ``(rho, mse)`` read from the test metrics; either is None when neither
        the plain nor the ``test_``-prefixed key is present.
    """
    train_set = {'seqs': subtrain_seqs, 'labels': subtrain_labels}
    valid_set = None if (valid_seqs is None or valid_labels is None) else {'seqs': valid_seqs, 'labels': valid_labels}
    test_set = {'seqs': test_seqs, 'labels': test_labels}

    # Get tokenizer and determine input dimensions
    tokenizer = get_tokenizer(model_name)
    pooling_types = self.embedding_args.pooling_types
    hidden_state_index = self.embedding_args.hidden_state_index
    if self._sql:
        filename = get_embedding_filename(model_name, self._full, pooling_types, 'db', hidden_state_index)
        save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
        input_dim = self.get_embedding_dim_sql(save_path, subtrain_seqs[0], tokenizer)
        emb_for_training = None
    else:
        filename = get_embedding_filename(model_name, self._full, pooling_types, 'pth', hidden_state_index)
        save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
        # Prefer embeddings saved on disk; otherwise use the ones handed in.
        emb_for_training = torch_load(save_path) if os.path.exists(save_path) else emb_dict
        input_dim = self.get_embedding_dim_pth(emb_for_training, subtrain_seqs[0], tokenizer)

    # Configure probe for regression
    self.probe_args.input_size = input_dim
    self.probe_args.task_type = 'regression'
    self.probe_args.num_labels = 1
    self.trainer_args.task_type = 'regression'

    probe = get_probe(self.probe_args)
    # Other call sites in this class unpack five values from trainer_probe,
    # while this one used to unpack exactly three. The starred target accepts
    # either arity, so only the first three positions are relied upon.
    _, _, test_metrics, *_ = self.trainer_probe(
        model=probe,
        tokenizer=tokenizer,
        model_name=model_name,
        data_name=f"{dms_id}_{fold_info}",
        train_dataset=train_set,
        valid_dataset=valid_set,
        test_dataset=test_set,
        emb_dict=emb_for_training,
        ppi=False,
        log_id=f"{self.random_id}_{fold_info}",
        source_model_name=model_name,
    )

    # Handle both plain and test-prefixed metric keys returned by HF Trainer
    rho = test_metrics.get('spearman_rho', test_metrics.get('test_spearman_rho', None))
    mse = test_metrics.get('mse', test_metrics.get('test_mse', None))
    return rho, mse
def _run_full_finetuning(
    self,
    model_name,
    data_name,
    train_set,
    valid_set,
    test_set,
    ppi=False,
    source_model_name=None,
    sweep_mode: bool = False,
    model_path: str = None,
):
    """Fine-tune a base model through ``self.trainer_base_model``.

    The model is wrapped with LoRA when ``self.probe_args.lora`` is set. A
    model factory is passed to the trainer only when ``num_runs`` exceeds 1.
    Metrics are logged for 'valid' and 'test' unless ``sweep_mode`` is set.

    Returns:
        ``(model, valid_metrics, test_metrics)``.
    """
    source_model_name = model_name if source_model_name is None else source_model_name
    tokenwise = self.probe_args.tokenwise
    num_labels = self.probe_args.num_labels

    model_factory = None
    if getattr(self.trainer_args, 'num_runs', 1) > 1:
        model_factory = self._create_model_factory(
            model_name, tokenwise, num_labels, hybrid=False, model_path=model_path
        )

    base_model, tokenizer = get_base_model_for_training(
        model_name,
        tokenwise=tokenwise,
        num_labels=num_labels,
        hybrid=False,
        dtype=self.model_args.model_dtype,
        model_path=model_path,
    )
    if self.probe_args.lora:
        lora_cfg = self.probe_args
        base_model = wrap_lora(base_model, lora_cfg.lora_r, lora_cfg.lora_alpha, lora_cfg.lora_dropout)
    summary(base_model)

    training_kwargs = dict(
        model=base_model,
        tokenizer=tokenizer,
        model_name=model_name,
        data_name=data_name,
        train_dataset=train_set,
        valid_dataset=valid_set,
        test_dataset=test_set,
        ppi=ppi,
        log_id=self.random_id,
        source_model_name=source_model_name,
        model_factory=model_factory,
    )
    model, valid_metrics, test_metrics, _, _ = self.trainer_base_model(**training_kwargs)

    if sweep_mode:
        return model, valid_metrics, test_metrics

    for split_name, split_metrics in (('valid', valid_metrics), ('test', test_metrics)):
        self.log_metrics(data_name, model_name, split_metrics, split_name=split_name)
    return model, valid_metrics, test_metrics
def _run_hybrid_probe(
    self,
    model_name,
    data_name,
    train_set,
    valid_set,
    test_set,
    tokenizer,
    emb_dict=None,
    ppi=False,
    source_model_name=None,
    sweep_mode: bool = False,
    model_path: str = None,
):
    """Train a base model together with a probe via ``self.trainer_hybrid_model``.

    When ``model_name`` contains "random" (case-insensitive) the method falls
    back to training a plain probe with ``self.trainer_probe``. Otherwise the
    base model is built (and LoRA-wrapped when ``self.probe_args.lora`` is
    set); the ``tokenizer`` argument is replaced by the one returned alongside
    the base model. Model/probe factories are passed only when ``num_runs``
    exceeds 1. Metrics are logged for 'valid' and 'test' unless
    ``sweep_mode`` is set.

    Returns:
        ``(probe, valid_metrics, test_metrics)`` on the fallback path,
        ``(model, valid_metrics, test_metrics)`` otherwise.
    """
    if source_model_name is None:
        source_model_name = model_name

    # Random models don't have a trainable base model, so fall back to regular probe
    if "random" in model_name.lower():
        print_message(f"Model {model_name} does not support hybrid training. Training a linear probe instead.")
        probe = get_probe(self.probe_args)
        summary(probe)
        # _run_nn_probe unpacks five values from trainer_probe, while this
        # call used to unpack exactly three. The starred target accepts
        # either arity, so only the first three positions are relied upon.
        probe, valid_metrics, test_metrics, *_ = self.trainer_probe(
            model=probe,
            tokenizer=tokenizer,
            model_name=model_name,
            data_name=data_name,
            train_dataset=train_set,
            valid_dataset=valid_set,
            test_dataset=test_set,
            emb_dict=emb_dict,
            ppi=ppi,
            log_id=self.random_id,
            source_model_name=source_model_name,
        )
        if not sweep_mode:
            self.log_metrics(data_name, model_name, valid_metrics, split_name='valid')
            self.log_metrics(data_name, model_name, test_metrics, split_name='test')
        return probe, valid_metrics, test_metrics

    tokenwise = self.probe_args.tokenwise
    num_labels = self.probe_args.num_labels
    num_runs = getattr(self.trainer_args, 'num_runs', 1)
    # Factories are only needed when the trainer has to rebuild per run.
    model_factory = self._create_model_factory(model_name, tokenwise, num_labels, hybrid=True, model_path=model_path) if num_runs > 1 else None
    probe_factory = self._create_probe_factory() if num_runs > 1 else None

    model, tokenizer = get_base_model_for_training(
        model_name,
        tokenwise=tokenwise,
        num_labels=num_labels,
        hybrid=True,
        dtype=self.model_args.model_dtype,
        model_path=model_path,
    )
    if self.probe_args.lora:
        model = wrap_lora(model, self.probe_args.lora_r, self.probe_args.lora_alpha, self.probe_args.lora_dropout)
    probe = get_probe(self.probe_args)
    summary(model)
    summary(probe)

    model, valid_metrics, test_metrics, _, _ = self.trainer_hybrid_model(
        model=model,
        tokenizer=tokenizer,
        probe=probe,
        model_name=model_name,
        data_name=data_name,
        train_dataset=train_set,
        valid_dataset=valid_set,
        test_dataset=test_set,
        emb_dict=emb_dict,
        ppi=ppi,
        log_id=self.random_id,
        source_model_name=source_model_name,
        model_factory=model_factory,
        probe_factory=probe_factory,
    )
    if not sweep_mode:
        self.log_metrics(data_name, model_name, valid_metrics, split_name='valid')
        self.log_metrics(data_name, model_name, test_metrics, split_name='test')
    return model, valid_metrics, test_metrics
@log_method_calls
def run_full_finetuning(self):
    """Fine-tune every configured model on every loaded dataset.

    For each pair the dataset's label settings are copied onto
    ``probe_args`` / ``trainer_args`` before ``_run_full_finetuning`` runs,
    and the CUDA cache is emptied afterwards.
    """
    n_models = len(self.model_args.model_names)
    n_datasets = len(self.datasets)
    total_combinations = n_models * n_datasets
    self.logger.info(f"Processing {total_combinations} model/dataset combinations")

    for display_name, dispatch_type, model_path in self.model_args.model_entries():
        for data_name, dataset in self.datasets.items():
            self.logger.info(f"Processing dataset: {data_name}")
            train_set, valid_set, test_set, num_labels, label_type, ppi = dataset

            self.probe_args.num_labels = num_labels
            # The task type currently has to be set on both argument objects.
            self.probe_args.task_type = label_type
            self.trainer_args.task_type = label_type

            self.logger.info(f'Training probe for {data_name} with {display_name}')
            self._run_full_finetuning(
                dispatch_type,
                data_name,
                train_set,
                valid_set,
                test_set,
                ppi,
                model_path=model_path,
            )
            torch.cuda.empty_cache()
@log_method_calls
def run_hybrid_probes(self):
    """Run hybrid (base model + probe) training for every model/dataset pair.

    Assumes ``save_embeddings_to_disk`` has already been called: the
    embedding size is read from the saved ``.db`` (SQL mode) or ``.pth``
    file, using the first entry of ``self.all_seqs`` as the probe sequence.
    """
    shared_probe_args = self.probe_args
    test_seq = self.all_seqs[0]

    # Log the combinations we're going to process
    total_combinations = len(self.model_args.model_names) * len(self.datasets)
    self.logger.info(f"Processing {total_combinations} model/dataset combinations")

    for display_name, dispatch_type, model_path in self.model_args.model_entries():
        self.logger.info(f"Processing model: {display_name}")
        tokenizer = get_tokenizer(dispatch_type, model_path=model_path)

        # Locate the saved embeddings and read their width.
        pooling_types = self.embedding_args.pooling_types
        hidden_state_index = self.embedding_args.hidden_state_index
        extension = 'db' if self._sql else 'pth'
        filename = get_embedding_filename(display_name, self._full, pooling_types, extension, hidden_state_index)
        save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
        if extension == 'db':
            # for sql, the embeddings will be gathered in real time during training
            input_size = self.get_embedding_dim_sql(save_path, test_seq, tokenizer)
            emb_dict = None
        else:
            # for pth, the embeddings are loaded entirely into RAM and accessed during training
            emb_dict = torch_load(save_path)
            input_size = self.get_embedding_dim_pth(emb_dict, test_seq, tokenizer)

        # Adjust input dim for multi-column vector embeddings
        if not self._full:
            multi_column = getattr(self.full_args, 'multi_column', None)
            if multi_column:
                input_size = input_size * len(self.full_args.multi_column)

        for data_name, dataset in self.datasets.items():
            self.logger.info(f"Processing dataset: {data_name}")
            train_set, valid_set, test_set, num_labels, label_type, ppi = dataset

            # Vector-mode PPI doubles the feature width; matrix-mode PPI
            # concatenates two proteins along the sequence dim, so sequences
            # can reach up to 2 * max_length.
            paired_vectors = ppi and not self._full
            paired_matrices = ppi and self._full
            shared_probe_args.input_size = input_size * 2 if paired_vectors else input_size
            shared_probe_args.max_seq_len = self._max_length * 2 if paired_matrices else self._max_length

            self.probe_args.num_labels = num_labels
            ### TODO we currently need both, settings should probably be consolidated
            self.probe_args.task_type = label_type
            self.trainer_args.task_type = label_type

            self.logger.info(f'Training probe for {data_name} with {display_name}')
            ### TODO eventually add options for optimizers and schedulers
            ### TODO here is probably where we can differentiate between the different training schemes
            run_kwargs = dict(
                model_name=dispatch_type,
                data_name=data_name,
                train_set=train_set,
                valid_set=valid_set,
                test_set=test_set,
                tokenizer=tokenizer,
                emb_dict=emb_dict,
                ppi=ppi,
                source_model_name=display_name,
                model_path=model_path,
            )
            self._run_hybrid_probe(**run_kwargs)
            torch.cuda.empty_cache()
            ### TODO may link from probe here to running inference on input csv or HF datasets
@log_method_calls
def run_nn_probes(self):
    """Train a neural-network probe for every model/dataset pair.

    Assumes ``save_embeddings_to_disk`` has already been called: the
    embedding size is read from the saved ``.db`` (SQL mode) or ``.pth``
    file, using the first entry of ``self.all_seqs`` as the probe sequence.
    """
    shared_probe_args = self.probe_args
    test_seq = self.all_seqs[0]

    # Log the combinations we're going to process
    total_combinations = len(self.model_args.model_names) * len(self.datasets)
    self.logger.info(f"Processing {total_combinations} model/dataset combinations")

    for display_name, dispatch_type, model_path in self.model_args.model_entries():
        self.logger.info(f"Processing model: {display_name}")
        tokenizer = get_tokenizer(dispatch_type, model_path=model_path)

        # Locate the saved embeddings and read their width.
        pooling_types = self.embedding_args.pooling_types
        hidden_state_index = self.embedding_args.hidden_state_index
        extension = 'db' if self._sql else 'pth'
        filename = get_embedding_filename(display_name, self._full, pooling_types, extension, hidden_state_index)
        save_path = os.path.join(self.embedding_args.embedding_save_dir, filename)
        if extension == 'db':
            # for sql, the embeddings will be gathered in real time during training
            input_size = self.get_embedding_dim_sql(save_path, test_seq, tokenizer)
            emb_dict = None
        else:
            # for pth, the embeddings are loaded entirely into RAM and accessed during training
            emb_dict = torch_load(save_path)
            input_size = self.get_embedding_dim_pth(emb_dict, test_seq, tokenizer)

        # Adjust input dim for multi-column vector embeddings
        if not self._full:
            multi_column = getattr(self.full_args, 'multi_column', None)
            if multi_column:
                input_size = input_size * len(self.full_args.multi_column)
        print(f'Input dim: {input_size}')

        for data_name, dataset in self.datasets.items():
            self.logger.info(f"Processing dataset: {data_name}")
            train_set, valid_set, test_set, num_labels, label_type, ppi = dataset

            # Vector-mode PPI doubles the feature width; matrix-mode PPI
            # concatenates two proteins along the sequence dim, so sequences
            # can reach up to 2 * max_length.
            paired_vectors = ppi and not self._full
            paired_matrices = ppi and self._full
            shared_probe_args.input_size = input_size * 2 if paired_vectors else input_size
            shared_probe_args.max_seq_len = self._max_length * 2 if paired_matrices else self._max_length

            self.probe_args.num_labels = num_labels
            ### TODO we currently need both, settings should probably be consolidated
            self.probe_args.task_type = label_type
            self.trainer_args.task_type = label_type

            self.logger.info(f'Training probe for {data_name} with {display_name}')
            ### TODO eventually add options for optimizers and schedulers
            ### TODO here is probably where we can differentiate between the different training schemes
            run_kwargs = dict(
                model_name=display_name,
                data_name=data_name,
                train_set=train_set,
                valid_set=valid_set,
                test_set=test_set,
                tokenizer=tokenizer,
                emb_dict=emb_dict,
                ppi=ppi,
                source_model_name=display_name,
            )
            self._run_nn_probe(**run_kwargs)
            torch.cuda.empty_cache()
            ### TODO may link from probe here to running inference on input csv or HF datasets
@log_method_calls
def run_scikit_scheme(self):
    """Fit and evaluate a scikit probe for every model/dataset pair.

    With ``scikit_args.model_name`` set, that model is run directly.
    Otherwise the best classifier ('singlelabel') or regressor ('regression')
    is searched for first and its results are handed to
    ``run_specific_model``. The final scores are logged under the 'test' split.

    Raises:
        ValueError: the dataset's label type is neither 'singlelabel' nor
            'regression' and no specific model was requested.
    """
    self.scikit_args = self._build_scikit_args()
    scikit_probe = ScikitProbe(self.scikit_args)
    scikit_probe.n_jobs = self.full_args.__dict__.get("n_jobs", 1)

    for display_name, dispatch_type, model_path in self.model_args.model_entries():
        for data_name, dataset in self.datasets.items():
            ### find best scikit model and parameters via cross validation and lazy predict
            prepared = self.prepare_scikit_dataset(display_name, dataset)
            X_train, y_train, X_valid, y_valid, X_test, y_test, label_type = prepared

            if self.scikit_args.model_name is not None:
                # A specific model was requested: skip LazyPredict entirely.
                print_message(f"Skipping LazyPredict, using specified model: {self.scikit_args.model_name}")
                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, model_results=None)
            else:
                # Find best model via LazyPredict
                if label_type not in ('singlelabel', 'regression'):
                    raise ValueError(f'Label type {label_type} not supported')
                if label_type == 'singlelabel':
                    search_results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
                else:
                    search_results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
                # Train and evaluate best model with optimal hyperparameters
                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, search_results)

            # Log the results for plotting; a bare number is wrapped under 'test_mcc'.
            if isinstance(results.final_scores, (int, float)):
                metrics_dict = {'test_mcc': results.final_scores}
            else:
                metrics_dict = results.final_scores
            self.log_metrics(data_name, display_name, metrics_dict, split_name='test')
@log_method_calls
def generate_plots(self):
print_message("Generating visualization plots...")
# Determine which results file to use