nomelt/dvc.yaml at main · BeckResearchLab/nomelt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
stages:
  # get data ready for training, e.g. filtering out all but best pairs, balancing, etc.
  prepare_data:
    wdir: ./
    cmd: python scripts/prepare_data.py
    deps:
      - ./scripts/prepare_data.py
      - ./data/database.ddb
    params:
      - data.min_temp_diff # difference in temperature between meso and thermo homologs
      - data.min_thermo_temp # minimum temperature of thermo homologs
      - data.max_meso_temp # maximum temperature of meso homologs
      - data.min_align_cov # minimum alignment coverage of homologs to be considered a protein pair
      - data.mmseq_params # parameters for mmseqs2 clustering
      - data.test_size # size of test set, as a fraction
      - data.dev_sample_data # sample data to speed up development, ignore for production
      - data.additional_filters # string of sql expression used to filter protein pair data from database
    outs:
      - ./data/dataset/ # a huggingface dataset that contains protein pairs
    metrics:
      - ./data/data_metrics.yaml: # contains number of clusters in data splits and number of data
          cache: false # small metrics file, kept out of the DVC cache
  # blast the test split against the train split and report sequence identity and E value
  blast_test_train:
    wdir: ./
    cmd: python scripts/blast_test_train.py
    deps:
      - ./scripts/blast_test_train.py
      - ./data/dataset/ # output of the prepare_data stage
    metrics:
      - ./data/test_train_blast_metrics.json: # scores for seq id and E value of test blasted against train
          cache: false
    plots:
      - ./data/plots/test_train_blast_hist.png: # distributions of seq id and E value of test blasted against train
          cache: false
# train model
  # trains the model on the prepared dataset; launched through accelerate with the deepspeed config below
  train:
    wdir: ./
    cmd: accelerate launch --config_file ./.config/accelerate/default_config.yaml scripts/train.py # runs deepspeed
    deps:
      - ./scripts/train.py
      - ./data/dataset/ # output of the prepare_data stage
      - ./.config/accelerate/default_config.yaml # this is a deepspeed config file
    params:
      - model.pretrained_model # pretrained model string from huggingface or disk
      - model.task # either translation or reconstruction
      - model.generation_max_length # maximum length of generated sequence
      - model.model_hyperparams # hyperparameters for model, passed to HF params class
      - training.reweight # whether to weight the loss function by the number of sequences in each cluster
      - training.freeze_early_layers # fraction of layers to freeze
      - training.dev_sample_data # sample data to speed up development, ignore for production
      - training.per_device_batch_size # batch size per device
      - training.gradient_accumulation # number of batches to accumulate gradients over
      - training.auto_find_batch_size # whether to find the largest batch size that fits on the gpu
      - training.evals_per_save # number of evals between saves
      - training.evals_per_epoch # number of evals between epoch saves
      - training.epochs # max number of epochs to train for
      - training.gradient_checkpointing # whether to use gradient checkpointing
      - training.learning_rate # learning rate
      - training.lr_scheduler_type # type of learning rate scheduler
      - training.label_smoothing_factor # label smoothing factor
      - training.warmup_ratio # fraction of total steps to warmup learning rate
      - training.optim # optimizer
      - training.optim_args # optimizer arguments passed to HF optimizer
      - training.eval_single_example_per_cluster # whether to evaluate a single example per cluster in the eval set
      - training.fp16 # whether to use fp16
      - training.bf16 # whether to use bf16
      - training.early_stopping # whether to use early stopping
      - training.early_stopping_patience # number of evals without improvement before stopping
      - training.early_stopping_threshold # threshold for improvement to be considered significant
    outs:
      - ./data/nomelt-model/model/: # this is a HF model save directory
          persist: true # keep the existing directory when the stage is re-run
      - ./data/nomelt-model/live/report.md: # this is a markdown report of the training run
          cache: false
    metrics:
      - ./data/nomelt-model/live/metrics.json: # this is a json file of metrics from the training run as a function of steps
          cache: false
      - ./data/nomelt-model/live/plots/: # this is a directory of plots from the training run that show up in the report
          cache: false
          persist: true
    plots:
      - ./data/nomelt-model/live/static/: # NOTE(review): presumably static plot images from the same live run - confirm
          cache: false
  train_all: # this is the same as above except it trains on the whole dataset and does not evaluate on anything. Final model
    wdir: ./
    cmd: accelerate launch --config_file ./.config/accelerate/default_config.yaml scripts/train_all.py
    deps:
      - ./scripts/train_all.py
      - ./data/dataset/ # output of the prepare_data stage
      - ./.config/accelerate/default_config.yaml # same accelerate config file as the train stage
    # same parameter list as the train stage
    # NOTE(review): eval and early stopping params are tracked here although the stage is described as not evaluating - confirm they are used
    params:
      - model.pretrained_model # pretrained model string from huggingface or disk
      - model.task # either translation or reconstruction
      - model.generation_max_length # maximum length of generated sequence
      - model.model_hyperparams # hyperparameters for model, passed to HF params class
      - training.reweight # whether to weight the loss function by the number of sequences in each cluster
      - training.freeze_early_layers # fraction of layers to freeze
      - training.dev_sample_data # sample data to speed up development, ignore for production
      - training.per_device_batch_size # batch size per device
      - training.gradient_accumulation # number of batches to accumulate gradients over
      - training.auto_find_batch_size # whether to find the largest batch size that fits on the gpu
      - training.evals_per_save # number of evals between saves
      - training.evals_per_epoch # number of evals between epoch saves
      - training.epochs # max number of epochs to train for
      - training.gradient_checkpointing # whether to use gradient checkpointing
      - training.learning_rate # learning rate
      - training.lr_scheduler_type # type of learning rate scheduler
      - training.label_smoothing_factor # label smoothing factor
      - training.warmup_ratio # fraction of total steps to warmup learning rate
      - training.optim # optimizer
      - training.optim_args # optimizer arguments passed to HF optimizer
      - training.eval_single_example_per_cluster # whether to evaluate a single example per cluster in the eval set
      - training.fp16 # whether to use fp16
      - training.bf16 # whether to use bf16
      - training.early_stopping # whether to use early stopping
      - training.early_stopping_patience # number of evals without improvement before stopping
      - training.early_stopping_threshold # threshold for improvement to be considered significant
    # outputs mirror the train stage but live under nomelt-model-full/
    outs:
      - ./data/nomelt-model-full/model/: # model save directory for the full-data model
          persist: true # keep the existing directory when the stage is re-run
      - ./data/nomelt-model-full/live/report.md:
          cache: false
    metrics:
      - ./data/nomelt-model-full/live/metrics.json:
          cache: false
      - ./data/nomelt-model-full/live/plots/:
          cache: false
          persist: true
    plots:
      - ./data/nomelt-model-full/live/static/:
          cache: false
# run the model on test set and save generated sequences
  make_predictions:
    wdir: ./
    # launched through accelerate with the data parallel config
    cmd: accelerate launch --config_file ./.config/accelerate/data_parallel_config.yaml scripts/make_predictions.py
    deps:
      - ./scripts/make_predictions.py
      - ./data/nomelt-model/model/ # model written by the train stage
      - ./data/dataset/ # output of the prepare_data stage
      # NOTE(review): the accelerate config passed to cmd is not tracked as a dep here, unlike in the train stage - confirm this is intentional
    params:
      - model.generation_max_length # maximum length of generated sequence
      - model.generation_num_beams # number of beams to use in beam search
    outs:
      - ./data/nomelt-model/predictions.tsv # a tsv file of predictions includes meso, thermo and predicted sequences
# compute metrics on the test set from the predictions
  score_predictions:
    wdir: ./
    cmd: python scripts/score_predictions.py
    deps:
      - ./scripts/score_predictions.py
      - ./data/nomelt-model/predictions.tsv # output of the make_predictions stage
    outs:
      - ./data/nomelt-model/test_scores.json: # a json file of metrics computed on the test set, these are only HF metrics like google bleu
          cache: false # small json file, kept out of the DVC cache
# call the model to generate thermostable variant of enh1
  translate_enh1:
    wdir: ./
    cmd: python scripts/translate_enh1.py
    deps:
      - ./scripts/translate_enh1.py
      - ./data/nomelt-model-full/model/ # model written by the train_all stage
    params:
      - model.generation_max_length # maximum length of generated sequence
      - model.generation_num_beams # number of beams to use in beam search
    outs:
      - ./data/enh/translate_enh1.json: # a json file of the generated sequence for enh1
          cache: false
# call thermostability estimator on the generated sequence
  estimate_trans_energy_enh1:
    wdir: ./
    cmd: python scripts/estimate_trans_energy_enh1.py
    deps:
      # script path written with the same ./ prefix as every other stage
      - ./scripts/estimate_trans_energy_enh1.py
      - ./data/enh/translate_enh1.json # output of the translate_enh1 stage
      - ./.config/af_singularity_config.yaml # this is a config file for the af2dg estimator
    params:
      - optimize.estimator # name of estimator for estimating thermal stability, class in NOMELT
      - optimize.estimator_args # arguments for estimator
    outs:
      - ./data/enh/translated_energy_enh1.json: # a json file of the estimated thermal stability of the generated sequence
          cache: false
      - ./data/enh/initial_estimate/ # contains data dump from running estimator
  # use thermo estimator as oracle and optuna to find a subset of mutations that are likely to increase thermostability
  optimize_enh1:
    wdir: ./
    cmd: python scripts/optimize_enh1.py
    deps:
      - ./scripts/optimize_enh1.py
      - ./data/enh/translate_enh1.json # output of the translate_enh1 stage
      # NOTE(review): estimate_trans_energy_enh1 also tracks ./.config/af_singularity_config.yaml for the estimator; it is not tracked here - confirm
    params:
      - optimize.estimator # name of estimator for estimating thermal stability, class in NOMELT
      - optimize.estimator_args # arguments for estimator
      - optimize.n_trials # number of trials to run
      - optimize.direction # minimize or maximize
      - optimize.sampler # optuna sampler
      - optimize.cut_tails # number of gap spaces to keep on ends of the alignment
      - optimize.gap_compressed_mutations # whether to consider a string of gaps a single mutation
      - optimize.matrix # substitution matrix
      - optimize.match_score # match score
      - optimize.mismatch_score # mismatch score
      - optimize.gapopen # gap open penalty
      - optimize.gapextend # gap extend penalty
      - optimize.penalize_end_gaps # whether to penalize end gaps
      - optimize.sampler_args # arguments for sampler
      - optimize.optuna_overwrite # whether to overwrite optuna study
    outs:
      - ./data/enh/optimize_enh1/: # contains data dump from running optimizer
          persist: true # keep the existing directory when the stage is re-run
      - ./data/enh/optimize_enh1_trials.csv: # a csv file of the trials from the optimizer
          cache: false
      - ./data/enh/optimize_enh1_results.json: # a json file of the results, best seq, structure file, and estimated thermal stability
          cache: false
  # computes and saves residue wise embeddings for final layers of model on the test set. to help evaluate residue wise scores
  compute_test_embeddings:
    wdir: ./
    cmd: python scripts/compute_test_embeddings.py
    deps:
      - ./scripts/compute_test_embeddings.py
      - ./data/dataset/ # output of the prepare_data stage
      - ./data/nomelt-model/model/ # model written by the train stage
    # NOTE(review): the only declared output is the test loss; no embeddings path is tracked by this stage - confirm where they are saved
    metrics:
      - ./data/nomelt-model/test_loss.json: # a json file of the loss on the test set
          cache: false
  # compare generated to thermophilic structures with blast alignments, also compare to meso sequence
  compare_sequence_alignment:
    wdir: ./
    cmd: python scripts/compare_sequence_alignment.py
    deps:
      - ./scripts/compare_sequence_alignment.py
      - ./data/nomelt-model/predictions.tsv # output of the make_predictions stage
    outs:
      - ./data/nomelt-model/test_predictions_aligned_results.json: # a json of smith waterman alignments in a triangle for each meso, thermo, generated
          cache: false
  # compare generated to thermophilic structures with structure alignments
  compare_structure:
    wdir: ./
    cmd: python scripts/compare_structure.py
    deps:
      - ./scripts/compare_structure.py
      - ./data/nomelt-model/predictions.tsv # output of the make_predictions stage
    outs:
      - ./data/nomelt-model/structure_metrics.json: # fatcat and dssp comparison
          cache: false
# call the af2dg thermostability estimator on a sample of data and save the scores. runs meso, thermo, and generated
  data_estimator_distribution:
    wdir: ./
    cmd: python scripts/data_estimator_distribution.py
    deps:
      - ./data/nomelt-model/predictions.tsv # output of the make_predictions stage
      - ./scripts/data_estimator_distribution.py
    outs:
      - ./data/thermo_gen_estimated.json: # json of meso, thermo, generated thermal stability scores
          cache: false
  # looks at model outputs and compares to experimental data for melting temperature or another thermal stability target
  # for two proteins with multiple mutations accumulated
  zero_shot_estimation:
    wdir: ./
    # note: the script is named zero_shot_experiment.py while the stage is zero_shot_estimation
    cmd: python ./scripts/zero_shot_experiment.py
    deps:
      - ./scripts/zero_shot_experiment.py
      - ./data/nomelt-model-full/model/ # model written by the train_all stage
    outs:
      - ./data/nomelt-model-full/zero_shot_estimated.json: # exp vs model scores
          cache: false
    plots:
      - ./data/plots/exp_tm_scores.png: # regression plot of exp vs model scores
          cache: false
  # run zero shot like above on protein gym T50 benchmark set
  protein_gym_benchmark:
    wdir: ./
    cmd: python ./scripts/protein_gym_benchmark.py
    deps:
      - ./scripts/protein_gym_benchmark.py
      - ./data/nomelt-model-full/model/ # model written by the train_all stage
    outs:
      - ./data/nomelt-model-full/lipa_gym_zero_shot.json: # correlation coeffs and fraction of stat sig pairs in DMS library that were qualitatively predicted
          cache: false
    plots:
      - ./data/plots/lipa_gym.png: # density plot of exp vs model scores
          cache: false

  # runs scripts/meltome_benchmark.py against the full-data model
  # NOTE(review): presumably a benchmark on meltome melting temperature data - confirm against the script
  meltome_benchmark:
    wdir: ./
    cmd: python scripts/meltome_benchmark.py
    deps:
      - ./scripts/meltome_benchmark.py
      - ./data/nomelt-model-full/model/ # model written by the train_all stage
    params:
      - model.generation_max_length # maximum length of generated sequence
      - model.model_hyperparams # hyperparameters for model, passed to HF params class
    outs:
      - ./data/nomelt-model-full/meltome_benchmark_results.json: # benchmark results written by the script
          cache: false
    plots:
      - ./data/plots/original_meltome_scores.png:
          cache: false
      - ./data/plots/new_meltome_scores.png:
          cache: false
# proof of principle scripts
#################################

# natural log loss
  natural_diversity_entropy:
    wdir: ./
    cmd: python scripts/proof_of_principle/natural_diversity_entropy.py
    deps:
      - ./data/dataset # same directory other stages reference as ./data/dataset/
      - ./scripts/proof_of_principle/natural_diversity_entropy.py
    outs:
      - ./data/proof_of_principle/natural_diversity_entropy.json: # contains cross entropy from searching natural thermophilic diversity
          cache: false
  # call mAFdg on homologs with experimental data and see if length difference normalization works
  mAF_length_diff_test:
    wdir: ./
    cmd: python scripts/proof_of_principle/mAF_length_diff_test.py
    # track the script itself, like every other stage, so edits to it invalidate the stage
    deps:
      - ./scripts/proof_of_principle/mAF_length_diff_test.py
    metrics:
      - ./data/proof_of_principle/mAF_length_diff_test.json:
          cache: false
    plots:
      - ./data/plots/mAF_length_diff_test.png: # mAF scores normalized vs experimental scores for a number of indel variants
          cache: false

# call the af2dg thermostability estimator on the wild type enh and the consensus sequence from literature
  consensus_estimated:
    wdir: ./
    cmd: python scripts/proof_of_principle/consensus_estimated.py
    deps:
      - ./scripts/proof_of_principle/consensus_estimated.py # only the script is tracked; no data deps are declared
    outs:
      - ./data/proof_of_principle/consensus_estimated.json: # mAF score for consensus
          cache: false
# manually add a few mutations to the wild type enh and call the af2dg thermostability estimator to see how it affects the score
  enh1_vary_mutations:
    wdir: ./
    cmd: python scripts/proof_of_principle/enh1_vary_mutations.py
    deps:
      - ./scripts/proof_of_principle/enh1_vary_mutations.py # only the script is tracked; no data deps are declared
    outs:
      - ./data/proof_of_principle/vary_mutations.json: # NOTE(review): presumably the estimator scores per mutation set - confirm
          cache: false
# run the optimization process on the wild type enh vs literature consensus sequence
  enh1_consensus_optimize:
    wdir: ./
    cmd: python scripts/proof_of_principle/enh1_consensus_optimize.py
    deps:
      - ./scripts/proof_of_principle/enh1_consensus_optimize.py # only the script is tracked; no data deps are declared
    # same results json + trials csv pair as the optimize_enh1 stage
    outs:
      - ./data/proof_of_principle/optimize_enh1_cons_results.json:
          cache: false
      - ./data/proof_of_principle/optimize_enh1_cons_trials.csv:
          cache: false
# run the optimization process on the wild type enh vs random sequence
  enh1_random_opt:
    wdir: ./
    cmd: python scripts/proof_of_principle/enh1_random_opt.py
    deps:
      - ./scripts/proof_of_principle/enh1_random_opt.py # only the script is tracked; no data deps are declared
    # note: the two outputs use different infixes ("random" vs "rand"); the paths must match what the script writes
    outs:
      - ./data/proof_of_principle/optimize_enh1_random_results.json:
          cache: false
      - ./data/proof_of_principle/optimize_enh1_rand_trials.csv:
          cache: false
  # searches for test examples, e.g. ENH, LovD, and Lipase A, in the training set of the model
  tests_in_training_set:
    wdir: ./
    cmd: python scripts/proof_of_principle/check_training_set_for_case_studies.py
    deps:
      - ./scripts/proof_of_principle/check_training_set_for_case_studies.py # only the script is tracked; no data deps are declared
    outs:
      - ./data/enh/training_data_homologs.json: # e value of any hits found in training set
          cache: false