-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathdvc.yaml
More file actions
372 lines (368 loc) · 15.8 KB
/
dvc.yaml
File metadata and controls
372 lines (368 loc) · 15.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# DVC pipeline definition. Each entry under `stages` declares the command to
# run (`cmd`), the files (`deps`) and parameters (`params`) whose changes
# invalidate the stage, and the artifacts it produces (`outs`, `metrics`,
# `plots`). Paths are written relative to `wdir`.
stages:
  # get data ready for training eg. filtering out all but best pairs, balancing, etc.
  prepare_data:
    wdir: ./
    cmd: python scripts/prepare_data.py
    deps:
      - ./scripts/prepare_data.py
      - ./data/database.ddb
    params:
      - data.min_temp_diff  # difference in temperature between meso and thermo homologs
      - data.min_thermo_temp  # minimum temperature of thermo homologs
      - data.max_meso_temp  # maximum temperature of meso homologs
      - data.min_align_cov  # minimum alignment coverage of homologs to be considered a protein pair
      - data.mmseq_params  # parameters for mmseqs2 clustering
      - data.test_size  # size of test set in fraction
      - data.dev_sample_data  # sample data to speed up development, ignore for production
      - data.additional_filters  # string of sql expression used to filter protein pair data from database
    outs:
      - ./data/dataset/  # a huggingface dataset that contains protein pairs
    metrics:
      - ./data/data_metrics.yaml:  # contains number of clusters in data splits and number of data
          cache: false

  # blast the test split against the train split and record seq id / E value statistics
  blast_test_train:
    wdir: ./
    cmd: python scripts/blast_test_train.py
    deps:
      - ./scripts/blast_test_train.py
      - ./data/dataset/
    metrics:
      - ./data/test_train_blast_metrics.json:  # scores for seq id and E value of test blasted against train
          cache: false
    plots:
      - ./data/plots/test_train_blast_hist.png:  # distributions of seq id and E value of test blasted against train
          cache: false

  # train model
  train:
    wdir: ./
    cmd: accelerate launch --config_file ./.config/accelerate/default_config.yaml scripts/train.py  # runs deepspeed
    deps:
      - ./scripts/train.py
      - ./data/dataset/
      - ./.config/accelerate/default_config.yaml  # this is a deepspeed config file
    params:
      - model.pretrained_model  # pretrained model string from huggingface or disk
      - model.task  # either translation or reconstruction
      - model.generation_max_length  # maximum length of generated sequence
      - model.model_hyperparams  # hyperparameters for model, passed to HF params class
      - training.reweight  # whether to weight the loss function by the number of sequences in each cluster
      - training.freeze_early_layers  # fraction of layers to freeze
      - training.dev_sample_data  # sample data to speed up development, ignore for production
      - training.per_device_batch_size  # batch size per device
      - training.gradient_accumulation  # number of batches to accumulate gradients over
      - training.auto_find_batch_size  # whether to find the largest batch size that fits on the gpu
      - training.evals_per_save  # number of evals between saves
      - training.evals_per_epoch  # number of evals between epoch saves
      - training.epochs  # max number of epochs to train for
      - training.gradient_checkpointing  # whether to use gradient checkpointing
      - training.learning_rate  # learning rate
      - training.lr_scheduler_type  # type of learning rate scheduler
      - training.label_smoothing_factor  # label smoothing factor
      - training.warmup_ratio  # fraction of total steps to warmup learning rate
      - training.optim  # optimizer
      - training.optim_args  # optimizer arguments passed to HF optimizer
      - training.eval_single_example_per_cluster  # whether to evaluate a single example per cluster in the eval set
      - training.fp16  # whether to use fp16
      - training.bf16  # whether to use bf16
      - training.early_stopping  # whether to use early stopping
      - training.early_stopping_patience  # number of evals without improvement before stopping
      - training.early_stopping_threshold  # threshold for improvement to be considered significant
    outs:
      - ./data/nomelt-model/model/:  # this is a HF model save directory
          persist: true
      - ./data/nomelt-model/live/report.md:  # this is a markdown report of the training run
          cache: false
    metrics:
      - ./data/nomelt-model/live/metrics.json:  # this is a json file of metrics from the training run as a function of steps
          cache: false
      # NOTE(review): this is a directory of plots declared under `metrics` — confirm it should not live under `plots`
      - ./data/nomelt-model/live/plots/:  # this is a directory of plots from the training run that show up in the report
          cache: false
          persist: true
    plots:
      - ./data/nomelt-model/live/static/:
          cache: false

  # this is the same as above except it trains on the whole dataset and does not evaluate on anything. Final model
  train_all:
    wdir: ./
    cmd: accelerate launch --config_file ./.config/accelerate/default_config.yaml scripts/train_all.py
    deps:
      - ./scripts/train_all.py
      - ./data/dataset/
      - ./.config/accelerate/default_config.yaml
    params:
      - model.pretrained_model
      - model.task
      - model.generation_max_length
      - model.model_hyperparams
      - training.reweight
      - training.freeze_early_layers
      - training.dev_sample_data
      - training.per_device_batch_size
      - training.gradient_accumulation
      - training.auto_find_batch_size
      - training.evals_per_save
      - training.evals_per_epoch
      - training.epochs
      - training.gradient_checkpointing
      - training.learning_rate
      - training.lr_scheduler_type
      - training.label_smoothing_factor
      - training.warmup_ratio
      - training.optim
      - training.optim_args
      - training.eval_single_example_per_cluster
      - training.fp16
      - training.bf16
      - training.early_stopping
      - training.early_stopping_patience
      - training.early_stopping_threshold
    outs:
      - ./data/nomelt-model-full/model/:
          persist: true
      - ./data/nomelt-model-full/live/report.md:
          cache: false
    metrics:
      - ./data/nomelt-model-full/live/metrics.json:
          cache: false
      - ./data/nomelt-model-full/live/plots/:
          cache: false
          persist: true
    plots:
      - ./data/nomelt-model-full/live/static/:
          cache: false

  # run the model on test set and save generated sequences
  make_predictions:
    wdir: ./
    cmd: accelerate launch --config_file ./.config/accelerate/data_parallel_config.yaml scripts/make_predictions.py
    deps:
      - ./scripts/make_predictions.py
      - ./data/nomelt-model/model/
      - ./data/dataset/
      # the launch config named in `cmd`; tracked so edits to it invalidate the stage, as in the train stages
      - ./.config/accelerate/data_parallel_config.yaml
    params:
      - model.generation_max_length  # maximum length of generated sequence
      - model.generation_num_beams  # number of beams to use in beam search
    outs:
      - ./data/nomelt-model/predictions.tsv  # a tsv file of predictions includes meso, thermo and predicted sequences

  # compute metrics on the test set from the predictions
  score_predictions:
    wdir: ./
    cmd: python scripts/score_predictions.py
    deps:
      - ./scripts/score_predictions.py
      - ./data/nomelt-model/predictions.tsv
    outs:
      - ./data/nomelt-model/test_scores.json:  # a json file of metrics computed on the test set, these are only HF metrics like google bleu
          cache: false

  # call the model to generate thermostable variant of enh1
  translate_enh1:
    wdir: ./
    cmd: python scripts/translate_enh1.py
    deps:
      - ./scripts/translate_enh1.py
      - ./data/nomelt-model-full/model/
    params:
      - model.generation_max_length
      - model.generation_num_beams
    outs:
      - ./data/enh/translate_enh1.json:  # a json file of the generated sequence for enh1
          cache: false

  # call thermostability estimator on the generated sequence
  estimate_trans_energy_enh1:
    wdir: ./
    cmd: python scripts/estimate_trans_energy_enh1.py
    deps:
      - ./scripts/estimate_trans_energy_enh1.py
      - ./data/enh/translate_enh1.json
      - ./.config/af_singularity_config.yaml  # this is a config file for the af2dg estimator
    params:
      - optimize.estimator  # name of estimator for estimating thermal stability, class in NOMELT
      - optimize.estimator_args  # arguments for estimator
    outs:
      - ./data/enh/translated_energy_enh1.json:  # a json file of the estimated thermal stability of the generated sequence
          cache: false
      - ./data/enh/initial_estimate/  # contains data dump from running estimator

  # use thermo estimator as oracle and optuna to find a subset of mutations that are likely to increase thermostability
  optimize_enh1:
    wdir: ./
    cmd: python scripts/optimize_enh1.py
    deps:
      - ./scripts/optimize_enh1.py
      - ./data/enh/translate_enh1.json
    params:
      - optimize.estimator  # name of estimator for estimating thermal stability, class in NOMELT
      - optimize.estimator_args  # arguments for estimator
      - optimize.n_trials  # number of trials to run
      - optimize.direction  # minimize or maximize
      - optimize.sampler  # optuna sampler
      - optimize.cut_tails  # number of gap spaces to keep on ends of the alignment
      - optimize.gap_compressed_mutations  # whether to consider a string of gaps a single mutation
      - optimize.matrix  # substitution matrix
      - optimize.match_score  # match score
      - optimize.mismatch_score  # mismatch score
      - optimize.gapopen  # gap open penalty
      - optimize.gapextend  # gap extend penalty
      - optimize.penalize_end_gaps  # whether to penalize end gaps
      - optimize.sampler_args  # arguments for sampler
      - optimize.optuna_overwrite  # whether to overwrite optuna study
    outs:
      - ./data/enh/optimize_enh1/:  # contains data dump from running optimizer
          persist: true
      - ./data/enh/optimize_enh1_trials.csv:  # a csv file of the trials from the optimizer
          cache: false
      - ./data/enh/optimize_enh1_results.json:  # a json file of the results, best seq, structure file, and estimated thermal stability
          cache: false

  # computes and saves residue wise embeddings for final layers of model on the test set. to help evaluate residue wise scores
  # NOTE(review): only the test loss file is declared as an output here — confirm where the embeddings are written
  compute_test_embeddings:
    wdir: ./
    cmd: python scripts/compute_test_embeddings.py
    deps:
      - ./scripts/compute_test_embeddings.py
      - ./data/dataset/
      - ./data/nomelt-model/model/
    metrics:
      - ./data/nomelt-model/test_loss.json:  # a json file of the loss on the test set
          cache: false

  # compare generated to thermophilic structures with blast alignments, also compare to meso sequence
  compare_sequence_alignment:
    wdir: ./
    cmd: python scripts/compare_sequence_alignment.py
    deps:
      - ./scripts/compare_sequence_alignment.py
      - ./data/nomelt-model/predictions.tsv
    outs:
      - ./data/nomelt-model/test_predictions_aligned_results.json:  # a json of smith waterman alignments in a triangle for each meso, thermo, generated
          cache: false

  # compare generated to thermophilic structures with structure alignments
  compare_structure:
    wdir: ./
    cmd: python scripts/compare_structure.py
    deps:
      - ./scripts/compare_structure.py
      - ./data/nomelt-model/predictions.tsv
    outs:
      - ./data/nomelt-model/structure_metrics.json:  # fatcat and dssp comparison
          cache: false

  # call the af2dg thermostability estimator on a sample of data and save the scores. runs meso, thermo, and generated
  data_estimator_distribution:
    wdir: ./
    cmd: python scripts/data_estimator_distribution.py
    deps:
      - ./data/nomelt-model/predictions.tsv
      - ./scripts/data_estimator_distribution.py
    outs:
      - ./data/thermo_gen_estimated.json:  # json of meso, thermo, generated thermal stability scores
          cache: false

  # looks at model outputs and compares to experimental data for melting temperature or another thermal stability target
  # for two proteins with multiple mutations accumulated
  zero_shot_estimation:
    wdir: ./
    cmd: python ./scripts/zero_shot_experiment.py
    deps:
      - ./scripts/zero_shot_experiment.py
      - ./data/nomelt-model-full/model/
    outs:
      - ./data/nomelt-model-full/zero_shot_estimated.json:  # exp vs model scores
          cache: false
    plots:
      - ./data/plots/exp_tm_scores.png:  # regression plot of exp vs model scores
          cache: false

  # run zero shot like above on protein gym T50 benchmark set
  protein_gym_benchmark:
    wdir: ./
    cmd: python ./scripts/protein_gym_benchmark.py
    deps:
      - ./scripts/protein_gym_benchmark.py
      - ./data/nomelt-model-full/model/
    outs:
      - ./data/nomelt-model-full/lipa_gym_zero_shot.json:  # correlation coeffs and fraction of stat sig pairs in DMS library that were qualitatively predicted
          cache: false
    plots:
      - ./data/plots/lipa_gym.png:  # density plot of exp vs model scores
          cache: false

  # run the meltome benchmark script against the full model; writes a results json and two score plots
  meltome_benchmark:
    wdir: ./
    cmd: python scripts/meltome_benchmark.py
    deps:
      - ./scripts/meltome_benchmark.py
      - ./data/nomelt-model-full/model/
    params:
      - model.generation_max_length
      - model.model_hyperparams
    outs:
      - ./data/nomelt-model-full/meltome_benchmark_results.json:
          cache: false
    plots:
      - ./data/plots/original_meltome_scores.png:
          cache: false
      - ./data/plots/new_meltome_scores.png:
          cache: false

  # proof of principle scripts
  #################################
  # natural log loss
  natural_diversity_entropy:
    wdir: ./
    cmd: python scripts/proof_of_principle/natural_diversity_entropy.py
    deps:
      - ./data/dataset/
      - ./scripts/proof_of_principle/natural_diversity_entropy.py
    outs:
      - ./data/proof_of_principle/natural_diversity_entropy.json:  # contains cross entropy from searching natural thermophilic diversity
          cache: false

  # call mAFdg on homologs with experimental data and see if length difference normalization works
  mAF_length_diff_test:
    wdir: ./
    cmd: python scripts/proof_of_principle/mAF_length_diff_test.py
    deps:
      # the stage's own script, so that editing it invalidates the stage like every other stage in this file
      - ./scripts/proof_of_principle/mAF_length_diff_test.py
    metrics:
      - ./data/proof_of_principle/mAF_length_diff_test.json:
          cache: false
    plots:
      - ./data/plots/mAF_length_diff_test.png:  # mAF scores normalized vs experimental scores for a number of indel variants
          cache: false

  # call the af2dg thermostability estimator on the wild type enh and the consensus sequence from literature
  consensus_estimated:
    wdir: ./
    cmd: python scripts/proof_of_principle/consensus_estimated.py
    deps:
      - ./scripts/proof_of_principle/consensus_estimated.py
    outs:
      - ./data/proof_of_principle/consensus_estimated.json:  # mAF score for consensus
          cache: false

  # manually add a few mutations to the wild type enh and call the af2dg thermostability estimator to see how it affects the score
  enh1_vary_mutations:
    wdir: ./
    cmd: python scripts/proof_of_principle/enh1_vary_mutations.py
    deps:
      - ./scripts/proof_of_principle/enh1_vary_mutations.py
    outs:
      - ./data/proof_of_principle/vary_mutations.json:
          cache: false

  # run the optimization process on the wild type enh vs literature consensus sequence
  enh1_consensus_optimize:
    wdir: ./
    cmd: python scripts/proof_of_principle/enh1_consensus_optimize.py
    deps:
      - ./scripts/proof_of_principle/enh1_consensus_optimize.py
    outs:
      - ./data/proof_of_principle/optimize_enh1_cons_results.json:
          cache: false
      - ./data/proof_of_principle/optimize_enh1_cons_trials.csv:
          cache: false

  # run the optimization process on the wild type enh vs random sequence
  enh1_random_opt:
    wdir: ./
    cmd: python scripts/proof_of_principle/enh1_random_opt.py
    deps:
      - ./scripts/proof_of_principle/enh1_random_opt.py
    outs:
      - ./data/proof_of_principle/optimize_enh1_random_results.json:
          cache: false
      - ./data/proof_of_principle/optimize_enh1_rand_trials.csv:
          cache: false

  # searches for test examples eg. ENH, LovD, and Lipase A in the training set of the model
  # NOTE(review): no data dependency is declared although the training set is searched — confirm where the script reads it from
  tests_in_training_set:
    wdir: ./
    cmd: python scripts/proof_of_principle/check_training_set_for_case_studies.py
    deps:
      - ./scripts/proof_of_principle/check_training_set_for_case_studies.py
    outs:
      - ./data/enh/training_data_homologs.json:  # e value of any hits found in training set
          cache: false