forked from texttron/tevatron
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_RetroSyn.sh
More file actions
65 lines (57 loc) · 1.98 KB
/
train_RetroSyn.sh
File metadata and controls
65 lines (57 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env bash
# train_RetroSyn.sh — train a dense retriever for retrosynthesis (RetroSyn)
# with tevatron, then (below) encode the corpus/queries and run retrieval.
# Query encoder: seyonec/ChemBERTa-zinc-base-v1 (SMILES strings);
# passage encoder: allenai/scibert_scivocab_uncased (text).
#
# Fail fast: without this, a failed training run would not stop the
# encode/retrieve steps below from running against a missing or stale model.
set -euo pipefail

MODEL_DIR=output/RetroSyn
# Pick a random high port so concurrent launches on one host don't collide
# on torch.distributed's rendezvous port.
MASTER_PORT=$(shuf -n 1 -i 10000-65535)

# Multi-GPU (8-way) contrastive training with cross-device negatives.
python -m torch.distributed.launch --nproc_per_node=8 --master_port "${MASTER_PORT}" -m tevatron.driver.train \
  --output_dir "${MODEL_DIR}" \
  --train_dir preprocessed/RetroSyn/train_matched_rn.jsonl \
  --cache_dir cache/ \
  --data_cache_dir cache/data/ \
  --model_name_or_path seyonec/ChemBERTa-zinc-base-v1 \
  --p_model_name_or_path allenai/scibert_scivocab_uncased \
  --do_train \
  --dataloader_num_workers 4 \
  --save_steps 20000 \
  --fp16 \
  --per_device_train_batch_size 64 \
  --train_n_passages 2 \
  --learning_rate 1e-4 \
  --q_max_len 128 \
  --p_max_len 256 \
  --num_train_epochs 400 \
  --negatives_x_device \
  --overwrite_output_dir
# Encode the full passage corpus once with the trained passage encoder;
# embeddings are saved as a pickle for the FAISS retrieval step below.
python -m torch.distributed.launch --nproc_per_node=1 --master_port "${MASTER_PORT}" -m tevatron.driver.encode \
  --output_dir="${MODEL_DIR}/" \
  --cache_dir cache/ \
  --data_cache_dir cache/data/ \
  --model_name_or_path "${MODEL_DIR}/" \
  --tokenizer_name "${MODEL_DIR}/passage_model" \
  --fp16 \
  --per_device_eval_batch_size 1024 \
  --p_max_len 256 \
  --dataset_name json \
  --encode_in_path preprocessed/RetroSyn/corpus.jsonl \
  --encoded_save_path "${MODEL_DIR}/corpus.pkl"
# For each data split: encode the queries with the trained query encoder,
# then retrieve the top-20 passages per query from the corpus index.
for split in test valid train; do
  printf '%s\n' "$split"
  python -m torch.distributed.launch --nproc_per_node=1 --master_port "${MASTER_PORT}" -m tevatron.driver.encode \
    --output_dir="${MODEL_DIR}/" \
    --cache_dir cache/ \
    --data_cache_dir cache/data/ \
    --model_name_or_path "${MODEL_DIR}/" \
    --tokenizer_name "${MODEL_DIR}/query_model" \
    --fp16 \
    --per_device_eval_batch_size 1024 \
    --q_max_len 128 \
    --dataset_name json \
    --encode_in_path "preprocessed/RetroSyn/${split}.jsonl" \
    --encoded_save_path "${MODEL_DIR}/${split}.pkl" \
    --encode_is_qry
  # batch_size -1: score all queries against the corpus in a single batch.
  python -m tevatron.faiss_retriever \
    --query_reps "${MODEL_DIR}/${split}.pkl" \
    --passage_reps "${MODEL_DIR}/corpus.pkl" \
    --depth 20 \
    --batch_size -1 \
    --save_json \
    --save_ranking_to "${MODEL_DIR}/${split}_rank.json"
done