-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy path generate_and_tokenize_data.sh
More file actions
36 lines (27 loc) · 2.16 KB
/
generate_and_tokenize_data.sh
File metadata and controls
36 lines (27 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
## Training Data -- these commands approximately correspond to the zipped data we provide
# NAME is the output directory for the dataset being generated. Set it before
# running each command pair, e.g.:  NAME=add_train
# (The original recipe used a literal <NAME> placeholder, which bash parses as
# an input redirection from a file called "NAME" and so cannot be run as-is.)
: "${NAME:?set NAME to the output directory for this dataset}"

# bitwise or
python create_pos_or_variants.py --n 20 --m 20 --dir_name "$NAME" --max 100
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type pad --test_split_ratio 0.01
# addition
python create_data_split.py --bucket --op + --n 20 --m 20 --limit 20000000 --p 0.0 --dir_name "$NAME" --reverse_all
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type pad --test_split_ratio 0.01
# addition with index hints
python create_data_split.py --bucket --op + --n 20 --m 20 --limit 20000000 --p 0.0 --dir_name "$NAME" --reverse_all --index_hints
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type index
# multiplication
python create_data_split.py --bucket --op x --n 15 --m 15 --limit 20000000 --dir_name "$NAME" --reverse_all --p 0.0
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type pad --test_split_ratio 0.01
# sorting -- generation and tokenization happen in a single invocation here
# NOTE(review): this subcommand takes --dir rather than --dir_name; presumably
# intentional in create_data_split.py -- confirm against that script.
python create_data_split.py --uniform_distribution_sort_data --continue_to_tokenize --tokenize --tokenizer_type sort --test_split_ratio 0.01 --n 10 --m 10 --limit 20000000 --dir "$NAME" --sort_generation_method bucket_uniform_distribution --reverse_all
## Evaluation Data -- run the generation line and the tokenize line once for
## each operand-length pair (i, j). Set NAME, i, and j before each run, e.g.:
##   NAME=add_eval_5_7 i=5 j=7
# (The original recipe used literal <NAME>/<i>/<j> placeholders, which bash
# parses as redirections and so cannot be run as-is; they are now guarded,
# quoted shell variables.)
: "${NAME:?set NAME to the output directory for this dataset}"
: "${i:?set i to the first operand length}"
: "${j:?set j to the second operand length}"

# bitwise or
python create_pos_or_variants.py --n "$i" --m "$j" --dir_name "$NAME" --exact --eval --max 100
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type pad --test_split_ratio 0.0
# addition
python create_data_split.py --op + --n "$i" --m "$j" --num_samples 100 --dir_name "$NAME" --exact
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type pad --test_split_ratio 0.0
# multiplication
python create_data_split.py --op x --n "$i" --m "$j" --num_samples 100 --dir_name "$NAME" --exact
python create_data_split.py --tokenize --dir_name "$NAME" --tokenizer_type pad --test_split_ratio 0.0
# sorting -- generation and tokenization in one invocation
# NOTE(review): uses --test_split_ratio 0.01 while the other eval commands use
# 0.0; kept as in the original recipe -- confirm this asymmetry is intended.
python create_data_split.py --uniform_distribution_sort_data --continue_to_tokenize --tokenize --tokenizer_type sort --test_split_ratio 0.01 --n "$i" --m "$j" --limit 100 --dir "$NAME" --sort_generation_method bucket_uniform_distribution --reverse_all --exact