-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmakeTrainingDS.py
More file actions
executable file
·76 lines (60 loc) · 3.44 KB
/
makeTrainingDS.py
File metadata and controls
executable file
·76 lines (60 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
import argparse
import os
import pickle as pkl
import gzip
from camaptools.Dataset import TrainingDataset
from camaptools.EnhancedFutures import EnhancedProcessPoolExecutor, EnhancedMPIPoolExecutor
def write_to_disk(dat, ds_name):
    """Serialize an encoded training dataset to gzipped pickle files.

    Writes four files under ``output/trainDS/``, all prefixed with
    *ds_name*: codon embeddings, shuffled-codon embeddings, amino-acid
    embeddings, and the metadata dictionary.

    Parameters
    ----------
    dat : object
        Must expose ``enc_dct``, ``shuff_enc_dct``, ``aa_enc_dct`` and
        ``meta_dct`` attributes (presumably populated by
        TrainingDataset.encode_peptides — confirm against caller).
    ds_name : str
        Basename prefix for the output files.
    """
    out_dir = 'output/trainDS'
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(out_dir, exist_ok=True)

    # (object to dump, output filename) pairs — one gzip+pickle dump each,
    # replacing four copy-pasted with-blocks.
    outputs = [
        (dat.enc_dct, '%s_encoding-CodonEmbeddings.pkl.gz' % ds_name),
        (dat.shuff_enc_dct, '%s_encoding-CodonShuffleEmbeddings.pkl.gz' % ds_name),
        (dat.aa_enc_dct, '%s_encoding-AminoAcidEmbeddings.pkl.gz' % ds_name),
        (dat.meta_dct, '%s-Metadata.pkl.gz' % ds_name),
    ]
    for obj, fname in outputs:
        with gzip.open(os.path.join(out_dir, fname), 'wb') as f:
            pkl.dump(obj, f)
def main():
    """Build and serialize CAMAP training datasets.

    Parses command-line options, loads peptide data for the requested
    genome/dataset pair, then for each random seed splits the dataset,
    encodes the peptides, and writes one training dataset to
    ``output/trainDS/`` via write_to_disk().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--genome", help="genome [GRCh37.75, GRCm38.78, etc.]", type=str, default="GRCh38.98")
    parser.add_argument("-d", "--dataset", help="dataset [BLCL, EL4, etc.]", type=str, default="BLCL")
    parser.add_argument("-p", "--peplen", help="peptide length", type=str, default="")
    parser.add_argument("-c", "--context", help="mRNA context length on each side", type=int, default=162)
    parser.add_argument("-r", "--bs_or_rank", help="max binding score or rank (if < 25)", type=int, default=1250)
    parser.add_argument("-m", "--ncontexts", help="max contexts permitted to keep peptide", type=int, default=None)
    parser.add_argument("-t", "--ratio", help="non-source to source ratio", type=int, default=None)
    parser.add_argument("-y", "--sametpm", help="sample non-source keeping TPM proportions", action='store_true')
    parser.add_argument("-z", "--samebs", help="sample non-source keeping nM proportions", action='store_true')
    parser.add_argument("-s", "--nsplits", help="number of random splittings", type=int, default=1)
    parser.add_argument("-w", "--workers", help="number of parallel workers in addition to main", type=int, default=0)
    parser.add_argument("--mpi", help="Parallelize using MPI", action='store_true')
    # vars() is the documented way to get the namespace as a dict
    # (equivalent to .__dict__ but not reaching into internals).
    args = vars(parser.parse_args())

    genome = args['genome']
    ds = args['dataset']
    peplen = args['peplen']
    context_len = args['context']
    max_bs_or_rank = args['bs_or_rank']
    max_contexts = args['ncontexts']
    ratio = args['ratio']
    same_tpm = args['sametpm']
    same_bs = args['samebs']
    # range() already yields ints, so no int() cast is needed.
    seeds = list(range(1, args['nsplits'] + 1))
    workers = args['workers']
    mpi = args['mpi']

    # Pick the parallel backend: MPI across nodes, or local processes.
    Executor = EnhancedMPIPoolExecutor if mpi else EnhancedProcessPoolExecutor

    dat = TrainingDataset(genome, ds, context_len, workers=workers, executor=Executor)
    # Keep only peptide files for the requested length; an empty peplen
    # matches every '*.pkl' file (default: all lengths).
    dat.pepfiles = [f for f in dat.pepfiles if '%s.pkl' % peplen in f]
    dat.load_peptides_options(max_bs_or_rank=max_bs_or_rank, max_contexts=max_contexts)
    dat.load_peptides()

    # One split/encode/write cycle per seed; None options serialize as 0
    # in the output filename.
    for seed in seeds:
        dat.split_dataset(ratio=ratio, same_tpm=same_tpm, same_bs=same_bs, seed=seed)
        dat.encode_peptides(seed=seed)
        ds_name = '%s_%s_padding%d_maxBS%d_maxContexts%d_ratio%d_%s%s%sseed%d' % (
            ds, genome, context_len, max_bs_or_rank, max_contexts if max_contexts else 0,
            ratio if ratio else 0, 'peplen%s_' % peplen if peplen else '', 'sameTPM_' if same_tpm else '',
            'sameBS_' if same_bs else '', seed)
        write_to_disk(dat, ds_name)
# Script entry point: run the dataset-building pipeline when executed directly.
if __name__ == '__main__':
    main()