-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmakeTrainingDS.py
More file actions
executable file
·76 lines (60 loc) · 3.44 KB
/
makeTrainingDS.py
File metadata and controls
executable file
·76 lines (60 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
import argparse
import os
import pickle as pkl
import gzip
from camaptools.Dataset import TrainingDataset
from camaptools.EnhancedFutures import EnhancedProcessPoolExecutor, EnhancedMPIPoolExecutor
def write_to_disk(dat, ds_name):
    """Serialize an encoded training dataset to gzipped pickle files.

    Writes four files under ``output/trainDS/``, all prefixed with
    *ds_name*: codon embeddings, shuffled-codon embeddings, amino-acid
    embeddings, and the metadata dictionary.

    Parameters
    ----------
    dat : object
        Must expose ``enc_dct``, ``shuff_enc_dct``, ``aa_enc_dct`` and
        ``meta_dct`` attributes (presumably populated by
        TrainingDataset.encode_peptides — confirm against caller).
    ds_name : str
        Basename prefix for the output files.
    """
    out_dir = 'output/trainDS'
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(out_dir, exist_ok=True)

    # (object to dump, output filename) pairs — one gzip+pickle dump each,
    # replacing four copy-pasted with-blocks.
    outputs = [
        (dat.enc_dct, '%s_encoding-CodonEmbeddings.pkl.gz' % ds_name),
        (dat.shuff_enc_dct, '%s_encoding-CodonShuffleEmbeddings.pkl.gz' % ds_name),
        (dat.aa_enc_dct, '%s_encoding-AminoAcidEmbeddings.pkl.gz' % ds_name),
        (dat.meta_dct, '%s-Metadata.pkl.gz' % ds_name),
    ]
    for obj, fname in outputs:
        with gzip.open(os.path.join(out_dir, fname), 'wb') as f:
            pkl.dump(obj, f)
def main():
    """Build and serialize CAMAP training datasets.

    Parses command-line options, loads peptide data for the requested
    genome/dataset pair, then for each random seed splits the dataset,
    encodes the peptides, and writes one training dataset to
    ``output/trainDS/`` via write_to_disk().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--genome", help="genome [GRCh37.75, GRCm38.78, etc.]", type=str, default="GRCh38.98")
    parser.add_argument("-d", "--dataset", help="dataset [BLCL, EL4, etc.]", type=str, default="BLCL")
    parser.add_argument("-p", "--peplen", help="peptide length", type=str, default="")
    parser.add_argument("-c", "--context", help="mRNA context length on each side", type=int, default=162)
    parser.add_argument("-r", "--bs_or_rank", help="max binding score or rank (if < 25)", type=int, default=1250)
    parser.add_argument("-m", "--ncontexts", help="max contexts permitted to keep peptide", type=int, default=None)
    parser.add_argument("-t", "--ratio", help="non-source to source ratio", type=int, default=None)
    parser.add_argument("-y", "--sametpm", help="sample non-source keeping TPM proportions", action='store_true')
    parser.add_argument("-z", "--samebs", help="sample non-source keeping nM proportions", action='store_true')
    parser.add_argument("-s", "--nsplits", help="number of random splittings", type=int, default=1)
    parser.add_argument("-w", "--workers", help="number of parallel workers in addition to main", type=int, default=0)
    parser.add_argument("--mpi", help="Parallelize using MPI", action='store_true')
    # vars() is the documented way to get the namespace as a dict
    # (equivalent to .__dict__ but not reaching into internals).
    args = vars(parser.parse_args())

    genome = args['genome']
    ds = args['dataset']
    peplen = args['peplen']
    context_len = args['context']
    max_bs_or_rank = args['bs_or_rank']
    max_contexts = args['ncontexts']
    ratio = args['ratio']
    same_tpm = args['sametpm']
    same_bs = args['samebs']
    # range() already yields ints, so no int() cast is needed.
    seeds = list(range(1, args['nsplits'] + 1))
    workers = args['workers']
    mpi = args['mpi']

    # Pick the parallel backend: MPI across nodes, or local processes.
    Executor = EnhancedMPIPoolExecutor if mpi else EnhancedProcessPoolExecutor

    dat = TrainingDataset(genome, ds, context_len, workers=workers, executor=Executor)
    # Keep only peptide files for the requested length; an empty peplen
    # matches every '*.pkl' file (default: all lengths).
    dat.pepfiles = [f for f in dat.pepfiles if '%s.pkl' % peplen in f]
    dat.load_peptides_options(max_bs_or_rank=max_bs_or_rank, max_contexts=max_contexts)
    dat.load_peptides()

    # One split/encode/write cycle per seed; None options serialize as 0
    # in the output filename.
    for seed in seeds:
        dat.split_dataset(ratio=ratio, same_tpm=same_tpm, same_bs=same_bs, seed=seed)
        dat.encode_peptides(seed=seed)
        ds_name = '%s_%s_padding%d_maxBS%d_maxContexts%d_ratio%d_%s%s%sseed%d' % (
            ds, genome, context_len, max_bs_or_rank, max_contexts if max_contexts else 0,
            ratio if ratio else 0, 'peplen%s_' % peplen if peplen else '', 'sameTPM_' if same_tpm else '',
            'sameBS_' if same_bs else '', seed)
        write_to_disk(dat, ds_name)
# Script entry point: run the dataset-building pipeline when executed directly.
if __name__ == '__main__':
    main()