# LoginputEngine/main.py at master · R0uter/LoginputEngine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import tqdm
import os
from train_kenlm import arpa_to_lmdb, data_produce, build_tokenizer
# Make sure the output directory exists before anything writes into it.
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the directory between the check and the call.
os.makedirs('./result_files', exist_ok=True)

# Path to the KenLM `lmplz` binary, relative to the working directory.
lmplz = 'train_kenlm/kenlm/build/bin/lmplz'
# Text file passed to lmplz via --text in the Phase 3 command in main().
data = './result_files/data_cuted.txt'
# ARPA file passed to lmplz via --arpa in the Phase 3 command in main().
arpa = './result_files/log.arpa'


def main():
    """Entry point for the training pipeline; uncomment a phase below to run it.

    Every phase call is currently commented out, so calling this function does
    nothing and returns ``None``. The docstring also gives the function a real
    body: a ``def`` followed only by comments does not compile.
    """
    # ── Phase 1: Build tokenizer word list (run once per corpus) ─────────────
    # Trains a HuggingFace Unigram tokenizer on the raw corpus and exports a
    # Chinese word list to result_files/word_list.txt
    # build_tokenizer.gen_word_list(vocab_size=200_000)

    # ── Phase 2: Process corpus with tokenizer + pinyin ───────────────────────
    # Segments corpus using the Unigram tokenizer (Viterbi decoding),
    # simultaneously derives per-token pinyin from sentence context.
    # Outputs: result_files/data_cuted.txt  (for KenLM)
    #          result_files/word_pinyin.txt (for emission DB)
    # data_produce.gen_data_txt(process_num=6, mem_limit_gb=10)

    # ── Phase 3: Train KenLM n-gram model ────────────────────────────────────
    # os.system('{} -o 3 --verbose_header --text {}  --arpa {} --prune 0 30 50'.format(lmplz, data, arpa))

    # ── Phase 4: Build emission + transition databases ────────────────────────
    # arpa_to_lmdb.gen_emission_and_database()


def test():
    """Time one sample query, then report the first-candidate hit rate.

    Loads the ``dag_v2`` data, runs ``get_candidates_from`` on a fixed pinyin
    string while timing the call, and prints each returned candidate's path
    and score. It then runs every key of ``res.test.smallData`` through
    ``get_candidates_from`` and counts a hit whenever the joined path of the
    first candidate equals the expected value. The hit rate and the module's
    counter stats are printed at the end.
    """
    import time

    import utility
    from dag import dag_v2 as dag
    # dag.Database_Type = dag.kLMDB
    dag.load_data()

    pys = utility.get_pinyin_str("he'li'ji'qun'zhong'man'yi'de'fang'an")
    # perf_counter() differences give the full elapsed time. The previous
    # `timedelta.microseconds` only held the sub-second component, so any
    # run of one second or longer was reported incorrectly.
    started = time.perf_counter()
    candidates = dag.get_candidates_from(pys, path_num=10)
    elapsed_ms = (time.perf_counter() - started) * 1000
    print('Running time:{}ms'.format(elapsed_ms))
    for item in candidates:
        print('/'.join(item.path), item.score)
    print(dag.get_counter_stats())

    import res.test
    test_data = res.test.smallData
    total = len(test_data)
    hit = 0
    # The context manager closes the progress bar even if a query raises.
    with tqdm.tqdm(total=total) as pbar:
        for py, value in test_data.items():
            pbar.update()
            r = dag.get_candidates_from(py, path_num=10)
            # 'None' stands in for "no candidate returned".
            rstr = ''.join(r[0].path) if len(r) > 0 else 'None'
            if rstr == value:
                hit += 1
            elif pbar.n % 100 == 0 and len(r) > 0:
                # Only misses that land on every 100th item are printed, to
                # keep the output short.
                print("test:{}, result:{}, should:{}".format(py, '/'.join(r[0].path), value))
    # An empty test set would otherwise raise ZeroDivisionError.
    hit_rate = hit / total * 100 if total else 0.0
    print('命中率：{}%'.format(hit_rate))
    print(dag.get_counter_stats())


# Script entry point: calls main() when the file is executed directly.
# Swap the commented call below to run test() instead.
if __name__ == '__main__':
    main()
    # test()

    # Ad-hoc snippet kept for reference: opens the transition LMDB file
    # read-only and counts the records it contains.
    # import lmdb
    #
    # env = lmdb.open('./result_files/transition_v2.mdb',
    #                 map_size=1048576000,
    #                 readonly=True,
    #                 lock=False,
    #                 subdir=False)
    # with env.begin() as txn:
    #     num_records = sum(1 for _ in txn.cursor())
    # print(f"Total N-gram records: {num_records}")