train.py

import numpy as np
import tensorflow as tf
import config as config
import lstm_lm as lstm_lm
import bilstm_lm as bilstm_lm
con = config.Config()
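
# Note (an assumption for readability, inferred from how `con` is used below): config.Config
# is expected to provide TRAIN_DATA, EVAL_DATA, TEST_DATA, BATCH_SIZE, NUM_STEP, VOCAB_SIZE,
# HIDDEN_SIZE, NUM_LAYERS, NUM_EPOCH and LSTM_KEEP_PROB.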

# Run train_op with the given model on the given batches and return the perplexity
# over the full data
def run_epoch(session, model, batches, train_op, drop_out_keep, output_log, step):
    # Auxiliary variables for computing the average perplexity
    total_costs = 0.0
    iters = 0
    # Train for one epoch
    for x, y in batches:
        # Run train_op on the current batch and compute the loss; the cross-entropy
        # loss measures the probability that the next word is the given target word
        cost, _ = session.run(
            [model.cost, train_op],
            {model.input_data: x, model.targets: y,
             model.sequence_length: np.array([model.num_steps] * model.batch_size),
             model.drop_out_prob: drop_out_keep}
        )
        total_costs += cost
        iters += model.num_steps
        # Only print logs during training
        if output_log and step % 1000 == 0:
            print('After %d steps, perplexity is %.3f' % (step, np.exp(total_costs / iters)))
        step += 1
    # Return the perplexity of the given model on the given data
    return step, np.exp(total_costs / iters)
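
# Note on the value returned above: iters accumulates num_steps per batch, so
# np.exp(total_costs / iters) is the per-word perplexity, assuming model.cost in
# lstm_lm / bilstm_lm is the cross-entropy summed over the num_steps positions of a
# batch (averaged over the batch dimension).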

# Read data from a file and return a list containing the word ids
def read_data(file_path):
    with open(file_path, 'r') as fin:
        # Read the whole document into one long string
        lines = []
        for x in fin.readlines():
            lines.append(x)
        id_string = ' '.join([line.strip() for line in lines])
    id_list = [int(w) for w in id_string.split()]  # convert the word ids to integers
    print(file_path + " read success")
    return id_list
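
# For reference (an assumption inferred from the parsing above, not from the data files
# themselves): TRAIN_DATA / EVAL_DATA / TEST_DATA are expected to contain whitespace-separated
# integer word ids, e.g. a line such as "9970 9971 9972", which read_data flattens into one
# long id list.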

def make_batch(id_list, batch_size, num_step):
    # Compute the total number of batches; each batch contains batch_size * num_step words
    num_batches = (len(id_list) - 1) // (batch_size * num_step)
    # Arrange the data into a 2-D array of shape [batch_size, num_batches * num_step]
    data = np.array(id_list[: num_batches * batch_size * num_step])
    data = np.reshape(data, [batch_size, num_batches * num_step])
    # Split the data along the second dimension into num_batches batches and store them in a list
    data_batches = np.split(data, num_batches, axis=1)
    # Repeat the operation above, but with every position shifted one step to the right;
    # this gives the next word the RNN has to predict at each output step
    label = np.array(id_list[1: num_batches * batch_size * num_step + 1])
    label = np.reshape(label, [batch_size, num_batches * num_step])
    label_batches = np.split(label, num_batches, axis=1)
    # Return a list of length num_batches, where each entry holds one data matrix and one label matrix
    return list(zip(data_batches, label_batches))
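
# Illustrative sketch of make_batch (hypothetical values, not part of the training flow):
# with id_list = [0, 1, ..., 12], batch_size = 2 and num_step = 3, num_batches = 12 // 6 = 2;
# data is reshaped to [[0 1 2 3 4 5], [6 7 8 9 10 11]] and split into two [2, 3] batches,
# and the labels use the same layout shifted right by one, so the first data batch
# [[0 1 2], [6 7 8]] is paired with the label batch [[1 2 3], [7 8 9]].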

def main():
    # Define the initializer
    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    # Define the recurrent neural network model used for training
    with tf.variable_scope('language_model', reuse=None, initializer=initializer):
        # train_model = lstm_lm.Lstm_LanguageModel(True, con.BATCH_SIZE, con.NUM_STEP, con.VOCAB_SIZE,
        #                                          con.HIDDEN_SIZE, con.NUM_LAYERS)
        train_model = bilstm_lm.BiLstm_LanguageModel(True, con.BATCH_SIZE, con.NUM_STEP, con.VOCAB_SIZE,
                                                     con.HIDDEN_SIZE, con.NUM_LAYERS)
    # Define the recurrent neural network model used for evaluation. It shares its
    # parameters with train_model but applies no dropout
    with tf.variable_scope('language_model', reuse=True, initializer=initializer):
        # eval_model = lstm_lm.Lstm_LanguageModel(False, con.BATCH_SIZE, con.NUM_STEP, con.VOCAB_SIZE,
        #                                         con.HIDDEN_SIZE, con.NUM_LAYERS)
        eval_model = bilstm_lm.BiLstm_LanguageModel(False, con.BATCH_SIZE, con.NUM_STEP, con.VOCAB_SIZE,
                                                    con.HIDDEN_SIZE, con.NUM_LAYERS)
    # Train the model
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        train_batches = make_batch(read_data(con.TRAIN_DATA), con.BATCH_SIZE, con.NUM_STEP)
        eval_batches = make_batch(read_data(con.EVAL_DATA), con.BATCH_SIZE, con.NUM_STEP)
        test_batches = make_batch(read_data(con.TEST_DATA), con.BATCH_SIZE, con.NUM_STEP)
        step = 0
        min_perplexity = 999999.0
        for i in range(con.NUM_EPOCH):
            print('In iteration: %d' % (i + 1))
            step, train_pplx = run_epoch(sess, train_model, train_batches, train_model.train_op,
                                         con.LSTM_KEEP_PROB, True, step)
            print('Epoch: %d Train Perplexity: %.3f' % (i + 1, train_pplx))
            _, eval_pplx = run_epoch(sess, eval_model, eval_batches, tf.no_op(), 1.0, False, 0)
            print('Epoch: %d Eval Perplexity: %.3f' % (i + 1, eval_pplx))
            # Save a checkpoint whenever the eval perplexity improves
            if eval_pplx < min_perplexity:
                min_perplexity = eval_pplx
                # saver.save(sess, "model/Bilstm_model/lstm_lm.ckpt")
                saver.save(sess, "model/Bilstm_model/lstm_lm.ckpt")
        _, test_pplx = run_epoch(sess, eval_model, test_batches, tf.no_op(), 1.0, False, 0)
        print('Test Perplexity: %.3f' % test_pplx)


if __name__ == '__main__':
    main()