-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathMLE.py
More file actions
167 lines (138 loc) · 7.29 KB
/
MLE.py
File metadata and controls
167 lines (138 loc) · 7.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import abc,os,pickle,sys
import scipy.stats
import numpy as np
from datetime import datetime
import tensorflow as tf
from BatchIterator import PaddedDataIterator
from generation import *
from Plotter import get_intensity,get_integral,get_integral_empirical
import statsmodels.api as sm
import scipy.stats as stats
from Utils import sequence_filter,lambda_estimation,file2sequence,sequence2file
##############################################################################
# Experiment parameters.
#
# Command line: MLE.py <data_name> <num_sequences> <end_time>
# (original code had dead defaults DATA='hawkes' and SEQ_NUM=2000 that were
#  immediately overwritten by the sys.argv reads below; removed as dead code.
#  Synthetic DATA values: hawkes, gaussian, rnn, polynimial.)
BATCH_SIZE = 256        # minibatch size fed to the TF graph
MAX_STEPS = 300         # maximum padded sequence length (see lower_triangular_ones)
ITERS = 20000           # 100000 originally; how many optimizer iterations to train for
SEED = 1234             # graph-level seed so random sequences repeat across sessions
D_DIFF = False          # passed to PaddedDataIterator -- see BatchIterator
MARK = False            # passed to PaddedDataIterator (marked process) -- see BatchIterator
ITERATION = 1           # experiment repetition index, only used in output file names
DATA = sys.argv[1]      # dataset name
# T = 15.0              # end time of simulation (old hard-coded value)
T = float(sys.argv[3])  # end time of the observation window [0, T]
DIM_SIZE = 1            # NOTE(review): unused in this file -- kept for compatibility
SEQ_NUM = int(float(sys.argv[2]))  # number of real sequences to train on
# Real-world datasets are not subsampled / QQ-evaluated the way synthetic ones are.
REAL_DATA = DATA in ['911calls', 'hawkes_gaussian', 'hawkes_poly', 'mimic', 'meme',
                     'citation', 'stock', 'mixture1', 'mixture2', 'mixture3', 'mixture4']
tf.set_random_seed(SEED)
np.random.seed(SEED)
##############################################################################
# Prepare data: load cached real/fake sequences, or build and cache them.
FILE_NAME = 'pickled_data_ppgan_{}'.format(DATA)
if not os.path.isfile(FILE_NAME):
    if DATA == 'gaussian':
        # Synthetic ground truth: sum of 3 Gaussian kernels.
        # QQ_plot for gaussian is not good as hawkes,selfcorrecting,
        # perhaps that simulating is not good.
        intensityGaussian = IntensitySumGaussianKernel(3, [3, 7, 11], [1, 1, 1], [2, 3, 2])
        real_sequences = generate_sample(intensityGaussian, T, 20000)
        sequence2file(real_sequences, 'gaussian')
    else:
        real_sequences = file2sequence(DATA)
    # Fake sequences: homogeneous Poisson whose rate matches the empirical
    # event rate of the real data.
    lambda0 = np.mean([len(item) for item in real_sequences]) / T
    intensityPoisson = IntensityHomogenuosPoisson(lambda0)
    fake_sequences = generate_sample(intensityPoisson, T, 2000)
    # BUG FIX: use a context manager instead of leaking the file handle.
    with open(FILE_NAME, 'wb') as f:
        pickle.dump([real_sequences, fake_sequences], f)
else:
    # BUG FIX: the original loaded the pickle twice (leaking two open file
    # handles); a single guarded load yields identical values.
    with open(FILE_NAME, 'rb') as f:
        real_sequences, fake_sequences = pickle.load(f)
print((np.mean([len(item) for item in real_sequences]) / T),
      ((np.mean([len(item) for item in fake_sequences]) / T)))
if not REAL_DATA:
    # Subsample synthetic data down to the requested training-set size.
    real_sequences = real_sequences[:SEQ_NUM]
real_iterator = PaddedDataIterator(real_sequences, T, MARK, D_DIFF)
# ---------------------------------------------------------------------------
# Mixture-of-Gaussians intensity model: K fixed Gaussian kernels whose
# mixture weights `coef` are the only trainable parameters.
# NOTE: op-creation order matters for tf.random_uniform seeding under
# tf.set_random_seed, so these statements are left exactly as-is.
K= 3
#should add more modal
coef = tf.Variable(tf.random_uniform([K], 0, 1, tf.float32),name='coef')
# Kernel centers and widths are fixed constants; only `coef` is learned.
center = tf.constant([2.7,7.3,11.5], tf.float32) #[1.0,3.0,5.0,7.0,9.0,11.0,14.0] np.arange(1,14.1,13.0/(K-1))
std = tf.constant(np.ones([K]), tf.float32)
# Padded batch of event timestamps, plus the true length of each sequence.
data = tf.placeholder(tf.float32, [BATCH_SIZE,None])
seqlen = tf.placeholder(tf.int32, [BATCH_SIZE])
# Observation window [0, T] for every sequence in the batch.
tend = tf.constant(np.ones([BATCH_SIZE])*T, tf.float32)
tstart = tf.constant(np.zeros([BATCH_SIZE]), tf.float32)
# Row i of a lower-triangular ones matrix has i+1 leading ones, so gathering
# row (seqlen - 1) yields a 0/1 mask over the padded time axis of `data`.
lower_triangular_ones = tf.constant(np.tril(np.ones([MAX_STEPS,MAX_STEPS])),dtype=tf.float32)
seqlen_mask = tf.slice(tf.gather(lower_triangular_ones, seqlen - 1),[0, 0], tf.shape(data))
# ---------------------------------------------------------------------------
# Negative log-likelihood of an inhomogeneous Poisson process with intensity
#   lambda(t) = sum_k coef[k] * N(t; center[k], std[k]):
#   -LL = -( sum_i log lambda(t_i) - int_tstart^tend lambda(t) dt )
dist_list = [tf.distributions.Normal(center[i_], std[i_]) for i_ in range(K)]
mul_intens = 0   # lambda(t) evaluated at every (padded) event time
int_intens = 0   # integral of lambda over [tstart, tend]
for i_ in range(K):
    mul_intens += coef[i_] * dist_list[i_].prob(data)
    int_intens += coef[i_] * (dist_list[i_].cdf(tend) - dist_list[i_].cdf(tstart))
# Mask out padded positions before summing per-event log-intensities.
# (the name "loglikeylihood" is a historical typo, kept because later code uses it)
loglikeylihood = tf.reduce_sum((tf.log(mul_intens)) * seqlen_mask, axis=1)
loglikeylihood -= int_intens
# Minimize the mean NEGATIVE log-likelihood over the batch.
loglikeylihood = - tf.reduce_mean(loglikeylihood)
train_variables = tf.trainable_variables()
# BUG FIX: under Python 3, print(map(...)) prints a useless <map object>;
# materialize the variable names as a list instead.
print([v.op.name for v in train_variables])
# Only the mixture weights are optimized; centers/widths stay fixed.
trainable_variable = [v for v in train_variables if v.name.startswith("coef")]
train_op = tf.train.RMSPropOptimizer(learning_rate=2e-3).minimize(loglikeylihood, var_list=trainable_variable)
# ---------------------------------------------------------------------------
# Output directory, empirical target intensity, and TF session setup.
now = datetime.now()
saved_file = "gaussian_{}_{}_{}_{}_{}_{}".format(
    DATA, SEQ_NUM, ITERATION, now.day, now.hour, now.minute)
out_dir = 'out/%s' % saved_file
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
# Empirical intensity of the real data on an n_t-point time grid; used as the
# reference curve when monitoring training progress.
n_t = 30
ts_real, intensity_real = get_intensity(real_sequences, T, n_t)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0, allow_growth=True)
config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
# Early-stopping state: stop once the learned weights move less than 1e-3
# between consecutive iterations.
stop_indicator = False
last_value = 0
# ---------------------------------------------------------------------------
# Training loop: fit the mixture weights by MLE, periodically monitor the
# fitted intensity against the empirical one, and dump samples when done.
# NOTE(review): nesting reconstructed from a whitespace-stripped scrape; the
# final-dump check must sit at loop level because ITERS-1 is never % 1000 == 0.
for it in range(ITERS):
    real_batch = real_iterator.next_batch(BATCH_SIZE)
    feed = {data: np.reshape(real_batch[0], real_batch[0].shape[:2]),
            seqlen: real_batch[1]}
    loss, _, coef_ = sess.run([loglikeylihood, train_op, coef], feed_dict=feed)
    # Converged when the weights moved less than 1e-3 since last iteration.
    if np.max(np.abs(last_value - coef_)) < 1e-3:
        stop_indicator = True
    last_value = coef_
    if it % 1000 == 0:
        print ('Iter: {}; loss: {}; {} coef;{}'.format(it, loss, DATA, coef_))
        # Sample from the current fit and compare its empirical intensity
        # against the real data's.
        intensityGaussian = IntensitySumGaussianKernel(K, [2.7, 7.3, 11.5], np.ones([K]), coef_)
        generated_sequences = generate_sample(intensityGaussian, T, 256)
        ts_gen, intensity_gen = get_intensity(generated_sequences, T, n_t)
        # Relative L2 distance between the two intensity curves.
        # can use correlation or other metric
        deviation = np.linalg.norm(intensity_gen - intensity_real) / np.linalg.norm(intensity_real)
        print ('Iter: {}; deviation: {}'.format(it, deviation))
        plt.plot(ts_real, intensity_real, label='real')
        plt.plot(ts_gen, intensity_gen, label='generated')
        plt.legend(loc=1)
        plt.xlabel('time')
        plt.ylabel('intensity')
        plt.savefig('out/{}/{}_{}.png'
                    .format(saved_file, str(it).zfill(3), deviation), bbox_inches='tight')
        plt.close()
        if not REAL_DATA and DATA != "rmtpp":
            # QQ-plot of the time-rescaled intervals against Exp(1): a good
            # fit puts the points on the 45-degree line.
            integral_intensity = np.asarray(get_integral(generated_sequences, DATA))
            fig = plt.figure()
            left = -1.8  # x coordinate for text insert
            ax1 = fig.add_subplot(1, 2, 1)
            fig = sm.qqplot(integral_intensity, stats.expon, distargs=(),
                            loc=0, scale=1, line='45', ax=ax1)
            plt.grid()
            ax2 = fig.add_subplot(1, 2, 2)
            top = ax2.get_ylim()[1] * 0.75
            _, slope_intercept = stats.probplot(integral_intensity, dist=stats.expon, plot=ax2)
            ax2.text(left, top, "{}_{}".format(slope_intercept[0], slope_intercept[1]),
                     verticalalignment='top')
            plt.grid()
            fig.savefig('out/{}/{}.png'.format(saved_file, it))
            plt.close()
    if it == ITERS - 1 or stop_indicator:
        # Budget exhausted or converged: dump 2000 sequences sampled from the
        # fitted model and stop.
        intensityGaussian = IntensitySumGaussianKernel(K, [2.7, 7.3, 11.5], np.ones([K]), coef_)
        generated_sequences = generate_sample(intensityGaussian, T, 2000)
        sequence2file(generated_sequences, 'gaussian_solver_{}_{}_{}'.format(DATA, SEQ_NUM, ITERATION))
        break