diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/cifar.py b/cifar.py new file mode 100644 index 0000000..59f19cf --- /dev/null +++ b/cifar.py @@ -0,0 +1,31 @@ +import keras +from keras.datasets import cifar10 +import numpy as np + +def load(): + num_classes = 10 # number of classes + (x_train, y_train), (x_test, y_test) = cifar10.load_data() + + # print(x_train.shape[0], 'train samples') + # print(x_test.shape[0], 'test samples') + # print(type(x_train)) + + x_train = x_train.reshape((x_train.shape[0],-1)) + x_test = x_test.reshape((x_test.shape[0],-1)) + # print('x_train shape:', x_train.shape) + # print(x_train[0],x_train.dtype) + n, D = x_train.shape # (n_sample, n_feature) + x_train = np.divide(x_train,255.0) + x_test = np.divide(x_test,255.0) + + print("Load CIFAR10 dataset.") + print(x_train.shape[0], 'train samples') + print(x_test.shape[0], 'test samples') + + d = np.int32(n / 2) * 2 # number of random features + + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + R = np.max(np.linalg.norm(x_train,2,axis=1))**2 + return num_classes,x_train,x_test,y_train,y_test,n,D,R,d \ No newline at end of file diff --git a/dnn.py b/dnn.py new file mode 100644 index 0000000..63741f4 --- /dev/null +++ b/dnn.py @@ -0,0 +1,103 @@ +'''Train a simple deep CNN on the CIFAR10 small images dataset. +GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): + THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python cifar10_cnn.py +It gets down to 0.65 test logloss in 25 epochs, and down to 0.55 after 50 epochs. +(it's still underfitting at that point, though). +''' + +from __future__ import print_function +import keras +from keras.datasets import cifar10 +from keras.preprocessing.image import ImageDataGenerator +from keras.models import Sequential +from keras.layers import Dense, Dropout, Activation, Flatten +from keras.layers import Conv2D, MaxPooling2D +import numpy.linalg +from numpy.linalg import svd +import mnist +import cifar +import imdb +batch_size = 16 +epochs = 50 +data_augmentation = False + +# The data, shuffled and split between train and test sets: +num_classes,x_train,x_test,y_train,y_test,n,D,R,d = imdb.load() + +model = Sequential() +#model.add(Flatten(input_shape=x_train.shape[1:])) +layer1 = Dense(256,input_shape=(x_train.shape[1],))# +#Conv2D(32, (3, 3),strides=(3,3),padding='same', +# ) +model.add(layer1) +# model.add(Activation('relu')) +# model.add(Dropout(0.5)) +# layer2 = Dense(128) +#Conv2D(32, (3, 3)) +# model.add(layer2) +# model.add(Activation('relu')) +# model.add(Dropout(0.5)) +#model.add(MaxPooling2D(pool_size=(2, 2))) +#model.add(Dropout(0.25)) +# layer3 = Dense(64)#Conv2D(64, (3, 3), padding='same') +# model.add(layer3) +# model.add(Activation('relu')) +#model.add(Dropout(0.5)) +#model.add(Dense(100)) +#model.add(Activation('relu')) +layer5 = Dense(num_classes) +model.add(layer5) +model.add(Activation('softmax')) +model.summary() +# initiate RMSprop optimizer +opt = keras.optimizers.sgd(lr=0.01) + +# Let's train the model using RMSprop +model.compile(loss='categorical_crossentropy', + optimizer=opt, + metrics=['accuracy']) + + +num_classes,x_train,x_test,y_train,y_test,n,D,R,d = imdb.load() +from numpy.random import shuffle + + +model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_test, y_test), + shuffle=True, + verbose=2) + + +# spectrum1 = svd(layer1.get_weights()[0],compute_uv=False) +# spectrum2 = svd(layer2.get_weights()[0],compute_uv=False) +# spectrum3 = svd(layer3.get_weights()[0],compute_uv=False) +# #spectrum4 = svd(numpy.reshape(layer4.get_weights()[0],(64*9,64)),compute_uv=False) +# spectrum5 = svd(layer5.get_weights()[0],compute_uv=False) + +# numpy.set_printoptions(threshold=numpy.nan) +# #print(layer1.get_weights()[0].dot(layer1.get_weights()[0].transpose())) +# W = layer1.get_weights()[0].transpose() +# print(W.shape) +# print(W.dot(W.transpose()).shape) +# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1)) +# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2]) +# W = layer2.get_weights()[0].transpose() +# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1)) +# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2]) +# W = layer3.get_weights()[0].transpose() +# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1)) +# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2]) +# #print(spectrum4) +# W = layer5.get_weights()[0].transpose() +# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1)) +# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2]) + +# print(spectrum1) +# print(spectrum2) +# print(spectrum3) +# print(spectrum5) + +#preds = model.predict_on_batch(x_test) +#print("done",preds[0]) \ No newline at end of file diff --git a/imdb.py b/imdb.py new file mode 100644 index 0000000..c36cf24 --- /dev/null +++ b/imdb.py @@ -0,0 +1,47 @@ +import keras +import numpy as np +from keras.datasets import imdb +from scipy.sparse import csr_matrix +def load(): + (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000, + skip_top=30, + maxlen=None) + indptr = [0] + indices = [] + data = [] + vocabulary = {} + for d in x_train: + for index in d: + indices.append(index) + data.append(1.0/len(d)) + indptr.append(len(indices)) + x_train = csr_matrix((data, indices, indptr), dtype=float).toarray() + + indptr = [0] + indices = [] + data = [] + vocabulary = {} + for d in x_test: + for index in d: + indices.append(index) + data.append(1.0/len(d)) + indptr.append(len(indices)) + x_test = csr_matrix((data, indices, indptr), dtype=float).toarray() + print("Load IMDB dataset.") + print(x_train.shape[0], 'train samples') + print(x_test.shape[0], 'test samples') + print(x_train.shape[1],"features") + #x_train = keras.preprocessing.text.one_hot(x_train,10000) + print(x_train[0]) + #x_test = keras.preprocessing.text.one_hot(x_test,10000) + n, D = x_train.shape # (n_sample, n_feature) + d = np.int32(n / 2) * 2 # number of random features + num_classes = 2 + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + R = np.max(np.linalg.norm(x_train,2,axis=1))**2 + # x_train/=R + # x_test/=R + # R = 1.0 + return num_classes,x_train,x_test,y_train,y_test,n,D,R,d \ No newline at end of file diff --git a/kernels.py b/kernels.py index ef6803f..61662dd 100644 --- a/kernels.py +++ b/kernels.py @@ -3,76 +3,237 @@ from keras import backend as K def D2(X, Y): - """ Calculate the pointwise (squared) distance. - - Arguments: - X: of shape (n_sample, n_feature). - Y: of shape (n_center, n_feature). - - Returns: - pointwise distances (n_sample, n_center). - """ - XX = K.sum(K.square(X), axis = 1, keepdims=True) - if X is Y: - YY = XX - else: - YY = K.sum(K.square(Y), axis = 1, keepdims=True) - XY = K.dot(X, K.transpose(Y)) - d2 = K.reshape(XX, (K.shape(X)[0], 1)) \ - + K.reshape(YY, (1, K.shape(Y)[0])) \ - - 2 * XY - return d2 + """ Calculate the pointwise (squared) distance. + + Arguments: + X: of shape (n_sample, n_feature). + Y: of shape (n_center, n_feature). + + Returns: + pointwise distances (n_sample, n_center). + """ + XX = K.sum(K.square(X), axis = 1, keepdims=True) + if X is Y: + YY = XX + else: + YY = K.sum(K.square(Y), axis = 1, keepdims=True) + XY = K.dot(X, K.transpose(Y)) + d2 = K.reshape(XX, (K.shape(X)[0], 1)) \ + + K.reshape(YY, (1, K.shape(Y)[0])) \ + - 2 * XY + return d2 + +def Linear(X,Y): + return K.dot(X,K.transpose(Y)) def Gaussian(X, Y, s): - """ Gaussian kernel. - - Arguments: - X: of shape (n_sample, n_feature). - Y: of shape (n_center, n_feature). - s: kernel bandwidth. - - Returns: - kernel matrix of shape (n_sample, n_center). - """ - assert s > 0 - - d2 = D2(X, Y) - gamma = np.float32(1. / (2 * s ** 2)) - G = K.exp(-gamma * K.clip(d2, 0, None)) - return G + """ Gaussian kernel. + + Arguments: + X: of shape (n_sample, n_feature). + Y: of shape (n_center, n_feature). + s: kernel bandwidth. + + Returns: + kernel matrix of shape (n_sample, n_center). + """ + assert s > 0 + + d2 = D2(X, Y) + gamma = np.float32(1. / (2 * s ** 2)) + G = K.exp(-gamma * K.clip(d2, 0, None)) + return G def Laplace(X, Y, s): - """ Laplace kernel. - - Arguments: - X: of shape (n_sample, n_feature). - Y: of shape (n_center, n_feature). - s: kernel bandwidth. - - Returns: - kernel matrix of shape (n_sample, n_center). - """ - assert s > 0 - - d2 = K.clip(D2(X, Y), 0, None) - d = K.sqrt(d2) - G = K.exp(- d / s) - return G + """ Laplace kernel. + + Arguments: + X: of shape (n_sample, n_feature). + Y: of shape (n_center, n_feature). + s: kernel bandwidth. + + Returns: + kernel matrix of shape (n_sample, n_center). + """ + assert s > 0 + + d2 = K.clip(D2(X, Y), 0, None) + d = K.sqrt(d2) + G = K.exp(- d / s) + return G def Cauchy(X, Y, s): - """ Cauchy kernel. - - Arguments: - X: of shape (n_sample, n_feature). - Y: of shape (n_center, n_feature). - s: kernel bandwidth. - - Returns: - kernel matrix of shape (n_sample, n_center). - """ - assert s > 0 - - d2 = D2(X, Y) - s2 = np.float32(s**2) - G = 1 / K.exp( 1 + K.clip(d2, 0, None) / s2) - return G + """ Cauchy kernel. + + Arguments: + X: of shape (n_sample, n_feature). + Y: of shape (n_center, n_feature). + s: kernel bandwidth. + + Returns: + kernel matrix of shape (n_sample, n_center). + """ + assert s > 0 + + d2 = D2(X, Y) + s2 = np.float32(s**2) + G = 1 / K.exp( 1 + K.clip(d2, 0, None) / s2) + return G + +import tensorflow as tf + +def ReLu(X,Y,s): + #Francis Bach - Breaking the Curse of Dimensionality with Convex Neural Networks + s = np.float32(s) + XX = K.sum(K.square(X), axis = 1, keepdims=True) + if Y is X: + YY = XX + else: + YY = K.sum(K.square(Y), axis = 1, keepdims=True) + XX = K.reshape(XX, (K.shape(X)[0], 1)) + YY = K.reshape(YY, (1,K.shape(Y)[0])) + + xx = K.sqrt(XX/s+np.float32(1.0)) + yy = K.sqrt(YY/s+np.float32(1.0)) + + XY = K.dot(X, K.transpose(Y)) + + cos = (XY/s + np.float32(1.0)) / (K.dot(xx,yy)) # + cos = K.clip(cos,-1,1) + delta = tf.acos(cos) + pi = np.float32(3.14159265359) + d = K.int_shape(X)[1] + return K.dot(xx,yy)*((pi-delta)*K.cos(delta)+K.sin(delta))/(2*d*pi) #dot product, eg. macht nur das erste sinn. + + +def InversePolynomial(X,Y,s): + # Zhang, Lee and Jordan - l1 regularized Neural Networks are Improperly Learniable in Polynomial Time + XY = K.dot(X, K.transpose(Y)) + XX = K.sqrt(K.sum(K.square(X), axis = 1, keepdims=True)) + if Y is X: + YY = XX + else: + YY = K.sqrt(K.sum(K.square(Y), axis = 1, keepdims=True)) + XX = K.reshape(XX, (K.shape(X)[0], 1)) + YY = K.reshape(YY, (1,K.shape(Y)[0])) + XY = XY/K.dot(XX,YY) + XY = K.clip(XY,-1,1) + for i in range(s): + XY = np.float(1)/(np.float32(2)-XY) + XY = K.clip(XY,0,1) + return XY + + +def CompositionalFeedForwardKernel(X,Y,s): + # Amit Daniely, Roy Frostig, Yoram Singer - Toward Deeper Understanding of Neural Networks -- The Power of Initialization and a Dual View on Expressivity + s = np.float32(s) + + XX = K.sqrt(K.sum(K.square(X), axis = 1, keepdims=True)) + if Y is X: + YY = XX + else: + YY = K.sqrt(K.sum(K.square(Y), axis = 1, keepdims=True)) + + XX = K.reshape(XX, (K.shape(X)[0], 1)) + YY = K.reshape(YY, (1,K.shape(Y)[0])) + + XY = K.dot(X, K.transpose(Y)) + XY = tf.div(XY,K.dot(XX,YY)) + XY = K.clip(XY,-1,1) + + pi = np.float32(3.14159265359) + d = K.int_shape(X)[1] + XY = XY/d + for i in range(s): + XY = (K.sqrt(1-K.square(XY))+tf.multiply((pi-tf.acos(XY)),XY))/pi + #XY = K.clip(XY,-1,1) + return XY + +def ArcCosine(X,Y,s,bias=0.0): + # Youngmin Cho, Lawrence K. Saul - Kernel Methods for Deep Learning + # k(x,y) = \frac 1 \pi ||x||\cdot||y|| \cdot \left( \sin \theta + (\pi - \theta)\cos \theta\right) + #bias should be as large as R + if s<=1: + XX = K.sqrt(bias+K.sum(K.square(X), axis = 1, keepdims=True)) + if Y is X: + YY = XX + else: + YY = K.sqrt(bias+K.sum(K.square(Y), axis = 1, keepdims=True)) + XX = K.reshape(XX, (K.shape(X)[0], 1)) + YY = K.reshape(YY, (1,K.shape(Y)[0])) + + XY = K.dot(X, K.transpose(Y))+bias + XY = tf.div(XY,K.dot(XX,YY)) + XY = K.clip(XY,-1,1) + theta = tf.acos(XY) + pi = np.float32(3.14159265359) + + return (K.dot(XX,YY) * (K.sin(theta) + (pi-theta) * K.cos(theta)))/pi + else: + XX = K.sqrt(bias+K.sum(K.square(X), axis = 1, keepdims=True)) + if Y is X: + YY = XX + else: + YY = K.sqrt(bias+K.sum(K.square(Y), axis = 1, keepdims=True)) + XX = K.reshape(XX, (K.shape(X)[0],1)) + YY = K.reshape(YY, (1,K.shape(Y)[0])) + XY = ArcCosine(X,Y,s-1)+bias + XY = tf.div(XY,K.dot(XX,YY)) + XY = K.clip(XY,-1,1) + theta = tf.acos(XY) + pi = np.float32(3.14159265359) + + return (K.dot(XX,YY) * (K.sin(theta) + (pi-theta) * K.cos(theta)))/pi + + +def Convolution(X,Y,kernel_size=(4,4)): + # XX = K.sqrt(K.sum(K.square(X), axis = 1, keepdims=False)) + # if Y is X: + # YY = XX + # else: + # YY = K.sqrt(K.sum(K.square(Y), axis = 1, keepdims=False)) + + # # XX = K.reshape(XX, (K.shape(X)[0])) + # # YY = K.reshape(YY, (K.shape(Y)[0], 1)) + # print(K.int_shape(X),K.int_shape(XX)) + # Xnorm = K.dot(tf.diag(1.0/XX),X) + # Ynorm = K.dot(tf.diag(1.0/YY),Y) + + xs = [] + ys = [] + pi = np.float32(3.14159265359) + print("before reshape",K.int_shape(X)) + x = K.reshape(X,(-1,28,28)) + y = K.reshape(Y,(-1,28,28)) + print("after reshape",K.int_shape(x)) + output_row = (K.int_shape(x)[1]-kernel_size[0])>>1+1 + output_col = (K.int_shape(x)[2]-kernel_size[1])>>1+1 + feature_dim = kernel_size[0]*kernel_size[1] + data_format = 'channels_last' + stride = 2 + for i in range(output_row): + for j in range(output_col): + slice_row = slice(i * stride, + i * stride + kernel_size[0]) + slice_col = slice(j * stride, + j * stride + kernel_size[1]) + if data_format == 'channels_first': + xs.append(K.reshape(x[:, slice_row, slice_col], + (1, -1,feature_dim))) + else: + xs.append(K.reshape(x[:, slice_row, slice_col], + (-1,1,feature_dim))) + ys.append(K.reshape(y[:, slice_row, slice_col], + (-1,1,feature_dim))) + + x_aggregate =K.concatenate(xs, axis=1) + y_aggregate =K.concatenate(ys, axis=1) + #1st dimension: example. 2nd dimension: position in output, 3rd dimension: position in patch + print("after patching", K.int_shape(x_aggregate),K.shape(x_aggregate)) + XY = tf.einsum("ijk,ljk->ilj",x_aggregate,y_aggregate)/feature_dim + #normalization is missing...crap this is never going to work. why isn't acos crashing? + #I should be able to figure this out...just some angles...hate thinking in tensors... + xy = (K.sqrt(1-K.square(XY))+tf.multiply((pi-tf.acos(XY)),XY))/pi + print("After einsum",K.int_shape(XY)) + hidden = tf.reduce_sum(xy,2)/(output_row*output_col) + return (K.sqrt(1-K.square(hidden))+tf.multiply((pi-tf.acos(hidden)),hidden))/pi diff --git a/mnist.py b/mnist.py index 02e614b..fc776b0 100644 --- a/mnist.py +++ b/mnist.py @@ -1,6 +1,6 @@ import numpy as np - -from keras.datasets.mnist import load_data +import keras +from keras.datasets.mnist import load_data as mnist_data def unit_range_normalize(X): @@ -11,13 +11,19 @@ def unit_range_normalize(X): SX = (X - min_) / diff_ return SX -def load(): +def prod(x): + s = 1 + for xx in x: + s*=xx + return s + +def load(convolution=False): # input image dimensions img_rows, img_cols = 28, 28 # the data, shuffled and split between train and test sets - (x_train, y_train), (x_test, y_test) = load_data() - + (x_train, y_train), (x_test, y_test) = mnist_data() + #if not convolution: x_train = x_train.reshape(x_train.shape[0], img_rows * img_cols) x_test = x_test.reshape(x_test.shape[0], img_rows * img_cols) @@ -29,5 +35,17 @@ def load(): print("Load MNIST dataset.") print(x_train.shape[0], 'train samples') print(x_test.shape[0], 'test samples') - - return (x_train, y_train), (x_test, y_test) + print(x_train[0]) + # convert class vectors to binary class matrices + num_classes = 10 + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + R = np.max(np.linalg.norm(x_train,2,axis=1))**2 + + n = x_train.shape[0] # (n_sample, n_feature) + D = prod(x_train.shape[1:]) + d = np.int32(n / 2) * 2 # number of random features + x_train = keras.utils.normalize(x_train,axis=1) + x_test = keras.utils.normalize(x_test,axis=1) + print(x_train[0].shape) + return num_classes,x_train,x_test,y_train,y_test,n,D,R,d diff --git a/run_expr.py b/run_expr.py index f2ba50e..4acf7c3 100644 --- a/run_expr.py +++ b/run_expr.py @@ -12,6 +12,7 @@ import numpy as np import time import warnings +from math import sqrt from distutils.version import StrictVersion from keras.layers import Dense, Input @@ -21,10 +22,16 @@ import kernels import mnist import utils +import cifar +import imdb from backend_extra import hasGPU from layers import KernelEmbedding, RFF from optimizers import PSGD, SGD +import tensorflow as tf +tf.logging.set_verbosity(tf.logging.ERROR) +import os +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' assert StrictVersion(keras.__version__) >= StrictVersion('2.0.8'), \ "Requires Keras (>=2.0.8)." @@ -37,11 +44,15 @@ assert keras.backend.backend() == u'tensorflow', \ "Requires Tensorflow (>=1.2.1)." -assert hasGPU(), "Requires GPU." +#assert hasGPU(), "Requires GPU." parser = argparse.ArgumentParser(description='Run EigenPro tests.') parser.add_argument('--kernel', type=str, default='Gaussian', help='kernel function (e.g. Gaussian, Laplace, and Cauchy)') +parser.add_argument('--dataset', type=str, default='mnist', + help='dataset used for evaluation (e.g. mnist, cifar10, imdb (more to follow))') +parser.add_argument('--depth', type=int, default=1, + help='the depth of the neural network that is mimiced by the kernel (if supported by kernel)') args = parser.parse_args() args_dict = vars(args) @@ -51,67 +62,72 @@ M = 4800 # (EigenPro) subsample size k = 160 # (EigenPro) top-k eigensystem -num_classes = 10 # number of classes +if args_dict['dataset'] == "mnist": + num_classes,x_train,x_test,y_train,y_test,n,D,R,d = mnist.load() +elif args_dict['dataset'] == "cifar10": + num_classes,x_train,x_test,y_train,y_test,n,D,R,d = cifar.load() +elif args_dict['dataset'] == "imdb": + num_classes,x_train,x_test,y_train,y_test,n,D,R,d = imdb.load() -(x_train, y_train), (x_test, y_test) = mnist.load() -n, D = x_train.shape # (n_sample, n_feature) -d = np.int32(n / 2) * 2 # number of random features -# convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) if args_dict['kernel'] == 'Gaussian': - s = 5 # kernel bandwidth + s = sqrt(D) # kernel bandwidth kernel = lambda x,y: kernels.Gaussian(x, y, s) elif args_dict['kernel'] == 'Laplace': s = np.sqrt(10, dtype=np.float32) kernel = lambda x,y: kernels.Laplace(x, y, s) - +elif args_dict['kernel'] == 'Linear': + kernel = lambda x,y: kernels.Linear(x, y) elif args_dict['kernel'] == 'Cauchy': s = np.sqrt(40, dtype=np.float32) kernel = lambda x,y: kernels.Cauchy(x, y, s) - +elif args_dict['kernel'] == 'ReLu': + print("AAAH",np.linalg.norm(x_train,2,axis=1).shape) + s = np.max(np.linalg.norm(x_train,2,axis=1))**2 + kernel = lambda x,y: kernels.ReLu(x, y, s) +elif args_dict['kernel'] == 'InversePolynomial': + s = int(args_dict["depth"]) + kernel = lambda x,y: kernels.InversePolynomial(x, y, s) +elif args_dict['kernel'] == 'CompositionalFeedForwardKernel': + s = int(args_dict["depth"]) + kernel = lambda x,y: kernels.CompositionalFeedForwardKernel(x, y, s) +elif args_dict['kernel'] == 'ArcCosine': + s = int(args_dict["depth"]) + kernel = lambda x,y: kernels.ArcCosine(x, y, s, R) +elif args_dict['kernel'] == 'Convolution': + s = int(args_dict["depth"]) + kernel = lambda x,y: kernels.Convolution(x, y) else: raise Exception("Unknown kernel function - %s. \ Try Gaussian, Laplace, or Cauchy" % args_dict['kernel']) - +print("Using ",args_dict["kernel"], "with Depth" ,args_dict["depth"]) trainers = collections.OrderedDict() Trainer = collections.namedtuple('Trainer', ['model', 'x_train', 'x_test']) +print(x_train.shape) + # Calculate step size and (Primal) EigenPro preconditioner. kf, scale, s0 = utils.asm_eigenpro_f( x_train, kernel, M, k, 1, in_rkhs=True) eta = np.float32(1.5 / s0) # 1.5 / s0 eta = eta * num_classes # correction due to mse loss -# Assemble Pegasos trainer. +# # Assemble Pegasos trainer. input_shape = (D+1,) # n_feature, (sample) index -ix = Input(shape=input_shape, dtype='float32', name='indexed-feat') +ix = Input(shape=(input_shape), dtype='float32', name='indexed-feat') x, index = utils.separate_index(ix) # features, sample_id kfeat = KernelEmbedding(kernel, x_train, - input_shape=(D,))(x) -y = Dense(num_classes, input_shape=(n,), - activation='linear', - kernel_initializer='zeros', - use_bias=False)(kfeat) - -model = Model(ix, y) -model.compile(loss='mse', - optimizer=PSGD(pred_t=y, index_t=index, eta=eta), - metrics=['accuracy']) -trainers['Pegasos'] = Trainer(model=model, - x_train = utils.add_index(x_train), - x_test=utils.add_index(x_test)) - + input_shape=(D,))(x) # Assemble kernel EigenPro trainer. embed = Model(ix, kfeat) y = Dense(num_classes, input_shape=(n,), activation='linear', kernel_initializer='zeros', + #kernel_regularizer=keras.regularizers.l2(0.1), use_bias=False)(kfeat) model = Model(ix, y) model.compile(loss='mse', @@ -124,51 +140,18 @@ x_train = utils.add_index(x_train), x_test=utils.add_index(x_test)) - -# Assemble SGD trainer. -rff_weights = np.float32( # for Gaussian kernel - np.sqrt(2. / (2 * 5 ** 2)) # s = 5 - * np.random.randn(D, d/2)) -input_shape = (D,) -x = Input(shape=input_shape, dtype='float32', name='feat') -rf_f = RFF(rff_weights, input_shape=input_shape) -y = Dense(num_classes, input_shape=(d,), - activation='linear', - kernel_initializer='zeros', - use_bias=False)(rf_f(x)) -model = Model(x, y) - -model.compile(loss='mse', - optimizer=SGD(eta=eta), - metrics=['accuracy']) -trainers['SGD with random Fourier feature'] = Trainer( - model=model, x_train = x_train, x_test=x_test) - -# Assemble EigenPro trainer. -f, scale, _ = utils.asm_eigenpro_f( - x_train, rf_f, M, k, .25) -y = Dense(num_classes, input_shape=(d,), - activation='linear', - kernel_initializer='zeros', - use_bias=False)(rf_f(x)) -model = Model(x, y) -model.compile(loss='mse', - optimizer=SGD(eta=scale*eta, eigenpro_f=f), - metrics=['accuracy']) -trainers['EigenPro with random Fourier feature'] = Trainer( - model=model, x_train = x_train, x_test=x_test) - # Start training. -for name, trainer in trainers.iteritems(): +for name, trainer in trainers.items(): print("") initial_epoch=0 np.random.seed(1) # Keras uses numpy random number generator train_ts = 0 # training time in seconds - for epoch in [1, 2, 5, 10, 20, 40]: + for epoch in [100]: start = time.time() + print("Running ",epoch," Iterations on ",name) trainer.model.fit( trainer.x_train, y_train, - batch_size=bs, epochs=epoch, verbose=0, + batch_size=bs, epochs=epoch, verbose=2, validation_data=(trainer.x_test, y_test), initial_epoch=initial_epoch) train_ts += time.time() - start diff --git a/utils.py b/utils.py index d94b0dc..c4c7b80 100644 --- a/utils.py +++ b/utils.py @@ -19,6 +19,7 @@ def add_index(X): matrix of shape (n_sample, n_feat+1). """ inx = np.reshape(np.arange(X.shape[0]), (-1, 1)) + print("inx",inx.shape) return np.hstack([X, inx]) def separate_index(IX): @@ -54,6 +55,7 @@ def rsvd(X, phi, M, k): sk: (k+1)-th largest eigenvalue of phi(X). """ n, _ = X.shape + print("rsvd",X.shape) index = np.random.choice(n, M, replace=False) A = phi(X[index]) @@ -69,7 +71,11 @@ def rsvd(X, phi, M, k): sk = np.sqrt(n / M) * S1[k] V = VT1[:k].T return s, V, sk - +def prod(x): + s = 1 + for xx in x: + s*=xx + return s def asm_eigenpro_f(feat, phi, M, k, tau, in_rkhs=False): """Assemble eigenpro map and calculate step size scale factor such that the update rule, @@ -91,10 +97,11 @@ def asm_eigenpro_f(feat, phi, M, k, tau, in_rkhs=False): """ start = time.time() - n, D = feat.shape + n = feat.shape[0] + D = prod(feat.shape[1:]) x = Input(shape=(D,), dtype='float32', name='feat') if in_rkhs: - if n >= 10**5: + if n >= 10**4: _s, _V = nystrom_kernel_svd(feat, phi, M, k) # phi is k(x, y) else: kfeat = KernelEmbedding(phi, feat, @@ -107,8 +114,8 @@ def asm_eigenpro_f(feat, phi, M, k, tau, in_rkhs=False): fmap = lambda _x: model.predict(_x, batch_size=1024) _s, _V, _sk = rsvd(feat, fmap, M, k) # phi is a feature map _s, _sk, _V = _s[:k], _s[-1], _V[:, :k] - print("SVD time: %.2f, Eigenvalue ratio: %.2f" % - (time.time() - start, _s[0] / _sk)) + print("SVD time: %.2f, Eigenvalue ratio: %.2f %.2f %.8f" % + (time.time() - start, _s[0] / _sk,_s[0] ,_sk)) s = K.constant(_s) V = K.constant(_V)