diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/cifar.py b/cifar.py
new file mode 100644
index 0000000..59f19cf
--- /dev/null
+++ b/cifar.py
@@ -0,0 +1,31 @@
+import keras
+from keras.datasets import cifar10
+import numpy as np
+
+def load():
+	num_classes = 10	    # number of classes
+	(x_train, y_train), (x_test, y_test) = cifar10.load_data()
+
+	# print(x_train.shape[0], 'train samples')
+	# print(x_test.shape[0], 'test samples')
+	# print(type(x_train))
+
+	x_train = x_train.reshape((x_train.shape[0],-1))
+	x_test = x_test.reshape((x_test.shape[0],-1))
+	# print('x_train shape:', x_train.shape)
+	# print(x_train[0],x_train.dtype)
+	n, D = x_train.shape    # (n_sample, n_feature)
+	x_train = np.divide(x_train,255.0)
+	x_test = np.divide(x_test,255.0)
+	
+	print("Load CIFAR10 dataset.")
+	print(x_train.shape[0], 'train samples')
+	print(x_test.shape[0], 'test samples')
+
+	d = np.int32(n / 2) * 2 # number of random features
+
+	# convert class vectors to binary class matrices
+	y_train = keras.utils.to_categorical(y_train, num_classes)
+	y_test = keras.utils.to_categorical(y_test, num_classes)
+	R = np.max(np.linalg.norm(x_train,2,axis=1))**2
+	return num_classes,x_train,x_test,y_train,y_test,n,D,R,d
\ No newline at end of file
diff --git a/dnn.py b/dnn.py
new file mode 100644
index 0000000..63741f4
--- /dev/null
+++ b/dnn.py
@@ -0,0 +1,103 @@
+'''Train a simple deep CNN on the CIFAR10 small images dataset.
+GPU run command with Theano backend (with TensorFlow, the GPU is automatically used):
+		THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatx=float32 python cifar10_cnn.py
+It gets down to 0.65 test logloss in 25 epochs, and down to 0.55 after 50 epochs.
+(it's still underfitting at that point, though).
+'''
+
+from __future__ import print_function
+import keras
+from keras.datasets import cifar10
+from keras.preprocessing.image import ImageDataGenerator
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+import numpy.linalg
+from numpy.linalg import svd
+import mnist
+import cifar
+import imdb
+batch_size = 16
+epochs = 50
+data_augmentation = False
+
+# The data, shuffled and split between train and test sets:
+num_classes,x_train,x_test,y_train,y_test,n,D,R,d = imdb.load()
+
+model = Sequential()
+#model.add(Flatten(input_shape=x_train.shape[1:]))
+layer1 = Dense(256,input_shape=(x_train.shape[1],))#
+#Conv2D(32, (3, 3),strides=(3,3),padding='same',
+#					)
+model.add(layer1)
+# model.add(Activation('relu'))
+# model.add(Dropout(0.5))
+# layer2 = Dense(128)
+#Conv2D(32, (3, 3))
+# model.add(layer2)
+# model.add(Activation('relu'))
+# model.add(Dropout(0.5))
+#model.add(MaxPooling2D(pool_size=(2, 2)))
+#model.add(Dropout(0.25))
+# layer3 = Dense(64)#Conv2D(64, (3, 3), padding='same')
+# model.add(layer3)
+# model.add(Activation('relu'))
+#model.add(Dropout(0.5))
+#model.add(Dense(100))
+#model.add(Activation('relu'))
+layer5 = Dense(num_classes)
+model.add(layer5)
+model.add(Activation('softmax'))
+model.summary()
+# initiate RMSprop optimizer
+opt = keras.optimizers.sgd(lr=0.01)
+
+# Let's train the model using RMSprop
+model.compile(loss='categorical_crossentropy',
+							optimizer=opt,
+							metrics=['accuracy'])
+
+
+num_classes,x_train,x_test,y_train,y_test,n,D,R,d = imdb.load()
+from numpy.random import shuffle
+
+
+model.fit(x_train, y_train,
+		batch_size=batch_size,
+		epochs=epochs,
+		validation_data=(x_test, y_test),
+		shuffle=True,
+		verbose=2)
+
+
+# spectrum1 = svd(layer1.get_weights()[0],compute_uv=False)
+# spectrum2 = svd(layer2.get_weights()[0],compute_uv=False)
+# spectrum3 = svd(layer3.get_weights()[0],compute_uv=False)
+# #spectrum4 = svd(numpy.reshape(layer4.get_weights()[0],(64*9,64)),compute_uv=False)
+# spectrum5 = svd(layer5.get_weights()[0],compute_uv=False)
+
+# numpy.set_printoptions(threshold=numpy.nan)
+# #print(layer1.get_weights()[0].dot(layer1.get_weights()[0].transpose()))
+# W = layer1.get_weights()[0].transpose()
+# print(W.shape)
+# print(W.dot(W.transpose()).shape)
+# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1))
+# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2])
+# W = layer2.get_weights()[0].transpose()
+# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1))
+# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2])
+# W = layer3.get_weights()[0].transpose()
+# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1))
+# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2])
+# #print(spectrum4)
+# W = layer5.get_weights()[0].transpose()
+# W = W/(numpy.linalg.norm(W,axis=1,ord=2).reshape(W.shape[0],1))
+# print(-numpy.sort(-(W.dot(W.transpose())).flatten())[W.shape[0]:W.shape[0]*3:2])
+
+# print(spectrum1)
+# print(spectrum2)
+# print(spectrum3)
+# print(spectrum5)
+
+#preds = model.predict_on_batch(x_test)
+#print("done",preds[0])
\ No newline at end of file
diff --git a/imdb.py b/imdb.py
new file mode 100644
index 0000000..c36cf24
--- /dev/null
+++ b/imdb.py
@@ -0,0 +1,47 @@
+import keras
+import numpy as np
+from keras.datasets import imdb
+from scipy.sparse import csr_matrix
+def load():
+	(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000,
+													  skip_top=30,
+													  maxlen=None)
+	indptr = [0]
+	indices = []
+	data = []
+	vocabulary = {}
+	for d in x_train:
+		for index in d:
+			indices.append(index)
+			data.append(1.0/len(d))
+		indptr.append(len(indices))
+	x_train = csr_matrix((data, indices, indptr), dtype=float).toarray()
+
+	indptr = [0]
+	indices = []
+	data = []
+	vocabulary = {}
+	for d in x_test:
+		for index in d:
+			indices.append(index)
+			data.append(1.0/len(d))
+		indptr.append(len(indices))
+	x_test = csr_matrix((data, indices, indptr), dtype=float).toarray()
+	print("Load IMDB dataset.")
+	print(x_train.shape[0], 'train samples')
+	print(x_test.shape[0], 'test samples')
+	print(x_train.shape[1],"features")
+	#x_train = keras.preprocessing.text.one_hot(x_train,10000)
+	print(x_train[0])
+	#x_test = keras.preprocessing.text.one_hot(x_test,10000)
+	n, D = x_train.shape    # (n_sample, n_feature)
+	d = np.int32(n / 2) * 2 # number of random features
+	num_classes = 2
+	# convert class vectors to binary class matrices
+	y_train = keras.utils.to_categorical(y_train, num_classes)
+	y_test = keras.utils.to_categorical(y_test, num_classes)
+	R = np.max(np.linalg.norm(x_train,2,axis=1))**2
+	# x_train/=R
+	# x_test/=R
+	# R = 1.0
+	return num_classes,x_train,x_test,y_train,y_test,n,D,R,d
\ No newline at end of file
diff --git a/kernels.py b/kernels.py
index ef6803f..61662dd 100644
--- a/kernels.py
+++ b/kernels.py
@@ -3,76 +3,237 @@
 from keras import backend as K
 
 def D2(X, Y):
-    """ Calculate the pointwise (squared) distance.
-    
-    Arguments:
-    	X: of shape (n_sample, n_feature).
-    	Y: of shape (n_center, n_feature).
-    
-    Returns:
-    	pointwise distances (n_sample, n_center).
-    """
-    XX = K.sum(K.square(X), axis = 1, keepdims=True)
-    if X is Y:
-        YY = XX
-    else:
-        YY = K.sum(K.square(Y), axis = 1, keepdims=True)
-    XY = K.dot(X, K.transpose(Y))
-    d2 = K.reshape(XX, (K.shape(X)[0], 1)) \
-       + K.reshape(YY, (1, K.shape(Y)[0])) \
-       - 2 * XY
-    return d2
+	""" Calculate the pointwise (squared) distance.
+	
+	Arguments:
+		X: of shape (n_sample, n_feature).
+		Y: of shape (n_center, n_feature).
+	
+	Returns:
+		pointwise distances (n_sample, n_center).
+	"""
+	XX = K.sum(K.square(X), axis = 1, keepdims=True)
+	if X is Y:
+		YY = XX
+	else:
+		YY = K.sum(K.square(Y), axis = 1, keepdims=True)
+	XY = K.dot(X, K.transpose(Y))
+	d2 = K.reshape(XX, (K.shape(X)[0], 1)) \
+	   + K.reshape(YY, (1, K.shape(Y)[0])) \
+	   - 2 * XY
+	return d2
+
+def Linear(X,Y):
+	return K.dot(X,K.transpose(Y))
 
 def Gaussian(X, Y, s):
-    """ Gaussian kernel.
-    
-    Arguments:
-    	X: of shape (n_sample, n_feature).
-    	Y: of shape (n_center, n_feature).
-    	s: kernel bandwidth.
-    
-    Returns:
-    	kernel matrix of shape (n_sample, n_center).
-    """
-    assert s > 0
-    
-    d2 = D2(X, Y)
-    gamma = np.float32(1. / (2 * s ** 2))
-    G = K.exp(-gamma * K.clip(d2, 0, None))
-    return G
+	""" Gaussian kernel.
+	
+	Arguments:
+		X: of shape (n_sample, n_feature).
+		Y: of shape (n_center, n_feature).
+		s: kernel bandwidth.
+	
+	Returns:
+		kernel matrix of shape (n_sample, n_center).
+	"""
+	assert s > 0
+	
+	d2 = D2(X, Y)
+	gamma = np.float32(1. / (2 * s ** 2))
+	G = K.exp(-gamma * K.clip(d2, 0, None))
+	return G
 
 def Laplace(X, Y, s):
-    """ Laplace kernel.
-    
-    Arguments:
-    	X: of shape (n_sample, n_feature).
-    	Y: of shape (n_center, n_feature).
-    	s: kernel bandwidth.
-    
-    Returns:
-    	kernel matrix of shape (n_sample, n_center).
-    """
-    assert s > 0
-    
-    d2 = K.clip(D2(X, Y), 0, None)
-    d = K.sqrt(d2)
-    G = K.exp(- d / s)
-    return G
+	""" Laplace kernel.
+	
+	Arguments:
+		X: of shape (n_sample, n_feature).
+		Y: of shape (n_center, n_feature).
+		s: kernel bandwidth.
+	
+	Returns:
+		kernel matrix of shape (n_sample, n_center).
+	"""
+	assert s > 0
+	
+	d2 = K.clip(D2(X, Y), 0, None)
+	d = K.sqrt(d2)
+	G = K.exp(- d / s)
+	return G
 
 def Cauchy(X, Y, s):
-    """ Cauchy kernel.
-    
-    Arguments:
-    	X: of shape (n_sample, n_feature).
-    	Y: of shape (n_center, n_feature).
-    	s: kernel bandwidth.
-    
-    Returns:
-    	kernel matrix of shape (n_sample, n_center).
-    """
-    assert s > 0
-    
-    d2 = D2(X, Y)
-    s2 = np.float32(s**2)
-    G = 1 / K.exp( 1 + K.clip(d2, 0, None) / s2)
-    return G
+	""" Cauchy kernel.
+	
+	Arguments:
+		X: of shape (n_sample, n_feature).
+		Y: of shape (n_center, n_feature).
+		s: kernel bandwidth.
+	
+	Returns:
+		kernel matrix of shape (n_sample, n_center).
+	"""
+	assert s > 0
+	
+	d2 = D2(X, Y)
+	s2 = np.float32(s**2)
+	G = 1 / K.exp( 1 + K.clip(d2, 0, None) / s2)
+	return G
+
+import tensorflow as tf
+
+def ReLu(X,Y,s):
+	#Francis Bach - Breaking the Curse of Dimensionality with Convex Neural Networks
+	s = np.float32(s)
+	XX = K.sum(K.square(X), axis = 1, keepdims=True)
+	if Y is X:
+		YY = XX
+	else:
+		YY = K.sum(K.square(Y), axis = 1, keepdims=True)
+	XX = K.reshape(XX, (K.shape(X)[0], 1))
+	YY = K.reshape(YY, (1,K.shape(Y)[0]))
+
+	xx = K.sqrt(XX/s+np.float32(1.0))
+	yy = K.sqrt(YY/s+np.float32(1.0))
+
+	XY = K.dot(X, K.transpose(Y))
+	
+	cos = (XY/s + np.float32(1.0)) / (K.dot(xx,yy)) #
+	cos = K.clip(cos,-1,1)
+	delta = tf.acos(cos)
+	pi = np.float32(3.14159265359)
+	d = K.int_shape(X)[1]
+	return K.dot(xx,yy)*((pi-delta)*K.cos(delta)+K.sin(delta))/(2*d*pi) #dot product, eg. macht nur das erste sinn.
+
+
+def InversePolynomial(X,Y,s):
+	# Zhang, Lee and Jordan - l1 regularized Neural Networks are Improperly Learniable in Polynomial Time
+	XY = K.dot(X, K.transpose(Y))
+	XX = K.sqrt(K.sum(K.square(X), axis = 1, keepdims=True))
+	if Y is X:
+		YY = XX
+	else:
+		YY = K.sqrt(K.sum(K.square(Y), axis = 1, keepdims=True))
+	XX = K.reshape(XX, (K.shape(X)[0], 1))
+	YY = K.reshape(YY, (1,K.shape(Y)[0]))
+	XY = XY/K.dot(XX,YY)
+	XY = K.clip(XY,-1,1)
+	for i in range(s):
+		XY = np.float(1)/(np.float32(2)-XY)
+		XY = K.clip(XY,0,1)
+	return XY
+
+
+def CompositionalFeedForwardKernel(X,Y,s):
+	# Amit Daniely, Roy Frostig, Yoram Singer - Toward Deeper Understanding of Neural Networks -- The Power of Initialization and a Dual View on Expressivity 
+	s = np.float32(s)
+	
+	XX = K.sqrt(K.sum(K.square(X), axis = 1, keepdims=True))
+	if Y is X:
+		YY = XX
+	else:
+		YY = K.sqrt(K.sum(K.square(Y), axis = 1, keepdims=True))
+
+	XX = K.reshape(XX, (K.shape(X)[0], 1))
+	YY = K.reshape(YY, (1,K.shape(Y)[0]))
+	
+	XY = K.dot(X, K.transpose(Y))
+	XY = tf.div(XY,K.dot(XX,YY))
+	XY = K.clip(XY,-1,1)
+	
+	pi = np.float32(3.14159265359)
+	d = K.int_shape(X)[1]
+	XY = XY/d
+	for i in range(s):
+		XY = (K.sqrt(1-K.square(XY))+tf.multiply((pi-tf.acos(XY)),XY))/pi
+		#XY = K.clip(XY,-1,1)
+	return XY
+
+def ArcCosine(X,Y,s,bias=0.0):
+	# Youngmin Cho, Lawrence K. Saul - Kernel Methods for Deep Learning
+	# k(x,y) = \frac 1 \pi ||x||\cdot||y|| \cdot \left( \sin \theta + (\pi - \theta)\cos \theta\right)
+	#bias should be as large as R
+	if s<=1:
+		XX = K.sqrt(bias+K.sum(K.square(X), axis = 1, keepdims=True))
+		if Y is X:
+			YY = XX
+		else:
+			YY = K.sqrt(bias+K.sum(K.square(Y), axis = 1, keepdims=True))
+		XX = K.reshape(XX, (K.shape(X)[0], 1))
+		YY = K.reshape(YY, (1,K.shape(Y)[0]))
+		
+		XY = K.dot(X, K.transpose(Y))+bias
+		XY = tf.div(XY,K.dot(XX,YY))
+		XY = K.clip(XY,-1,1)
+		theta = tf.acos(XY)
+		pi = np.float32(3.14159265359)
+		
+		return (K.dot(XX,YY) * (K.sin(theta) + (pi-theta) * K.cos(theta)))/pi
+	else:
+		XX = K.sqrt(bias+K.sum(K.square(X), axis = 1, keepdims=True))
+		if Y is X:
+			YY = XX
+		else:
+			YY = K.sqrt(bias+K.sum(K.square(Y), axis = 1, keepdims=True))
+		XX = K.reshape(XX, (K.shape(X)[0],1))
+		YY = K.reshape(YY, (1,K.shape(Y)[0]))
+		XY = ArcCosine(X,Y,s-1)+bias
+		XY = tf.div(XY,K.dot(XX,YY))
+		XY = K.clip(XY,-1,1)
+		theta = tf.acos(XY)
+		pi = np.float32(3.14159265359)
+		
+		return (K.dot(XX,YY) * (K.sin(theta) + (pi-theta) * K.cos(theta)))/pi
+
+
+def Convolution(X,Y,kernel_size=(4,4)):
+	# XX = K.sqrt(K.sum(K.square(X), axis = 1, keepdims=False))
+	# if Y is X:
+	# 	YY = XX
+	# else:
+	# 	YY = K.sqrt(K.sum(K.square(Y), axis = 1, keepdims=False))
+
+	# # XX = K.reshape(XX, (K.shape(X)[0]))
+	# # YY = K.reshape(YY, (K.shape(Y)[0], 1))
+	# print(K.int_shape(X),K.int_shape(XX))
+	# Xnorm = K.dot(tf.diag(1.0/XX),X)
+	# Ynorm = K.dot(tf.diag(1.0/YY),Y)
+
+	xs = []
+	ys = []
+	pi = np.float32(3.14159265359)
+	print("before reshape",K.int_shape(X))
+	x = K.reshape(X,(-1,28,28))
+	y = K.reshape(Y,(-1,28,28))
+	print("after reshape",K.int_shape(x))
+	output_row = (K.int_shape(x)[1]-kernel_size[0])>>1+1
+	output_col = (K.int_shape(x)[2]-kernel_size[1])>>1+1
+	feature_dim = kernel_size[0]*kernel_size[1]
+	data_format = 'channels_last'
+	stride = 2
+	for i in range(output_row):
+		for j in range(output_col):
+			slice_row = slice(i * stride,
+							  i * stride + kernel_size[0])
+			slice_col = slice(j * stride,
+							  j * stride + kernel_size[1])
+			if data_format == 'channels_first':
+				xs.append(K.reshape(x[:, slice_row, slice_col],
+								  (1, -1,feature_dim)))
+			else:
+				xs.append(K.reshape(x[:, slice_row, slice_col],
+								  (-1,1,feature_dim)))
+				ys.append(K.reshape(y[:, slice_row, slice_col],
+								  (-1,1,feature_dim)))
+
+	x_aggregate =K.concatenate(xs, axis=1)
+	y_aggregate =K.concatenate(ys, axis=1)
+	#1st dimension: example. 2nd dimension: position in output, 3rd dimension: position in patch
+	print("after patching", K.int_shape(x_aggregate),K.shape(x_aggregate))
+	XY = tf.einsum("ijk,ljk->ilj",x_aggregate,y_aggregate)/feature_dim
+	#normalization is missing...crap this is never going to work. why isn't acos crashing?
+	#I should be able to figure this out...just some angles...hate thinking in tensors...
+	xy = (K.sqrt(1-K.square(XY))+tf.multiply((pi-tf.acos(XY)),XY))/pi
+	print("After einsum",K.int_shape(XY))
+	hidden = tf.reduce_sum(xy,2)/(output_row*output_col)
+	return (K.sqrt(1-K.square(hidden))+tf.multiply((pi-tf.acos(hidden)),hidden))/pi
diff --git a/mnist.py b/mnist.py
index 02e614b..fc776b0 100644
--- a/mnist.py
+++ b/mnist.py
@@ -1,6 +1,6 @@
 import numpy as np
-
-from keras.datasets.mnist import load_data
+import keras
+from keras.datasets.mnist import load_data as mnist_data
 
 
 def unit_range_normalize(X):
@@ -11,13 +11,19 @@ def unit_range_normalize(X):
 	SX = (X - min_) / diff_
 	return SX
 
-def load():
+def prod(x):
+	s = 1
+	for xx in x:
+		s*=xx
+	return s
+
+def load(convolution=False):
 	# input image dimensions
 	img_rows, img_cols = 28, 28
 	
 	# the data, shuffled and split between train and test sets
-	(x_train, y_train), (x_test, y_test) = load_data()
-	
+	(x_train, y_train), (x_test, y_test) = mnist_data()
+	#if not convolution:
 	x_train = x_train.reshape(x_train.shape[0], img_rows * img_cols)
 	x_test = x_test.reshape(x_test.shape[0], img_rows * img_cols)
 	
@@ -29,5 +35,17 @@ def load():
 	print("Load MNIST dataset.")
 	print(x_train.shape[0], 'train samples')
 	print(x_test.shape[0], 'test samples')
-
-	return (x_train, y_train), (x_test, y_test)
+	print(x_train[0])
+	# convert class vectors to binary class matrices
+	num_classes = 10
+	y_train = keras.utils.to_categorical(y_train, num_classes)
+	y_test = keras.utils.to_categorical(y_test, num_classes)
+	R = np.max(np.linalg.norm(x_train,2,axis=1))**2
+	
+	n = x_train.shape[0]    # (n_sample, n_feature)
+	D = prod(x_train.shape[1:])
+	d = np.int32(n / 2) * 2 # number of random features
+	x_train = keras.utils.normalize(x_train,axis=1)
+	x_test = keras.utils.normalize(x_test,axis=1)
+	print(x_train[0].shape)
+	return num_classes,x_train,x_test,y_train,y_test,n,D,R,d
diff --git a/run_expr.py b/run_expr.py
index f2ba50e..4acf7c3 100644
--- a/run_expr.py
+++ b/run_expr.py
@@ -12,6 +12,7 @@
 import numpy as np
 import time
 import warnings
+from math import sqrt
 
 from distutils.version import StrictVersion
 from keras.layers import Dense, Input
@@ -21,10 +22,16 @@
 import kernels
 import mnist
 import utils
+import cifar
+import imdb
 
 from backend_extra import hasGPU
 from layers import KernelEmbedding, RFF
 from optimizers import PSGD, SGD
+import tensorflow as tf
+tf.logging.set_verbosity(tf.logging.ERROR)
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
 
 assert StrictVersion(keras.__version__) >= StrictVersion('2.0.8'), \
        "Requires Keras (>=2.0.8)."
@@ -37,11 +44,15 @@
 
 assert keras.backend.backend() == u'tensorflow', \
        "Requires Tensorflow (>=1.2.1)."
-assert hasGPU(), "Requires GPU."
+#assert hasGPU(), "Requires GPU."
 
 parser = argparse.ArgumentParser(description='Run EigenPro tests.')
 parser.add_argument('--kernel', type=str, default='Gaussian',
                     help='kernel function (e.g. Gaussian, Laplace, and Cauchy)')
+parser.add_argument('--dataset', type=str, default='mnist',
+                    help='dataset used for evaluation (e.g. mnist, cifar10, imdb (more to follow))')
+parser.add_argument('--depth', type=int, default=1,
+                    help='the depth of the neural network that is mimiced by the kernel (if supported by kernel)')
 args = parser.parse_args()
 args_dict = vars(args)
 
@@ -51,67 +62,72 @@
 M = 4800            # (EigenPro) subsample size
 k = 160             # (EigenPro) top-k eigensystem
 
-num_classes = 10	    # number of classes
+if args_dict['dataset'] == "mnist":
+  num_classes,x_train,x_test,y_train,y_test,n,D,R,d = mnist.load()
+elif args_dict['dataset'] == "cifar10":
+  num_classes,x_train,x_test,y_train,y_test,n,D,R,d = cifar.load()
+elif args_dict['dataset'] == "imdb":
+  num_classes,x_train,x_test,y_train,y_test,n,D,R,d = imdb.load()
 
-(x_train, y_train), (x_test, y_test) = mnist.load()
-n, D = x_train.shape    # (n_sample, n_feature)
-d = np.int32(n / 2) * 2 # number of random features
 
-# convert class vectors to binary class matrices
-y_train = keras.utils.to_categorical(y_train, num_classes)
-y_test = keras.utils.to_categorical(y_test, num_classes)
 
 if args_dict['kernel'] == 'Gaussian':
-    s = 5   # kernel bandwidth
+    s = sqrt(D)   # kernel bandwidth
     kernel = lambda x,y: kernels.Gaussian(x, y, s)
 
 elif args_dict['kernel'] == 'Laplace':
     s = np.sqrt(10, dtype=np.float32)
     kernel = lambda x,y: kernels.Laplace(x, y, s)
-
+elif args_dict['kernel'] == 'Linear':
+    kernel = lambda x,y: kernels.Linear(x, y)
 elif args_dict['kernel'] == 'Cauchy':
     s = np.sqrt(40, dtype=np.float32)
     kernel = lambda x,y: kernels.Cauchy(x, y, s)
-
+elif args_dict['kernel'] == 'ReLu':
+    print("AAAH",np.linalg.norm(x_train,2,axis=1).shape)
+    s = np.max(np.linalg.norm(x_train,2,axis=1))**2
+    kernel = lambda x,y: kernels.ReLu(x, y, s)
+elif args_dict['kernel'] == 'InversePolynomial':
+    s = int(args_dict["depth"])
+    kernel = lambda x,y: kernels.InversePolynomial(x, y, s)
+elif args_dict['kernel'] == 'CompositionalFeedForwardKernel':
+    s = int(args_dict["depth"])
+    kernel = lambda x,y: kernels.CompositionalFeedForwardKernel(x, y, s)
+elif args_dict['kernel'] == 'ArcCosine':
+    s = int(args_dict["depth"])
+    kernel = lambda x,y: kernels.ArcCosine(x, y, s, R)
+elif args_dict['kernel'] == 'Convolution':
+    s = int(args_dict["depth"])
+    kernel = lambda x,y: kernels.Convolution(x, y)
 else:
     raise Exception("Unknown kernel function - %s. \
                      Try Gaussian, Laplace, or Cauchy"
                     % args_dict['kernel'])
-
+print("Using ",args_dict["kernel"], "with Depth" ,args_dict["depth"])
 trainers = collections.OrderedDict()
 Trainer = collections.namedtuple('Trainer', ['model', 'x_train', 'x_test'])
 
 
+print(x_train.shape)
+
 # Calculate step size and (Primal) EigenPro preconditioner.
 kf, scale, s0 = utils.asm_eigenpro_f(
     x_train, kernel, M, k, 1, in_rkhs=True)
 eta = np.float32(1.5 / s0) # 1.5 / s0
 eta = eta * num_classes # correction due to mse loss
 
-# Assemble Pegasos trainer.
+# # Assemble Pegasos trainer.
 input_shape = (D+1,) # n_feature, (sample) index
-ix = Input(shape=input_shape, dtype='float32', name='indexed-feat')
+ix = Input(shape=(input_shape), dtype='float32', name='indexed-feat')
 x, index = utils.separate_index(ix)	# features, sample_id
 kfeat = KernelEmbedding(kernel, x_train,
-                        input_shape=(D,))(x)
-y = Dense(num_classes, input_shape=(n,),
-          activation='linear',
-          kernel_initializer='zeros',
-          use_bias=False)(kfeat)
-
-model = Model(ix, y)
-model.compile(loss='mse',
-              optimizer=PSGD(pred_t=y, index_t=index, eta=eta),
-              metrics=['accuracy'])
-trainers['Pegasos'] = Trainer(model=model,
-                              x_train = utils.add_index(x_train),
-                              x_test=utils.add_index(x_test))
-
+                         input_shape=(D,))(x)
 # Assemble kernel EigenPro trainer.
 embed = Model(ix, kfeat)
 y = Dense(num_classes, input_shape=(n,),
           activation='linear',
           kernel_initializer='zeros',
+          #kernel_regularizer=keras.regularizers.l2(0.1),
           use_bias=False)(kfeat)
 model = Model(ix, y)
 model.compile(loss='mse',
@@ -124,51 +140,18 @@
                               x_train = utils.add_index(x_train),
                               x_test=utils.add_index(x_test))
 
-
-# Assemble SGD trainer.
-rff_weights = np.float32(       # for Gaussian kernel
-    np.sqrt(2. / (2 * 5 ** 2))  # s = 5
-    * np.random.randn(D, d/2))
-input_shape = (D,)
-x = Input(shape=input_shape, dtype='float32', name='feat')
-rf_f = RFF(rff_weights, input_shape=input_shape)
-y = Dense(num_classes, input_shape=(d,),
-          activation='linear',
-          kernel_initializer='zeros',
-          use_bias=False)(rf_f(x))
-model = Model(x, y)
-
-model.compile(loss='mse',
-              optimizer=SGD(eta=eta),
-              metrics=['accuracy'])
-trainers['SGD with random Fourier feature'] = Trainer(
-	model=model, x_train = x_train,	x_test=x_test)
-
-# Assemble EigenPro trainer.
-f, scale, _ = utils.asm_eigenpro_f(
-	x_train, rf_f, M, k, .25)
-y = Dense(num_classes, input_shape=(d,),
-          activation='linear',
-          kernel_initializer='zeros',
-          use_bias=False)(rf_f(x))
-model = Model(x, y)
-model.compile(loss='mse',
-              optimizer=SGD(eta=scale*eta, eigenpro_f=f),
-              metrics=['accuracy'])
-trainers['EigenPro with random Fourier feature'] = Trainer(
-	model=model, x_train = x_train,	x_test=x_test)
-
 # Start training.
-for name, trainer in trainers.iteritems():
+for name, trainer in trainers.items():
     print("")
     initial_epoch=0
     np.random.seed(1) # Keras uses numpy random number generator
     train_ts = 0 # training time in seconds
-    for epoch in [1, 2, 5, 10, 20, 40]:
+    for epoch in [100]:
         start = time.time()
+        print("Running ",epoch," Iterations on ",name)
         trainer.model.fit(
 			trainer.x_train, y_train,
-			batch_size=bs, epochs=epoch, verbose=0,
+			batch_size=bs, epochs=epoch, verbose=2,
 			validation_data=(trainer.x_test, y_test),
 			initial_epoch=initial_epoch)
         train_ts += time.time() - start
diff --git a/utils.py b/utils.py
index d94b0dc..c4c7b80 100644
--- a/utils.py
+++ b/utils.py
@@ -19,6 +19,7 @@ def add_index(X):
         matrix of shape (n_sample, n_feat+1).
     """
     inx = np.reshape(np.arange(X.shape[0]), (-1, 1))
+    print("inx",inx.shape)
     return np.hstack([X, inx])
 
 def separate_index(IX):
@@ -54,6 +55,7 @@ def rsvd(X, phi, M, k):
         sk: (k+1)-th largest eigenvalue of phi(X).
     """
     n, _ = X.shape
+    print("rsvd",X.shape)
     index = np.random.choice(n, M, replace=False)
     A = phi(X[index])
 
@@ -69,7 +71,11 @@ def rsvd(X, phi, M, k):
     sk = np.sqrt(n / M) * S1[k]
     V = VT1[:k].T
     return s, V, sk
-
+def prod(x):
+    s = 1
+    for xx in x:
+        s*=xx
+    return s
 def asm_eigenpro_f(feat, phi, M, k, tau, in_rkhs=False):
     """Assemble eigenpro map and calculate step size scale factor
     such that the update rule,
@@ -91,10 +97,11 @@ def asm_eigenpro_f(feat, phi, M, k, tau, in_rkhs=False):
     """
 
     start = time.time()
-    n, D = feat.shape
+    n = feat.shape[0]
+    D = prod(feat.shape[1:])
     x = Input(shape=(D,), dtype='float32', name='feat')
     if in_rkhs:
-        if n >= 10**5:
+        if n >= 10**4:
             _s, _V = nystrom_kernel_svd(feat, phi, M, k) # phi is k(x, y)
         else:
             kfeat = KernelEmbedding(phi, feat,
@@ -107,8 +114,8 @@ def asm_eigenpro_f(feat, phi, M, k, tau, in_rkhs=False):
         fmap = lambda _x: model.predict(_x, batch_size=1024)
         _s, _V, _sk = rsvd(feat, fmap, M, k) # phi is a feature map
     _s, _sk, _V = _s[:k], _s[-1], _V[:, :k]
-    print("SVD time: %.2f, Eigenvalue ratio: %.2f" %
-          (time.time() - start, _s[0] / _sk))
+    print("SVD time: %.2f, Eigenvalue ratio: %.2f %.2f %.8f" %
+          (time.time() - start, _s[0] / _sk,_s[0] ,_sk))
 
     s = K.constant(_s)
     V = K.constant(_V)