-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkmeans.py
More file actions
111 lines (95 loc) · 4.82 KB
/
kmeans.py
File metadata and controls
111 lines (95 loc) · 4.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/python
from __future__ import print_function
from time import time
import numpy as np
import matplotlib.pyplot as plt
import pdb
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
# np.random.seed(42)  # uncomment to make the random initialisations reproducible

# .npz archive holding the feature matrix under key 'x' and the vector of
# ground-truth labels under key 'y' (generated by 'main.py').
dataset_file = 'dataset7000.npz'
print(dataset_file)

# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open; read both arrays inside a ``with`` block so the handle is closed
# as soon as the data is in memory.
with np.load(dataset_file) as dataset:
    data, labels = (dataset['x'], dataset['y'])
data = np.asarray(data, dtype=np.float32)

# scaled_data = scale(data)
# Scaling is intentionally left disabled: per the original author, the
# generated matrix consists of 0s and 1s and is too sparse for scaling to be
# recommended.

n_samples, n_features = data.shape
n_clusters = len(np.unique(labels))  # one cluster per distinct ground-truth label
# n_clusters = 2  # uncomment to set the number of clusters manually

print("n_clusters: %d, \t n_samples %d, \t n_features %d"
      % (n_clusters, n_samples, n_features))

# Header row for the result lines printed by bench_k_means.
print(79 * '_')
print('% 9s' % 'init'
      ' time inertia homo compl v-meas ARI AMI')
def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one row of benchmark results.

    The printed row contains, in order: ``name``, the wall-clock time spent
    in ``estimator.fit``, the fitted ``inertia_``, and five clustering scores
    (homogeneity, completeness, V-measure, adjusted Rand index, adjusted
    mutual information) computed between the module-level ``labels`` and the
    estimator's ``labels_``.

    Args:
        estimator: Object exposing ``fit(data)`` and, once fitted, the
            ``inertia_`` and ``labels_`` attributes.
        name: Text shown in the first column of the printed row.
        data: Samples handed to ``estimator.fit``.

    Returns:
        None. The result row is written to standard output.
    """
    started = time()
    estimator.fit(data)
    # Only the fit itself is timed; scoring happens after the clock is read.
    elapsed = time() - started

    inertia = estimator.inertia_
    predicted = estimator.labels_
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    scores = tuple(scorer(labels, predicted) for scorer in scorers)

    row = (name, elapsed, inertia) + scores
    print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f' % row)
# Benchmark k-means on the full feature matrix with both stochastic
# initialisation strategies, each configured with n_init=100.
for init_method in ('k-means++', 'random'):
    bench_k_means(
        KMeans(init=init_method, n_clusters=n_clusters, n_init=100,
               verbose=0, tol=0.0),
        name=init_method,
        data=data,
    )

# Here the seeding of the centers is deterministic (the PCA components are
# used as the initial centroids), hence k-means is run only once, with
# n_init=1.
pca = PCA(n_components=n_clusters).fit(data)
pca_seeded = KMeans(init=pca.components_, n_clusters=n_clusters, n_init=1,
                    verbose=0, tol=0.0)
bench_k_means(pca_seeded, name="PCA-based", data=data)
# Set to False to skip the benchmark on PCA-reduced data.
RUN_REDUCED_BENCHMARK = True

if RUN_REDUCED_BENCHMARK:
    print(79 * '_')
    print('Performaing dimensionality reduction:')

    # Fit a full PCA to obtain the explained-variance ratio of every component.
    pca = PCA(n_components=None).fit(data)
    var_percentage = 0.99  # fraction of total variance that must be exceeded

    # Pick the smallest number of leading components whose cumulative
    # explained-variance ratio is strictly greater than var_percentage.
    # searchsorted(..., side='right') yields the 0-based position of the first
    # cumulative value above the threshold; a component *count* is one more
    # than that position. Passing the bare position as n_components would drop
    # the component that actually crosses the threshold (and would request
    # zero components when the first one alone is sufficient).
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    index = int(np.searchsorted(cumulative_variance, var_percentage,
                                side='right')) + 1
    # Guard against the threshold never being crossed: fall back to keeping
    # every available component instead of asking for more than exist.
    index = min(index, len(cumulative_variance))

    reduced_data = PCA(n_components=index).fit_transform(data)

    print('Performaing clustering on dimensionality reduced data:')
    bench_k_means(KMeans(init="k-means++", n_clusters=n_clusters, n_init=100,
                         verbose=0, tol=0.0),
                  name="k-means++", data=reduced_data)
    bench_k_means(KMeans(init='random', n_clusters=n_clusters, n_init=100,
                         verbose=0, tol=0.0),
                  name="random", data=reduced_data)
    # Number of principal components that were kept.
    print('Dimension: '+str(index))
###############################################################################
# Visualize the results on PCA-reduced data (2D)

# Project the data onto its first two principal components and cluster the
# 2-D projection.
pca2 = PCA(n_components=2).fit(data)
reduced_data = pca2.transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=120,
                verbose=0, tol=0.0).fit(reduced_data)
print(79 * '_')

# Plot the decision boundary. For that, a color is assigned to each point of
# a mesh covering [x_min, x_max] x [y_min, y_max].
# mesh_step is the step size of the mesh; decrease it to increase the quality
# of the VQ.
mesh_step = .02
x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
grid_x, grid_y = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                             np.arange(y_min, y_max, mesh_step))

# Obtain the cluster index of every mesh point from the model trained above,
# then shape the result like the mesh so it can be drawn as an image.
mesh_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
mesh_clusters = kmeans.predict(mesh_points).reshape(grid_x.shape)

plt.figure(1)
plt.clf()
plt.imshow(mesh_clusters, interpolation='nearest',
           extent=(grid_x.min(), grid_x.max(), grid_y.min(), grid_y.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

# Overlay the samples, colored by ground-truth label.
# NOTE: only labels 0, 1, 2 and 3 are drawn; samples carrying any other label
# value are not plotted.
label_styles = ((0, 'k.'), (1, 'r.'), (2, 'b.'), (3, 'g.'))
for label_value, point_style in label_styles:
    members = labels == label_value
    plt.plot(reduced_data[members, 0], reduced_data[members, 1],
             point_style, markersize=3)

# Plot the centroids as a white X.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=169, linewidths=5,
            color='w', zorder=10)

plt.title('K-means clustering on the TDT2 dataset (PCA-reduced data)\n Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()