#!/usr/bin/python
# kmeans.py -- from the vishwarajanand/PersonalPythonSource repository on GitHub (master branch).
from __future__ import print_function
from time import time
import numpy as np
import matplotlib.pyplot as plt
import pdb
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# np.random.seed(42)  # uncomment to make the random initialisations reproducible

# NumPy .npz archive generated by 'main.py'; it holds the feature matrix and
# the vector of ground-truth labels.
dataset_file = 'dataset7000.npz'
print(dataset_file)

# NOTE(review): the original comment names the matrix 'X', but the key read
# below is lower-case 'x' -- confirm against what main.py actually saves.
# The archive is used as a context manager so its file handle is closed as
# soon as both arrays have been read into memory.
with np.load(dataset_file) as dataset:
    data, labels = (dataset['x'], dataset['y'])
data = np.asarray(data, dtype=np.float32)

# scaled_data = scale(data)  # would be used for PCA; per the original author
# the matrix consists of 0s and 1s and is very sparse, so scaling is not
# recommended.

n_samples, n_features = data.shape
n_clusters = len(np.unique(labels))  # one cluster per distinct label
# n_clusters = 2  # uncomment to set the number of clusters manually

print("n_clusters: %d, \t n_samples %d, \t n_features %d"
      % (n_clusters, n_samples, n_features))
print(79 * '_')
# Header row of the benchmark table. The two adjacent string literals are
# concatenated before '%' is applied, so the whole header is printed as-is.
print('% 9s' % 'init'
      '        time     inertia    homo   compl  v-meas     ARI AMI')

def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one row of the benchmark table.

    The row contains ``name``, the wall-clock fit time in seconds, the
    estimator's ``inertia_`` and five clustering scores (homogeneity,
    completeness, V-measure, adjusted Rand index, adjusted mutual
    information). The scores compare the module-level ground-truth
    ``labels`` with the estimator's ``labels_``.

    Args:
        estimator: Clustering estimator exposing ``fit``, ``inertia_`` and
            ``labels_`` (a ``KMeans`` instance in this script).
        name: Short label printed in the first column.
        data: Samples to cluster; passed straight to ``estimator.fit``.
    """
    started = time()
    estimator.fit(data)
    # Stop the clock before scoring so only the fit itself is timed.
    elapsed = time() - started

    predicted = estimator.labels_
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    scores = tuple(scorer(labels, predicted) for scorer in scorers)

    row = (name, elapsed, estimator.inertia_) + scores
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f' % row)


# Benchmark both randomised seeding strategies on the raw data, each run with
# n_init=100; the init name doubles as the row label in the table.
for init_name in ('k-means++', 'random'):
    estimator = KMeans(init=init_name, n_clusters=n_clusters, n_init=100,
                       verbose=0, tol=0.0)
    bench_k_means(estimator, name=init_name, data=data)

# Here the seeding of the centers is deterministic: the principal components
# found by PCA are used as the initial centroids, so the k-means algorithm is
# run only once (n_init=1).
pca = PCA(n_components=n_clusters).fit(data)
pca_seeded = KMeans(init=pca.components_, n_clusters=n_clusters, n_init=1,
                    verbose=0, tol=0.0)
bench_k_means(pca_seeded, name="PCA-based", data=data)

# Manual toggle: set to False to skip the benchmark on PCA-reduced data.
if True:
    print(79 * '_')
    print('Performaing dimensionality reduction:')
    # Fit a full PCA first to learn how much variance each component explains.
    pca = PCA(n_components=None).fit(data)
    var_percentage = 0.99  # fraction of total variance that must be retained

    # Running total of explained variance over the leading components.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    # searchsorted(side='right') gives the 0-based position of the first
    # component whose running total exceeds the threshold. The number of
    # components to keep is that position + 1; passing the bare position
    # (as the original loop did) keeps one component too few, so the
    # retained variance falls short of the threshold.
    index = int(np.searchsorted(cumulative_variance, var_percentage,
                                side='right')) + 1
    # If rounding keeps the running total from ever exceeding the threshold,
    # fall back to keeping every component.
    index = min(index, len(cumulative_variance))
    # Variance actually retained by the selected components.
    variance = float(cumulative_variance[index - 1])

    reduced_data = PCA(n_components=index).fit_transform(data)
    print('Performaing clustering on dimensionality reduced data:')
    bench_k_means(KMeans(init="k-means++", n_clusters=n_clusters, n_init=100, verbose=0, tol=0.0), name="k-means++", data=reduced_data)
    bench_k_means(KMeans(init='random', n_clusters=n_clusters, n_init=100, verbose=0, tol=0.0), name="random", data=reduced_data)
    print('Dimension: '+str(index))

###############################################################################
# Visualize the results on PCA-reduced data (2D)

# Project the data onto its first two principal components and cluster the
# 2-D projection, so that both the points and the cluster regions can be drawn.
pca2 = PCA(n_components=2).fit(data)
reduced_data = pca2.transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=120, verbose=0, tol=0.0)
kmeans.fit(reduced_data)
print(79 * '_')

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

# Overlay the samples, colored by their ground-truth label. The palette is
# cycled over every distinct label so that no class is silently left out when
# the dataset has more (or differently numbered) labels than 0..3; for labels
# 0..3 the colors are black, red, blue and green, as before. White is left out
# of the palette because it is reserved for the centroids.
label_colors = ('k', 'r', 'b', 'g', 'm', 'c', 'y')
for position, label_value in enumerate(np.unique(labels)):
    members = labels == label_value
    color = label_colors[position % len(label_colors)]
    plt.plot(reduced_data[members, 0], reduced_data[members, 1],
             color + '.', markersize=3)

# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=5, color='w', zorder=10)
plt.title('K-means clustering on the TDT2 dataset (PCA-reduced data)\n Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()