-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathk_means.php
More file actions
89 lines (79 loc) · 1.93 KB
/
k_means.php
File metadata and controls
89 lines (79 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
<?php
/**
 * Clusters the documents of a weight matrix with k-means and returns, for
 * each cluster, the IDs of the documents that ended up in it.
 *
 * Seeds are drawn at random from the documents themselves, then documents are
 * re-assigned and centroids recomputed until the centroids stop changing or
 * the iteration cap is reached.
 *
 * @param array $matrix        Map of document ID => weight vector.
 * @param int   $k             Number of seed centroids to pick (default 5).
 * @param int   $maxIterations Maximum assign/update rounds (default 10).
 * @return array Map of cluster ID => list of document IDs. Empty when
 *               $matrix is empty or $k / $maxIterations is below 1.
 */
function getClusterMap($matrix, $k = 5, $maxIterations = 10)
{
    $cluster_map = array();
    $k = (int) $k;
    $maxIterations = (int) $maxIterations;

    // Nothing to cluster: return early instead of indexing into an empty matrix.
    if (empty($matrix) || $k < 1 || $maxIterations < 1) {
        return $cluster_map;
    }

    // Pick the seeds from the matrix's actual keys, without repetition.
    // Duplicate seeds would only produce clusters that can never win a
    // document, and the keys are not guaranteed to be 0..n-1.
    // array_rand() returns a bare key when asked for one, hence the cast.
    $seed_ids = (array) array_rand($matrix, min($k, count($matrix)));
    $centroids = array();
    foreach ($seed_ids as $seed_id) {
        $centroids[] = $matrix[$seed_id];
    }

    // Assign documents to clusters and update the centroids until the
    // centroids are stable or the iteration cap is hit.
    $clusters = array();
    for ($i = 0; $i < $maxIterations; $i++) {
        $previous = $centroids;
        $clusters = assignDocuments($matrix, $centroids);
        $centroids = updateCentroids($matrix, $clusters);
        // Loose comparison on purpose: only key/value pairs matter here,
        // not the order in which the clusters were first populated.
        if ($centroids == $previous) {
            break;
        }
    }

    // Drop the weights and keep only the document IDs of each cluster.
    foreach ($clusters as $clusterID => $cluster) {
        $cluster_map[$clusterID] = array_keys($cluster);
    }

    return $cluster_map;
}
/**
 * Assigns every document to the centroid it scores highest against.
 *
 * The score is whatever getCosineSim() returns for the (centroid, document)
 * pair; on a tie the centroid that comes first in $centroids wins.
 *
 * @param array $matrix    Map of document ID => weight vector.
 * @param array $centroids Map of centroid ID => weight vector.
 * @return array Map of centroid ID => (document ID => weight vector). Only
 *               centroids that received at least one document appear.
 *               Empty when $matrix or $centroids is empty.
 */
function assignDocuments($matrix, $centroids)
{
    $clusters = array();

    foreach ($matrix as $docID => $doc) {
        // Start with "no score yet" rather than a fixed sentinel: the score
        // is not bounded below by -1, so a sentinel could leave a document
        // without any centroid.
        $best_sim = null;
        $closest_centroid = null;

        foreach ($centroids as $centroidID => $centroid) {
            $cos_sim = getCosineSim($centroid, $doc);
            if ($best_sim === null || $cos_sim > $best_sim) {
                $best_sim = $cos_sim;
                $closest_centroid = $centroidID;
            }
        }

        // No centroids were supplied: there is nothing to assign to.
        if ($closest_centroid === null) {
            continue;
        }

        $clusters[$closest_centroid][$docID] = $doc;
    }

    return $clusters;
}
//update centroids
/**
 * Recomputes each cluster's centroid as the component-wise mean of the
 * documents assigned to it.
 *
 * @param array $matrix   Map of document ID => weight vector. Only used to
 *                        determine the number of dimensions (taken from its
 *                        first vector); vectors are read at indices 0..n-1.
 * @param array $clusters Map of cluster ID => (document ID => weight vector).
 * @return array Map of cluster ID => centroid vector. Clusters without
 *               documents are omitted; empty when $matrix or $clusters is empty.
 */
function updateCentroids($matrix, $clusters)
{
    $new_centroids = array();
    if (empty($matrix)) {
        return $new_centroids;
    }

    // The dimensionality is the same for every cluster, so compute it once.
    // reset() is used instead of $matrix[0] so non-zero-based keys work too.
    $first_doc = reset($matrix);
    $dimensions = count($first_doc);

    foreach ($clusters as $clusterID => $cluster) {
        $cluster_size = count($cluster);
        // An empty cluster has no mean; skipping it also avoids dividing by zero.
        if ($cluster_size === 0) {
            continue;
        }

        $new_centroid = array();
        for ($i = 0; $i < $dimensions; $i++) {
            $total_weight = 0;
            foreach ($cluster as $doc) {
                $total_weight += $doc[$i];
            }
            // Mean over the documents in the cluster, not over the dimensions.
            $new_centroid[$i] = $total_weight / $cluster_size;
        }
        $new_centroids[$clusterID] = $new_centroid;
    }

    return $new_centroids;
}
//returns dot product of each vector
/**
 * Scores a document against a centroid.
 *
 * The result is the plain dot product taken over the centroid's dimensions;
 * no length normalisation is applied, so it only equals the cosine similarity
 * when both vectors have unit length.
 *
 * @param array $centroid Map of dimension => weight.
 * @param array $doc      Map of dimension => weight, read at the centroid's keys.
 * @return int|float Sum of the per-dimension products (0 for an empty centroid).
 */
function getCosineSim($centroid, $doc)
{
    // Multiply the two weights dimension by dimension, in centroid key order.
    $products = array_map(
        function ($dimension) use ($centroid, $doc) {
            return $centroid[$dimension] * $doc[$dimension];
        },
        array_keys($centroid)
    );

    return array_sum($products);
}
?>