diff --git a/notebooks/Unsupervised_Learning.ipynb b/notebooks/Unsupervised_Learning.ipynb deleted file mode 100644 index 7a91bd1a..00000000 --- a/notebooks/Unsupervised_Learning.ipynb +++ /dev/null @@ -1,643 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "288e6a20", - "metadata": {}, - "source": [ - "# Unsupervised Learning Methods in TopologicPy and the Role of Graphs" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "788e90ae", - "metadata": {}, - "outputs": [], - "source": [ - "# This cell is not needed if you have pip installed topologicpy\n", - "import sys\n", - "sys.path.append(\"C:/Users/sarwj/OneDrive - Cardiff University/Documents/GitHub/topologicpy/src\")" - ] - }, - { - "cell_type": "markdown", - "id": "a60be9e7", - "metadata": {}, - "source": [ - "## Import the needed libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8c307920", - "metadata": {}, - "outputs": [], - "source": [ - "from topologicpy.Vertex import Vertex\n", - "from topologicpy.Edge import Edge\n", - "from topologicpy.Wire import Wire\n", - "from topologicpy.Face import Face\n", - "from topologicpy.Shell import Shell\n", - "from topologicpy.Cell import Cell\n", - "from topologicpy.Cluster import Cluster\n", - "from topologicpy.Topology import Topology\n", - "from topologicpy.Dictionary import Dictionary\n", - "from topologicpy.Helper import Helper\n", - "from topologicpy.Grid import Grid\n", - "from topologicpy.Graph import Graph\n", - "from topologicpy.Color import Color" - ] - }, - { - "cell_type": "markdown", - "id": "c850d680", - "metadata": {}, - "source": [ - "## Check the TopologicPy Version" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c59158f", - "metadata": {}, - "outputs": [], - "source": [ - "print(Helper.Version())" - ] - }, - { - "cell_type": "markdown", - "id": "097be6e4", - "metadata": {}, - "source": [ - "## Utility Function to Reset the Face Dictionaries" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fe7d679f", - "metadata": {}, - "outputs": [], - "source": [ - "def reset_dictionaries(shell):\n", - " faces = Topology.Faces(shell)\n", - " for i, f in enumerate(faces):\n", - " d = Topology.Dictionary(f)\n", - " keys = Dictionary.Keys(d)\n", - " for key in keys:\n", - " if not key == \"face_id\":\n", - " d = Dictionary.RemoveKey(d, key)\n", - " f = Topology.SetDictionary(f, d)\n", - "\n", - "def transfer_dicts_by_key(topologies, selectors, key):\n", - " dicts = {}\n", - " for t in topologies:\n", - " d = Topology.Dictionary(t)\n", - " value = Dictionary.ValueAtKey(d, key, None)\n", - " if value:\n", - " dicts[str(value)] = t\n", - " \n", - " for s in selectors:\n", - " d = Topology.Dictionary(s)\n", - " value = Dictionary.ValueAtKey(d, key, None)\n", - " if value:\n", - " f = dicts[str(value)]\n", - " f = Topology.SetDictionary(f, d)\n" - ] - }, - { - "cell_type": "markdown", - "id": "0a70e879", - "metadata": {}, - "source": [ - "## 1. Import the gallery floor plan" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "35acd393", - "metadata": {}, - "outputs": [], - "source": [ - "gallery = Topology.ByBREPPath(r\"C:\\Users\\sarwj\\OneDrive - Cardiff University\\Documents\\GitHub\\topologicpy\\assets\\MachineLearning\\gallery.brep\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07291ea5", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(gallery, camera=[0,0,6], faceColor=[210,210,250], faceOpacity=1, edgeColor=\"white\", edgeWidth=3, showVertices=False, backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "8f895060", - "metadata": {}, - "source": [ - "## 2. Create a grid overlay" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1c85940e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "214.505355 141.063941\n" - ] - } - ], - "source": [ - "b_r = Wire.BoundingRectangle(gallery)\n", - "d = Topology.Dictionary(b_r)\n", - "xmin = Dictionary.ValueAtKey(d, \"xmin\")\n", - "xmax = Dictionary.ValueAtKey(d, \"xmax\")\n", - "ymin = Dictionary.ValueAtKey(d, \"ymin\")\n", - "ymax = Dictionary.ValueAtKey(d, \"ymax\")\n", - "width = Dictionary.ValueAtKey(d, \"width\")\n", - "length = Dictionary.ValueAtKey(d, \"length\")\n", - "print(width, length)\n", - "uRange = list(range(0,int(width)+3,3))\n", - "vRange = list(range(0,int(length)+3,3))\n", - "\n", - "grid = Grid.EdgesByDistances(gallery, clip=True, uRange=uRange, vRange=vRange)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1f0fba1", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(gallery, grid, camera=[0,0,6], faceColor=[210,210,250], edgeColor=\"grey\", edgeWidth=3, faceOpacity=1, showVertices=False, backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "c203cdff", - "metadata": {}, - "source": [ - "## 3. Slice the floor plan with the grid to create a topologic shell" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "93908245", - "metadata": {}, - "outputs": [], - "source": [ - "shell = Topology.Slice(gallery, grid)\n", - "faces = Topology.Faces(shell)\n", - "for i, f in enumerate(faces):\n", - " d = Dictionary.ByKeyValue(\"face_id\", \"face_\"+str(i+1))\n", - " f = Topology.SetDictionary(f, d)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dcfb863b", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(shell, camera=[0,0,6], faceColor=[210,210,250], edgeColor=\"black\", edgeWidth=3, faceOpacity=0.6, showVertices=False, backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "9696c22a", - "metadata": {}, - "source": [ - "## 4. Derive navigation and analysis graphs from the shell" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "cfb4c857", - "metadata": {}, - "outputs": [], - "source": [ - "navigation_graph = Graph.ByTopology(shell, direct=False, viaSharedTopologies=True)\n", - "analysis_graph = Graph.ByTopology(shell)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "6639f476", - "metadata": {}, - "outputs": [], - "source": [ - "g_verts = Graph.Vertices(analysis_graph)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a25ff457", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(analysis_graph, camera=[0,0,6], vertexSize=4, vertexColor=\"red\", edgeColor=\"lightgrey\", backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "a6f1fa62", - "metadata": {}, - "source": [ - "## 5. Analyse Graph: (Examples: Shortest Path and Closeness Centrality/Integration)" - ] - }, - { - "cell_type": "markdown", - "id": "cc42ed72", - "metadata": {}, - "source": [ - "### Shortest Path (Use navigation graph)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "631b5ae3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Duration: 2.03\n", - "Duration: 2.02\n", - "Duration: 2.07\n", - "Duration: 2.1\n" - ] - } - ], - "source": [ - "import time\n", - "\n", - "start_vertex = Vertex.ByCoordinates(xmin+2, ymax-2,0) # Upper left corner\n", - "end_vertex = Vertex.ByCoordinates(xmax-2,ymin+2,0) # Lower right corner\n", - "\n", - "crg = Graph.CompiledRoutingGraph(navigation_graph, precomputeTurns=True)\n", - "path1 = Graph.ShortestPath(crg, vertexA=start_vertex, vertexB=end_vertex)\n", - "for i in range(4):\n", - " start = time.time()\n", - " shortest_path = Graph.ShortestPath(crg, vertexA=start_vertex, vertexB=end_vertex)\n", - " end = time.time()\n", - " print(\"Duration:\", round(end-start, 2))\n", - "\n", - "# straight_path = Wire.Straighten(shortest_path, host=gallery)\n", - "# print(\"Original Path Length:\", round(Wire.Length(shortest_path), 2))\n", - "# print(\"Straightened Path Length:\", round(Wire.Length(straight_path), 2))\n", - "# edges = Topology.Edges(shortest_path)\n", - "# for edge in edges:\n", - "# d = Dictionary.ByKeysValues([\"width\", \"color\"], [7, \"red\"])\n", - "# edge = Topology.SetDictionary(edge, d)\n", - "# edges = Topology.Edges(straight_path)\n", - "# for edge in edges:\n", - "# d = Dictionary.ByKeysValues([\"width\", \"color\"], [7, \"blue\"])\n", - "# edge = Topology.SetDictionary(edge, d)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01efbf31", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(shell, shortest_path, straight_path, camera=[0,0,6], faceColor=[210,210,250], edgeColorKey=\"color\", edgeWidthKey=\"width\", backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "d645c057", - "metadata": {}, - "source": [ - "### Closeness Centrality/Integration\n", - "* Closeness centrality is a graph metric that quantifies how close a node is to all other nodes by taking the reciprocal of the sum of its shortest path distances to every other node in the network.\n", - "* In space syntax, closeness centrality corresponds to global integration, measuring how spatially accessible or topologically shallow a space is within a configuration, thereby indicating its potential for movement flow and encounter density." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50f8404f", - "metadata": {}, - "outputs": [], - "source": [ - "centrality_list = Graph.ClosenessCentrality(analysis_graph, colorScale=\"thermal\")" - ] - }, - { - "cell_type": "markdown", - "id": "fc24ef9a", - "metadata": {}, - "source": [ - "## 6. Transfer the information from the graph back to the shell" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a4a5b5b", - "metadata": {}, - "outputs": [], - "source": [ - "reset_dictionaries(shell)\n", - "faces = Topology.Faces(shell)\n", - "_ = transfer_dicts_by_key(faces, g_verts, \"face_id\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3f13640", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(faces, faceColorKey=\"cc_color\", faceOpacity=1, showEdges=False, showVertices=False, camera=[0,0,6], backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "2041be46", - "metadata": {}, - "source": [ - "## K-Means\n", - "* K-means clustering is an unsupervised learning algorithm that partitions a dataset of n observations into K disjoint clusters by minimizing the within-cluster sum of squared distances (WCSS) between data points and their assigned cluster centroid." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6413eb22", - "metadata": {}, - "outputs": [], - "source": [ - "k = 35\n", - "clusters = Cluster.KMeans(g_verts, k=k, keys=[\"x\", \"y\"])\n", - "colors = [Color.ByValueInRange(x, minValue=0, maxValue=k-1, colorScale=\"thermal\") for x in range(k)]\n", - "for i, cluster in enumerate(clusters):\n", - " color = colors[i]\n", - " c_verts = Topology.Vertices(cluster)\n", - " for c_vert in c_verts:\n", - " d = Topology.Dictionary(c_vert)\n", - " d = Dictionary.SetValuesAtKeys(d, [\"km_id\", \"km_color\"], [i, color])\n", - " c_vert = Topology.SetDictionary(c_vert, d)" - ] - }, - { - "cell_type": "markdown", - "id": "d054d5ec", - "metadata": {}, - "source": [ - "### 6. Transfer the information from the grid back to the shell" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5e46275", - "metadata": {}, - "outputs": [], - "source": [ - "reset_dictionaries(shell)\n", - "shell = Topology.TransferDictionariesBySelectors(shell, selectors=g_verts, tranFaces=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05678ec3", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(shell, faceColorKey=\"km_color\", faceOpacity=1, edgeWidthKey=\"width\", showVertices=False, camera=[0,0,5], backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "ddde4fe4", - "metadata": {}, - "source": [ - "## DBSCAN\n", - "* DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is an unsupervised clustering algorithm that groups points based on local density rather than distance to a centroid. It defines clusters as contiguous regions of high point density separated by regions of low density." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "814fc667", - "metadata": {}, - "outputs": [], - "source": [ - "spiral_1 = Wire.Spiral(radiusA=1, radiusB=5, height=10, turns=2)\n", - "verts = []\n", - "for i in range(0, 101, 1):\n", - " v = Wire.VertexByParameter(spiral_1, i*0.01)\n", - " verts.append(v)\n", - "spiral_2 = Topology.Rotate(spiral_1, angle=120)\n", - "for i in range(0, 101, 1):\n", - " v = Wire.VertexByParameter(spiral_2, i*0.01)\n", - " verts.append(v)\n", - "spiral_3 = Topology.Rotate(spiral_1, angle=240)\n", - "for i in range(0, 101, 1):\n", - " v = Wire.VertexByParameter(spiral_3, i*0.01)\n", - " verts.append(v)\n", - "Topology.Show(verts, vertexSize=12, backgroundColor=\"white\", width=500, height=500)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb8a8a51", - "metadata": {}, - "outputs": [], - "source": [ - "dis = Vertex.Distance(verts[0], verts[1])\n", - "result = Cluster.DBSCAN(verts, keys=[\"x\", \"y\", \"z\"], epsilon=dis*1.1, minSamples=2)\n", - "clusters = result[0]\n", - "print(\"Final Number of Clusters:\", len(clusters))\n", - "colors = [\"red\", \"blue\", \"green\"]\n", - "for i, cluster in enumerate(clusters):\n", - " color = colors[i]\n", - " c_verts = Topology.Vertices(cluster)\n", - " for c_vert in c_verts:\n", - " d = Topology.Dictionary(c_vert)\n", - " d = Dictionary.SetValueAtKey(d, \"db_color\", color)\n", - " c_vert = Topology.SetDictionary(c_vert, d)\n", - "Topology.Show(verts, vertexColorKey=\"db_color\", vertexSize=14, camera=[2,0,0.5], backgroundColor=\"white\", width=500, height=500)" - ] - }, - { - "cell_type": "markdown", - "id": "c0dfa30e", - "metadata": {}, - "source": [ - "## 5. Analyse the graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "311df5a5", - "metadata": {}, - "outputs": [], - "source": [ - "n_clusters = 0\n", - "attempts = 1\n", - "epsilon = 1\n", - "while n_clusters < 15 and attempts < 20:\n", - " result = Cluster.DBSCAN(g_verts, keys=[\"x\", \"y\"], epsilon=epsilon, minSamples=10)\n", - " clusters = result[0]\n", - " n_clusters = len(clusters)\n", - " print(\"Attempt:\", attempts)\n", - " print(\" Epsilon:\", round(epsilon, 2), \"---> Clusters:\", n_clusters)\n", - " attempts += 1\n", - " epsilon += 1\n", - "print(\"Final Number of Clusters:\", len(clusters))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "beac31a7", - "metadata": {}, - "outputs": [], - "source": [ - "colors = [Color.ByValueInRange(x, minValue=0, maxValue=len(clusters)-1, colorScale=\"thermal\") for x in range(len(clusters))]\n", - "for i, cluster in enumerate(clusters):\n", - " color = colors[i]\n", - " c_verts = Topology.Vertices(cluster)\n", - " for c_vert in c_verts:\n", - " d = Topology.Dictionary(c_vert)\n", - " d = Dictionary.SetValuesAtKeys(d, [\"db_id\", \"db_color\"], [i, color])\n", - " c_vert = Topology.SetDictionary(c_vert, d)" - ] - }, - { - "cell_type": "markdown", - "id": "1269f366", - "metadata": {}, - "source": [ - "## 6. Transfer the information from the graph back to the shell" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4af7c5fe", - "metadata": {}, - "outputs": [], - "source": [ - "reset_dictionaries(shell)\n", - "shell = Topology.TransferDictionariesBySelectors(shell, selectors=g_verts, tranFaces=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e38729b", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(shell, faceColorKey=\"db_color\", faceOpacity=1, edgeWidthKey=\"width\", showVertices=False, camera=[0,0,5], backgroundColor=\"black\", width=800, height=600)" - ] - }, - { - "cell_type": "markdown", - "id": "21743913", - "metadata": {}, - "source": [ - "## Graph Community Partition (Louvain)\n", - "### Graph community partition using the Louvain method is an unsupervised algorithm for detecting communities in a network by maximising modularity, a measure of the density of edges within groups compared to edges between groups." - ] - }, - { - "cell_type": "markdown", - "id": "e4387090", - "metadata": {}, - "source": [ - "## 5. Analyse the graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85dcaf08", - "metadata": {}, - "outputs": [], - "source": [ - "_ = Graph.CommunityPartition(graph, colorScale=\"thermal\")" - ] - }, - { - "cell_type": "markdown", - "id": "b5868352", - "metadata": {}, - "source": [ - "## 6. Transfer the information from the graph back to the shell" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1dd62162", - "metadata": {}, - "outputs": [], - "source": [ - "reset_dictionaries(shell)\n", - "shell = Topology.TransferDictionariesBySelectors(shell, selectors=g_verts, tranFaces=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bb66a90", - "metadata": {}, - "outputs": [], - "source": [ - "Topology.Show(shell, faceColorKey=\"cp_color\", faceOpacity=1, edgeWidthKey=\"width\", showVertices=False, camera=[0,0,5], backgroundColor=\"black\", width=800, height=600)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/topologicpy/Cluster.py b/src/topologicpy/Cluster.py index b1c4bb7f..f3ebaacd 100644 --- a/src/topologicpy/Cluster.py +++ b/src/topologicpy/Cluster.py @@ -633,6 +633,453 @@ def expand_cluster_3d_indices(labels, dists, point_index, neighbors, cluster_id, tp_noise = Cluster.ByTopologies([topologyList[i] for i in noise]) return tp_clusters, tp_noise + @staticmethod + def HDBSCAN(topologies, selectors=None, keys=["x", "y", "z"], minClusterSize: int = 5, minSamples: int = None, clusterSelectionMethod: str = "eom", allowSingleCluster: bool = False): + """ + Clusters the input topologies based on the Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) method. + See https://en.wikipedia.org/wiki/HDBSCAN + + Unlike DBSCAN, HDBSCAN does not require an epsilon parameter, making it more robust for datasets with clusters of varying density. + It builds a hierarchy of clusters and extracts the most stable ones automatically. + + Parameters + ---------- + topologies : list + The input list of topologies to be clustered. + selectors : list , optional + If the list of topologies are not vertices then please provide a corresponding list of selectors (vertices) that represent the topologies for clustering. For example, these can be the centroids of the topologies. + If set to None, the list of topologies is expected to be a list of vertices. Default is None. + keys : list, optional + The keys in the embedded dictionaries in the topologies. If specified, the values at these keys will be added to the dimensions to be clustered. The values must be numeric. If you wish the x, y, z location to be included, + make sure the keys list includes "X", "Y", and/or "Z" (case insensitive). Default is ["x", "y", "z"] + minClusterSize : int , optional + The minimum number of points required to form a cluster. Default is 5. + minSamples : int , optional + The number of neighbors used to compute the core distance for each point. If set to None, it defaults to the value of minClusterSize. Default is None. + clusterSelectionMethod : str , optional + The method used to select clusters from the condensed tree. Options are "eom" (Excess of Mass, default) which maximizes total cluster stability, or "leaf" which selects the leaf clusters. Default is "eom". + allowSingleCluster : bool , optional + Whether to allow the algorithm to return a single cluster. When False (default), the root cluster cannot + dominate its children in the EOM selection, preventing trivial single-cluster results. Default is False. + + Returns + ------- + list, list + The list of clusters and the list of topologies considered to be noise if any (otherwise returns None). + + """ + from topologicpy.Vertex import Vertex + from topologicpy.Topology import Topology + from topologicpy.Dictionary import Dictionary + + def _compute_core_distances(dists, k): + """Compute core distance for each point: distance to k-th nearest neighbor.""" + n = dists.shape[0] + core_dists = np.empty(n) + for i in range(n): + sorted_dists = np.sort(dists[i]) + # sorted_dists[0] is distance to self (0), so k-th neighbor is at index k + if k < n: + core_dists[i] = sorted_dists[k] + else: + core_dists[i] = sorted_dists[-1] + return core_dists + + def _mutual_reachability_matrix(dists, core_dists): + """Compute mutual reachability distance: max(core(a), core(b), dist(a,b)).""" + n = dists.shape[0] + mrd = np.copy(dists) + for i in range(n): + for j in range(i + 1, n): + val = max(core_dists[i], core_dists[j], dists[i, j]) + mrd[i, j] = val + mrd[j, i] = val + return mrd + + def _prim_mst(mrd): + """Build minimum spanning tree using Prim's algorithm. Returns sorted edges.""" + n = mrd.shape[0] + in_tree = np.zeros(n, dtype=bool) + min_edge = np.full(n, np.inf) + min_edge_from = np.full(n, -1, dtype=int) + edges = [] + + # Start from node 0 + in_tree[0] = True + for j in range(n): + if j != 0: + min_edge[j] = mrd[0, j] + min_edge_from[j] = 0 + + for _ in range(n - 1): + # Find the closest node not yet in tree + candidates = np.where(~in_tree)[0] + best = candidates[np.argmin(min_edge[candidates])] + edges.append((min_edge[best], min_edge_from[best], best)) + in_tree[best] = True + + # Update distances + for j in range(n): + if not in_tree[j] and mrd[best, j] < min_edge[j]: + min_edge[j] = mrd[best, j] + min_edge_from[j] = best + + # Sort edges by weight + edges.sort(key=lambda x: x[0]) + return edges + + def _build_hierarchy_and_condensed_tree(mst_edges, n_points, min_cluster_size): + """ + Build a full dendrogram from MST, then walk it top-down to create the condensed tree. + Returns (condensed_tree, cluster_node_set) where condensed_tree is a list of + (parent_cluster, child, lambda_val, child_size) tuples. + """ + # Step 1: Build full dendrogram using union-find (bottom-up) + parent_uf = list(range(2 * n_points)) + size_uf = [1] * (2 * n_points) + # dendrogram: for each internal node (n_points + i), store (left_child, right_child, weight, size) + dendrogram = {} + + def find(x): + while parent_uf[x] != x: + parent_uf[x] = parent_uf[parent_uf[x]] + x = parent_uf[x] + return x + + next_id = n_points + node_map = {} # UF root -> dendrogram node id + for i in range(n_points): + node_map[i] = i + + for weight, u, v in mst_edges: + root_u = find(u) + root_v = find(v) + if root_u == root_v: + continue + + left = node_map[root_u] + right = node_map[root_v] + left_size = size_uf[root_u] + right_size = size_uf[root_v] + new_node = next_id + next_id += 1 + + dendrogram[new_node] = (left, right, weight, left_size, right_size) + + # Union + parent_uf[root_u] = new_node + parent_uf[root_v] = new_node + parent_uf[new_node] = new_node + size_uf[new_node] = left_size + right_size + node_map[find(new_node)] = new_node + + root = next_id - 1 + + # Step 2: Collect all leaf points beneath each dendrogram node + leaves_cache = {} + + def get_leaves(node): + if node in leaves_cache: + return leaves_cache[node] + if node < n_points: + leaves_cache[node] = [node] + return [node] + left, right, w, ls, rs = dendrogram[node] + result = get_leaves(left) + get_leaves(right) + leaves_cache[node] = result + return result + + # Step 3: Walk dendrogram top-down to build condensed tree + condensed_tree = [] # (parent_cluster, child, lambda_val, child_size) + cluster_nodes = set() + next_cluster_id = [n_points] # mutable counter + + def walk(dendro_node, cluster_id): + """Walk the dendrogram top-down, building the condensed tree.""" + if dendro_node < n_points: + return + if dendro_node not in dendrogram: + return + + left, right, weight, left_size, right_size = dendrogram[dendro_node] + lambda_val = 1.0 / weight if weight > 0 else float('inf') + + if left_size >= min_cluster_size and right_size >= min_cluster_size: + # Real split: both children become new clusters + new_left_id = next_cluster_id[0] + next_cluster_id[0] += 1 + new_right_id = next_cluster_id[0] + next_cluster_id[0] += 1 + + cluster_nodes.add(new_left_id) + cluster_nodes.add(new_right_id) + + condensed_tree.append((cluster_id, new_left_id, lambda_val, left_size)) + condensed_tree.append((cluster_id, new_right_id, lambda_val, right_size)) + + walk(left, new_left_id) + walk(right, new_right_id) + + elif left_size >= min_cluster_size: + # Right is too small - its points fall out as noise + for pt in get_leaves(right): + condensed_tree.append((cluster_id, pt, lambda_val, 1)) + # Left continues with same cluster_id + walk(left, cluster_id) + + elif right_size >= min_cluster_size: + # Left is too small - its points fall out as noise + for pt in get_leaves(left): + condensed_tree.append((cluster_id, pt, lambda_val, 1)) + # Right continues with same cluster_id + walk(right, cluster_id) + + else: + # Both too small - all points fall out as noise + for pt in get_leaves(left): + condensed_tree.append((cluster_id, pt, lambda_val, 1)) + for pt in get_leaves(right): + condensed_tree.append((cluster_id, pt, lambda_val, 1)) + + # Root cluster + root_cluster = n_points # Use first available cluster id for root + next_cluster_id[0] = n_points + 1 + cluster_nodes.add(root_cluster) + walk(root, root_cluster) + + return condensed_tree, cluster_nodes + + def _compute_stability_and_extract(condensed_tree, cluster_nodes, n_points, method, allow_single_cluster=False): + """ + Compute stability for each cluster and extract final clusters. + Returns labels array. + """ + # Find lambda_birth for each cluster (when it appeared as a child in the condensed tree) + lambda_birth = {} + children_of = {} # cluster -> list of (child, lambda_val, size) + + for parent, child, lam, sz in condensed_tree: + if child in cluster_nodes: + lambda_birth[child] = lam + if parent not in children_of: + children_of[parent] = [] + children_of[parent].append((child, lam, sz)) + + # Root cluster birth = 0 + for node in cluster_nodes: + if node not in lambda_birth: + lambda_birth[node] = 0.0 + + # Compute stability for each cluster: + # stability = sum over all individual points falling out of (lambda_point - lambda_birth) + stability = {node: 0.0 for node in cluster_nodes} + for parent, child, lam, sz in condensed_tree: + if parent in cluster_nodes and child < n_points: + # Individual point falling out + birth = lambda_birth.get(parent, 0.0) + stability[parent] += (lam - birth) + + # Collect points belonging to each cluster node + # A cluster's points = all individual points that fall out of it + points in child clusters + cluster_points = {node: set() for node in cluster_nodes} + for parent, child, lam, sz in condensed_tree: + if parent in cluster_nodes and child < n_points: + cluster_points[parent].add(child) + + # Add points from child clusters (recursively) + def get_all_points(node): + pts = set(cluster_points.get(node, set())) + if node in children_of: + for ch, lam, sz in children_of[node]: + if ch in cluster_nodes: + pts |= get_all_points(ch) + return pts + + all_points = {node: get_all_points(node) for node in cluster_nodes} + + if method == "leaf": + # Leaf clusters: cluster nodes that have no cluster children + leaf_clusters = [] + for node in cluster_nodes: + child_clusters = [] + if node in children_of: + child_clusters = [ch for ch, lam, sz in children_of[node] if ch in cluster_nodes] + if len(child_clusters) == 0: + leaf_clusters.append(node) + + labels = np.full(n_points, -1, dtype=int) + for cluster_id, node in enumerate(sorted(leaf_clusters)): + for pt in all_points[node]: + if labels[pt] == -1: + labels[pt] = cluster_id + return labels + + # EOM method: bottom-up selection + selected = {node: True for node in cluster_nodes} + stab = dict(stability) + + # Identify the root cluster (the one with no parent in the condensed tree) + root_cluster = min(cluster_nodes) + + # Process bottom-up (largest id first = leaves before parents) + for node in sorted(cluster_nodes, reverse=True): + if node not in children_of: + continue + child_clusters = [ch for ch, lam, sz in children_of[node] if ch in cluster_nodes] + if len(child_clusters) == 0: + continue + + children_stab_sum = sum(stab.get(ch, 0.0) for ch in child_clusters) + node_stab = stab.get(node, 0.0) + + # When allow_single_cluster is False, the root can never beat its children + children_win = children_stab_sum > node_stab + if not allow_single_cluster and node == root_cluster: + children_win = True + + if children_win: + # Children are more stable - deselect parent, propagate stability up + selected[node] = False + stab[node] = children_stab_sum + else: + # Parent is more stable - deselect all descendant clusters + def deselect(n): + selected[n] = False + if n in children_of: + for ch, lam, sz in children_of[n]: + if ch in cluster_nodes: + deselect(ch) + for ch in child_clusters: + deselect(ch) + + # Collect selected clusters (skip root if it's the only one selected) + result_clusters = sorted([n for n in cluster_nodes if selected[n]]) + + # If only the root is selected, that means no meaningful sub-clusters were found + # Still return it as one cluster + labels = np.full(n_points, -1, dtype=int) + for cluster_id, node in enumerate(result_clusters): + for pt in all_points[node]: + if labels[pt] == -1: + labels[pt] = cluster_id + + return labels + + # --- Input validation (same pattern as DBSCAN) --- + if not isinstance(topologies, list): + print("Cluster.HDBSCAN - Error: The input topologies parameter is not a valid list. Returning None.") + return None, None + topologyList = [t for t in topologies if Topology.IsInstance(t, "Topology")] + if len(topologyList) < 1: + print("Cluster.HDBSCAN - Error: The input topologies parameter does not contain any valid topologies. Returning None.") + return None, None + + if minSamples is None: + minSamples = minClusterSize + + if len(topologyList) < minClusterSize: + print("Cluster.HDBSCAN - Error: The input minClusterSize parameter cannot be larger than the number of topologies. Returning None.") + return None, None + + if not isinstance(selectors, list): + check_vertices = [t for t in topologyList if not Topology.IsInstance(t, "Vertex")] + if len(check_vertices) > 0: + print("Cluster.HDBSCAN - Error: The input selectors parameter is not a valid list and this is needed since the list of topologies contains objects of type other than a topologic_core.Vertex. Returning None.") + return None, None + else: + selectors = [s for s in selectors if Topology.IsInstance(s, "Vertex")] + if len(selectors) < 1: + check_vertices = [t for t in topologyList if not Topology.IsInstance(t, "Vertex")] + if len(check_vertices) > 0: + print("Cluster.HDBSCAN - Error: The input selectors parameter does not contain any valid vertices and this is needed since the list of topologies contains objects of type other than a topologic_core.Vertex. Returning None.") + return None, None + if not len(selectors) == len(topologyList): + print("Cluster.HDBSCAN - Error: The input topologies and selectors parameters do not have the same length. Returning None.") + return None, None + if not isinstance(keys, list): + print("Cluster.HDBSCAN - Error: The input keys parameter is not a valid list. Returning None.") + return None, None + + if clusterSelectionMethod not in ("eom", "leaf"): + print("Cluster.HDBSCAN - Error: The clusterSelectionMethod must be 'eom' or 'leaf'. Returning None.") + return None, None + + # --- Feature extraction (same pattern as DBSCAN) --- + data = [] + if selectors is None: + for t in topologyList: + elements = [] + if keys: + d = Topology.Dictionary(t) + for key in keys: + if key.lower() == "x": + value = Vertex.X(t) + elif key.lower() == "y": + value = Vertex.Y(t) + elif key.lower() == "z": + value = Vertex.Z(t) + else: + value = Dictionary.ValueAtKey(d, key) + if value is not None: + elements.append(value) + data.append(elements) + else: + for i, s in enumerate(selectors): + elements = [] + if keys: + d = Topology.Dictionary(topologyList[i]) + for key in keys: + if key.lower() == "x": + value = Vertex.X(s) + elif key.lower() == "y": + value = Vertex.Y(s) + elif key.lower() == "z": + value = Vertex.Z(s) + else: + value = Dictionary.ValueAtKey(d, key) + if value is not None: + elements.append(value) + data.append(elements) + + # --- HDBSCAN algorithm --- + data_array = np.array(data) + n_points = data_array.shape[0] + + # Step 1: Compute pairwise distances + dists = squareform(pdist(data_array)) + + # Step 2: Compute core distances + core_dists = _compute_core_distances(dists, minSamples) + + # Step 3: Compute mutual reachability distance matrix + mrd = _mutual_reachability_matrix(dists, core_dists) + + # Step 4: Build minimum spanning tree + mst_edges = _prim_mst(mrd) + + # Step 5: Build condensed tree from hierarchy + condensed_tree, cluster_nodes = _build_hierarchy_and_condensed_tree(mst_edges, n_points, minClusterSize) + + # Step 6: Extract clusters + labels = _compute_stability_and_extract(condensed_tree, cluster_nodes, n_points, clusterSelectionMethod, allowSingleCluster) + + # --- Build output clusters (same pattern as DBSCAN) --- + unique_labels = sorted(set(labels)) + tp_clusters = [] + for lab in unique_labels: + if lab == -1: + continue + indices = list(np.where(labels == lab)[0]) + if len(indices) > 0: + tp_clusters.append(Cluster.ByTopologies([topologyList[i] for i in indices])) + + noise_indices = list(np.where(labels == -1)[0]) + tp_noise = None + if len(noise_indices) > 0: + tp_noise = Cluster.ByTopologies([topologyList[i] for i in noise_indices]) + + return tp_clusters, tp_noise + @staticmethod def Edges(cluster) -> list: """