From 8259e11169a719b9914e6a16c625a15a7bc22275 Mon Sep 17 00:00:00 2001
From: PRERITARYA <preritarya251@gmail.com>
Date: Tue, 23 Jun 2026 11:04:33 +0530
Subject: [PATCH 1/3] Add DBSCAN clustering algorithm in machine_learning/

---
 machine_learning/dbscan.py | 202 +++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 machine_learning/dbscan.py

diff --git a/machine_learning/dbscan.py b/machine_learning/dbscan.py
new file mode 100644
index 000000000000..18e92c3f8ecf
--- /dev/null
+++ b/machine_learning/dbscan.py
@@ -0,0 +1,202 @@
+"""
+DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
+
+A density-based clustering algorithm that groups together points that are
+closely packed together, while marking points in low-density regions as outliers.
+
+Unlike K-Means, DBSCAN:
+- Does NOT require specifying the number of clusters in advance
+- Can find clusters of arbitrary shapes
+- Is robust to outliers (labels them as noise, cluster id = -1)
+
+Key Parameters:
+    epsilon (eps): The maximum distance between two points to be considered neighbors
+    min_points: Minimum number of points to form a dense region (core point)
+
+Point Types:
+    - Core point:    Has at least `min_points` neighbors within `epsilon` distance
+    - Border point:  Within `epsilon` of a core point, but has fewer than
+                     `min_points` neighbors
+    - Noise point:   Neither core nor border — labeled as -1
+
+Time Complexity:  O(n²) with brute-force neighbor search
+Space Complexity: O(n)
+
+References:
+    - https://en.wikipedia.org/wiki/DBSCAN
+    - Ester, M., et al. "A density-based algorithm for discovering clusters."
+      KDD 1996. https://dl.acm.org/doi/10.5555/3001460.3001507
+"""
+
+
+def euclidean_distance(point_a: list[float], point_b: list[float]) -> float:
+    """
+    Compute the Euclidean distance between two points in n-dimensional space.
+
+    >>> euclidean_distance([0.0, 0.0], [3.0, 4.0])
+    5.0
+    >>> euclidean_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
+    0.0
+    >>> euclidean_distance([0.0], [5.0])
+    5.0
+    >>> euclidean_distance([0.0, 0.0], [1.0])
+    Traceback (most recent call last):
+        ...
+    ValueError: Both points must have the same number of dimensions.
+    """
+    if len(point_a) != len(point_b):
+        raise ValueError("Both points must have the same number of dimensions.")
+    return sum((a - b) ** 2 for a, b in zip(point_a, point_b)) ** 0.5
+
+
+def get_neighbors(
+    data: list[list[float]], point_index: int, epsilon: float
+) -> list[int]:
+    """
+    Return indices of all points within epsilon distance of data[point_index].
+
+    >>> data = [[0.0, 0.0], [0.1, 0.1], [5.0, 5.0]]
+    >>> get_neighbors(data, 0, 0.5)
+    [0, 1]
+    >>> get_neighbors(data, 2, 0.5)
+    [2]
+    >>> get_neighbors(data, 0, 10.0)
+    [0, 1, 2]
+    """
+    return [
+        index
+        for index, point in enumerate(data)
+        if euclidean_distance(data[point_index], point) <= epsilon
+    ]
+
+
+def dbscan(
+    data: list[list[float]],
+    epsilon: float,
+    min_points: int,
+) -> list[int]:
+    """
+    Perform DBSCAN clustering on a dataset.
+
+    Args:
+        data:       List of n-dimensional data points, e.g. [[x1,y1], [x2,y2], ...]
+        epsilon:    Maximum distance between two points to be considered neighbors.
+                    Must be greater than 0.
+        min_points: Minimum number of neighbors (including self) to be a core point.
+                    Must be at least 1.
+
+    Returns:
+        A list of integer cluster labels, one per input point.
+        Noise points are labeled -1.
+        Cluster IDs start from 0.
+
+    Raises:
+        ValueError: If data is empty.
+        ValueError: If epsilon is not positive.
+        ValueError: If min_points is less than 1.
+
+    Example — two well-separated clusters:
+    >>> data = [
+    ...     [1.0, 1.0], [1.1, 1.0], [1.0, 1.1],
+    ...     [9.0, 9.0], [9.1, 9.0], [9.0, 9.1],
+    ... ]
+    >>> labels = dbscan(data, epsilon=0.5, min_points=2)
+    >>> len(set(labels))  # two clusters
+    2
+    >>> labels[0] == labels[1] == labels[2]  # first three in same cluster
+    True
+    >>> labels[3] == labels[4] == labels[5]  # last three in same cluster
+    True
+    >>> labels[0] != labels[3]               # different clusters
+    True
+
+    Example — isolated noise point:
+    >>> data = [[0.0, 0.0], [0.1, 0.0], [0.0, 0.1], [99.0, 99.0]]
+    >>> labels = dbscan(data, epsilon=0.5, min_points=2)
+    >>> labels[3]  # noise
+    -1
+    >>> labels[0] == labels[1] == labels[2]  # one cluster
+    True
+
+    Example — all points are noise (min_points too high):
+    >>> data = [[0.0, 0.0], [5.0, 5.0]]
+    >>> dbscan(data, epsilon=0.3, min_points=5)
+    [-1, -1]
+
+    Example — single cluster (all points close together):
+    >>> data = [[0.0, 0.0], [0.1, 0.0], [0.0, 0.1], [0.1, 0.1]]
+    >>> labels = dbscan(data, epsilon=0.5, min_points=2)
+    >>> len(set(labels))
+    1
+    >>> -1 not in labels
+    True
+
+    Example — invalid inputs:
+    >>> dbscan([], epsilon=0.5, min_points=2)
+    Traceback (most recent call last):
+        ...
+    ValueError: Data must not be empty.
+    >>> dbscan([[1.0, 2.0]], epsilon=0.0, min_points=2)
+    Traceback (most recent call last):
+        ...
+    ValueError: Epsilon must be greater than 0.
+    >>> dbscan([[1.0, 2.0]], epsilon=0.5, min_points=0)
+    Traceback (most recent call last):
+        ...
+    ValueError: min_points must be at least 1.
+    """
+    if not data:
+        raise ValueError("Data must not be empty.")
+    if epsilon <= 0:
+        raise ValueError("Epsilon must be greater than 0.")
+    if min_points < 1:
+        raise ValueError("min_points must be at least 1.")
+
+    labels = [-1] * len(data)  # all points start as noise
+    current_cluster_id = 0
+
+    for point_index in range(len(data)):
+        if labels[point_index] != -1:
+            continue  # already assigned
+
+        neighbors = get_neighbors(data, point_index, epsilon)
+
+        if len(neighbors) < min_points:
+            continue  # not a core point — remains noise for now
+
+        # point_index is a core point — start a new cluster
+        labels[point_index] = current_cluster_id
+        seeds = [n for n in neighbors if n != point_index]
+
+        while seeds:
+            current_point = seeds.pop()
+
+            if labels[current_point] == -1:
+                # was noise — reassign as border point of this cluster
+                labels[current_point] = current_cluster_id
+
+            already_in_another = (
+                labels[current_point] != -1
+                and labels[current_point] != current_cluster_id
+            )
+            if already_in_another:
+                continue  # already in another cluster
+
+            labels[current_point] = current_cluster_id
+            current_neighbors = get_neighbors(data, current_point, epsilon)
+
+            if len(current_neighbors) >= min_points:
+                # current_point is also a core point — expand cluster
+                for neighbor in current_neighbors:
+                    if labels[neighbor] == -1:
+                        seeds.append(neighbor)
+
+        current_cluster_id += 1
+
+    return labels
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(verbose=True)

From 8c8ad8c0bda1f765d153fe85a66e1083cadda761 Mon Sep 17 00:00:00 2001
From: PRERITARYA <preritarya251@gmail.com>
Date: Tue, 23 Jun 2026 13:59:55 +0530
Subject: [PATCH 2/3] Fix logic order in dbscan: check cluster membership
 before assigning

---
 machine_learning/dbscan.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/machine_learning/dbscan.py b/machine_learning/dbscan.py
index 18e92c3f8ecf..899581597531 100644
--- a/machine_learning/dbscan.py
+++ b/machine_learning/dbscan.py
@@ -171,17 +171,14 @@ def dbscan(
         while seeds:
             current_point = seeds.pop()
 
-            if labels[current_point] == -1:
-                # was noise — reassign as border point of this cluster
-                labels[current_point] = current_cluster_id
-
-            already_in_another = (
+            # skip points already claimed by a different cluster
+            if (
                 labels[current_point] != -1
                 and labels[current_point] != current_cluster_id
-            )
-            if already_in_another:
-                continue  # already in another cluster
+            ):
+                continue
 
+            # assign noise points and unvisited points to this cluster
             labels[current_point] = current_cluster_id
             current_neighbors = get_neighbors(data, current_point, epsilon)
 
@@ -199,4 +196,4 @@ def dbscan(
 if __name__ == "__main__":
     import doctest
 
-    doctest.testmod(verbose=True)
+    doctest.testmod(verbose=True)
\ No newline at end of file

From 062f88aec86ccee016c213f76700ec5e0fe3be70 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 23 Jun 2026 08:30:27 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/dbscan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/dbscan.py b/machine_learning/dbscan.py
index 899581597531..6fac3f53ad4b 100644
--- a/machine_learning/dbscan.py
+++ b/machine_learning/dbscan.py
@@ -196,4 +196,4 @@ def dbscan(
 if __name__ == "__main__":
     import doctest
 
-    doctest.testmod(verbose=True)
\ No newline at end of file
+    doctest.testmod(verbose=True)