rishabh0203iitr · proteetpaul · Apr 20, 2025 · Apr 20, 2025 · Apr 27, 2025 · Apr 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,5 @@
 # Ignore raw datasets
 data/raw/
+.vscode/
+build/
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,65 @@
+{
+    "files.associations": {
+        "random": "cpp",
+        "array": "cpp",
+        "atomic": "cpp",
+        "bit": "cpp",
+        "*.tcc": "cpp",
+        "bitset": "cpp",
+        "cctype": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "complex": "cpp",
+        "concepts": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "map": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "regex": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "fstream": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "typeinfo": "cpp",
+        "unordered_set": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "ratio": "cpp",
+        "format": "cpp",
+        "span": "cpp",
+        "variant": "cpp"
+    }
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 3.8)
+project(CUDA_VectorDB LANGUAGES CXX CUDA)  # enables both the C++ and the CUDA language
+set(CMAKE_CXX_STANDARD 17)  # NOTE(review): only the CXX standard is set here; no CMAKE_CUDA_STANDARD is visible — confirm how .cu files get C++17
+
+add_subdirectory(src/cuda)
+add_subdirectory(tests)
+add_dependencies(tests cuda_vector_db)  # NOTE(review): assumes the subdirectories define targets named `tests` and `cuda_vector_db` — confirm
diff --git a/README.md b/README.md
@@ -3,21 +3,15 @@ Vector database search optimized using cuda
 
 ---
 
-## ✅ Implemented So Far
-
-- ✅ SIFT1M dataset preprocessing (`.fvecs → .npy`)
-- ✅ Optional vector normalization
-- 🔧 Custom CUDA-based KMeans:
-  - `kmeans_gpu.cu`: core CUDA kernels for assignment and centroid update
-  - `kmeans_driver.cpp`: runs clustering loop and saves centroids
-- ⏳ Python IVF index builder (`build_ivf_index.py`) using precomputed centroids
-- ⏳ Scripts for converting data, checking shape, and benchmarking CPU vs GPU KMeans
+## This branch is dedicated to implementation testing using the SIFTSMALL dataset.
 
 ---
 
-## Dataset: SIFT1M
+## Dataset: SIFTSMALL
+
+You can find the original compressed dataset file in `data/raw`. The dataset has already been preprocessed, and the resulting `.npy` files are available in `data/processed`. For human-readable inspection, you can use `parse.py` to convert a `.npy` file to `.txt`.
 
-Due to GitHub file size limits, we do not include the full dataset in this repo.
+If you'd like to re-run the preprocessing yourself, feel free to follow the steps below. Be sure to adjust any file paths or names if needed (e.g., sift1m vs. siftsmall).
 
 To download:
 ```bash
@@ -36,6 +30,84 @@ python3 scripts/normalize_vectors.py
 
 ## IVF Indexing (Inverted File Index)
 ### Run custom GPU KMeans and save IVF centroids:
+We run KMeans clustering on the base vectors to compute `K` coarse centroids. These centroids serve as the anchors for grouping similar vectors in the inverted index.
+
+Compilation command:
+Note that the test environment was Euler, with the `nvidia/cuda/11.8.0` module loaded. We chose `K=100` for a small dataset like siftsmall.
+```bash
+nvcc -std=c++17 -O2  -Xcompiler -Wall -Xptxas -O3 -Iinclude -Iinclude/cnpy -o bin/kmeans_driver  src/cuda/kmeans_driver.cpp  src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcuda -lcudart -lz
+```
+Run the job:
+Here is an example sbatch script, `test_kmeans_siftsmall.sh`. Please adjust it if needed.
+```bash
+#!/usr/bin/env bash
+#SBATCH -p instruction
+#SBATCH --job-name=kmeans-siftsmall
+#SBATCH --output=kmeans-siftsmall.out
+#SBATCH --error=kmeans-siftsmall.err
+#SBATCH --ntasks=1
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:05:00
+
+echo "[INFO] Job started on $(hostname) at $(date)"
+echo "[INFO] Working directory: $(pwd)"
+echo "[INFO] Running binary..."
+
+# Check if binary exists
+if [[ ! -x ./bin/kmeans_driver ]]; then
+  echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
+  exit 1
+fi
+
+# Run your program
+./bin/kmeans_driver
+
+echo "[INFO] Job finished at $(date)"
+```
+Run the job:
+Here is an example sbatch script, `test_kmeans_siftsmall.sh`. Please adjust it if needed.
+```bash
+#!/usr/bin/env bash
+#SBATCH -p instruction
+#SBATCH --job-name=kmeans-siftsmall
+#SBATCH --output=kmeans-siftsmall.out
+#SBATCH --error=kmeans-siftsmall.err
+#SBATCH --ntasks=1
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:05:00
+
+echo "[INFO] Job started on $(hostname) at $(date)"
+echo "[INFO] Working directory: $(pwd)"
+echo "[INFO] Running binary..."
+
+# Check if binary exists
+if [[ ! -x ./bin/kmeans_driver ]]; then
+  echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
+  exit 1
+fi
+
+# Run your program
+./bin/kmeans_driver
+
+echo "[INFO] Job finished at $(date)"
+```
+After running `kmeans_driver`, `ivf_centroids.npy` and `ivf_centroids.bin` are generated and stored in `data/processed`. 
+
+### Inverted File (IVF) Index Construction
+Use the trained centroids to assign each base vector to its closest cluster, producing the inverted index (i.e., `ivf_list_ids.npy` and `ivf_offsets.npy`).
+```bash
+python3 scripts/build_ivf_index.py
+```
+To inspect the resulting IVF and debug, you can use `scripts/debug_ivf_index.py`. This script helps you verify the IVF index built from KMeans centroids.
+```bash
+python3 scripts/debug_ivf_index.py
+```
+
+### Note from Rui
+All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison.
+
+### Note from Rui
+All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison.
 
 ## Project Structure
 ```
@@ -44,22 +116,42 @@ repo/
 ├── .gitignore
 ├── data/
 │   ├── raw/                      # Original downloaded datasets (e.g., SIFT1M)
+│   │   └── siftsmall.tar.gz      # Compressed test dataset
+│   │   └── siftsmall.tar.gz      # Compressed test dataset
 │   ├── processed/                # .npy vectors, normalized data, IVF index files
-│   └── scripts/                  # (Future) MS MARCO embedding scripts
+│   │   ├── siftsmall_base.npy
+│   │   ├── siftsmall_base.npy
+│   │   ├── ivf_centroids.npy       # Trained KMeans centroids
+│   │   ├── ivf_centroids.bin       # Binary version of centroids for GPU use
+│   │   ├── ivf_list_ids.npy        # Flattened list of vector indices by cluster
+│   │   ├── ivf_offsets.npy         # Offsets into list_ids for each cluster
+│   │   ├── parse.py                # Helper script that converts .npy to .txt
+│   │   └── siftsmall_query.npy     # (Optional) query set
+│   │   └── siftsmall_query.npy     # (Optional) query set
 ├── src/
 │   ├── cuda/
 │   │   ├── kmeans_gpu.cu         # Custom CUDA KMeans kernel code
 │   │   ├── kmeans_driver.cpp     # Host-side driver to run GPU KMeans
+│   │   ├── IVFIndex.h     # Index class
 │   ├── cpu_baseline/
 │   │   └── hnswlib_baseline.cpp  # (TODO) CPU-only ANN baseline using HNSWlib
 │   └── main.cpp                  # CLI entrypoint (planned)
 ├── include/
 │   └── kmeans_gpu.h              # CUDA kernel function declarations
+│   └── cnpy/                     # Embedded C++ library for reading/writing .npy
+│       ├── cnpy.h
+│       └── cnpy.cpp
+│   └── cnpy/                     # Embedded C++ library for reading/writing .npy
+│       ├── cnpy.h
+│       └── cnpy.cpp
 ├── scripts/
 │   ├── convert_fvecs.py          # Convert SIFT1M .fvecs → .npy
 │   ├── normalize_vectors.py      # Optional: normalize vectors
 │   ├── build_ivf_index.py        # Build inverted lists using IVF centroids
+│   ├── debug_ivf_index.py        # Inspect and validate IVF inverted lists
+│   ├── debug_ivf_index.py        # Inspect and validate IVF inverted lists
 │   ├── benchmark_kmeans.py       # Compare sklearn/FAISS vs your CUDA KMeans
 │   └── check_sift_stats.py       # Inspect shape/dtype of parsed vectors
-├── tests/                        # planned
+├── kmeans_output/                # Output files for comparison
+├── kmeans_output/                # Output files for comparison
 ```
diff --git a/build.sh b/build.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env zsh
+#SBATCH --job-name=Build_vector_db
+#SBATCH --partition=instruction
+#SBATCH --time 00:05:00
+#SBATCH --ntasks=1
+#SBATCH --gpus-per-task=1
+#SBATCH --output=Build_vector_db.out
+# Slurm batch job: configure, build and install the project out-of-source in ./build.
+module load nvidia/cuda/11.8.0
+
+mkdir -p build
+cd build || exit 1           # never run cmake from the wrong directory
+cmake .. || exit 1           # configure; abort so a broken configure is not built
+cmake --build . || exit 1    # build; abort so a failed build is not installed
+cmake --install .
diff --git a/createIndex.sh b/createIndex.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env zsh
+#SBATCH -p instruction
+#SBATCH --job-name=kmeans-siftsmall
+#SBATCH --output=kmeans-siftsmall.out
+#SBATCH --error=kmeans-siftsmall.err
+#SBATCH --ntasks=1
+#SBATCH --gres=gpu:1
+# Slurm batch job: compile bin/kmeans_driver with nvcc, then run it. A failed compile aborts the job so a stale binary from an earlier run is never executed.
+echo "[INFO] Job started on $(hostname) at $(date)"
+echo "[INFO] Working directory: $(pwd)"
+echo "[INFO] Running binary..."
+module load nvidia/cuda/11.8.0
+mkdir -p bin
+nvcc -std=c++17 -O3 -Xcompiler -Wall -Iinclude -Iinclude/cnpy -o bin/kmeans_driver src/cuda/kmeans_driver.cu src/cuda/IVFIndex.cu src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcudart -lz || exit 1
+
+# Check if binary exists
+if [[ ! -x ./bin/kmeans_driver ]]; then
+  echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
+  exit 1
+fi
+
+# Run your program
+./bin/kmeans_driver
+
+echo "[INFO] Job finished at $(date)"
diff --git a/data/processed/.DS_Store b/data/processed/.DS_Store
diff --git a/data/processed/parse.py b/data/processed/parse.py
@@ -0,0 +1,36 @@
+import numpy as np
+import argparse
+
+def load_and_export_sift_base(input_path, output_path, max_rows=20, precision=5):
+    print(f"[INFO] Loading: {input_path}")
+    data = np.load(input_path)
+
+    print(f"[INFO] Shape: {data.shape}")  # e.g., (1000000, 128)
+
+    print(f"[INFO] Writing first {max_rows} vectors to: {output_path}")
+    np.savetxt(
+        output_path,
+        data[:max_rows],
+        fmt=f"%.{precision}f",
+        delimiter=" ",
+        header=f"First {max_rows} vectors from {input_path} (dim={data.shape[1]})",
+        comments=''
+    )
+
+    print("Done!")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input", default="ivf_list_ids.npy", help="Path to .npy file"
+    )
+    parser.add_argument(
+        "--output", default="ivf_list_ids.txt", help="Path to output .txt file"
+    )
+    parser.add_argument(
+        "--rows", type=int, default=20, help="Number of rows to export"
+    )
+    args = parser.parse_args()
+
+    load_and_export_sift_base(args.input, args.output, max_rows=args.rows)
+
diff --git a/data/raw/siftsmall.tar.gz b/data/raw/siftsmall.tar.gz
diff --git a/include/IVFIndex.h b/include/IVFIndex.h
@@ -0,0 +1,58 @@
+#ifndef IVFINDEX_H
+#define IVFINDEX_H
+
+#include <cuda_runtime.h>
+#include <string>
+#include <vector>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+typedef int64_t IndexType;
+
+class IVFIndex {  // IVF (inverted file) index class; this header only declares members, definitions live elsewhere.
+private:
+    int num_centroids;  // NOTE(review): these four ints presumably mirror the constructor
+    int num_vectors;    // arguments of the same names — confirm in the definition.
+    int dim;
+    int iterations;
+    float* centroids;   // Raw pointers: this header does not show who allocates/frees them
+    float* data;        // or whether they refer to host or device memory — TODO confirm.
+    int* assignments;
+    float* distances;
+    int* counts;
+    float** d_inverted_lists;  // NOTE(review): the d_ prefix suggests device-side buffers — confirm.
+    int* d_list_sizes;
+    int* d_cluster_offsets;
+    float* d_all_vectors_flat;
+
+    void checkCudaError(const std::string& kernelName);  // private helper taking a kernel name; presumably reports CUDA errors for it — TODO confirm
+
+public:
+    IVFIndex(int num_centroids, int num_vectors, int dim, int iterations);
+    ~IVFIndex();  // NOTE(review): destructor is user-declared but no copy/move operations are; verify that copying an IVFIndex is safe.
+    // Const accessors; the pointer-returning ones hand out non-const raw pointers.
+    float* getData() const;
+    float* getallvectorsflat() const;
+    float** getInvertedLists() const;
+    int* getClusterOffsets() const;
+    float* getCentroids() const;
+    int* getAssignments() const;
+    int getNumCentroids() const;
+    int getDim() const;
+    int getNumVectors() const;
+    // Training and search entry points (bodies are not visible in this header).
+    void train(const float* train_data);  // takes read-only float data; expected length/layout is not specified here
+    void search_coarse_grained(thrust::device_vector<float> &queries, 
+        thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> *coarse_grained_output,  // out-parameter per its name, passed by pointer
+        uint32_t centroids_to_select);  // presumably how many centroids to keep per query — confirm
+
+    void search_fine_grained(thrust::device_vector<float> &queries, thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> &coarse_grained_output,  // coarse-stage result, passed in by reference
+        thrust::device_vector<int64_t> *fine_grained_output,  // out-parameter per its name, passed by pointer
+        uint32_t k);
+    // Unlike the two staged searches above, this one returns its result vector by value.
+    thrust::device_vector<int64_t> search_batch(thrust::device_vector<float> &queries, uint32_t k);
+};
+
+#endif // IVFINDEX_H