Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2d3f5ef
Add kernel for L2 norm computation
proteetpaul Apr 20, 2025
aa81e8a
Fixes
proteetpaul Apr 20, 2025
1c1c6d8
Changes
proteetpaul Apr 27, 2025
2a169a6
Initial bitonic sort implementation
proteetpaul Apr 28, 2025
ce3d58b
Rename select.cu
proteetpaul Apr 28, 2025
f68dce5
Implement K-means + Store cluster centroids
Apr 21, 2025
7a4a844
test index construction with siftsmall dataset
Apr 24, 2025
c63d583
Update README.md
Ruikang-L23 Apr 24, 2025
67026c5
Update README.md
Ruikang-L23 Apr 24, 2025
4ac57bf
Added class for index.
Apr 27, 2025
a13061f
Update README.md
Ruikang-L23 Apr 24, 2025
01eb776
Updated project structure.
darshsharma Apr 27, 2025
f9e88a4
changes for inverted index.
darshsharma Apr 28, 2025
961592b
corrected assignment kernel.
darshsharma Apr 28, 2025
aa6917b
Add fine grained search
Apr 28, 2025
1c2a2e6
Add CMake integration and fix compilation errors
proteetpaul Apr 29, 2025
5fd798e
Add executable in CMake
proteetpaul Apr 29, 2025
9fe8ced
Rename select.cuh to heap.cuh
proteetpaul Apr 30, 2025
6860551
Add kernel for k-selection
proteetpaul Apr 30, 2025
922f5ea
Add kernel for distance computation during fine-grained search
proteetpaul May 1, 2025
aeb96ef
Merging changes with index branch
May 1, 2025
c7e8871
WIP: Add kernel for k-min selection during fine-grained search
proteetpaul May 2, 2025
5fa4fa8
Fixing more errors
May 2, 2025
9a0d35d
Fixed compilation errors
May 2, 2025
d07035f
Final implementation changes
proteetpaul May 2, 2025
240135c
Compilation error fixes
proteetpaul May 3, 2025
ef30d33
Add test for GPU heap
proteetpaul May 4, 2025
afd9fee
Remove gtest dependency
proteetpaul May 4, 2025
2c742c9
Test fixes
proteetpaul May 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# Ignore raw datasets
data/raw/
.vscode/
build/

65 changes: 65 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"files.associations": {
"random": "cpp",
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"*.tcc": "cpp",
"bitset": "cpp",
"cctype": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"map": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"regex": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"typeinfo": "cpp",
"unordered_set": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"ratio": "cpp",
"format": "cpp",
"span": "cpp",
"variant": "cpp"
}
}
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Top-level CMake script for the CUDA_VectorDB project.
cmake_minimum_required(VERSION 3.8)
# Enable both the C++ and the CUDA language toolchains for this project.
project(CUDA_VectorDB LANGUAGES CXX CUDA)
# Default C++ targets to the C++17 standard.
set(CMAKE_CXX_STANDARD 17)

# Pull in the build rules defined in src/cuda and tests.
add_subdirectory(src/cuda)
add_subdirectory(tests)
# Ensure the cuda_vector_db target is built before the tests target.
# NOTE(review): both targets are presumably created by the subdirectories
# above -- confirm against their CMakeLists.txt files.
add_dependencies(tests cuda_vector_db)
118 changes: 105 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,15 @@ Vector database search optimized using cuda

---

## ✅ Implemented So Far

- ✅ SIFT1M dataset preprocessing (`.fvecs → .npy`)
- ✅ Optional vector normalization
- 🔧 Custom CUDA-based KMeans:
- `kmeans_gpu.cu`: core CUDA kernels for assignment and centroid update
- `kmeans_driver.cpp`: runs clustering loop and saves centroids
- ⏳ Python IVF index builder (`build_ivf_index.py`) using precomputed centroids
- ⏳ Scripts for converting data, checking shape, and benchmarking CPU vs GPU KMeans
## This branch is dedicated to implementation testing using the SIFTSMALL dataset.

---

## Dataset: SIFT1M
## Dataset: SIFTSMALL

You can find the original compressed dataset file in `data/raw`. The dataset has already been preprocessed, and the resulting `.npy` files are available in `data/processed`. For human-readable inspection, you can use `parse.py` to convert a `.npy` file to `.txt`.

Due to GitHub file size limits, we do not include the full dataset in this repo.
If you'd like to re-run the preprocessing yourself, follow the steps below. Be sure to adjust any file paths or names as needed (e.g., sift1m vs. siftsmall).

To download:
```bash
Expand All @@ -36,6 +30,84 @@ python3 scripts/normalize_vectors.py

## IVF Indexing (Inverted File Index)
### Run custom GPU KMeans and save IVF centroids:
We run KMeans clustering on the base vectors to compute `K` coarse centroids. These centroids serve as the anchors for grouping similar vectors in the inverted index.

Compilation command:
Note that the test environment was Euler with the `nvidia/cuda/11.8.0` module loaded, and that we chose `K=100` for a small dataset like siftsmall.
```bash
nvcc -std=c++17 -O2 -Xcompiler -Wall -Xptxas -O3 -Iinclude -Iinclude/cnpy -o bin/kmeans_driver src/cuda/kmeans_driver.cpp src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcuda -lcudart -lz
```
Run the job:
Here is an example sbatch script, `test_kmeans_siftsmall.sh`. Adjust it as needed.
```bash
#!/usr/bin/env bash
#SBATCH -p instruction
#SBATCH --job-name=kmeans-siftsmall
#SBATCH --output=kmeans-siftsmall.out
#SBATCH --error=kmeans-siftsmall.err
#SBATCH --ntasks=1
#SBATCH --gres=gpu:1
#SBATCH --time=00:05:00

echo "[INFO] Job started on $(hostname) at $(date)"
echo "[INFO] Working directory: $(pwd)"
echo "[INFO] Running binary..."

# Check if binary exists
if [[ ! -x ./bin/kmeans_driver ]]; then
echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
exit 1
fi

# Run your program
./bin/kmeans_driver

echo "[INFO] Job finished at $(date)"
```
Run the job:
Here is an example sbatch script, `test_kmeans_siftsmall.sh`. Adjust it as needed.
```bash
#!/usr/bin/env bash
#SBATCH -p instruction
#SBATCH --job-name=kmeans-siftsmall
#SBATCH --output=kmeans-siftsmall.out
#SBATCH --error=kmeans-siftsmall.err
#SBATCH --ntasks=1
#SBATCH --gres=gpu:1
#SBATCH --time=00:05:00

echo "[INFO] Job started on $(hostname) at $(date)"
echo "[INFO] Working directory: $(pwd)"
echo "[INFO] Running binary..."

# Check if binary exists
if [[ ! -x ./bin/kmeans_driver ]]; then
echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
exit 1
fi

# Run your program
./bin/kmeans_driver

echo "[INFO] Job finished at $(date)"
```
After running `kmeans_driver`, `ivf_centroids.npy` and `ivf_centroids.bin` are generated and stored in `data/processed`.

### Inverted File (IVF) Index Construction
Use the trained centroids to assign each base vector to its closest cluster, producing the inverted index (i.e., `ivf_list_ids.npy` and `ivf_offsets.npy`).
```bash
python3 scripts/build_ivf_index.py
```
To inspect the resulting IVF and debug, you can use `scripts/debug_ivf_index.py`. This script helps you verify the IVF index built from KMeans centroids.
```bash
python3 scripts/debug_ivf_index.py
```

### Note from Rui
All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison.

### Note from Rui
All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison.

## Project Structure
```
Expand All @@ -44,22 +116,42 @@ repo/
├── .gitignore
├── data/
│ ├── raw/ # Original downloaded datasets (e.g., SIFT1M)
│ │ └── siftsmall.tar.gz # Compressed test dataset
│ │ └── siftsmall.tar.gz # Compressed test dataset
│ ├── processed/ # .npy vectors, normalized data, IVF index files
│ └── scripts/ # (Future) MS MARCO embedding scripts
│ │ ├── siftsmall_base.npy
│ │ ├── siftsmall_base.npy
│ │ ├── ivf_centroids.npy # Trained KMeans centroids
│ │ ├── ivf_centroids.bin # Binary version of centroids for GPU use
│ │ ├── ivf_list_ids.npy # Flattened list of vector indices by cluster
│ │ ├── ivf_offsets.npy # Offsets into list_ids for each cluster
│ │ ├── parse.py # Helper script that converts .npy to .txt
│ │ └── siftsmall_query.npy # (Optional) query set
│ │ └── siftsmall_query.npy # (Optional) query set
├── src/
│ ├── cuda/
│ │ ├── kmeans_gpu.cu # Custom CUDA KMeans kernel code
│ │ ├── kmeans_driver.cpp # Host-side driver to run GPU KMeans
│ │ ├── IVFIndex.h # Index class
│ ├── cpu_baseline/
│ │ └── hnswlib_baseline.cpp # (TODO) CPU-only ANN baseline using HNSWlib
│ └── main.cpp # CLI entrypoint (planned)
├── include/
│ └── kmeans_gpu.h # CUDA kernel function declarations
│ └── cnpy/ # Embedded C++ library for reading/writing .npy
│ ├── cnpy.h
│ └── cnpy.cpp
│ └── cnpy/ # Embedded C++ library for reading/writing .npy
│ ├── cnpy.h
│ └── cnpy.cpp
├── scripts/
│ ├── convert_fvecs.py # Convert SIFT1M .fvecs → .npy
│ ├── normalize_vectors.py # Optional: normalize vectors
│ ├── build_ivf_index.py # Build inverted lists using IVF centroids
│ ├── debug_ivf_index.py # Inspect and validate IVF inverted lists
│ ├── debug_ivf_index.py # Inspect and validate IVF inverted lists
│ ├── benchmark_kmeans.py # Compare sklearn/FAISS vs your CUDA KMeans
│ └── check_sift_stats.py # Inspect shape/dtype of parsed vectors
├── tests/ # planned
├── kmeans_output/ # Output files for comparison
├── kmeans_output/ # Output files for comparison
```
15 changes: 15 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env zsh
#SBATCH --job-name=Build_vector_db
#SBATCH --partition=instruction
#SBATCH --time 00:05:00
#SBATCH --ntasks=1
#SBATCH --gpus-per-task=1
#SBATCH --output=Build_vector_db.out

# Slurm batch job: configure, build, and install the project with CMake using
# an out-of-source build/ directory.
#
# Every step aborts the job on failure. In particular, if `cd build` failed
# and the script kept going, `cmake ..` would run from the repository root and
# configure the parent directory instead of this project.

module load nvidia/cuda/11.8.0

mkdir -p build || exit 1
cd build || exit 1
cmake .. || exit 1
cmake --build . || exit 1
cmake --install .
25 changes: 25 additions & 0 deletions createIndex.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env zsh
#SBATCH -p instruction
#SBATCH --job-name=kmeans-siftsmall
#SBATCH --output=kmeans-siftsmall.out
#SBATCH --error=kmeans-siftsmall.err
#SBATCH --ntasks=1
#SBATCH --gres=gpu:1

# Slurm batch job: compile bin/kmeans_driver with nvcc and run it.
# The job's exit status is the driver's exit status, so a failed run is
# reported as a failed job.

echo "[INFO] Job started on $(hostname) at $(date)"
echo "[INFO] Working directory: $(pwd)"
echo "[INFO] Running binary..."
module load nvidia/cuda/11.8.0
mkdir -p bin

# Abort when compilation fails. Without this check, a stale bin/kmeans_driver
# left over from an earlier job would pass the existence test below and be run
# in place of the code that was just (unsuccessfully) compiled.
if ! nvcc -std=c++17 -O3 -Xcompiler -Wall -Iinclude -Iinclude/cnpy -o bin/kmeans_driver src/cuda/kmeans_driver.cu src/cuda/IVFIndex.cu src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcudart -lz; then
    echo "[ERROR] Compilation of ./bin/kmeans_driver failed!"
    exit 1
fi

# Check if binary exists
if [[ ! -x ./bin/kmeans_driver ]]; then
    echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
    exit 1
fi

# Run your program and remember its exit status; the trailing echo would
# otherwise make the job always exit with 0.
./bin/kmeans_driver
driver_status=$?

echo "[INFO] Job finished at $(date)"
exit $driver_status
Binary file added data/processed/.DS_Store
Binary file not shown.
36 changes: 36 additions & 0 deletions data/processed/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import numpy as np
import argparse

def load_and_export_sift_base(input_path, output_path, max_rows=20, precision=5):
    """Export the first rows of a ``.npy`` array to a plain-text file.

    The array is loaded with ``np.load`` and its first ``max_rows`` entries
    are written with ``np.savetxt``, space-delimited, preceded by a one-line
    header naming the source file and the per-row dimension.

    Both 2-D arrays (one vector per row) and 1-D arrays (one scalar per row)
    are supported; for 1-D input the header reports ``dim=1``.

    Args:
        input_path: Path of the ``.npy`` file to read.
        output_path: Path of the text file to write.
        max_rows: Number of leading rows to export.
        precision: Number of digits after the decimal point for each value.
    """
    print(f"[INFO] Loading: {input_path}")
    data = np.load(input_path)

    print(f"[INFO] Shape: {data.shape}")  # e.g., (1000000, 128)

    # A 1-D array has no second axis, so indexing data.shape[1] directly
    # raises IndexError. The script's default input (ivf_list_ids.npy) is
    # described in the README as a flattened list, so handle that case.
    dim = data.shape[1] if data.ndim > 1 else 1

    print(f"[INFO] Writing first {max_rows} vectors to: {output_path}")
    np.savetxt(
        output_path,
        data[:max_rows],
        fmt=f"%.{precision}f",
        delimiter=" ",
        header=f"First {max_rows} vectors from {input_path} (dim={dim})",
        comments=''
    )

    print("Done!")

if __name__ == "__main__":
    # Command-line entry point: read the input/output paths and the row count
    # from the arguments, then export the requested rows as text.
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", default="ivf_list_ids.npy", help="Path to .npy file")
    cli.add_argument("--output", default="ivf_list_ids.txt", help="Path to output .txt file")
    cli.add_argument("--rows", type=int, default=20, help="Number of rows to export")
    options = cli.parse_args()

    load_and_export_sift_base(options.input, options.output, max_rows=options.rows)

Binary file added data/raw/siftsmall.tar.gz
Binary file not shown.
58 changes: 58 additions & 0 deletions include/IVFIndex.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#ifndef IVFINDEX_H
#define IVFINDEX_H

#include <cuda_runtime.h>
#include <cstdint>
#include <string>
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

/// Integer type alias declared for index values (64-bit signed).
using IndexType = int64_t;

/// Inverted-file (IVF) index.
///
/// Holds the index configuration together with raw buffers for centroids,
/// vectors, assignments and inverted lists, and declares training plus a
/// two-stage search (coarse-grained, then fine-grained).
///
/// NOTE(review): the member functions are defined outside this header, so the
/// memory space, layout and ownership of the buffers noted below are
/// assumptions to confirm against the implementation file.
class IVFIndex {
private:
    // Configuration; presumably the values passed to the constructor.
    int num_centroids;
    int num_vectors;
    int dim;
    int iterations;

    // Raw working buffers. NOTE(review): presumably allocated in the
    // constructor and released in the destructor -- confirm.
    float* centroids;
    float* data;
    int* assignments;
    float* distances;
    int* counts;

    // Inverted-list storage. NOTE(review): the `d_` prefix suggests device
    // memory -- confirm.
    float** d_inverted_lists;
    int* d_list_sizes;
    int* d_cluster_offsets;
    float* d_all_vectors_flat;

    /// Error-check helper that takes the name of a kernel.
    /// NOTE(review): presumably reports the last CUDA error after that kernel
    /// runs -- confirm.
    void checkCudaError(const std::string& kernelName);

public:
    /// Creates an index from the number of centroids, the number of vectors,
    /// the vector dimension and an iteration count.
    IVFIndex(int num_centroids, int num_vectors, int dim, int iterations);
    ~IVFIndex();

    // The class stores raw pointers and declares a destructor, so an
    // implicitly generated copy would leave two objects pointing at the same
    // buffers. Copying is therefore disabled (Rule of Five).
    // NOTE(review): assumes the destructor frees those buffers -- confirm.
    IVFIndex(const IVFIndex&) = delete;
    IVFIndex& operator=(const IVFIndex&) = delete;

    // Read-only accessors for the buffers and configuration.
    float* getData() const;
    float* getallvectorsflat() const;
    float** getInvertedLists() const;
    int* getClusterOffsets() const;
    float* getCentroids() const;
    int* getAssignments() const;
    int getNumCentroids() const;
    int getDim() const;
    int getNumVectors() const;

    /// Trains the index from `train_data`.
    /// NOTE(review): presumably expects num_vectors * dim floats -- confirm.
    void train(const float* train_data);

    /// Coarse-grained search stage: takes the queries and their norms and
    /// writes its result to `coarse_grained_output`; `centroids_to_select` is
    /// the number of centroids to select.
    void search_coarse_grained(thrust::device_vector<float> &queries,
                               thrust::device_vector<float> &queries_norm,
                               thrust::device_vector<int64_t> *coarse_grained_output,
                               uint32_t centroids_to_select);

    /// Fine-grained search stage: takes the queries, their norms and the
    /// coarse-grained output, and writes its result to `fine_grained_output`.
    /// NOTE(review): `k` is presumably the number of neighbours per query --
    /// confirm.
    void search_fine_grained(thrust::device_vector<float> &queries,
                             thrust::device_vector<float> &queries_norm,
                             thrust::device_vector<int64_t> &coarse_grained_output,
                             thrust::device_vector<int64_t> *fine_grained_output,
                             uint32_t k);

    /// Batch search entry point returning 64-bit integer results for `queries`.
    /// NOTE(review): presumably runs the coarse and fine stages in sequence --
    /// confirm.
    thrust::device_vector<int64_t> search_batch(thrust::device_vector<float> &queries, uint32_t k);
};

#endif // IVFINDEX_H
Loading