diff --git a/.gitignore b/.gitignore index d877ae7..8d9b4c1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ # Ignore raw datasets data/raw/ +.vscode/ +build/ + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..79ad8be --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,65 @@ +{ + "files.associations": { + "random": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "map": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "typeinfo": "cpp", + "unordered_set": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "ratio": "cpp", + "format": "cpp", + "span": "cpp", + "variant": "cpp" + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..b066ab0 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 3.8) +project(CUDA_VectorDB LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 17) + +add_subdirectory(src/cuda) +add_subdirectory(tests) +add_dependencies(tests cuda_vector_db) \ No newline at end of file diff --git a/README.md b/README.md index 21e9ecd..19bb1a9 100644 --- a/README.md +++ b/README.md @@ -3,21 +3,15 @@ Vector database search optimized using cuda --- -## ✅ Implemented So Far - -- ✅ SIFT1M dataset preprocessing (`.fvecs → .npy`) -- ✅ Optional vector normalization -- 🔧 Custom CUDA-based KMeans: - - `kmeans_gpu.cu`: core CUDA kernels for assignment and centroid update - - `kmeans_driver.cpp`: runs clustering loop and saves centroids -- ⏳ Python IVF index builder (`build_ivf_index.py`) using precomputed centroids -- ⏳ Scripts for converting data, checking shape, and benchmarking CPU vs GPU KMeans +## This branch is dedicated to implementation testing using the SIFTSMALL dataset. --- -## Dataset: SIFT1M +## Dataset: SIFTSMALL + +You can find the original compressed dataset file in `data/raw`. The dataset has already been preprocessed, and the resulting `.npy` files are available in `data/processed`. For human-readable inspection, you can use `parse.py` to convert `.npy` file to `.txt`. -Due to GitHub file size limits, we do not include the full dataset in this repo. +If you'd like to re-run the preprocessing yourself, feel free to follow the steps below. Be sure to adjust any file paths or names if needed (eg. sift1m vs. siftsmall). To download: ```bash @@ -36,6 +30,84 @@ python3 scripts/normalize_vectors.py ## IVF Indexing (Inverted File Index) ### Run custom GPU KMeans and save IVF centroids: +We run KMeans clustering on the base vectors to compute `K` coarse centroids. These centroids serve as the anchors for grouping similar vectors in the inverted index. + +Compilation command: +Notice that the test environment was based on Euler, with `nvidia/cuda/11.8.0` module loaded. And we choose `K=100` for a small dataset like siwfsmall. +```bash +nvcc -std=c++17 -O2 -Xcompiler -Wall -Xptxas -O3 -Iinclude -Iinclude/cnpy -o bin/kmeans_driver src/cuda/kmeans_driver.cpp src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcuda -lcudart -lz +``` +Run the job: +Here is an example sbatch script `test_kmeans_siftsmall.sh `. Please adjust it yourself if needed. +```bash +#!/usr/bin/env bash +#SBATCH -p instruction +#SBATCH --job-name=kmeans-siftsmall +#SBATCH --output=kmeans-siftsmall.out +#SBATCH --error=kmeans-siftsmall.err +#SBATCH --ntasks=1 +#SBATCH --gres=gpu:1 +#SBATCH --time=00:05:00 + +echo "[INFO] Job started on $(hostname) at $(date)" +echo "[INFO] Working directory: $(pwd)" +echo "[INFO] Running binary..." + +# Check if binary exists +if [[ ! -x ./bin/kmeans_driver ]]; then + echo "[ERROR] ./bin/kmeans_driver not found or not executable!" + exit 1 +fi + +# Run your program +./bin/kmeans_driver + +echo "[INFO] Job finished at $(date)" +``` +Run the job: +Here is an example sbatch script `test_kmeans_siftsmall.sh `. Please adjust it yourself if needed. +```bash +#!/usr/bin/env bash +#SBATCH -p instruction +#SBATCH --job-name=kmeans-siftsmall +#SBATCH --output=kmeans-siftsmall.out +#SBATCH --error=kmeans-siftsmall.err +#SBATCH --ntasks=1 +#SBATCH --gres=gpu:1 +#SBATCH --time=00:05:00 + +echo "[INFO] Job started on $(hostname) at $(date)" +echo "[INFO] Working directory: $(pwd)" +echo "[INFO] Running binary..." + +# Check if binary exists +if [[ ! -x ./bin/kmeans_driver ]]; then + echo "[ERROR] ./bin/kmeans_driver not found or not executable!" + exit 1 +fi + +# Run your program +./bin/kmeans_driver + +echo "[INFO] Job finished at $(date)" +``` +After running `kmeans_driver`, `ivf_centroids.npy` and `ivf_centroids.bin` are generated and stored in `data/processed`. + +### Inverted File (IVF) Index Construction +Use the trained centroids to assign each base vector to its closest cluster, producing the inverted index (ie. `ivf_list_ids.npy` and `ivf_offsets.npy`. +```bash +python3 scripts/build_ivf_index.py +``` +To inspect the resulting IVF and debug, you can use `scripts/debug_ivf_index.py`. This script helps you verify the IVF index built from KMeans centroids. +```bash +python3 scripts/debug_ivf_index.py +``` + +### Note from Rui +All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison. + +### Note from Rui +All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison. ## Project Structure ``` @@ -44,22 +116,42 @@ repo/ ├── .gitignore ├── data/ │ ├── raw/ # Original downloaded datasets (e.g., SIFT1M) +│ │ └── siftsmall.tar.gz # Compressed test dataset +│ │ └── siftsmall.tar.gz # Compressed test dataset │ ├── processed/ # .npy vectors, normalized data, IVF index files -│ └── scripts/ # (Future) MS MARCO embedding scripts +│ │ ├── siftsmall_base.npy +│ │ ├── siftsmall_base.npy +│ │ ├── ivf_centroids.npy # Trained KMeans centroids +│ │ ├── ivf_centroids.bin # Binary version of centroids for GPU use +│ │ ├── ivf_list_ids.npy # Flattened list of vector indices by cluster +│ │ ├── ivf_offsets.npy # Offsets into list_ids for each cluster +│ │ ├── parse.py # Helper script that converts .npy to .txt +│ │ └── siftsmall_query.npy # (Optional) query set +│ │ └── siftsmall_query.npy # (Optional) query set ├── src/ │ ├── cuda/ │ │ ├── kmeans_gpu.cu # Custom CUDA KMeans kernel code │ │ ├── kmeans_driver.cpp # Host-side driver to run GPU KMeans +│ │ ├── IVFIndex.h # Index class │ ├── cpu_baseline/ │ │ └── hnswlib_baseline.cpp # (TODO) CPU-only ANN baseline using HNSWlib │ └── main.cpp # CLI entrypoint (planned) ├── include/ │ └── kmeans_gpu.h # CUDA kernel function declarations +│ └── cnpy/ # Embedded C++ library for reading/writing .npy +│ ├── cnpy.h +│ └── cnpy.cpp +│ └── cnpy/ # Embedded C++ library for reading/writing .npy +│ ├── cnpy.h +│ └── cnpy.cpp ├── scripts/ │ ├── convert_fvecs.py # Convert SIFT1M .fvecs → .npy │ ├── normalize_vectors.py # Optional: normalize vectors │ ├── build_ivf_index.py # Build inverted lists using IVF centroids +│ ├── debug_ivf_index.py # Inspect and validate IVF inverted lists +│ ├── debug_ivf_index.py # Inspect and validate IVF inverted lists │ ├── benchmark_kmeans.py # Compare sklearn/FAISS vs your CUDA KMeans │ └── check_sift_stats.py # Inspect shape/dtype of parsed vectors -├── tests/ # planned +├── kmeans_output/ # Output files for comparison +├── kmeans_output/ # Output files for comparison ``` diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..aa303c8 --- /dev/null +++ b/build.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env zsh +#SBATCH --job-name=Build_vector_db +#SBATCH --partition=instruction +#SBATCH --time 00:05:00 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-task=1 +#SBATCH --output=Build_vector_db.out + +module load nvidia/cuda/11.8.0 + +mkdir -p build +cd build +cmake .. +cmake --build . +cmake --install . \ No newline at end of file diff --git a/createIndex.sh b/createIndex.sh new file mode 100644 index 0000000..d719b6f --- /dev/null +++ b/createIndex.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env zsh +#SBATCH -p instruction +#SBATCH --job-name=kmeans-siftsmall +#SBATCH --output=kmeans-siftsmall.out +#SBATCH --error=kmeans-siftsmall.err +#SBATCH --ntasks=1 +#SBATCH --gres=gpu:1 + +echo "[INFO] Job started on $(hostname) at $(date)" +echo "[INFO] Working directory: $(pwd)" +echo "[INFO] Running binary..." +module load nvidia/cuda/11.8.0 +mkdir -p bin +nvcc -std=c++17 -O3 -Xcompiler -Wall -Iinclude -Iinclude/cnpy -o bin/kmeans_driver src/cuda/kmeans_driver.cu src/cuda/IVFIndex.cu src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcudart -lz + +# Check if binary exists +if [[ ! -x ./bin/kmeans_driver ]]; then + echo "[ERROR] ./bin/kmeans_driver not found or not executable!" + exit 1 +fi + +# Run your program +./bin/kmeans_driver + +echo "[INFO] Job finished at $(date)" \ No newline at end of file diff --git a/data/processed/.DS_Store b/data/processed/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/data/processed/.DS_Store differ diff --git a/data/processed/parse.py b/data/processed/parse.py new file mode 100644 index 0000000..9afc43f --- /dev/null +++ b/data/processed/parse.py @@ -0,0 +1,36 @@ +import numpy as np +import argparse + +def load_and_export_sift_base(input_path, output_path, max_rows=20, precision=5): + print(f"[INFO] Loading: {input_path}") + data = np.load(input_path) + + print(f"[INFO] Shape: {data.shape}") # e.g., (1000000, 128) + + print(f"[INFO] Writing first {max_rows} vectors to: {output_path}") + np.savetxt( + output_path, + data[:max_rows], + fmt=f"%.{precision}f", + delimiter=" ", + header=f"First {max_rows} vectors from {input_path} (dim={data.shape[1]})", + comments='' + ) + + print("Done!") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--input", default="ivf_list_ids.npy", help="Path to .npy file" + ) + parser.add_argument( + "--output", default="ivf_list_ids.txt", help="Path to output .txt file" + ) + parser.add_argument( + "--rows", type=int, default=20, help="Number of rows to export" + ) + args = parser.parse_args() + + load_and_export_sift_base(args.input, args.output, max_rows=args.rows) + diff --git a/data/raw/siftsmall.tar.gz b/data/raw/siftsmall.tar.gz new file mode 100644 index 0000000..68f3da0 Binary files /dev/null and b/data/raw/siftsmall.tar.gz differ diff --git a/include/IVFIndex.h b/include/IVFIndex.h new file mode 100644 index 0000000..9c02190 --- /dev/null +++ b/include/IVFIndex.h @@ -0,0 +1,58 @@ +#ifndef IVFINDEX_H +#define IVFINDEX_H + +#include +#include +#include +#include +#include + +typedef int64_t IndexType; + +class IVFIndex { +private: + int num_centroids; + int num_vectors; + int dim; + int iterations; + float* centroids; + float* data; + int* assignments; + float* distances; + int* counts; + float** d_inverted_lists; + int* d_list_sizes; + int* d_cluster_offsets; + float* d_all_vectors_flat; + + void checkCudaError(const std::string& kernelName); + +public: + IVFIndex(int num_centroids, int num_vectors, int dim, int iterations); + ~IVFIndex(); + + float* getData() const; + float* getallvectorsflat() const; + float** getInvertedLists() const; + int* getClusterOffsets() const; + float* getCentroids() const; + int* getAssignments() const; + int getNumCentroids() const; + int getDim() const; + int getNumVectors() const; + + void train(const float* train_data); + void search_coarse_grained(thrust::device_vector &queries, + thrust::device_vector &queries_norm, + thrust::device_vector *coarse_grained_output, + uint32_t centroids_to_select); + + void search_fine_grained(thrust::device_vector &queries, thrust::device_vector &queries_norm, + thrust::device_vector &coarse_grained_output, + thrust::device_vector *fine_grained_output, + uint32_t k); + + thrust::device_vector search_batch(thrust::device_vector &queries, uint32_t k); +}; + +#endif // IVFINDEX_H \ No newline at end of file diff --git a/include/cnpy/CMakeLists.txt b/include/cnpy/CMakeLists.txt new file mode 100644 index 0000000..9eb550f --- /dev/null +++ b/include/cnpy/CMakeLists.txt @@ -0,0 +1,30 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.0 FATAL_ERROR) +if(COMMAND cmake_policy) + cmake_policy(SET CMP0003 NEW) +endif(COMMAND cmake_policy) + +project(CNPY) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +option(ENABLE_STATIC "Build static (.a) library" ON) + +find_package(ZLIB REQUIRED) + +include_directories(${ZLIB_INCLUDE_DIRS}) + +add_library(cnpy SHARED "cnpy.cpp") +target_link_libraries(cnpy ${ZLIB_LIBRARIES}) +install(TARGETS "cnpy" LIBRARY DESTINATION lib PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) + +if(ENABLE_STATIC) + add_library(cnpy-static STATIC "cnpy.cpp") + set_target_properties(cnpy-static PROPERTIES OUTPUT_NAME "cnpy") + install(TARGETS "cnpy-static" ARCHIVE DESTINATION lib) +endif(ENABLE_STATIC) + +install(FILES "cnpy.h" DESTINATION include) +install(FILES "mat2npz" "npy2mat" "npz2mat" DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) + +add_executable(example1 example1.cpp) +target_link_libraries(example1 cnpy) diff --git a/include/cnpy/LICENSE b/include/cnpy/LICENSE new file mode 100644 index 0000000..e60eadb --- /dev/null +++ b/include/cnpy/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) Carl Rogers, 2011 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/include/cnpy/README.md b/include/cnpy/README.md new file mode 100644 index 0000000..37c4a43 --- /dev/null +++ b/include/cnpy/README.md @@ -0,0 +1,55 @@ +# Purpose: + +NumPy offers the `save` method for easy saving of arrays into .npy and `savez` for zipping multiple .npy arrays together into a .npz file. + +`cnpy` lets you read and write to these formats in C++. + +The motivation comes from scientific programming where large amounts of data are generated in C++ and analyzed in Python. + +Writing to .npy has the advantage of using low-level C++ I/O (fread and fwrite) for speed and binary format for size. +The .npy file header takes care of specifying the size, shape, and data type of the array, so specifying the format of the data is unnecessary. + +Loading data written in numpy formats into C++ is equally simple, but requires you to type-cast the loaded data to the type of your choice. + +# Installation: + +Default installation directory is /usr/local. +To specify a different directory, add `-DCMAKE_INSTALL_PREFIX=/path/to/install/dir` to the cmake invocation in step 4. + +1. get [cmake](www.cmake.org) +2. create a build directory, say $HOME/build +3. cd $HOME/build +4. cmake /path/to/cnpy +5. make +6. make install + +# Using: + +To use, `#include"cnpy.h"` in your source code. Compile the source code mycode.cpp as + +```bash +g++ -o mycode mycode.cpp -L/path/to/install/dir -lcnpy -lz --std=c++11 +``` + +# Description: + +There are two functions for writing data: `npy_save` and `npz_save`. + +There are 3 functions for reading: +- `npy_load` will load a .npy file. +- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structues. +- `npz_load(fname,varname)` will load and return the NpyArray for data varname from the specified .npz file. + +The data structure for loaded data is below. +Data is accessed via the `data()`-method, which returns a pointer of the specified type (which must match the underlying datatype of the data). +The array shape and word size are read from the npy header. + +```c++ +struct NpyArray { + std::vector shape; + size_t word_size; + template T* data(); +}; +``` + +See [example1.cpp](example1.cpp) for examples of how to use the library. example1 will also be build during cmake installation. diff --git a/include/cnpy/cnpy.cpp b/include/cnpy/cnpy.cpp new file mode 100644 index 0000000..2d28578 --- /dev/null +++ b/include/cnpy/cnpy.cpp @@ -0,0 +1,340 @@ +//Copyright (C) 2011 Carl Rogers +//Released under MIT License +//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php + +#include"cnpy.h" +#include +#include +#include +#include +#include +#include +#include +#include + +char cnpy::BigEndianTest() { + int x = 1; + return (((char *)&x)[0]) ? '<' : '>'; +} + +char cnpy::map_type(const std::type_info& t) +{ + if(t == typeid(float) ) return 'f'; + if(t == typeid(double) ) return 'f'; + if(t == typeid(long double) ) return 'f'; + + if(t == typeid(int) ) return 'i'; + if(t == typeid(char) ) return 'i'; + if(t == typeid(short) ) return 'i'; + if(t == typeid(long) ) return 'i'; + if(t == typeid(long long) ) return 'i'; + + if(t == typeid(unsigned char) ) return 'u'; + if(t == typeid(unsigned short) ) return 'u'; + if(t == typeid(unsigned long) ) return 'u'; + if(t == typeid(unsigned long long) ) return 'u'; + if(t == typeid(unsigned int) ) return 'u'; + + if(t == typeid(bool) ) return 'b'; + + if(t == typeid(std::complex) ) return 'c'; + if(t == typeid(std::complex) ) return 'c'; + if(t == typeid(std::complex) ) return 'c'; + + else return '?'; +} + +template<> std::vector& cnpy::operator+=(std::vector& lhs, const std::string rhs) { + lhs.insert(lhs.end(),rhs.begin(),rhs.end()); + return lhs; +} + +template<> std::vector& cnpy::operator+=(std::vector& lhs, const char* rhs) { + //write in little endian + size_t len = strlen(rhs); + lhs.reserve(len); + for(size_t byte = 0; byte < len; byte++) { + lhs.push_back(rhs[byte]); + } + return lhs; +} + +void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector& shape, bool& fortran_order) { + //std::string magic_string(buffer,6); + uint8_t major_version = *reinterpret_cast(buffer+6); + uint8_t minor_version = *reinterpret_cast(buffer+7); + uint16_t header_len = *reinterpret_cast(buffer+8); + std::string header(reinterpret_cast(buffer+9),header_len); + + size_t loc1, loc2; + + //fortran order + loc1 = header.find("fortran_order")+16; + fortran_order = (header.substr(loc1,4) == "True" ? true : false); + + //shape + loc1 = header.find("("); + loc2 = header.find(")"); + + std::regex num_regex("[0-9][0-9]*"); + std::smatch sm; + shape.clear(); + + std::string str_shape = header.substr(loc1+1,loc2-loc1-1); + while(std::regex_search(str_shape, sm, num_regex)) { + shape.push_back(std::stoi(sm[0].str())); + str_shape = sm.suffix().str(); + } + + //endian, word size, data type + //byte order code | stands for not applicable. + //not sure when this applies except for byte array + loc1 = header.find("descr")+9; + bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false); + assert(littleEndian); + + //char type = header[loc1+1]; + //assert(type == map_type(T)); + + std::string str_ws = header.substr(loc1+2); + loc2 = str_ws.find("'"); + word_size = atoi(str_ws.substr(0,loc2).c_str()); +} + +void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector& shape, bool& fortran_order) { + char buffer[256]; + size_t res = fread(buffer,sizeof(char),11,fp); + if(res != 11) + throw std::runtime_error("parse_npy_header: failed fread"); + std::string header = fgets(buffer,256,fp); + assert(header[header.size()-1] == '\n'); + + size_t loc1, loc2; + + //fortran order + loc1 = header.find("fortran_order"); + if (loc1 == std::string::npos) + throw std::runtime_error("parse_npy_header: failed to find header keyword: 'fortran_order'"); + loc1 += 16; + fortran_order = (header.substr(loc1,4) == "True" ? true : false); + + //shape + loc1 = header.find("("); + loc2 = header.find(")"); + if (loc1 == std::string::npos || loc2 == std::string::npos) + throw std::runtime_error("parse_npy_header: failed to find header keyword: '(' or ')'"); + + std::regex num_regex("[0-9][0-9]*"); + std::smatch sm; + shape.clear(); + + std::string str_shape = header.substr(loc1+1,loc2-loc1-1); + while(std::regex_search(str_shape, sm, num_regex)) { + shape.push_back(std::stoi(sm[0].str())); + str_shape = sm.suffix().str(); + } + + //endian, word size, data type + //byte order code | stands for not applicable. + //not sure when this applies except for byte array + loc1 = header.find("descr"); + if (loc1 == std::string::npos) + throw std::runtime_error("parse_npy_header: failed to find header keyword: 'descr'"); + loc1 += 9; + bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false); + assert(littleEndian); + + //char type = header[loc1+1]; + //assert(type == map_type(T)); + + std::string str_ws = header.substr(loc1+2); + loc2 = str_ws.find("'"); + word_size = atoi(str_ws.substr(0,loc2).c_str()); +} + +void cnpy::parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset) +{ + std::vector footer(22); + fseek(fp,-22,SEEK_END); + size_t res = fread(&footer[0],sizeof(char),22,fp); + if(res != 22) + throw std::runtime_error("parse_zip_footer: failed fread"); + + uint16_t disk_no, disk_start, nrecs_on_disk, comment_len; + disk_no = *(uint16_t*) &footer[4]; + disk_start = *(uint16_t*) &footer[6]; + nrecs_on_disk = *(uint16_t*) &footer[8]; + nrecs = *(uint16_t*) &footer[10]; + global_header_size = *(uint32_t*) &footer[12]; + global_header_offset = *(uint32_t*) &footer[16]; + comment_len = *(uint16_t*) &footer[20]; + + assert(disk_no == 0); + assert(disk_start == 0); + assert(nrecs_on_disk == nrecs); + assert(comment_len == 0); +} + +cnpy::NpyArray load_the_npy_file(FILE* fp) { + std::vector shape; + size_t word_size; + bool fortran_order; + cnpy::parse_npy_header(fp,word_size,shape,fortran_order); + + cnpy::NpyArray arr(shape, word_size, fortran_order); + size_t nread = fread(arr.data(),1,arr.num_bytes(),fp); + if(nread != arr.num_bytes()) + throw std::runtime_error("load_the_npy_file: failed fread"); + return arr; +} + +cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) { + + std::vector buffer_compr(compr_bytes); + std::vector buffer_uncompr(uncompr_bytes); + size_t nread = fread(&buffer_compr[0],1,compr_bytes,fp); + if(nread != compr_bytes) + throw std::runtime_error("load_the_npy_file: failed fread"); + + int err; + z_stream d_stream; + + d_stream.zalloc = Z_NULL; + d_stream.zfree = Z_NULL; + d_stream.opaque = Z_NULL; + d_stream.avail_in = 0; + d_stream.next_in = Z_NULL; + err = inflateInit2(&d_stream, -MAX_WBITS); + + d_stream.avail_in = compr_bytes; + d_stream.next_in = &buffer_compr[0]; + d_stream.avail_out = uncompr_bytes; + d_stream.next_out = &buffer_uncompr[0]; + + err = inflate(&d_stream, Z_FINISH); + err = inflateEnd(&d_stream); + + std::vector shape; + size_t word_size; + bool fortran_order; + cnpy::parse_npy_header(&buffer_uncompr[0],word_size,shape,fortran_order); + + cnpy::NpyArray array(shape, word_size, fortran_order); + + size_t offset = uncompr_bytes - array.num_bytes(); + memcpy(array.data(),&buffer_uncompr[0]+offset,array.num_bytes()); + + return array; +} + +cnpy::npz_t cnpy::npz_load(std::string fname) { + FILE* fp = fopen(fname.c_str(),"rb"); + + if(!fp) { + throw std::runtime_error("npz_load: Error! Unable to open file "+fname+"!"); + } + + cnpy::npz_t arrays; + + while(1) { + std::vector local_header(30); + size_t headerres = fread(&local_header[0],sizeof(char),30,fp); + if(headerres != 30) + throw std::runtime_error("npz_load: failed fread"); + + //if we've reached the global header, stop reading + if(local_header[2] != 0x03 || local_header[3] != 0x04) break; + + //read in the variable name + uint16_t name_len = *(uint16_t*) &local_header[26]; + std::string varname(name_len,' '); + size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp); + if(vname_res != name_len) + throw std::runtime_error("npz_load: failed fread"); + + //erase the lagging .npy + varname.erase(varname.end()-4,varname.end()); + + //read in the extra field + uint16_t extra_field_len = *(uint16_t*) &local_header[28]; + if(extra_field_len > 0) { + std::vector buff(extra_field_len); + size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp); + if(efield_res != extra_field_len) + throw std::runtime_error("npz_load: failed fread"); + } + + uint16_t compr_method = *reinterpret_cast(&local_header[0]+8); + uint32_t compr_bytes = *reinterpret_cast(&local_header[0]+18); + uint32_t uncompr_bytes = *reinterpret_cast(&local_header[0]+22); + + if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);} + else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);} + } + + fclose(fp); + return arrays; +} + +cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) { + FILE* fp = fopen(fname.c_str(),"rb"); + + if(!fp) throw std::runtime_error("npz_load: Unable to open file "+fname); + + while(1) { + std::vector local_header(30); + size_t header_res = fread(&local_header[0],sizeof(char),30,fp); + if(header_res != 30) + throw std::runtime_error("npz_load: failed fread"); + + //if we've reached the global header, stop reading + if(local_header[2] != 0x03 || local_header[3] != 0x04) break; + + //read in the variable name + uint16_t name_len = *(uint16_t*) &local_header[26]; + std::string vname(name_len,' '); + size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp); + if(vname_res != name_len) + throw std::runtime_error("npz_load: failed fread"); + vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy + + //read in the extra field + uint16_t extra_field_len = *(uint16_t*) &local_header[28]; + fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field + + uint16_t compr_method = *reinterpret_cast(&local_header[0]+8); + uint32_t compr_bytes = *reinterpret_cast(&local_header[0]+18); + uint32_t uncompr_bytes = *reinterpret_cast(&local_header[0]+22); + + if(vname == varname) { + NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes); + fclose(fp); + return array; + } + else { + //skip past the data + uint32_t size = *(uint32_t*) &local_header[22]; + fseek(fp,size,SEEK_CUR); + } + } + + fclose(fp); + + //if we get here, we haven't found the variable in the file + throw std::runtime_error("npz_load: Variable name "+varname+" not found in "+fname); +} + +cnpy::NpyArray cnpy::npy_load(std::string fname) { + + FILE* fp = fopen(fname.c_str(), "rb"); + + if(!fp) throw std::runtime_error("npy_load: Unable to open file "+fname); + + NpyArray arr = load_the_npy_file(fp); + + fclose(fp); + return arr; +} + + + diff --git a/include/cnpy/cnpy.h b/include/cnpy/cnpy.h new file mode 100644 index 0000000..0d3bb4c --- /dev/null +++ b/include/cnpy/cnpy.h @@ -0,0 +1,269 @@ +//Copyright (C) 2011 Carl Rogers +//Released under MIT License +//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php + +#ifndef LIBCNPY_H_ +#define LIBCNPY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cnpy { + + struct NpyArray { + NpyArray(const std::vector& _shape, size_t _word_size, bool _fortran_order) : + shape(_shape), word_size(_word_size), fortran_order(_fortran_order) + { + num_vals = 1; + for(size_t i = 0;i < shape.size();i++) num_vals *= shape[i]; + data_holder = std::shared_ptr>( + new std::vector(num_vals * word_size)); + } + + NpyArray() : shape(0), word_size(0), fortran_order(0), num_vals(0) { } + + template + T* data() { + return reinterpret_cast(&(*data_holder)[0]); + } + + template + const T* data() const { + return reinterpret_cast(&(*data_holder)[0]); + } + + template + std::vector as_vec() const { + const T* p = data(); + return std::vector(p, p+num_vals); + } + + size_t num_bytes() const { + return data_holder->size(); + } + + std::shared_ptr> data_holder; + std::vector shape; + size_t word_size; + bool fortran_order; + size_t num_vals; + }; + + using npz_t = std::map; + + char BigEndianTest(); + char map_type(const std::type_info& t); + template std::vector create_npy_header(const std::vector& shape); + void parse_npy_header(FILE* fp,size_t& word_size, std::vector& shape, bool& fortran_order); + void parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector& shape, bool& fortran_order); + void parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset); + npz_t npz_load(std::string fname); + NpyArray npz_load(std::string fname, std::string varname); + NpyArray npy_load(std::string fname); + + template std::vector& operator+=(std::vector& lhs, const T rhs) { + //write in little endian + for(size_t byte = 0; byte < sizeof(T); byte++) { + char val = *((char*)&rhs+byte); + lhs.push_back(val); + } + return lhs; + } + + template<> std::vector& operator+=(std::vector& lhs, const std::string rhs); + template<> std::vector& operator+=(std::vector& lhs, const char* rhs); + + + template void npy_save(std::string fname, const T* data, const std::vector shape, std::string mode = "w") { + FILE* fp = NULL; + std::vector true_data_shape; //if appending, the shape of existing + new data + + if(mode == "a") fp = fopen(fname.c_str(),"r+b"); + + if(fp) { + //file exists. we need to append to it. read the header, modify the array size + size_t word_size; + bool fortran_order; + parse_npy_header(fp,word_size,true_data_shape,fortran_order); + assert(!fortran_order); + + if(word_size != sizeof(T)) { + std::cout<<"libnpy error: "< header = create_npy_header(true_data_shape); + size_t nels = std::accumulate(shape.begin(),shape.end(),1,std::multiplies()); + + fseek(fp,0,SEEK_SET); + fwrite(&header[0],sizeof(char),header.size(),fp); + fseek(fp,0,SEEK_END); + fwrite(data,sizeof(T),nels,fp); + fclose(fp); + } + + template void npz_save(std::string zipname, std::string fname, const T* data, const std::vector& shape, std::string mode = "w") + { + //first, append a .npy to the fname + fname += ".npy"; + + //now, on with the show + FILE* fp = NULL; + uint16_t nrecs = 0; + size_t global_header_offset = 0; + std::vector global_header; + + if(mode == "a") fp = fopen(zipname.c_str(),"r+b"); + + if(fp) { + //zip file exists. we need to add a new npy file to it. + //first read the footer. this gives us the offset and size of the global header + //then read and store the global header. + //below, we will write the the new data at the start of the global header then append the global header and footer below it + size_t global_header_size; + parse_zip_footer(fp,nrecs,global_header_size,global_header_offset); + fseek(fp,global_header_offset,SEEK_SET); + global_header.resize(global_header_size); + size_t res = fread(&global_header[0],sizeof(char),global_header_size,fp); + if(res != global_header_size){ + throw std::runtime_error("npz_save: header read error while adding to existing zip"); + } + fseek(fp,global_header_offset,SEEK_SET); + } + else { + fp = fopen(zipname.c_str(),"wb"); + } + + std::vector npy_header = create_npy_header(shape); + + size_t nels = std::accumulate(shape.begin(),shape.end(),1,std::multiplies()); + size_t nbytes = nels*sizeof(T) + npy_header.size(); + + //get the CRC of the data to be added + uint32_t crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size()); + crc = crc32(crc,(uint8_t*)data,nels*sizeof(T)); + + //build the local header + std::vector local_header; + local_header += "PK"; //first part of sig + local_header += (uint16_t) 0x0403; //second part of sig + local_header += (uint16_t) 20; //min version to extract + local_header += (uint16_t) 0; //general purpose bit flag + local_header += (uint16_t) 0; //compression method + local_header += (uint16_t) 0; //file last mod time + local_header += (uint16_t) 0; //file last mod date + local_header += (uint32_t) crc; //crc + local_header += (uint32_t) nbytes; //compressed size + local_header += (uint32_t) nbytes; //uncompressed size + local_header += (uint16_t) fname.size(); //fname length + local_header += (uint16_t) 0; //extra field length + local_header += fname; + + //build global header + global_header += "PK"; //first part of sig + global_header += (uint16_t) 0x0201; //second part of sig + global_header += (uint16_t) 20; //version made by + global_header.insert(global_header.end(),local_header.begin()+4,local_header.begin()+30); + global_header += (uint16_t) 0; //file comment length + global_header += (uint16_t) 0; //disk number where file starts + global_header += (uint16_t) 0; //internal file attributes + global_header += (uint32_t) 0; //external file attributes + global_header += (uint32_t) global_header_offset; //relative offset of local file header, since it begins where the global header used to begin + global_header += fname; + + //build footer + std::vector footer; + footer += "PK"; //first part of sig + footer += (uint16_t) 0x0605; //second part of sig + footer += (uint16_t) 0; //number of this disk + footer += (uint16_t) 0; //disk where footer starts + footer += (uint16_t) (nrecs+1); //number of records on this disk + footer += (uint16_t) (nrecs+1); //total number of records + footer += (uint32_t) global_header.size(); //nbytes of global headers + footer += (uint32_t) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array + footer += (uint16_t) 0; //zip file comment length + + //write everything + fwrite(&local_header[0],sizeof(char),local_header.size(),fp); + fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp); + fwrite(data,sizeof(T),nels,fp); + fwrite(&global_header[0],sizeof(char),global_header.size(),fp); + fwrite(&footer[0],sizeof(char),footer.size(),fp); + fclose(fp); + } + + template void npy_save(std::string fname, const std::vector data, std::string mode = "w") { + std::vector shape; + shape.push_back(data.size()); + npy_save(fname, &data[0], shape, mode); + } + + template void npz_save(std::string zipname, std::string fname, const std::vector data, std::string mode = "w") { + std::vector shape; + shape.push_back(data.size()); + npz_save(zipname, fname, &data[0], shape, mode); + } + + template std::vector create_npy_header(const std::vector& shape) { + + std::vector dict; + dict += "{'descr': '"; + dict += BigEndianTest(); + dict += map_type(typeid(T)); + dict += std::to_string(sizeof(T)); + dict += "', 'fortran_order': False, 'shape': ("; + dict += std::to_string(shape[0]); + for(size_t i = 1;i < shape.size();i++) { + dict += ", "; + dict += std::to_string(shape[i]); + } + if(shape.size() == 1) dict += ","; + dict += "), }"; + //pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n + int remainder = 16 - (10 + dict.size()) % 16; + dict.insert(dict.end(),remainder,' '); + dict.back() = '\n'; + + std::vector header; + header += (char) 0x93; + header += "NUMPY"; + header += (char) 0x01; //major version of numpy format + header += (char) 0x00; //minor version of numpy format + header += (uint16_t) dict.size(); + header.insert(header.end(),dict.begin(),dict.end()); + + return header; + } + + +} + +#endif diff --git a/include/cnpy/example1.cpp b/include/cnpy/example1.cpp new file mode 100644 index 0000000..70ac5aa --- /dev/null +++ b/include/cnpy/example1.cpp @@ -0,0 +1,55 @@ +#include"cnpy.h" +#include +#include +#include +#include +#include + +const int Nx = 128; +const int Ny = 64; +const int Nz = 32; + +int main() +{ + //set random seed so that result is reproducible (for testing) + srand(0); + //create random data + std::vector> data(Nx*Ny*Nz); + for(int i = 0;i < Nx*Ny*Nz;i++) data[i] = std::complex(rand(),rand()); + + //save it to file + cnpy::npy_save("arr1.npy",&data[0],{Nz,Ny,Nx},"w"); + + //load it into a new array + cnpy::NpyArray arr = cnpy::npy_load("arr1.npy"); + std::complex* loaded_data = arr.data>(); + + //make sure the loaded data matches the saved data + assert(arr.word_size == sizeof(std::complex)); + assert(arr.shape.size() == 3 && arr.shape[0] == Nz && arr.shape[1] == Ny && arr.shape[2] == Nx); + for(int i = 0; i < Nx*Ny*Nz;i++) assert(data[i] == loaded_data[i]); + + //append the same data to file + //npy array on file now has shape (Nz+Nz,Ny,Nx) + cnpy::npy_save("arr1.npy",&data[0],{Nz,Ny,Nx},"a"); + + //now write to an npz file + //non-array variables are treated as 1D arrays with 1 element + double myVar1 = 1.2; + char myVar2 = 'a'; + cnpy::npz_save("out.npz","myVar1",&myVar1,{1},"w"); //"w" overwrites any existing file + cnpy::npz_save("out.npz","myVar2",&myVar2,{1},"a"); //"a" appends to the file we created above + cnpy::npz_save("out.npz","arr1",&data[0],{Nz,Ny,Nx},"a"); //"a" appends to the file we created above + + //load a single var from the npz file + cnpy::NpyArray arr2 = cnpy::npz_load("out.npz","arr1"); + + //load the entire npz file + cnpy::npz_t my_npz = cnpy::npz_load("out.npz"); + + //check that the loaded myVar1 matches myVar1 + cnpy::NpyArray arr_mv1 = my_npz["myVar1"]; + double* mv1 = arr_mv1.data(); + assert(arr_mv1.shape.size() == 1 && arr_mv1.shape[0] == 1); + assert(mv1[0] == myVar1); +} diff --git a/include/cnpy/mat2npz b/include/cnpy/mat2npz new file mode 100644 index 0000000..c1bbd3e --- /dev/null +++ b/include/cnpy/mat2npz @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import sys +from numpy import savez +from scipy.io import loadmat + +assert len(sys.argv) > 1 + +files = sys.argv[1:] + +for f in files: + mat_vars = loadmat(f) + mat_vars.pop('__version__') + mat_vars.pop('__header__') + mat_vars.pop('__globals__') + + fn = f.replace('.mat','.npz') + savez(fn,**mat_vars) diff --git a/include/cnpy/npy2mat b/include/cnpy/npy2mat new file mode 100644 index 0000000..fdd5da7 --- /dev/null +++ b/include/cnpy/npy2mat @@ -0,0 +1,15 @@ +#!/usr/bin/env python + +import sys +from numpy import load +from scipy.io import savemat + +assert len(sys.argv) > 1 + +files = sys.argv[1:] + +for f in files: + data = load(f) + fn = f.replace('.npy','') + fn = fn.replace('.','_') + savemat(fn,{fn : data}) diff --git a/include/cnpy/npz2mat b/include/cnpy/npz2mat new file mode 100755 index 0000000..2e317d6 --- /dev/null +++ b/include/cnpy/npz2mat @@ -0,0 +1,15 @@ +#!/usr/bin/env python + +import sys +from numpy import load +from scipy.io import savemat + +assert len(sys.argv) > 1 + +files = sys.argv[1:] + +for f in files: + data = load(f) + fn = f.replace('.npz','') + fn = fn.replace('.','_') #matlab cant handle dots + savemat(fn,data) diff --git a/include/kmeans_gpu.h b/include/kmeans_gpu.h index 63fdab0..9bdffc4 100644 --- a/include/kmeans_gpu.h +++ b/include/kmeans_gpu.h @@ -1,7 +1,39 @@ #pragma once +#include void kmeans_gpu(const float* h_data, float* h_centroids, int* h_assignments, int N, int D, int K, int max_iter); +__global__ void calculateDistances(const float *data, const float *centroids, float *distances, int num_vectors, int num_centroids, int dim); + +__global__ void normalize_centroids_kernel( + float* centroids, // [K × D] centroid sums + const int* counts, // [K] number of points in each cluster + int K, int D); + +// Assign each point to nearest centroid +__global__ void assign_clusters_kernel(const float *distances, + int *assignments, int N, int K); + +// Update centroids by summing points per cluster +__global__ void update_centroids_kernel(const float *data, const int *assignments, + float *new_centroids, int *counts, + int N, int D, int K); + +__global__ void build_inverted_lists_kernel( + const float* data, + const int* assignments, + float** inverted_lists, + int* list_sizes, + int N, int D); + +__global__ void create_all_vectors_flat_kernel( + const float* data, // [num_vectors x dim] + const int* assignments, // [num_vectors] + const int* cluster_offsets, // [num_centroids] + int* cluster_sizes, // [num_centroids] + float* all_vectors_flat, // [num_vectors x dim] + int num_vectors, + int dim); diff --git a/include/load_bin_to_gpu.h b/include/load_bin_to_gpu.h new file mode 100644 index 0000000..984259e --- /dev/null +++ b/include/load_bin_to_gpu.h @@ -0,0 +1,44 @@ +#pragma once +#include +#include +#include + +// Loads a float32 .bin file and copies it to GPU +// Returns: pointer to device memory, and sets N_out = num vectors, D_out = dim per vector +float* load_bin_to_gpu(const std::string& filename, int expected_dim, int& N_out, int& D_out) { + std::ifstream fin(filename, std::ios::binary); + if (!fin) { + std::cerr << "[ERROR] Could not open " << filename << "\n"; + return nullptr; + } + + // Determine size + fin.seekg(0, std::ios::end); + size_t total_bytes = fin.tellg(); + fin.seekg(0, std::ios::beg); + + size_t total_floats = total_bytes / sizeof(float); + if (total_floats % expected_dim != 0) { + std::cerr << "[ERROR] File size is not divisible by expected dim " << expected_dim << "\n"; + return nullptr; + } + + N_out = total_floats / expected_dim; + D_out = expected_dim; + + std::cout << "[INFO] Loading " << N_out << " vectors of dim " << D_out << " from: " << filename << "\n"; + + // Read into host buffer + float* h_data = new float[total_floats]; + fin.read(reinterpret_cast(h_data), total_bytes); + fin.close(); + + // Allocate and copy to device + float* d_data = nullptr; + cudaMalloc(&d_data, total_bytes); + cudaMemcpy(d_data, h_data, total_bytes, cudaMemcpyHostToDevice); + + delete[] h_data; + return d_data; +} + diff --git a/kmeans_output/ivf_centroids.bin b/kmeans_output/ivf_centroids.bin new file mode 100644 index 0000000..591adf9 Binary files /dev/null and b/kmeans_output/ivf_centroids.bin differ diff --git a/kmeans_output/ivf_centroids.npy b/kmeans_output/ivf_centroids.npy new file mode 100644 index 0000000..34c5be8 Binary files /dev/null and b/kmeans_output/ivf_centroids.npy differ diff --git a/kmeans_output/ivf_debug.txt b/kmeans_output/ivf_debug.txt new file mode 100644 index 0000000..9eded58 --- /dev/null +++ b/kmeans_output/ivf_debug.txt @@ -0,0 +1,300 @@ +Cluster 0 (57 vectors): +114 140 324 547 724 876 963 1105 1647 1648 1657 1673 1732 1817 1904 2534 3435 3822 4205 4307 ... + +Cluster 1 (41 vectors): +33 195 198 308 534 958 1003 1348 1360 1375 1380 1392 1418 1797 1836 1860 2049 2243 2251 2288 ... + +Cluster 2 (79 vectors): +0 2 101 257 434 519 543 555 608 619 727 784 879 887 1135 1216 1424 1483 1547 1590 ... + +Cluster 3 (55 vectors): +189 297 305 404 640 737 802 974 1081 1881 2247 2453 2480 2563 2983 3487 3741 3875 4030 4240 ... + +Cluster 4 (82 vectors): +595 930 1220 1256 1337 1641 1907 1924 2166 2246 2335 2431 2441 2486 2523 2581 2612 2621 2632 2669 ... + +Cluster 5 (124 vectors): +69 279 513 552 596 1001 1088 1145 1303 1477 1500 1654 1661 1675 1722 1820 1945 1956 2034 2167 ... + +Cluster 6 (70 vectors): +3 6 8 29 74 85 180 186 250 304 306 590 860 873 904 945 1039 1207 1247 1455 ... + +Cluster 7 (64 vectors): +386 745 881 1137 1394 1440 1570 1642 1819 1913 1976 2126 2322 2367 2546 2656 2813 2956 3054 3090 ... + +Cluster 8 (73 vectors): +4 18 20 234 254 256 383 480 527 533 603 654 890 902 910 962 971 1153 1179 1270 ... + +Cluster 9 (82 vectors): +9 11 30 212 266 353 379 416 574 620 650 728 729 936 1217 1365 1410 1456 1495 1591 ... + +Cluster 10 (42 vectors): +10 202 521 557 1047 1056 1439 1582 1857 1878 1923 2007 2076 2083 2340 2403 2660 2861 2863 2997 ... + +Cluster 11 (52 vectors): +384 454 528 591 763 1118 1262 1269 2045 2428 2757 2891 2894 2988 3129 3252 3305 3689 3748 3937 ... + +Cluster 12 (144 vectors): +1528 1696 1766 2047 2131 2195 2296 3167 3190 3554 3923 4054 4566 4574 4825 4849 4856 4866 4911 4958 ... + +Cluster 13 (89 vectors): +13 296 414 682 878 968 1180 1274 1289 1652 1781 1891 1994 2012 2027 2266 2386 2482 2522 2620 ... + +Cluster 14 (34 vectors): +1 14 391 594 722 1258 1304 1434 1542 1754 2470 2596 2786 2832 3651 3760 4029 4081 4218 4403 ... + +Cluster 15 (76 vectors): +15 23 222 295 425 501 536 824 950 1061 1298 1436 1471 1593 1804 2011 2014 2554 2593 2728 ... + +Cluster 16 (49 vectors): +16 27 160 372 621 715 1021 1284 1464 1480 1575 1594 1782 2424 2443 2549 2568 2680 3007 3277 ... + +Cluster 17 (53 vectors): +28 719 864 970 1043 1079 1267 1406 2094 2955 3036 3057 3371 3738 4109 4601 4748 4842 4851 4946 ... + +Cluster 18 (50 vectors): +456 751 820 830 993 1055 1275 1530 1560 1621 1730 2120 2123 2230 2401 2679 2831 3000 3048 3053 ... + +Cluster 19 (68 vectors): +230 255 589 655 692 759 926 1007 1060 1077 1168 1329 1592 1644 1655 1818 1882 1957 2031 2275 ... + +Cluster 20 (101 vectors): +70 233 394 409 413 959 969 1302 1361 1405 1478 1529 1600 1651 1674 1892 1917 2033 2350 2426 ... + +Cluster 21 (59 vectors): +21 431 593 1025 1190 1197 1257 1299 1371 1425 1539 1611 1629 2101 2487 2572 3128 3202 3597 3801 ... + +Cluster 22 (202 vectors): +199 228 243 280 360 465 492 502 505 532 616 631 643 695 702 735 762 800 812 843 ... + +Cluster 23 (99 vectors): +999 1277 2098 2402 2920 3238 3630 3958 4195 4258 4558 4602 4706 4711 4780 4890 5010 5071 5133 5155 ... + +Cluster 24 (201 vectors): +24 191 210 219 262 281 288 319 341 371 421 535 539 542 671 703 792 883 924 1004 ... + +Cluster 25 (203 vectors): +25 136 149 174 184 185 193 215 259 283 328 396 407 427 470 524 563 579 600 638 ... + +Cluster 26 (150 vectors): +22 26 43 270 370 418 444 462 504 529 531 564 573 609 610 668 676 747 755 756 ... + +Cluster 27 (37 vectors): +867 1431 1977 2020 2592 3021 3260 3662 3798 5069 5575 5929 5984 6308 6615 6619 6644 6693 6786 6884 ... + +Cluster 28 (62 vectors): +797 854 906 1040 1356 1408 1850 1861 1875 1979 2591 2610 2799 2978 3020 3239 3250 3259 3384 3607 ... + +Cluster 29 (85 vectors): +639 736 796 853 905 984 1018 1041 1080 1407 1411 1499 1549 1876 1880 2389 2452 2479 2562 2977 ... + +Cluster 30 (93 vectors): +931 952 1000 1221 1338 2013 2099 2196 2884 2919 3264 3631 3700 3819 3844 4257 4813 4893 4960 4993 ... + +Cluster 31 (93 vectors): +5 17 19 31 86 115 141 206 235 253 312 545 553 622 888 894 934 986 1020 1106 ... + +Cluster 32 (81 vectors): +32 103 143 157 378 475 520 725 836 935 1084 1121 1177 1330 1358 1391 1412 1472 1526 1650 ... + +Cluster 33 (64 vectors): +710 1075 1864 2090 2156 2600 2667 2691 2719 2883 2904 2986 3339 3586 3639 3820 3994 4228 4615 4675 ... + +Cluster 34 (122 vectors): +807 844 1162 2213 2244 2396 2966 3222 3888 4142 4187 4290 4298 5098 5202 5298 5662 5824 5851 5932 ... + +Cluster 35 (86 vectors): +344 570 644 780 795 834 998 1051 1140 1283 1384 1386 1597 1634 1738 2053 2055 2066 2088 2214 ... + +Cluster 36 (191 vectors): +130 135 148 173 192 216 282 316 317 327 406 422 428 450 469 478 494 522 525 526 ... + +Cluster 37 (129 vectors): +238 313 340 495 626 629 738 786 886 978 1111 1448 1505 1523 1694 1761 1785 1788 1840 1937 ... + +Cluster 38 (46 vectors): +716 754 1092 1438 2532 2567 3466 3851 5690 6140 6198 6232 6443 6653 7070 7163 7203 7401 7612 7629 ... + +Cluster 39 (40 vectors): +39 712 713 1194 1558 1938 2015 2127 2313 2569 2570 3263 3577 3629 3926 4418 7336 7568 7785 7880 ... + +Cluster 40 (138 vectors): +34 40 177 248 338 345 400 516 571 680 760 788 884 1109 1383 1428 1551 1842 2021 2043 ... + +Cluster 41 (78 vectors): +35 38 41 226 401 517 681 761 789 1120 1139 1227 1249 1843 2179 3498 3677 4262 4292 4456 ... + +Cluster 42 (133 vectors): +42 263 451 486 566 769 1070 1107 1148 1240 1292 1325 1352 1543 1556 1566 1646 1678 1705 1775 ... + +Cluster 43 (128 vectors): +381 497 862 982 1032 1166 1186 1266 1363 1416 1444 1557 1645 1679 1727 1736 1772 1801 1815 1862 ... + +Cluster 44 (100 vectors): +1017 1052 1065 1204 1389 1400 1712 1794 2103 2173 2181 2260 2363 2372 2533 2566 2683 2804 2933 3081 ... + +Cluster 45 (123 vectors): +169 336 392 435 667 804 861 939 987 1015 1097 1165 1205 1239 1245 1301 1403 1680 1799 1887 ... + +Cluster 46 (157 vectors): +94 150 171 240 264 315 346 363 448 567 601 663 723 742 773 840 922 954 964 1012 ... + +Cluster 47 (160 vectors): +109 121 126 137 151 172 176 188 232 239 302 303 314 321 420 423 445 449 507 675 ... + +Cluster 48 (97 vectors): +139 477 485 489 537 582 666 677 768 799 815 1072 1132 1176 1414 1476 1553 1563 1571 1711 ... + +Cluster 49 (87 vectors): +1236 1390 2136 2258 2684 2729 2899 3080 3162 3220 3425 3437 3654 3837 5347 5599 5699 5856 5881 5901 ... + +Cluster 50 (86 vectors): +50 153 178 183 258 268 323 584 635 774 828 845 855 898 900 965 1232 1316 1364 1508 ... + +Cluster 51 (48 vectors): +277 289 390 481 991 1133 1379 1746 1803 1950 2704 3184 3194 3254 3257 3650 3945 4032 4166 4337 ... + +Cluster 52 (44 vectors): +52 111 116 142 164 207 245 274 429 546 648 895 1349 1393 1798 2142 2594 2787 3881 4012 ... + +Cluster 53 (77 vectors): +53 307 331 377 459 490 499 515 707 829 871 892 1008 1122 1233 1359 1435 1555 1755 1768 ... + +Cluster 54 (102 vectors): +51 54 63 64 152 208 269 273 278 458 607 618 636 647 708 726 775 827 868 872 ... + +Cluster 55 (60 vectors): +55 175 325 453 474 498 554 632 718 778 851 877 961 967 985 1010 1023 1042 1083 2227 ... + +Cluster 56 (96 vectors): +405 659 731 875 975 1195 1297 1610 1639 1671 1701 2057 2122 2140 2500 2873 3006 3139 3272 3547 ... + +Cluster 57 (92 vectors): +12 71 732 1196 1251 1273 1276 1601 1640 1718 1890 1960 2058 2141 2256 2481 2608 2698 2707 2874 ... + +Cluster 58 (93 vectors): +58 214 267 365 373 424 441 472 658 714 874 891 944 1048 1098 1252 1296 1374 1398 1421 ... + +Cluster 59 (77 vectors): +113 168 333 550 562 649 814 1466 2333 2709 3049 3541 4469 4477 4508 4511 4519 4548 4583 4595 ... + +Cluster 60 (102 vectors): +60 92 112 158 165 167 246 252 290 311 326 592 653 688 1011 1013 1022 1224 1347 1465 ... + +Cluster 61 (85 vectors): +443 500 556 880 1054 1136 1225 1545 1658 1826 1965 2269 2324 2461 2557 2979 2998 3050 3483 3542 ... + +Cluster 62 (100 vectors): +59 62 91 359 689 1748 2084 2194 2556 2653 3546 3962 3978 4327 4479 4495 4506 4510 4512 4562 ... + +Cluster 63 (50 vectors): +623 711 911 1076 1113 2091 2155 2185 2598 2668 2692 2720 2853 2930 3340 3640 3658 4674 4781 4787 ... + +Cluster 64 (57 vectors): +709 770 1603 1931 1932 2954 3003 3690 3775 3921 4349 5193 5194 6462 6715 7142 7143 7168 7204 7205 ... + +Cluster 65 (49 vectors): +251 291 455 512 634 921 1511 1579 1637 1741 2291 2338 2599 2903 3018 3175 3579 3584 3728 3824 ... + +Cluster 66 (153 vectors): +7 61 66 104 105 117 118 119 161 163 181 204 205 356 357 1525 2268 3278 3666 4399 ... + +Cluster 67 (100 vectors): +83 1387 1469 1474 1708 1774 2063 2114 2192 2360 2397 2475 2716 2866 2871 3067 3285 3353 3390 3416 ... + +Cluster 68 (130 vectors): +129 656 741 946 1142 1230 1381 1475 1763 1793 1973 2165 2218 2361 2362 2451 2476 2491 2527 2565 ... + +Cluster 69 (209 vectors): +865 1209 1264 1527 1697 1765 1925 2048 2132 2239 2297 2432 2442 2789 3168 3191 3209 3555 3722 3924 ... + +Cluster 70 (136 vectors): +551 704 1002 1089 1208 1342 1653 1662 1723 1951 1955 1978 2169 2385 2521 2559 2835 2868 2892 2895 ... + +Cluster 71 (75 vectors): +588 657 753 1024 1214 1335 1814 2223 2332 2688 2857 2876 3444 3523 3564 3571 3671 3674 3708 3821 ... + +Cluster 72 (108 vectors): +72 220 236 241 446 580 752 766 908 956 1059 1171 1320 1336 1372 1376 1568 1922 1961 2224 ... + +Cluster 73 (58 vectors): +73 201 320 385 432 433 447 461 767 783 1170 1172 1314 1417 1670 1835 1872 1901 2037 2290 ... + +Cluster 74 (40 vectors): +223 518 604 1173 1422 1767 1810 1912 2190 2353 2780 2818 3015 4196 4406 4634 4878 4970 5070 5265 ... + +Cluster 75 (78 vectors): +75 134 187 196 388 399 652 705 776 810 960 1343 1397 1442 1636 1808 1888 1914 2032 2178 ... + +Cluster 76 (106 vectors): +76 568 791 819 826 1016 1185 1401 1667 1713 1771 1783 2161 2234 2259 2265 2344 2369 2383 2390 ... + +Cluster 77 (86 vectors): +56 77 179 332 334 351 354 491 548 561 575 730 813 937 1066 1169 1313 1441 1681 1690 ... + +Cluster 78 (77 vectors): +57 78 352 355 549 585 1067 1449 1524 1546 1595 1659 1691 1744 2070 2425 2595 2916 2980 3602 ... + +Cluster 79 (80 vectors): +79 110 159 162 166 182 203 247 335 514 744 765 893 1014 1019 1213 1260 1562 1692 2146 ... + +Cluster 80 (172 vectors): +138 209 322 361 625 947 1143 1334 1382 1430 1714 1740 1926 2118 2219 2309 2376 2508 2573 2674 ... + +Cluster 81 (90 vectors): +47 330 339 523 782 885 1035 1110 1429 1552 1776 2022 2036 2044 2277 2449 2498 2503 2574 2722 ... + +Cluster 82 (93 vectors): +65 82 294 426 460 831 1082 1152 1206 1395 1404 1409 1554 1588 1742 1827 1870 2010 2217 2298 ... + +Cluster 83 (182 vectors): +231 276 299 301 349 419 487 587 615 630 642 697 734 785 869 925 929 983 1033 1119 ... + +Cluster 84 (48 vectors): +84 1073 1388 4234 4363 5801 5809 5917 6002 6317 6532 6590 6699 6723 6759 6889 6969 7030 7069 7122 ... + +Cluster 85 (87 vectors): +265 366 380 387 457 466 473 701 764 777 809 837 889 901 920 1046 1063 1078 1128 1144 ... + +Cluster 86 (88 vectors): +211 367 442 581 700 866 907 955 972 973 1058 1115 1158 1198 1345 1351 1373 1481 1515 1559 ... + +Cluster 87 (102 vectors): +87 227 569 717 779 794 835 943 1141 1282 1385 1633 1668 1703 1974 2384 2394 2840 3005 3099 ... + +Cluster 88 (64 vectors): +88 100 221 237 242 374 511 558 633 674 927 1223 1321 1423 1450 1586 1813 2017 2528 2640 ... + +Cluster 89 (65 vectors): +89 213 343 951 1189 1234 1300 1487 1837 2067 2177 2215 2748 2803 2825 2865 3031 3089 3187 3212 ... + +Cluster 90 (66 vectors): +90 342 398 690 793 808 832 852 856 1114 1222 1396 1437 1567 1630 1721 1919 1975 2092 2102 ... + +Cluster 91 (78 vectors): +395 410 415 572 576 678 750 918 976 1104 1160 1278 1451 1550 1702 1724 1779 2184 2188 2485 ... + +Cluster 92 (85 vectors): +358 417 651 691 749 758 801 917 992 1006 1103 1259 1285 1350 1355 1538 1643 1656 1729 1778 ... + +Cluster 93 (159 vectors): +93 123 145 190 224 292 309 347 364 368 408 430 471 479 488 538 583 602 627 669 ... + +Cluster 94 (175 vectors): +261 285 310 318 337 369 393 436 612 646 670 673 685 772 821 825 988 1030 1049 1057 ... + +Cluster 95 (114 vectors): +95 244 249 362 645 693 771 1062 1218 1470 1709 1927 2310 2346 2962 3016 3084 3169 3427 3477 ... + +Cluster 96 (261 vectors): +96 97 102 106 107 108 120 122 124 125 127 128 131 132 133 144 146 147 154 155 ... + +Cluster 97 (238 vectors): +194 200 229 260 271 275 284 287 298 348 382 397 403 452 463 496 508 530 586 598 ... + +Cluster 98 (261 vectors): +36 44 46 48 67 80 98 170 329 375 437 439 468 509 559 605 665 720 858 896 ... + +Cluster 99 (189 vectors): +37 45 49 68 81 99 376 438 440 467 476 510 560 606 664 721 798 859 897 916 ... + diff --git a/kmeans_output/ivf_list_ids.npy b/kmeans_output/ivf_list_ids.npy new file mode 100644 index 0000000..4010439 Binary files /dev/null and b/kmeans_output/ivf_list_ids.npy differ diff --git a/kmeans_output/ivf_offsets.npy b/kmeans_output/ivf_offsets.npy new file mode 100644 index 0000000..81ac624 Binary files /dev/null and b/kmeans_output/ivf_offsets.npy differ diff --git a/scripts/debug_ivf_index.py b/scripts/debug_ivf_index.py new file mode 100644 index 0000000..fc3da44 --- /dev/null +++ b/scripts/debug_ivf_index.py @@ -0,0 +1,56 @@ +import numpy as np +import argparse + +def debug_ivf_index(list_ids_path, offsets_path, output_txt=None): + list_ids = np.load(list_ids_path) + offsets = np.load(offsets_path) + + K = len(offsets) - 1 + N = len(list_ids) + + print(f"[INFO] Loaded:") + print(f" - list_ids: {list_ids.shape}") + print(f" - offsets: {offsets.shape}") + print(f" - Number of clusters (K): {K}") + print(f" - Total assigned vectors: {N}\n") + + # Basic validity checks + assert np.sum(np.diff(offsets)) == N, "!!! Offsets don't match total vector count!" + assert list_ids.min() >= 0, "!!! Negative vector ID detected!" + assert len(np.unique(list_ids)) == len(list_ids), "!!! Duplicate vector IDs detected!" + + print("[CHECKED] Basic index checks passed.\n") + + # Print cluster sizes + print("[INFO] Cluster sizes:") + for i in range(K): + start, end = offsets[i], offsets[i + 1] + size = end - start + print(f" Cluster {i:3d}: {size:5d} vectors") + + # Print example IDs + print("\n[INFO] Sample cluster assignments:") + for i in range(min(5, K)): + start, end = offsets[i], offsets[i + 1] + ids = list_ids[start:end] + print(f" Cluster {i:3d}: {ids[:10]} ...") + + # Optional: Save to text file + if output_txt: + with open(output_txt, "w") as f: + for i in range(K): + start, end = offsets[i], offsets[i + 1] + ids = list_ids[start:end] + f.write(f"Cluster {i} ({len(ids)} vectors):\n") + f.write(" ".join(map(str, ids[:20])) + " ...\n\n") + print(f"\n[OUTPUT]] Saved readable output to: {output_txt}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--list_ids", default="data/processed/ivf_list_ids.npy", help="Path to ivf_list_ids.npy") + parser.add_argument("--offsets", default="data/processed/ivf_offsets.npy", help="Path to ivf_offsets.npy") + parser.add_argument("--out", help="Optional output .txt file") + args = parser.parse_args() + + debug_ivf_index(args.list_ids, args.offsets, args.out) + diff --git a/scripts/npy_to_bin.py b/scripts/npy_to_bin.py new file mode 100644 index 0000000..00d7eea --- /dev/null +++ b/scripts/npy_to_bin.py @@ -0,0 +1,19 @@ +import numpy as np +import argparse +import os + +def convert_npy_to_bin(npy_path, bin_path=None): + data = np.load(npy_path) + if bin_path is None: + bin_path = os.path.splitext(npy_path)[0] + ".bin" + data.astype(np.float32).tofile(bin_path) + print(f"Converted {npy_path} → {bin_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("npy_file", help="Path to .npy file") + parser.add_argument("--out", help="Optional output .bin path") + args = parser.parse_args() + + convert_npy_to_bin(args.npy_file, args.out) + diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt new file mode 100644 index 0000000..80f7ccb --- /dev/null +++ b/src/cuda/CMakeLists.txt @@ -0,0 +1,29 @@ +find_package(CUDA REQUIRED) + +add_library(cuda_vector_db SHARED distance.cu + index.cu select.cu + kmeans_gpu.cu + ) + +set_target_properties( cuda_vector_db + PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +set_target_properties(cuda_vector_db PROPERTIES POSITION_INDEPENDENT_CODE ON) + +# Pass options to ptxas using -Xptxas +target_compile_options(cuda_vector_db PRIVATE + $<$:-Xptxas -O3> # Pass -O3 optimization to ptxas + # $<$:-Xcompiler -O2> # Pass -O3 optimization to the host compiler + # $<$:-Xcompiler -Wall> # Enable all warnings for the host compiler +) + +target_compile_features(cuda_vector_db PUBLIC cxx_std_17) + +add_executable(kmeans_driver kmeans_driver.cpp) +target_link_libraries(kmeans_driver cuda_vector_db) +target_include_directories(cuda_vector_db PUBLIC "${CMAKE_CURRENT_BINARY_DIR}") +target_include_directories(cuda_vector_db PUBLIC "${PROJECT_SOURCE_DIR}/include") + + +include_directories("${CMAKE_CURRENT_BINARY_DIR}") +include_directories("${PROJECT_SOURCE_DIR}/include") diff --git a/src/cuda/IVFIndex.cu b/src/cuda/IVFIndex.cu new file mode 100644 index 0000000..c7da105 --- /dev/null +++ b/src/cuda/IVFIndex.cu @@ -0,0 +1,259 @@ +#include "kmeans_gpu.h" +#include "IVFIndex.h" +#include "distance.cuh" +#include +#include +#include // for std::shuffle +#include +#include "select.cu" +#include +#include + +IVFIndex::IVFIndex(int num_centroids, int num_vectors, int dim, int iterations) : num_centroids(num_centroids), + num_vectors(num_vectors), + dim(dim), + iterations(iterations) +{ + cudaMalloc(¢roids, num_centroids * dim * sizeof(float)); + cudaMalloc(&data, num_vectors * dim * sizeof(float)); + cudaMalloc(&assignments, num_vectors * sizeof(int)); + cudaMalloc(&distances, num_vectors * num_centroids * sizeof(float)); + cudaMalloc(&counts, num_centroids * sizeof(int)); + cudaMemset(counts, 0, num_centroids * sizeof(int)); + cudaMalloc(&d_inverted_lists, num_centroids * sizeof(float *)); + cudaMalloc(&d_list_sizes, num_centroids * sizeof(int)); + cudaMalloc(&d_cluster_offsets, (num_centroids + 1) * sizeof(int)); + cudaMemset(d_list_sizes, 0, num_centroids * sizeof(int)); + cudaMalloc(&d_all_vectors_flat, sizeof(float) * num_vectors * dim); +} + +IVFIndex::~IVFIndex() { + if (centroids) cudaFree(centroids); + if (distances) cudaFree(distances); + if (counts) cudaFree(counts); + if (data) cudaFree(data); + if (assignments) cudaFree(assignments); + if (d_inverted_lists) cudaFree(d_inverted_lists); + if (d_list_sizes) cudaFree(d_list_sizes); + if (d_cluster_offsets) cudaFree(d_cluster_offsets); + if (d_all_vectors_flat) cudaFree(d_all_vectors_flat); +} + +float* IVFIndex::getData() const { return data; } + +float* IVFIndex::getallvectorsflat() const { return d_all_vectors_flat; } + +float** IVFIndex::getInvertedLists() const { return d_inverted_lists; } + +int* IVFIndex::getClusterOffsets() const { return d_cluster_offsets; } + +float* IVFIndex::getCentroids() const { return centroids; } + +int* IVFIndex::getAssignments() const { return assignments; } + +int IVFIndex::getNumCentroids() const { return num_centroids; } + +int IVFIndex::getDim() const { return dim; } + +int IVFIndex::getNumVectors() const { return num_vectors; } + +void IVFIndex::train (const float* train_data){ + cudaMemcpy(data, train_data, num_vectors * dim * sizeof(float), cudaMemcpyHostToDevice); + + std::vector h_centroids(num_centroids * dim); + + // Set up random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, num_vectors - 1); + + // Optional: Keep track of selected indices if you want **unique** centroids + std::unordered_set selected_indices; + + int selected = 0; + while (selected < num_centroids) { + int idx = dis(gen); + if (selected_indices.find(idx) != selected_indices.end()) { + continue; // already selected this one + } + selected_indices.insert(idx); + + // Copy the vector from train_data + for (int j = 0; j < dim; j++) { + h_centroids[selected * dim + j] = train_data[idx * dim + j]; + } + selected++; + } + // printf("Print the centroids\n"); + // for (int i = 0; i < num_centroids; ++i) { + // printf("Centroid %d: ", i); + // for (int j = 0; j < dim; ++j) { + // printf("%f ", h_centroids[i * dim + j]); + // } + // printf("\n"); + // } + cudaMemcpy(centroids, h_centroids.data(), num_centroids * dim * sizeof(float), cudaMemcpyHostToDevice); + + for (int iter = 0; iter < iterations; ++iter) { + cudaMemset(counts, 0, num_centroids * sizeof(int)); + dim3 blockDim(16, 16); + dim3 gridDim((num_vectors + blockDim.x - 1) / blockDim.x, (num_centroids + blockDim.y - 1) / blockDim.y); + + calculateDistances<<>>(data, centroids, distances, num_vectors, num_centroids, dim); + cudaDeviceSynchronize(); + checkCudaError("calculateDistances"); + dim3 blockDim2(256); + dim3 gridDim2((num_vectors + blockDim2.x - 1) / blockDim2.x); + + assign_clusters_kernel<<>>(distances, assignments, num_vectors, num_centroids); + cudaDeviceSynchronize(); + checkCudaError("assign_clusters_kernel"); + + update_centroids_kernel<<>>(data, assignments, centroids, counts, num_vectors, dim, num_centroids); + cudaDeviceSynchronize(); + checkCudaError("update_centroids_kernel"); + + int threads = 256; + int blocks = (num_centroids * dim + threads - 1) / threads; + + normalize_centroids_kernel<<>>(centroids, counts, num_centroids, dim); + cudaDeviceSynchronize(); + checkCudaError("normalize_centroids_kernel"); + } + // Now we have the centroids, let's create the inverted lists + std::vector h_counts(num_centroids); + cudaMemcpy(h_counts.data(), counts, num_centroids * sizeof(int), cudaMemcpyDeviceToHost); + + std::vector cluster_offsets(num_centroids + 1, 0); + cluster_offsets[0] = 0; + printf("Cluster offsets: "); + for (int i = 1; i <= num_centroids; ++i) { + cluster_offsets[i] = cluster_offsets[i-1] + h_counts[i-1]; + printf("%d ", cluster_offsets[i]); + } + + // Upload to device + + cudaMemcpy(d_cluster_offsets, cluster_offsets.data(), (num_centroids+1) * sizeof(int), cudaMemcpyHostToDevice); + int* d_cluster_sizes; + cudaMalloc(&d_cluster_sizes, sizeof(int) * num_centroids); + cudaMemset(d_cluster_sizes, 0, sizeof(int) * num_centroids); + int block_size = 256; + int grid_size = (num_vectors + block_size - 1) / block_size; + create_all_vectors_flat_kernel<<>>( + data, + assignments, + d_cluster_offsets, + d_cluster_sizes, + d_all_vectors_flat, + num_vectors, + dim + ); + cudaDeviceSynchronize(); + // for (int i = 0; i < num_centroids; i++) { + // float* list; + // cudaMalloc(&list, num_vectors * dim * sizeof(float)); // Upper bound + // cudaMemcpy(&d_inverted_lists[i], &list, sizeof(float*), cudaMemcpyHostToDevice); + // } + + // int threads = 256; + // int blocks = (num_vectors + threads - 1) / threads; + + // build_inverted_lists_kernel<<>>(data, assignments, d_inverted_lists, d_list_sizes, num_vectors, dim); + // cudaDeviceSynchronize(); + // checkCudaError("build_inverted_lists_kernel"); + + // Now clean temporary memory + cudaFree(assignments); + assignments = nullptr; + + cudaFree(distances); + distances = nullptr; + + cudaFree(counts); + counts = nullptr; + + cudaFree(data); // because we copied into inverted lists + data = nullptr; + + cudaFree(d_cluster_sizes); + d_cluster_sizes = nullptr; + +} + +void IVFIndex::checkCudaError(const std::string& kernelName) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA Error after %s: %s\n", kernelName.c_str(), cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +void IVFIndex::search_coarse_grained(thrust::device_vector &queries, + thrust::device_vector &queries_norm, + thrust::device_vector *coarse_grained_output, + uint32_t centroids_to_select) { + thrust::device_vector output_distances; + const int block_size = 16; + // TODO(): Compute tiles of product instead of computing all + // Perform tiled matrix multiplication + thrust::device_vector product; + float* q = thrust::raw_pointer_cast(queries.data()); + float* p = thrust::raw_pointer_cast(product.data()); + matrix_mul_with_transpose(q, centroids, dim, p, queries.size()/dim, num_centroids, block_size); + + // Call L2SelectMin + thrust::device_vector centroid_norms; + thrust::device_vector centroids_vector(centroids, centroids + num_centroids); + compute_l2_norm(centroids_vector, ¢roid_norms, dim); + select_min(product, centroids_vector, centroid_norms, dim, queries.size()/dim, *coarse_grained_output, centroids_to_select); +} + +void IVFIndex::search_fine_grained(thrust::device_vector &queries, thrust::device_vector &queries_norm, + thrust::device_vector &coarse_grained_output, + thrust::device_vector *fine_grained_output, + uint32_t k) { + /** + * Inputs: + * 1. queries + * 2. inverted index (from the class) + * - data points (a single buffer) + * - offsets into the data points array + * 3. Coarse grained output: centroid ids + * Steps: + * 1. Calculate distances + * 2. Select min + */ + uint32_t nqueries = queries.size() / dim; + // TODO(): Allocate memory for output_distances + thrust::device_vector output_distances; + float* vectors = IVFIndex::getallvectorsflat(); + thrust::device_vector ivf_data_points(vectors, vectors + IVFIndex::getNumVectors()); + int* offsets = IVFIndex::getClusterOffsets(); + thrust::device_vector ivf_offsets(offsets, offsets + IVFIndex::getNumCentroids() + 1); + calc_distances_for_fine_grained_search(ivf_data_points, ivf_offsets, queries, coarse_grained_output, + output_distances, dim); +} + +thrust::device_vector IVFIndex::search_batch(thrust::device_vector &queries, uint32_t k) { + thrust::device_vector coarse_grained_output; + // TODO(): Discuss + uint32_t centroids_to_select = (uint32_t)sqrt(k); + + // Compute L2 norm for queries + thrust::device_vector queries_norm; + compute_l2_norm(queries, &queries_norm, dim); + + search_coarse_grained(queries, queries_norm, &coarse_grained_output, centroids_to_select); + + thrust::device_vector fine_grained_output; + search_fine_grained(queries, queries_norm, coarse_grained_output, &fine_grained_output, k); + + // Cast fine_grained_output to IndexType + thrust::device_vector results(fine_grained_output.size()); + thrust::transform(fine_grained_output.begin(), fine_grained_output.end(), results.begin(), thrust::placeholders::_1); + + return results; +} + + diff --git a/src/cuda/distance.cu b/src/cuda/distance.cu index a2644b9..00ab03b 100644 --- a/src/cuda/distance.cu +++ b/src/cuda/distance.cu @@ -1,4 +1,4 @@ -#include +#include "distance.cuh" #include __global__ void compute_l2_distances( @@ -24,3 +24,223 @@ __global__ void compute_l2_distances( distances[q_idx * num_bases + b_idx] = sum; } + + +constexpr int ROW_TILE_SIZE = 8; +constexpr int BLOCK_SIZE = 128; + +__global__ void compute_l2_norm_kernel(const float *input, + float *output, int ndim, int input_size) { + extern __shared__ float s[]; + float *sum_staging = &s[ROW_TILE_SIZE]; + int global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_start = global_thread_id * ROW_TILE_SIZE; + + if (threadIdx.x < ROW_TILE_SIZE) { + s[threadIdx.x] = 0; + } + __syncthreads(); + + #pragma unroll + for (int i=0; i= input_size) { + break; + } + float sum = 0.0f; + for (int j=threadIdx.x; j 0; j>>=1) { + if (threadIdx.x < j) { + sum_staging[threadIdx.x] += sum_staging[threadIdx.x + j]; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + s[i] = sum_staging[0]; + } + } + + if (threadIdx.x == 0) { + for (int i=0; i &input, + thrust::device_vector *output, int ndim) { + output = new thrust::device_vector(input.size()/ndim); + int num_threads = (input.size() + 7)/ROW_TILE_SIZE; + int num_blocks = (num_threads + BLOCK_SIZE - 1)/BLOCK_SIZE; + int shared_mem_size = (ROW_TILE_SIZE + BLOCK_SIZE) * sizeof(float); + compute_l2_norm_kernel<<>>(thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output->data()), ndim, input.size()); +} + + +/** + * CUDA kernel to calculate C = A * B(T) + * Dimensions of matrix A: (la * ndim) + * Dimensions of matrix B: (lb * ndim) + * + * This kernel performs matrix multiplication where the second matrix (B) is transposed. + * It uses shared memory to load tiles of both matrices A and B for efficient computation. + * The key difference from `matrix_mul_kernel` is that this kernel transposes matrix B + * on-the-fly while loading it into shared memory, enabling computation of A * B(T). + * This is particularly useful when the transpose of B is required for the operation. + */ +template +__global__ void matrix_mul_kernel_with_transpose(const float* A, const float *B, int ndim, + float *C, int lb, int la) { + __shared__ float a_tile_shared[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float b_tile_shared[BLOCK_SIZE][BLOCK_SIZE]; + + // x -> vertical dimension + // y -> horizontal dimension + // ndim -> number of dimensions + int bx = blockIdx.x; + int by = blockIdx.y; + int tx = threadIdx.x; + int ty = threadIdx.y; + + int a_begin_offset = bx * BLOCK_SIZE * ndim; + int b_begin_offset = by * BLOCK_SIZE * ndim; + + int a_end_offset = a_begin_offset + ndim; + + float sum = 0.0f; + + for (int i=a_begin_offset, j=b_begin_offset; i<<>>(A, B, ndim, C, lb, la); + + case 16: + matrix_mul_kernel_with_transpose<16><<>>(A, B, ndim, C, lb, la); + + case 32: + matrix_mul_kernel_with_transpose<32><<>>(A, B, ndim, C, lb, la); +} +} + + +__device__ __forceinline__ int get_lane_id() { + int lane_id; + asm("mov.u32 %0, %%laneid;" : "=r"(lane_id)); + return lane_id; +} + +__device__ inline float warp_reduce_sum(float &s) { + for (int offset = 16; offset > 0; offset /= 2) + s += __shfl_down_sync(0xFFFFFFFF, s, offset); +} + +__global__ void calc_fine_grained_distance_kernel(float *queries, float *ivf_data_points, + int64_t *ivf_offsets, int64_t *ivf_indexes, float *output_distances, + int64_t *coarse_grained_output, int64_t *data_points_per_query, + int64_t *indexes_per_query, uint32_t centroids_per_query, uint32_t ndim, + uint32_t nqueries, int64_t max_ivf_list_length + ) { + // Each warp handles a single query point + uint32_t query_id = (blockDim.x * blockIdx.x + threadIdx.x)/32; + if (query_id >= nqueries) return; + uint32_t query_offset = query_id * ndim; + uint32_t centroid_id_start_offset = query_id * centroids_per_query; + // Hold number of data points per query + int64_t query_ctr = 0ll; + for (uint32_t i=0; i &ivf_data_points, + thrust::device_vector &ivf_offsets, thrust::device_vector &ivf_indexes, + thrust::device_vector &queries, + thrust::device_vector &coarse_grained_output, + thrust::device_vector &output_distances, + thrust::device_vector &output_ids, + thrust::device_vector &output_offsets, + int ndim, int64_t max_ivf_list_length) { + uint32_t nqueries = queries.size()/ndim; + uint32_t query_tile_size = 32; + uint32_t centroids_per_query = coarse_grained_output.size()/nqueries; + uint32_t BLOCK_SIZE = 256; + thrust::device_vector points_per_query (nqueries); + for (uint32_t i=0; i>>(queries_slice, thrust::raw_pointer_cast(ivf_data_points.data()), + thrust::raw_pointer_cast(ivf_offsets.data()), thrust::raw_pointer_cast(ivf_indexes.data()), + output_distances_slice, + coarse_grained_output_slice, + points_per_query_slice, + output_ids_slice, + centroids_per_query, + ndim, nqueries, max_ivf_list_length); + } + thrust::inclusive_scan(points_per_query.begin(), points_per_query.end(), output_offsets.begin()); +} \ No newline at end of file diff --git a/src/cuda/distance.cuh b/src/cuda/distance.cuh new file mode 100644 index 0000000..6730231 --- /dev/null +++ b/src/cuda/distance.cuh @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +__global__ void compute_l2_distances( + const float* __restrict__ queries, // [num_queries x dim] + const float* __restrict__ bases, // [num_bases x dim] + float* __restrict__ distances, // [num_queries x num_bases] + int num_queries, + int num_bases, + int dim); + +__global__ void compute_l2_norm_kernel(const float *input, + float *output, int ndim, int input_size); + +void compute_l2_norm(thrust::device_vector &input, + thrust::device_vector *output, int ndim); + + +/** + * @brief Computes the matrix multiplication of matrix A and the transpose of matrix B. + * + * This function performs the multiplication of a matrix A with the transpose of + * matrix B. Both matrices are assumed to be stored in row-major order. + * + * @param A Pointer to the first input matrix (matrix A). + * @param B Pointer to the second input matrix (matrix B). + * @param ndim The dimension of the matrices A and B (number of rows/columns). + */ +void matrix_mul_with_transpose(const float* A, const float *B, int ndim, + float *C, int la, int lb, int block_size); + + +/** + * @brief Calculates distances for a fine-grained search using IVF (Inverted File Index) data points. + * + * @param ivf_data_points A reference to a Thrust device vector containing the IVF data points. + * These data points are used to compute distances during the search process. + * + * This function is designed to operate on GPU memory and leverages CUDA and Thrust for efficient + * parallel computation. It is typically used in vector database applications where fine-grained + * distance calculations are required for nearest neighbor search or similar tasks. + */ +void calc_distances_for_fine_grained_search(thrust::device_vector &ivf_data_points, + thrust::device_vector &ivf_offsets, thrust::device_vector &ivf_indexes, + thrust::device_vector &queries, + thrust::device_vector &coarse_grained_output, + thrust::device_vector &output_distances, + thrust::device_vector &output_ids, + thrust::device_vector &output_offsets, + int ndim, int64_t max_ivf_list_length); \ No newline at end of file diff --git a/src/cuda/heap.cuh b/src/cuda/heap.cuh new file mode 100644 index 0000000..da4e1ac --- /dev/null +++ b/src/cuda/heap.cuh @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include "sort.cuh" + +constexpr int WARP_SIZE = 32; + + +template +class GPUHeap { +public: + __device__ GPUHeap() { + #pragma unroll + for (int i=0; i=0; i--) { + thread_queue_values[i] = thread_queue_values[i-1]; + thread_queue_keys[i] = thread_queue_keys[i-1]; + } + + thread_queue_values[0] = value; + thread_queue_keys[0] = key; + num_candidates++; + } + bool sort_required = (num_candidates == THREAD_QUEUE_SIZE); + sort_required = __any_sync(0xFFFFFFFF, sort_required); + // Sort if atleast one thread requires a sort + if (!sort_required) { + return; + } + + merge_warp_queues(); + } + + // Merge warp queue and thread queues using bitonic sort + __device__ void merge_warp_queues() { + // Sort the thread queues while interpreting them as one lane-strided register array + sort_odd(thread_queue_values, thread_queue_keys); + + __syncwarp(); + + // Merge the warp queue (already sorted) with the thread queue + merge_odd(warp_queue_values, thread_queue_values, warp_queue_keys, thread_queue_keys); + __syncwarp(); + + num_candidates = 0; + } + + __device__ void dump(int64_t *out_keys, int k) { + int lane_id = get_lane_id(); + + #pragma unroll + for (int i=0; i + +void IVFIndex::search_coarse_grained(thrust::device_vector &queries, + thrust::device_vector &queries_norm, + thrust::device_vector *coarse_grained_output, + uint32_t centroids_to_select) { + thrust::device_vector output_distances; + const int block_size = 16; + // TODO(): Compute tiles of product instead of computing all + // Perform tiled matrix multiplication + thrust::device_vector product; + matrix_mul_with_transpose(thrust::raw_pointer_cast(queries.data()), thrust::raw_pointer_cast(centroids.data()), ndim, + thrust::raw_pointer_cast(product.data()), queries.size()/ndim, centroids.size()/ndim, block_size); + + // Call L2SelectMin + select_min(product, centroids, centroid_norms, ndim, queries.size()/ndim, *coarse_grained_output, centroids_to_select); +} + +void IVFIndex::search_fine_grained(thrust::device_vector &queries, thrust::device_vector &queries_norm, + thrust::device_vector &coarse_grained_output, + thrust::device_vector &output_distances, + thrust::device_vector &output_indexes, + uint32_t k) { + uint32_t nqueries = queries.size() / ndim; + thrust::device_vector distances (max_ivf_list_length * nqueries); + thrust::device_vector indexes (max_ivf_list_length * nqueries); + // Number of points over which we are going to run the fine-grained search for each query + thrust::device_vector offsets (nqueries + 1); + calc_distances_for_fine_grained_search(ivf_data_points, ivf_offsets, ivf_indices, queries, + coarse_grained_output, distances, indexes, offsets, ndim, max_ivf_list_length); + // output_distances is grouped per query. We will stream these distances through a select kernel + select_fine_grained(distances, indexes, offsets, output_distances, output_indexes, ndim, nqueries, k); +} + +thrust::device_vector IVFIndex::search_batch(thrust::device_vector &queries, uint32_t k) { + thrust::device_vector coarse_grained_output; + uint32_t centroids_to_select = (uint32_t)sqrt(k); + + // Compute L2 norm for queries + thrust::device_vector queries_norm; + compute_l2_norm(queries, &queries_norm, ndim); + + search_coarse_grained(queries, queries_norm, &coarse_grained_output, centroids_to_select); + + thrust::device_vector output_distances(k * queries.size()/ndim); + thrust::device_vector output_indexes(k * queries.size()/ndim); + search_fine_grained(queries, queries_norm, coarse_grained_output, + output_distances, output_indexes, k); + + return output_indexes; +} \ No newline at end of file diff --git a/src/cuda/index.cuh b/src/cuda/index.cuh new file mode 100644 index 0000000..26592e8 --- /dev/null +++ b/src/cuda/index.cuh @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include + +typedef int64_t IndexType; + +class IVFIndex { +public: + explicit IVFIndex(); + + thrust::device_vector search_batch(thrust::device_vector &queries, uint32_t k); + +private: + thrust::device_vector centroids; + + thrust::device_vector centroid_norms; + + int ndim; + + thrust::device_vector ivf_data_points; + + thrust::device_vector ivf_offsets; + + thrust::device_vector ivf_indices; + + int64_t max_ivf_list_length; + + void search_coarse_grained(thrust::device_vector &queries, + thrust::device_vector &queries_norms, + thrust::device_vector *coarse_grained_output, + uint32_t centroids_to_select); + + void search_fine_grained(thrust::device_vector &queries, thrust::device_vector &queries_norm, + thrust::device_vector &coarse_grained_output, + thrust::device_vector &output_distances, + thrust::device_vector &output_indexes, + uint32_t k); +}; \ No newline at end of file diff --git a/src/cuda/kmeans_driver.cpp b/src/cuda/kmeans_driver.cpp index 183b828..6680c4d 100644 --- a/src/cuda/kmeans_driver.cpp +++ b/src/cuda/kmeans_driver.cpp @@ -1,55 +1,87 @@ #include #include -#include -#include #include -#include +#include +#include #include "kmeans_gpu.h" +#include "cnpy.h" +#include "IVFIndex.h" -void kmeans_gpu(const float* h_data, float* h_centroids, - int* h_assignments, int N, int D, int K, int max_iter); - -// Stub only — assume memory is handled externally - -void print_shape(const std::string& name, const std::vector& shape) { - std::cout << name << " shape: ["; - for (size_t i = 0; i < shape.size(); ++i) { - std::cout << shape[i]; - if (i != shape.size() - 1) std::cout << ", "; +void save_bin(const std::string& filename, const float* data, int count) { + std::ofstream fout(filename, std::ios::binary); + if (!fout) { + std::cerr << "Failed to open " << filename << " for writing.\n"; + return; } - std::cout << "]" << std::endl; + fout.write(reinterpret_cast(data), count * sizeof(float)); + fout.close(); } -int main() { - // TODO: Load sift_base.npy into float* h_data +int main(int argc, char** argv) { std::string input_path = "data/processed/sift_base.npy"; - std::string output_path = "data/processed/ivf_centroids.npy"; + std::string out_npy_path = "data/processed/ivf_centroids.npy"; + std::string out_bin_path = "data/processed/ivf_centroids.bin"; - const int num_clusters = 1024; + const int K = 100; const int max_iter = 20; - std::cout << "Loading training vectors from: " << input_path << std::endl; + std::cout << "[INFO] Loading training vectors from: " << input_path << std::endl; cnpy::NpyArray arr = cnpy::npy_load(input_path); - float* data = arr.data(); + float* h_data = arr.data(); std::vector shape = arr.shape; + int N = shape[0]; + int D = shape[1]; + + + + std::cout << "[INFO] Dataset size: " << N << " vectors of dimension " << D << std::endl; + + IVFIndex ivf_index(K, N, D, max_iter); - size_t N = shape[0]; - size_t D = shape[1]; - print_shape("Input", shape); + // std::vector h_centroids(K * D); + // std::vector h_assignments(N); - // TODO: Randomly initialize K centroids from h_data - std::vector centroids(num_clusters * D); - std::vector assignments(N, -1); + std::cout << "[INFO] Running CUDA KMeans with K=" << K << ", max_iter=" << max_iter << "...\n"; + + // Run KMeans on GPU + ivf_index.train(h_data); - // Run CUDA KMeans implementation - kmeans_gpu(data, centroids.data(), assignments.data(), N, D, num_clusters, max_iter); + // Debug lines - should be deleted!! + std::cout << "\n[DEBUG] Cluster assignment count:\n"; + std::vector cluster_count(K, 0); + // std::vector h_assignments(N); + // int assignments_size = N * sizeof(int); + // cudaMemcpy(h_assignments, ivf_index.getAssignments(), assignments_size, cudaMemcpyDeviceToHost); + // for (int i = 0; i < N; ++i) { + // if (h_assignments[i] >= 0 && h_assignments[i] < K) + // cluster_count[h_assignments[i]]++; + // } + + // for (int k = 0; k < K; ++k) { + // std::cout << " Cluster " << k << ": " << cluster_count[k] << " vectors\n"; + // } + + std::cout << "\n[DEBUG] First 5 centroid values for each cluster:\n"; + + std::vector h_centroids(K * D); + cudaMemcpy(h_centroids.data(), ivf_index.getCentroids(), K * D * sizeof(float), cudaMemcpyDeviceToHost); + for (int k = 0; k < K; ++k) { + std::cout << " Centroid " << k << ": "; + for (int d = 0; d < std::min(5, D); ++d) { + std::cout << h_centroids[k * D + d] << " "; + } + std::cout << "...\n"; + } + // Debug lines - should be deleted!! + + std::cout << "[INFO] Saving centroids to .npy and .bin...\n"; + cnpy::npy_save(out_npy_path, h_centroids.data(), {static_cast(K), static_cast(D)}, "w"); + save_bin(out_bin_path, h_centroids.data(), K * D); - // TODO: Save centroids to ivf_centroids.npy - // Save centroids as .npy - std::cout << "Saving centroids to: " << output_path << std::endl; - cnpy::npy_save(output_path, ¢roids[0], {num_clusters, D}, "w"); + std::cout << "KMeans complete. Saved to:\n"; + std::cout << " - " << out_npy_path << "\n"; + std::cout << " - " << out_bin_path << "\n"; - std::cout << "KMeans complete. Centroids saved." << std::endl; return 0; } diff --git a/src/cuda/kmeans_driver.cu b/src/cuda/kmeans_driver.cu new file mode 100644 index 0000000..9d00e59 --- /dev/null +++ b/src/cuda/kmeans_driver.cu @@ -0,0 +1,118 @@ +#include "kmeans_gpu.h" +#include "IVFIndex.h" +#include +#include +#include +#include +#include +#include "cnpy.h" +#include + +#include // <-- For timing +using namespace std::chrono; + + + +void save_bin(const std::string& filename, const float* data, int count) { + std::ofstream fout(filename, std::ios::binary); + if (!fout) { + std::cerr << "Failed to open " << filename << " for writing.\n"; + return; + } + fout.write(reinterpret_cast(data), count * sizeof(float)); + fout.close(); +} + +int main(int argc, char** argv) { + std::string input_path = "data/processed/sift_base.npy"; + std::string out_npy_path = "data/processed/ivf_centroids.npy"; + std::string out_bin_path = "data/processed/ivf_centroids.bin"; + + const int K = 100; + const int max_iter = 20; + auto load_start = high_resolution_clock::now(); + + std::cout << "[INFO] Loading training vectors from: " << input_path << std::endl; + cnpy::NpyArray arr = cnpy::npy_load(input_path); + float* h_data = arr.data(); + std::vector shape = arr.shape; + int N = shape[0]; + int D = shape[1]; + + auto load_end = high_resolution_clock::now(); + std::cout << "[TIME] Data loading time: " + << duration_cast(load_end - load_start).count() + << " ms\n"; + + std::cout << "[INFO] Dataset size: " << N << " vectors of dimension " << D << std::endl; + + // Assuming h_data is already loaded, and N, D are known + int num_to_print = 5; // Change this to print more/less vectors + for (int i = 0; i < std::min(num_to_print, N); ++i) { + std::cout << "Vector " << i << ": "; + for (int j = 0; j < D; ++j) { + std::cout << std::fixed << std::setprecision(4) << h_data[i * D + j] << " "; + } + std::cout << std::endl; + } + + IVFIndex ivf_index(K, N, D, max_iter); + + // std::vector h_centroids(K * D); + // std::vector h_assignments(N); + + std::cout << "[INFO] Running CUDA KMeans with K=" << K << ", max_iter=" << max_iter << "...\n"; + + // Run KMeans on GPU + auto train_start = high_resolution_clock::now(); + ivf_index.train(h_data); + auto train_end = high_resolution_clock::now(); + std::cout << "[TIME] KMeans training time: " << duration_cast(train_end - train_start).count() << " ms\n"; + // Debug lines - should be deleted!! + // std::cout << "\n[DEBUG] Cluster assignment count:\n"; + // std::vector cluster_count(K, 0); + // std::vector h_assignments(N); + // int assignments_size = N * sizeof(int); + // cudaMemcpy(h_assignments, ivf_index.getAssignments(), assignments_size, cudaMemcpyDeviceToHost); + // for (int i = 0; i < N; ++i) { + // if (h_assignments[i] >= 0 && h_assignments[i] < K) + // cluster_count[h_assignments[i]]++; + // } + + // for (int k = 0; k < K; ++k) { + // std::cout << " Cluster " << k << ": " << cluster_count[k] << " vectors\n"; + // } + + std::cout << "\n[DEBUG] First 5 centroid values for each cluster:\n"; + + std::vector h_centroids(K * D); + cudaMemcpy(h_centroids.data(), ivf_index.getCentroids(), K * D * sizeof(float), cudaMemcpyDeviceToHost); + for (int k = 0; k < K; ++k) { + std::cout << " Centroid " << k << ": "; + for (int d = 0; d < std::min(5, D); ++d) { + std::cout << h_centroids[k * D + d] << " "; + } + std::cout << "...\n"; + } + std::vector h_all_vectors_flat(N * D); + cudaMemcpy(h_all_vectors_flat.data(), ivf_index.getallvectorsflat(), N * D * sizeof(float), cudaMemcpyDeviceToHost); + std::vector h_offsets(K+1); + cudaMemcpy(h_offsets.data(), ivf_index.getClusterOffsets(), (K+1) * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < K; ++i) + { + int num_points = h_offsets[i+1] - h_offsets[i]; + std::cout << "Cluster " << i << " has " << num_points << " points.\n"; + } + + + std::cout << "[INFO] Saving centroids to .npy and .bin...\n"; + cnpy::npy_save(out_npy_path, h_centroids.data(), {static_cast(K), static_cast(D)}, "w"); + save_bin(out_bin_path, h_centroids.data(), K * D); + + std::cout << "KMeans complete. Saved to:\n"; + std::cout << " - " << out_npy_path << "\n"; + std::cout << " - " << out_bin_path << "\n"; + + return 0; +} + diff --git a/src/cuda/kmeans_gpu.cu b/src/cuda/kmeans_gpu.cu index 8ba6d0f..c009ad2 100644 --- a/src/cuda/kmeans_gpu.cu +++ b/src/cuda/kmeans_gpu.cu @@ -1,45 +1,195 @@ -#include -#include +#include "kmeans_gpu.h" +#include #include -#include +#include +#include +#include -__global__ void assign_clusters_kernel(const float* data, const float* centroids, - int* assignments, int N, int D, int K) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - const float* point = &data[i * D]; - float best_dist = FLT_MAX; - int best_k = -1; +// Kernel for calculating L2 distance between vectors +__global__ void calculateDistances(const float *data, const float *centroids, + float *distances, int num_vectors, int num_centroids, + int dim) +{ + int vector_idx = blockIdx.x * blockDim.x + threadIdx.x; + int centroid_idx = blockIdx.y * blockDim.y + threadIdx.y; - for (int k = 0; k < K; ++k) { + if (vector_idx < num_vectors && centroid_idx < num_centroids) + { float dist = 0.0f; - const float* centroid = ¢roids[k * D]; - for (int d = 0; d < D; ++d) { - float diff = point[d] - centroid[d]; + for (int d = 0; d < dim; d++) + { + float diff = data[vector_idx * dim + d] - centroids[centroid_idx * dim + d]; dist += diff * diff; } + distances[vector_idx * num_centroids + centroid_idx] = dist; + } +} + +__global__ void normalize_centroids_kernel( + float* centroids, // [K × D] centroid sums + const int* counts, // [K] number of points in each cluster + int K, int D) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= K * D) return; + + int cluster_id = idx / D; + int dim_id = idx % D; + + int count = max(1, counts[cluster_id]); // avoid divide-by-zero + + centroids[cluster_id * D + dim_id] /= count; +} + +// Assign each point to nearest centroid +__global__ void assign_clusters_kernel(const float *distances, + int *assignments, int N, int K) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) return; + + float best_dist = FLT_MAX; + int best_k = -1; + + for (int k = 0; k < K; ++k) { + float dist = distances[idx * K + k]; if (dist < best_dist) { best_dist = dist; best_k = k; } } - - assignments[i] = best_k; + assignments[idx] = best_k; } -__global__ void update_centroids_kernel(const float* data, const int* assignments, - float* new_centroids, int* counts, - int N, int D, int K) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; +// Update centroids by summing points per cluster +__global__ void update_centroids_kernel(const float *data, const int *assignments, + float *new_centroids, int *counts, + int N, int D, int K) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) return; - int cluster = assignments[i]; + int cluster = assignments[idx]; for (int d = 0; d < D; ++d) { - atomicAdd(&new_centroids[cluster * D + d], data[i * D + d]); + atomicAdd(&new_centroids[cluster * D + d], data[idx * D + d]); } atomicAdd(&counts[cluster], 1); } -// Optional: normalize new centroids by dividing by counts on host +__global__ void build_inverted_lists_kernel( + const float* data, + const int* assignments, + float** inverted_lists, + int* list_sizes, + int N, int D) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) return; + + int cluster_id = assignments[idx]; + + // Atomically get a spot in the centroid's list + int pos = atomicAdd(&list_sizes[cluster_id], 1); + + float* list = inverted_lists[cluster_id]; + + // Copy the vector + for (int d = 0; d < D; ++d) { + list[pos * D + d] = data[idx * D + d]; + } +} + +__global__ void create_all_vectors_flat_kernel( + const float* data, // [num_vectors x dim] + const int* assignments, // [num_vectors] + const int* cluster_offsets, // [num_centroids] + int* cluster_sizes, // [num_centroids] + float* all_vectors_flat, // [num_vectors x dim] + int num_vectors, + int dim) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_vectors) return; + + int cluster_id = assignments[idx]; + + // Atomically reserve a spot within this cluster + int pos_in_cluster = atomicAdd(&cluster_sizes[cluster_id], 1); + + int flat_index = (cluster_offsets[cluster_id] + pos_in_cluster) * dim; + + // Copy the vector + for (int d = 0; d < dim; ++d) { + all_vectors_flat[flat_index + d] = data[idx * dim + d]; + } +} + +void kmeans_gpu(const float *h_data, float *h_centroids, int *h_assignments, + int N, int D, int K, int max_iter) +{ + // // Allocate device memory + // float *d_data, *d_centroids, *d_new_centroids; + // int *d_assignments, *d_counts; + + // size_t data_size = N * D * sizeof(float); + // size_t centroids_size = K * D * sizeof(float); + // size_t assignments_size = N * sizeof(int); + // size_t counts_size = K * sizeof(int); + + // cudaMalloc(&d_data, data_size); + // cudaMalloc(&d_centroids, centroids_size); + // cudaMalloc(&d_new_centroids, centroids_size); + // cudaMalloc(&d_assignments, assignments_size); + // cudaMalloc(&d_counts, counts_size); + // cudaMemcpy(d_data, h_data, data_size, cudaMemcpyHostToDevice); + + // // Initialize centroids by copying first K vectors from data + // cudaMemcpy(d_centroids, d_data, centroids_size, cudaMemcpyDeviceToDevice); + + // int threads = 256; + // int blocks = (N + threads - 1) / threads; + + // for (int iter = 0; iter < max_iter; ++iter) + // { + // // Step 1: Assign + // assign_clusters_kernel<<>>(d_data, d_centroids, d_assignments, N, D, K); + // cudaDeviceSynchronize(); + + // // Step 2: Reset new centroids + counts + // cudaMemset(d_new_centroids, 0, centroids_size); + // cudaMemset(d_counts, 0, counts_size); + + // // Step 3: Update + // update_centroids_kernel<<>>(d_data, d_assignments, d_new_centroids, d_counts, N, D, K); + // cudaDeviceSynchronize(); + + // // Step 4: Average + // std::vector h_new_centroids(K * D); + // std::vector h_counts(K); + // cudaMemcpy(h_new_centroids.data(), d_new_centroids, centroids_size, cudaMemcpyDeviceToHost); + // cudaMemcpy(h_counts.data(), d_counts, counts_size, cudaMemcpyDeviceToHost); + + // for (int k = 0; k < K; ++k) + // { + // int count = max(h_counts[k], 1); // Avoid division by zero + // for (int d = 0; d < D; ++d) + // { + // h_centroids[k * D + d] = h_new_centroids[k * D + d] / count; + // } + // } + + // // Copy updated centroids back to device + // cudaMemcpy(d_centroids, h_centroids, centroids_size, cudaMemcpyHostToDevice); + // } + + // // Copy final assignments if needed + // cudaMemcpy(h_assignments, d_assignments, assignments_size, cudaMemcpyDeviceToHost); + + // cudaFree(d_data); + // cudaFree(d_centroids); + // cudaFree(d_new_centroids); + // cudaFree(d_assignments); + // cudaFree(d_counts); +} diff --git a/src/cuda/loader_usage_example.cpp b/src/cuda/loader_usage_example.cpp new file mode 100644 index 0000000..51fa650 --- /dev/null +++ b/src/cuda/loader_usage_example.cpp @@ -0,0 +1,14 @@ +#include "load_bin_to_gpu.h" + +int main() { + int N, D; + float* d_centroids = load_bin_to_gpu("data/processed/ivf_centroids.bin", 128, N, D); + std::cout << "Loaded " << N << " centroids of dimension " << D << std::endl; + + // Now you can use `d_centroids` in your kernels + // Remember to free it later + cudaFree(d_centroids); + + return 0; +} + diff --git a/src/cuda/select.cu b/src/cuda/select.cu new file mode 100644 index 0000000..4d7494a --- /dev/null +++ b/src/cuda/select.cu @@ -0,0 +1,121 @@ +#include "select.cuh" +#include "heap.cuh" + + +/** + * CUDA fused kernel that implements distance calculation and fast k-selection on the GPU. + * It uses the GPU heap implementation to store the nearest neighbor candidates in register arrays. + * Each warp is dedicated to k-selection over a set of candidates + */ +template +__global__ void select_min_kernel(float *product_distances, + float *centroid_norms, int64_t *output_indexes, + int nqueries, int ncentroids, uint64_t centroid_start_offset, uint32_t k) { + uint32_t query_id = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; + if (query_id > nqueries) return; + + GPUHeap heap; + + for (int i=threadIdx.x/WARP_SIZE; i &product_distances, thrust::device_vector ¢roids, + thrust::device_vector ¢roid_norms, int ndim, int nqueries, thrust::device_vector &output_indexes, + uint32_t k) { + uint32_t centroid_tile_size = 128; + uint32_t query_tile_size = 128; + uint32_t ncentroids = centroids.size()/ndim; + + #define CALL_KERNEL(WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH) \ + select_min_kernel<<>>(product_distances_slice, \ + centroid_norms_slice, \ + output_indexes_slice, \ + nqueries, ncentroids, j, k); + + for (unsigned int i=0; i +__global__ void select_fine_grained_kernel(float* input_distances, + int64_t* input_indices, int64_t* output_indices, int64_t *offsets, + float *output_distances, uint32_t k, uint32_t ndim, uint32_t nqueries) { + uint32_t query_id = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; + if (query_id > nqueries) return; + + GPUHeap heap; + int64_t start_offset = offsets[query_id]; + int64_t end_offset = offsets[query_id]; + int64_t num_data_points = end_offset - start_offset; + + for (int64_t i=threadIdx.x/WARP_SIZE; i &input_distances, + thrust::device_vector &data_indexes, thrust::device_vector &offsets, + thrust::device_vector &output_distances, thrust::device_vector &output_indexes, + uint32_t ndim, uint32_t nqueries, uint32_t k) { + #define CALL_KERNEL(WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH) \ + select_fine_grained_kernel<<>>(input_distances_ptr, \ + data_indexes_ptr, output_indexes_ptr, offsets_ptr, \ + output_distances_ptr, k, ndim, nqueries); + + // TODO(): Tiling on queries + int64_t *output_indexes_ptr = thrust::raw_pointer_cast(output_indexes.data()); + int64_t *data_indexes_ptr = thrust::raw_pointer_cast(data_indexes.data()); + float *input_distances_ptr = thrust::raw_pointer_cast(input_distances.data()); + float *output_distances_ptr = thrust::raw_pointer_cast(output_distances.data()); + int64_t *offsets_ptr = thrust::raw_pointer_cast(offsets.data()); + switch(k) { + // TODO(): Look into this!!! + dim3 grid = (1); + dim3 blocks = (32); + if (k <= 32) { + CALL_KERNEL(32, 2) + } else if (k <= 64) { + CALL_KERNEL(64, 3) + } else if (k <= 128) { + CALL_KERNEL(128, 3) + } else if (k <= 256) { + CALL_KERNEL(256, 4) + } else if (k <= 512) { + CALL_KERNEL(512, 4) + } else { + CALL_KERNEL(1024, 4) + } + } + #undef CALL_KERNEL +} \ No newline at end of file diff --git a/src/cuda/select.cuh b/src/cuda/select.cuh new file mode 100644 index 0000000..ed83598 --- /dev/null +++ b/src/cuda/select.cuh @@ -0,0 +1,14 @@ +#include +#include + +__host__ void select_min(thrust::device_vector &product_distances, + thrust::device_vector ¢roids, + thrust::device_vector ¢roid_norms, int ndim, int nqueries, + thrust::device_vector &output_indexes, + uint32_t k); + + +__host__ void select_fine_grained(thrust::device_vector &input_distances, + thrust::device_vector &data_indexes, thrust::device_vector &offsets, + thrust::device_vector &output_distances, thrust::device_vector &output_indexes, + uint32_t ndim, uint32_t nqueries, uint32_t k); \ No newline at end of file diff --git a/src/cuda/sort.cuh b/src/cuda/sort.cuh new file mode 100644 index 0000000..b31617d --- /dev/null +++ b/src/cuda/sort.cuh @@ -0,0 +1,238 @@ +#pragma once + +#include +#include + +constexpr __host__ __device__ bool is_power_of_2(uint32_t v) { + return (v && !(v & (v - 1))); +} + + +constexpr __host__ __device__ int log2(int n, int p = 0) { + return (n <= 1) ? p : log2(n / 2, p + 1); +} + +constexpr __host__ __device__ bool is_power_of_2(int v) { + return (v && !(v & (v - 1))); +} + +// Calculate largest power of 2 less than n +constexpr __host__ __device__ int32_t largest_power_of_2_lt_n(int n) { + return (is_power_of_2(n))? n/2 : (1 << (uint32_t)(log2(n))); +} + +template +__device__ void device_swap(T &v1, T &v2) { + T tmp = v1; + v1 = v2; + v2 = tmp; +} + +/** + * This corresponds to the MERGE-ODD-CONTINUE from the FAISS paper. + * Template params: + * - L -> length of the array + * - P -> Left (true) /right (false) + * + * TODO(): Optimizations for small-sized arrays + */ +template +void __device__ merge_odd_continue(uint64_t *keys, float *values) { + constexpr uint32_t H = largest_power_of_2_lt_n(L); + if (L == 1) return; + + if constexpr (L > 1) { + #pragma unroll + for (int i=0; i v2) { + device_swap(v1, v2); + device_swap(k1, k2); + } + } + + if (P) { + merge_odd_continue (keys, values); + + float new_values[H]; + uint64_t new_keys[H]; + + #pragma unroll + for (int i=0; i (new_keys, new_values); + + #pragma unroll + for (int i=0; i (keys, values); + + float new_values[L-H]; + uint64_t new_keys[L-H]; + + #pragma unroll + for (int i=0; i (new_keys, new_values); + + #pragma unroll + for (int i=0; i +void __device__ merge_odd(float values_left[L], float values_right[L], uint64_t keys_left[R], uint64_t keys_right[R]) { + #pragma unroll + for (int i=0; i other_v2) { + k1 = other_k2; + v1 = other_v2; + } + + if (other_v1 > v2) { + k2 = other_k1; + v2 = other_v1; + } + } + __syncwarp(); + + merge_odd_continue(keys_left, values_left); + merge_odd_continue(keys_right, values_right); +} + +__device__ __forceinline__ int get_lane_id() { + int lane_id; + asm("mov.u32 %0, %%laneid;" : "=r"(lane_id)); + return lane_id; +} + + +/** + * Merge WARP_SIZE/L sorted arrays of length L to form WARP_SIZE/2L sorted arrays of length 2L + */ +template +__device__ void sort_small_array(float &value, uint64_t &key) { + static_assert(is_power_of_2(L), "L must be power of 2"); + + // This is the MERGE-ODD part for small arrays + int lane_id = get_lane_id(); + float other_value = __shfl_xor_sync(0xFFFFFFFF, value, L); + uint64_t other_key = __shfl_xor_sync(0xFFFFFFFF, key, L); + bool smaller_lane = lane_id < (lane_id ^ L); + bool should_swap = smaller_lane ^ (value > other_value); + if (should_swap) { + device_swap<>(value, other_value); + device_swap<>(key, other_key); + } + __syncwarp(); + + // This corresponds to the MERGE-ODD-CONTINUE part + int lane_group_start = L & lane_id; + int lane_group_end = lane_group_start + L; + for (int stride=L/2; stride>0; stride/=2) { + smaller_lane = lane_id < (lane_id ^ stride); + other_value = __shfl_xor_sync(0xFFFFFFFF, value, stride); + other_key = __shfl_xor_sync(0xFFFFFFFF, key, stride); + + bool should_swap = smaller_lane ^ (value > other_value); + if (should_swap) { + device_swap<>(value, other_value); + device_swap<>(key, other_key); + } + __syncwarp(); + } +} + + +// Recursively sort a lane-strided array +template +__device__ void sort_odd(float values[N], uint64_t keys[N]) { + static_assert(N >= 2, "Use template specialization for N=1"); + constexpr uint32_t size_a = N/2; + constexpr uint32_t size_b = N - size_a; + + float values_a[size_a]; + uint64_t keys_a[size_a]; + + // Sort the left half + #pragma unroll + for (int i=0; i 0) { + sort_odd(values_a, keys_a); + } + + // Sort the right half + float values_b[size_b]; + uint64_t keys_b[size_b]; + + #pragma unroll + for (int i=N/2; i 1) { + sort_odd(values_b, keys_b); + } + + // Merge the two halves + merge_odd(values_a, values_b, keys_a, keys_b); + + #pragma unroll + for (int i=0; i +__device__ void sort_odd<1>(float values[1], uint64_t keys[1]) { + sort_small_array<1>(values[0], keys[0]); + sort_small_array<2>(values[0], keys[0]); + sort_small_array<4>(values[0], keys[0]); + sort_small_array<8>(values[0], keys[0]); + sort_small_array<16>(values[0], keys[0]); +} \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..4cdae2b --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,12 @@ +enable_testing() + +find_package(CUDA REQUIRED) + +add_executable(tests test_heap.cu) +target_link_libraries(tests cuda_vector_db) + +target_compile_options(tests PRIVATE + $<$:-Xptxas -O3> # Pass -O3 optimization to ptxas +) + +target_compile_features(cuda_vector_db PUBLIC cxx_std_17) \ No newline at end of file diff --git a/tests/test_heap.cu b/tests/test_heap.cu new file mode 100644 index 0000000..07b33aa --- /dev/null +++ b/tests/test_heap.cu @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include + +#include "../src/cuda/heap.cuh" + +constexpr int WARP_QUEUE_SIZE = 32; +constexpr int THREAD_QUEUE_SIZE = 8; + +__global__ void test_gpu_heap_add(const int* input, int size, int64_t* out_keys) { + GPUHeap heap; + + // Add elements from the input array to the heap + for (int i = threadIdx.x; i < size; i += blockDim.x) { + heap.add(i, static_cast(input[i])); + } + + // Dump the results to shared memory + __shared__ int64_t shared_out_keys[WARP_QUEUE_SIZE]; + heap.dump(shared_out_keys, WARP_QUEUE_SIZE); + + // Copy results to global memory + if (threadIdx.x == 0) { + for (int i = 0; i < WARP_QUEUE_SIZE; i++) { + out_keys[i] = shared_out_keys[i]; + } + } +} + +int main() { + // Generate random input array of size 1024 + constexpr int INPUT_SIZE = 1024; + std::vector input(INPUT_SIZE); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(-1000000, 1000000); + + for (int& val : input) { + val = dis(gen); + } + + int size = input.size(); + + // Allocate device memory for input and output + int* d_input; + int64_t* d_out_keys; + cudaMalloc(&d_input, size * sizeof(int)); + cudaMalloc(&d_out_keys, WARP_QUEUE_SIZE * sizeof(int64_t)); + + // Copy input data to device + cudaMemcpy(d_input, input.data(), size * sizeof(int), cudaMemcpyHostToDevice); + + // Launch the kernel + test_gpu_heap_add<<<1, 32>>>(d_input, size, d_out_keys); + cudaDeviceSynchronize(); + + // Check for CUDA errors + cudaError_t err = cudaGetLastError(); + assert(err == cudaSuccess); + + // Copy results back to host + std::vector gpu_heap(WARP_QUEUE_SIZE); + cudaMemcpy(gpu_heap.data(), d_out_keys, WARP_QUEUE_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost); + + // CPU min-heap + std::priority_queue, std::vector>, std::greater>> cpu_heap; + int i=0; + for (int val : input) { + cpu_heap.push(std::make_pair<>(val, i)); + if (cpu_heap.size() > WARP_QUEUE_SIZE) { + cpu_heap.pop(); + } + } + + // Extract CPU heap results + std::vector cpu_heap_results; + while (!cpu_heap.empty()) { + cpu_heap_results.push_back(cpu_heap.top().second); + cpu_heap.pop(); + } + + // Compare results + assert(gpu_heap.size() == cpu_heap_results.size()); + for (size_t i = 0; i < cpu_heap_results.size(); ++i) { + assert(gpu_heap[i] == cpu_heap_results[i]); + } + + // Free device memory + cudaFree(d_input); + cudaFree(d_out_keys); +}