diff --git a/.gitignore b/.gitignore
index d877ae7..8d9b4c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 # Ignore raw datasets
 data/raw/
+.vscode/
+build/
+
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..79ad8be
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,65 @@
+{
+    "files.associations": {
+        "random": "cpp",
+        "array": "cpp",
+        "atomic": "cpp",
+        "bit": "cpp",
+        "*.tcc": "cpp",
+        "bitset": "cpp",
+        "cctype": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "complex": "cpp",
+        "concepts": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "map": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "regex": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "fstream": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "typeinfo": "cpp",
+        "unordered_set": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "ratio": "cpp",
+        "format": "cpp",
+        "span": "cpp",
+        "variant": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..b066ab0
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 3.8)
+project(CUDA_VectorDB LANGUAGES CXX CUDA)
+set(CMAKE_CXX_STANDARD 17)
+
+add_subdirectory(src/cuda)
+add_subdirectory(tests)
+add_dependencies(tests cuda_vector_db)
\ No newline at end of file
diff --git a/README.md b/README.md
index 21e9ecd..19bb1a9 100644
--- a/README.md
+++ b/README.md
@@ -3,21 +3,15 @@ Vector database search optimized using cuda
 
 ---
 
-## ✅ Implemented So Far
-
-- ✅ SIFT1M dataset preprocessing (`.fvecs → .npy`)
-- ✅ Optional vector normalization
-- 🔧 Custom CUDA-based KMeans:
-  - `kmeans_gpu.cu`: core CUDA kernels for assignment and centroid update
-  - `kmeans_driver.cpp`: runs clustering loop and saves centroids
-- ⏳ Python IVF index builder (`build_ivf_index.py`) using precomputed centroids
-- ⏳ Scripts for converting data, checking shape, and benchmarking CPU vs GPU KMeans
+## This branch is dedicated to implementation testing using the SIFTSMALL dataset.
 
 ---
 
-## Dataset: SIFT1M
+## Dataset: SIFTSMALL
+
+You can find the original compressed dataset file in `data/raw`. The dataset has already been preprocessed, and the resulting  `.npy` files are available in `data/processed`. For human-readable inspection, you can use `parse.py` to convert   `.npy` file to `.txt`. 
 
-Due to GitHub file size limits, we do not include the full dataset in this repo.
+If you'd like to re-run the preprocessing yourself, feel free to follow the steps below. Be sure to adjust any file paths or names if needed (eg. sift1m vs. siftsmall).
 
 To download:
 ```bash
@@ -36,6 +30,84 @@ python3 scripts/normalize_vectors.py
 
 ## IVF Indexing (Inverted File Index)
 ### Run custom GPU KMeans and save IVF centroids:
+We run KMeans clustering on the base vectors to compute `K` coarse centroids. These centroids serve as the anchors for grouping similar vectors in the inverted index.
+
+Compilation command:
+Notice that the test environment was based on Euler, with `nvidia/cuda/11.8.0` module loaded. And we choose `K=100` for a small dataset like siwfsmall.
+```bash
+nvcc -std=c++17 -O2  -Xcompiler -Wall -Xptxas -O3 -Iinclude -Iinclude/cnpy -o bin/kmeans_driver  src/cuda/kmeans_driver.cpp  src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcuda -lcudart -lz
+```
+Run the job:
+Here is an example sbatch script `test_kmeans_siftsmall.sh `. Please adjust it yourself if needed.
+```bash
+#!/usr/bin/env bash
+#SBATCH -p instruction
+#SBATCH --job-name=kmeans-siftsmall
+#SBATCH --output=kmeans-siftsmall.out
+#SBATCH --error=kmeans-siftsmall.err
+#SBATCH --ntasks=1
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:05:00
+
+echo "[INFO] Job started on $(hostname) at $(date)"
+echo "[INFO] Working directory: $(pwd)"
+echo "[INFO] Running binary..."
+
+# Check if binary exists
+if [[ ! -x ./bin/kmeans_driver ]]; then
+  echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
+  exit 1
+fi
+
+# Run your program
+./bin/kmeans_driver
+
+echo "[INFO] Job finished at $(date)"
+```
+Run the job:
+Here is an example sbatch script `test_kmeans_siftsmall.sh `. Please adjust it yourself if needed.
+```bash
+#!/usr/bin/env bash
+#SBATCH -p instruction
+#SBATCH --job-name=kmeans-siftsmall
+#SBATCH --output=kmeans-siftsmall.out
+#SBATCH --error=kmeans-siftsmall.err
+#SBATCH --ntasks=1
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:05:00
+
+echo "[INFO] Job started on $(hostname) at $(date)"
+echo "[INFO] Working directory: $(pwd)"
+echo "[INFO] Running binary..."
+
+# Check if binary exists
+if [[ ! -x ./bin/kmeans_driver ]]; then
+  echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
+  exit 1
+fi
+
+# Run your program
+./bin/kmeans_driver
+
+echo "[INFO] Job finished at $(date)"
+```
+After running `kmeans_driver`, `ivf_centroids.npy` and `ivf_centroids.bin` are generated and stored in `data/processed`. 
+
+### Inverted File (IVF) Index Construction
+Use the trained centroids to assign each base vector to its closest cluster, producing the inverted index (ie. `ivf_list_ids.npy` and `ivf_offsets.npy`.
+```bash
+python3 scripts/build_ivf_index.py
+```
+To inspect the resulting IVF and debug, you can use `scripts/debug_ivf_index.py`. This script helps you verify the IVF index built from KMeans centroids.
+```bash
+python3 scripts/debug_ivf_index.py
+```
+
+### Note from Rui
+All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison.
+
+### Note from Rui
+All output files from my test trial are saved in the `kmeans_output/` directory for your reference and comparison.
 
 ## Project Structure
 ```
@@ -44,22 +116,42 @@ repo/
 ├── .gitignore
 ├── data/
 │   ├── raw/                      # Original downloaded datasets (e.g., SIFT1M)
+│   │   └── siftsmall.tar.gz      # Compressed test dataset
+│   │   └── siftsmall.tar.gz      # Compressed test dataset
 │   ├── processed/                # .npy vectors, normalized data, IVF index files
-│   └── scripts/                  # (Future) MS MARCO embedding scripts
+│   │   ├── siftsmall_base.npy
+│   │   ├── siftsmall_base.npy
+│   │   ├── ivf_centroids.npy       # Trained KMeans centroids
+│   │   ├── ivf_centroids.bin       # Binary version of centroids for GPU use
+│   │   ├── ivf_list_ids.npy        # Flattened list of vector indices by cluster
+│   │   ├── ivf_offsets.npy         # Offsets into list_ids for each cluster
+│   │   ├── parse.py                # Helper script that converts .npy to .txt
+│   │   └── siftsmall_query.npy     # (Optional) query set
+│   │   └── siftsmall_query.npy     # (Optional) query set
 ├── src/
 │   ├── cuda/
 │   │   ├── kmeans_gpu.cu         # Custom CUDA KMeans kernel code
 │   │   ├── kmeans_driver.cpp     # Host-side driver to run GPU KMeans
+│   │   ├── IVFIndex.h     # Index class
 │   ├── cpu_baseline/
 │   │   └── hnswlib_baseline.cpp  # (TODO) CPU-only ANN baseline using HNSWlib
 │   └── main.cpp                  # CLI entrypoint (planned)
 ├── include/
 │   └── kmeans_gpu.h              # CUDA kernel function declarations
+│   └── cnpy/                     # Embedded C++ library for reading/writing .npy
+│       ├── cnpy.h
+│       └── cnpy.cpp
+│   └── cnpy/                     # Embedded C++ library for reading/writing .npy
+│       ├── cnpy.h
+│       └── cnpy.cpp
 ├── scripts/
 │   ├── convert_fvecs.py          # Convert SIFT1M .fvecs → .npy
 │   ├── normalize_vectors.py      # Optional: normalize vectors
 │   ├── build_ivf_index.py        # Build inverted lists using IVF centroids
+│   ├── debug_ivf_index.py        # Inspect and validate IVF inverted lists
+│   ├── debug_ivf_index.py        # Inspect and validate IVF inverted lists
 │   ├── benchmark_kmeans.py       # Compare sklearn/FAISS vs your CUDA KMeans
 │   └── check_sift_stats.py       # Inspect shape/dtype of parsed vectors
-├── tests/                        # planned
+├── kmeans_output/                # Output files for comparison
+├── kmeans_output/                # Output files for comparison
 ```
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..aa303c8
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env zsh
+#SBATCH --job-name=Build_vector_db
+#SBATCH --partition=instruction
+#SBATCH --time 00:05:00
+#SBATCH --ntasks=1
+#SBATCH --gpus-per-task=1
+#SBATCH --output=Build_vector_db.out
+
+module load nvidia/cuda/11.8.0
+
+mkdir -p build
+cd build
+cmake ..
+cmake --build .
+cmake --install .
\ No newline at end of file
diff --git a/createIndex.sh b/createIndex.sh
new file mode 100644
index 0000000..d719b6f
--- /dev/null
+++ b/createIndex.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env zsh
+#SBATCH -p instruction
+#SBATCH --job-name=kmeans-siftsmall
+#SBATCH --output=kmeans-siftsmall.out
+#SBATCH --error=kmeans-siftsmall.err
+#SBATCH --ntasks=1
+#SBATCH --gres=gpu:1
+
+echo "[INFO] Job started on $(hostname) at $(date)"
+echo "[INFO] Working directory: $(pwd)"
+echo "[INFO] Running binary..."
+module load nvidia/cuda/11.8.0
+mkdir -p bin
+nvcc -std=c++17 -O3 -Xcompiler -Wall -Iinclude -Iinclude/cnpy -o bin/kmeans_driver src/cuda/kmeans_driver.cu src/cuda/IVFIndex.cu src/cuda/kmeans_gpu.cu include/cnpy/cnpy.cpp -lcudart -lz
+
+# Check if binary exists
+if [[ ! -x ./bin/kmeans_driver ]]; then
+  echo "[ERROR] ./bin/kmeans_driver not found or not executable!"
+  exit 1
+fi
+
+# Run your program
+./bin/kmeans_driver
+
+echo "[INFO] Job finished at $(date)"
\ No newline at end of file
diff --git a/data/processed/.DS_Store b/data/processed/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/data/processed/.DS_Store differ
diff --git a/data/processed/parse.py b/data/processed/parse.py
new file mode 100644
index 0000000..9afc43f
--- /dev/null
+++ b/data/processed/parse.py
@@ -0,0 +1,36 @@
+import numpy as np
+import argparse
+
+def load_and_export_sift_base(input_path, output_path, max_rows=20, precision=5):
+    print(f"[INFO] Loading: {input_path}")
+    data = np.load(input_path)
+
+    print(f"[INFO] Shape: {data.shape}")  # e.g., (1000000, 128)
+
+    print(f"[INFO] Writing first {max_rows} vectors to: {output_path}")
+    np.savetxt(
+        output_path,
+        data[:max_rows],
+        fmt=f"%.{precision}f",
+        delimiter=" ",
+        header=f"First {max_rows} vectors from {input_path} (dim={data.shape[1]})",
+        comments=''
+    )
+
+    print("Done!")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input", default="ivf_list_ids.npy", help="Path to .npy file"
+    )
+    parser.add_argument(
+        "--output", default="ivf_list_ids.txt", help="Path to output .txt file"
+    )
+    parser.add_argument(
+        "--rows", type=int, default=20, help="Number of rows to export"
+    )
+    args = parser.parse_args()
+
+    load_and_export_sift_base(args.input, args.output, max_rows=args.rows)
+
diff --git a/data/raw/siftsmall.tar.gz b/data/raw/siftsmall.tar.gz
new file mode 100644
index 0000000..68f3da0
Binary files /dev/null and b/data/raw/siftsmall.tar.gz differ
diff --git a/include/IVFIndex.h b/include/IVFIndex.h
new file mode 100644
index 0000000..9c02190
--- /dev/null
+++ b/include/IVFIndex.h
@@ -0,0 +1,58 @@
+#ifndef IVFINDEX_H
+#define IVFINDEX_H
+
+#include <cuda_runtime.h>
+#include <string>
+#include <vector>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+typedef int64_t IndexType;
+
+class IVFIndex {
+private:
+    int num_centroids;
+    int num_vectors;
+    int dim;
+    int iterations;
+    float* centroids;
+    float* data;
+    int* assignments;
+    float* distances;
+    int* counts;
+    float** d_inverted_lists;
+    int* d_list_sizes;
+    int* d_cluster_offsets;
+    float* d_all_vectors_flat;
+
+    void checkCudaError(const std::string& kernelName);
+
+public:
+    IVFIndex(int num_centroids, int num_vectors, int dim, int iterations);
+    ~IVFIndex();
+
+    float* getData() const;
+    float* getallvectorsflat() const;
+    float** getInvertedLists() const;
+    int* getClusterOffsets() const;
+    float* getCentroids() const;
+    int* getAssignments() const;
+    int getNumCentroids() const;
+    int getDim() const;
+    int getNumVectors() const;
+
+    void train(const float* train_data);
+    void search_coarse_grained(thrust::device_vector<float> &queries, 
+        thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> *coarse_grained_output,
+        uint32_t centroids_to_select);
+
+    void search_fine_grained(thrust::device_vector<float> &queries, thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> &coarse_grained_output,
+        thrust::device_vector<int64_t> *fine_grained_output,
+        uint32_t k);
+
+    thrust::device_vector<int64_t> search_batch(thrust::device_vector<float> &queries, uint32_t k);
+};
+
+#endif // IVFINDEX_H
\ No newline at end of file
diff --git a/include/cnpy/CMakeLists.txt b/include/cnpy/CMakeLists.txt
new file mode 100644
index 0000000..9eb550f
--- /dev/null
+++ b/include/cnpy/CMakeLists.txt
@@ -0,0 +1,30 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 3.0 FATAL_ERROR)
+if(COMMAND cmake_policy)
+	cmake_policy(SET CMP0003 NEW)
+endif(COMMAND cmake_policy)
+
+project(CNPY)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+option(ENABLE_STATIC "Build static (.a) library" ON)
+
+find_package(ZLIB REQUIRED)
+
+include_directories(${ZLIB_INCLUDE_DIRS})
+
+add_library(cnpy SHARED "cnpy.cpp")
+target_link_libraries(cnpy ${ZLIB_LIBRARIES})
+install(TARGETS "cnpy" LIBRARY DESTINATION lib PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
+
+if(ENABLE_STATIC)
+    add_library(cnpy-static STATIC "cnpy.cpp")
+    set_target_properties(cnpy-static PROPERTIES OUTPUT_NAME "cnpy")
+    install(TARGETS "cnpy-static" ARCHIVE DESTINATION lib)
+endif(ENABLE_STATIC)
+
+install(FILES "cnpy.h" DESTINATION include)
+install(FILES "mat2npz" "npy2mat" "npz2mat" DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
+
+add_executable(example1 example1.cpp)
+target_link_libraries(example1 cnpy)
diff --git a/include/cnpy/LICENSE b/include/cnpy/LICENSE
new file mode 100644
index 0000000..e60eadb
--- /dev/null
+++ b/include/cnpy/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) Carl Rogers, 2011
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/include/cnpy/README.md b/include/cnpy/README.md
new file mode 100644
index 0000000..37c4a43
--- /dev/null
+++ b/include/cnpy/README.md
@@ -0,0 +1,55 @@
+# Purpose:
+
+NumPy offers the `save` method for easy saving of arrays into .npy and `savez` for zipping multiple .npy arrays together into a .npz file. 
+
+`cnpy` lets you read and write to these formats in C++. 
+
+The motivation comes from scientific programming where large amounts of data are generated in C++ and analyzed in Python.
+
+Writing to .npy has the advantage of using low-level C++ I/O (fread and fwrite) for speed and binary format for size. 
+The .npy file header takes care of specifying the size, shape, and data type of the array, so specifying the format of the data is unnecessary.
+
+Loading data written in numpy formats into C++ is equally simple, but requires you to type-cast the loaded data to the type of your choice.
+
+# Installation:
+
+Default installation directory is /usr/local. 
+To specify a different directory, add `-DCMAKE_INSTALL_PREFIX=/path/to/install/dir` to the cmake invocation in step 4.
+
+1. get [cmake](www.cmake.org)
+2. create a build directory, say $HOME/build
+3. cd $HOME/build
+4. cmake /path/to/cnpy
+5. make
+6. make install
+
+# Using:
+
+To use, `#include"cnpy.h"` in your source code. Compile the source code mycode.cpp as
+
+```bash
+g++ -o mycode mycode.cpp -L/path/to/install/dir -lcnpy -lz --std=c++11
+```
+
+# Description:
+
+There are two functions for writing data: `npy_save` and `npz_save`.
+
+There are 3 functions for reading:
+- `npy_load` will load a .npy file. 
+- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structues. 
+- `npz_load(fname,varname)` will load and return the NpyArray for data varname from the specified .npz file.
+
+The data structure for loaded data is below. 
+Data is accessed via the `data<T>()`-method, which returns a pointer of the specified type (which must match the underlying datatype of the data). 
+The array shape and word size are read from the npy header.
+
+```c++
+struct NpyArray {
+    std::vector<size_t> shape;
+    size_t word_size;
+    template<typename T> T* data();
+};
+```
+
+See [example1.cpp](example1.cpp) for examples of how to use the library. example1 will also be build during cmake installation.
diff --git a/include/cnpy/cnpy.cpp b/include/cnpy/cnpy.cpp
new file mode 100644
index 0000000..2d28578
--- /dev/null
+++ b/include/cnpy/cnpy.cpp
@@ -0,0 +1,340 @@
+//Copyright (C) 2011  Carl Rogers
+//Released under MIT License
+//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
+
+#include"cnpy.h"
+#include<complex>
+#include<cstdlib>
+#include<algorithm>
+#include<cstring>
+#include<iomanip>
+#include<stdint.h>
+#include<stdexcept>
+#include <regex>
+
+char cnpy::BigEndianTest() {
+    int x = 1;
+    return (((char *)&x)[0]) ? '<' : '>';
+}
+
+char cnpy::map_type(const std::type_info& t)
+{
+    if(t == typeid(float) ) return 'f';
+    if(t == typeid(double) ) return 'f';
+    if(t == typeid(long double) ) return 'f';
+
+    if(t == typeid(int) ) return 'i';
+    if(t == typeid(char) ) return 'i';
+    if(t == typeid(short) ) return 'i';
+    if(t == typeid(long) ) return 'i';
+    if(t == typeid(long long) ) return 'i';
+
+    if(t == typeid(unsigned char) ) return 'u';
+    if(t == typeid(unsigned short) ) return 'u';
+    if(t == typeid(unsigned long) ) return 'u';
+    if(t == typeid(unsigned long long) ) return 'u';
+    if(t == typeid(unsigned int) ) return 'u';
+
+    if(t == typeid(bool) ) return 'b';
+
+    if(t == typeid(std::complex<float>) ) return 'c';
+    if(t == typeid(std::complex<double>) ) return 'c';
+    if(t == typeid(std::complex<long double>) ) return 'c';
+
+    else return '?';
+}
+
+template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs) {
+    lhs.insert(lhs.end(),rhs.begin(),rhs.end());
+    return lhs;
+}
+
+template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs) {
+    //write in little endian
+    size_t len = strlen(rhs);
+    lhs.reserve(len);
+    for(size_t byte = 0; byte < len; byte++) {
+        lhs.push_back(rhs[byte]);
+    }
+    return lhs;
+}
+
+void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
+    //std::string magic_string(buffer,6);
+    uint8_t major_version = *reinterpret_cast<uint8_t*>(buffer+6);
+    uint8_t minor_version = *reinterpret_cast<uint8_t*>(buffer+7);
+    uint16_t header_len = *reinterpret_cast<uint16_t*>(buffer+8);
+    std::string header(reinterpret_cast<char*>(buffer+9),header_len);
+
+    size_t loc1, loc2;
+
+    //fortran order
+    loc1 = header.find("fortran_order")+16;
+    fortran_order = (header.substr(loc1,4) == "True" ? true : false);
+
+    //shape
+    loc1 = header.find("(");
+    loc2 = header.find(")");
+
+    std::regex num_regex("[0-9][0-9]*");
+    std::smatch sm;
+    shape.clear();
+
+    std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
+    while(std::regex_search(str_shape, sm, num_regex)) {
+        shape.push_back(std::stoi(sm[0].str()));
+        str_shape = sm.suffix().str();
+    }
+
+    //endian, word size, data type
+    //byte order code | stands for not applicable. 
+    //not sure when this applies except for byte array
+    loc1 = header.find("descr")+9;
+    bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
+    assert(littleEndian);
+
+    //char type = header[loc1+1];
+    //assert(type == map_type(T));
+
+    std::string str_ws = header.substr(loc1+2);
+    loc2 = str_ws.find("'");
+    word_size = atoi(str_ws.substr(0,loc2).c_str());
+}
+
+void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {  
+    char buffer[256];
+    size_t res = fread(buffer,sizeof(char),11,fp);       
+    if(res != 11)
+        throw std::runtime_error("parse_npy_header: failed fread");
+    std::string header = fgets(buffer,256,fp);
+    assert(header[header.size()-1] == '\n');
+
+    size_t loc1, loc2;
+
+    //fortran order
+    loc1 = header.find("fortran_order");
+    if (loc1 == std::string::npos)
+        throw std::runtime_error("parse_npy_header: failed to find header keyword: 'fortran_order'");
+    loc1 += 16;
+    fortran_order = (header.substr(loc1,4) == "True" ? true : false);
+
+    //shape
+    loc1 = header.find("(");
+    loc2 = header.find(")");
+    if (loc1 == std::string::npos || loc2 == std::string::npos)
+        throw std::runtime_error("parse_npy_header: failed to find header keyword: '(' or ')'");
+
+    std::regex num_regex("[0-9][0-9]*");
+    std::smatch sm;
+    shape.clear();
+
+    std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
+    while(std::regex_search(str_shape, sm, num_regex)) {
+        shape.push_back(std::stoi(sm[0].str()));
+        str_shape = sm.suffix().str();
+    }
+
+    //endian, word size, data type
+    //byte order code | stands for not applicable. 
+    //not sure when this applies except for byte array
+    loc1 = header.find("descr");
+    if (loc1 == std::string::npos)
+        throw std::runtime_error("parse_npy_header: failed to find header keyword: 'descr'");
+    loc1 += 9;
+    bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
+    assert(littleEndian);
+
+    //char type = header[loc1+1];
+    //assert(type == map_type(T));
+
+    std::string str_ws = header.substr(loc1+2);
+    loc2 = str_ws.find("'");
+    word_size = atoi(str_ws.substr(0,loc2).c_str());
+}
+
+void cnpy::parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset)
+{
+    std::vector<char> footer(22);
+    fseek(fp,-22,SEEK_END);
+    size_t res = fread(&footer[0],sizeof(char),22,fp);
+    if(res != 22)
+        throw std::runtime_error("parse_zip_footer: failed fread");
+
+    uint16_t disk_no, disk_start, nrecs_on_disk, comment_len;
+    disk_no = *(uint16_t*) &footer[4];
+    disk_start = *(uint16_t*) &footer[6];
+    nrecs_on_disk = *(uint16_t*) &footer[8];
+    nrecs = *(uint16_t*) &footer[10];
+    global_header_size = *(uint32_t*) &footer[12];
+    global_header_offset = *(uint32_t*) &footer[16];
+    comment_len = *(uint16_t*) &footer[20];
+
+    assert(disk_no == 0);
+    assert(disk_start == 0);
+    assert(nrecs_on_disk == nrecs);
+    assert(comment_len == 0);
+}
+
+cnpy::NpyArray load_the_npy_file(FILE* fp) {
+    std::vector<size_t> shape;
+    size_t word_size;
+    bool fortran_order;
+    cnpy::parse_npy_header(fp,word_size,shape,fortran_order);
+
+    cnpy::NpyArray arr(shape, word_size, fortran_order);
+    size_t nread = fread(arr.data<char>(),1,arr.num_bytes(),fp);
+    if(nread != arr.num_bytes())
+        throw std::runtime_error("load_the_npy_file: failed fread");
+    return arr;
+}
+
+cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) {
+
+    std::vector<unsigned char> buffer_compr(compr_bytes);
+    std::vector<unsigned char> buffer_uncompr(uncompr_bytes);
+    size_t nread = fread(&buffer_compr[0],1,compr_bytes,fp);
+    if(nread != compr_bytes)
+        throw std::runtime_error("load_the_npy_file: failed fread");
+
+    int err;
+    z_stream d_stream;
+
+    d_stream.zalloc = Z_NULL;
+    d_stream.zfree = Z_NULL;
+    d_stream.opaque = Z_NULL;
+    d_stream.avail_in = 0;
+    d_stream.next_in = Z_NULL;
+    err = inflateInit2(&d_stream, -MAX_WBITS);
+
+    d_stream.avail_in = compr_bytes;
+    d_stream.next_in = &buffer_compr[0];
+    d_stream.avail_out = uncompr_bytes;
+    d_stream.next_out = &buffer_uncompr[0];
+
+    err = inflate(&d_stream, Z_FINISH);
+    err = inflateEnd(&d_stream);
+
+    std::vector<size_t> shape;
+    size_t word_size;
+    bool fortran_order;
+    cnpy::parse_npy_header(&buffer_uncompr[0],word_size,shape,fortran_order);
+
+    cnpy::NpyArray array(shape, word_size, fortran_order);
+
+    size_t offset = uncompr_bytes - array.num_bytes();
+    memcpy(array.data<unsigned char>(),&buffer_uncompr[0]+offset,array.num_bytes());
+
+    return array;
+}
+
+cnpy::npz_t cnpy::npz_load(std::string fname) {
+    FILE* fp = fopen(fname.c_str(),"rb");
+
+    if(!fp) {
+        throw std::runtime_error("npz_load: Error! Unable to open file "+fname+"!");
+    }
+
+    cnpy::npz_t arrays;  
+
+    while(1) {
+        std::vector<char> local_header(30);
+        size_t headerres = fread(&local_header[0],sizeof(char),30,fp);
+        if(headerres != 30)
+            throw std::runtime_error("npz_load: failed fread");
+
+        //if we've reached the global header, stop reading
+        if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
+
+        //read in the variable name
+        uint16_t name_len = *(uint16_t*) &local_header[26];
+        std::string varname(name_len,' ');
+        size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp);
+        if(vname_res != name_len)
+            throw std::runtime_error("npz_load: failed fread");
+
+        //erase the lagging .npy        
+        varname.erase(varname.end()-4,varname.end());
+
+        //read in the extra field
+        uint16_t extra_field_len = *(uint16_t*) &local_header[28];
+        if(extra_field_len > 0) {
+            std::vector<char> buff(extra_field_len);
+            size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
+            if(efield_res != extra_field_len)
+                throw std::runtime_error("npz_load: failed fread");
+        }
+
+        uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
+        uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
+        uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);
+
+        if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);}
+        else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);}
+    }
+
+    fclose(fp);
+    return arrays;  
+}
+
+cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) {
+    FILE* fp = fopen(fname.c_str(),"rb");
+
+    if(!fp) throw std::runtime_error("npz_load: Unable to open file "+fname);
+
+    while(1) {
+        std::vector<char> local_header(30);
+        size_t header_res = fread(&local_header[0],sizeof(char),30,fp);
+        if(header_res != 30)
+            throw std::runtime_error("npz_load: failed fread");
+
+        //if we've reached the global header, stop reading
+        if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
+
+        //read in the variable name
+        uint16_t name_len = *(uint16_t*) &local_header[26];
+        std::string vname(name_len,' ');
+        size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp);      
+        if(vname_res != name_len)
+            throw std::runtime_error("npz_load: failed fread");
+        vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy
+
+        //read in the extra field
+        uint16_t extra_field_len = *(uint16_t*) &local_header[28];
+        fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field
+        
+        uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
+        uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
+        uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);
+
+        if(vname == varname) {
+            NpyArray array  = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes);
+            fclose(fp);
+            return array;
+        }
+        else {
+            //skip past the data
+            uint32_t size = *(uint32_t*) &local_header[22];
+            fseek(fp,size,SEEK_CUR);
+        }
+    }
+
+    fclose(fp);
+
+    //if we get here, we haven't found the variable in the file
+    throw std::runtime_error("npz_load: Variable name "+varname+" not found in "+fname);
+}
+
+cnpy::NpyArray cnpy::npy_load(std::string fname) {
+
+    FILE* fp = fopen(fname.c_str(), "rb");
+
+    if(!fp) throw std::runtime_error("npy_load: Unable to open file "+fname);
+
+    NpyArray arr = load_the_npy_file(fp);
+
+    fclose(fp);
+    return arr;
+}
+
+
+
diff --git a/include/cnpy/cnpy.h b/include/cnpy/cnpy.h
new file mode 100644
index 0000000..0d3bb4c
--- /dev/null
+++ b/include/cnpy/cnpy.h
@@ -0,0 +1,269 @@
+//Copyright (C) 2011  Carl Rogers
+//Released under MIT License
+//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
+
+#ifndef LIBCNPY_H_
+#define LIBCNPY_H_
+
+#include<string>
+#include<stdexcept>
+#include<sstream>
+#include<vector>
+#include<cstdio>
+#include<typeinfo>
+#include<iostream>
+#include<cassert>
+#include<zlib.h>
+#include<map>
+#include<memory>
+#include<stdint.h>
+#include<numeric>
+
+namespace cnpy {
+
+    struct NpyArray {
+        NpyArray(const std::vector<size_t>& _shape, size_t _word_size, bool _fortran_order) :
+            shape(_shape), word_size(_word_size), fortran_order(_fortran_order)
+        {
+            num_vals = 1;
+            for(size_t i = 0;i < shape.size();i++) num_vals *= shape[i];
+            data_holder = std::shared_ptr<std::vector<char>>(
+                new std::vector<char>(num_vals * word_size));
+        }
+
+        NpyArray() : shape(0), word_size(0), fortran_order(0), num_vals(0) { }
+
+        template<typename T>
+        T* data() {
+            return reinterpret_cast<T*>(&(*data_holder)[0]);
+        }
+
+        template<typename T>
+        const T* data() const {
+            return reinterpret_cast<T*>(&(*data_holder)[0]);
+        }
+
+        template<typename T>
+        std::vector<T> as_vec() const {
+            const T* p = data<T>();
+            return std::vector<T>(p, p+num_vals);
+        }
+
+        size_t num_bytes() const {
+            return data_holder->size();
+        }
+
+        std::shared_ptr<std::vector<char>> data_holder;
+        std::vector<size_t> shape;
+        size_t word_size;
+        bool fortran_order;
+        size_t num_vals;
+    };
+   
+    using npz_t = std::map<std::string, NpyArray>; 
+
+    char BigEndianTest();
+    char map_type(const std::type_info& t);
+    template<typename T> std::vector<char> create_npy_header(const std::vector<size_t>& shape);
+    void parse_npy_header(FILE* fp,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order);
+    void parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order);
+    void parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset);
+    npz_t npz_load(std::string fname);
+    NpyArray npz_load(std::string fname, std::string varname);
+    NpyArray npy_load(std::string fname);
+
+    template<typename T> std::vector<char>& operator+=(std::vector<char>& lhs, const T rhs) {
+        //write in little endian
+        for(size_t byte = 0; byte < sizeof(T); byte++) {
+            char val = *((char*)&rhs+byte); 
+            lhs.push_back(val);
+        }
+        return lhs;
+    }
+
+    template<> std::vector<char>& operator+=(std::vector<char>& lhs, const std::string rhs);
+    template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs);
+
+
+    template<typename T> void npy_save(std::string fname, const T* data, const std::vector<size_t> shape, std::string mode = "w") {
+        FILE* fp = NULL;
+        std::vector<size_t> true_data_shape; //if appending, the shape of existing + new data
+
+        if(mode == "a") fp = fopen(fname.c_str(),"r+b");
+
+        if(fp) {
+            //file exists. we need to append to it. read the header, modify the array size
+            size_t word_size;
+            bool fortran_order;
+            parse_npy_header(fp,word_size,true_data_shape,fortran_order);
+            assert(!fortran_order);
+
+            if(word_size != sizeof(T)) {
+                std::cout<<"libnpy error: "<<fname<<" has word size "<<word_size<<" but npy_save appending data sized "<<sizeof(T)<<"\n";
+                assert( word_size == sizeof(T) );
+            }
+            if(true_data_shape.size() != shape.size()) {
+                std::cout<<"libnpy error: npy_save attempting to append misdimensioned data to "<<fname<<"\n";
+                assert(true_data_shape.size() != shape.size());
+            }
+
+            for(size_t i = 1; i < shape.size(); i++) {
+                if(shape[i] != true_data_shape[i]) {
+                    std::cout<<"libnpy error: npy_save attempting to append misshaped data to "<<fname<<"\n";
+                    assert(shape[i] == true_data_shape[i]);
+                }
+            }
+            true_data_shape[0] += shape[0];
+        }
+        else {
+            fp = fopen(fname.c_str(),"wb");
+            true_data_shape = shape;
+        }
+
+        std::vector<char> header = create_npy_header<T>(true_data_shape);
+        size_t nels = std::accumulate(shape.begin(),shape.end(),1,std::multiplies<size_t>());
+
+        fseek(fp,0,SEEK_SET);
+        fwrite(&header[0],sizeof(char),header.size(),fp);
+        fseek(fp,0,SEEK_END);
+        fwrite(data,sizeof(T),nels,fp);
+        fclose(fp);
+    }
+
+    template<typename T> void npz_save(std::string zipname, std::string fname, const T* data, const std::vector<size_t>& shape, std::string mode = "w")
+    {
+        //first, append a .npy to the fname
+        fname += ".npy";
+
+        //now, on with the show
+        FILE* fp = NULL;
+        uint16_t nrecs = 0;
+        size_t global_header_offset = 0;
+        std::vector<char> global_header;
+
+        if(mode == "a") fp = fopen(zipname.c_str(),"r+b");
+
+        if(fp) {
+            //zip file exists. we need to add a new npy file to it.
+            //first read the footer. this gives us the offset and size of the global header
+            //then read and store the global header.
+            //below, we will write the the new data at the start of the global header then append the global header and footer below it
+            size_t global_header_size;
+            parse_zip_footer(fp,nrecs,global_header_size,global_header_offset);
+            fseek(fp,global_header_offset,SEEK_SET);
+            global_header.resize(global_header_size);
+            size_t res = fread(&global_header[0],sizeof(char),global_header_size,fp);
+            if(res != global_header_size){
+                throw std::runtime_error("npz_save: header read error while adding to existing zip");
+            }
+            fseek(fp,global_header_offset,SEEK_SET);
+        }
+        else {
+            fp = fopen(zipname.c_str(),"wb");
+        }
+
+        std::vector<char> npy_header = create_npy_header<T>(shape);
+
+        size_t nels = std::accumulate(shape.begin(),shape.end(),1,std::multiplies<size_t>());
+        size_t nbytes = nels*sizeof(T) + npy_header.size();
+
+        //get the CRC of the data to be added
+        uint32_t crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size());
+        crc = crc32(crc,(uint8_t*)data,nels*sizeof(T));
+
+        //build the local header
+        std::vector<char> local_header;
+        local_header += "PK"; //first part of sig
+        local_header += (uint16_t) 0x0403; //second part of sig
+        local_header += (uint16_t) 20; //min version to extract
+        local_header += (uint16_t) 0; //general purpose bit flag
+        local_header += (uint16_t) 0; //compression method
+        local_header += (uint16_t) 0; //file last mod time
+        local_header += (uint16_t) 0;     //file last mod date
+        local_header += (uint32_t) crc; //crc
+        local_header += (uint32_t) nbytes; //compressed size
+        local_header += (uint32_t) nbytes; //uncompressed size
+        local_header += (uint16_t) fname.size(); //fname length
+        local_header += (uint16_t) 0; //extra field length
+        local_header += fname;
+
+        //build global header
+        global_header += "PK"; //first part of sig
+        global_header += (uint16_t) 0x0201; //second part of sig
+        global_header += (uint16_t) 20; //version made by
+        global_header.insert(global_header.end(),local_header.begin()+4,local_header.begin()+30);
+        global_header += (uint16_t) 0; //file comment length
+        global_header += (uint16_t) 0; //disk number where file starts
+        global_header += (uint16_t) 0; //internal file attributes
+        global_header += (uint32_t) 0; //external file attributes
+        global_header += (uint32_t) global_header_offset; //relative offset of local file header, since it begins where the global header used to begin
+        global_header += fname;
+
+        //build footer
+        std::vector<char> footer;
+        footer += "PK"; //first part of sig
+        footer += (uint16_t) 0x0605; //second part of sig
+        footer += (uint16_t) 0; //number of this disk
+        footer += (uint16_t) 0; //disk where footer starts
+        footer += (uint16_t) (nrecs+1); //number of records on this disk
+        footer += (uint16_t) (nrecs+1); //total number of records
+        footer += (uint32_t) global_header.size(); //nbytes of global headers
+        footer += (uint32_t) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array
+        footer += (uint16_t) 0; //zip file comment length
+
+        //write everything
+        fwrite(&local_header[0],sizeof(char),local_header.size(),fp);
+        fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
+        fwrite(data,sizeof(T),nels,fp);
+        fwrite(&global_header[0],sizeof(char),global_header.size(),fp);
+        fwrite(&footer[0],sizeof(char),footer.size(),fp);
+        fclose(fp);
+    }
+
+    template<typename T> void npy_save(std::string fname, const std::vector<T> data, std::string mode = "w") {
+        std::vector<size_t> shape;
+        shape.push_back(data.size());
+        npy_save(fname, &data[0], shape, mode);
+    }
+
+    template<typename T> void npz_save(std::string zipname, std::string fname, const std::vector<T> data, std::string mode = "w") {
+        std::vector<size_t> shape;
+        shape.push_back(data.size());
+        npz_save(zipname, fname, &data[0], shape, mode);
+    }
+
+    template<typename T> std::vector<char> create_npy_header(const std::vector<size_t>& shape) {  
+
+        std::vector<char> dict;
+        dict += "{'descr': '";
+        dict += BigEndianTest();
+        dict += map_type(typeid(T));
+        dict += std::to_string(sizeof(T));
+        dict += "', 'fortran_order': False, 'shape': (";
+        dict += std::to_string(shape[0]);
+        for(size_t i = 1;i < shape.size();i++) {
+            dict += ", ";
+            dict += std::to_string(shape[i]);
+        }
+        if(shape.size() == 1) dict += ",";
+        dict += "), }";
+        //pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n
+        int remainder = 16 - (10 + dict.size()) % 16;
+        dict.insert(dict.end(),remainder,' ');
+        dict.back() = '\n';
+
+        std::vector<char> header;
+        header += (char) 0x93;
+        header += "NUMPY";
+        header += (char) 0x01; //major version of numpy format
+        header += (char) 0x00; //minor version of numpy format
+        header += (uint16_t) dict.size();
+        header.insert(header.end(),dict.begin(),dict.end());
+
+        return header;
+    }
+
+
+}
+
+#endif
diff --git a/include/cnpy/example1.cpp b/include/cnpy/example1.cpp
new file mode 100644
index 0000000..70ac5aa
--- /dev/null
+++ b/include/cnpy/example1.cpp
@@ -0,0 +1,55 @@
+#include"cnpy.h"
+#include<complex>
+#include<cstdlib>
+#include<iostream>
+#include<map>
+#include<string>
+
+const int Nx = 128;
+const int Ny = 64;
+const int Nz = 32;
+
+int main()
+{
+    //set random seed so that result is reproducible (for testing)
+    srand(0);
+    //create random data
+    std::vector<std::complex<double>> data(Nx*Ny*Nz);
+    for(int i = 0;i < Nx*Ny*Nz;i++) data[i] = std::complex<double>(rand(),rand());
+
+    //save it to file
+    cnpy::npy_save("arr1.npy",&data[0],{Nz,Ny,Nx},"w");
+
+    //load it into a new array
+    cnpy::NpyArray arr = cnpy::npy_load("arr1.npy");
+    std::complex<double>* loaded_data = arr.data<std::complex<double>>();
+    
+    //make sure the loaded data matches the saved data
+    assert(arr.word_size == sizeof(std::complex<double>));
+    assert(arr.shape.size() == 3 && arr.shape[0] == Nz && arr.shape[1] == Ny && arr.shape[2] == Nx);
+    for(int i = 0; i < Nx*Ny*Nz;i++) assert(data[i] == loaded_data[i]);
+
+    //append the same data to file
+    //npy array on file now has shape (Nz+Nz,Ny,Nx)
+    cnpy::npy_save("arr1.npy",&data[0],{Nz,Ny,Nx},"a");
+
+    //now write to an npz file
+    //non-array variables are treated as 1D arrays with 1 element
+    double myVar1 = 1.2;
+    char myVar2 = 'a';
+    cnpy::npz_save("out.npz","myVar1",&myVar1,{1},"w"); //"w" overwrites any existing file
+    cnpy::npz_save("out.npz","myVar2",&myVar2,{1},"a"); //"a" appends to the file we created above
+    cnpy::npz_save("out.npz","arr1",&data[0],{Nz,Ny,Nx},"a"); //"a" appends to the file we created above
+
+    //load a single var from the npz file
+    cnpy::NpyArray arr2 = cnpy::npz_load("out.npz","arr1");
+
+    //load the entire npz file
+    cnpy::npz_t my_npz = cnpy::npz_load("out.npz");
+    
+    //check that the loaded myVar1 matches myVar1
+    cnpy::NpyArray arr_mv1 = my_npz["myVar1"];
+    double* mv1 = arr_mv1.data<double>();
+    assert(arr_mv1.shape.size() == 1 && arr_mv1.shape[0] == 1);
+    assert(mv1[0] == myVar1);
+}
diff --git a/include/cnpy/mat2npz b/include/cnpy/mat2npz
new file mode 100644
index 0000000..c1bbd3e
--- /dev/null
+++ b/include/cnpy/mat2npz
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import sys
+from numpy import savez
+from scipy.io import loadmat
+
+assert len(sys.argv) > 1
+
+files = sys.argv[1:]
+
+for f in files:
+    mat_vars = loadmat(f)
+    mat_vars.pop('__version__')
+    mat_vars.pop('__header__')
+    mat_vars.pop('__globals__')
+
+    fn = f.replace('.mat','.npz')
+    savez(fn,**mat_vars)
diff --git a/include/cnpy/npy2mat b/include/cnpy/npy2mat
new file mode 100644
index 0000000..fdd5da7
--- /dev/null
+++ b/include/cnpy/npy2mat
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import sys
+from numpy import load
+from scipy.io import savemat
+
+assert len(sys.argv) > 1
+
+files = sys.argv[1:]
+
+for f in files:
+   data = load(f)
+   fn = f.replace('.npy','')
+   fn = fn.replace('.','_')
+   savemat(fn,{fn : data})
diff --git a/include/cnpy/npz2mat b/include/cnpy/npz2mat
new file mode 100755
index 0000000..2e317d6
--- /dev/null
+++ b/include/cnpy/npz2mat
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import sys
+from numpy import load
+from scipy.io import savemat
+
+assert len(sys.argv) > 1
+
+files = sys.argv[1:]
+
+for f in files:
+   data = load(f)
+   fn = f.replace('.npz','')
+   fn = fn.replace('.','_') #matlab cant handle dots
+   savemat(fn,data)
diff --git a/include/kmeans_gpu.h b/include/kmeans_gpu.h
index 63fdab0..9bdffc4 100644
--- a/include/kmeans_gpu.h
+++ b/include/kmeans_gpu.h
@@ -1,7 +1,39 @@
 #pragma once
+#include <cuda_runtime.h>
 
 void kmeans_gpu(const float* h_data,
                 float* h_centroids,
                 int* h_assignments,
                 int N, int D, int K, int max_iter);
 
+__global__ void calculateDistances(const float *data, const float *centroids, float *distances, int num_vectors, int num_centroids, int dim);
+
+__global__ void normalize_centroids_kernel(
+    float* centroids,   // [K × D] centroid sums
+    const int* counts,  // [K] number of points in each cluster
+    int K, int D);
+
+// Assign each point to nearest centroid
+__global__ void assign_clusters_kernel(const float *distances,
+    int *assignments, int N, int K);
+
+// Update centroids by summing points per cluster
+__global__ void update_centroids_kernel(const float *data, const int *assignments,
+    float *new_centroids, int *counts,
+    int N, int D, int K);
+
+__global__ void build_inverted_lists_kernel(
+        const float* data,
+        const int* assignments,
+        float** inverted_lists,
+        int* list_sizes,
+        int N, int D);
+
+__global__ void create_all_vectors_flat_kernel(
+    const float* data,             // [num_vectors x dim]
+    const int* assignments,        // [num_vectors]
+    const int* cluster_offsets,    // [num_centroids]
+    int* cluster_sizes,            // [num_centroids]
+    float* all_vectors_flat,       // [num_vectors x dim]
+    int num_vectors, 
+    int dim);
diff --git a/include/load_bin_to_gpu.h b/include/load_bin_to_gpu.h
new file mode 100644
index 0000000..984259e
--- /dev/null
+++ b/include/load_bin_to_gpu.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <fstream>
+#include <iostream>
+#include <cuda_runtime.h>
+
+// Loads a float32 .bin file and copies it to GPU
+// Returns: pointer to device memory, and sets N_out = num vectors, D_out = dim per vector
+float* load_bin_to_gpu(const std::string& filename, int expected_dim, int& N_out, int& D_out) {
+    std::ifstream fin(filename, std::ios::binary);
+    if (!fin) {
+        std::cerr << "[ERROR] Could not open " << filename << "\n";
+        return nullptr;
+    }
+
+    // Determine size
+    fin.seekg(0, std::ios::end);
+    size_t total_bytes = fin.tellg();
+    fin.seekg(0, std::ios::beg);
+
+    size_t total_floats = total_bytes / sizeof(float);
+    if (total_floats % expected_dim != 0) {
+        std::cerr << "[ERROR] File size is not divisible by expected dim " << expected_dim << "\n";
+        return nullptr;
+    }
+
+    N_out = total_floats / expected_dim;
+    D_out = expected_dim;
+
+    std::cout << "[INFO] Loading " << N_out << " vectors of dim " << D_out << " from: " << filename << "\n";
+
+    // Read into host buffer
+    float* h_data = new float[total_floats];
+    fin.read(reinterpret_cast<char*>(h_data), total_bytes);
+    fin.close();
+
+    // Allocate and copy to device
+    float* d_data = nullptr;
+    cudaMalloc(&d_data, total_bytes);
+    cudaMemcpy(d_data, h_data, total_bytes, cudaMemcpyHostToDevice);
+
+    delete[] h_data;
+    return d_data;
+}
+
diff --git a/kmeans_output/ivf_centroids.bin b/kmeans_output/ivf_centroids.bin
new file mode 100644
index 0000000..591adf9
Binary files /dev/null and b/kmeans_output/ivf_centroids.bin differ
diff --git a/kmeans_output/ivf_centroids.npy b/kmeans_output/ivf_centroids.npy
new file mode 100644
index 0000000..34c5be8
Binary files /dev/null and b/kmeans_output/ivf_centroids.npy differ
diff --git a/kmeans_output/ivf_debug.txt b/kmeans_output/ivf_debug.txt
new file mode 100644
index 0000000..9eded58
--- /dev/null
+++ b/kmeans_output/ivf_debug.txt
@@ -0,0 +1,300 @@
+Cluster 0 (57 vectors):
+114 140 324 547 724 876 963 1105 1647 1648 1657 1673 1732 1817 1904 2534 3435 3822 4205 4307 ...
+
+Cluster 1 (41 vectors):
+33 195 198 308 534 958 1003 1348 1360 1375 1380 1392 1418 1797 1836 1860 2049 2243 2251 2288 ...
+
+Cluster 2 (79 vectors):
+0 2 101 257 434 519 543 555 608 619 727 784 879 887 1135 1216 1424 1483 1547 1590 ...
+
+Cluster 3 (55 vectors):
+189 297 305 404 640 737 802 974 1081 1881 2247 2453 2480 2563 2983 3487 3741 3875 4030 4240 ...
+
+Cluster 4 (82 vectors):
+595 930 1220 1256 1337 1641 1907 1924 2166 2246 2335 2431 2441 2486 2523 2581 2612 2621 2632 2669 ...
+
+Cluster 5 (124 vectors):
+69 279 513 552 596 1001 1088 1145 1303 1477 1500 1654 1661 1675 1722 1820 1945 1956 2034 2167 ...
+
+Cluster 6 (70 vectors):
+3 6 8 29 74 85 180 186 250 304 306 590 860 873 904 945 1039 1207 1247 1455 ...
+
+Cluster 7 (64 vectors):
+386 745 881 1137 1394 1440 1570 1642 1819 1913 1976 2126 2322 2367 2546 2656 2813 2956 3054 3090 ...
+
+Cluster 8 (73 vectors):
+4 18 20 234 254 256 383 480 527 533 603 654 890 902 910 962 971 1153 1179 1270 ...
+
+Cluster 9 (82 vectors):
+9 11 30 212 266 353 379 416 574 620 650 728 729 936 1217 1365 1410 1456 1495 1591 ...
+
+Cluster 10 (42 vectors):
+10 202 521 557 1047 1056 1439 1582 1857 1878 1923 2007 2076 2083 2340 2403 2660 2861 2863 2997 ...
+
+Cluster 11 (52 vectors):
+384 454 528 591 763 1118 1262 1269 2045 2428 2757 2891 2894 2988 3129 3252 3305 3689 3748 3937 ...
+
+Cluster 12 (144 vectors):
+1528 1696 1766 2047 2131 2195 2296 3167 3190 3554 3923 4054 4566 4574 4825 4849 4856 4866 4911 4958 ...
+
+Cluster 13 (89 vectors):
+13 296 414 682 878 968 1180 1274 1289 1652 1781 1891 1994 2012 2027 2266 2386 2482 2522 2620 ...
+
+Cluster 14 (34 vectors):
+1 14 391 594 722 1258 1304 1434 1542 1754 2470 2596 2786 2832 3651 3760 4029 4081 4218 4403 ...
+
+Cluster 15 (76 vectors):
+15 23 222 295 425 501 536 824 950 1061 1298 1436 1471 1593 1804 2011 2014 2554 2593 2728 ...
+
+Cluster 16 (49 vectors):
+16 27 160 372 621 715 1021 1284 1464 1480 1575 1594 1782 2424 2443 2549 2568 2680 3007 3277 ...
+
+Cluster 17 (53 vectors):
+28 719 864 970 1043 1079 1267 1406 2094 2955 3036 3057 3371 3738 4109 4601 4748 4842 4851 4946 ...
+
+Cluster 18 (50 vectors):
+456 751 820 830 993 1055 1275 1530 1560 1621 1730 2120 2123 2230 2401 2679 2831 3000 3048 3053 ...
+
+Cluster 19 (68 vectors):
+230 255 589 655 692 759 926 1007 1060 1077 1168 1329 1592 1644 1655 1818 1882 1957 2031 2275 ...
+
+Cluster 20 (101 vectors):
+70 233 394 409 413 959 969 1302 1361 1405 1478 1529 1600 1651 1674 1892 1917 2033 2350 2426 ...
+
+Cluster 21 (59 vectors):
+21 431 593 1025 1190 1197 1257 1299 1371 1425 1539 1611 1629 2101 2487 2572 3128 3202 3597 3801 ...
+
+Cluster 22 (202 vectors):
+199 228 243 280 360 465 492 502 505 532 616 631 643 695 702 735 762 800 812 843 ...
+
+Cluster 23 (99 vectors):
+999 1277 2098 2402 2920 3238 3630 3958 4195 4258 4558 4602 4706 4711 4780 4890 5010 5071 5133 5155 ...
+
+Cluster 24 (201 vectors):
+24 191 210 219 262 281 288 319 341 371 421 535 539 542 671 703 792 883 924 1004 ...
+
+Cluster 25 (203 vectors):
+25 136 149 174 184 185 193 215 259 283 328 396 407 427 470 524 563 579 600 638 ...
+
+Cluster 26 (150 vectors):
+22 26 43 270 370 418 444 462 504 529 531 564 573 609 610 668 676 747 755 756 ...
+
+Cluster 27 (37 vectors):
+867 1431 1977 2020 2592 3021 3260 3662 3798 5069 5575 5929 5984 6308 6615 6619 6644 6693 6786 6884 ...
+
+Cluster 28 (62 vectors):
+797 854 906 1040 1356 1408 1850 1861 1875 1979 2591 2610 2799 2978 3020 3239 3250 3259 3384 3607 ...
+
+Cluster 29 (85 vectors):
+639 736 796 853 905 984 1018 1041 1080 1407 1411 1499 1549 1876 1880 2389 2452 2479 2562 2977 ...
+
+Cluster 30 (93 vectors):
+931 952 1000 1221 1338 2013 2099 2196 2884 2919 3264 3631 3700 3819 3844 4257 4813 4893 4960 4993 ...
+
+Cluster 31 (93 vectors):
+5 17 19 31 86 115 141 206 235 253 312 545 553 622 888 894 934 986 1020 1106 ...
+
+Cluster 32 (81 vectors):
+32 103 143 157 378 475 520 725 836 935 1084 1121 1177 1330 1358 1391 1412 1472 1526 1650 ...
+
+Cluster 33 (64 vectors):
+710 1075 1864 2090 2156 2600 2667 2691 2719 2883 2904 2986 3339 3586 3639 3820 3994 4228 4615 4675 ...
+
+Cluster 34 (122 vectors):
+807 844 1162 2213 2244 2396 2966 3222 3888 4142 4187 4290 4298 5098 5202 5298 5662 5824 5851 5932 ...
+
+Cluster 35 (86 vectors):
+344 570 644 780 795 834 998 1051 1140 1283 1384 1386 1597 1634 1738 2053 2055 2066 2088 2214 ...
+
+Cluster 36 (191 vectors):
+130 135 148 173 192 216 282 316 317 327 406 422 428 450 469 478 494 522 525 526 ...
+
+Cluster 37 (129 vectors):
+238 313 340 495 626 629 738 786 886 978 1111 1448 1505 1523 1694 1761 1785 1788 1840 1937 ...
+
+Cluster 38 (46 vectors):
+716 754 1092 1438 2532 2567 3466 3851 5690 6140 6198 6232 6443 6653 7070 7163 7203 7401 7612 7629 ...
+
+Cluster 39 (40 vectors):
+39 712 713 1194 1558 1938 2015 2127 2313 2569 2570 3263 3577 3629 3926 4418 7336 7568 7785 7880 ...
+
+Cluster 40 (138 vectors):
+34 40 177 248 338 345 400 516 571 680 760 788 884 1109 1383 1428 1551 1842 2021 2043 ...
+
+Cluster 41 (78 vectors):
+35 38 41 226 401 517 681 761 789 1120 1139 1227 1249 1843 2179 3498 3677 4262 4292 4456 ...
+
+Cluster 42 (133 vectors):
+42 263 451 486 566 769 1070 1107 1148 1240 1292 1325 1352 1543 1556 1566 1646 1678 1705 1775 ...
+
+Cluster 43 (128 vectors):
+381 497 862 982 1032 1166 1186 1266 1363 1416 1444 1557 1645 1679 1727 1736 1772 1801 1815 1862 ...
+
+Cluster 44 (100 vectors):
+1017 1052 1065 1204 1389 1400 1712 1794 2103 2173 2181 2260 2363 2372 2533 2566 2683 2804 2933 3081 ...
+
+Cluster 45 (123 vectors):
+169 336 392 435 667 804 861 939 987 1015 1097 1165 1205 1239 1245 1301 1403 1680 1799 1887 ...
+
+Cluster 46 (157 vectors):
+94 150 171 240 264 315 346 363 448 567 601 663 723 742 773 840 922 954 964 1012 ...
+
+Cluster 47 (160 vectors):
+109 121 126 137 151 172 176 188 232 239 302 303 314 321 420 423 445 449 507 675 ...
+
+Cluster 48 (97 vectors):
+139 477 485 489 537 582 666 677 768 799 815 1072 1132 1176 1414 1476 1553 1563 1571 1711 ...
+
+Cluster 49 (87 vectors):
+1236 1390 2136 2258 2684 2729 2899 3080 3162 3220 3425 3437 3654 3837 5347 5599 5699 5856 5881 5901 ...
+
+Cluster 50 (86 vectors):
+50 153 178 183 258 268 323 584 635 774 828 845 855 898 900 965 1232 1316 1364 1508 ...
+
+Cluster 51 (48 vectors):
+277 289 390 481 991 1133 1379 1746 1803 1950 2704 3184 3194 3254 3257 3650 3945 4032 4166 4337 ...
+
+Cluster 52 (44 vectors):
+52 111 116 142 164 207 245 274 429 546 648 895 1349 1393 1798 2142 2594 2787 3881 4012 ...
+
+Cluster 53 (77 vectors):
+53 307 331 377 459 490 499 515 707 829 871 892 1008 1122 1233 1359 1435 1555 1755 1768 ...
+
+Cluster 54 (102 vectors):
+51 54 63 64 152 208 269 273 278 458 607 618 636 647 708 726 775 827 868 872 ...
+
+Cluster 55 (60 vectors):
+55 175 325 453 474 498 554 632 718 778 851 877 961 967 985 1010 1023 1042 1083 2227 ...
+
+Cluster 56 (96 vectors):
+405 659 731 875 975 1195 1297 1610 1639 1671 1701 2057 2122 2140 2500 2873 3006 3139 3272 3547 ...
+
+Cluster 57 (92 vectors):
+12 71 732 1196 1251 1273 1276 1601 1640 1718 1890 1960 2058 2141 2256 2481 2608 2698 2707 2874 ...
+
+Cluster 58 (93 vectors):
+58 214 267 365 373 424 441 472 658 714 874 891 944 1048 1098 1252 1296 1374 1398 1421 ...
+
+Cluster 59 (77 vectors):
+113 168 333 550 562 649 814 1466 2333 2709 3049 3541 4469 4477 4508 4511 4519 4548 4583 4595 ...
+
+Cluster 60 (102 vectors):
+60 92 112 158 165 167 246 252 290 311 326 592 653 688 1011 1013 1022 1224 1347 1465 ...
+
+Cluster 61 (85 vectors):
+443 500 556 880 1054 1136 1225 1545 1658 1826 1965 2269 2324 2461 2557 2979 2998 3050 3483 3542 ...
+
+Cluster 62 (100 vectors):
+59 62 91 359 689 1748 2084 2194 2556 2653 3546 3962 3978 4327 4479 4495 4506 4510 4512 4562 ...
+
+Cluster 63 (50 vectors):
+623 711 911 1076 1113 2091 2155 2185 2598 2668 2692 2720 2853 2930 3340 3640 3658 4674 4781 4787 ...
+
+Cluster 64 (57 vectors):
+709 770 1603 1931 1932 2954 3003 3690 3775 3921 4349 5193 5194 6462 6715 7142 7143 7168 7204 7205 ...
+
+Cluster 65 (49 vectors):
+251 291 455 512 634 921 1511 1579 1637 1741 2291 2338 2599 2903 3018 3175 3579 3584 3728 3824 ...
+
+Cluster 66 (153 vectors):
+7 61 66 104 105 117 118 119 161 163 181 204 205 356 357 1525 2268 3278 3666 4399 ...
+
+Cluster 67 (100 vectors):
+83 1387 1469 1474 1708 1774 2063 2114 2192 2360 2397 2475 2716 2866 2871 3067 3285 3353 3390 3416 ...
+
+Cluster 68 (130 vectors):
+129 656 741 946 1142 1230 1381 1475 1763 1793 1973 2165 2218 2361 2362 2451 2476 2491 2527 2565 ...
+
+Cluster 69 (209 vectors):
+865 1209 1264 1527 1697 1765 1925 2048 2132 2239 2297 2432 2442 2789 3168 3191 3209 3555 3722 3924 ...
+
+Cluster 70 (136 vectors):
+551 704 1002 1089 1208 1342 1653 1662 1723 1951 1955 1978 2169 2385 2521 2559 2835 2868 2892 2895 ...
+
+Cluster 71 (75 vectors):
+588 657 753 1024 1214 1335 1814 2223 2332 2688 2857 2876 3444 3523 3564 3571 3671 3674 3708 3821 ...
+
+Cluster 72 (108 vectors):
+72 220 236 241 446 580 752 766 908 956 1059 1171 1320 1336 1372 1376 1568 1922 1961 2224 ...
+
+Cluster 73 (58 vectors):
+73 201 320 385 432 433 447 461 767 783 1170 1172 1314 1417 1670 1835 1872 1901 2037 2290 ...
+
+Cluster 74 (40 vectors):
+223 518 604 1173 1422 1767 1810 1912 2190 2353 2780 2818 3015 4196 4406 4634 4878 4970 5070 5265 ...
+
+Cluster 75 (78 vectors):
+75 134 187 196 388 399 652 705 776 810 960 1343 1397 1442 1636 1808 1888 1914 2032 2178 ...
+
+Cluster 76 (106 vectors):
+76 568 791 819 826 1016 1185 1401 1667 1713 1771 1783 2161 2234 2259 2265 2344 2369 2383 2390 ...
+
+Cluster 77 (86 vectors):
+56 77 179 332 334 351 354 491 548 561 575 730 813 937 1066 1169 1313 1441 1681 1690 ...
+
+Cluster 78 (77 vectors):
+57 78 352 355 549 585 1067 1449 1524 1546 1595 1659 1691 1744 2070 2425 2595 2916 2980 3602 ...
+
+Cluster 79 (80 vectors):
+79 110 159 162 166 182 203 247 335 514 744 765 893 1014 1019 1213 1260 1562 1692 2146 ...
+
+Cluster 80 (172 vectors):
+138 209 322 361 625 947 1143 1334 1382 1430 1714 1740 1926 2118 2219 2309 2376 2508 2573 2674 ...
+
+Cluster 81 (90 vectors):
+47 330 339 523 782 885 1035 1110 1429 1552 1776 2022 2036 2044 2277 2449 2498 2503 2574 2722 ...
+
+Cluster 82 (93 vectors):
+65 82 294 426 460 831 1082 1152 1206 1395 1404 1409 1554 1588 1742 1827 1870 2010 2217 2298 ...
+
+Cluster 83 (182 vectors):
+231 276 299 301 349 419 487 587 615 630 642 697 734 785 869 925 929 983 1033 1119 ...
+
+Cluster 84 (48 vectors):
+84 1073 1388 4234 4363 5801 5809 5917 6002 6317 6532 6590 6699 6723 6759 6889 6969 7030 7069 7122 ...
+
+Cluster 85 (87 vectors):
+265 366 380 387 457 466 473 701 764 777 809 837 889 901 920 1046 1063 1078 1128 1144 ...
+
+Cluster 86 (88 vectors):
+211 367 442 581 700 866 907 955 972 973 1058 1115 1158 1198 1345 1351 1373 1481 1515 1559 ...
+
+Cluster 87 (102 vectors):
+87 227 569 717 779 794 835 943 1141 1282 1385 1633 1668 1703 1974 2384 2394 2840 3005 3099 ...
+
+Cluster 88 (64 vectors):
+88 100 221 237 242 374 511 558 633 674 927 1223 1321 1423 1450 1586 1813 2017 2528 2640 ...
+
+Cluster 89 (65 vectors):
+89 213 343 951 1189 1234 1300 1487 1837 2067 2177 2215 2748 2803 2825 2865 3031 3089 3187 3212 ...
+
+Cluster 90 (66 vectors):
+90 342 398 690 793 808 832 852 856 1114 1222 1396 1437 1567 1630 1721 1919 1975 2092 2102 ...
+
+Cluster 91 (78 vectors):
+395 410 415 572 576 678 750 918 976 1104 1160 1278 1451 1550 1702 1724 1779 2184 2188 2485 ...
+
+Cluster 92 (85 vectors):
+358 417 651 691 749 758 801 917 992 1006 1103 1259 1285 1350 1355 1538 1643 1656 1729 1778 ...
+
+Cluster 93 (159 vectors):
+93 123 145 190 224 292 309 347 364 368 408 430 471 479 488 538 583 602 627 669 ...
+
+Cluster 94 (175 vectors):
+261 285 310 318 337 369 393 436 612 646 670 673 685 772 821 825 988 1030 1049 1057 ...
+
+Cluster 95 (114 vectors):
+95 244 249 362 645 693 771 1062 1218 1470 1709 1927 2310 2346 2962 3016 3084 3169 3427 3477 ...
+
+Cluster 96 (261 vectors):
+96 97 102 106 107 108 120 122 124 125 127 128 131 132 133 144 146 147 154 155 ...
+
+Cluster 97 (238 vectors):
+194 200 229 260 271 275 284 287 298 348 382 397 403 452 463 496 508 530 586 598 ...
+
+Cluster 98 (261 vectors):
+36 44 46 48 67 80 98 170 329 375 437 439 468 509 559 605 665 720 858 896 ...
+
+Cluster 99 (189 vectors):
+37 45 49 68 81 99 376 438 440 467 476 510 560 606 664 721 798 859 897 916 ...
+
diff --git a/kmeans_output/ivf_list_ids.npy b/kmeans_output/ivf_list_ids.npy
new file mode 100644
index 0000000..4010439
Binary files /dev/null and b/kmeans_output/ivf_list_ids.npy differ
diff --git a/kmeans_output/ivf_offsets.npy b/kmeans_output/ivf_offsets.npy
new file mode 100644
index 0000000..81ac624
Binary files /dev/null and b/kmeans_output/ivf_offsets.npy differ
diff --git a/scripts/debug_ivf_index.py b/scripts/debug_ivf_index.py
new file mode 100644
index 0000000..fc3da44
--- /dev/null
+++ b/scripts/debug_ivf_index.py
@@ -0,0 +1,56 @@
+import numpy as np
+import argparse
+
+def debug_ivf_index(list_ids_path, offsets_path, output_txt=None):
+    list_ids = np.load(list_ids_path)
+    offsets = np.load(offsets_path)
+
+    K = len(offsets) - 1
+    N = len(list_ids)
+
+    print(f"[INFO] Loaded:")
+    print(f" - list_ids: {list_ids.shape}")
+    print(f" - offsets: {offsets.shape}")
+    print(f" - Number of clusters (K): {K}")
+    print(f" - Total assigned vectors: {N}\n")
+
+    # Basic validity checks
+    assert np.sum(np.diff(offsets)) == N, "!!! Offsets don't match total vector count!"
+    assert list_ids.min() >= 0, "!!! Negative vector ID detected!"
+    assert len(np.unique(list_ids)) == len(list_ids), "!!! Duplicate vector IDs detected!"
+
+    print("[CHECKED] Basic index checks passed.\n")
+
+    # Print cluster sizes
+    print("[INFO] Cluster sizes:")
+    for i in range(K):
+        start, end = offsets[i], offsets[i + 1]
+        size = end - start
+        print(f"  Cluster {i:3d}: {size:5d} vectors")
+
+    # Print example IDs
+    print("\n[INFO] Sample cluster assignments:")
+    for i in range(min(5, K)):
+        start, end = offsets[i], offsets[i + 1]
+        ids = list_ids[start:end]
+        print(f"  Cluster {i:3d}: {ids[:10]} ...")
+
+    # Optional: Save to text file
+    if output_txt:
+        with open(output_txt, "w") as f:
+            for i in range(K):
+                start, end = offsets[i], offsets[i + 1]
+                ids = list_ids[start:end]
+                f.write(f"Cluster {i} ({len(ids)} vectors):\n")
+                f.write(" ".join(map(str, ids[:20])) + " ...\n\n")
+        print(f"\n[OUTPUT]] Saved readable output to: {output_txt}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--list_ids", default="data/processed/ivf_list_ids.npy", help="Path to ivf_list_ids.npy")
+    parser.add_argument("--offsets", default="data/processed/ivf_offsets.npy", help="Path to ivf_offsets.npy")
+    parser.add_argument("--out", help="Optional output .txt file")
+    args = parser.parse_args()
+
+    debug_ivf_index(args.list_ids, args.offsets, args.out)
+
diff --git a/scripts/npy_to_bin.py b/scripts/npy_to_bin.py
new file mode 100644
index 0000000..00d7eea
--- /dev/null
+++ b/scripts/npy_to_bin.py
@@ -0,0 +1,19 @@
+import numpy as np
+import argparse
+import os
+
+def convert_npy_to_bin(npy_path, bin_path=None):
+    data = np.load(npy_path)
+    if bin_path is None:
+        bin_path = os.path.splitext(npy_path)[0] + ".bin"
+    data.astype(np.float32).tofile(bin_path)
+    print(f"Converted {npy_path} → {bin_path}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("npy_file", help="Path to .npy file")
+    parser.add_argument("--out", help="Optional output .bin path")
+    args = parser.parse_args()
+
+    convert_npy_to_bin(args.npy_file, args.out)
+
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
new file mode 100644
index 0000000..80f7ccb
--- /dev/null
+++ b/src/cuda/CMakeLists.txt
@@ -0,0 +1,29 @@
+find_package(CUDA REQUIRED)
+
+add_library(cuda_vector_db SHARED distance.cu 
+            index.cu select.cu
+            kmeans_gpu.cu
+            )
+
+set_target_properties( cuda_vector_db
+            PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+set_target_properties(cuda_vector_db PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Pass options to ptxas using -Xptxas
+target_compile_options(cuda_vector_db PRIVATE 
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -O3>   # Pass -O3 optimization to ptxas
+    # $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler -O2>   # Pass -O3 optimization to the host compiler
+    # $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler -Wall>  # Enable all warnings for the host compiler
+)
+
+target_compile_features(cuda_vector_db PUBLIC cxx_std_17)
+
+add_executable(kmeans_driver kmeans_driver.cpp)
+target_link_libraries(kmeans_driver cuda_vector_db)
+target_include_directories(cuda_vector_db PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
+target_include_directories(cuda_vector_db PUBLIC "${PROJECT_SOURCE_DIR}/include")
+
+
+include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+include_directories("${PROJECT_SOURCE_DIR}/include")
diff --git a/src/cuda/IVFIndex.cu b/src/cuda/IVFIndex.cu
new file mode 100644
index 0000000..c7da105
--- /dev/null
+++ b/src/cuda/IVFIndex.cu
@@ -0,0 +1,259 @@
+#include "kmeans_gpu.h"
+#include "IVFIndex.h"
+#include "distance.cuh"
+#include <random>
+#include <unordered_set>
+#include <algorithm> // for std::shuffle
+#include <iostream>
+#include "select.cu"
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+IVFIndex::IVFIndex(int num_centroids, int num_vectors, int dim, int iterations) : num_centroids(num_centroids),
+                                                                                  num_vectors(num_vectors),
+                                                                                  dim(dim),
+                                                                                  iterations(iterations)
+{
+    cudaMalloc(&centroids, num_centroids * dim * sizeof(float));
+    cudaMalloc(&data, num_vectors * dim * sizeof(float));
+    cudaMalloc(&assignments, num_vectors * sizeof(int));
+    cudaMalloc(&distances, num_vectors * num_centroids * sizeof(float));
+    cudaMalloc(&counts, num_centroids * sizeof(int));
+    cudaMemset(counts, 0, num_centroids * sizeof(int));
+    cudaMalloc(&d_inverted_lists, num_centroids * sizeof(float *));
+    cudaMalloc(&d_list_sizes, num_centroids * sizeof(int));
+    cudaMalloc(&d_cluster_offsets, (num_centroids + 1) * sizeof(int));
+    cudaMemset(d_list_sizes, 0, num_centroids * sizeof(int));
+    cudaMalloc(&d_all_vectors_flat, sizeof(float) * num_vectors * dim);
+}
+
+IVFIndex::~IVFIndex() {
+    if (centroids) cudaFree(centroids);
+    if (distances) cudaFree(distances);
+    if (counts) cudaFree(counts);
+    if (data) cudaFree(data);
+    if (assignments) cudaFree(assignments);
+    if (d_inverted_lists) cudaFree(d_inverted_lists);
+    if (d_list_sizes) cudaFree(d_list_sizes);
+    if (d_cluster_offsets) cudaFree(d_cluster_offsets);
+    if (d_all_vectors_flat) cudaFree(d_all_vectors_flat);
+}
+
+float* IVFIndex::getData() const { return data; }
+
+float* IVFIndex::getallvectorsflat() const { return d_all_vectors_flat; } 
+
+float** IVFIndex::getInvertedLists() const { return d_inverted_lists; }
+
+int* IVFIndex::getClusterOffsets() const { return d_cluster_offsets; }
+
+float* IVFIndex::getCentroids() const { return centroids; }
+
+int* IVFIndex::getAssignments() const { return assignments; }
+
+int IVFIndex::getNumCentroids() const { return num_centroids; }
+
+int IVFIndex::getDim() const { return dim; }
+
+int IVFIndex::getNumVectors() const { return num_vectors; }
+
+void IVFIndex::train (const float* train_data){
+    cudaMemcpy(data, train_data, num_vectors * dim * sizeof(float), cudaMemcpyHostToDevice);
+    
+    std::vector<float> h_centroids(num_centroids * dim);
+
+    // Set up random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis(0, num_vectors - 1);
+
+    // Optional: Keep track of selected indices if you want **unique** centroids
+    std::unordered_set<int> selected_indices;
+
+    int selected = 0;
+    while (selected < num_centroids) {
+        int idx = dis(gen);
+        if (selected_indices.find(idx) != selected_indices.end()) {
+            continue; // already selected this one
+        }
+        selected_indices.insert(idx);
+
+        // Copy the vector from train_data
+        for (int j = 0; j < dim; j++) {
+            h_centroids[selected * dim + j] = train_data[idx * dim + j];
+        }
+        selected++;
+    }
+    // printf("Print the centroids\n");
+    // for (int i = 0; i < num_centroids; ++i) {
+    //     printf("Centroid %d: ", i);
+    //     for (int j = 0; j < dim; ++j) {
+    //         printf("%f ", h_centroids[i * dim + j]);
+    //     }
+    //     printf("\n");
+    // }
+    cudaMemcpy(centroids, h_centroids.data(), num_centroids * dim * sizeof(float), cudaMemcpyHostToDevice);
+
+    for (int iter = 0; iter < iterations; ++iter) {
+        cudaMemset(counts, 0, num_centroids * sizeof(int));
+        dim3 blockDim(16, 16);
+        dim3 gridDim((num_vectors + blockDim.x - 1) / blockDim.x, (num_centroids + blockDim.y - 1) / blockDim.y);
+        
+        calculateDistances<<<gridDim, blockDim>>>(data, centroids, distances, num_vectors, num_centroids, dim);
+        cudaDeviceSynchronize();
+        checkCudaError("calculateDistances");
+        dim3 blockDim2(256);
+        dim3 gridDim2((num_vectors + blockDim2.x - 1) / blockDim2.x);
+        
+        assign_clusters_kernel<<<gridDim2, blockDim2>>>(distances, assignments, num_vectors, num_centroids);
+        cudaDeviceSynchronize();
+        checkCudaError("assign_clusters_kernel");
+
+        update_centroids_kernel<<<gridDim2, blockDim2>>>(data, assignments, centroids, counts, num_vectors, dim, num_centroids);
+        cudaDeviceSynchronize();
+        checkCudaError("update_centroids_kernel");
+
+        int threads = 256;
+        int blocks = (num_centroids * dim + threads - 1) / threads;
+
+        normalize_centroids_kernel<<<blocks, threads>>>(centroids, counts, num_centroids, dim);
+        cudaDeviceSynchronize();
+        checkCudaError("normalize_centroids_kernel");
+    }
+    // Now we have the centroids, let's create the inverted lists
+    std::vector<int> h_counts(num_centroids);
+    cudaMemcpy(h_counts.data(), counts, num_centroids * sizeof(int), cudaMemcpyDeviceToHost);
+
+    std::vector<int> cluster_offsets(num_centroids + 1, 0);
+    cluster_offsets[0] = 0;
+    printf("Cluster offsets: ");
+    for (int i = 1; i <= num_centroids; ++i) {
+        cluster_offsets[i] = cluster_offsets[i-1] + h_counts[i-1];
+        printf("%d ", cluster_offsets[i]);
+    }
+
+    // Upload to device
+    
+    cudaMemcpy(d_cluster_offsets, cluster_offsets.data(), (num_centroids+1) * sizeof(int), cudaMemcpyHostToDevice);
+    int* d_cluster_sizes;
+    cudaMalloc(&d_cluster_sizes, sizeof(int) * num_centroids);
+    cudaMemset(d_cluster_sizes, 0, sizeof(int) * num_centroids);
+    int block_size = 256;
+    int grid_size = (num_vectors + block_size - 1) / block_size;
+    create_all_vectors_flat_kernel<<<grid_size, block_size>>>(
+        data, 
+        assignments,
+        d_cluster_offsets,
+        d_cluster_sizes,
+        d_all_vectors_flat,
+        num_vectors,
+        dim
+    );
+    cudaDeviceSynchronize();
+    // for (int i = 0; i < num_centroids; i++) {
+    //     float* list;
+    //     cudaMalloc(&list, num_vectors * dim * sizeof(float)); // Upper bound
+    //     cudaMemcpy(&d_inverted_lists[i], &list, sizeof(float*), cudaMemcpyHostToDevice);
+    // }
+
+    // int threads = 256;
+    // int blocks = (num_vectors + threads - 1) / threads;
+
+    // build_inverted_lists_kernel<<<blocks, threads>>>(data, assignments, d_inverted_lists, d_list_sizes, num_vectors, dim);
+    // cudaDeviceSynchronize();
+    // checkCudaError("build_inverted_lists_kernel");
+
+    // Now clean temporary memory
+    cudaFree(assignments);
+    assignments = nullptr;
+
+    cudaFree(distances);
+    distances = nullptr;
+
+    cudaFree(counts);
+    counts = nullptr;
+
+    cudaFree(data);  // because we copied into inverted lists
+    data = nullptr;
+    
+    cudaFree(d_cluster_sizes);
+    d_cluster_sizes = nullptr;
+
+}
+
+void IVFIndex::checkCudaError(const std::string& kernelName) {
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA Error after %s: %s\n", kernelName.c_str(), cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+void IVFIndex::search_coarse_grained(thrust::device_vector<float> &queries, 
+        thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> *coarse_grained_output,
+        uint32_t centroids_to_select) {
+    thrust::device_vector<float> output_distances;
+    const int block_size = 16;
+    // TODO(): Compute tiles of product instead of computing all
+    // Perform tiled matrix multiplication
+    thrust::device_vector<float> product;
+    float* q = thrust::raw_pointer_cast(queries.data());
+    float* p = thrust::raw_pointer_cast(product.data());
+    matrix_mul_with_transpose(q, centroids, dim, p, queries.size()/dim, num_centroids, block_size);
+
+    // Call L2SelectMin
+    thrust::device_vector<float> centroid_norms;
+    thrust::device_vector<float> centroids_vector(centroids, centroids + num_centroids);
+    compute_l2_norm(centroids_vector, &centroid_norms, dim);
+    select_min(product, centroids_vector, centroid_norms, dim, queries.size()/dim, *coarse_grained_output, centroids_to_select);
+}
+
+void IVFIndex::search_fine_grained(thrust::device_vector<float> &queries, thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> &coarse_grained_output,
+        thrust::device_vector<int64_t> *fine_grained_output,
+        uint32_t k) {
+    /**
+     * Inputs: 
+     * 1. queries
+     * 2. inverted index (from the class)
+     *      - data points (a single buffer)
+     *      - offsets into the data points array
+     * 3. Coarse grained output: centroid ids
+     * Steps:
+     * 1. Calculate distances
+     * 2. Select min
+     */
+    uint32_t nqueries = queries.size() / dim;
+    // TODO(): Allocate memory for output_distances
+    thrust::device_vector<float> output_distances;
+    float* vectors = IVFIndex::getallvectorsflat();
+    thrust::device_vector<float> ivf_data_points(vectors, vectors + IVFIndex::getNumVectors());
+    int* offsets = IVFIndex::getClusterOffsets();
+    thrust::device_vector<int64_t> ivf_offsets(offsets, offsets + IVFIndex::getNumCentroids() + 1);
+    calc_distances_for_fine_grained_search(ivf_data_points, ivf_offsets, queries, coarse_grained_output,
+        output_distances, dim);
+}
+
+thrust::device_vector<int64_t> IVFIndex::search_batch(thrust::device_vector<float> &queries, uint32_t k) {
+    thrust::device_vector<int64_t> coarse_grained_output;
+    // TODO(): Discuss
+    uint32_t centroids_to_select = (uint32_t)sqrt(k);
+
+    // Compute L2 norm for queries
+    thrust::device_vector<float> queries_norm;
+    compute_l2_norm(queries, &queries_norm, dim);
+
+    search_coarse_grained(queries, queries_norm, &coarse_grained_output, centroids_to_select);
+
+    thrust::device_vector<int64_t> fine_grained_output;
+    search_fine_grained(queries, queries_norm, coarse_grained_output, &fine_grained_output, k);
+
+    // Cast fine_grained_output to IndexType
+    thrust::device_vector<IndexType> results(fine_grained_output.size());
+    thrust::transform(fine_grained_output.begin(), fine_grained_output.end(), results.begin(), thrust::placeholders::_1);
+
+    return results;
+}
+
+
diff --git a/src/cuda/distance.cu b/src/cuda/distance.cu
index a2644b9..00ab03b 100644
--- a/src/cuda/distance.cu
+++ b/src/cuda/distance.cu
@@ -1,4 +1,4 @@
-#include <cuda_runtime.h>
+#include "distance.cuh"
 #include <cmath>
 
 __global__ void compute_l2_distances(
@@ -24,3 +24,223 @@ __global__ void compute_l2_distances(
 
     distances[q_idx * num_bases + b_idx] = sum;
 }
+
+
+constexpr int ROW_TILE_SIZE = 8;
+constexpr int BLOCK_SIZE = 128;
+
+__global__ void compute_l2_norm_kernel(const float *input,
+        float *output, int ndim, int input_size) {
+    extern __shared__ float s[];
+    float *sum_staging = &s[ROW_TILE_SIZE];
+    int global_thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int row_start = global_thread_id * ROW_TILE_SIZE;
+
+    if (threadIdx.x < ROW_TILE_SIZE) {
+        s[threadIdx.x] = 0;
+    }
+    __syncthreads();
+    
+    #pragma unroll
+    for (int i=0; i<ROW_TILE_SIZE; i++) {
+        if (row_start + i >= input_size) {
+            break;
+        }
+        float sum = 0.0f;
+        for (int j=threadIdx.x; j<ndim; j+=blockDim.x) {
+            float v = input[(row_start+i) * ndim + j];
+            sum += v * v;
+        }
+        sum_staging[threadIdx.x] = sum;
+        __syncthreads();
+
+        for (int j=blockDim.x/2; j > 0; j>>=1) {
+            if (threadIdx.x < j) {
+                sum_staging[threadIdx.x] += sum_staging[threadIdx.x + j];
+            }
+            __syncthreads();
+        }
+        if (threadIdx.x == 0) {
+            s[i] = sum_staging[0];
+        }
+    }
+    
+    if (threadIdx.x == 0) {
+        for (int i=0; i<ROW_TILE_SIZE && (i+row_start)<input_size; i++) {
+            output[row_start+i] = s[i];
+        }
+    }
+}
+
+void compute_l2_norm(thrust::device_vector<float> &input,
+        thrust::device_vector<float> *output, int ndim) {
+    output = new thrust::device_vector<float>(input.size()/ndim);
+    int num_threads = (input.size() + 7)/ROW_TILE_SIZE;
+    int num_blocks = (num_threads + BLOCK_SIZE - 1)/BLOCK_SIZE;
+    int shared_mem_size = (ROW_TILE_SIZE + BLOCK_SIZE) * sizeof(float);
+    compute_l2_norm_kernel<<<num_blocks, BLOCK_SIZE, shared_mem_size>>>(thrust::raw_pointer_cast(input.data()),
+        thrust::raw_pointer_cast(output->data()), ndim, input.size());
+}
+
+
+/**
+ * CUDA kernel to calculate C = A * B(T)
+ * Dimensions of matrix A: (la * ndim)
+ * Dimensions of matrix B: (lb * ndim)
+ * 
+ * This kernel performs matrix multiplication where the second matrix (B) is transposed.
+ * It uses shared memory to load tiles of both matrices A and B for efficient computation.
+ * The key difference from `matrix_mul_kernel` is that this kernel transposes matrix B
+ * on-the-fly while loading it into shared memory, enabling computation of A * B(T).
+ * This is particularly useful when the transpose of B is required for the operation.
+ */
+template <uint32_t BLOCK_SIZE>
+__global__ void matrix_mul_kernel_with_transpose(const float* A, const float *B, int ndim, 
+        float *C, int lb, int la) {
+    __shared__ float a_tile_shared[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float b_tile_shared[BLOCK_SIZE][BLOCK_SIZE];
+    
+    // x -> vertical dimension
+    // y -> horizontal dimension
+    // ndim -> number of dimensions
+    int bx = blockIdx.x;
+    int by = blockIdx.y;
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+
+    int a_begin_offset = bx * BLOCK_SIZE * ndim;
+    int b_begin_offset = by * BLOCK_SIZE * ndim;
+    
+    int a_end_offset = a_begin_offset + ndim;
+
+    float sum = 0.0f;
+    
+    for (int i=a_begin_offset, j=b_begin_offset; i<a_end_offset; i+=BLOCK_SIZE, j+=BLOCK_SIZE) {
+        if (i + tx * ndim + ty < la * ndim) {
+            a_tile_shared[tx][ty] = A[i + tx * ndim + ty];
+        }
+        if (j + tx * ndim + ty < ndim * lb) {
+            // Transpose and load in a single step
+            b_tile_shared[ty][tx] = B[j + tx * ndim + ty];
+        }
+
+        __syncthreads();
+
+        for (int k=0; k<BLOCK_SIZE; k++) {
+            sum += a_tile_shared[tx][k] * b_tile_shared[k][ty];
+        }
+
+        __syncthreads();
+    }
+    int C_row = bx * BLOCK_SIZE + tx;
+    int C_col = by * BLOCK_SIZE + ty;
+    // int C_block_offset = lb * BLOCK_SIZE * bx + BLOCK_SIZE * by;
+    if (C_row < la && C_col < lb) {
+        C[C_row * lb + C_col] = sum;
+    }
+}
+
+
+void matrix_mul_with_transpose(const float* A, const float *B, int ndim, 
+        float *C, int la, int lb, int block_size) {
+    int num_blocks_x = (la + block_size - 1)/block_size;
+    int num_blocks_y = (lb + block_size - 1)/block_size;
+    dim3 blocks (num_blocks_x, num_blocks_y);
+    dim3 threads (BLOCK_SIZE, BLOCK_SIZE);
+    switch (block_size) {
+        case 8:
+            matrix_mul_kernel_with_transpose<8><<<blocks, threads>>>(A, B, ndim, C, lb, la);
+        
+        case 16:
+            matrix_mul_kernel_with_transpose<16><<<blocks, threads>>>(A, B, ndim, C, lb, la);
+
+        case 32:
+            matrix_mul_kernel_with_transpose<32><<<blocks, threads>>>(A, B, ndim, C, lb, la);
+}
+}
+
+
+__device__ __forceinline__ int get_lane_id() {
+    int lane_id;
+    asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
+    return lane_id;
+}
+
+__device__ inline float warp_reduce_sum(float &s) {
+    for (int offset = 16; offset > 0; offset /= 2)
+        s += __shfl_down_sync(0xFFFFFFFF, s, offset);
+}
+
+__global__ void calc_fine_grained_distance_kernel(float *queries, float *ivf_data_points, 
+        int64_t *ivf_offsets, int64_t *ivf_indexes, float *output_distances,
+        int64_t *coarse_grained_output, int64_t *data_points_per_query,
+        int64_t *indexes_per_query, uint32_t centroids_per_query, uint32_t ndim, 
+        uint32_t nqueries, int64_t max_ivf_list_length
+        ) {
+    // Each warp handles a single query point
+    uint32_t query_id = (blockDim.x * blockIdx.x + threadIdx.x)/32;
+    if (query_id >= nqueries) return;
+    uint32_t query_offset = query_id * ndim;
+    uint32_t centroid_id_start_offset = query_id * centroids_per_query;
+    // Hold number of data points per query
+    int64_t query_ctr = 0ll;
+    for (uint32_t i=0; i<centroids_per_query; i++) {
+        uint32_t centroid_id = coarse_grained_output[centroid_id_start_offset + i];
+        int64_t inverted_list_start_offset = ivf_offsets[centroid_id];
+        int64_t inverted_list_end_offset = ivf_offsets[centroid_id + 1];
+        int64_t inverted_list_length = (inverted_list_end_offset - inverted_list_start_offset)/ndim;
+        for (int64_t j=0; j<inverted_list_length; j++) {
+            int lane_id = get_lane_id();
+            float s = 0.0f;
+            for (int dim=lane_id; dim<ndim; dim += 32) {
+                float v = queries[query_offset + dim] - 
+                    ivf_data_points[inverted_list_start_offset + j * ndim + ndim];
+                s += v * v;
+            }
+            warp_reduce_sum(s);
+            if (threadIdx.x % 32 == 0) {
+                int64_t output_idx = query_id * centroids_per_query * max_ivf_list_length + query_ctr;
+                output_distances[output_idx] = s;
+                indexes_per_query[output_idx] = ivf_indexes[inverted_list_start_offset + j];
+            }
+            ++query_ctr;
+        } 
+    }
+    if (threadIdx.x % 32 == 0) {
+        data_points_per_query[query_id] = query_ctr;
+    }
+}
+
+void calc_distances_for_fine_grained_search(thrust::device_vector<float> &ivf_data_points,
+        thrust::device_vector<int64_t> &ivf_offsets, thrust::device_vector<int64_t> &ivf_indexes,
+        thrust::device_vector<float> &queries,
+        thrust::device_vector<int64_t> &coarse_grained_output,
+        thrust::device_vector<float> &output_distances,
+        thrust::device_vector<int64_t> &output_ids,
+        thrust::device_vector<int64_t> &output_offsets,
+        int ndim, int64_t max_ivf_list_length) {
+    uint32_t nqueries = queries.size()/ndim;
+    uint32_t query_tile_size = 32;
+    uint32_t centroids_per_query = coarse_grained_output.size()/nqueries;
+    uint32_t BLOCK_SIZE = 256;
+    thrust::device_vector<int64_t> points_per_query (nqueries);
+    for (uint32_t i=0; i<nqueries; i+=query_tile_size) {
+        float *queries_slice = thrust::raw_pointer_cast(&queries[i * ndim]);
+        int64_t *coarse_grained_output_slice = \
+            thrust::raw_pointer_cast(&coarse_grained_output[i * centroids_per_query]);
+        int64_t output_offset = i * centroids_per_query * max_ivf_list_length * i;
+        int64_t *output_ids_slice = thrust::raw_pointer_cast(&output_ids[output_offset]);
+        float *output_distances_slice = thrust::raw_pointer_cast(&output_distances[output_offset]);
+        int64_t *points_per_query_slice = thrust::raw_pointer_cast(&points_per_query[i]);
+        dim3 grid (query_tile_size/(BLOCK_SIZE/32));
+        calc_fine_grained_distance_kernel<<<grid, BLOCK_SIZE>>>(queries_slice, thrust::raw_pointer_cast(ivf_data_points.data()),
+            thrust::raw_pointer_cast(ivf_offsets.data()), thrust::raw_pointer_cast(ivf_indexes.data()),
+            output_distances_slice,
+            coarse_grained_output_slice,  
+            points_per_query_slice,
+            output_ids_slice,
+            centroids_per_query, 
+            ndim, nqueries, max_ivf_list_length);
+    }
+    thrust::inclusive_scan(points_per_query.begin(), points_per_query.end(), output_offsets.begin());
+}
\ No newline at end of file
diff --git a/src/cuda/distance.cuh b/src/cuda/distance.cuh
new file mode 100644
index 0000000..6730231
--- /dev/null
+++ b/src/cuda/distance.cuh
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+
+__global__ void compute_l2_distances(
+    const float* __restrict__ queries,   // [num_queries x dim]
+    const float* __restrict__ bases,     // [num_bases x dim]
+    float* __restrict__ distances,       // [num_queries x num_bases]
+    int num_queries,
+    int num_bases,
+    int dim);
+
+__global__ void compute_l2_norm_kernel(const float *input,
+    float *output, int ndim, int input_size);
+
+void compute_l2_norm(thrust::device_vector<float> &input,
+    thrust::device_vector<float> *output, int ndim);
+
+
+/**
+ * @brief Computes the matrix multiplication of matrix A and the transpose of matrix B.
+ *
+ * This function performs the multiplication of a matrix A with the transpose of 
+ * matrix B. Both matrices are assumed to be stored in row-major order.
+ *
+ * @param A Pointer to the first input matrix (matrix A).
+ * @param B Pointer to the second input matrix (matrix B).
+ * @param ndim The dimension of the matrices A and B (number of rows/columns).
+ */
+void matrix_mul_with_transpose(const float* A, const float *B, int ndim, 
+    float *C, int la, int lb, int block_size);
+
+
+/**
+ * @brief Calculates distances for a fine-grained search using IVF (Inverted File Index) data points.
+ * 
+ * @param ivf_data_points A reference to a Thrust device vector containing the IVF data points.
+ *                        These data points are used to compute distances during the search process.
+ * 
+ * This function is designed to operate on GPU memory and leverages CUDA and Thrust for efficient
+ * parallel computation. It is typically used in vector database applications where fine-grained
+ * distance calculations are required for nearest neighbor search or similar tasks.
+ */
+void calc_distances_for_fine_grained_search(thrust::device_vector<float> &ivf_data_points,
+    thrust::device_vector<int64_t> &ivf_offsets, thrust::device_vector<int64_t> &ivf_indexes,
+    thrust::device_vector<float> &queries,
+    thrust::device_vector<int64_t> &coarse_grained_output,
+    thrust::device_vector<float> &output_distances,
+    thrust::device_vector<int64_t> &output_ids,
+    thrust::device_vector<int64_t> &output_offsets,
+    int ndim, int64_t max_ivf_list_length);
\ No newline at end of file
diff --git a/src/cuda/heap.cuh b/src/cuda/heap.cuh
new file mode 100644
index 0000000..da4e1ac
--- /dev/null
+++ b/src/cuda/heap.cuh
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <cuda.h>
+#include <cstdint>
+#include "sort.cuh"
+
+constexpr int WARP_SIZE = 32;
+
+
+template <uint32_t WARP_QUEUE_SIZE,
+    uint32_t THREAD_QUEUE_SIZE>
+class GPUHeap {
+public:
+    __device__ GPUHeap() {
+        #pragma unroll
+        for (int i=0; i<THREAD_QUEUE_SIZE; i++) {
+            thread_queue_values[i] = INFINITY;
+        }
+        top_kth = INFINITY;
+        num_candidates = 0;
+    }
+
+    __device__ void add(uint64_t key, float value) {
+        if (value < top_kth) {
+            for (int i=THREAD_QUEUE_SIZE-1; i>=0; i--) {
+                thread_queue_values[i] = thread_queue_values[i-1];
+                thread_queue_keys[i] = thread_queue_keys[i-1];
+            }
+
+            thread_queue_values[0] = value;
+            thread_queue_keys[0] = key;
+            num_candidates++;
+        }
+        bool sort_required = (num_candidates == THREAD_QUEUE_SIZE);
+        sort_required = __any_sync(0xFFFFFFFF, sort_required);
+        // Sort if atleast one thread requires a sort
+        if (!sort_required) {
+            return;
+        }
+
+        merge_warp_queues();
+    }
+
+    // Merge warp queue and thread queues using bitonic sort
+    __device__ void merge_warp_queues() {
+        // Sort the thread queues while interpreting them as one lane-strided register array
+        sort_odd<THREAD_QUEUE_SIZE>(thread_queue_values, thread_queue_keys);
+        
+        __syncwarp();
+
+        // Merge the warp queue (already sorted) with the thread queue
+        merge_odd<WARP_QUEUE_LENGTH, THREAD_QUEUE_SIZE>(warp_queue_values, thread_queue_values, warp_queue_keys, thread_queue_keys);
+        __syncwarp();
+
+        num_candidates = 0;
+    }
+
+    __device__ void dump(int64_t *out_keys, int k) {
+        int lane_id = get_lane_id();
+
+        #pragma unroll
+        for (int i=0; i<WARP_QUEUE_LENGTH; i++) {
+            int idx = i * 32 + lane_id;
+            if (idx < k) {
+                // out_values[idx] = warp_queue_values[i];
+                out_keys[idx] = warp_queue_keys[i];
+            }
+        }
+    }
+
+private:
+    static constexpr int WARP_QUEUE_LENGTH = WARP_QUEUE_SIZE / WARP_SIZE;
+
+    // Lane-strided warp queue which contains the top k values/indices
+    uint64_t warp_queue_keys[WARP_QUEUE_LENGTH];
+
+    float warp_queue_values[WARP_QUEUE_LENGTH];
+
+    // Thread queues which hold candidates for top-k elements
+    uint64_t thread_queue_keys[THREAD_QUEUE_SIZE];
+
+    float thread_queue_values[THREAD_QUEUE_SIZE];
+
+    // We should compare incoming elements with this
+    float top_kth;
+
+    // Number of top-k candidates added to thread queue
+    int num_candidates;
+};
\ No newline at end of file
diff --git a/src/cuda/index.cu b/src/cuda/index.cu
new file mode 100644
index 0000000..ed4e546
--- /dev/null
+++ b/src/cuda/index.cu
@@ -0,0 +1,55 @@
+#include "index.cuh"
+#include "distance.cuh"
+#include "select.cuh"
+
+#include <math.h>
+
+void IVFIndex::search_coarse_grained(thrust::device_vector<float> &queries, 
+        thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> *coarse_grained_output,
+        uint32_t centroids_to_select) {
+    thrust::device_vector<float> output_distances;
+    const int block_size = 16;
+    // TODO(): Compute tiles of product instead of computing all
+    // Perform tiled matrix multiplication
+    thrust::device_vector<float> product;
+    matrix_mul_with_transpose(thrust::raw_pointer_cast(queries.data()), thrust::raw_pointer_cast(centroids.data()), ndim,
+        thrust::raw_pointer_cast(product.data()), queries.size()/ndim, centroids.size()/ndim, block_size);
+
+    // Call L2SelectMin
+    select_min(product, centroids, centroid_norms, ndim, queries.size()/ndim, *coarse_grained_output, centroids_to_select);
+}
+
+void IVFIndex::search_fine_grained(thrust::device_vector<float> &queries, thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> &coarse_grained_output,
+        thrust::device_vector<float> &output_distances,
+        thrust::device_vector<int64_t> &output_indexes,
+        uint32_t k) {
+    uint32_t nqueries = queries.size() / ndim;
+    thrust::device_vector<float> distances (max_ivf_list_length * nqueries);
+    thrust::device_vector<int64_t> indexes (max_ivf_list_length * nqueries);
+    // Number of points over which we are going to run the fine-grained search for each query
+    thrust::device_vector<int64_t> offsets (nqueries + 1);
+    calc_distances_for_fine_grained_search(ivf_data_points, ivf_offsets, ivf_indices, queries, 
+        coarse_grained_output, distances, indexes, offsets, ndim, max_ivf_list_length);
+    // output_distances is grouped per query. We will stream these distances through a select kernel
+    select_fine_grained(distances, indexes, offsets, output_distances, output_indexes, ndim, nqueries, k);
+}
+
+thrust::device_vector<int64_t> IVFIndex::search_batch(thrust::device_vector<float> &queries, uint32_t k) {
+    thrust::device_vector<int64_t> coarse_grained_output;
+    uint32_t centroids_to_select = (uint32_t)sqrt(k);
+
+    // Compute L2 norm for queries
+    thrust::device_vector<float> queries_norm;
+    compute_l2_norm(queries, &queries_norm, ndim);
+
+    search_coarse_grained(queries, queries_norm, &coarse_grained_output, centroids_to_select);
+
+    thrust::device_vector<float> output_distances(k * queries.size()/ndim);
+    thrust::device_vector<int64_t> output_indexes(k * queries.size()/ndim); 
+    search_fine_grained(queries, queries_norm, coarse_grained_output, 
+        output_distances, output_indexes, k);
+
+    return output_indexes;
+}
\ No newline at end of file
diff --git a/src/cuda/index.cuh b/src/cuda/index.cuh
new file mode 100644
index 0000000..26592e8
--- /dev/null
+++ b/src/cuda/index.cuh
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cuda.h>
+#include <vector>
+#include <thrust/device_vector.h>
+
+typedef int64_t IndexType;
+
+class IVFIndex {
+public:
+    explicit IVFIndex();
+
+    thrust::device_vector<IndexType> search_batch(thrust::device_vector<float> &queries, uint32_t k);
+
+private:
+    thrust::device_vector<float> centroids;
+
+    thrust::device_vector<float> centroid_norms;
+
+    int ndim;
+
+    thrust::device_vector<float> ivf_data_points;
+
+    thrust::device_vector<int64_t> ivf_offsets;
+
+    thrust::device_vector<int64_t> ivf_indices;
+
+    int64_t max_ivf_list_length;
+
+    void search_coarse_grained(thrust::device_vector<float> &queries, 
+        thrust::device_vector<float> &queries_norms, 
+        thrust::device_vector<int64_t> *coarse_grained_output,
+        uint32_t centroids_to_select);
+
+    void search_fine_grained(thrust::device_vector<float> &queries, thrust::device_vector<float> &queries_norm, 
+        thrust::device_vector<int64_t> &coarse_grained_output,
+        thrust::device_vector<float> &output_distances,
+        thrust::device_vector<int64_t> &output_indexes,
+        uint32_t k);
+};
\ No newline at end of file
diff --git a/src/cuda/kmeans_driver.cpp b/src/cuda/kmeans_driver.cpp
index 183b828..6680c4d 100644
--- a/src/cuda/kmeans_driver.cpp
+++ b/src/cuda/kmeans_driver.cpp
@@ -1,55 +1,87 @@
 #include <iostream>
 #include <vector>
-#include <cstdlib>
-#include <cmath>
 #include <string>
-#include <cnpy.h>
+#include <fstream>
+#include <algorithm>
 #include "kmeans_gpu.h"
+#include "cnpy.h"
+#include "IVFIndex.h"
 
-void kmeans_gpu(const float* h_data, float* h_centroids,
-                int* h_assignments, int N, int D, int K, int max_iter);
-
-// Stub only — assume memory is handled externally
-
-void print_shape(const std::string& name, const std::vector<size_t>& shape) {
-    std::cout << name << " shape: [";
-    for (size_t i = 0; i < shape.size(); ++i) {
-        std::cout << shape[i];
-        if (i != shape.size() - 1) std::cout << ", ";
+void save_bin(const std::string& filename, const float* data, int count) {
+    std::ofstream fout(filename, std::ios::binary);
+    if (!fout) {
+        std::cerr << "Failed to open " << filename << " for writing.\n";
+        return;
     }
-    std::cout << "]" << std::endl;
+    fout.write(reinterpret_cast<const char*>(data), count * sizeof(float));
+    fout.close();
 }
 
-int main() {
-    // TODO: Load sift_base.npy into float* h_data
+int main(int argc, char** argv) {
     std::string input_path = "data/processed/sift_base.npy";
-    std::string output_path = "data/processed/ivf_centroids.npy";
+    std::string out_npy_path = "data/processed/ivf_centroids.npy";
+    std::string out_bin_path = "data/processed/ivf_centroids.bin";
 
-    const int num_clusters = 1024;
+    const int K = 100;
     const int max_iter = 20;
 
-    std::cout << "Loading training vectors from: " << input_path << std::endl;
+    std::cout << "[INFO] Loading training vectors from: " << input_path << std::endl;
     cnpy::NpyArray arr = cnpy::npy_load(input_path);
-    float* data = arr.data<float>();
+    float* h_data = arr.data<float>();
     std::vector<size_t> shape = arr.shape;
+    int N = shape[0];
+    int D = shape[1];
+
+
+    
+    std::cout << "[INFO] Dataset size: " << N << " vectors of dimension " << D << std::endl;
+
+    IVFIndex ivf_index(K, N, D, max_iter);
 
-    size_t N = shape[0];
-    size_t D = shape[1];
-    print_shape("Input", shape);
+    // std::vector<float> h_centroids(K * D);
+    // std::vector<int> h_assignments(N);
 
-    // TODO: Randomly initialize K centroids from h_data
-    std::vector<float> centroids(num_clusters * D);
-    std::vector<int> assignments(N, -1);
+    std::cout << "[INFO] Running CUDA KMeans with K=" << K << ", max_iter=" << max_iter << "...\n";
+    
+    // Run KMeans on GPU
+    ivf_index.train(h_data);
 
-    // Run CUDA KMeans implementation
-    kmeans_gpu(data, centroids.data(), assignments.data(), N, D, num_clusters, max_iter);
+	// Debug lines - should be deleted!!
+    std::cout << "\n[DEBUG] Cluster assignment count:\n";
+    std::vector<int> cluster_count(K, 0);
+    // std::vector<int> h_assignments(N);
+    // int assignments_size = N * sizeof(int);
+    // cudaMemcpy(h_assignments, ivf_index.getAssignments(), assignments_size, cudaMemcpyDeviceToHost);
+    // for (int i = 0; i < N; ++i) {
+    //     if (h_assignments[i] >= 0 && h_assignments[i] < K)
+    //         cluster_count[h_assignments[i]]++;
+    // }
+
+    // for (int k = 0; k < K; ++k) {
+    //     std::cout << "  Cluster " << k << ": " << cluster_count[k] << " vectors\n";
+    // }
+
+    std::cout << "\n[DEBUG] First 5 centroid values for each cluster:\n";
+
+    std::vector<float> h_centroids(K * D);
+    cudaMemcpy(h_centroids.data(), ivf_index.getCentroids(), K * D * sizeof(float), cudaMemcpyDeviceToHost);
+    for (int k = 0; k < K; ++k) {
+        std::cout << "  Centroid " << k << ": ";
+        for (int d = 0; d < std::min(5, D); ++d) {
+            std::cout << h_centroids[k * D + d] << " ";
+        }
+        std::cout << "...\n";
+    }
+    // Debug lines - should be deleted!!
+    
+    std::cout << "[INFO] Saving centroids to .npy and .bin...\n";
+    cnpy::npy_save(out_npy_path, h_centroids.data(), {static_cast<size_t>(K), static_cast<size_t>(D)}, "w");
+    save_bin(out_bin_path, h_centroids.data(), K * D);
 
-    // TODO: Save centroids to ivf_centroids.npy
-    // Save centroids as .npy
-    std::cout << "Saving centroids to: " << output_path << std::endl;
-    cnpy::npy_save(output_path, &centroids[0], {num_clusters, D}, "w");
+    std::cout << "KMeans complete. Saved to:\n";
+    std::cout << "   - " << out_npy_path << "\n";
+    std::cout << "   - " << out_bin_path << "\n";
 
-    std::cout << "KMeans complete. Centroids saved." << std::endl;
     return 0;
 }
 
diff --git a/src/cuda/kmeans_driver.cu b/src/cuda/kmeans_driver.cu
new file mode 100644
index 0000000..9d00e59
--- /dev/null
+++ b/src/cuda/kmeans_driver.cu
@@ -0,0 +1,118 @@
+#include "kmeans_gpu.h"
+#include "IVFIndex.h"
+#include <iostream>
+#include <vector>
+#include <string>
+#include <fstream>
+#include <algorithm>
+#include "cnpy.h"
+#include <iomanip> 
+
+#include <chrono> // <-- For timing
+using namespace std::chrono;
+
+
+
+void save_bin(const std::string& filename, const float* data, int count) {
+    std::ofstream fout(filename, std::ios::binary);
+    if (!fout) {
+        std::cerr << "Failed to open " << filename << " for writing.\n";
+        return;
+    }
+    fout.write(reinterpret_cast<const char*>(data), count * sizeof(float));
+    fout.close();
+}
+
+int main(int argc, char** argv) {
+    std::string input_path = "data/processed/sift_base.npy";
+    std::string out_npy_path = "data/processed/ivf_centroids.npy";
+    std::string out_bin_path = "data/processed/ivf_centroids.bin";
+
+    const int K = 100;
+    const int max_iter = 20;
+    auto load_start = high_resolution_clock::now();
+
+    std::cout << "[INFO] Loading training vectors from: " << input_path << std::endl;
+    cnpy::NpyArray arr = cnpy::npy_load(input_path);
+    float* h_data = arr.data<float>();
+    std::vector<size_t> shape = arr.shape;
+    int N = shape[0];
+    int D = shape[1];
+
+    auto load_end = high_resolution_clock::now();
+    std::cout << "[TIME] Data loading time: " 
+          << duration_cast<milliseconds>(load_end - load_start).count()
+          << " ms\n";
+    
+    std::cout << "[INFO] Dataset size: " << N << " vectors of dimension " << D << std::endl;
+
+    // Assuming h_data is already loaded, and N, D are known
+    int num_to_print = 5;  // Change this to print more/less vectors
+    for (int i = 0; i < std::min(num_to_print, N); ++i) {
+        std::cout << "Vector " << i << ": ";
+        for (int j = 0; j < D; ++j) {
+            std::cout << std::fixed << std::setprecision(4) << h_data[i * D + j] << " ";
+        }
+        std::cout << std::endl;
+    }
+    
+    IVFIndex ivf_index(K, N, D, max_iter);
+
+    // std::vector<float> h_centroids(K * D);
+    // std::vector<int> h_assignments(N);
+
+    std::cout << "[INFO] Running CUDA KMeans with K=" << K << ", max_iter=" << max_iter << "...\n";
+    
+    // Run KMeans on GPU
+    auto train_start = high_resolution_clock::now();
+    ivf_index.train(h_data);
+    auto train_end = high_resolution_clock::now();
+    std::cout << "[TIME] KMeans training time: " << duration_cast<milliseconds>(train_end - train_start).count() << " ms\n";
+	// Debug lines - should be deleted!!
+    // std::cout << "\n[DEBUG] Cluster assignment count:\n";
+    // std::vector<int> cluster_count(K, 0);
+    // std::vector<int> h_assignments(N);
+    // int assignments_size = N * sizeof(int);
+    // cudaMemcpy(h_assignments, ivf_index.getAssignments(), assignments_size, cudaMemcpyDeviceToHost);
+    // for (int i = 0; i < N; ++i) {
+    //     if (h_assignments[i] >= 0 && h_assignments[i] < K)
+    //         cluster_count[h_assignments[i]]++;
+    // }
+
+    // for (int k = 0; k < K; ++k) {
+    //     std::cout << "  Cluster " << k << ": " << cluster_count[k] << " vectors\n";
+    // }
+
+    std::cout << "\n[DEBUG] First 5 centroid values for each cluster:\n";
+
+    std::vector<float> h_centroids(K * D);
+    cudaMemcpy(h_centroids.data(), ivf_index.getCentroids(), K * D * sizeof(float), cudaMemcpyDeviceToHost);
+    for (int k = 0; k < K; ++k) {
+        std::cout << "  Centroid " << k << ": ";
+        for (int d = 0; d < std::min(5, D); ++d) {
+            std::cout << h_centroids[k * D + d] << " ";
+        }
+        std::cout << "...\n";
+    }
+    std::vector<float> h_all_vectors_flat(N * D);
+    cudaMemcpy(h_all_vectors_flat.data(), ivf_index.getallvectorsflat(), N * D * sizeof(float), cudaMemcpyDeviceToHost);
+    std::vector<int> h_offsets(K+1);
+    cudaMemcpy(h_offsets.data(), ivf_index.getClusterOffsets(), (K+1) * sizeof(int), cudaMemcpyDeviceToHost);
+    for (int i = 0; i < K; ++i)
+    {
+        int num_points = h_offsets[i+1] - h_offsets[i];
+        std::cout << "Cluster " << i << " has " << num_points << " points.\n";
+    }
+    
+    
+    std::cout << "[INFO] Saving centroids to .npy and .bin...\n";
+    cnpy::npy_save(out_npy_path, h_centroids.data(), {static_cast<size_t>(K), static_cast<size_t>(D)}, "w");
+    save_bin(out_bin_path, h_centroids.data(), K * D);
+
+    std::cout << "KMeans complete. Saved to:\n";
+    std::cout << "   - " << out_npy_path << "\n";
+    std::cout << "   - " << out_bin_path << "\n";
+
+    return 0;
+}
+
diff --git a/src/cuda/kmeans_gpu.cu b/src/cuda/kmeans_gpu.cu
index 8ba6d0f..c009ad2 100644
--- a/src/cuda/kmeans_gpu.cu
+++ b/src/cuda/kmeans_gpu.cu
@@ -1,45 +1,195 @@
-#include <cuda_runtime.h>
-#include <stdio.h>
+#include "kmeans_gpu.h"
+#include <device_launch_parameters.h>
 #include <float.h>
-#include <math.h>
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
 
-__global__ void assign_clusters_kernel(const float* data, const float* centroids,
-                                       int* assignments, int N, int D, int K) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
 
-    const float* point = &data[i * D];
-    float best_dist = FLT_MAX;
-    int best_k = -1;
+// Kernel for calculating L2 distance between vectors
+__global__ void calculateDistances(const float *data, const float *centroids,
+                                   float *distances, int num_vectors, int num_centroids,
+                                   int dim)
+{
+    int vector_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int centroid_idx = blockIdx.y * blockDim.y + threadIdx.y;
 
-    for (int k = 0; k < K; ++k) {
+    if (vector_idx < num_vectors && centroid_idx < num_centroids)
+    {
         float dist = 0.0f;
-        const float* centroid = &centroids[k * D];
-        for (int d = 0; d < D; ++d) {
-            float diff = point[d] - centroid[d];
+        for (int d = 0; d < dim; d++)
+        {
+            float diff = data[vector_idx * dim + d] - centroids[centroid_idx * dim + d];
             dist += diff * diff;
         }
+        distances[vector_idx * num_centroids + centroid_idx] = dist;
+    }
+}
+
+__global__ void normalize_centroids_kernel(
+    float* centroids,   // [K × D] centroid sums
+    const int* counts,  // [K] number of points in each cluster
+    int K, int D)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= K * D) return;
+
+    int cluster_id = idx / D;
+    int dim_id = idx % D;
+
+    int count = max(1, counts[cluster_id]); // avoid divide-by-zero
+
+    centroids[cluster_id * D + dim_id] /= count;
+}
+
+// Assign each point to nearest centroid
+__global__ void assign_clusters_kernel(const float *distances,
+                                       int *assignments, int N, int K)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N) return;
+
+    float best_dist = FLT_MAX;
+    int best_k = -1;
+
+    for (int k = 0; k < K; ++k) {
+        float dist = distances[idx * K + k];
         if (dist < best_dist) {
             best_dist = dist;
             best_k = k;
         }
     }
-
-    assignments[i] = best_k;
+    assignments[idx] = best_k;
 }
 
-__global__ void update_centroids_kernel(const float* data, const int* assignments,
-                                        float* new_centroids, int* counts,
-                                        int N, int D, int K) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
+// Update centroids by summing points per cluster
+__global__ void update_centroids_kernel(const float *data, const int *assignments,
+                                        float *new_centroids, int *counts,
+                                        int N, int D, int K)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N) return;
 
-    int cluster = assignments[i];
+    int cluster = assignments[idx];
     for (int d = 0; d < D; ++d) {
-        atomicAdd(&new_centroids[cluster * D + d], data[i * D + d]);
+        atomicAdd(&new_centroids[cluster * D + d], data[idx * D + d]);
     }
     atomicAdd(&counts[cluster], 1);
 }
 
-// Optional: normalize new centroids by dividing by counts on host
+__global__ void build_inverted_lists_kernel(
+    const float* data,
+    const int* assignments,
+    float** inverted_lists,
+    int* list_sizes,
+    int N, int D) 
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N) return;
+
+    int cluster_id = assignments[idx];
+    
+    // Atomically get a spot in the centroid's list
+    int pos = atomicAdd(&list_sizes[cluster_id], 1);
+
+    float* list = inverted_lists[cluster_id];
+
+    // Copy the vector
+    for (int d = 0; d < D; ++d) {
+        list[pos * D + d] = data[idx * D + d];
+    }
+}
+
+__global__ void create_all_vectors_flat_kernel(
+    const float* data,             // [num_vectors x dim]
+    const int* assignments,        // [num_vectors]
+    const int* cluster_offsets,    // [num_centroids]
+    int* cluster_sizes,            // [num_centroids]
+    float* all_vectors_flat,       // [num_vectors x dim]
+    int num_vectors, 
+    int dim)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_vectors) return;
+
+    int cluster_id = assignments[idx];
+
+    // Atomically reserve a spot within this cluster
+    int pos_in_cluster = atomicAdd(&cluster_sizes[cluster_id], 1);
+
+    int flat_index = (cluster_offsets[cluster_id] + pos_in_cluster) * dim;
+
+    // Copy the vector
+    for (int d = 0; d < dim; ++d) {
+        all_vectors_flat[flat_index + d] = data[idx * dim + d];
+    }
+}
+
+void kmeans_gpu(const float *h_data, float *h_centroids, int *h_assignments,
+                int N, int D, int K, int max_iter)
+{
+    // // Allocate device memory
+    // float *d_data, *d_centroids, *d_new_centroids;
+    // int *d_assignments, *d_counts;
+
+    // size_t data_size = N * D * sizeof(float);
+    // size_t centroids_size = K * D * sizeof(float);
+    // size_t assignments_size = N * sizeof(int);
+    // size_t counts_size = K * sizeof(int);
+
+    // cudaMalloc(&d_data, data_size);
+    // cudaMalloc(&d_centroids, centroids_size);
+    // cudaMalloc(&d_new_centroids, centroids_size);
+    // cudaMalloc(&d_assignments, assignments_size);
+    // cudaMalloc(&d_counts, counts_size);
 
+    // cudaMemcpy(d_data, h_data, data_size, cudaMemcpyHostToDevice);
+
+    // // Initialize centroids by copying first K vectors from data
+    // cudaMemcpy(d_centroids, d_data, centroids_size, cudaMemcpyDeviceToDevice);
+
+    // int threads = 256;
+    // int blocks = (N + threads - 1) / threads;
+
+    // for (int iter = 0; iter < max_iter; ++iter)
+    // {
+    //     // Step 1: Assign
+    //     assign_clusters_kernel<<<blocks, threads>>>(d_data, d_centroids, d_assignments, N, D, K);
+    //     cudaDeviceSynchronize();
+
+    //     // Step 2: Reset new centroids + counts
+    //     cudaMemset(d_new_centroids, 0, centroids_size);
+    //     cudaMemset(d_counts, 0, counts_size);
+
+    //     // Step 3: Update
+    //     update_centroids_kernel<<<blocks, threads>>>(d_data, d_assignments, d_new_centroids, d_counts, N, D, K);
+    //     cudaDeviceSynchronize();
+
+    //     // Step 4: Average
+    //     std::vector<float> h_new_centroids(K * D);
+    //     std::vector<int> h_counts(K);
+    //     cudaMemcpy(h_new_centroids.data(), d_new_centroids, centroids_size, cudaMemcpyDeviceToHost);
+    //     cudaMemcpy(h_counts.data(), d_counts, counts_size, cudaMemcpyDeviceToHost);
+
+    //     for (int k = 0; k < K; ++k)
+    //     {
+    //         int count = max(h_counts[k], 1); // Avoid division by zero
+    //         for (int d = 0; d < D; ++d)
+    //         {
+    //             h_centroids[k * D + d] = h_new_centroids[k * D + d] / count;
+    //         }
+    //     }
+
+    //     // Copy updated centroids back to device
+    //     cudaMemcpy(d_centroids, h_centroids, centroids_size, cudaMemcpyHostToDevice);
+    // }
+
+    // // Copy final assignments if needed
+    // cudaMemcpy(h_assignments, d_assignments, assignments_size, cudaMemcpyDeviceToHost);
+
+    // cudaFree(d_data);
+    // cudaFree(d_centroids);
+    // cudaFree(d_new_centroids);
+    // cudaFree(d_assignments);
+    // cudaFree(d_counts);
+}
diff --git a/src/cuda/loader_usage_example.cpp b/src/cuda/loader_usage_example.cpp
new file mode 100644
index 0000000..51fa650
--- /dev/null
+++ b/src/cuda/loader_usage_example.cpp
@@ -0,0 +1,14 @@
+#include "load_bin_to_gpu.h"
+
+int main() {
+    int N, D;
+    float* d_centroids = load_bin_to_gpu("data/processed/ivf_centroids.bin", 128, N, D);
+    std::cout << "Loaded " << N << " centroids of dimension " << D << std::endl;
+
+    // Now you can use `d_centroids` in your kernels
+    // Remember to free it later
+    cudaFree(d_centroids);
+
+    return 0;
+}
+
diff --git a/src/cuda/select.cu b/src/cuda/select.cu
new file mode 100644
index 0000000..4d7494a
--- /dev/null
+++ b/src/cuda/select.cu
@@ -0,0 +1,121 @@
+#include "select.cuh"
+#include "heap.cuh"
+
+
+/**
+ * CUDA fused kernel that implements distance calculation and fast k-selection on the GPU. 
+ * It uses the GPU heap implementation to store the nearest neighbor candidates in register arrays.
+ * Each warp is dedicated to k-selection over a set of candidates
+ */
+template <uint32_t WARP_QUEUE_LENGTH, uint32_t THREAD_QUEUE_LENGTH>
+__global__ void select_min_kernel(float *product_distances,
+        float *centroid_norms, int64_t *output_indexes,
+        int nqueries, int ncentroids, uint64_t centroid_start_offset, uint32_t k) {
+    uint32_t query_id = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
+    if (query_id > nqueries) return;
+
+    GPUHeap<WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH> heap;
+
+    for (int i=threadIdx.x/WARP_SIZE; i<ncentroids; i+=WARP_SIZE) {
+        heap.add(centroid_start_offset + i, product_distances[query_id * ncentroids + i] + centroid_norms[i]);
+    }
+    heap.merge_warp_queues();
+    heap.dump(output_indexes, k);
+}
+
+__host__ void select_min(thrust::device_vector<float> &product_distances, thrust::device_vector<float> &centroids,
+        thrust::device_vector<float> &centroid_norms, int ndim, int nqueries, thrust::device_vector<int64_t> &output_indexes,
+        uint32_t k) {
+    uint32_t centroid_tile_size = 128;
+    uint32_t query_tile_size = 128;
+    uint32_t ncentroids = centroids.size()/ndim;
+
+    #define CALL_KERNEL(WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH) \
+    select_min_kernel<WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH><<<grid, blocks>>>(product_distances_slice, \
+        centroid_norms_slice, \
+        output_indexes_slice, \
+        nqueries, ncentroids, j, k);
+
+    for (unsigned int i=0; i<nqueries; i+=query_tile_size) {
+        int64_t *output_indexes_slice = thrust::raw_pointer_cast(&output_indexes[i * k]);
+        float *product_distances_slice = thrust::raw_pointer_cast(&product_distances[i * ncentroids]);
+        for (unsigned int j=0; j<ncentroids; j+=centroid_tile_size) {
+            float *centroid_norms_slice = thrust::raw_pointer_cast(&centroid_norms[j *ndim]);
+            // TODO(): Look into this!!!
+            dim3 grid = (1);
+            dim3 blocks = (32);
+            if (k <= 32) {
+                CALL_KERNEL(32, 2)
+            } else if (k <= 64) {
+                CALL_KERNEL(64, 3)
+            } else if (k <= 128) {
+                CALL_KERNEL(128, 3)
+            } else if (k <= 256) {
+                CALL_KERNEL(256, 4)
+            } else if (k <= 512) {
+                CALL_KERNEL(512, 4)
+            } else {
+                CALL_KERNEL(1024, 4)
+            }
+        }
+    }
+
+    #undef CALL_KERNEL
+}
+
+
+
+template <uint32_t WARP_QUEUE_LENGTH, uint32_t THREAD_QUEUE_LENGTH>
+__global__ void select_fine_grained_kernel(float* input_distances, 
+        int64_t* input_indices, int64_t* output_indices, int64_t *offsets,
+        float *output_distances, uint32_t k, uint32_t ndim, uint32_t nqueries) {
+    uint32_t query_id = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
+    if (query_id > nqueries) return;
+
+    GPUHeap<WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH> heap;
+    int64_t start_offset = offsets[query_id];
+    int64_t end_offset = offsets[query_id];
+    int64_t num_data_points = end_offset - start_offset;
+
+    for (int64_t i=threadIdx.x/WARP_SIZE; i<num_data_points; i+=WARP_SIZE) {
+        heap.add(input_indices[start_offset + i], input_distances[start_offset + i]);
+    }
+    heap.merge_warp_queues();
+    heap.dump(output_indices, k);
+}
+
+__host__ void select_fine_grained(thrust::device_vector<float> &input_distances,
+        thrust::device_vector<int64_t> &data_indexes, thrust::device_vector<int64_t> &offsets,
+        thrust::device_vector<float> &output_distances, thrust::device_vector<int64_t> &output_indexes, 
+        uint32_t ndim, uint32_t nqueries, uint32_t k) {
+    #define CALL_KERNEL(WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH) \
+    select_fine_grained_kernel<WARP_QUEUE_LENGTH, THREAD_QUEUE_LENGTH><<<grid, blocks>>>(input_distances_ptr, \
+        data_indexes_ptr, output_indexes_ptr, offsets_ptr, \
+        output_distances_ptr, k, ndim, nqueries);
+
+    // TODO(): Tiling on queries
+    int64_t *output_indexes_ptr = thrust::raw_pointer_cast(output_indexes.data());
+    int64_t *data_indexes_ptr = thrust::raw_pointer_cast(data_indexes.data());
+    float *input_distances_ptr = thrust::raw_pointer_cast(input_distances.data());
+    float *output_distances_ptr = thrust::raw_pointer_cast(output_distances.data());
+    int64_t *offsets_ptr = thrust::raw_pointer_cast(offsets.data());
+    switch(k) {
+        // TODO(): Look into this!!!
+        dim3 grid = (1);
+        dim3 blocks = (32);
+        if (k <= 32) {
+            CALL_KERNEL(32, 2)
+        } else if (k <= 64) {
+            CALL_KERNEL(64, 3)
+        } else if (k <= 128) {
+            CALL_KERNEL(128, 3)
+        } else if (k <= 256) {
+            CALL_KERNEL(256, 4)
+        } else if (k <= 512) {
+            CALL_KERNEL(512, 4)
+        } else {
+            CALL_KERNEL(1024, 4)
+        }
+    }
+    #undef CALL_KERNEL
+}
\ No newline at end of file
diff --git a/src/cuda/select.cuh b/src/cuda/select.cuh
new file mode 100644
index 0000000..ed83598
--- /dev/null
+++ b/src/cuda/select.cuh
@@ -0,0 +1,14 @@
+#include<cuda.h>
+#include<thrust/device_vector.h>
+
+__host__ void select_min(thrust::device_vector<float> &product_distances, 
+    thrust::device_vector<float> &centroids,
+    thrust::device_vector<float> &centroid_norms, int ndim, int nqueries, 
+    thrust::device_vector<int64_t> &output_indexes,
+    uint32_t k);
+
+
+__host__ void select_fine_grained(thrust::device_vector<float> &input_distances,
+    thrust::device_vector<int64_t> &data_indexes, thrust::device_vector<int64_t> &offsets,
+    thrust::device_vector<float> &output_distances, thrust::device_vector<int64_t> &output_indexes, 
+    uint32_t ndim, uint32_t nqueries, uint32_t k);
\ No newline at end of file
diff --git a/src/cuda/sort.cuh b/src/cuda/sort.cuh
new file mode 100644
index 0000000..b31617d
--- /dev/null
+++ b/src/cuda/sort.cuh
@@ -0,0 +1,238 @@
+#pragma once
+
+#include <cuda.h>
+#include <algorithm>
+
+constexpr __host__ __device__ bool is_power_of_2(uint32_t v) {
+    return (v && !(v & (v - 1)));
+}
+
+
+constexpr __host__ __device__ int log2(int n, int p = 0) {
+    return (n <= 1) ? p : log2(n / 2, p + 1);
+}
+
+constexpr __host__ __device__ bool is_power_of_2(int v) {
+    return (v && !(v & (v - 1)));
+}
+
+// Calculate largest power of 2 less than n
+constexpr __host__ __device__ int32_t largest_power_of_2_lt_n(int n) {
+    return (is_power_of_2(n))? n/2 : (1 << (uint32_t)(log2(n)));
+}
+
+template <typename T>
+__device__ void device_swap(T &v1, T &v2) {
+    T tmp = v1;
+    v1 = v2;
+    v2 = tmp;
+}
+
+/**
+ * This corresponds to the MERGE-ODD-CONTINUE from the FAISS paper.
+ * Template params:
+ * - L -> length of the array
+ * - P -> Left (true) /right (false)
+ * 
+ * TODO(): Optimizations for small-sized arrays
+ */
+template <uint32_t L, bool P>
+void __device__ merge_odd_continue(uint64_t *keys, float *values) {
+    constexpr uint32_t H = largest_power_of_2_lt_n(L);
+    if (L == 1) return;
+
+    if constexpr (L > 1) {
+        #pragma unroll
+        for (int i=0; i<L-H; i++) {
+            uint64_t &k1 = keys[i];
+            float &v1 = values[i];
+
+            uint64_t &k2 = keys[i+H];
+            float &v2 = values[i+H];
+
+            if (v1 > v2) {
+                device_swap<float>(v1, v2);
+                device_swap<uint64_t>(k1, k2);
+            }
+        }
+
+        if (P) {
+            merge_odd_continue<L-H, true> (keys, values);
+            
+            float new_values[H];
+            uint64_t new_keys[H];
+
+            #pragma unroll
+            for (int i=0; i<H; i++) {
+                new_keys[i] = keys[L-H+i];
+                new_values[i] = values[L-H+i];
+            }
+
+            merge_odd_continue<H, false> (new_keys, new_values);
+
+            #pragma unroll
+            for (int i=0; i<H; i++) {
+                keys[L-H+i] = new_keys[i];
+                values[L-H+i] = new_values[i];
+            }
+        } else {
+            merge_odd_continue<H, true> (keys, values);
+
+            float new_values[L-H];
+            uint64_t new_keys[L-H];
+
+            #pragma unroll
+            for (int i=0; i<L-H; i++) {
+                new_keys[i] = keys[H+i];
+                new_values[i] = values[H+i];
+            }
+
+            merge_odd_continue<H, false> (new_keys, new_values);
+
+            #pragma unroll
+            for (int i=0; i<L-H; i++) {
+                keys[H+i] = new_keys[i];
+                values[H+i] = new_values[i];
+            }
+        }
+    }
+}
+
+/**
+ * Merge two arrays of size L and R. Both arrays are lane strided.
+ * This implements the MERGE-ODD algorithm in the FAISS paper
+ */
+template<uint32_t L, uint32_t R>
+void __device__ merge_odd(float values_left[L], float values_right[L], uint64_t keys_left[R], uint64_t keys_right[R]) {
+    #pragma unroll
+    for (int i=0; i<min(L, R); i++) {
+        // Compare and swap
+        uint64_t &k1 = keys_left[L-i-1];
+        float &v1 = values_left[L-i-1];
+
+        uint64_t &k2 = keys_right[i];
+        float &v2 = values_right[i];
+
+        int64_t other_k2 = __shfl_xor_sync(0xFFFFFFFF, k2, 31);
+        float other_v2 = __shfl_xor_sync(0xFFFFFFFF, v2, 31);
+
+        int64_t other_k1 = __shfl_xor_sync(0xFFFFFFFF, k1, 31);
+        float other_v1 = __shfl_xor_sync(0xFFFFFFFF, v1, 31);
+
+        if (v1 > other_v2) {
+            k1 = other_k2;
+            v1 = other_v2;
+        }
+
+        if (other_v1 > v2) {
+            k2 = other_k1;
+            v2 = other_v1;
+        }
+    }
+    __syncwarp();
+
+    merge_odd_continue<L, true>(keys_left, values_left);
+    merge_odd_continue<R, false>(keys_right, values_right);
+}
+
+__device__ __forceinline__ int get_lane_id() {
+    int lane_id;
+    asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
+    return lane_id;
+}
+
+
+/**
+ * Merge WARP_SIZE/L sorted arrays of length L to form WARP_SIZE/2L sorted arrays of length 2L
+ */
+template <uint32_t L>
+__device__ void sort_small_array(float &value, uint64_t &key) {
+    static_assert(is_power_of_2(L), "L must be power of 2");
+    
+    // This is the MERGE-ODD part for small arrays
+    int lane_id = get_lane_id();
+    float other_value = __shfl_xor_sync(0xFFFFFFFF, value, L);
+    uint64_t other_key = __shfl_xor_sync(0xFFFFFFFF, key, L);
+    bool smaller_lane = lane_id < (lane_id ^ L);
+    bool should_swap = smaller_lane ^ (value > other_value);
+    if (should_swap) {
+        device_swap<>(value, other_value);
+        device_swap<>(key, other_key);
+    }
+    __syncwarp();
+
+    // This corresponds to the MERGE-ODD-CONTINUE part
+    int lane_group_start = L & lane_id;
+    int lane_group_end = lane_group_start + L;
+    for (int stride=L/2; stride>0; stride/=2) {
+        smaller_lane = lane_id < (lane_id ^ stride);
+        other_value = __shfl_xor_sync(0xFFFFFFFF, value, stride);
+        other_key = __shfl_xor_sync(0xFFFFFFFF, key, stride);
+
+        bool should_swap = smaller_lane ^ (value > other_value);
+        if (should_swap) {
+            device_swap<>(value, other_value);
+            device_swap<>(key, other_key);
+        }
+        __syncwarp();
+    }
+}
+
+
+// Recursively sort a lane-strided array
+template <uint32_t N>
+__device__ void sort_odd(float values[N], uint64_t keys[N]) {
+    static_assert(N >= 2, "Use template specialization for N=1");
+    constexpr uint32_t size_a = N/2;
+    constexpr uint32_t size_b = N - size_a;
+
+    float values_a[size_a];
+    uint64_t keys_a[size_a];
+
+    // Sort the left half
+    #pragma unroll
+    for (int i=0; i<N/2; i++) {
+        values_a[i] = values[i];
+        keys_a[i] = keys[i];
+    }
+    if constexpr (size_a > 0) {
+        sort_odd<size_a>(values_a, keys_a);
+    }
+
+    // Sort the right half
+    float values_b[size_b];
+    uint64_t keys_b[size_b];
+
+    #pragma unroll
+    for (int i=N/2; i<N; i++) {
+        values_b[i-N/2] = values[i];
+        keys_b[i-N/2] = keys[i];
+    }
+    if constexpr (size_b > 1) {
+        sort_odd<size_b>(values_b, keys_b);
+    }
+
+    // Merge the two halves
+    merge_odd<size_a, size_b>(values_a, values_b, keys_a, keys_b);
+
+    #pragma unroll
+    for (int i=0; i<size_a; i++) {
+        keys[i] = keys_a[i];
+        values[i] = values_a[i];
+    }
+
+    #pragma unroll
+    for (int i=0; i<size_b; i++) {
+        keys[i+size_a] = keys_b[i];
+        values[i+size_a] = values_b[i];
+    }
+}
+
+template <>
+__device__ void sort_odd<1>(float values[1], uint64_t keys[1]) {
+    sort_small_array<1>(values[0], keys[0]);
+    sort_small_array<2>(values[0], keys[0]);
+    sort_small_array<4>(values[0], keys[0]);
+    sort_small_array<8>(values[0], keys[0]);
+    sort_small_array<16>(values[0], keys[0]);
+}
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..4cdae2b
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,12 @@
+enable_testing()
+
+find_package(CUDA REQUIRED)
+
+add_executable(tests test_heap.cu)
+target_link_libraries(tests cuda_vector_db)
+
+target_compile_options(tests PRIVATE 
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -O3>   # Pass -O3 optimization to ptxas
+)
+
+target_compile_features(cuda_vector_db PUBLIC cxx_std_17)
\ No newline at end of file
diff --git a/tests/test_heap.cu b/tests/test_heap.cu
new file mode 100644
index 0000000..07b33aa
--- /dev/null
+++ b/tests/test_heap.cu
@@ -0,0 +1,94 @@
+#include <cuda_runtime.h>
+#include <iostream>
+#include <queue>
+#include <vector>
+#include <random>
+#include <cassert>
+
+#include "../src/cuda/heap.cuh"
+
+constexpr int WARP_QUEUE_SIZE = 32;
+constexpr int THREAD_QUEUE_SIZE = 8;
+
+__global__ void test_gpu_heap_add(const int* input, int size, int64_t* out_keys) {
+    GPUHeap<WARP_QUEUE_SIZE, THREAD_QUEUE_SIZE> heap;
+
+    // Add elements from the input array to the heap
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+        heap.add(i, static_cast<float>(input[i]));
+    }
+
+    // Dump the results to shared memory
+    __shared__ int64_t shared_out_keys[WARP_QUEUE_SIZE];
+    heap.dump(shared_out_keys, WARP_QUEUE_SIZE);
+
+    // Copy results to global memory
+    if (threadIdx.x == 0) {
+        for (int i = 0; i < WARP_QUEUE_SIZE; i++) {
+            out_keys[i] = shared_out_keys[i];
+        }
+    }
+}
+
+int main() {
+    // Generate random input array of size 1024
+    constexpr int INPUT_SIZE = 1024;
+    std::vector<int> input(INPUT_SIZE);
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis(-1000000, 1000000);
+
+    for (int& val : input) {
+        val = dis(gen);
+    }
+
+    int size = input.size();
+
+    // Allocate device memory for input and output
+    int* d_input;
+    int64_t* d_out_keys;
+    cudaMalloc(&d_input, size * sizeof(int));
+    cudaMalloc(&d_out_keys, WARP_QUEUE_SIZE * sizeof(int64_t));
+
+    // Copy input data to device
+    cudaMemcpy(d_input, input.data(), size * sizeof(int), cudaMemcpyHostToDevice);
+
+    // Launch the kernel
+    test_gpu_heap_add<<<1, 32>>>(d_input, size, d_out_keys);
+    cudaDeviceSynchronize();
+
+    // Check for CUDA errors
+    cudaError_t err = cudaGetLastError();
+    assert(err == cudaSuccess);
+
+    // Copy results back to host
+    std::vector<int64_t> gpu_heap(WARP_QUEUE_SIZE);
+    cudaMemcpy(gpu_heap.data(), d_out_keys, WARP_QUEUE_SIZE * sizeof(int64_t), cudaMemcpyDeviceToHost);
+
+    // CPU min-heap
+    std::priority_queue<std::pair<int, int>, std::vector<std::pair<int, int>>, std::greater<std::pair<int, int>>> cpu_heap;
+    int i=0;
+    for (int val : input) {
+        cpu_heap.push(std::make_pair<>(val, i));
+        if (cpu_heap.size() > WARP_QUEUE_SIZE) {
+            cpu_heap.pop();
+        }
+    }
+
+    // Extract CPU heap results
+    std::vector<int> cpu_heap_results;
+    while (!cpu_heap.empty()) {
+        cpu_heap_results.push_back(cpu_heap.top().second);
+        cpu_heap.pop();
+    }
+
+    // Compare results
+    assert(gpu_heap.size() == cpu_heap_results.size());
+    for (size_t i = 0; i < cpu_heap_results.size(); ++i) {
+        assert(gpu_heap[i] == cpu_heap_results[i]);
+    }
+
+    // Free device memory
+    cudaFree(d_input);
+    cudaFree(d_out_keys);
+}