Low-memory extraction utilities for Chroma/HNSW persisted indices, exposed as:
- Rust library APIs
- C-ABI FFI (cdylib)
- Pure Go bindings via purego (no CGO)
From a persisted HNSW directory (header.bin, data_level0.bin):
internal_id, label, deleted, vector
Optional metadata join from index_metadata.pickle:
user_id, seq_id
Extract summary also includes index_properties sourced from header.bin:
m, ef_construction, cur_element_count, max_elements, persisted_version, word_size_bytes
Output format:
parquet, arrow_ipc
From a columnar export (parquet or arrow_ipc) containing at least:
- vector (fixed-size list/list of float32/float64)
- optional label (otherwise falls back to internal_id, then row index)
- optional deleted (bool/int)
Build output:
- A new persisted HNSW file (.hnsw) produced by the Rust fast-hnsw format.
Note: this build output is not Chroma/hnswlib's native persistence layout (header.bin, data_level0.bin, ...).
cargo build
Or via Make:
make build
make build-release
Dynamic library output (platform-specific):
- macOS: target/debug/libhnsw_toolbox.dylib
- Linux: target/debug/libhnsw_toolbox.so
- Windows: target/debug/hnsw_toolbox.dll
Use extract_index(...) for record-by-record streaming callbacks, or
extract_index_to_columnar(...) for file output.
Use build_index_from_columnar(...) to build a new HNSW index file from
Parquet/Arrow exports.
hnsw_toolbox_version() -> *const c_char
hnsw_toolbox_extract_index(request_json: *const c_char) -> *mut c_char
hnsw_toolbox_build_index(request_json: *const c_char) -> *mut c_char
hnsw_toolbox_get_last_error() -> *const c_char
hnsw_toolbox_free_string(ptr: *mut c_char)
{
"index_dir": "/path/to/vector-segment-dir",
"output_path": "/tmp/rebuilt.parquet",
"output_format": "parquet",
"metadata_path": "/path/to/index_metadata.pickle",
"include_deleted": false,
"batch_size": 1024
}
output_format: "parquet" or "arrow_ipc" (defaults to "parquet").
{
"output_path": "/tmp/rebuilt.parquet",
"output_format": "parquet",
"summary": {
"scanned": 10000,
"emitted": 8000,
"deleted_skipped": 2000,
"dimension": 384,
"index_properties": {
"m": 16,
"ef_construction": 200,
"cur_element_count": 10000,
"max_elements": 12000,
"persisted_version": 1,
"word_size_bytes": 8
}
}
}
{
"input_path": "/tmp/extracted.parquet",
"output_path": "/tmp/rebuilt.hnsw",
"input_format": "parquet",
"metric": "euclidean",
"include_deleted": false,
"m": 16,
"m0": 32,
"ef_construction": 200,
"batch_size": 1024,
"capacity": 100000,
"seed": 42
}
input_format: "parquet" or "arrow_ipc" (defaults to "parquet").
metric: "euclidean", "squared_euclidean", "cosine", "dot_product", "manhattan" (defaults to "euclidean").
{
"input_path": "/tmp/extracted.parquet",
"output_path": "/tmp/rebuilt.hnsw",
"input_format": "parquet",
"metric": "euclidean",
"summary": {
"scanned": 10000,
"inserted": 8000,
"deleted_skipped": 2000,
"dimension": 384
}
}
import "github.com/amikos-tech/hnsw-toolbox"
err := hnswtoolbox.Init("/abs/path/to/libhnsw_toolbox.dylib")
if err != nil {
panic(err)
}
defer hnswtoolbox.Close()
resp, err := hnswtoolbox.ExtractIndex(hnswtoolbox.ExtractRequest{
IndexDir: "/path/to/vector-segment-dir",
OutputPath: "/tmp/extracted.parquet",
OutputFormat: hnswtoolbox.OutputFormatParquet,
MetadataPath: "/path/to/index_metadata.pickle",
IncludeDeleted: false,
BatchSize: 1024,
})
if err != nil {
panic(err)
}
_ = resp
Build a new index:
buildResp, err := hnswtoolbox.BuildIndex(hnswtoolbox.BuildRequest{
InputPath: "/tmp/extracted.parquet",
OutputPath: "/tmp/rebuilt.hnsw",
InputFormat: hnswtoolbox.InputFormatParquet,
Metric: hnswtoolbox.DistanceMetricEuclidean,
IncludeDeleted: false,
M: 16,
EfConstruction: 200,
BatchSize: 1024,
})
if err != nil {
panic(err)
}
_ = buildResp
cargo test
cargo clippy --all-targets -- -D warnings
go test ./...
Or via Make:
make test
make lint
make fmt