Skip to content

Commit 38226c1

Browse files
committed
fix(postgres): extract HNSW dimensions from column typmod instead of hardcoding 128
Previously, hnsw_build() hardcoded dimensions=128 regardless of the actual vector column definition. This caused incorrect index behavior for vectors with dimensions other than 128.

Now dimensions are extracted from the indexed column's type modifier (typmod). When users declare ruvector(384), PostgreSQL stores 384 in atttypmod, which we now read correctly.

Changes:
- hnsw_build(): Extract dimensions from typmod, error if not specified
- hnsw_buildempty(): Also extract dimensions for empty index creation
- Add doc comments explaining the dimension extraction behavior

If the column is declared without explicit dimensions (e.g., just 'ruvector' instead of 'ruvector(384)'), an error message guides users to specify dimensions.
1 parent 0b1a9ef commit 38226c1

1 file changed

Lines changed: 86 additions & 32 deletions

File tree

crates/ruvector-postgres/src/index/hnsw_am.rs

Lines changed: 86 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
//! This module implements HNSW as a proper PostgreSQL index access method,
44
//! storing the graph structure in PostgreSQL pages for persistence.
55
6+
use pgrx::pg_sys::{
7+
self, bytea, BlockNumber, Buffer, Cost, Datum, IndexAmRoutine, IndexBuildResult,
8+
IndexBulkDeleteCallback, IndexBulkDeleteResult, IndexInfo, IndexPath, IndexScanDesc,
9+
IndexUniqueCheck, IndexVacuumInfo, ItemPointer, ItemPointerData, NodeTag, Page, PageHeaderData,
10+
PlannerInfo, Relation, ScanDirection, ScanKey, Selectivity, Size, TIDBitmap,
11+
};
612
use pgrx::prelude::*;
7-
use pgrx::pg_sys::{self, Relation, IndexInfo, IndexBuildResult, IndexVacuumInfo,
8-
IndexBulkDeleteResult, IndexBulkDeleteCallback, PlannerInfo, IndexPath,
9-
Cost, Selectivity, IndexScanDesc, ScanDirection, TIDBitmap, ScanKey,
10-
IndexUniqueCheck, ItemPointer, Datum, Buffer, BlockNumber, Page,
11-
IndexAmRoutine, NodeTag, bytea, ItemPointerData, PageHeaderData, Size};
1213
use pgrx::Internal;
13-
use std::ptr;
1414
use std::mem::size_of;
15+
use std::ptr;
1516

16-
use crate::distance::{DistanceMetric, distance};
17+
use crate::distance::{distance, DistanceMetric};
1718
use crate::index::HnswConfig;
1819

1920
// ============================================================================
@@ -31,11 +32,11 @@ const HNSW_PAGE_DELETED: u8 = 2;
3132

3233
/// Maximum neighbors per node (aligned with default M)
3334
#[allow(dead_code)]
34-
const MAX_NEIGHBORS_L0: usize = 32; // 2*M for layer 0
35+
const MAX_NEIGHBORS_L0: usize = 32; // 2*M for layer 0
3536
#[allow(dead_code)]
36-
const MAX_NEIGHBORS: usize = 16; // M for other layers
37+
const MAX_NEIGHBORS: usize = 16; // M for other layers
3738
#[allow(dead_code)]
38-
const MAX_LAYERS: usize = 16; // Maximum graph layers
39+
const MAX_LAYERS: usize = 16; // Maximum graph layers
3940

4041
/// P_NEW equivalent for allocating new pages
4142
const P_NEW_BLOCK: BlockNumber = pg_sys::InvalidBlockNumber;
@@ -73,10 +74,10 @@ impl Default for HnswMetaPage {
7374
ef_construction: 64,
7475
entry_point: pg_sys::InvalidBlockNumber,
7576
max_layer: 0,
76-
metric: 0, // L2 by default
77+
metric: 0, // L2 by default
7778
_padding: 0,
7879
node_count: 0,
79-
next_block: 1, // First node page
80+
next_block: 1, // First node page
8081
}
8182
}
8283
}
@@ -89,7 +90,7 @@ struct HnswNodePageHeader {
8990
#[allow(dead_code)]
9091
max_layer: u8,
9192
_padding: [u8; 2],
92-
item_id: ItemPointerData, // TID of the heap tuple
93+
item_id: ItemPointerData, // TID of the heap tuple
9394
}
9495

9596
/// Neighbor entry in the graph
@@ -137,7 +138,8 @@ unsafe fn get_meta_page(index_rel: Relation) -> (Page, Buffer) {
137138
unsafe fn get_or_create_meta_page(index_rel: Relation, for_write: bool) -> (Page, Buffer) {
138139
// Check if the relation has any blocks
139140
// Use MAIN_FORKNUM (0) for the main relation fork
140-
let nblocks = pg_sys::RelationGetNumberOfBlocksInFork(index_rel, pg_sys::ForkNumber::MAIN_FORKNUM);
141+
let nblocks =
142+
pg_sys::RelationGetNumberOfBlocksInFork(index_rel, pg_sys::ForkNumber::MAIN_FORKNUM);
141143

142144
let buffer = if nblocks == 0 {
143145
// New index - allocate first page using P_NEW (InvalidBlockNumber)
@@ -166,7 +168,8 @@ unsafe fn read_metadata(page: Page) -> HnswMetaPage {
166168
/// Write metadata to page
167169
unsafe fn write_metadata(page: Page, meta: &HnswMetaPage) {
168170
let header = page as *mut PageHeaderData;
169-
let data_ptr = (header as *mut u8).add(std::mem::size_of::<PageHeaderData>()) as *mut HnswMetaPage;
171+
let data_ptr =
172+
(header as *mut u8).add(std::mem::size_of::<PageHeaderData>()) as *mut HnswMetaPage;
170173
ptr::write(data_ptr, *meta);
171174
}
172175

@@ -259,7 +262,11 @@ unsafe fn calculate_distance(
259262
// Access Method Callbacks
260263
// ============================================================================
261264

262-
/// Build callback - builds the index from scratch
265+
/// Build callback - builds the HNSW index from scratch
266+
///
267+
/// Extracts vector dimensions from the indexed column's type modifier.
268+
/// The column must be declared with explicit dimensions, e.g., `ruvector(384)`.
269+
/// Raises a PostgreSQL error (via `pgrx::error!`) if dimensions are not specified.
263270
#[pg_guard]
264271
unsafe extern "C" fn hnsw_build(
265272
_heap: Relation,
@@ -268,8 +275,32 @@ unsafe extern "C" fn hnsw_build(
268275
) -> *mut IndexBuildResult {
269276
pgrx::log!("HNSW: Starting index build");
270277

271-
// Parse index options
272-
let dimensions = 128; // TODO: Extract from index definition
278+
// Extract dimensions from the indexed column's type modifier
279+
// When user defines ruvector(384), typmod = 384
280+
let dimensions = {
281+
// RelationGetDescr(index) -> (*index).rd_att
282+
let index_desc = (*index).rd_att;
283+
if index_desc.is_null() || (*index_desc).natts < 1 {
284+
pgrx::error!("HNSW: Cannot build index - no indexed columns found");
285+
}
286+
287+
// TupleDescAttr(desc, 0) -> (*desc).attrs.as_ptr().add(0)
288+
let attr = (*index_desc).attrs.as_ptr().add(0);
289+
let typmod = (*attr).atttypmod;
290+
291+
if typmod > 0 {
292+
typmod as u32
293+
} else {
294+
// typmod = -1 means dimensions not specified in type declaration
295+
// This happens with: CREATE TABLE t (v ruvector) instead of ruvector(384)
296+
pgrx::error!(
297+
"HNSW: Vector column must have dimensions specified. \
298+
Use ruvector(dimensions) instead of ruvector, e.g., ruvector(384)"
299+
);
300+
}
301+
};
302+
303+
pgrx::log!("HNSW: Building index with {} dimensions", dimensions);
273304
let config = HnswConfig::default();
274305

275306
// Initialize metadata page
@@ -298,7 +329,10 @@ unsafe extern "C" fn hnsw_build(
298329
// This is a simplified version - full implementation would use IndexBuildHeapScan
299330
let tuple_count = 0.0;
300331

301-
pgrx::log!("HNSW: Index build complete, {} tuples indexed", tuple_count as u64);
332+
pgrx::log!(
333+
"HNSW: Index build complete, {} tuples indexed",
334+
tuple_count as u64
335+
);
302336

303337
// Return build result
304338
let mut result = PgBox::<IndexBuildResult>::alloc0();
@@ -308,15 +342,38 @@ unsafe extern "C" fn hnsw_build(
308342
}
309343

310344
/// Build empty index callback
345+
///
346+
/// Creates an empty HNSW index, recording the dimensions taken from the column's
/// type modifier (or 0 when the column declares no dimensions).
311347
#[pg_guard]
312348
unsafe extern "C" fn hnsw_buildempty(index: Relation) {
313349
pgrx::log!("HNSW: Building empty index");
314350

351+
// Extract dimensions from the indexed column's type modifier
352+
let dimensions = {
353+
// RelationGetDescr(index) -> (*index).rd_att
354+
let index_desc = (*index).rd_att;
355+
if !index_desc.is_null() && (*index_desc).natts >= 1 {
356+
// TupleDescAttr(desc, 0) -> (*desc).attrs.as_ptr().add(0)
357+
let attr = (*index_desc).attrs.as_ptr().add(0);
358+
let typmod = (*attr).atttypmod;
359+
if typmod > 0 {
360+
typmod as u32
361+
} else {
362+
0
363+
}
364+
} else {
365+
0
366+
}
367+
};
368+
315369
// Initialize metadata page only
316370
let (page, buffer) = get_or_create_meta_page(index, true);
317371
pg_sys::PageInit(page, pg_sys::BLCKSZ as Size, 0);
318372

319-
let meta = HnswMetaPage::default();
373+
let meta = HnswMetaPage {
374+
dimensions,
375+
..Default::default()
376+
};
320377
write_metadata(page, &meta);
321378

322379
pg_sys::MarkBufferDirty(buffer);
@@ -410,12 +467,12 @@ unsafe extern "C" fn hnsw_costestimate(
410467

411468
// Total cost is O(log n) for HNSW
412469
let log_tuples = tuples.max(1.0).ln();
413-
*index_total_cost = log_tuples * 10.0; // Scale factor for page accesses
470+
*index_total_cost = log_tuples * 10.0; // Scale factor for page accesses
414471

415472
// HNSW provides good selectivity for top-k queries
416-
*index_selectivity = 0.01; // Typically returns ~1% of tuples
417-
*index_correlation = 0.0; // No correlation with physical order
418-
*index_pages = (tuples / 100.0).max(1.0); // Rough estimate
473+
*index_selectivity = 0.01; // Typically returns ~1% of tuples
474+
*index_correlation = 0.0; // No correlation with physical order
475+
*index_pages = (tuples / 100.0).max(1.0); // Rough estimate
419476
}
420477

421478
/// Get tuple callback (for index scans)
@@ -480,10 +537,7 @@ unsafe extern "C" fn hnsw_canreturn(_index: Relation, attno: ::std::os::raw::c_i
480537

481538
/// Options callback - parse index options
482539
#[pg_guard]
483-
unsafe extern "C" fn hnsw_options(
484-
_reloptions: Datum,
485-
_validate: bool,
486-
) -> *mut bytea {
540+
unsafe extern "C" fn hnsw_options(_reloptions: Datum, _validate: bool) -> *mut bytea {
487541
pgrx::log!("HNSW: Parsing options");
488542

489543
// TODO: Parse m, ef_construction, metric from reloptions
@@ -501,14 +555,14 @@ static HNSW_AM_HANDLER: IndexAmRoutine = IndexAmRoutine {
501555
type_: NodeTag::T_IndexAmRoutine,
502556

503557
// Index structure capabilities
504-
amstrategies: 1, // One strategy: nearest neighbor
505-
amsupport: 1, // One support function: distance
558+
amstrategies: 1, // One strategy: nearest neighbor
559+
amsupport: 1, // One support function: distance
506560
amoptsprocnum: 0,
507561
amcanorder: false,
508-
amcanorderbyop: true, // Supports ORDER BY with distance operators
562+
amcanorderbyop: true, // Supports ORDER BY with distance operators
509563
amcanbackward: false,
510564
amcanunique: false,
511-
amcanmulticol: false, // Single column only (vector)
565+
amcanmulticol: false, // Single column only (vector)
512566
amoptionalkey: true,
513567
amsearcharray: false,
514568
amsearchnulls: false,

0 commit comments

Comments
 (0)