From c09b5f54e383d2bfe551ad28a29c10f30d8fcf9a Mon Sep 17 00:00:00 2001 From: John O'Hare Date: Tue, 27 Jan 2026 15:11:58 +0000 Subject: [PATCH] fix: register embedding functions in extension SQL and install v2.0.0 schema in Docker The Docker image builds ruvector.so with --features embeddings, compiling fastembed into the binary. However, the extension SQL files (ruvector--0.1.0.sql and ruvector--2.0.0.sql) contained only a comment stub for embedding functions instead of actual CREATE FUNCTION declarations. This meant CREATE EXTENSION ruvector never registered the embedding functions despite the compiled symbols being present in the .so. Additionally, the Dockerfile only copied ruvector--0.1.0.sql into the image while ruvector.control declares default_version = '2.0.0', causing CREATE EXTENSION ruvector to fail with "no installation script for version 2.0.0". Changes: - Replace embedding comment stubs in both ruvector--0.1.0.sql and ruvector--2.0.0.sql with actual CREATE FUNCTION declarations using the pgrx _wrapper symbol convention - Add ruvector_embed_vec() convenience function (text -> ruvector type) - Fix Dockerfile to copy both 0.1.0 and 2.0.0 SQL files into the image - Fix volatility markers in embeddings.sql (IMMUTABLE -> VOLATILE for functions that load models or mutate state) - Add embedding function smoke test to docker/init.sql Co-Authored-By: claude-flow --- crates/ruvector-postgres/Dockerfile | 17 ++-- crates/ruvector-postgres/docker/init.sql | 5 ++ crates/ruvector-postgres/sql/embeddings.sql | 28 ++++--- .../ruvector-postgres/sql/ruvector--0.1.0.sql | 77 ++++++++++++++++++- .../ruvector-postgres/sql/ruvector--2.0.0.sql | 77 ++++++++++++++++++- 5 files changed, 182 insertions(+), 22 deletions(-) diff --git a/crates/ruvector-postgres/Dockerfile b/crates/ruvector-postgres/Dockerfile index 3dd7a1dfb..413ed6549 100644 --- a/crates/ruvector-postgres/Dockerfile +++ b/crates/ruvector-postgres/Dockerfile @@ -69,17 +69,20 @@ RUN mkdir -p /opt/ruvector/models 
&& \ echo "Model cache size: $(du -sh /opt/ruvector/models)" && \ ls -la /opt/ruvector/models/ -# Copy the pre-built SQL schema file (with sparse functions removed) -# cargo pgrx schema doesn't work reliably in Docker, so we use the hand-crafted file +# Copy the pre-built SQL schema files (with sparse functions removed) +# cargo pgrx schema doesn't work reliably in Docker, so we use the hand-crafted files +# Both versions are needed: 0.1.0 for legacy and 2.0.0 to match ruvector.control default_version RUN cp /build/sql/ruvector--0.1.0.sql /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--0.1.0.sql && \ - echo "SQL schema copied with $(grep -c 'CREATE FUNCTION\|CREATE OR REPLACE FUNCTION' /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--0.1.0.sql) functions" + cp /build/sql/ruvector--2.0.0.sql /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--2.0.0.sql && \ + echo "SQL schema (0.1.0) copied with $(grep -c 'CREATE FUNCTION\|CREATE OR REPLACE FUNCTION' /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--0.1.0.sql) functions" && \ + echo "SQL schema (2.0.0) copied with $(grep -c 'CREATE FUNCTION\|CREATE OR REPLACE FUNCTION' /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--2.0.0.sql) functions" # Verify the extension files are complete RUN ls -la /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ && \ - echo "=== First 20 lines of SQL ===" && \ - head -20 /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--0.1.0.sql && \ - echo "=== CREATE FUNCTION count ===" && \ - grep -c "CREATE FUNCTION\|CREATE OR REPLACE FUNCTION" /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--0.1.0.sql + echo "=== ruvector--2.0.0.sql CREATE FUNCTION count ===" && \ + grep -c "CREATE FUNCTION\|CREATE OR REPLACE FUNCTION" 
/build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--2.0.0.sql && \ + echo "=== Verify embedding functions present in 2.0.0 ===" && \ + grep -c "ruvector_embed" /build/target/release/ruvector-pg17/usr/share/postgresql/17/extension/ruvector--2.0.0.sql # Runtime stage FROM postgres:17-bookworm diff --git a/crates/ruvector-postgres/docker/init.sql b/crates/ruvector-postgres/docker/init.sql index e549dbf34..e16c05583 100644 --- a/crates/ruvector-postgres/docker/init.sql +++ b/crates/ruvector-postgres/docker/init.sql @@ -50,5 +50,10 @@ BEGIN RAISE NOTICE 'Inner product: %', inner_product_arr(ARRAY[1.0, 2.0, 3.0]::real[], ARRAY[1.0, 2.0, 3.0]::real[]); RAISE NOTICE 'Cosine distance: %', cosine_distance_arr(ARRAY[1.0, 0.0, 0.0]::real[], ARRAY[0.0, 1.0, 0.0]::real[]); + -- Test embedding functions + RAISE NOTICE 'Testing embedding functions...'; + RAISE NOTICE 'Default model: %', ruvector_default_model(); + RAISE NOTICE 'MiniLM dims: %', ruvector_embedding_dims('all-MiniLM-L6-v2'); + RAISE NOTICE 'All basic tests passed!'; END $$; diff --git a/crates/ruvector-postgres/sql/embeddings.sql b/crates/ruvector-postgres/sql/embeddings.sql index 739514ede..fccd660d2 100644 --- a/crates/ruvector-postgres/sql/embeddings.sql +++ b/crates/ruvector-postgres/sql/embeddings.sql @@ -1,18 +1,21 @@ -- ============================================================================ -- Embedding Generation Functions -- ============================================================================ +-- These functions require the 'embeddings' feature flag at compile time. +-- The Docker image builds with --features embeddings, so they are available. +-- pgrx generates C symbols with _wrapper suffix. 
-- Generate embedding from text using default or specified model CREATE OR REPLACE FUNCTION ruvector_embed(text text, model_name text DEFAULT 'all-MiniLM-L6-v2') RETURNS real[] AS 'MODULE_PATHNAME', 'ruvector_embed_wrapper' -LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; -- Generate embeddings for multiple texts in batch CREATE OR REPLACE FUNCTION ruvector_embed_batch(texts text[], model_name text DEFAULT 'all-MiniLM-L6-v2') RETURNS real[][] AS 'MODULE_PATHNAME', 'ruvector_embed_batch_wrapper' -LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; -- List all available embedding models CREATE OR REPLACE FUNCTION ruvector_embedding_models() @@ -23,46 +26,53 @@ RETURNS TABLE ( is_loaded boolean ) AS 'MODULE_PATHNAME', 'ruvector_embedding_models_wrapper' -LANGUAGE C IMMUTABLE STRICT; +LANGUAGE C VOLATILE PARALLEL RESTRICTED; -- Load embedding model into memory CREATE OR REPLACE FUNCTION ruvector_load_model(model_name text) RETURNS boolean AS 'MODULE_PATHNAME', 'ruvector_load_model_wrapper' -LANGUAGE C STRICT; +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; -- Unload embedding model from memory CREATE OR REPLACE FUNCTION ruvector_unload_model(model_name text) RETURNS boolean AS 'MODULE_PATHNAME', 'ruvector_unload_model_wrapper' -LANGUAGE C STRICT; +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; -- Get information about a specific model CREATE OR REPLACE FUNCTION ruvector_model_info(model_name text) RETURNS jsonb AS 'MODULE_PATHNAME', 'ruvector_model_info_wrapper' -LANGUAGE C IMMUTABLE STRICT; +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; -- Set default embedding model CREATE OR REPLACE FUNCTION ruvector_set_default_model(model_name text) RETURNS boolean AS 'MODULE_PATHNAME', 'ruvector_set_default_model_wrapper' -LANGUAGE C STRICT; +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; -- Get current default embedding model CREATE OR REPLACE FUNCTION ruvector_default_model() RETURNS text AS 'MODULE_PATHNAME', 
'ruvector_default_model_wrapper' -LANGUAGE C IMMUTABLE STRICT; +LANGUAGE C VOLATILE PARALLEL RESTRICTED; -- Get embedding generation statistics CREATE OR REPLACE FUNCTION ruvector_embedding_stats() RETURNS jsonb AS 'MODULE_PATHNAME', 'ruvector_embedding_stats_wrapper' -LANGUAGE C IMMUTABLE STRICT; +LANGUAGE C VOLATILE PARALLEL RESTRICTED; -- Get dimensions for a specific model CREATE OR REPLACE FUNCTION ruvector_embedding_dims(model_name text) RETURNS integer AS 'MODULE_PATHNAME', 'ruvector_embedding_dims_wrapper' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- Convenience: text → ruvector type in one call +CREATE OR REPLACE FUNCTION ruvector_embed_vec(text_input text, model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS ruvector +AS $$ + SELECT replace(replace(ruvector_embed(text_input, model_name)::text, '{', '['), '}', ']')::ruvector; +$$ LANGUAGE SQL VOLATILE STRICT PARALLEL SAFE; diff --git a/crates/ruvector-postgres/sql/ruvector--0.1.0.sql b/crates/ruvector-postgres/sql/ruvector--0.1.0.sql index dca520c32..b41ffecc1 100644 --- a/crates/ruvector-postgres/sql/ruvector--0.1.0.sql +++ b/crates/ruvector-postgres/sql/ruvector--0.1.0.sql @@ -780,9 +780,80 @@ COMMENT ON FUNCTION graph_bipartite_score(real[], real[], real) IS 'Compute bipa -- ============================================================================ -- Embedding Generation Functions -- ============================================================================ --- Note: Embedding functions require the 'embeddings' feature flag to be enabled --- during compilation. These functions are not available in the default build. --- To enable, build with: cargo pgrx package --features embeddings +-- These functions require the 'embeddings' feature flag at compile time. +-- The Docker image builds with --features embeddings, so they are available. 
+ +-- Generate embedding from text using default or specified model +CREATE OR REPLACE FUNCTION ruvector_embed(text text, model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS real[] +AS 'MODULE_PATHNAME', 'ruvector_embed_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + +-- Generate embeddings for multiple texts in batch +CREATE OR REPLACE FUNCTION ruvector_embed_batch(texts text[], model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS real[][] +AS 'MODULE_PATHNAME', 'ruvector_embed_batch_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + +-- List all available embedding models (is_loaded is presumably per-backend state, hence PARALLEL RESTRICTED; confirm against the Rust implementation) +CREATE OR REPLACE FUNCTION ruvector_embedding_models() +RETURNS TABLE ( + model_name text, + dimensions integer, + description text, + is_loaded boolean +) +AS 'MODULE_PATHNAME', 'ruvector_embedding_models_wrapper' +LANGUAGE C VOLATILE PARALLEL RESTRICTED; + +-- Load embedding model into memory (changes loaded-model state, hence PARALLEL UNSAFE) +CREATE OR REPLACE FUNCTION ruvector_load_model(model_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'ruvector_load_model_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; + +-- Unload embedding model from memory (changes loaded-model state, hence PARALLEL UNSAFE) +CREATE OR REPLACE FUNCTION ruvector_unload_model(model_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'ruvector_unload_model_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; + +-- Get information about a specific model +CREATE OR REPLACE FUNCTION ruvector_model_info(model_name text) +RETURNS jsonb +AS 'MODULE_PATHNAME', 'ruvector_model_info_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + +-- Set default embedding model (changes the default-model setting, hence PARALLEL UNSAFE) +CREATE OR REPLACE FUNCTION ruvector_set_default_model(model_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'ruvector_set_default_model_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; + +-- Get current default embedding model (reads the setting changed by ruvector_set_default_model, hence PARALLEL RESTRICTED) +CREATE OR REPLACE FUNCTION ruvector_default_model() +RETURNS text +AS 'MODULE_PATHNAME', 'ruvector_default_model_wrapper' +LANGUAGE C VOLATILE PARALLEL RESTRICTED; + +-- Get embedding generation statistics (presumably per-backend counters, hence PARALLEL RESTRICTED; confirm against the Rust implementation) +CREATE OR REPLACE 
FUNCTION ruvector_embedding_stats() +RETURNS jsonb +AS 'MODULE_PATHNAME', 'ruvector_embedding_stats_wrapper' +LANGUAGE C VOLATILE PARALLEL RESTRICTED; + +-- Get dimensions for a specific model +CREATE OR REPLACE FUNCTION ruvector_embedding_dims(model_name text) +RETURNS integer +AS 'MODULE_PATHNAME', 'ruvector_embedding_dims_wrapper' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- Convenience: text → ruvector type in one call +CREATE OR REPLACE FUNCTION ruvector_embed_vec(text_input text, model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS ruvector +AS $$ + SELECT replace(replace(ruvector_embed(text_input, model_name)::text, '{', '['), '}', ']')::ruvector; +$$ LANGUAGE SQL VOLATILE STRICT PARALLEL SAFE; -- ============================================================================ -- HNSW Access Method diff --git a/crates/ruvector-postgres/sql/ruvector--2.0.0.sql b/crates/ruvector-postgres/sql/ruvector--2.0.0.sql index c62b692df..d83a967b0 100644 --- a/crates/ruvector-postgres/sql/ruvector--2.0.0.sql +++ b/crates/ruvector-postgres/sql/ruvector--2.0.0.sql @@ -781,9 +781,80 @@ COMMENT ON FUNCTION graph_bipartite_score(real[], real[], real) IS 'Compute bipa -- ============================================================================ -- Embedding Generation Functions -- ============================================================================ --- Note: Embedding functions require the 'embeddings' feature flag to be enabled --- during compilation. These functions are not available in the default build. --- To enable, build with: cargo pgrx package --features embeddings +-- These functions require the 'embeddings' feature flag at compile time. +-- The Docker image builds with --features embeddings, so they are available. 
+ +-- Generate embedding from text using default or specified model +CREATE OR REPLACE FUNCTION ruvector_embed(text text, model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS real[] +AS 'MODULE_PATHNAME', 'ruvector_embed_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + +-- Generate embeddings for multiple texts in batch +CREATE OR REPLACE FUNCTION ruvector_embed_batch(texts text[], model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS real[][] +AS 'MODULE_PATHNAME', 'ruvector_embed_batch_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + +-- List all available embedding models (is_loaded is presumably per-backend state, hence PARALLEL RESTRICTED; confirm against the Rust implementation) +CREATE OR REPLACE FUNCTION ruvector_embedding_models() +RETURNS TABLE ( + model_name text, + dimensions integer, + description text, + is_loaded boolean +) +AS 'MODULE_PATHNAME', 'ruvector_embedding_models_wrapper' +LANGUAGE C VOLATILE PARALLEL RESTRICTED; + +-- Load embedding model into memory (changes loaded-model state, hence PARALLEL UNSAFE) +CREATE OR REPLACE FUNCTION ruvector_load_model(model_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'ruvector_load_model_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; + +-- Unload embedding model from memory (changes loaded-model state, hence PARALLEL UNSAFE) +CREATE OR REPLACE FUNCTION ruvector_unload_model(model_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'ruvector_unload_model_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; + +-- Get information about a specific model +CREATE OR REPLACE FUNCTION ruvector_model_info(model_name text) +RETURNS jsonb +AS 'MODULE_PATHNAME', 'ruvector_model_info_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + +-- Set default embedding model (changes the default-model setting, hence PARALLEL UNSAFE) +CREATE OR REPLACE FUNCTION ruvector_set_default_model(model_name text) +RETURNS boolean +AS 'MODULE_PATHNAME', 'ruvector_set_default_model_wrapper' +LANGUAGE C VOLATILE STRICT PARALLEL UNSAFE; + +-- Get current default embedding model (reads the setting changed by ruvector_set_default_model, hence PARALLEL RESTRICTED) +CREATE OR REPLACE FUNCTION ruvector_default_model() +RETURNS text +AS 'MODULE_PATHNAME', 'ruvector_default_model_wrapper' +LANGUAGE C VOLATILE PARALLEL RESTRICTED; + +-- Get embedding generation statistics (presumably per-backend counters, hence PARALLEL RESTRICTED; confirm against the Rust implementation) +CREATE OR REPLACE 
FUNCTION ruvector_embedding_stats() +RETURNS jsonb +AS 'MODULE_PATHNAME', 'ruvector_embedding_stats_wrapper' +LANGUAGE C VOLATILE PARALLEL RESTRICTED; + +-- Get dimensions for a specific model +CREATE OR REPLACE FUNCTION ruvector_embedding_dims(model_name text) +RETURNS integer +AS 'MODULE_PATHNAME', 'ruvector_embedding_dims_wrapper' +LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- Convenience: text → ruvector type in one call +CREATE OR REPLACE FUNCTION ruvector_embed_vec(text_input text, model_name text DEFAULT 'all-MiniLM-L6-v2') +RETURNS ruvector +AS $$ + SELECT replace(replace(ruvector_embed(text_input, model_name)::text, '{', '['), '}', ']')::ruvector; +$$ LANGUAGE SQL VOLATILE STRICT PARALLEL SAFE; -- ============================================================================ -- HNSW Access Method