diff --git a/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc b/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc index 8709c8c4..326bc71f 100644 --- a/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc +++ b/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc @@ -141,6 +141,10 @@ For datasets that exceed available memory, enable on-disk storage to use memory- |`pqSubspaces` |`0` |Number of PQ subspaces (0 = auto: dimension/4). + +|`parallelOnDiskWrite` +|`false` +|Use parallel direct buffers and multiple worker threads for on-disk index writing. Speeds up persistence for large indices but uses more resources. Only applies when `onDisk=true`. |=== === Example @@ -157,6 +161,55 @@ VectorIndexConfiguration config = VectorIndexConfiguration.builder() .build(); ---- +== Eventual Indexing + +Enable eventual indexing to defer expensive HNSW graph mutations to a background thread, reducing mutation latency at the cost of eventual search consistency. + +[options="header",cols="1,1,3"] +|=== +|Parameter |Default |Description + +|`eventualIndexing` +|`false` +|Defer HNSW graph mutations (add, update, remove) to a background thread. The vector store is updated synchronously, but graph construction happens asynchronously. Search results may not immediately reflect the most recent mutations. +|=== + +When enabled: + +* The vector store is always updated synchronously (no data loss). +* HNSW graph mutations are queued and applied by a single background worker thread. +* The queue is automatically drained before `optimize()`, `persistToDisk()`, and `close()`. 
+ +=== Example + +[source, java] +---- +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); +---- + +== Parallel On-Disk Writes + +When on-disk storage is enabled, persistence can optionally use parallel direct buffers and multiple worker threads (one per available processor) to write the index concurrently. This can significantly speed up persistence for large indices. + +This is disabled by default, as sequential single-threaded writing is preferred in resource-constrained environments or for smaller indices. + +=== Example + +[source, java] +---- +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(Path.of("/data/vectors")) + .parallelOnDiskWrite(true) + .build(); +---- + == Background Persistence Enable automatic asynchronous persistence to avoid blocking operations during writes. 
diff --git a/gigamap/jvector/README.md b/gigamap/jvector/README.md index 43c4f099..d68ddc59 100644 --- a/gigamap/jvector/README.md +++ b/gigamap/jvector/README.md @@ -10,6 +10,8 @@ A Java library that integrates [JVector](https://github.com/datastax/jvector) (h - **PQ Compression**: Product Quantization for reduced memory footprint - **Background Persistence**: Automatic asynchronous persistence at configurable intervals - **Background Optimization**: Periodic graph cleanup for improved query performance +- **Eventual Indexing**: Deferred graph mutations via background thread for reduced write latency +- **Parallel On-Disk Writes**: Multi-threaded index persistence for large on-disk indices - **Lazy Entity Access**: Search results provide direct access to entities without additional lookups - **Stream API**: Java Stream support for search results - **GigaMap Integration**: Seamlessly integrates with GigaMap's index system @@ -163,6 +165,13 @@ List topDocs = result.stream() | `indexDirectory` | `null` | Directory for index files (required if `onDisk=true`) | | `enablePqCompression` | `false` | Enable Product Quantization compression | | `pqSubspaces` | `0` | Number of PQ subspaces (0 = auto: dimension/4) | +| `parallelOnDiskWrite` | `false` | Use parallel direct buffers and multiple worker threads for on-disk index writing. Speeds up persistence for large indices but uses more resources. Only applies when `onDisk=true` | + +### Eventual Indexing + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `eventualIndexing` | `false` | Defer HNSW graph mutations to a background thread. The vector store is updated synchronously, but graph construction happens asynchronously. 
Reduces mutation latency at the cost of immediate search consistency (search consistency becomes eventual) | ### Background Persistence @@ -223,6 +232,38 @@ VectorIndexConfiguration config = VectorIndexConfiguration.builder() .build(); ``` +### Eventual Indexing + +For high-throughput systems where mutation latency matters more than immediate search consistency: + +```java +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + // Eventual indexing (graph mutations deferred to background thread) + .eventualIndexing(true) + .build(); +``` + +When enabled, the vector store is always updated synchronously (no data loss), but expensive HNSW graph mutations are queued and applied by a background worker thread. Search results may not immediately reflect the most recent mutations. The queue is automatically drained before `optimize()`, `persistToDisk()`, and `close()`. + +### Parallel On-Disk Writes + +For large on-disk indices where persistence speed is critical: + +```java +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(Path.of("/data/vectors")) + // Parallel on-disk writing (multiple worker threads) + .parallelOnDiskWrite(true) + .build(); +``` + +When enabled, the on-disk graph writer uses parallel direct buffers and multiple worker threads (one per available processor) to write the index concurrently. This is disabled by default as sequential writing is preferred in resource-constrained environments or for smaller indices. 
+ ### Manual Optimization and Persistence ```java diff --git a/gigamap/jvector/pom.xml b/gigamap/jvector/pom.xml index 3da2236f..ab5cf765 100644 --- a/gigamap/jvector/pom.xml +++ b/gigamap/jvector/pom.xml @@ -19,7 +19,7 @@ https://projects.eclipse.org/projects/technology.store - 4.0.0-rc.7 + 4.0.0-rc.8 @@ -44,6 +44,12 @@ junit-jupiter-engine test + + org.awaitility + awaitility + 4.2.2 + test + diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java deleted file mode 100644 index cace6283..00000000 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java +++ /dev/null @@ -1,239 +0,0 @@ -package org.eclipse.store.gigamap.jvector; - -/*- - * #%L - * EclipseStore GigaMap JVector - * %% - * Copyright (C) 2023 - 2026 MicroStream Software - * %% - * This program and the accompanying materials are made - * available under the terms of the Eclipse Public License 2.0 - * which is available at https://www.eclipse.org/legal/epl-2.0/ - * - * SPDX-License-Identifier: EPL-2.0 - * #L% - */ - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Manages background optimization for a VectorIndex. - *

- * This manager handles: - *

- */ -interface BackgroundOptimizationManager -{ - /** - * Marks the index as dirty with the specified number of changes. - * - * @param count the number of changes - */ - public void markDirty(int count); - - /** - * Starts the scheduled background optimization task. - */ - public void startScheduledOptimization(); - - /** - * Shuts down the optimization manager. - * - * @param optimizePending if true and there are pending changes, optimize before shutdown - */ - public void shutdown(boolean optimizePending); - - /** - * Returns the number of times optimization has been performed. - * This is useful for testing and monitoring. - * - * @return the optimization count - */ - public long getOptimizationCount(); - - /** - * Returns the current pending change count. - * This is useful for testing and monitoring. - * - * @return the pending change count - */ - public int getPendingChangeCount(); - - - /** - * Callback interface for the optimization manager to perform optimization. - */ - public interface Callback - { - /** - * Performs optimization of the index. - */ - public void optimize(); - } - - - /** - * Default implementation of BackgroundOptimizationManager. 
- */ - public static class Default implements BackgroundOptimizationManager - { - private static final Logger LOG = LoggerFactory.getLogger(BackgroundOptimizationManager.class); - - private final Callback callback ; - private final String name ; - private final long intervalMs; - private final int minChanges; - private final ScheduledExecutorService scheduler ; - - private final AtomicInteger changeCount = new AtomicInteger(0); - private final AtomicLong optimizationCount = new AtomicLong(0); - - private ScheduledFuture scheduledTask; - private volatile boolean shutdown = false; - - Default( - final Callback callback , - final String name , - final long intervalMs, - final int minChanges - ) - { - this.callback = callback; - this.name = name; - this.intervalMs = intervalMs; - this.minChanges = minChanges; - - this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> - { - final Thread t = new Thread(r, "VectorIndex-BackgroundOptimization-" + name); - t.setDaemon(true); - return t; - }); - } - - @Override - public void markDirty(final int count) - { - this.changeCount.addAndGet(count); - } - - @Override - public long getOptimizationCount() - { - return this.optimizationCount.get(); - } - - @Override - public int getPendingChangeCount() - { - return this.changeCount.get(); - } - - @Override - public void startScheduledOptimization() - { - this.scheduledTask = this.scheduler.scheduleAtFixedRate( - this::optimizeNowIfDirty, - this.intervalMs, - this.intervalMs, - TimeUnit.MILLISECONDS - ); - } - - /** - * Optimizes the index if the change threshold has been met. - */ - private void optimizeNowIfDirty() - { - if(this.shutdown) - { - return; - } - - final int currentChanges = this.changeCount.get(); - if(currentChanges < this.minChanges) - { - return; - } - - this.optimizeNow(); - } - - /** - * Optimizes the index immediately. 
- */ - private void optimizeNow() - { - LOG.debug("Background optimizing index '{}' with {} changes", - this.name, this.changeCount.get()); - - try - { - this.callback.optimize(); - - // Reset change count and increment optimization counter after success - this.changeCount.set(0); - this.optimizationCount.incrementAndGet(); - - LOG.debug("Background optimization completed for '{}'", this.name); - } - catch(final Exception e) - { - LOG.error("Background optimization failed for '{}': {}", this.name, e.getMessage(), e); - } - } - - @Override - public void shutdown(final boolean optimizePending) - { - this.shutdown = true; - - // Cancel the scheduled task - if(this.scheduledTask != null) - { - this.scheduledTask.cancel(false); - this.scheduledTask = null; - } - - // Optimize pending changes if requested - final int pendingChanges = this.changeCount.get(); - if(optimizePending && pendingChanges > 0) - { - LOG.info("Optimizing pending changes for '{}' before shutdown ({} changes)", - this.name, pendingChanges); - this.optimizeNow(); - } - - // Shutdown the scheduler - this.scheduler.shutdown(); - try - { - if(!this.scheduler.awaitTermination(30, TimeUnit.SECONDS)) - { - LOG.warn("Background optimization scheduler did not terminate gracefully for '{}'", - this.name); - this.scheduler.shutdownNow(); - } - } - catch(final InterruptedException e) - { - Thread.currentThread().interrupt(); - this.scheduler.shutdownNow(); - } - - LOG.info("Background optimization manager shutdown for '{}'", this.name); - } - - } - -} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java deleted file mode 100644 index 514d8721..00000000 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java +++ /dev/null @@ -1,211 +0,0 @@ -package org.eclipse.store.gigamap.jvector; - -/*- - * #%L - * 
EclipseStore GigaMap JVector - * %% - * Copyright (C) 2023 - 2026 MicroStream Software - * %% - * This program and the accompanying materials are made - * available under the terms of the Eclipse Public License 2.0 - * which is available at https://www.eclipse.org/legal/epl-2.0/ - * - * SPDX-License-Identifier: EPL-2.0 - * #L% - */ - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - -/** - * Manages background persistence for a VectorIndex. - *

- * This manager handles: - *

- */ -interface BackgroundPersistenceManager -{ - /** - * Marks the index as dirty with the specified number of changes. - * - * @param count the number of changes - */ - public void markDirty(int count); - - /** - * Starts the scheduled background persistence task. - */ - public void startScheduledPersistence(); - - /** - * Shuts down the persistence manager. - * - * @param persistPending if true and there are pending changes, persist before shutdown - */ - public void shutdown(boolean persistPending); - - - /** - * Callback interface for the persistence manager to perform persistence. - */ - public interface Callback - { - /** - * Persists the index to disk. - */ - public void persistToDisk(); - } - - - /** - * Default implementation of BackgroundPersistenceManager. - */ - public static class Default implements BackgroundPersistenceManager - { - private static final Logger LOG = LoggerFactory.getLogger(BackgroundPersistenceManager.class); - - private final Callback callback ; - private final String name ; - private final long intervalMs; - private final int minChanges; - private final ScheduledExecutorService scheduler ; - - private final AtomicInteger changeCount = new AtomicInteger(0); - - private ScheduledFuture scheduledTask; - private volatile boolean shutdown = false; - - Default( - final Callback callback , - final String name , - final long intervalMs, - final int minChanges - ) - { - this.callback = callback ; - this.name = name ; - this.intervalMs = intervalMs; - this.minChanges = minChanges; - - this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> - { - final Thread t = new Thread(r, "VectorIndex-BackgroundPersistence-" + name); - t.setDaemon(true); - return t; - }); - } - - @Override - public void markDirty(final int count) - { - this.changeCount.addAndGet(count); - } - - @Override - public void startScheduledPersistence() - { - this.scheduledTask = this.scheduler.scheduleAtFixedRate( - this::persistNowIfDirty, - this.intervalMs, - 
this.intervalMs, - TimeUnit.MILLISECONDS - ); - } - - /** - * Persists the index if the change threshold has been met. - */ - private void persistNowIfDirty() - { - if(this.shutdown) - { - return; - } - - final int currentChanges = this.changeCount.get(); - if(currentChanges < this.minChanges) - { - return; - } - - this.persistNow(); - } - - /** - * Persists the index immediately, regardless of dirty state or threshold. - */ - private void persistNow() - { - LOG.debug("Background persisting index '{}' with {} changes", - this.name, this.changeCount.get()); - - try - { - this.callback.persistToDisk(); - - // Reset change count after successful persistence - this.changeCount.set(0); - - LOG.debug("Background persistence completed for '{}'", this.name); - } - catch(final Exception e) - { - LOG.error("Background persistence failed for '{}': {}", this.name, e.getMessage(), e); - } - } - - @Override - public void shutdown(final boolean persistPending) - { - this.shutdown = true; - - // Cancel the scheduled task - if(this.scheduledTask != null) - { - this.scheduledTask.cancel(false); - this.scheduledTask = null; - } - - // Persist pending changes if requested - final int pendingChanges = this.changeCount.get(); - if(persistPending && pendingChanges > 0) - { - LOG.info("Persisting pending changes for '{}' before shutdown ({} changes)", - this.name, pendingChanges); - this.persistNow(); - } - - // Shutdown the scheduler - this.scheduler.shutdown(); - try - { - if(!this.scheduler.awaitTermination(30, TimeUnit.SECONDS)) - { - LOG.warn("Background persistence scheduler did not terminate gracefully for '{}'", - this.name); - this.scheduler.shutdownNow(); - } - } - catch(final InterruptedException e) - { - Thread.currentThread().interrupt(); - this.scheduler.shutdownNow(); - } - - LOG.info("Background persistence manager shutdown for '{}'", this.name); - } - - } - -} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java 
b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java new file mode 100644 index 00000000..1f9adf7d --- /dev/null +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java @@ -0,0 +1,558 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Unified background task manager for VectorIndex. + *

+ * Consolidates background indexing, optimization, and persistence into a single + * {@link ScheduledExecutorService} with one daemon thread. All three workloads + * serialize on the same builder write-lock and never do useful work in parallel, + * so a single thread is sufficient. + *

+ * This manager handles: + *

+ */ +class BackgroundTaskManager +{ + private static final Logger LOG = LoggerFactory.getLogger(BackgroundTaskManager.class); + + // ======================================================================== + // Indexing Operations + // ======================================================================== + + /** + * Sealed interface for indexing operations that can be queued. + */ + sealed interface IndexingOperation + permits IndexingOperation.Add, + IndexingOperation.Update, + IndexingOperation.Remove + { + void execute(Callback callback); + + /** + * Add a node to the HNSW graph. + */ + record Add(int ordinal, float[] vector) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphAdd(this.ordinal, this.vector); + callback.markDirtyForBackgroundManagers(1); + } + } + + /** + * Update a node in the HNSW graph (delete + re-add). + */ + record Update(int ordinal, float[] vector) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphUpdate(this.ordinal, this.vector); + callback.markDirtyForBackgroundManagers(1); + } + } + + /** + * Remove a node from the HNSW graph. + */ + record Remove(int ordinal) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphRemove(this.ordinal); + callback.markDirtyForBackgroundManagers(1); + } + } + } + + // ======================================================================== + // Callback + // ======================================================================== + + /** + * Callback interface for applying graph operations and core optimization/persistence. + * Implemented by {@code VectorIndex.Default}. 
+ */ + interface Callback + { + void applyGraphAdd(int ordinal, float[] vector); + + void applyGraphUpdate(int ordinal, float[] vector); + + void applyGraphRemove(int ordinal); + + void markDirtyForBackgroundManagers(int count); + + /** + * Core optimization logic without queue drain. + * Called from the executor thread (inline drain already done). + */ + void doOptimize(); + + /** + * Core persistence logic without queue drain. + * Called from the executor thread (inline drain already done). + */ + void doPersistToDisk(); + } + + // ======================================================================== + // Instance fields + // ======================================================================== + + private final Callback callback ; + private final String name ; + private final ScheduledExecutorService executor ; + + // Indexing queue and dedup flag + private final ConcurrentLinkedQueue indexingQueue ; + private final AtomicBoolean indexingTaskScheduled ; + + // Optimization state + private final AtomicInteger optimizationChangeCount; + private final AtomicLong optimizationCount ; + private final int optimizationMinChanges ; + private ScheduledFuture optimizationTask ; + + // Persistence state + private final AtomicInteger persistenceChangeCount; + private final int persistenceMinChanges ; + private ScheduledFuture persistenceTask ; + + private volatile boolean shutdown = false; + + // ======================================================================== + // Constructor + // ======================================================================== + + BackgroundTaskManager( + final Callback callback, + final String name, + final boolean eventualIndexing, + final boolean backgroundOptimization, + final long optimizationIntervalMs, + final int optimizationMinChanges, + final boolean backgroundPersistence, + final long persistenceIntervalMs, + final int persistenceMinChanges + ) + { + this.callback = callback; + this.name = name ; + + this.executor = 
Executors.newSingleThreadScheduledExecutor(r -> + { + final Thread t = new Thread(r, "VectorIndex-Background-" + name); + t.setDaemon(true); + return t; + }); + + // Indexing + this.indexingQueue = new ConcurrentLinkedQueue<>(); + this.indexingTaskScheduled = new AtomicBoolean(false); + + // Optimization + this.optimizationChangeCount = new AtomicInteger(0); + this.optimizationCount = new AtomicLong(0); + this.optimizationMinChanges = optimizationMinChanges; + + // Persistence + this.persistenceChangeCount = new AtomicInteger(0); + this.persistenceMinChanges = persistenceMinChanges; + + // Start scheduled tasks + if(backgroundOptimization) + { + this.optimizationTask = this.executor.scheduleAtFixedRate( + this::runOptimizationIfDirty, + optimizationIntervalMs, + optimizationIntervalMs, + TimeUnit.MILLISECONDS + ); + LOG.info("Background optimization started for index '{}' with interval {}ms", + name, optimizationIntervalMs); + } + + if(backgroundPersistence) + { + this.persistenceTask = this.executor.scheduleAtFixedRate( + this::runPersistenceIfDirty, + persistenceIntervalMs, + persistenceIntervalMs, + TimeUnit.MILLISECONDS + ); + LOG.info("Background persistence started for index '{}' with interval {}ms", + name, persistenceIntervalMs); + } + + if(eventualIndexing) + { + LOG.info("Eventual indexing enabled for index '{}'", name); + } + } + + // ======================================================================== + // Indexing queue methods + // ======================================================================== + + /** + * Enqueues an indexing operation for background processing. + */ + void enqueue(final IndexingOperation op) + { + this.indexingQueue.add(op); + if(this.indexingTaskScheduled.compareAndSet(false, true)) + { + this.executor.submit(this::processIndexingBatch); + } + } + + /** + * Blocks until all currently enqueued indexing operations have been applied. + * Called from user threads (not the executor thread) before optimize/persistToDisk. 
+ */ + void drainQueue() + { + if(this.shutdown) + { + return; + } + + try + { + this.executor.submit(this::processAllPendingIndexingOps).get(); + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + LOG.warn("Interrupted while draining indexing queue for '{}'", this.name); + } + catch(final ExecutionException e) + { + LOG.error("Error while draining indexing queue for '{}': {}", this.name, e.getMessage(), e); + } + } + + /** + * Discards all pending indexing operations without applying them. + * Used during {@code internalRemoveAll()} where pending operations + * refer to stale ordinals that are no longer valid. + */ + void discardQueue() + { + final int discarded = this.indexingQueue.size(); + this.indexingQueue.clear(); + this.indexingTaskScheduled.set(false); + if(discarded > 0) + { + LOG.info("Discarded {} pending indexing operations for '{}'", discarded, this.name); + } + } + + /** + * Returns the number of pending indexing operations in the queue. + */ + int getPendingIndexingCount() + { + return this.indexingQueue.size(); + } + + // ======================================================================== + // Optimization monitoring + // ======================================================================== + + /** + * Marks dirty for optimization and persistence tracking. + */ + void markDirty(final int count) + { + this.optimizationChangeCount.addAndGet(count); + this.persistenceChangeCount.addAndGet(count); + } + + /** + * Returns the number of times optimization has been performed. + */ + long getOptimizationCount() + { + return this.optimizationCount.get(); + } + + /** + * Returns the current pending change count for optimization. 
+ */ + int getOptimizationPendingChangeCount() + { + return this.optimizationChangeCount.get(); + } + + // ======================================================================== + // Shutdown + // ======================================================================== + + /** + * Shuts down the background task manager. + * + * @param drainPending if true, drain all pending indexing operations + * @param optimizePending if true and there are pending changes, optimize before shutdown + * @param persistPending if true and there are pending changes, persist before shutdown + */ + void shutdown(final boolean drainPending, final boolean optimizePending, final boolean persistPending) + { + this.shutdown = true; + + // Cancel scheduled tasks + if(this.optimizationTask != null) + { + this.optimizationTask.cancel(false); + this.optimizationTask = null; + } + if(this.persistenceTask != null) + { + this.persistenceTask.cancel(false); + this.persistenceTask = null; + } + + // Perform final work if requested + if(drainPending || optimizePending || persistPending) + { + try + { + this.executor.submit(() -> this.finalShutdownWork(drainPending, optimizePending, persistPending)) + .get(30, TimeUnit.SECONDS); + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + } + catch(final ExecutionException e) + { + LOG.error("Error during shutdown work for '{}': {}", this.name, e.getMessage(), e); + } + catch(final TimeoutException e) + { + LOG.warn("Shutdown work timed out for '{}'", this.name); + } + } + + // Shutdown the executor + this.executor.shutdown(); + try + { + if(!this.executor.awaitTermination(30, TimeUnit.SECONDS)) + { + LOG.warn("Background task executor did not terminate gracefully for '{}'", this.name); + this.executor.shutdownNow(); + } + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + this.executor.shutdownNow(); + } + + LOG.info("Background task manager shutdown for '{}'", this.name); + } + + // 
======================================================================== + // Internal methods — all run on the executor thread + // ======================================================================== + + /** + * Processes all pending indexing ops in a batch. + * Called via {@code executor.submit()} when ops are enqueued. + */ + private void processIndexingBatch() + { + try + { + this.processAllPendingIndexingOps(); + } + finally + { + this.indexingTaskScheduled.set(false); + // Re-check: if new ops were added after we polled the last one + // but before we reset the flag, schedule another batch. + if(!this.indexingQueue.isEmpty()) + { + if(this.indexingTaskScheduled.compareAndSet(false, true)) + { + this.executor.submit(this::processIndexingBatch); + } + } + } + } + + /** + * Polls and executes all currently queued indexing operations. + * Safe to call from the executor thread (inline) or via {@code Future.get()} from user threads. + */ + private void processAllPendingIndexingOps() + { + IndexingOperation op; + while((op = this.indexingQueue.poll()) != null) + { + try + { + op.execute(this.callback); + } + catch(final Exception e) + { + LOG.error("Error applying indexing operation for '{}': {}", this.name, e.getMessage(), e); + } + } + } + + /** + * Runs optimization if the change threshold has been met. + * Called by the scheduled optimization task on the executor thread. 
+ */ + private void runOptimizationIfDirty() + { + if(this.shutdown) + { + return; + } + + if(this.optimizationChangeCount.get() < this.optimizationMinChanges) + { + return; + } + + LOG.debug("Background optimizing index '{}' with {} changes", + this.name, this.optimizationChangeCount.get()); + + try + { + // Drain pending indexing ops inline (same thread, no deadlock) + this.processAllPendingIndexingOps(); + + this.callback.doOptimize(); + + this.optimizationChangeCount.set(0); + this.optimizationCount.incrementAndGet(); + + LOG.debug("Background optimization completed for '{}'", this.name); + } + catch(final Exception e) + { + LOG.error("Background optimization failed for '{}': {}", this.name, e.getMessage(), e); + } + } + + /** + * Runs persistence if the change threshold has been met. + * Called by the scheduled persistence task on the executor thread. + */ + private void runPersistenceIfDirty() + { + if(this.shutdown) + { + return; + } + + if(this.persistenceChangeCount.get() < this.persistenceMinChanges) + { + return; + } + + LOG.debug("Background persisting index '{}' with {} changes", + this.name, this.persistenceChangeCount.get()); + + try + { + // Drain pending indexing ops inline (same thread, no deadlock) + this.processAllPendingIndexingOps(); + + this.callback.doPersistToDisk(); + + this.persistenceChangeCount.set(0); + + LOG.debug("Background persistence completed for '{}'", this.name); + } + catch(final Exception e) + { + LOG.error("Background persistence failed for '{}': {}", this.name, e.getMessage(), e); + } + } + + /** + * Performs final shutdown work on the executor thread. 
+ */ + private void finalShutdownWork( + final boolean drainPending, + final boolean optimizePending, + final boolean persistPending + ) + { + if(drainPending) + { + LOG.info("Draining {} pending indexing operations for '{}' before shutdown", + this.indexingQueue.size(), this.name); + this.processAllPendingIndexingOps(); + } + + if(optimizePending && this.optimizationChangeCount.get() > 0) + { + LOG.info("Optimizing pending changes for '{}' before shutdown ({} changes)", + this.name, this.optimizationChangeCount.get()); + try + { + this.callback.doOptimize(); + this.optimizationChangeCount.set(0); + this.optimizationCount.incrementAndGet(); + } + catch(final Exception e) + { + LOG.error("Shutdown optimization failed for '{}': {}", this.name, e.getMessage(), e); + } + } + + if(persistPending && this.persistenceChangeCount.get() > 0) + { + LOG.info("Persisting pending changes for '{}' before shutdown ({} changes)", + this.name, this.persistenceChangeCount.get()); + try + { + this.callback.doPersistToDisk(); + this.persistenceChangeCount.set(0); + } + catch(final Exception e) + { + LOG.error("Shutdown persistence failed for '{}': {}", this.name, e.getMessage(), e); + } + } + } + +} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java index 5a2a915c..2abe6f62 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java @@ -20,6 +20,7 @@ import io.github.jbellis.jvector.graph.RandomAccessVectorValues; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter; +import io.github.jbellis.jvector.graph.disk.OnDiskParallelGraphIndexWriter; import io.github.jbellis.jvector.graph.disk.feature.Feature; import 
io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.graph.disk.feature.FusedPQ; @@ -126,29 +127,32 @@ public static class Default implements DiskIndexManager { private static final Logger LOG = LoggerFactory.getLogger(DiskIndexManager.class); - private final IndexStateProvider provider ; - private final String name ; - private final Path indexDirectory; - private final int dimension ; - private final int maxDegree ; + private final IndexStateProvider provider ; + private final String name ; + private final Path indexDirectory ; + private final int dimension ; + private final int maxDegree ; + private final boolean parallelOnDiskWrite ; private OnDiskGraphIndex diskIndex ; private ReaderSupplier readerSupplier; private boolean loaded ; Default( - final IndexStateProvider provider , - final String name , - final Path indexDirectory, - final int dimension , - final int maxDegree + final IndexStateProvider provider , + final String name , + final Path indexDirectory , + final int dimension , + final int maxDegree , + final boolean parallelOnDiskWrite ) { - this.provider = provider ; - this.name = name ; - this.indexDirectory = indexDirectory; - this.dimension = dimension ; - this.maxDegree = maxDegree ; + this.provider = provider ; + this.name = name ; + this.indexDirectory = indexDirectory ; + this.dimension = dimension ; + this.maxDegree = maxDegree ; + this.parallelOnDiskWrite = parallelOnDiskWrite ; } @Override @@ -287,33 +291,46 @@ private void writeIndexWithFusedPQ( final InlineVectors inlineVectors = new InlineVectors(this.dimension); final FusedPQ fusedPQ = new FusedPQ(this.maxDegree, pq); - // Build writer with features using sequential renumbering (identity mapping) - try(final OnDiskGraphIndexWriter writer = new OnDiskGraphIndexWriter.Builder(index, graphPath) - .with(inlineVectors) - .with(fusedPQ) - .build()) - { - // Create feature suppliers that provide feature state for each node - final Map> suppliers = new 
EnumMap<>(FeatureId.class); - - suppliers.put(FeatureId.INLINE_VECTORS, nodeId -> - new InlineVectors.State(ravv.getVector(nodeId)) - ); + // Create feature suppliers that provide feature state for each node + final Map> suppliers = new EnumMap<>(FeatureId.class); - // Get a view for FusedPQ state creation - final var view = index.getView(); - suppliers.put(FeatureId.FUSED_PQ, nodeId -> - new FusedPQ.State(view, pqVectors, nodeId) - ); + suppliers.put(FeatureId.INLINE_VECTORS, nodeId -> + new InlineVectors.State(ravv.getVector(nodeId)) + ); - // Write with sequential renumbering (maintains ordinals) - writer.write(suppliers); + // Get a view for FusedPQ state creation + final var view = index.getView(); + suppliers.put(FeatureId.FUSED_PQ, nodeId -> + new FusedPQ.State(view, pqVectors, nodeId) + ); - // Close the view after writing - view.close(); + if(this.parallelOnDiskWrite) + { + try(final OnDiskParallelGraphIndexWriter writer = new OnDiskParallelGraphIndexWriter.Builder(index, graphPath) + .withParallelDirectBuffers(true) + .with(inlineVectors) + .with(fusedPQ) + .build()) + { + writer.write(suppliers); + } } + else + { + try(final OnDiskGraphIndexWriter writer = new OnDiskGraphIndexWriter.Builder(index, graphPath) + .with(inlineVectors) + .with(fusedPQ) + .build()) + { + writer.write(suppliers); + } + } + + // Close the view after writing + view.close(); - LOG.info("Wrote index '{}' with FusedPQ compression ({} nodes)", this.name, index.size(0)); + LOG.info("Wrote index '{}' with FusedPQ compression ({} nodes, parallel={})", + this.name, index.size(0), this.parallelOnDiskWrite); } /** diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java index 36be53d4..3c2d8923 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java 
@@ -55,9 +55,13 @@ public Condition is(final Long key) public boolean equals(final Object obj) { if (obj == this) + { return true; + } if (obj == null || obj.getClass() != this.getClass()) + { return false; + } final VectorEntry that = (VectorEntry)obj; return this.sourceEntityId == that.sourceEntityId && Arrays.equals(this.vector, that.vector) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index bf143022..1f9e48b5 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -35,6 +35,7 @@ import java.io.*; import java.util.*; +import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.locks.ReentrantReadWriteLock; /** @@ -53,6 +54,8 @@ *
  • On-Disk Storage - Optional memory-mapped indices for large datasets
  • *
  • PQ Compression - Product Quantization for reduced memory footprint
  • *
  • Background Optimization - Automatic graph cleanup for improved performance
  • + *
  • Eventual Indexing - Deferred graph mutations via background thread for reduced write latency
  • + *
  • Parallel On-Disk Writes - Multi-threaded index persistence for large on-disk indices
  • * * *

    Basic Usage

    @@ -160,6 +163,37 @@ * .build(); * } * + *

    Eventual Indexing

    + * When enabled, expensive HNSW graph mutations (add, update, remove) are deferred to a background + * thread. The vector store is still updated synchronously, so no data is lost, but graph construction + * happens asynchronously. This reduces the latency of mutation operations at the cost of eventual + * consistency — search results may not immediately reflect the most recent mutations. + *

    + * The graph is automatically drained (all pending operations applied) before + * {@code optimize()}, {@code persistToDisk()}, and {@code close()}. + *

    {@code
    + * VectorIndexConfiguration config = VectorIndexConfiguration.builder()
    + *     .dimension(768)
    + *     .similarityFunction(VectorSimilarityFunction.COSINE)
    + *     .eventualIndexing(true)
    + *     .build();
    + * }
    + * + *

    Parallel On-Disk Writes

    + * When on-disk storage is enabled, persistence can optionally use parallel direct buffers and + * multiple worker threads (one per available processor) to write the index concurrently. This can + * significantly speed up persistence for large indices. Disabled by default, as sequential + * single-threaded writing is preferred in resource-constrained environments or for smaller indices. + *
    {@code
    + * VectorIndexConfiguration config = VectorIndexConfiguration.builder()
    + *     .dimension(768)
    + *     .similarityFunction(VectorSimilarityFunction.COSINE)
    + *     .onDisk(true)
    + *     .indexDirectory(Path.of("/data/vectors"))
    + *     .parallelOnDiskWrite(true)
    + *     .build();
    + * }
    + * *

    Search Methods

    *
    {@code
      * // Search by vector
    @@ -226,6 +260,8 @@
      *   
  • Search - Thread-safe, multiple concurrent searches allowed
  • *
  • Add/Remove - Thread-safe via GigaMap synchronization
  • *
  • Optimization - Briefly blocks add/remove/search during cleanup
  • + *
  • Eventual Indexing - Graph mutations are applied sequentially by a single + * background worker thread; vector store updates remain synchronous
  • * * *

    Limitations

    @@ -593,8 +629,7 @@ public interface Internal extends VectorIndex public static class Default extends AbstractStateChangeFlagged implements VectorIndex.Internal, - BackgroundPersistenceManager.Callback, - BackgroundOptimizationManager.Callback, + BackgroundTaskManager.Callback, PQCompressionManager.VectorProvider, DiskIndexManager.IndexStateProvider { @@ -625,10 +660,9 @@ static BinaryTypeHandler> provideTypeHandler() private transient OnHeapGraphIndex index ; // Managers (transient - recreated on load) - private transient DiskIndexManager diskManager ; - private transient PQCompressionManager pqManager ; - private transient BackgroundPersistenceManager persistenceManager ; - transient BackgroundOptimizationManager optimizationManager; + private transient DiskIndexManager diskManager ; + private transient PQCompressionManager pqManager ; + transient BackgroundTaskManager backgroundTaskManager; // GraphSearcher pool for thread-local reuse private transient ExplicitThreadLocal searcherPool; @@ -636,10 +670,18 @@ static BinaryTypeHandler> provideTypeHandler() // Flag indicating graph was loaded from file (skip rebuild) private transient boolean graphLoadedFromFile; - // Read/write lock for concurrent search during persistence - // Read lock: allows concurrent searches - // Write lock: exclusive access during persistence - private transient ReentrantReadWriteLock persistenceLock; + // Read/write lock for builder operations. + // Read lock: concurrent searches and background-worker mutations + // Write lock: exclusive access for cleanup, persistence, removeAll, close + private transient ReentrantReadWriteLock builderLock; + + // When true, sync-mode mutations defer builder ops to avoid racing with cleanup(). + // cleanup()'s ForkJoinPool workers need the GigaMap monitor (for embedded vectorizers), + // so sync-mode mutations (which hold that monitor) cannot use builderLock — they use + // this flag instead. 
The synchronized(parentMap) barrier in optimize()/persistToDisk() + // ensures any in-flight mutation completes before cleanup begins. + private transient volatile boolean cleanupInProgress; + private transient ConcurrentLinkedQueue deferredBuilderOps; /////////////////////////////////////////////////////////////////////////// @@ -671,8 +713,9 @@ static BinaryTypeHandler> provideTypeHandler() .build() ; - // Initialize persistence lock early (before ensureIndexInitialized) - this.persistenceLock = new ReentrantReadWriteLock(); + // Initialize builder lock early (before ensureIndexInitialized) + this.builderLock = new ReentrantReadWriteLock(); + this.deferredBuilderOps = new ConcurrentLinkedQueue<>(); this.ensureIndexInitialized(); } @@ -757,10 +800,14 @@ private void initializeIndex() { this.vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport(); - // Initialize persistence lock (always, for consistent locking semantics) - if(this.persistenceLock == null) + // Initialize builder lock (always, for consistent locking semantics) + if(this.builderLock == null) + { + this.builderLock = new ReentrantReadWriteLock(); + } + if(this.deferredBuilderOps == null) { - this.persistenceLock = new ReentrantReadWriteLock(); + this.deferredBuilderOps = new ConcurrentLinkedQueue<>(); } // Initialize PQ manager if compression enabled @@ -782,7 +829,8 @@ private void initializeIndex() this.name, this.configuration.indexDirectory(), this.configuration.dimension(), - this.configuration.maxDegree() + this.configuration.maxDegree(), + this.configuration.parallelOnDiskWrite() ); if(this.diskManager.tryLoad()) { @@ -810,56 +858,41 @@ private void initializeIndex() } /** - * Starts background persistence and optimization managers if configured. + * Starts the unified background task manager if any background feature is enabled. 
*/ private void startBackgroundManagersIfEnabled() { - this.startBackgroundPersistenceIfEnabled(); - this.startBackgroundOptimizationIfEnabled(); - } + final boolean eventualIndexing = this.configuration.eventualIndexing(); + final boolean backgroundOptimization = this.configuration.backgroundOptimization(); + final boolean backgroundPersistence = this.configuration.onDisk() && this.configuration.backgroundPersistence(); - /** - * Starts the background persistence manager if configured. - */ - private void startBackgroundPersistenceIfEnabled() - { - if(this.configuration.onDisk() && this.configuration.backgroundPersistence()) + if(eventualIndexing || backgroundOptimization || backgroundPersistence) { - if(this.persistenceManager == null) + if(this.backgroundTaskManager == null) { - this.persistenceManager = new BackgroundPersistenceManager.Default( + this.backgroundTaskManager = new BackgroundTaskManager( this, this.name, + eventualIndexing, + backgroundOptimization, + this.configuration.optimizationIntervalMs(), + this.configuration.minChangesBetweenOptimizations(), + backgroundPersistence, this.configuration.persistenceIntervalMs(), this.configuration.minChangesBetweenPersists() ); - this.persistenceManager.startScheduledPersistence(); - LOG.info("Background persistence started for index '{}' with interval {}ms", - this.name, this.configuration.persistenceIntervalMs()); } } } /** - * Starts the background optimization manager if configured. + * Returns whether eventual indexing is active (background task manager exists + * AND eventualIndexing is configured). The manager may exist for optimization + * or persistence alone. 
*/ - private void startBackgroundOptimizationIfEnabled() + private boolean isEventualIndexing() { - if(this.configuration.backgroundOptimization()) - { - if(this.optimizationManager == null) - { - this.optimizationManager = new BackgroundOptimizationManager.Default( - this, - this.name, - this.configuration.optimizationIntervalMs(), - this.configuration.minChangesBetweenOptimizations() - ); - this.optimizationManager.startScheduledOptimization(); - LOG.info("Background optimization started for index '{}' with interval {}ms", - this.name, this.configuration.optimizationIntervalMs()); - } - } + return this.backgroundTaskManager != null && this.configuration.eventualIndexing(); } /** @@ -1028,37 +1061,43 @@ public VectorIndexConfiguration configuration() private io.github.jbellis.jvector.vector.VectorSimilarityFunction jvectorSimilarityFunction() { + // use switch not valueOf(name) to ensure compiler assistance when jvector enum changes return switch(this.configuration.similarityFunction()) { case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; - default -> throw new IllegalArgumentException("Unsupported similarity function: " + this.configuration.similarityFunction()); }; } @Override public void internalAdd(final long entityId, final E entity) { + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
final int ordinal = toOrdinal(entityId); - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + this.ensureIndexInitialized(); - final float[] vector = this.vectorize(entity); + final float[] vector = this.vectorize(entity); - // Store based on vectorizer type - if(!this.isEmbedded()) - { - this.vectorStore.add(new VectorEntry(entityId, vector)); - } + // Store based on vectorizer type + if(!this.isEmbedded()) + { + this.vectorStore.add(new VectorEntry(entityId, vector)); + } + + this.markStateChangeChildren(); + if(this.isEventualIndexing()) + { + // Defer graph update to background thread + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Add(ordinal, vector)); + } + else + { // Add to HNSW graph using entity ID as ordinal final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); - - this.markStateChangeChildren(); + this.executeOrDeferBuilderOp(() -> this.builder.addGraphNode(ordinal, vf)); // Mark dirty for background managers this.markDirtyForBackgroundManagers(1); @@ -1086,27 +1125,35 @@ public void internalAddAll(final long firstEntityId, final Iterable @Override public void internalUpdate(final long entityId, final E replacedEntity, final E entity) { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.ensureIndexInitialized(); - final float[] vector = this.vectorize(entity); + final float[] vector = this.vectorize(entity); - final int ordinal = toOrdinal(entityId); - this.builder.markNodeDeleted(ordinal); - this.builder.removeDeletedNodes(); + final int ordinal = toOrdinal(entityId); - // Update based on vectorizer type - if(!this.isEmbedded()) - { - this.vectorStore.set(entityId, new VectorEntry(entityId, vector)); - } + // Update based on vectorizer type + if(!this.isEmbedded()) + { + this.vectorStore.set(entityId, new VectorEntry(entityId, vector)); + } - // Add to HNSW graph using entity ID as ordinal - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); + this.markStateChangeChildren(); - this.markStateChangeChildren(); + if(this.isEventualIndexing()) + { + // Defer graph update to background thread + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Update(ordinal, vector)); + } + else + { + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.executeOrDeferBuilderOp(() -> + { + this.builder.markNodeDeleted(ordinal); + this.builder.removeDeletedNodes(); + this.builder.addGraphNode(ordinal, vf); + }); // Mark dirty for background managers this.markDirtyForBackgroundManagers(1); @@ -1135,18 +1182,28 @@ private List collectVectors(final long firstEntityId, final Iterabl */ private void addVectorEntries(final List entries) { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.ensureIndexInitialized(); - if(!this.isEmbedded()) - { - this.vectorStore.addAll(entries); - } + if(!this.isEmbedded()) + { + this.vectorStore.addAll(entries); + } - this.addGraphNodesSequential(entries); + this.markStateChangeChildren(); - this.markStateChangeChildren(); + if(this.isEventualIndexing()) + { + // Defer graph updates to background thread + entries.forEach(entry -> + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Add( + toOrdinal(entry.sourceEntityId), entry.vector + )) + ); + } + else + { + this.executeOrDeferBuilderOp(() -> this.addGraphNodesSequential(entries)); // Mark dirty for background managers (with count for debouncing) this.markDirtyForBackgroundManagers(entries.size()); @@ -1156,15 +1213,12 @@ private void addVectorEntries(final List entries) /** * Marks dirty for background managers with the specified change count. */ - private void markDirtyForBackgroundManagers(final int count) + @Override + public void markDirtyForBackgroundManagers(final int count) { - if(this.persistenceManager != null) - { - this.persistenceManager.markDirty(count); - } - if(this.optimizationManager != null) + if(this.backgroundTaskManager != null) { - this.optimizationManager.markDirty(count); + this.backgroundTaskManager.markDirty(count); } } @@ -1184,18 +1238,25 @@ private void addGraphNodesSequential(final List entries) @Override public void internalRemove(final long entityId, final E entity) { - synchronized(this.parentMap()) + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.ensureIndexInitialized(); + + final int ordinal = toOrdinal(entityId); + if(!this.isEmbedded()) { - this.ensureIndexInitialized(); + this.vectorStore.removeById(entityId); + } - final int ordinal = toOrdinal(entityId); - if(!this.isEmbedded()) - { - this.vectorStore.removeById(entityId); - } - this.builder.markNodeDeleted(ordinal); + this.markStateChangeChildren(); - this.markStateChangeChildren(); + if(this.isEventualIndexing()) + { + // Defer graph update to background thread + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Remove(ordinal)); + } + else + { + this.executeOrDeferBuilderOp(() -> this.builder.markNodeDeleted(ordinal)); // Mark dirty for background managers this.markDirtyForBackgroundManagers(1); @@ -1205,7 +1266,13 @@ public void internalRemove(final long entityId, final E entity) @Override public void internalRemoveAll() { - synchronized(this.parentMap()) + // Acquire write lock to ensure no concurrent persistToDisk() Phase 2, + // search, or background worker mutation is running. + // closeInternalResources() destroys the graph and disk manager, which would + // corrupt any in-flight operation. + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.builderLock.writeLock().lock(); + try { this.ensureIndexInitialized(); @@ -1214,11 +1281,8 @@ public void internalRemoveAll() this.vectorStore.removeAll(); } - // Shutdown optimization manager before closing - this.shutdownOptimizationManager(false); - - // Shutdown persistence manager before closing - this.shutdownPersistenceManager(false); + // Shutdown background task manager (discard pending ops — they're stale) + this.shutdownBackgroundTaskManager(false, false, false); this.closeInternalResources(); @@ -1229,6 +1293,10 @@ public void internalRemoveAll() // Mark dirty for background managers this.markDirtyForBackgroundManagers(1); } + finally + { + this.builderLock.writeLock().unlock(); + } } @Override @@ -1236,34 +1304,34 @@ public VectorSearchResult search(final float[] queryVector, final int k) { this.validateDimension(queryVector); - // Acquire read lock for concurrent search during persistence - this.persistenceLock.readLock().lock(); + // Acquire read lock — blocks during cleanup/persistence/removeAll/close, + // allows concurrent searches and GigaMap mutations. + // No synchronized(parentMap) — avoids lock-ordering deadlock with + // internalRemoveAll (which holds the GigaMap monitor and needs the write lock). 
+ this.builderLock.readLock().lock(); try { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); - - final VectorFloat query = this.vectorTypeSupport.createFloatVector(queryVector); + this.ensureIndexInitialized(); - // Choose search strategy based on index mode - final SearchResult result; - final boolean diskLoaded = this.diskManager != null && this.diskManager.isLoaded(); - if(diskLoaded && this.diskManager.getDiskIndex() != null) - { - result = this.searchDiskIndex(query, k); - } - else - { - result = this.searchInMemoryIndex(query, k); - } + final VectorFloat query = this.vectorTypeSupport.createFloatVector(queryVector); - return this.convertSearchResult(result); + // Choose search strategy based on index mode + final SearchResult result; + final boolean diskLoaded = this.diskManager != null && this.diskManager.isLoaded(); + if(diskLoaded && this.diskManager.getDiskIndex() != null) + { + result = this.searchDiskIndex(query, k); + } + else + { + result = this.searchInMemoryIndex(query, k); } + + return this.convertSearchResult(result); } finally { - this.persistenceLock.readLock().unlock(); + this.builderLock.readLock().unlock(); } } @@ -1359,15 +1427,63 @@ private VectorSearchResult convertSearchResult(final SearchResult result) @Override public void optimize() { - synchronized(this.parentMap()) + // Drain pending indexing operations to ensure graph is complete + if(this.isEventualIndexing()) { - this.ensureIndexInitialized(); - if(this.builder != null) + this.backgroundTaskManager.drainQueue(); + } + + this.doOptimize(); + } + + /** + * Core optimization logic without queue drain. + * Called directly from the background task manager's executor thread + * (where inline drain is already done) and from the public optimize() method. + */ + @Override + public void doOptimize() + { + final GraphIndexBuilder capturedBuilder; + + // Signal sync-mode mutations to defer builder ops during cleanup. 
+ this.cleanupInProgress = true; + try + { + // Barrier: any in-flight GigaMap mutation (which holds the GigaMap monitor) + // will complete before we proceed. New mutations see the flag and defer. + synchronized(this.parentMap()) { - this.builder.cleanup(); + this.ensureIndexInitialized(); + capturedBuilder = this.builder; } - this.markStateChangeChildren(); + + // cleanup() uses ForkJoinPool internally — must be outside + // synchronized(parentMap) to avoid deadlock with embedded vectorizers + // whose worker threads call parentMap.get(). + if(capturedBuilder != null) + { + // Write lock blocks background worker mutations (readLock) and searches. + this.builderLock.writeLock().lock(); + try + { + capturedBuilder.cleanup(); + } + finally + { + this.builderLock.writeLock().unlock(); + } + } + } + finally + { + this.cleanupInProgress = false; } + + // Apply any deferred sync-mode mutations now that cleanup is done. + this.drainDeferredBuilderOps(); + + this.markStateChangeChildren(); } @Override @@ -1378,19 +1494,57 @@ public void persistToDisk() return; // No-op for in-memory indices } - // Acquire write lock for exclusive access during persistence - this.persistenceLock.writeLock().lock(); + // Drain pending indexing operations to ensure graph is complete + if(this.isEventualIndexing()) + { + this.backgroundTaskManager.drainQueue(); + } + + this.doPersistToDisk(); + } + + /** + * Core persistence logic without queue drain. + * Called directly from the background task manager's executor thread + * (where inline drain is already done) and from the public persistToDisk() method. + */ + @Override + public void doPersistToDisk() + { + if(!this.configuration.onDisk()) + { + return; // No-op for in-memory indices + } + + // Signal sync-mode mutations to defer builder ops during cleanup + disk write. + this.cleanupInProgress = true; try { - synchronized(this.parentMap()) + // Acquire write lock for exclusive access during persistence. 
+ // This blocks searches, background worker mutations, removeAll, and close. + this.builderLock.writeLock().lock(); + try { - this.ensureIndexInitialized(); - - // If we have an in-memory builder, write it to disk - if(this.builder != null && this.index != null) + // Captured references for Phase 2 (disk write outside synchronized block) + final OnHeapGraphIndex capturedIndex ; + final RandomAccessVectorValues capturedRavv ; + final PQCompressionManager capturedPqMgr ; + final DiskIndexManager capturedDiskMgr; + + final GraphIndexBuilder capturedBuilder; + + // Phase 1: Barrier + reference capture inside synchronized(parentMap). + // The barrier ensures any in-flight GigaMap mutation completes. + // New mutations see cleanupInProgress=true and defer. + synchronized(this.parentMap()) { - // Cleanup the graph before writing (removes excess neighbors) - this.builder.cleanup(); + this.ensureIndexInitialized(); + + // If we have an in-memory builder, prepare for disk write + if(this.builder == null || this.index == null) + { + return; + } // Initialize disk manager if needed if(this.diskManager == null) @@ -1400,26 +1554,48 @@ public void persistToDisk() this.name, this.configuration.indexDirectory(), this.configuration.dimension(), - this.configuration.maxDegree() + this.configuration.maxDegree(), + this.configuration.parallelOnDiskWrite() ); } - // Create vector values for writing - final RandomAccessVectorValues ravv = this.createVectorValues(); - - // Write using disk manager - this.diskManager.writeIndex(this.index, ravv, this.pqManager); + // Capture references for use outside the synchronized block. + // The parentMap monitor is released before cleanup and disk write + // so that worker threads (ForkJoinPool in cleanup, disk writer) + // can freely call parentMap.get() without deadlocking. 
+ capturedBuilder = this.builder; + capturedIndex = this.index; + capturedRavv = new NullSafeVectorValues( + this.createVectorValues(), this.configuration.dimension(), this.vectorTypeSupport + ); + capturedPqMgr = this.pqManager; + capturedDiskMgr = this.diskManager; } + + // Phase 2: Cleanup and disk write outside synchronized(parentMap). + // builderLock.writeLock() is still held, blocking searches, + // background worker mutations, removeAll, and close. + // parentMap monitor is released, so ForkJoinPool workers and + // disk writer threads can call parentMap.get() for embedded vectors. + capturedBuilder.cleanup(); + capturedDiskMgr.writeIndex(capturedIndex, capturedRavv, capturedPqMgr); + } + catch(final IOException ioe) + { + throw new IORuntimeException(ioe); + } + finally + { + this.builderLock.writeLock().unlock(); } - } - catch(final IOException ioe) - { - throw new IORuntimeException(ioe); } finally { - this.persistenceLock.writeLock().unlock(); + this.cleanupInProgress = false; } + + // Apply any deferred sync-mode mutations now that cleanup + persistence is done. + this.drainDeferredBuilderOps(); } @Override @@ -1459,43 +1635,47 @@ protected void clearChildrenStateChangeMarkers() @Override public void close() { - // Shutdown optimization manager first (may optimize pending changes) - this.shutdownOptimizationManager(this.configuration.optimizeOnShutdown()); - - // Shutdown persistence manager second (may persist pending changes) - this.shutdownPersistenceManager(this.configuration.persistOnShutdown()); + // Shutdown background task manager — drain indexing, optionally optimize and persist + this.shutdownBackgroundTaskManager( + true, + this.configuration.optimizeOnShutdown(), + this.configuration.persistOnShutdown() + ); - synchronized(this.parentMap()) + // Acquire write lock to ensure no concurrent search or persistToDisk() is running. + // closeInternalResources() destroys the graph and disk manager. 
+ this.builderLock.writeLock().lock(); + try { this.closeInternalResources(); } - } - - /** - * Shuts down the background optimization manager. - * - * @param optimizePending if true, optimize pending changes before shutdown - */ - private void shutdownOptimizationManager(final boolean optimizePending) - { - if(this.optimizationManager != null) + finally { - this.optimizationManager.shutdown(optimizePending); - this.optimizationManager = null; + this.builderLock.writeLock().unlock(); } } /** - * Shuts down the background persistence manager. + * Shuts down the background task manager. * - * @param persistPending if true, persist pending changes before shutdown + * @param drainPending if true, drain all pending indexing operations + * @param optimizePending if true and there are pending changes, optimize before shutdown + * @param persistPending if true and there are pending changes, persist before shutdown */ - private void shutdownPersistenceManager(final boolean persistPending) + private void shutdownBackgroundTaskManager( + final boolean drainPending, + final boolean optimizePending, + final boolean persistPending + ) { - if(this.persistenceManager != null) + if(this.backgroundTaskManager != null) { - this.persistenceManager.shutdown(persistPending); - this.persistenceManager = null; + if(!drainPending) + { + this.backgroundTaskManager.discardQueue(); + } + this.backgroundTaskManager.shutdown(drainPending, optimizePending, persistPending); + this.backgroundTaskManager = null; } } @@ -1566,12 +1746,6 @@ private RandomAccessVectorValues createVectorValues() // callback interface implementations // //////////////////////////////////////// - // Note: BackgroundPersistenceManager.Callback.persistToDisk() is implemented - // by the public persistToDisk() method above. - - // Note: BackgroundOptimizationManager.Callback.optimize() is implemented - // by the public optimize() method above. 
- // PQCompressionManager.VectorProvider @Override @@ -1616,6 +1790,94 @@ public long getExpectedVectorCount() } } + // ================================================================ + // BackgroundTaskManager.Callback implementation + // ================================================================ + + @Override + public void applyGraphAdd(final int ordinal, final float[] vector) + { + // Called from the background indexing worker thread (not from GigaMap's + // synchronized methods), so we use builderLock.readLock() to coordinate + // with cleanup (writeLock). + this.builderLock.readLock().lock(); + try + { + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + } + finally + { + this.builderLock.readLock().unlock(); + } + } + + @Override + public void applyGraphUpdate(final int ordinal, final float[] vector) + { + this.builderLock.readLock().lock(); + try + { + this.builder.markNodeDeleted(ordinal); + this.builder.removeDeletedNodes(); + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + } + finally + { + this.builderLock.readLock().unlock(); + } + } + + @Override + public void applyGraphRemove(final int ordinal) + { + this.builderLock.readLock().lock(); + try + { + this.builder.markNodeDeleted(ordinal); + } + finally + { + this.builderLock.readLock().unlock(); + } + } + + + // ================================================================ + // Builder operation deferral helpers + // ================================================================ + + /** + * Executes a builder operation immediately, or defers it if cleanup is in progress. + * Used by sync-mode mutations (called from GigaMap's synchronized methods) which + * cannot acquire builderLock without risking deadlock with embedded vectorizers. 
+ */ + private void executeOrDeferBuilderOp(final Runnable op) + { + if(this.cleanupInProgress) + { + this.deferredBuilderOps.add(op); + } + else + { + op.run(); + } + } + + /** + * Drains and executes all deferred builder operations. + * Called after cleanup completes (cleanupInProgress is already false). + */ + private void drainDeferredBuilderOps() + { + Runnable op; + while((op = this.deferredBuilderOps.poll()) != null) + { + op.run(); + } + } + } } diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java index fa292a8c..c1fd4428 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java @@ -448,6 +448,41 @@ public default boolean backgroundOptimization() */ public boolean optimizeOnShutdown(); + /** + * Returns whether eventual indexing mode is enabled. + *

    + * When enabled, expensive HNSW graph mutations (add, update, remove) are + * deferred to a background thread. The vector store is still updated + * synchronously, but graph construction happens asynchronously. + *

    + * This reduces the latency of mutation operations at the cost of + * eventual consistency — search results may not immediately reflect the + * most recent mutations. + *

    + * The graph is automatically drained (all pending operations applied) + * before {@code optimize()}, {@code persistToDisk()}, and {@code close()}. + * + * @return true if eventual indexing is enabled (default: false) + */ + public boolean eventualIndexing(); + + /** + * Returns whether parallel writing is used for on-disk index persistence. + *

    + * When enabled, the on-disk graph writer uses parallel direct buffers and + * multiple worker threads (one per available processor) to write the index + * concurrently. This significantly speeds up persistence for large indices. + *

    + * When disabled, a sequential single-threaded writer is used, which may be + * preferable in resource-constrained environments or when writing smaller indices. + *

    + * Only applies when {@link #onDisk()} is true. + * + * @return true if parallel on-disk writing is enabled (default: false) + * @see #onDisk() + */ + public boolean parallelOnDiskWrite(); + /** * Creates a new builder for constructing a {@link VectorIndexConfiguration}. @@ -904,6 +939,30 @@ public static interface Builder */ public Builder optimizeOnShutdown(boolean optimizeOnShutdown); + /** + * Enables or disables parallel writing for on-disk index persistence. + *

    + * When enabled, uses multiple worker threads and parallel direct buffers + * for faster disk writes. Only applies when {@link #onDisk(boolean)} is true. + * + * @param parallelOnDiskWrite true to enable parallel on-disk writing + * @return this builder for method chaining + * @see VectorIndexConfiguration#parallelOnDiskWrite() + */ + public Builder parallelOnDiskWrite(boolean parallelOnDiskWrite); + + /** + * Enables or disables eventual indexing mode. + *

    + * When enabled, HNSW graph mutations are deferred to a background thread, + * reducing mutation latency at the cost of eventual consistency for searches. + * + * @param eventualIndexing true to enable eventual indexing + * @return this builder for method chaining + * @see VectorIndexConfiguration#eventualIndexing() + */ + public Builder eventualIndexing(boolean eventualIndexing); + /** * Builds the configuration with the specified parameters. * @@ -943,6 +1002,8 @@ public static class Default implements Builder private long optimizationIntervalMs ; private int minChangesBetweenOptimizations; private boolean optimizeOnShutdown ; + private boolean parallelOnDiskWrite ; + private boolean eventualIndexing ; Default() { @@ -962,6 +1023,8 @@ public static class Default implements Builder this.optimizationIntervalMs = 0; // 0 = disabled this.minChangesBetweenOptimizations = 1000; this.optimizeOnShutdown = false; + this.parallelOnDiskWrite = false; + this.eventualIndexing = false; } @Override @@ -1096,6 +1159,20 @@ public Builder optimizeOnShutdown(final boolean optimizeOnShutdown) return this; } + @Override + public Builder parallelOnDiskWrite(final boolean parallelOnDiskWrite) + { + this.parallelOnDiskWrite = parallelOnDiskWrite; + return this; + } + + @Override + public Builder eventualIndexing(final boolean eventualIndexing) + { + this.eventualIndexing = eventualIndexing; + return this; + } + @Override public VectorIndexConfiguration build() { @@ -1143,7 +1220,9 @@ public VectorIndexConfiguration build() this.minChangesBetweenPersists, this.optimizationIntervalMs, this.minChangesBetweenOptimizations, - this.optimizeOnShutdown + this.optimizeOnShutdown, + this.parallelOnDiskWrite, + this.eventualIndexing ); } @@ -1173,6 +1252,8 @@ public static class Default implements VectorIndexConfiguration private final long optimizationIntervalMs ; private final int minChangesBetweenOptimizations; private final boolean optimizeOnShutdown ; + private final boolean 
parallelOnDiskWrite ; + private final boolean eventualIndexing ; Default( final int dimension , @@ -1190,7 +1271,9 @@ public static class Default implements VectorIndexConfiguration final int minChangesBetweenPersists , final long optimizationIntervalMs , final int minChangesBetweenOptimizations , - final boolean optimizeOnShutdown + final boolean optimizeOnShutdown , + final boolean parallelOnDiskWrite , + final boolean eventualIndexing ) { this.dimension = dimension ; @@ -1209,6 +1292,8 @@ public static class Default implements VectorIndexConfiguration this.optimizationIntervalMs = optimizationIntervalMs ; this.minChangesBetweenOptimizations = minChangesBetweenOptimizations ; this.optimizeOnShutdown = optimizeOnShutdown ; + this.parallelOnDiskWrite = parallelOnDiskWrite ; + this.eventualIndexing = eventualIndexing ; } @Override @@ -1307,6 +1392,18 @@ public boolean optimizeOnShutdown() return this.optimizeOnShutdown; } + @Override + public boolean parallelOnDiskWrite() + { + return this.parallelOnDiskWrite; + } + + @Override + public boolean eventualIndexing() + { + return this.eventualIndexing; + } + } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java new file mode 100644 index 00000000..e0255df0 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java @@ -0,0 +1,666 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.eclipse.store.gigamap.types.GigaMap; +import 
org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Concurrent stress tests for VectorIndex thread-safety. + *

    + * Multiple threads perform random add, update, remove, and search operations + * concurrently. Each test configuration exercises a different combination of: + *

      + *
    • On-disk vs. in-memory
    • + *
    • PQ compression
    • + *
    • Eventual indexing
    • + *
    • Parallel on-disk write
    • + *
    • Background optimization
    • + *
    • Background persistence
    • + *
    + *

    + * The primary assertion is that no exceptions are thrown and no deadlocks occur + * (enforced by {@link Timeout}). + */ +class VectorIndexConcurrentStressTest +{ + record Document(String content, float[] embedding) {} + + static class ComputedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + + private static float[] randomVector(final Random random, final int dimension) + { + final float[] vector = new float[dimension]; + float norm = 0; + for(int i = 0; i < dimension; i++) + { + vector[i] = random.nextFloat() * 2 - 1; + norm += vector[i] * vector[i]; + } + norm = (float)Math.sqrt(norm); + for(int i = 0; i < dimension; i++) + { + vector[i] /= norm; + } + return vector; + } + + + // ==================== Configuration Combinations ==================== + + /** + * Describes one configuration combination. + */ + private record ConfigCombo( + String label, + boolean onDisk, + boolean pqCompression, + boolean eventual, + boolean parallel, + boolean backgroundOptimization, + boolean backgroundPersistence + ) {} + + /** + * Generates all valid configuration combinations. + *

    + * Constraints: + *

      + *
    • PQ compression requires onDisk
    • + *
    • Background persistence requires onDisk
    • + *
    • parallel only meaningful when onDisk
    • + *
    + */ + private static List allCombos() + { + final List combos = new ArrayList<>(); + + // In-memory combos: onDisk=false → pq=false, persistence=false, parallel irrelevant + for(final boolean eventual : new boolean[]{false, true}) + { + for(final boolean optimization : new boolean[]{false, true}) + { + combos.add(new ConfigCombo( + "mem|eventual=" + eventual + "|opt=" + optimization, + false, false, eventual, false, optimization, false + )); + } + } + + // On-disk combos + for(final boolean pq : new boolean[]{false, true}) + { + for(final boolean eventual : new boolean[]{false, true}) + { + for(final boolean parallel : new boolean[]{false, true}) + { + for(final boolean optimization : new boolean[]{false, true}) + { + for(final boolean persistence : new boolean[]{false, true}) + { + combos.add(new ConfigCombo( + "disk|pq=" + pq + + "|eventual=" + eventual + + "|parallel=" + parallel + + "|opt=" + optimization + + "|persist=" + persistence, + true, pq, eventual, parallel, optimization, persistence + )); + } + } + } + } + } + + return combos; + } + + /** + * Builds a VectorIndexConfiguration from a combo. + */ + private static VectorIndexConfiguration buildConfig( + final ConfigCombo combo, + final int dimension, + final Path indexDir + ) + { + final VectorIndexConfiguration.Builder builder = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(combo.pqCompression() ? 
32 : 16) + .beamWidth(100) + .eventualIndexing(combo.eventual()); + + if(combo.onDisk()) + { + builder + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(combo.parallel()); + + if(combo.pqCompression()) + { + builder + .enablePqCompression(true) + .pqSubspaces(dimension / 4); + } + + if(combo.backgroundPersistence()) + { + builder + .persistenceIntervalMs(200) + .minChangesBetweenPersists(1); + } + } + + if(combo.backgroundOptimization()) + { + builder + .optimizationIntervalMs(200) + .minChangesBetweenOptimizations(5); + } + + return builder.build(); + } + + + // ==================== Stress Test Core ==================== + + /** + * Runs a concurrent stress test for a single configuration. + *

    + * 4 threads perform random add/update/remove/search operations concurrently. + * A pool of pre-seeded entities ensures ordinals exist for update/remove. + * + * @param combo the configuration combination + * @param indexDir directory for on-disk index (may be null for in-memory) + */ + private void runStressTest(final ConfigCombo combo, final Path indexDir) throws Exception + { + final int dimension = 64; + final int seedCount = 30; + final int opsPerThread = 60; + final int threadCount = 4; + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = buildConfig(combo, dimension, indexDir); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + // Seed the index with initial entities so updates/removes have targets + final Random seedRandom = new Random(42); + for (int i = 0; i < seedCount; i++) + { + gigaMap.add(new Document("seed_" + i, randomVector(seedRandom, dimension))); + } + + // For eventual indexing, drain the seed operations + if (combo.eventual()) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + } + + // If PQ compression, train before concurrent access + if (combo.pqCompression()) + { + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + } + + // Shared state for coordinating threads + final AtomicLong nextEntityId = new AtomicLong(seedCount); + final AtomicBoolean hasError = new AtomicBoolean(false); + final AtomicInteger completedOps = new AtomicInteger(0); + final List errors = java.util.Collections.synchronizedList(new ArrayList<>()); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(threadCount); + + final ExecutorService executor = Executors.newFixedThreadPool(threadCount); + + for (int t = 0; t < threadCount; 
t++) + { + final int threadId = t; + executor.submit(() -> + { + try + { + // Wait for all threads to be ready + startLatch.await(); + + final Random random = new Random(1000 + threadId); + + for (int op = 0; op < opsPerThread && !hasError.get(); op++) + { + try + { + final int action = random.nextInt(100); + + if (action < 30) + { + // 30%: ADD + final float[] vector = randomVector(random, dimension); + synchronized (gigaMap) + { + gigaMap.add(new Document( + "t" + threadId + "_" + op, vector + )); + } + } else if (action < 45) + { + // 15%: UPDATE (set) — target a seed entity + final long targetId = random.nextInt(seedCount); + final float[] vector = randomVector(random, dimension); + synchronized (gigaMap) + { + try + { + gigaMap.set(targetId, new Document( + "updated_" + targetId, vector + )); + } catch (final Exception e) + { + // Entity may have been removed by another thread — acceptable + } + } + } else if (action < 55) + { + // 10%: REMOVE — target a seed entity + final long targetId = random.nextInt(seedCount); + synchronized (gigaMap) + { + try + { + gigaMap.removeById(targetId); + } catch (final Exception e) + { + // Entity may already be removed — acceptable + } + } + } else + { + // 45%: SEARCH + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 5); + // Result may be empty if all entities were removed — that's fine + assertNotNull(result); + } + + completedOps.incrementAndGet(); + } catch (final Exception e) + { + errors.add(e); + hasError.set(true); + } + } + } catch (final InterruptedException e) + { + Thread.currentThread().interrupt(); + } finally + { + doneLatch.countDown(); + } + }); + } + + // Release all threads simultaneously + startLatch.countDown(); + + // Wait for completion + assertTrue(doneLatch.await(60, TimeUnit.SECONDS), + "Threads should complete within timeout for: " + combo.label()); + + executor.shutdown(); + assertTrue(executor.awaitTermination(10, 
TimeUnit.SECONDS)); + + // Report errors + if (!errors.isEmpty()) + { + final StringBuilder sb = new StringBuilder(); + sb.append("Concurrent stress test failed for: ").append(combo.label()); + sb.append("\n").append(errors.size()).append(" error(s):"); + for (final Throwable err : errors) + { + sb.append("\n - ").append(err.getClass().getSimpleName()) + .append(": ").append(err.getMessage()); + } + fail(sb.toString()); + } + + // Verify the index is still consistent — drain and search + if (combo.eventual()) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + if (defaultIndex.backgroundTaskManager != null) + { + defaultIndex.backgroundTaskManager.drainQueue(); + } + } + + final VectorSearchResult finalResult = index.search( + randomVector(new Random(999), dimension), 5 + ); + assertNotNull(finalResult); + } + } + + + // ==================== In-Memory Combinations ==================== + + @Test + @Timeout(value = 120, unit = TimeUnit.SECONDS) + void testConcurrentStress_InMemory() + { + final List combos = allCombos().stream() + .filter(c -> !c.onDisk()) + .toList(); + + assertFalse(combos.isEmpty(), "Should have in-memory combos"); + + final List passed = new ArrayList<>(); + for(final ConfigCombo combo : combos) + { + try + { + this.runStressTest(combo, null); + passed.add(combo.label()); + } + catch(final Exception e) + { + fail("Failed for combo: " + combo.label() + " — " + e.getMessage(), e); + } + } + + assertEquals(combos.size(), passed.size(), + "All in-memory combos should pass"); + } + + + // ==================== On-Disk without PQ Combinations ==================== + + @Test + @Timeout(value = 180, unit = TimeUnit.SECONDS) + void testConcurrentStress_OnDisk_NoPQ(@TempDir final Path tempDir) + { + final List combos = allCombos().stream() + .filter(c -> c.onDisk() && !c.pqCompression()) + .toList(); + + assertFalse(combos.isEmpty(), "Should have on-disk no-PQ combos"); + + final List passed = new ArrayList<>(); + int comboIndex = 
0; + for(final ConfigCombo combo : combos) + { + final Path indexDir = tempDir.resolve("combo_" + comboIndex++); + try + { + this.runStressTest(combo, indexDir); + passed.add(combo.label()); + } + catch(final Exception e) + { + fail("Failed for combo: " + combo.label() + " — " + e.getMessage(), e); + } + } + + assertEquals(combos.size(), passed.size(), + "All on-disk no-PQ combos should pass"); + } + + + // ==================== On-Disk with PQ Combinations ==================== + + @Test + @Timeout(value = 180, unit = TimeUnit.SECONDS) + void testConcurrentStress_OnDisk_WithPQ(@TempDir final Path tempDir) + { + final List combos = allCombos().stream() + .filter(c -> c.onDisk() && c.pqCompression()) + .toList(); + + assertFalse(combos.isEmpty(), "Should have on-disk PQ combos"); + + final List passed = new ArrayList<>(); + int comboIndex = 0; + for(final ConfigCombo combo : combos) + { + final Path indexDir = tempDir.resolve("pq_combo_" + comboIndex++); + try + { + this.runStressTest(combo, indexDir); + passed.add(combo.label()); + } + catch(final Exception e) + { + fail("Failed for combo: " + combo.label() + " — " + e.getMessage(), e); + } + } + + assertEquals(combos.size(), passed.size(), + "All on-disk PQ combos should pass"); + } + + + // ==================== Focused Eventual Indexing Stress ==================== + + /** + * Focused test: heavier load with eventual indexing enabled. + * More operations per thread to stress the background queue. 
+ */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEventualIndexingHeavyConcurrentLoad(@TempDir final Path tempDir) + throws Exception + { + final int dimension = 64; + final int seedCount = 50; + final int opsPerThread = 150; + final int threadCount = 6; + final Path indexDir = tempDir.resolve("heavy"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .eventualIndexing(true) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(10) + .persistenceIntervalMs(500) + .minChangesBetweenPersists(5) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + // Seed + final Random seedRandom = new Random(42); + for (int i = 0; i < seedCount; i++) + { + gigaMap.add(new Document("seed_" + i, randomVector(seedRandom, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + final AtomicBoolean hasError = new AtomicBoolean(false); + final List errors = java.util.Collections.synchronizedList(new ArrayList<>()); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(threadCount); + + final ExecutorService executor = Executors.newFixedThreadPool(threadCount); + + for (int t = 0; t < threadCount; t++) + { + final int threadId = t; + executor.submit(() -> + { + try + { + startLatch.await(); + final Random random = new Random(2000 + threadId); + + for (int op = 0; op < opsPerThread && !hasError.get(); op++) + { + try + { + final int action = random.nextInt(100); + + if (action < 25) + { + // ADD + synchronized 
(gigaMap) + { + gigaMap.add(new Document( + "t" + threadId + "_" + op, + randomVector(random, dimension) + )); + } + } + else if (action < 40) + { + // UPDATE + final long targetId = random.nextInt(seedCount); + synchronized (gigaMap) + { + try + { + gigaMap.set(targetId, new Document( + "upd_" + targetId, + randomVector(random, dimension) + )); + } + catch(final Exception ignored) + { + } + } + } else if (action < 50) + { + // REMOVE + final long targetId = random.nextInt(seedCount); + synchronized (gigaMap) + { + try + { + gigaMap.removeById(targetId); + } + catch(final Exception ignored) + { + } + } + } else + { + // SEARCH + final VectorSearchResult result = index.search( + randomVector(random, dimension), 5 + ); + assertNotNull(result); + } + } + catch(final Exception e) + { + errors.add(e); + hasError.set(true); + } + } + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + } + finally + { + doneLatch.countDown(); + } + }); + } + + startLatch.countDown(); + + assertTrue(doneLatch.await(60, TimeUnit.SECONDS), + "Heavy concurrent load should complete within timeout"); + + executor.shutdown(); + assertTrue(executor.awaitTermination(10, TimeUnit.SECONDS)); + + if (!errors.isEmpty()) + { + final StringBuilder sb = new StringBuilder("Heavy eventual indexing stress test failed:"); + for (final Throwable err : errors) + { + sb.append("\n - ").append(err.getClass().getSimpleName()) + .append(": ").append(err.getMessage()); + } + fail(sb.toString()); + } + + // Drain and verify final state + defaultIndex.backgroundTaskManager.drainQueue(); + + final VectorSearchResult finalResult = index.search( + randomVector(new Random(999), dimension), 5 + ); + assertNotNull(finalResult); + } + } +} diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index caa5c874..901ce3df 100644 --- 
a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -54,6 +54,8 @@ void testBuilderDefaults() assertEquals(0L, config.optimizationIntervalMs()); assertEquals(1000, config.minChangesBetweenOptimizations()); assertFalse(config.optimizeOnShutdown()); + assertFalse(config.parallelOnDiskWrite()); + assertFalse(config.eventualIndexing()); } // ==================== Builder Validation Tests ==================== @@ -121,30 +123,6 @@ void testBuilderRequiresNonNegativePqSubspaces() ); } - @Test - void testBuilderRequiresNonNegativePersistenceIntervalMs() - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .persistenceIntervalMs(0) - .build(); - assertEquals(0L, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).persistenceIntervalMs(-1).build() - ); - } - - @Test - void testBuilderRequiresNonNegativeMinChangesBetweenPersists() - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).minChangesBetweenPersists(-1).build() - ); - } - @Test void testBuilderRequiresNonNegativeOptimizationIntervalMs() { @@ -190,19 +168,6 @@ void testOnDiskRequiresIndexDirectory() ); } - @Test - void testOnDiskWithIndexDirectorySucceeds(@TempDir final Path tempDir) - { - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .onDisk(true) - .indexDirectory(tempDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(tempDir, config.indexDirectory()); - } - @Test void testCompressionRequiresOnDisk() { @@ -269,6 +234,129 @@ void testPqSubspacesZeroMeansAuto(@TempDir final Path tempDir) assertEquals(0, config.pqSubspaces()); } + // 
==================== Parallel On-Disk Write Tests ==================== + + @Test + void testParallelOnDiskWriteDefaultFalse() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .build(); + + assertFalse(config.parallelOnDiskWrite()); + } + + @Test + void testParallelOnDiskWriteCanBeDisabled(@TempDir final Path tempDir) + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .onDisk(true) + .indexDirectory(tempDir) + .parallelOnDiskWrite(false) + .build(); + + assertFalse(config.parallelOnDiskWrite()); + } + + @Test + void testParallelVsNonParallelShareSameDefaults(@TempDir final Path tempDir) + { + final VectorIndexConfiguration parallel = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndexConfiguration sequential = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .parallelOnDiskWrite(false) + .build(); + + assertTrue(parallel.parallelOnDiskWrite()); + assertFalse(sequential.parallelOnDiskWrite()); + + // All other parameters remain identical + assertEquals(parallel.dimension(), sequential.dimension()); + assertEquals(parallel.similarityFunction(), sequential.similarityFunction()); + assertEquals(parallel.maxDegree(), sequential.maxDegree()); + assertEquals(parallel.beamWidth(), sequential.beamWidth()); + assertEquals(parallel.neighborOverflow(), sequential.neighborOverflow()); + assertEquals(parallel.alpha(), sequential.alpha()); + assertEquals(parallel.onDisk(), sequential.onDisk()); + assertEquals(parallel.indexDirectory(), sequential.indexDirectory()); + assertEquals(parallel.enablePqCompression(), sequential.enablePqCompression()); + assertEquals(parallel.pqSubspaces(), sequential.pqSubspaces()); + assertEquals(parallel.persistenceIntervalMs(), sequential.persistenceIntervalMs()); + 
assertEquals(parallel.persistOnShutdown(), sequential.persistOnShutdown()); + assertEquals(parallel.minChangesBetweenPersists(), sequential.minChangesBetweenPersists()); + assertEquals(parallel.optimizationIntervalMs(), sequential.optimizationIntervalMs()); + assertEquals(parallel.minChangesBetweenOptimizations(), sequential.minChangesBetweenOptimizations()); + assertEquals(parallel.optimizeOnShutdown(), sequential.optimizeOnShutdown()); + } + + @Test + void testParallelVsNonParallelWithCompression(@TempDir final Path tempDir) + { + final VectorIndexConfiguration parallel = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(48) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndexConfiguration sequential = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(48) + .parallelOnDiskWrite(false) + .build(); + + assertTrue(parallel.parallelOnDiskWrite()); + assertFalse(sequential.parallelOnDiskWrite()); + + // Compression settings are identical regardless of parallel mode + assertEquals(parallel.enablePqCompression(), sequential.enablePqCompression()); + assertEquals(parallel.pqSubspaces(), sequential.pqSubspaces()); + assertEquals(parallel.maxDegree(), sequential.maxDegree()); + } + + @Test + void testFactoryMethodsDefaultToSequential(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("vectors"); + + final VectorIndexConfiguration medium = VectorIndexConfiguration.forMediumDataset(768, indexDir); + assertFalse(medium.parallelOnDiskWrite()); + + final VectorIndexConfiguration large = VectorIndexConfiguration.forLargeDataset(768, indexDir); + assertFalse(large.parallelOnDiskWrite()); + + final VectorIndexConfiguration highPrecision = VectorIndexConfiguration.forHighPrecision(768, indexDir); + assertFalse(highPrecision.parallelOnDiskWrite()); + } + + @Test + void 
testBuilderForLargeDatasetCanDisableParallel(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("vectors"); + final VectorIndexConfiguration config = VectorIndexConfiguration.builderForLargeDataset(768, indexDir) + .parallelOnDiskWrite(false) + .enablePqCompression(true) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.enablePqCompression()); + assertFalse(config.parallelOnDiskWrite()); + } + // ==================== Similarity Function Tests ==================== @Test @@ -592,6 +680,7 @@ void testFullOnDiskConfiguration(@TempDir final Path tempDir) .optimizationIntervalMs(120_000) .minChangesBetweenOptimizations(500) .optimizeOnShutdown(true) + .parallelOnDiskWrite(false) .build(); assertEquals(768, config.dimension()); @@ -612,6 +701,7 @@ void testFullOnDiskConfiguration(@TempDir final Path tempDir) assertEquals(120_000L, config.optimizationIntervalMs()); assertEquals(500, config.minChangesBetweenOptimizations()); assertTrue(config.optimizeOnShutdown()); + assertFalse(config.parallelOnDiskWrite()); } @Test @@ -774,6 +864,7 @@ void testBuilderMethodChainingReturnsBuilder() assertSame(builder, builder.optimizationIntervalMs(60_000)); assertSame(builder, builder.minChangesBetweenOptimizations(1000)); assertSame(builder, builder.optimizeOnShutdown(false)); + assertSame(builder, builder.parallelOnDiskWrite(true)); } // ==================== Factory Methods Comparison Tests ==================== @@ -829,4 +920,226 @@ void testLargeDatasetEnablesCompression(@TempDir final Path tempDir) assertTrue(config.enablePqCompression(), "Large dataset should have compression enabled by default"); } + + // ==================== Eventual Indexing Tests ==================== + + @Test + void testEventualIndexingDefaultFalse() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .build(); + + assertFalse(config.eventualIndexing()); + } + + @Test + void testEventualIndexingCanBeEnabled() + { + final 
VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .eventualIndexing(true) + .build(); + + assertTrue(config.eventualIndexing()); + } + + @Test + void testEventualIndexingCanBeDisabledExplicitly() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .eventualIndexing(false) + .build(); + + assertFalse(config.eventualIndexing()); + } + + @Test + void testEventualIndexingWithOnDiskConfig(@TempDir final Path tempDir) + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .onDisk(true) + .indexDirectory(tempDir) + .eventualIndexing(true) + .build(); + + assertTrue(config.eventualIndexing()); + assertTrue(config.onDisk()); + } + + @Test + void testFactoryMethodsDefaultEventualIndexingFalse(@TempDir final Path tempDir) + { + assertFalse(VectorIndexConfiguration.forSmallDataset(64).eventualIndexing()); + assertFalse(VectorIndexConfiguration.forMediumDataset(64).eventualIndexing()); + assertFalse(VectorIndexConfiguration.forLargeDataset(64, tempDir).eventualIndexing()); + assertFalse(VectorIndexConfiguration.forHighPrecision(64).eventualIndexing()); + } + + /** + * Test on-disk configuration builder. + */ + @Test + void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(tempDir) + .build(); + + assertTrue(config.onDisk()); + assertEquals(tempDir, config.indexDirectory()); + assertFalse(config.enablePqCompression()); + assertEquals(0, config.pqSubspaces()); + } + + /** + * Test on-disk configuration with compression. + * FusedPQ requires maxDegree=32, so it should be auto-set. 
+ */ + @Test + void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) // Will be overridden to 32 for FusedPQ + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(32) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.enablePqCompression()); + assertEquals(32, config.pqSubspaces()); + assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); + } + + /** + * Test that maxDegree is auto-set to 32 when compression is enabled. + */ + @Test + void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) + { + // Try to set maxDegree to 64 with compression enabled + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .maxDegree(64) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .build(); + + // Should be overridden to 32 + assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); + } + + /** + * Test background persistence configuration builder. + */ + @Test + void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) + .persistOnShutdown(true) + .minChangesBetweenPersists(50) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundPersistence()); + assertEquals(60_000, config.persistenceIntervalMs()); + assertTrue(config.persistOnShutdown()); + assertEquals(50, config.minChangesBetweenPersists()); + } + + /** + * Test validation: persistenceIntervalMs must be non-negative. 
+ */ + @Test + void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) + { + // 0 is valid (means disabled) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(0) + .build(); + assertEquals(0, config.persistenceIntervalMs()); + assertFalse(config.backgroundPersistence()); + + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(-1000) + .build() + ); + } + + /** + * Test validation: minChangesBetweenPersists must be non-negative. + */ + @Test + void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) + { + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(-1) + .build() + ); + + // Zero should be allowed (persist on every interval) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(0) + .build(); + assertEquals(0, config.minChangesBetweenPersists()); + } + + /** + * Test background optimization configuration builder. 
+ */ + @Test + void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(120_000) + .minChangesBetweenOptimizations(500) + .optimizeOnShutdown(true) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundOptimization()); + assertEquals(120_000, config.optimizationIntervalMs()); + assertEquals(500, config.minChangesBetweenOptimizations()); + assertTrue(config.optimizeOnShutdown()); + } + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index 3e453425..3d94f254 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -9,16 +9,14 @@ * This program and the accompanying materials are made * available under the terms of the Eclipse Public License 2.0 * which is available at https://www.eclipse.org/legal/epl-2.0/ - * + * * SPDX-License-Identifier: EPL-2.0 * #L% */ -import org.eclipse.store.gigamap.types.GigaMap; -import org.eclipse.store.storage.embedded.types.EmbeddedStorage; -import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; +import static java.time.Duration.ofMillis; +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; @@ -32,8 +30,14 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; 
+import java.util.stream.IntStream; -import static org.junit.jupiter.api.Assertions.*; +import org.eclipse.store.gigamap.types.GigaMap; +import org.eclipse.store.storage.embedded.types.EmbeddedStorage; +import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; /** * Tests for on-disk VectorIndex functionality and Product Quantization. @@ -57,6 +61,24 @@ public float[] vectorize(final Document entity) } } + /** + * Embedded vectorizer - vectors are part of the entity, not stored separately. + */ + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + /** * Helper to generate a random normalized vector. */ @@ -78,120 +100,33 @@ private static float[] randomVector(final Random random, final int dimension) } /** - * Test on-disk configuration builder. - */ - @Test - void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(indexDir, config.indexDirectory()); - assertFalse(config.enablePqCompression()); - assertEquals(0, config.pqSubspaces()); - } - - /** - * Test on-disk configuration with compression. - * FusedPQ requires maxDegree=32, so it should be auto-set. 
- */ - @Test - void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) // Will be overridden to 32 for FusedPQ - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(32) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.enablePqCompression()); - assertEquals(32, config.pqSubspaces()); - assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); - } - - /** - * Test that maxDegree is auto-set to 32 when compression is enabled. - */ - @Test - void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - // Try to set maxDegree to 64 with compression enabled - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .maxDegree(64) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .build(); - - // Should be overridden to 32 - assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); - } - - /** - * Test validation: onDisk requires indexDirectory. + * Helper to add multiple documents with random vectors to a GigaMap. */ - @Test - void testOnDiskRequiresIndexDirectory() + private static void addRandomDocuments( + final GigaMap gigaMap, + final Random random, + final int dimension, + final int count, + final String prefix + ) { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - // indexDirectory not set - .build() - ); + IntStream.range(0, count) + .forEach(i -> gigaMap.add(new Document(prefix + i, randomVector(random, dimension)))); } /** - * Test validation: compression requires onDisk. + * Helper to add multiple documents from a list of pre-generated vectors. 
*/ - @Test - void testCompressionRequiresOnDisk() + private static void addDocumentsFromVectors( + final GigaMap gigaMap, + final List vectors, + final String prefix + ) { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .enablePqCompression(true) - // onDisk not set - .build() - ); + IntStream.range(0, vectors.size()) + .forEach(i -> gigaMap.add(new Document(prefix + i, vectors.get(i)))); } - /** - * Test validation: pqSubspaces must divide dimension evenly. - */ - @Test - void testPqSubspacesMustDivideDimension(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(33) // 100 is not divisible by 33 - .build() - ); - } /** * Test creating an on-disk index and persisting it. @@ -241,10 +176,7 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I assertFalse(index.isPqCompressionEnabled()); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Search and record expected results final VectorSearchResult result = index.search(queryVector, 10); @@ -275,7 +207,6 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); // Search and compare results @@ -327,10 +258,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertTrue(index.isPqCompressionEnabled()); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, 
dimension, vectorCount, "doc_"); // Train compression ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -342,11 +270,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertEquals(10, result.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); // Persist to disk index.persistToDisk(); @@ -387,10 +311,7 @@ void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException ); // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); // Add a one-hot "needle" vector that randomVector() cannot produce, // since randomVector() populates all dimensions with non-zero values. 
@@ -440,10 +361,7 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("phase1_doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 100, "phase1_doc_"); assertEquals(100, gigaMap.size()); storage.storeRoot(); @@ -465,10 +383,7 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep assertEquals(10, result.size()); // Add more vectors - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("phase2_doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "phase2_doc_"); assertEquals(150, gigaMap.size()); storage.storeRoot(); @@ -530,10 +445,7 @@ void testPqCompressionSearchQuality(@TempDir final Path tempDir) ); // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); // Add a one-hot "needle" vector that randomVector() cannot produce, // since randomVector() populates all dimensions with non-zero values. 
@@ -615,10 +527,7 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Train and search ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -649,7 +558,6 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); @@ -667,11 +575,7 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(expectedIds.size(), actualIds.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); } } } @@ -707,10 +611,7 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -718,10 +619,7 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -755,10 +653,7 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 
0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -766,10 +661,7 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -805,10 +697,7 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -816,11 +705,7 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); } /** @@ -855,10 +740,7 @@ void testPqCompressionWithRemoval(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -916,10 +798,7 @@ void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Excep new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, 
randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -997,10 +876,7 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) ); // Add initial vectors - for(int i = 0; i < initialCount; i++) - { - gigaMap.add(new Document("initial_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); // Train PQ ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -1011,10 +887,7 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) assertEquals(10, resultBefore.size()); // Add more vectors after training - for(int i = 0; i < additionalCount; i++) - { - gigaMap.add(new Document("additional_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, additionalCount, "additional_"); assertEquals(initialCount + additionalCount, gigaMap.size()); @@ -1068,10 +941,7 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc new ComputedDocumentVectorizer() ); - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); @@ -1095,7 +965,6 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc assertEquals(500, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); @@ -1104,11 +973,8 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc assertEquals(10, result.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - 
assertTrue(entry.entity().content().startsWith("doc_"));
-        }
+        result.forEach(entry ->
+            assertTrue(entry.entity().content().startsWith("doc_")));
     }
@@ -1161,10 +1027,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir)
         );
 
         // Initial population
-        for(int i = 0; i < 500; i++)
-        {
-            gigaMap.add(new Document("old_" + i, randomVector(random, dimension)));
-        }
+        addRandomDocuments(gigaMap, random, dimension, 500, "old_");
 
         assertEquals(500, gigaMap.size());
 
@@ -1173,10 +1036,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir)
         assertEquals(0, gigaMap.size());
 
         // Repopulate
-        for(int i = 0; i < 600; i++)
-        {
-            gigaMap.add(new Document("new_" + i, randomVector(random, dimension)));
-        }
+        addRandomDocuments(gigaMap, random, dimension, 600, "new_");
 
         assertEquals(600, gigaMap.size());
 
@@ -1190,11 +1050,8 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir)
         final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20);
         assertEquals(20, result.size());
 
-        for(final VectorSearchResult.Entry entry : result)
-        {
-            assertTrue(entry.entity().content().startsWith("new_"),
-                "All results should be from new population");
-        }
+        result.forEach(entry -> assertTrue(entry.entity().content().startsWith("new_"),
+            "All results should be from new population"));
 
     /**
@@ -1227,10 +1084,7 @@ void testInMemoryIndexStillWorks()
         assertFalse(index.isOnDisk());
 
         // Add vectors
-        for(int i = 0; i < 100; i++)
-        {
-            gigaMap.add(new Document("doc_" + i, randomVector(random, dimension)));
-        }
+        addRandomDocuments(gigaMap, random, dimension, 100, "doc_");
 
         // Search should work
         final VectorSearchResult result = index.search(randomVector(random, dimension), 10);
@@ -1242,118 +1096,6 @@
     // Background Persistence Tests
     // ========================================================================
 
-    /**
-     * Test background persistence configuration builder.
- */ - @Test - void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) - .persistOnShutdown(true) - .minChangesBetweenPersists(50) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.backgroundPersistence()); - assertEquals(60_000, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(50, config.minChangesBetweenPersists()); - } - - /** - * Test background persistence configuration defaults. - */ - @Test - void testBackgroundPersistenceConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background persistence should be disabled by default - assertFalse(config.backgroundPersistence()); - assertEquals(0, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(100, config.minChangesBetweenPersists()); - } - - /** - * Test validation: background persistence requires onDisk. - */ - @Test - void testBackgroundPersistenceRequiresOnDisk() - { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .persistenceIntervalMs(30_000) - // onDisk not set - .build() - ); - } - - /** - * Test validation: persistenceIntervalMs must be non-negative. 
- */ - @Test - void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(0) - .build(); - assertEquals(0, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenPersists must be non-negative. - */ - @Test - void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(-1) - .build() - ); - - // Zero should be allowed (persist on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(0) - .build(); - assertEquals(0, config.minChangesBetweenPersists()); - } - /** * Test that background persistence triggers after the configured interval. 
*/ @@ -1386,23 +1128,22 @@ void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) try { // Add vectors to trigger dirty state - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Initially, files should not exist (not yet persisted) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), "Graph file should not exist immediately after adding"); // Wait for background persistence to trigger (interval + some buffer) - Thread.sleep(1500); + await() + .atMost(ofMillis(1500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after background persistence"), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after background persistence"))); - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after background persistence"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after background persistence"); } finally { @@ -1442,10 +1183,7 @@ void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir try { // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Run concurrent searches while background persistence may be running final int numSearches = 50; @@ -1528,10 +1266,7 @@ void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exce ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Files should 
not exist yet (interval hasn't triggered) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), @@ -1578,10 +1313,7 @@ void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Ex ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Close the index (should NOT trigger persist) index.close(); @@ -1623,13 +1355,10 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception try { // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold // Wait for multiple persistence intervals - Thread.sleep(800); + Thread.sleep(500); // Files should NOT exist because change count is below threshold assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), @@ -1641,12 +1370,11 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } - // Wait for persistence to trigger - Thread.sleep(500); - - // Now files should exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist when changes exceed threshold"); + await() + .atMost(ofMillis(500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist when changes exceed threshold")); } finally { @@ -1693,14 +1421,12 @@ void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception gigaMap.addAll(documents); // Wait for persistence - Thread.sleep(800); - - // Files should exist because bulk add counted as 150 changes (> 100 threshold) - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should 
exist after bulk add exceeds threshold"); - } - finally - { + await() + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after bulk add exceeds threshold")); + } finally { index.close(); } } @@ -1745,10 +1471,7 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify search works final VectorSearchResult result = index.search(queryVector, expectedK); @@ -1779,7 +1502,6 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); // Search should still work after reload @@ -1824,10 +1546,7 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Files should not exist yet assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); @@ -1853,116 +1572,19 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD // ======================================================================== /** - * Test background optimization configuration builder. + * Test that background optimization runs after the configured interval and threshold. 
*/ @Test - void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception { + final int dimension = 32; + final Random random = new Random(42); final Path indexDir = tempDir.resolve("index"); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(120_000) - .minChangesBetweenOptimizations(500) - .optimizeOnShutdown(true) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.backgroundOptimization()); - assertEquals(120_000, config.optimizationIntervalMs()); - assertEquals(500, config.minChangesBetweenOptimizations()); - assertTrue(config.optimizeOnShutdown()); - } - - /** - * Test background optimization configuration defaults. - */ - @Test - void testBackgroundOptimizationConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background optimization should be disabled by default - assertFalse(config.backgroundOptimization()); - assertEquals(0, config.optimizationIntervalMs()); - assertEquals(1000, config.minChangesBetweenOptimizations()); - assertFalse(config.optimizeOnShutdown()); - } - - /** - * Test validation: optimizationIntervalMs must be non-negative. 
- */ - @Test - void testOptimizationIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(0) - .build(); - assertEquals(0, config.optimizationIntervalMs()); - assertFalse(config.backgroundOptimization()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenOptimizations must be non-negative. - */ - @Test - void testMinChangesBetweenOptimizationsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(-1) - .build() - ); - - // Zero should be allowed (optimize on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(0) - .build(); - assertEquals(0, config.minChangesBetweenOptimizations()); - } - - /** - * Test that background optimization runs after the configured interval and threshold. 
- */ - @Test - void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval and low threshold for testing + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval and low threshold for testing final VectorIndexConfiguration config = VectorIndexConfiguration.builder() .dimension(dimension) .similarityFunction(VectorSimilarityFunction.COSINE) @@ -1983,28 +1605,26 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Initially, optimization count should be 0 - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization count should be 0 initially"); // Add vectors to trigger dirty state above threshold - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Verify pending changes are tracked - assertTrue(defaultIndex.optimizationManager.getPendingChangeCount() > 0, + assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, "Pending changes should be tracked"); - // Wait for background optimization to run - Thread.sleep(800); - // Verify optimization was actually performed - assertTrue(defaultIndex.optimizationManager.getOptimizationCount() >= 1, - "Optimization should have been performed at least once"); + await() + .atLeast(ofMillis(300)) + .atMost(ofMillis(800)) + 
.pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, + "Optimization should have been performed at least once")); // Verify pending changes were reset - assertEquals(0, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should be reset after optimization"); // Verify search still works @@ -2051,24 +1671,21 @@ void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throw final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold // Verify pending changes are tracked - assertEquals(50, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should be 50"); // Wait for multiple optimization intervals Thread.sleep(600); // Verify optimization was NOT performed (below threshold) - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization should NOT have been performed (below threshold)"); // Verify pending changes are still tracked (not reset) - assertEquals(50, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should still be 50 (not reset)"); // Search should still work @@ -2113,18 +1730,14 @@ void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exc final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - // Add vectors - for(int i = 0; i 
< vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should equal vector count"); // Verify no optimization has run yet - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization count should be 0 before close"); // Verify search works before close @@ -2172,17 +1785,14 @@ void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws E final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should equal vector count"); // Verify no optimization has run yet - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization count should be 0 before close"); // Close the index (should NOT trigger optimize) @@ -2227,10 +1837,7 @@ void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDi try { // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Run concurrent 
searches while background optimization may be running final int numSearches = 50; @@ -2321,7 +1928,7 @@ void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) th gigaMap.addAll(documents); // Wait for optimization - Thread.sleep(800); + Thread.sleep(500); // Search should still work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); @@ -2365,10 +1972,7 @@ void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tem try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Manually trigger optimization index.optimize(); @@ -2412,18 +2016,15 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Wait for both background tasks to run Thread.sleep(1000); @@ -2442,55 +2043,367 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi } } + + // ======================================================================== + // Parallel vs Non-Parallel On-Disk Write Tests + // ======================================================================== + + /** - * Test that in-memory index can also use background optimization. + * Test that parallel and non-parallel on-disk writes both support persist-and-reload + * for a large PQ-compressed index. + * Verifies that the graph files produced by both modes can be loaded correctly + * and yield equivalent search results after restart. 
*/ @Test - void testInMemoryIndexWithBackgroundOptimization(@TempDir final Path tempDir) throws Exception + void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) throws IOException { - final int dimension = 32; - final int vectorCount = 150; + final int vectorCount = 2000; + final int dimension = 64; + final int pqSubspaces = 16; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for(int i = 0; i < vectorCount; i++) + { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel-index"); + final Path parallelStorageDir = tempDir.resolve("parallel-storage"); + final Path sequentialIndexDir = tempDir.resolve("sequential-index"); + final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); + + // --- Build and persist both modes --- + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, parallelIndexDir, parallelStorageDir, true); + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, sequentialIndexDir, sequentialStorageDir, false); + + // --- Reload both and compare search results --- + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for(final VectorSearchResult.Entry entry : result) + { + parallelIds.add(entry.entityId()); + 
parallelScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for(final VectorSearchResult.Entry entry : result) + { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + // Both modes should produce equivalent results after reload + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential modes should produce identical search results after reload"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential modes should produce identical search scores after reload"); + } + + /** + * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
+ */ + private void buildAndPersistIndex( + final List vectors , + final float[] queryVector , + final int dimension , + final int pqSubspaces , + final Path indexDir , + final Path storageDir , + final boolean parallel + ) throws IOException + { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(parallel) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + index.persistToDisk(); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + + // ======================================================================== + // Embedded Vectorizer + On-Disk Tests + // ======================================================================== + + /** + * Test that an embedded vectorizer with parallel on-disk write completes without deadlock. + *

    + * This is a regression test for a deadlock where {@code persistToDisk()} held + * {@code synchronized(parentMap)} for the entire disk write. The disk writer uses + * internal worker threads (ForkJoinPool for PQ encoding, parallel graph writer) + * that call {@code parentMap.get()} — which also synchronizes on the same monitor. + *

    + * The fix restructures locking: Phase 1 (prep) runs inside {@code synchronized(parentMap)}, + * Phase 2 (disk write) runs outside it but still holds {@code persistenceLock.writeLock()}. + *

    + * Uses {@code @Timeout} to fail fast if a deadlock occurs instead of hanging indefinitely. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + final GigaMap gigaMap = GigaMap.New(); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - // In-memory index with background optimization only final VectorIndexConfiguration config = VectorIndexConfiguration.builder() .dimension(dimension) .similarityFunction(VectorSimilarityFunction.COSINE) - .optimizationIntervalMs(200) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(true) .build(); - assertFalse(config.onDisk(), "Should be in-memory index"); - assertTrue(config.backgroundOptimization(), "Background optimization should be enabled"); + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); + + // Verify search still works after persist + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test that an embedded vectorizer with PQ compression and parallel on-disk write + * completes without deadlock. + *

    + * This is the most deadlock-prone scenario: FusedPQ encoding uses a ForkJoinPool + * that calls {@code getVector()} on worker threads, plus the parallel graph writer + * also calls {@code getVector()} from its own thread pool. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(true) + .build(); final VectorIndex index = vectorIndices.add( "embeddings", config, - new ComputedDocumentVectorizer() + new EmbeddedDocumentVectorizer() ); - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - // Wait for optimization to run - Thread.sleep(600); + // Train PQ compression + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); + + // Verify search still works + final float[] queryVector = 
randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. + */ + @Test + void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); } - finally - { - index.close(); + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + + // --- Parallel config + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(parallelIndexDir) + .parallelOnDiskWrite(true) + .build(); + + // --- Sequential config + final VectorIndex index = vectorIndices.add( + "embeddings", configParallel, new ComputedDocumentVectorizer() + ); + + final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + 
.enablePqCompression(true) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex indexSequential = vectorIndices.add( + "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + index.persistToDisk(); + indexSequential.persistToDisk(); + + //parallel + final VectorSearchResult result = index.search(queryVector, k); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + //sequential + final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); + for (final VectorSearchResult.Entry entry : resultSequential) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); } + + assertAll( + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) + ); + + // Both indices were built from the same data with the same HNSW parameters, + // so search results must be identical. 
+ assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); } + + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java new file mode 100644 index 00000000..25462827 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java @@ -0,0 +1,748 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.eclipse.store.gigamap.types.GigaMap; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for eventual indexing mode in VectorIndex. + *

    + * Eventual indexing defers HNSW graph mutations to a background thread + * while keeping vector store updates immediate. This trades immediate + * search consistency for reduced mutation latency. + */ +class VectorIndexEventualIndexingTest +{ + record Document(String content, float[] embedding) {} + + static class ComputedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + + private static float[] randomVector(final Random random, final int dimension) + { + final float[] vector = new float[dimension]; + float norm = 0; + for(int i = 0; i < dimension; i++) + { + vector[i] = random.nextFloat() * 2 - 1; + norm += vector[i] * vector[i]; + } + norm = (float)Math.sqrt(norm); + for(int i = 0; i < dimension; i++) + { + vector[i] /= norm; + } + return vector; + } + + // ==================== Basic Add / Search Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testAddAndSearchWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + // Drain queue to ensure all graph operations are applied + final 
VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + // Search should find all 3 documents + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(3, result.size()); + + // The closest match should be doc1 + assertEquals("doc1", result.toList().get(0).entity().content()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testAddAndSearchWithComputedVectorizer() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(3, result.size()); + assertEquals("doc1", result.toList().get(0).entity().content()); + } + } + + // ==================== Bulk Add Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testBulkAddWithEventualIndexing() + { + final int dimension = 64; + final int vectorCount = 100; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + 
.similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + } + + // ==================== Update Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testUpdateWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + final Document doc1 = new Document("doc1", new float[]{1.0f, 0.0f, 0.0f}); + final Document doc2 = new Document("doc2", new float[]{0.0f, 1.0f, 0.0f}); + gigaMap.add(doc1); + gigaMap.add(doc2); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + // Update doc1's vector to be close to doc2 + final Document updatedDoc1 = new Document("doc1_updated", new float[]{0.1f, 0.9f, 0.0f}); + gigaMap.set(0L, updatedDoc1); + + defaultIndex.backgroundTaskManager.drainQueue(); + + // Search for doc2-like vector: updated doc1 should now be close + final VectorSearchResult result = index.search(new float[]{0.0f, 1.0f, 0.0f}, 2); + assertEquals(2, result.size()); + } + } + + // ==================== Remove Tests 
==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testRemoveWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + // Remove doc1 + gigaMap.removeById(0L); + + defaultIndex.backgroundTaskManager.drainQueue(); + + // Search should only return 2 documents + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(2, result.size()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testRemoveMultipleWithEventualIndexing() + { + final int dimension = 64; + final int vectorCount = 50; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = 
(VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + // Remove first 10 entities + for (int i = 0; i < 10; i++) + { + gigaMap.removeById(i); + } + + defaultIndex.backgroundTaskManager.drainQueue(); + + // Should have 40 remaining + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 50 + ); + assertEquals(40, result.size()); + } + } + + // ==================== RemoveAll Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testRemoveAllDiscardsQueueAndResets() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + // RemoveAll — this discards pending operations and shuts down manager + gigaMap.removeAll(); + + // Index should be empty + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + assertEquals(0, result.size()); + + // Add new data after removeAll — indexing manager is recreated by initializeIndex + gigaMap.add(new Document("new_doc", new float[]{1.0f, 0.0f, 0.0f})); + + // Drain the new indexing manager + defaultIndex.backgroundTaskManager.drainQueue(); + + final VectorSearchResult result2 = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + assertEquals(1, result2.size()); + } + } + + // ==================== Optimize Drains Queue Tests ==================== + + @Test + 
@Timeout(value = 30, unit = TimeUnit.SECONDS) + void testOptimizeDrainsQueueBeforeCleanup() + { + final int dimension = 64; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < 50; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Optimize should drain the queue first, then run cleanup + index.optimize(); + + // After optimize, all nodes should be searchable + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + } + + // ==================== PersistToDisk Drains Queue Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testPersistToDiskDrainsQueueBeforeWrite(@TempDir final Path tempDir) + { + final int dimension = 64; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < 50; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // PersistToDisk should drain the queue 
first + index.persistToDisk(); + + // Verify files were created + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + // Search should work after persist + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + } + + // ==================== Close Drains Queue Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testCloseDrainsPendingOperations() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + // Close should drain pending operations without error + index.close(); + + // No assertion needed — if close() deadlocks or throws, the @Timeout will catch it + } + + // ==================== Pending Count Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testPendingCountTracksQueuedOperations() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Initially 
empty + assertEquals(0, defaultIndex.backgroundTaskManager.getPendingIndexingCount()); + + // After drain, count should be 0 + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + defaultIndex.backgroundTaskManager.drainQueue(); + + assertEquals(0, defaultIndex.backgroundTaskManager.getPendingIndexingCount()); + } + } + + // ==================== Large Data Set Tests ==================== + + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testLargeDataSetWithEventualIndexing() + { + final int dimension = 128; + final int vectorCount = 500; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + // Add random vectors + for (int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + + // Search should return correct number of results + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + + // All results should have valid scores + for (final VectorSearchResult.Entry entry : result) + { + assertTrue(entry.score() > 0, "Score should be positive"); + assertNotNull(entry.entity()); + } + } + } + + // ==================== On-Disk with Eventual Indexing ==================== + + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void 
testOnDiskWithEventualIndexing(@TempDir final Path tempDir) + { + final int dimension = 64; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Persist triggers drain + index.persistToDisk(); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + } + + // ==================== Background Persistence + Eventual Indexing ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testBackgroundPersistenceWithEventualIndexing(@TempDir final Path tempDir) throws Exception + { + final int dimension = 64; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(500) + .minChangesBetweenPersists(1) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new 
ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < 20; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Wait for background persistence (which should drain first) + Thread.sleep(1500); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + } + } + + // ==================== Disabled by Default Tests ==================== + + @Test + void testEventualIndexingDisabledByDefault() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Background task manager should be null when no background features are enabled + assertNull(defaultIndex.backgroundTaskManager); + + // Synchronous indexing should still work + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + assertEquals(1, result.size()); + } + } + + // ==================== Combined Operations Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testAddUpdateRemoveSequenceWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = 
(VectorIndex.Default) index; + + // Add 3 documents + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + defaultIndex.backgroundTaskManager.drainQueue(); + + // Update doc2 + gigaMap.set(1L, new Document("doc2_updated", new float[]{0.9f, 0.1f, 0.0f})); + + defaultIndex.backgroundTaskManager.drainQueue(); + + // Remove doc3 + gigaMap.removeById(2L); + + defaultIndex.backgroundTaskManager.drainQueue(); + + // Search: should find 2 documents + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(2, result.size()); + + // doc1 should be closest to [1,0,0], followed by updated doc2 [0.9,0.1,0] + assertEquals("doc1", result.toList().get(0).entity().content()); + assertEquals("doc2_updated", result.toList().get(1).entity().content()); + } + } + + // ==================== Background Optimization + Eventual Indexing ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testBackgroundOptimizationWithEventualIndexing() throws Exception + { + final int dimension = 64; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(10) + .eventualIndexing(true) + .build(); + + try (final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + )) + { + for (int i = 0; i < 50; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); + 
+ // Wait for background optimization to run + Thread.sleep(800); + + // Optimization should have run at least once + assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1); + + // Search should still work + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + } +} diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java index 998b55fe..7578e8c0 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java @@ -17,7 +17,9 @@ import org.eclipse.store.gigamap.types.GigaMap; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -58,6 +60,18 @@ float[] embedding() * Computed vectorizer for performance tests. */ static class DocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + /** + * Embedded vectorizer for performance tests - vectors stored in entity, not separately. + */ + static class EmbeddedDocumentVectorizer extends Vectorizer { @Override public float[] vectorize(final Document entity) @@ -356,4 +370,669 @@ void testPerformanceWithDifferentConfigurations() System.out.println("=== Configuration Comparison Complete ==="); } + + /** + * Performance test comparing parallel vs non-parallel on-disk write speed. + *
    + * <p>
    + * Measures the time taken by {@code persistToDisk()} for both modes with + * PQ compression enabled (the primary target of the parallel writer) and + * without PQ compression. + *
    + * <p>
    + * Increase {@code vectorCount} for more meaningful results (e.g., 100_000+). + */ + @Test + void testParallelVsNonParallelOnDiskWritePerformance(@TempDir final Path tempDir) + { + final int vectorCount = 10_000; + final int dimension = 128; + final int pqSubspaces = 32; + final int iterations = 3; + + System.err.println("=== Parallel vs Non-Parallel On-Disk Write Performance ==="); + System.err.println("Vector count: " + vectorCount); + System.err.println("Dimension: " + dimension); + System.err.println("Available processors: " + Runtime.getRuntime().availableProcessors()); + System.err.println(); + + // Pre-generate vectors + System.err.print("Generating vectors... "); + final Random random = new Random(42); + final List documents = new ArrayList<>(vectorCount); + for(int i = 0; i < vectorCount; i++) + { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + System.err.println("done."); + + // ========== WITHOUT PQ COMPRESSION ========== + System.err.println(); + System.err.println("--- Without PQ Compression ---"); + + final long[] noPqParallelTimes = new long[iterations]; + final long[] noPqSequentialTimes = new long[iterations]; + + for(int i = 0; i < iterations; i++) + { + noPqParallelTimes[i] = this.measurePersist( + tempDir.resolve("nopq-par-" + i), documents, dimension, false, 0, true + ); + noPqSequentialTimes[i] = this.measurePersist( + tempDir.resolve("nopq-seq-" + i), documents, dimension, false, 0, false + ); + } + + printComparisonResults("Without PQ", noPqParallelTimes, noPqSequentialTimes); + + // ========== WITH PQ COMPRESSION ========== + System.err.println(); + System.err.println("--- With PQ Compression (FusedPQ writer path) ---"); + + final long[] pqParallelTimes = new long[iterations]; + final long[] pqSequentialTimes = new long[iterations]; + + for(int i = 0; i < iterations; i++) + { + pqParallelTimes[i] = this.measurePersist( + tempDir.resolve("pq-par-" + i), documents, dimension, true, pqSubspaces, true + ); 
+ pqSequentialTimes[i] = this.measurePersist( + tempDir.resolve("pq-seq-" + i), documents, dimension, true, pqSubspaces, false + ); + } + + printComparisonResults("With PQ", pqParallelTimes, pqSequentialTimes); + + System.err.println(); + System.err.println("=== Performance Comparison Complete ==="); + } + + /** + * Creates an index, populates it, persists to disk, and returns the persist duration. + * All resources are properly closed before returning. + */ + private long measurePersist( + final Path indexDir , + final List documents, + final int dimension, + final boolean enablePq , + final int pqSubspaces, + final boolean parallel + ) + { + final String mode = parallel ? "parallel" : "sequential"; + final String pq = enablePq ? "pq" : "nopq"; + + System.err.printf(" [%s/%s] creating index... ", pq, mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration.Builder configBuilder = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(enablePq ? 32 : 16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(parallel); + + if(enablePq) + { + configBuilder + .enablePqCompression(true) + .pqSubspaces(pqSubspaces); + } + + try(final VectorIndex index = vectorIndices.add( + "embeddings", configBuilder.build(), new DocumentVectorizer() + )) + { + System.err.print("populating... "); + gigaMap.addAll(documents); + + if(enablePq) + { + System.err.print("training PQ... "); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + } + + System.err.print("persisting... 
"); + + final long start = System.nanoTime(); + index.persistToDisk(); + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms%n", elapsedMs); + + return elapsedMs; + } + } + + /** + * Performance test comparing parallel vs non-parallel on-disk write with embedded vectorizer. + *
    + * <p>
    + * This variant uses {@code isEmbedded()=true}, meaning vectors are fetched from entities + * via the parentMap during disk write. This is the scenario most prone to deadlock + * when the parentMap monitor is held during the write phase. + */ + @Test + void testParallelOnDiskWriteWithEmbeddedVectorizer(@TempDir final Path tempDir) + { + final int vectorCount = 100_000; + final int dimension = 128; + final int iterations = 3; + + System.err.println("=== Embedded Vectorizer: Parallel On-Disk Write Performance ==="); + System.err.println("Vector count: " + vectorCount); + System.err.println("Dimension: " + dimension); + System.err.println("Available processors: " + Runtime.getRuntime().availableProcessors()); + System.err.println(); + + // Pre-generate vectors + System.err.print("Generating vectors... "); + final Random random = new Random(42); + final List documents = new ArrayList<>(vectorCount); + for(int i = 0; i < vectorCount; i++) + { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + System.err.println("done."); + + System.err.println(); + System.err.println("--- Embedded Vectorizer (no PQ) ---"); + + final long[] parallelTimes = new long[iterations]; + final long[] sequentialTimes = new long[iterations]; + + for(int i = 0; i < iterations; i++) + { + parallelTimes[i] = this.measurePersistEmbedded( + tempDir.resolve("emb-par-" + i), documents, dimension, true + ); + sequentialTimes[i] = this.measurePersistEmbedded( + tempDir.resolve("emb-seq-" + i), documents, dimension, false + ); + } + + printComparisonResults("Embedded Vectorizer", parallelTimes, sequentialTimes); + + System.err.println(); + System.err.println("=== Embedded Vectorizer Performance Complete ==="); + } + + /** + * Creates an index with embedded vectorizer, populates it, persists to disk, + * and returns the persist duration. 
+ */ + private long measurePersistEmbedded( + final Path indexDir , + final List documents, + final int dimension, + final boolean parallel + ) + { + final String mode = parallel ? "parallel" : "sequential"; + + System.err.printf(" [embedded/%s] creating index... ", mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(parallel) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + System.err.print("populating... "); + gigaMap.addAll(documents); + + System.err.print("persisting... "); + + final long start = System.nanoTime(); + index.persistToDisk(); + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms%n", elapsedMs); + + return elapsedMs; + } + } + + /** + * Prints a comparison summary for parallel vs sequential persist times. 
+ */ + private static void printComparisonResults( + final String label , + final long[] parallelTimes , + final long[] sequentialTimes + ) + { + final long parallelAvg = average(parallelTimes); + final long parallelMin = min(parallelTimes); + final long parallelMax = max(parallelTimes); + final long sequentialAvg = average(sequentialTimes); + final long sequentialMin = min(sequentialTimes); + final long sequentialMax = max(sequentialTimes); + + System.err.println(); + System.err.println("=== " + label + " Results ==="); + System.err.printf(" Parallel: avg=%,d ms min=%,d ms max=%,d ms%n", + parallelAvg, parallelMin, parallelMax); + System.err.printf(" Sequential: avg=%,d ms min=%,d ms max=%,d ms%n", + sequentialAvg, sequentialMin, sequentialMax); + + if(sequentialAvg > 0 && parallelAvg > 0) + { + final double speedup = (double) sequentialAvg / parallelAvg; + System.err.printf(" Speedup: %.2fx%n", speedup); + } + } + + private static long average(final long[] values) + { + long sum = 0; + for(final long v : values) + { + sum += v; + } + return sum / values.length; + } + + private static long min(final long[] values) + { + long result = Long.MAX_VALUE; + for(final long v : values) + { + if(v < result) result = v; + } + return result; + } + + private static long max(final long[] values) + { + long result = Long.MIN_VALUE; + for(final long v : values) + { + if(v > result) result = v; + } + return result; + } + + + // ==================== Eventual Indexing Performance ==================== + + /** + * Performance test comparing mass insertion with and without eventual indexing. + *
    + * <p>
    + * Eventual indexing defers HNSW graph construction to a background thread, + * so the caller-visible insertion time should be significantly lower since + * it only pays for vectorStore update + queue enqueue instead of the + * expensive {@code addGraphNode()} call. + *
    + * <p>
    + * Both modes are measured with: + *

    + * <ul>
    + * <li>Single-entity adds via {@code gigaMap.add()}</li>
    + * <li>Batch adds via {@code gigaMap.addAll()}</li>
    + * </ul>
    + * <p>
    + * After insertion, eventual mode is drained and both indices are verified + * for search quality (recall) to confirm the deferred graph is correct. + */ + @Test + void testEventualVsSynchronousInsertionPerformance() + { + final int vectorCount = 10_000; + final int dimension = 128; + final int searchIterations = 200; + final int k = 10; + final int batchSize = 1_000; + final int iterations = 3; + + System.err.println("=== Eventual vs. Synchronous Indexing Performance ==="); + System.err.println("Vector count: " + vectorCount); + System.err.println("Dimension: " + dimension); + System.err.println("Batch size: " + batchSize); + System.err.println("Iterations: " + iterations); + System.err.println(); + + // Pre-generate all vectors for fair comparison + System.err.print("Generating vectors... "); + final Random random = new Random(42); + final List documents = new ArrayList<>(vectorCount); + for(int i = 0; i < vectorCount; i++) + { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + System.err.println("done."); + + // Pre-generate query vectors + final float[][] queryVectors = new float[searchIterations][]; + final Random queryRandom = new Random(999); + for(int i = 0; i < searchIterations; i++) + { + queryVectors[i] = randomVector(queryRandom, dimension); + } + + // ========== SINGLE ADD: synchronous vs. 
eventual ========== + System.err.println(); + System.err.println("--- Single Add (gigaMap.add) ---"); + + final long[] syncSingleTimes = new long[iterations]; + final long[] eventualSingleTimes = new long[iterations]; + final long[] eventualSingleDrainTimes = new long[iterations]; + + for(int iter = 0; iter < iterations; iter++) + { + // Synchronous + syncSingleTimes[iter] = this.measureSingleAdd(documents, dimension, false); + // Eventual + final long[] eventualResult = this.measureSingleAddEventual(documents, dimension); + eventualSingleTimes[iter] = eventualResult[0]; + eventualSingleDrainTimes[iter] = eventualResult[1]; + } + + System.err.println(); + System.err.println(" Single Add Results:"); + System.err.printf(" Synchronous: avg=%,d ms min=%,d ms max=%,d ms%n", + average(syncSingleTimes), min(syncSingleTimes), max(syncSingleTimes)); + System.err.printf(" Eventual (add): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualSingleTimes), min(eventualSingleTimes), max(eventualSingleTimes)); + System.err.printf(" Eventual (drain): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualSingleDrainTimes), min(eventualSingleDrainTimes), max(eventualSingleDrainTimes)); + + if(average(syncSingleTimes) > 0 && average(eventualSingleTimes) > 0) + { + System.err.printf(" Caller-visible speedup: %.2fx%n", + (double)average(syncSingleTimes) / average(eventualSingleTimes)); + System.err.printf(" Total (add+drain) vs sync: %.2fx%n", + (double)average(syncSingleTimes) / + (average(eventualSingleTimes) + average(eventualSingleDrainTimes))); + } + + // ========== BATCH ADD: synchronous vs. 
eventual ========== + System.err.println(); + System.err.println("--- Batch Add (gigaMap.addAll, batch=" + batchSize + ") ---"); + + final long[] syncBatchTimes = new long[iterations]; + final long[] eventualBatchTimes = new long[iterations]; + final long[] eventualBatchDrainTimes = new long[iterations]; + + for(int iter = 0; iter < iterations; iter++) + { + // Synchronous + syncBatchTimes[iter] = this.measureBatchAdd(documents, dimension, batchSize, false); + // Eventual + final long[] eventualResult = this.measureBatchAddEventual(documents, dimension, batchSize); + eventualBatchTimes[iter] = eventualResult[0]; + eventualBatchDrainTimes[iter] = eventualResult[1]; + } + + System.err.println(); + System.err.println(" Batch Add Results:"); + System.err.printf(" Synchronous: avg=%,d ms min=%,d ms max=%,d ms%n", + average(syncBatchTimes), min(syncBatchTimes), max(syncBatchTimes)); + System.err.printf(" Eventual (add): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualBatchTimes), min(eventualBatchTimes), max(eventualBatchTimes)); + System.err.printf(" Eventual (drain): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualBatchDrainTimes), min(eventualBatchDrainTimes), max(eventualBatchDrainTimes)); + + if(average(syncBatchTimes) > 0 && average(eventualBatchTimes) > 0) + { + System.err.printf(" Caller-visible speedup: %.2fx%n", + (double)average(syncBatchTimes) / average(eventualBatchTimes)); + System.err.printf(" Total (add+drain) vs sync: %.2fx%n", + (double)average(syncBatchTimes) / + (average(eventualBatchTimes) + average(eventualBatchDrainTimes))); + } + + // ========== SEARCH QUALITY VERIFICATION ========== + System.err.println(); + System.err.println("--- Search Quality Verification ---"); + + this.verifySearchQuality(documents, dimension, queryVectors, k, false, "Synchronous"); + this.verifySearchQuality(documents, dimension, queryVectors, k, true, "Eventual"); + + System.err.println(); + System.err.println("=== Eventual Indexing Performance Complete 
==="); + } + + /** + * Measures single-entity add time (synchronous). + */ + private long measureSingleAdd( + final List documents, + final int dimension, + final boolean eventual + ) + { + final String mode = eventual ? "eventual" : "sync"; + System.err.printf(" [single/%s] ", mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(eventual) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final long start = System.nanoTime(); + for(final Document doc : documents) + { + gigaMap.add(doc); + } + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms (%,.0f vec/sec)%n", + elapsedMs, documents.size() / (elapsedMs / 1000.0)); + + return elapsedMs; + } + } + + /** + * Measures single-entity add time (eventual). Returns [addTime, drainTime]. 
+ */ + private long[] measureSingleAddEventual( + final List documents, + final int dimension + ) + { + System.err.print(" [single/eventual] "); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(true) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + final long addStart = System.nanoTime(); + for(final Document doc : documents) + { + gigaMap.add(doc); + } + final long addMs = (System.nanoTime() - addStart) / 1_000_000; + + final long drainStart = System.nanoTime(); + defaultIndex.backgroundTaskManager.drainQueue(); + final long drainMs = (System.nanoTime() - drainStart) / 1_000_000; + + System.err.printf("add=%,d ms drain=%,d ms total=%,d ms (%,.0f vec/sec add-visible)%n", + addMs, drainMs, addMs + drainMs, + documents.size() / (addMs / 1000.0)); + + return new long[]{addMs, drainMs}; + } + } + + /** + * Measures batch add time (synchronous or eventual). + */ + private long measureBatchAdd( + final List documents, + final int dimension, + final int batchSize, + final boolean eventual + ) + { + final String mode = eventual ? 
"eventual" : "sync"; + System.err.printf(" [batch/%s] ", mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(eventual) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final long start = System.nanoTime(); + for(int i = 0; i < documents.size(); i += batchSize) + { + final int end = Math.min(i + batchSize, documents.size()); + gigaMap.addAll(documents.subList(i, end)); + } + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms (%,.0f vec/sec)%n", + elapsedMs, documents.size() / (elapsedMs / 1000.0)); + + return elapsedMs; + } + } + + /** + * Measures batch add time (eventual). Returns [addTime, drainTime]. 
+ */ + private long[] measureBatchAddEventual( + final List documents, + final int dimension, + final int batchSize + ) + { + System.err.print(" [batch/eventual] "); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(true) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + final long addStart = System.nanoTime(); + for(int i = 0; i < documents.size(); i += batchSize) + { + final int end = Math.min(i + batchSize, documents.size()); + gigaMap.addAll(documents.subList(i, end)); + } + final long addMs = (System.nanoTime() - addStart) / 1_000_000; + + final long drainStart = System.nanoTime(); + defaultIndex.backgroundTaskManager.drainQueue(); + final long drainMs = (System.nanoTime() - drainStart) / 1_000_000; + + System.err.printf("add=%,d ms drain=%,d ms total=%,d ms (%,.0f vec/sec add-visible)%n", + addMs, drainMs, addMs + drainMs, + documents.size() / (addMs / 1000.0)); + + return new long[]{addMs, drainMs}; + } + } + + /** + * Verifies search quality (recall) for a given mode, to confirm eventual + * indexing produces the same graph quality as synchronous indexing. 
+ */ + private void verifySearchQuality( + final List documents, + final int dimension, + final float[][] queryVectors, + final int k, + final boolean eventual, + final String label + ) + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(eventual) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + gigaMap.addAll(documents); + + if(eventual) + { + ((VectorIndex.Default)index).backgroundTaskManager.drainQueue(); + } + + int totalResults = 0; + int fullResults = 0; + for(final float[] query : queryVectors) + { + final VectorSearchResult result = index.search(query, k); + totalResults++; + if(result.size() == k) + { + fullResults++; + } + } + + System.err.printf(" %s: %d/%d queries returned full %d results (%.1f%%)%n", + label, fullResults, totalResults, k, + 100.0 * fullResults / totalResults); + } + } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java new file mode 100644 index 00000000..5a6351f6 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java @@ -0,0 +1,379 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.eclipse.store.gigamap.types.GigaMap; +import 
org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for {@link VectorIndices}. + *

    + * Tests the core functionality of vector index management: + * - Index registration and retrieval + * - Index name validation + * - Lifecycle management + */ +class VectorIndicesTest +{ + record Document(String content, float[] embedding) {} + + static class DocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + @Test + void testAddIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("test-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("test-index", index.name()); + } + + @Test + void testAddDuplicateIndexThrows() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("duplicate", config, new DocumentVectorizer()); + + assertThrows(RuntimeException.class, () -> + vectorIndices.add("duplicate", config, new DocumentVectorizer()) + ); + } + + @Test + void testGetExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex created = vectorIndices.add("my-index", config, new DocumentVectorizer()); + final VectorIndex retrieved = vectorIndices.get("my-index"); + + assertSame(created, retrieved); + } + + @Test + 
void testGetNonExistentIndexReturnsNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + assertNull(vectorIndices.get("non-existent")); + } + + @Test + void testEnsureCreatesNewIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.ensure("new-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("new-index", index.name()); + } + + @Test + void testEnsureReturnsExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex first = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + final VectorIndex second = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + + assertSame(first, second); + } + + @Test + void testValidateIndexNameNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(null, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameEmpty() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final 
VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithSlash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid/name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithBackslash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid\\name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameTooLong() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final String tooLong = "a".repeat(201); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(tooLong, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithValidCharacters() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertDoesNotThrow(() -> + vectorIndices.add("valid-index_name.123", config, new DocumentVectorizer()) + ); + } + + @Test + void testInternalAddPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(1, result1.size()); + assertEquals(1, result2.size()); + } + + @Test + void testInternalRemovePropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + gigaMap.removeById(0); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final 
VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(0, result1.size()); + assertEquals(0, result2.size()); + } + + @Test + void testInternalRemoveAllPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + + gigaMap.add(new Document("test1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("test2", new float[]{0.0f, 1.0f, 0.0f})); + + gigaMap.removeAll(); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorSearchResult result = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(0, result.size()); + } + + @Test + void testIterateIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + vectorIndices.add("index3", config, new DocumentVectorizer()); + + final int[] count = {0}; + vectorIndices.iterate(index -> count[0]++); + + assertEquals(3, count[0]); + } + + @Test + void testAccessIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", 
config, new DocumentVectorizer()); + + vectorIndices.accessIndices(table -> { + assertNotNull(table.get("index1")); + assertNotNull(table.get("index2")); + assertNull(table.get("non-existent")); + }); + } + + @Test + void testIndexAutoPopulatesExistingEntities() + { + final GigaMap gigaMap = GigaMap.New(); + + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("new-index", config, new DocumentVectorizer()); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(2, result.size(), "Index should auto-populate with existing entities"); + } +} +