From b400b3b29b717c6a4d476fc8c4733928ded818a5 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Tue, 17 Feb 2026 12:10:28 +0100 Subject: [PATCH 01/15] Bump jvector version to 4.0.0-rc.8 in pom.xml --- gigamap/jvector/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gigamap/jvector/pom.xml b/gigamap/jvector/pom.xml index 3da2236f..00f7e8f3 100644 --- a/gigamap/jvector/pom.xml +++ b/gigamap/jvector/pom.xml @@ -19,7 +19,7 @@ https://projects.eclipse.org/projects/technology.store - 4.0.0-rc.7 + 4.0.0-rc.8 From 183aaece091d309b4f9db5c14212776d839584bf Mon Sep 17 00:00:00 2001 From: fh-ms Date: Tue, 17 Feb 2026 16:38:25 +0100 Subject: [PATCH 02/15] Add parallel on-disk write support to VectorIndex --- .../gigamap/jvector/DiskIndexManager.java | 91 +-- .../store/gigamap/jvector/VectorIndex.java | 148 +++-- .../jvector/VectorIndexConfiguration.java | 52 +- .../jvector/VectorIndexConfigurationTest.java | 127 +++++ .../gigamap/jvector/VectorIndexDiskTest.java | 538 ++++++++++++++++++ .../jvector/VectorIndexPerformanceTest.java | 310 ++++++++++ 6 files changed, 1181 insertions(+), 85 deletions(-) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java index 5a2a915c..2abe6f62 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/DiskIndexManager.java @@ -20,6 +20,7 @@ import io.github.jbellis.jvector.graph.RandomAccessVectorValues; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter; +import io.github.jbellis.jvector.graph.disk.OnDiskParallelGraphIndexWriter; import io.github.jbellis.jvector.graph.disk.feature.Feature; import io.github.jbellis.jvector.graph.disk.feature.FeatureId; import io.github.jbellis.jvector.graph.disk.feature.FusedPQ; 
@@ -126,29 +127,32 @@ public static class Default implements DiskIndexManager { private static final Logger LOG = LoggerFactory.getLogger(DiskIndexManager.class); - private final IndexStateProvider provider ; - private final String name ; - private final Path indexDirectory; - private final int dimension ; - private final int maxDegree ; + private final IndexStateProvider provider ; + private final String name ; + private final Path indexDirectory ; + private final int dimension ; + private final int maxDegree ; + private final boolean parallelOnDiskWrite ; private OnDiskGraphIndex diskIndex ; private ReaderSupplier readerSupplier; private boolean loaded ; Default( - final IndexStateProvider provider , - final String name , - final Path indexDirectory, - final int dimension , - final int maxDegree + final IndexStateProvider provider , + final String name , + final Path indexDirectory , + final int dimension , + final int maxDegree , + final boolean parallelOnDiskWrite ) { - this.provider = provider ; - this.name = name ; - this.indexDirectory = indexDirectory; - this.dimension = dimension ; - this.maxDegree = maxDegree ; + this.provider = provider ; + this.name = name ; + this.indexDirectory = indexDirectory ; + this.dimension = dimension ; + this.maxDegree = maxDegree ; + this.parallelOnDiskWrite = parallelOnDiskWrite ; } @Override @@ -287,33 +291,46 @@ private void writeIndexWithFusedPQ( final InlineVectors inlineVectors = new InlineVectors(this.dimension); final FusedPQ fusedPQ = new FusedPQ(this.maxDegree, pq); - // Build writer with features using sequential renumbering (identity mapping) - try(final OnDiskGraphIndexWriter writer = new OnDiskGraphIndexWriter.Builder(index, graphPath) - .with(inlineVectors) - .with(fusedPQ) - .build()) - { - // Create feature suppliers that provide feature state for each node - final Map> suppliers = new EnumMap<>(FeatureId.class); - - suppliers.put(FeatureId.INLINE_VECTORS, nodeId -> - new 
InlineVectors.State(ravv.getVector(nodeId)) - ); + // Create feature suppliers that provide feature state for each node + final Map> suppliers = new EnumMap<>(FeatureId.class); - // Get a view for FusedPQ state creation - final var view = index.getView(); - suppliers.put(FeatureId.FUSED_PQ, nodeId -> - new FusedPQ.State(view, pqVectors, nodeId) - ); + suppliers.put(FeatureId.INLINE_VECTORS, nodeId -> + new InlineVectors.State(ravv.getVector(nodeId)) + ); - // Write with sequential renumbering (maintains ordinals) - writer.write(suppliers); + // Get a view for FusedPQ state creation + final var view = index.getView(); + suppliers.put(FeatureId.FUSED_PQ, nodeId -> + new FusedPQ.State(view, pqVectors, nodeId) + ); - // Close the view after writing - view.close(); + if(this.parallelOnDiskWrite) + { + try(final OnDiskParallelGraphIndexWriter writer = new OnDiskParallelGraphIndexWriter.Builder(index, graphPath) + .withParallelDirectBuffers(true) + .with(inlineVectors) + .with(fusedPQ) + .build()) + { + writer.write(suppliers); + } } + else + { + try(final OnDiskGraphIndexWriter writer = new OnDiskGraphIndexWriter.Builder(index, graphPath) + .with(inlineVectors) + .with(fusedPQ) + .build()) + { + writer.write(suppliers); + } + } + + // Close the view after writing + view.close(); - LOG.info("Wrote index '{}' with FusedPQ compression ({} nodes)", this.name, index.size(0)); + LOG.info("Wrote index '{}' with FusedPQ compression ({} nodes, parallel={})", + this.name, index.size(0), this.parallelOnDiskWrite); } /** diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index bf143022..ee11d906 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -782,7 +782,8 @@ private void initializeIndex() this.name, 
this.configuration.indexDirectory(), this.configuration.dimension(), - this.configuration.maxDegree() + this.configuration.maxDegree(), + this.configuration.parallelOnDiskWrite() ); if(this.diskManager.tryLoad()) { @@ -1205,29 +1206,40 @@ public void internalRemove(final long entityId, final E entity) @Override public void internalRemoveAll() { - synchronized(this.parentMap()) + // Acquire write lock to ensure no concurrent persistToDisk() Phase 2 is running. + // closeInternalResources() destroys the graph and disk manager, which would + // corrupt a write in progress. + this.persistenceLock.writeLock().lock(); + try { - this.ensureIndexInitialized(); - - if(!this.isEmbedded()) + synchronized(this.parentMap()) { - this.vectorStore.removeAll(); - } + this.ensureIndexInitialized(); - // Shutdown optimization manager before closing - this.shutdownOptimizationManager(false); + if(!this.isEmbedded()) + { + this.vectorStore.removeAll(); + } - // Shutdown persistence manager before closing - this.shutdownPersistenceManager(false); + // Shutdown optimization manager before closing + this.shutdownOptimizationManager(false); - this.closeInternalResources(); + // Shutdown persistence manager before closing + this.shutdownPersistenceManager(false); - // Reinitialize the index (this will also restart background managers if configured) - this.initializeIndex(); - this.markStateChangeChildren(); + this.closeInternalResources(); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); + // Reinitialize the index (this will also restart background managers if configured) + this.initializeIndex(); + this.markStateChangeChildren(); + + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); + } + } + finally + { + this.persistenceLock.writeLock().unlock(); } } @@ -1359,13 +1371,21 @@ private VectorSearchResult convertSearchResult(final SearchResult result) @Override public void optimize() { + final GraphIndexBuilder capturedBuilder; 
synchronized(this.parentMap()) { this.ensureIndexInitialized(); - if(this.builder != null) - { - this.builder.cleanup(); - } + capturedBuilder = this.builder; + } + // cleanup() uses ForkJoinPool internally — must be outside + // synchronized(parentMap) to avoid deadlock with embedded vectorizers + // whose worker threads call parentMap.get(). + if(capturedBuilder != null) + { + capturedBuilder.cleanup(); + } + synchronized(this.parentMap()) + { this.markStateChangeChildren(); } } @@ -1378,39 +1398,64 @@ public void persistToDisk() return; // No-op for in-memory indices } - // Acquire write lock for exclusive access during persistence + // Acquire write lock for exclusive access during persistence. + // This blocks searches and other persist/removeAll/close calls. this.persistenceLock.writeLock().lock(); try { + // Captured references for Phase 2 (disk write outside synchronized block) + final OnHeapGraphIndex capturedIndex ; + final RandomAccessVectorValues capturedRavv ; + final PQCompressionManager capturedPqMgr ; + final DiskIndexManager capturedDiskMgr; + + final GraphIndexBuilder capturedBuilder; + + // Phase 1: Exclusive prep inside synchronized(parentMap). + // Disk manager init and reference capture. 
synchronized(this.parentMap()) { this.ensureIndexInitialized(); - // If we have an in-memory builder, write it to disk - if(this.builder != null && this.index != null) + // If we have an in-memory builder, prepare for disk write + if(this.builder == null || this.index == null) { - // Cleanup the graph before writing (removes excess neighbors) - this.builder.cleanup(); - - // Initialize disk manager if needed - if(this.diskManager == null) - { - this.diskManager = new DiskIndexManager.Default( - this, - this.name, - this.configuration.indexDirectory(), - this.configuration.dimension(), - this.configuration.maxDegree() - ); - } - - // Create vector values for writing - final RandomAccessVectorValues ravv = this.createVectorValues(); - - // Write using disk manager - this.diskManager.writeIndex(this.index, ravv, this.pqManager); + return; } + + // Initialize disk manager if needed + if(this.diskManager == null) + { + this.diskManager = new DiskIndexManager.Default( + this, + this.name, + this.configuration.indexDirectory(), + this.configuration.dimension(), + this.configuration.maxDegree(), + this.configuration.parallelOnDiskWrite() + ); + } + + // Capture references for use outside the synchronized block. + // The parentMap monitor is released before cleanup and disk write + // so that worker threads (ForkJoinPool in cleanup and disk writer) + // can freely call parentMap.get() without deadlocking. + capturedBuilder = this.builder; + capturedIndex = this.index; + capturedRavv = new NullSafeVectorValues( + this.createVectorValues(), this.configuration.dimension(), this.vectorTypeSupport + ); + capturedPqMgr = this.pqManager; + capturedDiskMgr = this.diskManager; } + + // Phase 2: Cleanup and disk write outside synchronized(parentMap). + // persistenceLock.writeLock() is still held, blocking searches, + // removeAll, and close. 
But parentMap monitor is released, so + // worker threads (ForkJoinPool in cleanup, disk writer threads) + // can call parentMap.get() for embedded vectors. + capturedBuilder.cleanup(); + capturedDiskMgr.writeIndex(capturedIndex, capturedRavv, capturedPqMgr); } catch(final IOException ioe) { @@ -1465,9 +1510,19 @@ public void close() // Shutdown persistence manager second (may persist pending changes) this.shutdownPersistenceManager(this.configuration.persistOnShutdown()); - synchronized(this.parentMap()) + // Acquire write lock to ensure no concurrent persistToDisk() Phase 2 is running. + // closeInternalResources() destroys the graph and disk manager. + this.persistenceLock.writeLock().lock(); + try { - this.closeInternalResources(); + synchronized(this.parentMap()) + { + this.closeInternalResources(); + } + } + finally + { + this.persistenceLock.writeLock().unlock(); } } @@ -1616,6 +1671,7 @@ public long getExpectedVectorCount() } } + } } diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java index fa292a8c..7de93834 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java @@ -448,6 +448,23 @@ public default boolean backgroundOptimization() */ public boolean optimizeOnShutdown(); + /** + * Returns whether parallel writing is used for on-disk index persistence. + *
<p>
+ * When enabled, the on-disk graph writer uses parallel direct buffers and + * multiple worker threads (one per available processor) to write the index + * concurrently. This significantly speeds up persistence for large indices. + *
<p>
+ * When disabled, a sequential single-threaded writer is used, which may be + * preferable in resource-constrained environments or when writing smaller indices. + *
<p>
+ * Only applies when {@link #onDisk()} is true. + * + * @return true if parallel on-disk writing is enabled (default: true) + * @see #onDisk() + */ + public boolean parallelOnDiskWrite(); + /** * Creates a new builder for constructing a {@link VectorIndexConfiguration}. @@ -904,6 +921,18 @@ public static interface Builder */ public Builder optimizeOnShutdown(boolean optimizeOnShutdown); + /** + * Enables or disables parallel writing for on-disk index persistence. + *
<p>
+ * When enabled, uses multiple worker threads and parallel direct buffers + * for faster disk writes. Only applies when {@link #onDisk(boolean)} is true. + * + * @param parallelOnDiskWrite true to enable parallel on-disk writing + * @return this builder for method chaining + * @see VectorIndexConfiguration#parallelOnDiskWrite() + */ + public Builder parallelOnDiskWrite(boolean parallelOnDiskWrite); + /** * Builds the configuration with the specified parameters. * @@ -943,6 +972,7 @@ public static class Default implements Builder private long optimizationIntervalMs ; private int minChangesBetweenOptimizations; private boolean optimizeOnShutdown ; + private boolean parallelOnDiskWrite ; Default() { @@ -962,6 +992,7 @@ public static class Default implements Builder this.optimizationIntervalMs = 0; // 0 = disabled this.minChangesBetweenOptimizations = 1000; this.optimizeOnShutdown = false; + this.parallelOnDiskWrite = true; } @Override @@ -1096,6 +1127,13 @@ public Builder optimizeOnShutdown(final boolean optimizeOnShutdown) return this; } + @Override + public Builder parallelOnDiskWrite(final boolean parallelOnDiskWrite) + { + this.parallelOnDiskWrite = parallelOnDiskWrite; + return this; + } + @Override public VectorIndexConfiguration build() { @@ -1143,7 +1181,8 @@ public VectorIndexConfiguration build() this.minChangesBetweenPersists, this.optimizationIntervalMs, this.minChangesBetweenOptimizations, - this.optimizeOnShutdown + this.optimizeOnShutdown, + this.parallelOnDiskWrite ); } @@ -1173,6 +1212,7 @@ public static class Default implements VectorIndexConfiguration private final long optimizationIntervalMs ; private final int minChangesBetweenOptimizations; private final boolean optimizeOnShutdown ; + private final boolean parallelOnDiskWrite ; Default( final int dimension , @@ -1190,7 +1230,8 @@ public static class Default implements VectorIndexConfiguration final int minChangesBetweenPersists , final long optimizationIntervalMs , final int 
minChangesBetweenOptimizations , - final boolean optimizeOnShutdown + final boolean optimizeOnShutdown , + final boolean parallelOnDiskWrite ) { this.dimension = dimension ; @@ -1209,6 +1250,7 @@ public static class Default implements VectorIndexConfiguration this.optimizationIntervalMs = optimizationIntervalMs ; this.minChangesBetweenOptimizations = minChangesBetweenOptimizations ; this.optimizeOnShutdown = optimizeOnShutdown ; + this.parallelOnDiskWrite = parallelOnDiskWrite ; } @Override @@ -1307,6 +1349,12 @@ public boolean optimizeOnShutdown() return this.optimizeOnShutdown; } + @Override + public boolean parallelOnDiskWrite() + { + return this.parallelOnDiskWrite; + } + } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index caa5c874..8106570d 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -54,6 +54,7 @@ void testBuilderDefaults() assertEquals(0L, config.optimizationIntervalMs()); assertEquals(1000, config.minChangesBetweenOptimizations()); assertFalse(config.optimizeOnShutdown()); + assertTrue(config.parallelOnDiskWrite()); } // ==================== Builder Validation Tests ==================== @@ -269,6 +270,129 @@ void testPqSubspacesZeroMeansAuto(@TempDir final Path tempDir) assertEquals(0, config.pqSubspaces()); } + // ==================== Parallel On-Disk Write Tests ==================== + + @Test + void testParallelOnDiskWriteDefaultTrue() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .build(); + + assertTrue(config.parallelOnDiskWrite()); + } + + @Test + void testParallelOnDiskWriteCanBeDisabled(@TempDir final Path tempDir) + { + final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() + .dimension(64) + .onDisk(true) + .indexDirectory(tempDir) + .parallelOnDiskWrite(false) + .build(); + + assertFalse(config.parallelOnDiskWrite()); + } + + @Test + void testParallelVsNonParallelShareSameDefaults(@TempDir final Path tempDir) + { + final VectorIndexConfiguration parallel = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndexConfiguration sequential = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .parallelOnDiskWrite(false) + .build(); + + assertTrue(parallel.parallelOnDiskWrite()); + assertFalse(sequential.parallelOnDiskWrite()); + + // All other parameters remain identical + assertEquals(parallel.dimension(), sequential.dimension()); + assertEquals(parallel.similarityFunction(), sequential.similarityFunction()); + assertEquals(parallel.maxDegree(), sequential.maxDegree()); + assertEquals(parallel.beamWidth(), sequential.beamWidth()); + assertEquals(parallel.neighborOverflow(), sequential.neighborOverflow()); + assertEquals(parallel.alpha(), sequential.alpha()); + assertEquals(parallel.onDisk(), sequential.onDisk()); + assertEquals(parallel.indexDirectory(), sequential.indexDirectory()); + assertEquals(parallel.enablePqCompression(), sequential.enablePqCompression()); + assertEquals(parallel.pqSubspaces(), sequential.pqSubspaces()); + assertEquals(parallel.persistenceIntervalMs(), sequential.persistenceIntervalMs()); + assertEquals(parallel.persistOnShutdown(), sequential.persistOnShutdown()); + assertEquals(parallel.minChangesBetweenPersists(), sequential.minChangesBetweenPersists()); + assertEquals(parallel.optimizationIntervalMs(), sequential.optimizationIntervalMs()); + assertEquals(parallel.minChangesBetweenOptimizations(), sequential.minChangesBetweenOptimizations()); + assertEquals(parallel.optimizeOnShutdown(), sequential.optimizeOnShutdown()); + } + + 
@Test + void testParallelVsNonParallelWithCompression(@TempDir final Path tempDir) + { + final VectorIndexConfiguration parallel = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(48) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndexConfiguration sequential = VectorIndexConfiguration.builder() + .dimension(768) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(48) + .parallelOnDiskWrite(false) + .build(); + + assertTrue(parallel.parallelOnDiskWrite()); + assertFalse(sequential.parallelOnDiskWrite()); + + // Compression settings are identical regardless of parallel mode + assertEquals(parallel.enablePqCompression(), sequential.enablePqCompression()); + assertEquals(parallel.pqSubspaces(), sequential.pqSubspaces()); + assertEquals(parallel.maxDegree(), sequential.maxDegree()); + } + + @Test + void testFactoryMethodsDefaultToParallel(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("vectors"); + + final VectorIndexConfiguration medium = VectorIndexConfiguration.forMediumDataset(768, indexDir); + assertTrue(medium.parallelOnDiskWrite()); + + final VectorIndexConfiguration large = VectorIndexConfiguration.forLargeDataset(768, indexDir); + assertTrue(large.parallelOnDiskWrite()); + + final VectorIndexConfiguration highPrecision = VectorIndexConfiguration.forHighPrecision(768, indexDir); + assertTrue(highPrecision.parallelOnDiskWrite()); + } + + @Test + void testBuilderForLargeDatasetCanDisableParallel(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("vectors"); + final VectorIndexConfiguration config = VectorIndexConfiguration.builderForLargeDataset(768, indexDir) + .parallelOnDiskWrite(false) + .enablePqCompression(true) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.enablePqCompression()); + assertFalse(config.parallelOnDiskWrite()); + } + // ==================== 
Similarity Function Tests ==================== @Test @@ -592,6 +716,7 @@ void testFullOnDiskConfiguration(@TempDir final Path tempDir) .optimizationIntervalMs(120_000) .minChangesBetweenOptimizations(500) .optimizeOnShutdown(true) + .parallelOnDiskWrite(false) .build(); assertEquals(768, config.dimension()); @@ -612,6 +737,7 @@ void testFullOnDiskConfiguration(@TempDir final Path tempDir) assertEquals(120_000L, config.optimizationIntervalMs()); assertEquals(500, config.minChangesBetweenOptimizations()); assertTrue(config.optimizeOnShutdown()); + assertFalse(config.parallelOnDiskWrite()); } @Test @@ -774,6 +900,7 @@ void testBuilderMethodChainingReturnsBuilder() assertSame(builder, builder.optimizationIntervalMs(60_000)); assertSame(builder, builder.minChangesBetweenOptimizations(1000)); assertSame(builder, builder.optimizeOnShutdown(false)); + assertSame(builder, builder.parallelOnDiskWrite(true)); } // ==================== Factory Methods Comparison Tests ==================== diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index 3e453425..504cd3ea 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -18,6 +18,7 @@ import org.eclipse.store.storage.embedded.types.EmbeddedStorage; import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.io.TempDir; import java.io.IOException; @@ -57,6 +58,24 @@ public float[] vectorize(final Document entity) } } + /** + * Embedded vectorizer - vectors are part of the entity, not stored separately. 
+ */ + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + /** * Helper to generate a random normalized vector. */ @@ -2493,4 +2512,523 @@ void testInMemoryIndexWithBackgroundOptimization(@TempDir final Path tempDir) th index.close(); } } + + + // ======================================================================== + // Parallel vs Non-Parallel On-Disk Write Tests + // ======================================================================== + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. + */ + @Test + void testParallelVsNonParallelOnDiskWrite(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for(int i = 0; i < vectorCount; i++) + { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + // --- Parallel mode --- + final List parallelIds; + final List parallelScores; + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(parallelIndexDir) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex 
index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, vectors.get(i))); + } + + index.persistToDisk(); + + final VectorSearchResult result = index.search(queryVector, k); + parallelIds = new ArrayList<>(); + parallelScores = new ArrayList<>(); + for(final VectorSearchResult.Entry entry : result) + { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); + } + + // --- Sequential mode --- + final List sequentialIds; + final List sequentialScores; + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, vectors.get(i))); + } + + index.persistToDisk(); + + final VectorSearchResult result = index.search(queryVector, k); + sequentialIds = new ArrayList<>(); + sequentialScores = new ArrayList<>(); + for(final VectorSearchResult.Entry entry : result) + { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); + } + + // --- Compare results --- + assertEquals(k, parallelIds.size()); + assertEquals(k, sequentialIds.size()); + + // Both indices were built from 
the same data with the same HNSW parameters, + // so search results must be identical. + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); + } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index with PQ compression enabled. + * This exercises the FusedPQ write path which is the primary target of the parallel mode setting. + */ + @Test + void testParallelVsNonParallelOnDiskWriteWithCompression(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int pqSubspaces = 16; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for(int i = 0; i < vectorCount; i++) + { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + // --- Parallel mode with PQ --- + final List parallelIds; + final List parallelScores; + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(parallelIndexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new 
Document("doc_" + i, vectors.get(i))); + } + + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + index.persistToDisk(); + + final VectorSearchResult result = index.search(queryVector, k); + parallelIds = new ArrayList<>(); + parallelScores = new ArrayList<>(); + for(final VectorSearchResult.Entry entry : result) + { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); + } + + // --- Sequential mode with PQ --- + final List sequentialIds; + final List sequentialScores; + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, vectors.get(i))); + } + + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + index.persistToDisk(); + + final VectorSearchResult result = index.search(queryVector, k); + sequentialIds = new ArrayList<>(); + sequentialScores = new ArrayList<>(); + for(final VectorSearchResult.Entry entry : result) + { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); + } + + // --- Compare results --- + assertEquals(k, parallelIds.size()); + assertEquals(k, sequentialIds.size()); + + 
// Both indices were built from the same data with identical HNSW parameters and PQ training, + // so search results must be identical. + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential PQ-compressed on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential PQ-compressed on-disk writes should produce identical search scores"); + } + + /** + * Test that parallel and non-parallel on-disk writes both support persist-and-reload + * for a large PQ-compressed index. + * Verifies that the graph files produced by both modes can be loaded correctly + * and yield equivalent search results after restart. + */ + @Test + void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int pqSubspaces = 16; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for(int i = 0; i < vectorCount; i++) + { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel-index"); + final Path parallelStorageDir = tempDir.resolve("parallel-storage"); + final Path sequentialIndexDir = tempDir.resolve("sequential-index"); + final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); + + // --- Build and persist both modes --- + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, parallelIndexDir, parallelStorageDir, true); + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, sequentialIndexDir, sequentialStorageDir, false); + + // --- Reload both and compare search results --- + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + { + try(final EmbeddedStorageManager storage = 
EmbeddedStorage.start(parallelStorageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertNotNull(index); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for(final VectorSearchResult.Entry entry : result) + { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertNotNull(index); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for(final VectorSearchResult.Entry entry : result) + { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + // Both modes should produce equivalent results after reload + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential modes should produce identical search results after reload"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential modes should produce identical search scores after reload"); + } + + /** + * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
+ */ + private void buildAndPersistIndex( + final List vectors , + final float[] queryVector , + final int dimension , + final int pqSubspaces , + final Path indexDir , + final Path storageDir , + final boolean parallel + ) throws IOException + { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(parallel) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + for(int i = 0; i < vectors.size(); i++) + { + gigaMap.add(new Document("doc_" + i, vectors.get(i))); + } + + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + index.persistToDisk(); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + + // ======================================================================== + // Embedded Vectorizer + On-Disk Tests + // ======================================================================== + + /** + * Test that an embedded vectorizer with parallel on-disk write completes without deadlock. + *

+ * This is a regression test for a deadlock where {@code persistToDisk()} held
+ * {@code synchronized(parentMap)} for the entire disk write. The disk writer uses
+ * internal worker threads (ForkJoinPool for PQ encoding, parallel graph writer)
+ * that call {@code parentMap.get()} — which also synchronizes on the same monitor.
+ * <p>
+ * The fix restructures locking: Phase 1 (prep) runs inside {@code synchronized(parentMap)},
+ * Phase 2 (disk write) runs outside it but still holds {@code persistenceLock.writeLock()}.
+ * <p>
+ * Uses {@code @Timeout} to fail fast if a deadlock occurs instead of hanging indefinitely. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + // Add vectors + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + // Verify search still works after persist + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + for(final VectorSearchResult.Entry entry : result) + { + assertNotNull(entry.entity()); + } + } + + /** + * Test that an embedded vectorizer with PQ compression and parallel on-disk write + * completes without deadlock. + *

+ * This is the most deadlock-prone scenario: FusedPQ encoding uses a ForkJoinPool + * that calls {@code getVector()} on worker threads, plus the parallel graph writer + * also calls {@code getVector()} from its own thread pool. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + // Add vectors + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Train PQ compression + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + // Verify search still works + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java index 
998b55fe..27dec828 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java @@ -17,7 +17,9 @@ import org.eclipse.store.gigamap.types.GigaMap; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -58,6 +60,18 @@ float[] embedding() * Computed vectorizer for performance tests. */ static class DocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + /** + * Embedded vectorizer for performance tests - vectors stored in entity, not separately. + */ + static class EmbeddedDocumentVectorizer extends Vectorizer { @Override public float[] vectorize(final Document entity) @@ -356,4 +370,300 @@ void testPerformanceWithDifferentConfigurations() System.out.println("=== Configuration Comparison Complete ==="); } + + /** + * Performance test comparing parallel vs non-parallel on-disk write speed. + *

+ * Measures the time taken by {@code persistToDisk()} for both modes with
+ * PQ compression enabled (the primary target of the parallel writer) and
+ * without PQ compression.
+ * <p>
+ * Increase {@code vectorCount} for more meaningful results (e.g., 100_000+). + */ + @Test + void testParallelVsNonParallelOnDiskWritePerformance(@TempDir final Path tempDir) + { + final int vectorCount = 10_000; + final int dimension = 128; + final int pqSubspaces = 32; + final int iterations = 3; + + System.err.println("=== Parallel vs Non-Parallel On-Disk Write Performance ==="); + System.err.println("Vector count: " + vectorCount); + System.err.println("Dimension: " + dimension); + System.err.println("Available processors: " + Runtime.getRuntime().availableProcessors()); + System.err.println(); + + // Pre-generate vectors + System.err.print("Generating vectors... "); + final Random random = new Random(42); + final List documents = new ArrayList<>(vectorCount); + for(int i = 0; i < vectorCount; i++) + { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + System.err.println("done."); + + // ========== WITHOUT PQ COMPRESSION ========== + System.err.println(); + System.err.println("--- Without PQ Compression ---"); + + final long[] noPqParallelTimes = new long[iterations]; + final long[] noPqSequentialTimes = new long[iterations]; + + for(int i = 0; i < iterations; i++) + { + noPqParallelTimes[i] = this.measurePersist( + tempDir.resolve("nopq-par-" + i), documents, dimension, false, 0, true + ); + noPqSequentialTimes[i] = this.measurePersist( + tempDir.resolve("nopq-seq-" + i), documents, dimension, false, 0, false + ); + } + + printComparisonResults("Without PQ", noPqParallelTimes, noPqSequentialTimes); + + // ========== WITH PQ COMPRESSION ========== + System.err.println(); + System.err.println("--- With PQ Compression (FusedPQ writer path) ---"); + + final long[] pqParallelTimes = new long[iterations]; + final long[] pqSequentialTimes = new long[iterations]; + + for(int i = 0; i < iterations; i++) + { + pqParallelTimes[i] = this.measurePersist( + tempDir.resolve("pq-par-" + i), documents, dimension, true, pqSubspaces, true + ); + 
pqSequentialTimes[i] = this.measurePersist( + tempDir.resolve("pq-seq-" + i), documents, dimension, true, pqSubspaces, false + ); + } + + printComparisonResults("With PQ", pqParallelTimes, pqSequentialTimes); + + System.err.println(); + System.err.println("=== Performance Comparison Complete ==="); + } + + /** + * Creates an index, populates it, persists to disk, and returns the persist duration. + * All resources are properly closed before returning. + */ + private long measurePersist( + final Path indexDir , + final List documents, + final int dimension, + final boolean enablePq , + final int pqSubspaces, + final boolean parallel + ) + { + final String mode = parallel ? "parallel" : "sequential"; + final String pq = enablePq ? "pq" : "nopq"; + + System.err.printf(" [%s/%s] creating index... ", pq, mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration.Builder configBuilder = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(enablePq ? 32 : 16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(parallel); + + if(enablePq) + { + configBuilder + .enablePqCompression(true) + .pqSubspaces(pqSubspaces); + } + + try(final VectorIndex index = vectorIndices.add( + "embeddings", configBuilder.build(), new DocumentVectorizer() + )) + { + System.err.print("populating... "); + gigaMap.addAll(documents); + + if(enablePq) + { + System.err.print("training PQ... "); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + } + + System.err.print("persisting... 
"); + + final long start = System.nanoTime(); + index.persistToDisk(); + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms%n", elapsedMs); + + return elapsedMs; + } + } + + /** + * Performance test comparing parallel vs non-parallel on-disk write with embedded vectorizer. + *

+ * This variant uses {@code isEmbedded()=true}, meaning vectors are fetched from entities + * via the parentMap during disk write. This is the scenario most prone to deadlock + * when the parentMap monitor is held during the write phase. + */ + @Test + void testParallelOnDiskWriteWithEmbeddedVectorizer(@TempDir final Path tempDir) + { + final int vectorCount = 100_000; + final int dimension = 128; + final int iterations = 3; + + System.err.println("=== Embedded Vectorizer: Parallel On-Disk Write Performance ==="); + System.err.println("Vector count: " + vectorCount); + System.err.println("Dimension: " + dimension); + System.err.println("Available processors: " + Runtime.getRuntime().availableProcessors()); + System.err.println(); + + // Pre-generate vectors + System.err.print("Generating vectors... "); + final Random random = new Random(42); + final List documents = new ArrayList<>(vectorCount); + for(int i = 0; i < vectorCount; i++) + { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + System.err.println("done."); + + System.err.println(); + System.err.println("--- Embedded Vectorizer (no PQ) ---"); + + final long[] parallelTimes = new long[iterations]; + final long[] sequentialTimes = new long[iterations]; + + for(int i = 0; i < iterations; i++) + { + parallelTimes[i] = this.measurePersistEmbedded( + tempDir.resolve("emb-par-" + i), documents, dimension, true + ); + sequentialTimes[i] = this.measurePersistEmbedded( + tempDir.resolve("emb-seq-" + i), documents, dimension, false + ); + } + + printComparisonResults("Embedded Vectorizer", parallelTimes, sequentialTimes); + + System.err.println(); + System.err.println("=== Embedded Vectorizer Performance Complete ==="); + } + + /** + * Creates an index with embedded vectorizer, populates it, persists to disk, + * and returns the persist duration. 
+ */ + private long measurePersistEmbedded( + final Path indexDir , + final List documents, + final int dimension, + final boolean parallel + ) + { + final String mode = parallel ? "parallel" : "sequential"; + + System.err.printf(" [embedded/%s] creating index... ", mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(parallel) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + )) + { + System.err.print("populating... "); + gigaMap.addAll(documents); + + System.err.print("persisting... "); + + final long start = System.nanoTime(); + index.persistToDisk(); + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms%n", elapsedMs); + + return elapsedMs; + } + } + + /** + * Prints a comparison summary for parallel vs sequential persist times. 
+ */ + private static void printComparisonResults( + final String label , + final long[] parallelTimes , + final long[] sequentialTimes + ) + { + final long parallelAvg = average(parallelTimes); + final long parallelMin = min(parallelTimes); + final long parallelMax = max(parallelTimes); + final long sequentialAvg = average(sequentialTimes); + final long sequentialMin = min(sequentialTimes); + final long sequentialMax = max(sequentialTimes); + + System.err.println(); + System.err.println("=== " + label + " Results ==="); + System.err.printf(" Parallel: avg=%,d ms min=%,d ms max=%,d ms%n", + parallelAvg, parallelMin, parallelMax); + System.err.printf(" Sequential: avg=%,d ms min=%,d ms max=%,d ms%n", + sequentialAvg, sequentialMin, sequentialMax); + + if(sequentialAvg > 0 && parallelAvg > 0) + { + final double speedup = (double) sequentialAvg / parallelAvg; + System.err.printf(" Speedup: %.2fx%n", speedup); + } + } + + private static long average(final long[] values) + { + long sum = 0; + for(final long v : values) + { + sum += v; + } + return sum / values.length; + } + + private static long min(final long[] values) + { + long result = Long.MAX_VALUE; + for(final long v : values) + { + if(v < result) result = v; + } + return result; + } + + private static long max(final long[] values) + { + long result = Long.MIN_VALUE; + for(final long v : values) + { + if(v > result) result = v; + } + return result; + } } From 87a3fdbfc11610ae5639cc97be269bb7d63dd7db Mon Sep 17 00:00:00 2001 From: fh-ms Date: Tue, 17 Feb 2026 17:32:24 +0100 Subject: [PATCH 03/15] Introduce BackgroundIndexingManager for eventual consistency in VectorIndex - Added `BackgroundIndexingManager` interface to manage background graph indexing. - Implemented a default asynchronous queue-based manager for deferred HNSW operations. - Added unit tests to validate eventual indexing behavior across add, update, remove, and bulk operations. 
--- .../jvector/BackgroundIndexingManager.java | 323 +++++++ .../store/gigamap/jvector/VectorIndex.java | 166 +++- .../jvector/VectorIndexConfiguration.java | 53 +- .../jvector/VectorIndexConfigurationTest.java | 58 ++ .../VectorIndexEventualIndexingTest.java | 837 ++++++++++++++++++ 5 files changed, 1411 insertions(+), 26 deletions(-) create mode 100644 gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java create mode 100644 gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java new file mode 100644 index 00000000..231642f5 --- /dev/null +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java @@ -0,0 +1,323 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.LinkedBlockingQueue; + +/** + * Manages background graph indexing for a VectorIndex. + *

+ * When eventual indexing is enabled, graph mutations (add/update/remove) are
+ * queued and applied asynchronously by a single background worker thread.
+ * The vector store is still updated synchronously so that data is not lost,
+ * but the expensive HNSW graph operations are deferred.
+ * <p>
+ * This trades immediate search consistency for reduced latency on mutation
+ * operations — search results may not immediately reflect the most recent
+ * mutations (eventual consistency).
+ * <p>

+ * This manager handles: + *

+ * <ul>
+ *   <li>Queuing of graph indexing operations (add, update, remove)</li>
+ *   <li>Sequential application of operations by a single background thread</li>
+ *   <li>Draining the queue (blocking until all pending operations are applied)</li>
+ *   <li>Graceful shutdown with optional drain</li>
+ * </ul>
+ */ +interface BackgroundIndexingManager +{ + /** + * Enqueues an indexing operation for background processing. + * + * @param operation the operation to enqueue + */ + void enqueue(IndexingOperation operation); + + /** + * Blocks until all currently enqueued operations have been applied. + *

+ * This is used before {@code optimize()} and {@code persistToDisk()} to + * ensure the graph is complete before those operations proceed. + */ + void drainQueue(); + + /** + * Discards all pending operations without applying them. + *

+ * Used during {@code internalRemoveAll()} where pending operations + * refer to stale ordinals that are no longer valid. + */ + void discardQueue(); + + /** + * Shuts down the background indexing manager. + * + * @param drainPending if true, drain all pending operations before shutdown + */ + void shutdown(boolean drainPending); + + /** + * Returns the number of pending operations in the queue. + * Useful for monitoring and testing. + * + * @return the number of pending operations + */ + int getPendingCount(); + + + // ======================================================================== + // Indexing Operations + // ======================================================================== + + /** + * Sealed interface for indexing operations that can be queued. + */ + sealed interface IndexingOperation + permits IndexingOperation.Add, + IndexingOperation.Update, + IndexingOperation.Remove, + IndexingOperation.DrainBarrier + { + /** + * Add a node to the HNSW graph. + */ + record Add(int ordinal, float[] vector) implements IndexingOperation {} + + /** + * Update a node in the HNSW graph (delete + re-add). + */ + record Update(int ordinal, float[] vector) implements IndexingOperation {} + + /** + * Remove a node from the HNSW graph. + */ + record Remove(int ordinal) implements IndexingOperation {} + + /** + * Sentinel operation for drainQueue() — signals the worker to release the latch. + */ + record DrainBarrier(CountDownLatch latch) implements IndexingOperation {} + } + + + // ======================================================================== + // Callback + // ======================================================================== + + /** + * Callback interface for applying graph operations. + * Implemented by {@code VectorIndex.Default}. + */ + interface Callback + { + /** + * Adds a node to the HNSW graph. 
+ * + * @param ordinal the node ordinal + * @param vector the vector data + */ + void applyGraphAdd(int ordinal, float[] vector); + + /** + * Updates a node in the HNSW graph (delete old + add new). + * + * @param ordinal the node ordinal + * @param vector the new vector data + */ + void applyGraphUpdate(int ordinal, float[] vector); + + /** + * Removes a node from the HNSW graph. + * + * @param ordinal the node ordinal + */ + void applyGraphRemove(int ordinal); + + /** + * Marks dirty for persistence/optimization background managers. + * + * @param count the number of changes + */ + void markDirtyForBackgroundManagers(int count); + } + + + // ======================================================================== + // Default Implementation + // ======================================================================== + + /** + * Default implementation of BackgroundIndexingManager. + *

+ * Uses a single daemon worker thread that continuously takes operations + * from a {@link LinkedBlockingQueue} and applies them via the callback. + */ + static class Default implements BackgroundIndexingManager + { + private static final Logger LOG = LoggerFactory.getLogger(BackgroundIndexingManager.class); + + private final Callback callback; + private final String name ; + private final LinkedBlockingQueue queue ; + private final Thread worker ; + + private volatile boolean shutdown = false; + + Default(final Callback callback, final String name) + { + this.callback = callback; + this.name = name ; + this.queue = new LinkedBlockingQueue<>(); + + this.worker = new Thread(this::workerLoop, "VectorIndex-BackgroundIndexing-" + name); + this.worker.setDaemon(true); + this.worker.start(); + + LOG.info("Background indexing started for '{}'", name); + } + + @Override + public void enqueue(final IndexingOperation operation) + { + this.queue.add(operation); + } + + @Override + public void drainQueue() + { + if(this.shutdown) + { + return; + } + + final CountDownLatch latch = new CountDownLatch(1); + this.queue.add(new IndexingOperation.DrainBarrier(latch)); + + try + { + latch.await(); + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + LOG.warn("Interrupted while draining indexing queue for '{}'", this.name); + } + } + + @Override + public void discardQueue() + { + final int discarded = this.queue.size(); + this.queue.clear(); + if(discarded > 0) + { + LOG.info("Discarded {} pending indexing operations for '{}'", discarded, this.name); + } + } + + @Override + public void shutdown(final boolean drainPending) + { + if(drainPending) + { + LOG.info("Draining {} pending indexing operations for '{}' before shutdown", + this.queue.size(), this.name); + this.drainQueue(); + } + + this.shutdown = true; + this.worker.interrupt(); + + try + { + this.worker.join(30_000); + if(this.worker.isAlive()) + { + LOG.warn("Background indexing worker did not 
terminate gracefully for '{}'", this.name); + } + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + } + + LOG.info("Background indexing manager shutdown for '{}'", this.name); + } + + @Override + public int getPendingCount() + { + return this.queue.size(); + } + + /** + * Worker loop that continuously takes operations from the queue and applies them. + */ + private void workerLoop() + { + while(!this.shutdown) + { + try + { + final IndexingOperation op = this.queue.take(); + this.applyOperation(op); + } + catch(final InterruptedException e) + { + if(!this.shutdown) + { + LOG.debug("Background indexing worker interrupted for '{}'", this.name); + } + // Re-check shutdown flag in loop condition + } + catch(final Exception e) + { + LOG.error("Error applying indexing operation for '{}': {}", this.name, e.getMessage(), e); + } + } + } + + /** + * Applies a single indexing operation via the callback. + */ + private void applyOperation(final IndexingOperation op) + { + if(op instanceof IndexingOperation.Add add) + { + this.callback.applyGraphAdd(add.ordinal(), add.vector()); + this.callback.markDirtyForBackgroundManagers(1); + } + else if(op instanceof IndexingOperation.Update update) + { + this.callback.applyGraphUpdate(update.ordinal(), update.vector()); + this.callback.markDirtyForBackgroundManagers(1); + } + else if(op instanceof IndexingOperation.Remove remove) + { + this.callback.applyGraphRemove(remove.ordinal()); + this.callback.markDirtyForBackgroundManagers(1); + } + else if(op instanceof IndexingOperation.DrainBarrier barrier) + { + barrier.latch().countDown(); + } + } + } + +} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index ee11d906..8280402a 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ 
b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -595,6 +595,7 @@ public static class Default implements VectorIndex.Internal, BackgroundPersistenceManager.Callback, BackgroundOptimizationManager.Callback, + BackgroundIndexingManager.Callback, PQCompressionManager.VectorProvider, DiskIndexManager.IndexStateProvider { @@ -629,6 +630,7 @@ static BinaryTypeHandler> provideTypeHandler() private transient PQCompressionManager pqManager ; private transient BackgroundPersistenceManager persistenceManager ; transient BackgroundOptimizationManager optimizationManager; + transient BackgroundIndexingManager indexingManager ; // GraphSearcher pool for thread-local reuse private transient ExplicitThreadLocal searcherPool; @@ -815,10 +817,26 @@ private void initializeIndex() */ private void startBackgroundManagersIfEnabled() { + this.startBackgroundIndexingIfEnabled(); this.startBackgroundPersistenceIfEnabled(); this.startBackgroundOptimizationIfEnabled(); } + /** + * Starts the background indexing manager if eventual indexing is configured. + */ + private void startBackgroundIndexingIfEnabled() + { + if(this.configuration.eventualIndexing()) + { + if(this.indexingManager == null) + { + this.indexingManager = new BackgroundIndexingManager.Default(this, this.name); + LOG.info("Eventual indexing enabled for index '{}'", this.name); + } + } + } + /** * Starts the background persistence manager if configured. 
*/ @@ -1055,14 +1073,22 @@ public void internalAdd(final long entityId, final E entity) this.vectorStore.add(new VectorEntry(entityId, vector)); } - // Add to HNSW graph using entity ID as ordinal - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); - this.markStateChangeChildren(); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); + if(this.indexingManager != null) + { + // Defer graph update to background thread + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add(ordinal, vector)); + } + else + { + // Add to HNSW graph using entity ID as ordinal + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); + } } } @@ -1094,8 +1120,6 @@ public void internalUpdate(final long entityId, final E replacedEntity, final E final float[] vector = this.vectorize(entity); final int ordinal = toOrdinal(entityId); - this.builder.markNodeDeleted(ordinal); - this.builder.removeDeletedNodes(); // Update based on vectorizer type if(!this.isEmbedded()) @@ -1103,14 +1127,25 @@ public void internalUpdate(final long entityId, final E replacedEntity, final E this.vectorStore.set(entityId, new VectorEntry(entityId, vector)); } - // Add to HNSW graph using entity ID as ordinal - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); - this.markStateChangeChildren(); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); + if(this.indexingManager != null) + { + // Defer graph update to background thread + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Update(ordinal, vector)); + } + else + { + this.builder.markNodeDeleted(ordinal); + this.builder.removeDeletedNodes(); + + // Add to HNSW graph using entity ID as 
ordinal + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); + } } } @@ -1145,19 +1180,32 @@ private void addVectorEntries(final List entries) this.vectorStore.addAll(entries); } - this.addGraphNodesSequential(entries); - this.markStateChangeChildren(); - // Mark dirty for background managers (with count for debouncing) - this.markDirtyForBackgroundManagers(entries.size()); + if(this.indexingManager != null) + { + // Defer graph updates to background thread + entries.forEach(entry -> + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add( + toOrdinal(entry.sourceEntityId), entry.vector + )) + ); + } + else + { + this.addGraphNodesSequential(entries); + + // Mark dirty for background managers (with count for debouncing) + this.markDirtyForBackgroundManagers(entries.size()); + } } } /** * Marks dirty for background managers with the specified change count. 
*/ - private void markDirtyForBackgroundManagers(final int count) + @Override + public void markDirtyForBackgroundManagers(final int count) { if(this.persistenceManager != null) { @@ -1194,12 +1242,21 @@ public void internalRemove(final long entityId, final E entity) { this.vectorStore.removeById(entityId); } - this.builder.markNodeDeleted(ordinal); this.markStateChangeChildren(); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); + if(this.indexingManager != null) + { + // Defer graph update to background thread + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Remove(ordinal)); + } + else + { + this.builder.markNodeDeleted(ordinal); + + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); + } } } @@ -1221,6 +1278,9 @@ public void internalRemoveAll() this.vectorStore.removeAll(); } + // Discard and shutdown indexing manager (pending ops are stale) + this.shutdownIndexingManager(false); + // Shutdown optimization manager before closing this.shutdownOptimizationManager(false); @@ -1371,6 +1431,12 @@ private VectorSearchResult convertSearchResult(final SearchResult result) @Override public void optimize() { + // Drain pending indexing operations to ensure graph is complete + if(this.indexingManager != null) + { + this.indexingManager.drainQueue(); + } + final GraphIndexBuilder capturedBuilder; synchronized(this.parentMap()) { @@ -1398,6 +1464,12 @@ public void persistToDisk() return; // No-op for in-memory indices } + // Drain pending indexing operations to ensure graph is complete + if(this.indexingManager != null) + { + this.indexingManager.drainQueue(); + } + // Acquire write lock for exclusive access during persistence. // This blocks searches and other persist/removeAll/close calls. 
this.persistenceLock.writeLock().lock(); @@ -1504,10 +1576,13 @@ protected void clearChildrenStateChangeMarkers() @Override public void close() { - // Shutdown optimization manager first (may optimize pending changes) + // Shutdown indexing manager first — drain all pending graph operations + this.shutdownIndexingManager(true); + + // Shutdown optimization manager (may optimize pending changes) this.shutdownOptimizationManager(this.configuration.optimizeOnShutdown()); - // Shutdown persistence manager second (may persist pending changes) + // Shutdown persistence manager (may persist pending changes) this.shutdownPersistenceManager(this.configuration.persistOnShutdown()); // Acquire write lock to ensure no concurrent persistToDisk() Phase 2 is running. @@ -1554,6 +1629,24 @@ private void shutdownPersistenceManager(final boolean persistPending) } } + /** + * Shuts down the background indexing manager. + * + * @param drainPending if true, drain all pending operations before shutdown + */ + private void shutdownIndexingManager(final boolean drainPending) + { + if(this.indexingManager != null) + { + if(!drainPending) + { + this.indexingManager.discardQueue(); + } + this.indexingManager.shutdown(drainPending); + this.indexingManager = null; + } + } + /** * Closes internal resources (builder, index, disk resources). * Must be called within synchronized block. 
@@ -1671,6 +1764,31 @@ public long getExpectedVectorCount() } } + // ================================================================ + // BackgroundIndexingManager.Callback implementation + // ================================================================ + + @Override + public void applyGraphAdd(final int ordinal, final float[] vector) + { + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + } + + @Override + public void applyGraphUpdate(final int ordinal, final float[] vector) + { + this.builder.markNodeDeleted(ordinal); + this.builder.removeDeletedNodes(); + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + } + + @Override + public void applyGraphRemove(final int ordinal) + { + this.builder.markNodeDeleted(ordinal); + } } diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java index 7de93834..679cb447 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java @@ -448,6 +448,24 @@ public default boolean backgroundOptimization() */ public boolean optimizeOnShutdown(); + /** + * Returns whether eventual indexing mode is enabled. + *
<p>
+ * When enabled, expensive HNSW graph mutations (add, update, remove) are + * deferred to a background thread. The vector store is still updated + * synchronously, but graph construction happens asynchronously. + *
<p>
+ * This reduces the latency of mutation operations at the cost of + * eventual consistency — search results may not immediately reflect the + * most recent mutations. + *
<p>
+ * The graph is automatically drained (all pending operations applied) + * before {@code optimize()}, {@code persistToDisk()}, and {@code close()}. + * + * @return true if eventual indexing is enabled (default: false) + */ + public boolean eventualIndexing(); + /** * Returns whether parallel writing is used for on-disk index persistence. *
<p>
@@ -933,6 +951,18 @@ public static interface Builder */ public Builder parallelOnDiskWrite(boolean parallelOnDiskWrite); + /** + * Enables or disables eventual indexing mode. + *
<p>
+ * When enabled, HNSW graph mutations are deferred to a background thread, + * reducing mutation latency at the cost of eventual consistency for searches. + * + * @param eventualIndexing true to enable eventual indexing + * @return this builder for method chaining + * @see VectorIndexConfiguration#eventualIndexing() + */ + public Builder eventualIndexing(boolean eventualIndexing); + /** * Builds the configuration with the specified parameters. * @@ -973,6 +1003,7 @@ public static class Default implements Builder private int minChangesBetweenOptimizations; private boolean optimizeOnShutdown ; private boolean parallelOnDiskWrite ; + private boolean eventualIndexing ; Default() { @@ -993,6 +1024,7 @@ public static class Default implements Builder this.minChangesBetweenOptimizations = 1000; this.optimizeOnShutdown = false; this.parallelOnDiskWrite = true; + this.eventualIndexing = false; } @Override @@ -1134,6 +1166,13 @@ public Builder parallelOnDiskWrite(final boolean parallelOnDiskWrite) return this; } + @Override + public Builder eventualIndexing(final boolean eventualIndexing) + { + this.eventualIndexing = eventualIndexing; + return this; + } + @Override public VectorIndexConfiguration build() { @@ -1182,7 +1221,8 @@ public VectorIndexConfiguration build() this.optimizationIntervalMs, this.minChangesBetweenOptimizations, this.optimizeOnShutdown, - this.parallelOnDiskWrite + this.parallelOnDiskWrite, + this.eventualIndexing ); } @@ -1213,6 +1253,7 @@ public static class Default implements VectorIndexConfiguration private final int minChangesBetweenOptimizations; private final boolean optimizeOnShutdown ; private final boolean parallelOnDiskWrite ; + private final boolean eventualIndexing ; Default( final int dimension , @@ -1231,7 +1272,8 @@ public static class Default implements VectorIndexConfiguration final long optimizationIntervalMs , final int minChangesBetweenOptimizations , final boolean optimizeOnShutdown , - final boolean parallelOnDiskWrite + final 
boolean parallelOnDiskWrite , + final boolean eventualIndexing ) { this.dimension = dimension ; @@ -1251,6 +1293,7 @@ public static class Default implements VectorIndexConfiguration this.minChangesBetweenOptimizations = minChangesBetweenOptimizations ; this.optimizeOnShutdown = optimizeOnShutdown ; this.parallelOnDiskWrite = parallelOnDiskWrite ; + this.eventualIndexing = eventualIndexing ; } @Override @@ -1355,6 +1398,12 @@ public boolean parallelOnDiskWrite() return this.parallelOnDiskWrite; } + @Override + public boolean eventualIndexing() + { + return this.eventualIndexing; + } + } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index 8106570d..03f11058 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -55,6 +55,7 @@ void testBuilderDefaults() assertEquals(1000, config.minChangesBetweenOptimizations()); assertFalse(config.optimizeOnShutdown()); assertTrue(config.parallelOnDiskWrite()); + assertFalse(config.eventualIndexing()); } // ==================== Builder Validation Tests ==================== @@ -956,4 +957,61 @@ void testLargeDatasetEnablesCompression(@TempDir final Path tempDir) assertTrue(config.enablePqCompression(), "Large dataset should have compression enabled by default"); } + + // ==================== Eventual Indexing Tests ==================== + + @Test + void testEventualIndexingDefaultFalse() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .build(); + + assertFalse(config.eventualIndexing()); + } + + @Test + void testEventualIndexingCanBeEnabled() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + 
.eventualIndexing(true) + .build(); + + assertTrue(config.eventualIndexing()); + } + + @Test + void testEventualIndexingCanBeDisabledExplicitly() + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .eventualIndexing(false) + .build(); + + assertFalse(config.eventualIndexing()); + } + + @Test + void testEventualIndexingWithOnDiskConfig(@TempDir final Path tempDir) + { + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(64) + .onDisk(true) + .indexDirectory(tempDir) + .eventualIndexing(true) + .build(); + + assertTrue(config.eventualIndexing()); + assertTrue(config.onDisk()); + } + + @Test + void testFactoryMethodsDefaultEventualIndexingFalse(@TempDir final Path tempDir) + { + assertFalse(VectorIndexConfiguration.forSmallDataset(64).eventualIndexing()); + assertFalse(VectorIndexConfiguration.forMediumDataset(64).eventualIndexing()); + assertFalse(VectorIndexConfiguration.forLargeDataset(64, tempDir).eventualIndexing()); + assertFalse(VectorIndexConfiguration.forHighPrecision(64).eventualIndexing()); + } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java new file mode 100644 index 00000000..f32f1116 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java @@ -0,0 +1,837 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.eclipse.store.gigamap.types.GigaMap; +import org.junit.jupiter.api.Test; +import 
org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for eventual indexing mode in VectorIndex. + *
<p>
+ * Eventual indexing defers HNSW graph mutations to a background thread + * while keeping vector store updates immediate. This trades immediate + * search consistency for reduced mutation latency. + */ +class VectorIndexEventualIndexingTest +{ + record Document(String content, float[] embedding) {} + + static class ComputedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + + private static float[] randomVector(final Random random, final int dimension) + { + final float[] vector = new float[dimension]; + float norm = 0; + for(int i = 0; i < dimension; i++) + { + vector[i] = random.nextFloat() * 2 - 1; + norm += vector[i] * vector[i]; + } + norm = (float)Math.sqrt(norm); + for(int i = 0; i < dimension; i++) + { + vector[i] /= norm; + } + return vector; + } + + // ==================== Basic Add / Search Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testAddAndSearchWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + try + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + // Drain queue to ensure all graph operations are applied + final 
VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // Search should find all 3 documents + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(3, result.size()); + + // The closest match should be doc1 + assertEquals("doc1", result.toList().get(0).entity().content()); + } + finally + { + index.close(); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testAddAndSearchWithComputedVectorizer() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(3, result.size()); + assertEquals("doc1", result.toList().get(0).entity().content()); + } + finally + { + index.close(); + } + } + + // ==================== Bulk Add Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testBulkAddWithEventualIndexing() + { + final int dimension = 64; + final int vectorCount = 100; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() 
+ .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + finally + { + index.close(); + } + } + + // ==================== Update Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testUpdateWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + try + { + final Document doc1 = new Document("doc1", new float[]{1.0f, 0.0f, 0.0f}); + final Document doc2 = new Document("doc2", new float[]{0.0f, 1.0f, 0.0f}); + gigaMap.add(doc1); + gigaMap.add(doc2); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // Update doc1's vector to be close to doc2 + final Document updatedDoc1 = new Document("doc1_updated", new float[]{0.1f, 0.9f, 0.0f}); + gigaMap.set(0L, updatedDoc1); + + defaultIndex.indexingManager.drainQueue(); + + // Search for doc2-like vector: updated doc1 should now be close + final VectorSearchResult result = index.search(new float[]{0.0f, 1.0f, 0.0f}, 2); + assertEquals(2, result.size()); + } + finally 
+ { + index.close(); + } + } + + // ==================== Remove Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testRemoveWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + try + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // Remove doc1 + gigaMap.removeById(0L); + + defaultIndex.indexingManager.drainQueue(); + + // Search should only return 2 documents + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(2, result.size()); + } + finally + { + index.close(); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testRemoveMultipleWithEventualIndexing() + { + final int dimension = 64; + final int vectorCount = 50; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, 
randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // Remove first 10 entities + for(int i = 0; i < 10; i++) + { + gigaMap.removeById(i); + } + + defaultIndex.indexingManager.drainQueue(); + + // Should have 40 remaining + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 50 + ); + assertEquals(40, result.size()); + } + finally + { + index.close(); + } + } + + // ==================== RemoveAll Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testRemoveAllDiscardsQueueAndResets() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + try + { + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // RemoveAll — this discards pending operations and shuts down manager + gigaMap.removeAll(); + + // Index should be empty + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + assertEquals(0, result.size()); + + // Add new data after removeAll — indexing manager is recreated by initializeIndex + gigaMap.add(new Document("new_doc", new float[]{1.0f, 0.0f, 0.0f})); + + // Drain the new indexing manager + defaultIndex.indexingManager.drainQueue(); + + final VectorSearchResult result2 = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + assertEquals(1, result2.size()); + } + 
finally + { + index.close(); + } + } + + // ==================== Optimize Drains Queue Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testOptimizeDrainsQueueBeforeCleanup() + { + final int dimension = 64; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + for(int i = 0; i < 50; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Optimize should drain the queue first, then run cleanup + index.optimize(); + + // After optimize, all nodes should be searchable + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + finally + { + index.close(); + } + } + + // ==================== PersistToDisk Drains Queue Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testPersistToDiskDrainsQueueBeforeWrite(@TempDir final Path tempDir) + { + final int dimension = 64; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try 
+ { + for(int i = 0; i < 50; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // PersistToDisk should drain the queue first + index.persistToDisk(); + + // Verify files were created + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + // Search should work after persist + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + finally + { + index.close(); + } + } + + // ==================== Close Drains Queue Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testCloseDrainsPendingOperations() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + // Close should drain pending operations without error + index.close(); + + // No assertion needed — if close() deadlocks or throws, the @Timeout will catch it + } + + // ==================== Pending Count Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testPendingCountTracksQueuedOperations() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final 
VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + try + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + // Initially empty + assertEquals(0, defaultIndex.indexingManager.getPendingCount()); + + // After drain, count should be 0 + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + defaultIndex.indexingManager.drainQueue(); + + assertEquals(0, defaultIndex.indexingManager.getPendingCount()); + } + finally + { + index.close(); + } + } + + // ==================== Large Data Set Tests ==================== + + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testLargeDataSetWithEventualIndexing() + { + final int dimension = 128; + final int vectorCount = 500; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + // Add random vectors + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // Search should return correct number of results + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + + // All results should have valid scores + for(final VectorSearchResult.Entry entry : result) + { + assertTrue(entry.score() > 0, "Score should be positive"); + 
assertNotNull(entry.entity()); + } + } + finally + { + index.close(); + } + } + + // ==================== On-Disk with Eventual Indexing ==================== + + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testOnDiskWithEventualIndexing(@TempDir final Path tempDir) + { + final int dimension = 64; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + for(int i = 0; i < vectorCount; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Persist triggers drain + index.persistToDisk(); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + finally + { + index.close(); + } + } + + // ==================== Background Persistence + Eventual Indexing ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testBackgroundPersistenceWithEventualIndexing(@TempDir final Path tempDir) throws Exception + { + final int dimension = 64; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + 
.similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(500) + .minChangesBetweenPersists(1) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + for(int i = 0; i < 20; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + // Wait for background persistence (which should drain first) + Thread.sleep(1500); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + } + finally + { + index.close(); + } + } + + // ==================== Disabled by Default Tests ==================== + + @Test + void testEventualIndexingDisabledByDefault() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + // Indexing manager should be null when eventualIndexing is false + assertNull(defaultIndex.indexingManager); + + // Synchronous indexing should still work + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + assertEquals(1, result.size()); + } + + // ==================== Combined Operations Tests ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testAddUpdateRemoveSequenceWithEventualIndexing() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() 
+ .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new EmbeddedDocumentVectorizer() + ); + + try + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + // Add 3 documents + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); + + defaultIndex.indexingManager.drainQueue(); + + // Update doc2 + gigaMap.set(1L, new Document("doc2_updated", new float[]{0.9f, 0.1f, 0.0f})); + + defaultIndex.indexingManager.drainQueue(); + + // Remove doc3 + gigaMap.removeById(2L); + + defaultIndex.indexingManager.drainQueue(); + + // Search: should find 2 documents + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); + assertEquals(2, result.size()); + + // doc1 should be closest to [1,0,0], followed by updated doc2 [0.9,0.1,0] + assertEquals("doc1", result.toList().get(0).entity().content()); + assertEquals("doc2_updated", result.toList().get(1).entity().content()); + } + finally + { + index.close(); + } + } + + // ==================== Background Optimization + Eventual Indexing ==================== + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void testBackgroundOptimizationWithEventualIndexing() throws Exception + { + final int dimension = 64; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(10) + .eventualIndexing(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", 
config, new ComputedDocumentVectorizer() + ); + + try + { + for(int i = 0; i < 50; i++) + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + // Wait for background optimization to run + Thread.sleep(800); + + // Optimization should have run at least once + assertTrue(defaultIndex.optimizationManager.getOptimizationCount() >= 1); + + // Search should still work + final VectorSearchResult result = index.search( + randomVector(new Random(99), dimension), 10 + ); + assertEquals(10, result.size()); + } + finally + { + index.close(); + } + } +} From a61da1b469f8101fc65a84e80fe416b866265b08 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Tue, 17 Feb 2026 17:39:38 +0100 Subject: [PATCH 04/15] Add concurrent stress tests for VectorIndex thread safety - Introduced stress test cases to validate VectorIndex under heavy concurrent operations. - Tests cover various configurations, including in-memory and on-disk setups with/without PQ compression and background tasks. - Ensured thread safety via assertions for no exceptions or deadlocks. - Included targeted eventual indexing and heavy load scenarios for robustness. 
--- .../VectorIndexConcurrentStressTest.java | 684 ++++++++++++++++++ 1 file changed, 684 insertions(+) create mode 100644 gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java new file mode 100644 index 00000000..5c6d8016 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java @@ -0,0 +1,684 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.eclipse.store.gigamap.types.GigaMap; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Concurrent stress tests for VectorIndex thread-safety. + *

+ * Multiple threads perform random add, update, remove, and search operations + * concurrently. Each test configuration exercises a different combination of: + *

+ * <ul>
+ *   <li>On-disk vs. in-memory</li>
+ *   <li>PQ compression</li>
+ *   <li>Eventual indexing</li>
+ *   <li>Parallel on-disk write</li>
+ *   <li>Background optimization</li>
+ *   <li>Background persistence</li>
+ * </ul>
+ * <p>
+ * The primary assertion is that no exceptions are thrown and no deadlocks occur + * (enforced by {@link Timeout}). + */ +class VectorIndexConcurrentStressTest +{ + record Document(String content, float[] embedding) {} + + static class ComputedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + + private static float[] randomVector(final Random random, final int dimension) + { + final float[] vector = new float[dimension]; + float norm = 0; + for(int i = 0; i < dimension; i++) + { + vector[i] = random.nextFloat() * 2 - 1; + norm += vector[i] * vector[i]; + } + norm = (float)Math.sqrt(norm); + for(int i = 0; i < dimension; i++) + { + vector[i] /= norm; + } + return vector; + } + + + // ==================== Configuration Combinations ==================== + + /** + * Describes one configuration combination. + */ + private record ConfigCombo( + String label, + boolean onDisk, + boolean pqCompression, + boolean eventual, + boolean parallel, + boolean backgroundOptimization, + boolean backgroundPersistence + ) {} + + /** + * Generates all valid configuration combinations. + *

+ * Constraints: + *

+ * <ul>
+ *   <li>PQ compression requires onDisk</li>
+ *   <li>Background persistence requires onDisk</li>
+ *   <li>parallel only meaningful when onDisk</li>
+ * </ul>
+ */ + private static List allCombos() + { + final List combos = new ArrayList<>(); + + // In-memory combos: onDisk=false → pq=false, persistence=false, parallel irrelevant + for(final boolean eventual : new boolean[]{false, true}) + { + for(final boolean optimization : new boolean[]{false, true}) + { + combos.add(new ConfigCombo( + "mem|eventual=" + eventual + "|opt=" + optimization, + false, false, eventual, false, optimization, false + )); + } + } + + // On-disk combos + for(final boolean pq : new boolean[]{false, true}) + { + for(final boolean eventual : new boolean[]{false, true}) + { + for(final boolean parallel : new boolean[]{false, true}) + { + for(final boolean optimization : new boolean[]{false, true}) + { + for(final boolean persistence : new boolean[]{false, true}) + { + combos.add(new ConfigCombo( + "disk|pq=" + pq + + "|eventual=" + eventual + + "|parallel=" + parallel + + "|opt=" + optimization + + "|persist=" + persistence, + true, pq, eventual, parallel, optimization, persistence + )); + } + } + } + } + } + + return combos; + } + + /** + * Builds a VectorIndexConfiguration from a combo. + */ + private static VectorIndexConfiguration buildConfig( + final ConfigCombo combo, + final int dimension, + final Path indexDir + ) + { + final VectorIndexConfiguration.Builder builder = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(combo.pqCompression() ? 
32 : 16) + .beamWidth(100) + .eventualIndexing(combo.eventual()); + + if(combo.onDisk()) + { + builder + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(combo.parallel()); + + if(combo.pqCompression()) + { + builder + .enablePqCompression(true) + .pqSubspaces(dimension / 4); + } + + if(combo.backgroundPersistence()) + { + builder + .persistenceIntervalMs(200) + .minChangesBetweenPersists(1); + } + } + + if(combo.backgroundOptimization()) + { + builder + .optimizationIntervalMs(200) + .minChangesBetweenOptimizations(5); + } + + return builder.build(); + } + + + // ==================== Stress Test Core ==================== + + /** + * Runs a concurrent stress test for a single configuration. + *

+ * 4 threads perform random add/update/remove/search operations concurrently. + * A pool of pre-seeded entities ensures ordinals exist for update/remove. + * + * @param combo the configuration combination + * @param indexDir directory for on-disk index (may be null for in-memory) + */ + private void runStressTest(final ConfigCombo combo, final Path indexDir) throws Exception + { + final int dimension = 64; + final int seedCount = 30; + final int opsPerThread = 60; + final int threadCount = 4; + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = buildConfig(combo, dimension, indexDir); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + // Seed the index with initial entities so updates/removes have targets + final Random seedRandom = new Random(42); + for(int i = 0; i < seedCount; i++) + { + gigaMap.add(new Document("seed_" + i, randomVector(seedRandom, dimension))); + } + + // For eventual indexing, drain the seed operations + if(combo.eventual()) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + } + + // If PQ compression, train before concurrent access + if(combo.pqCompression()) + { + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + } + + // Shared state for coordinating threads + final AtomicLong nextEntityId = new AtomicLong(seedCount); + final AtomicBoolean hasError = new AtomicBoolean(false); + final AtomicInteger completedOps = new AtomicInteger(0); + final List errors = java.util.Collections.synchronizedList(new ArrayList<>()); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(threadCount); + + final ExecutorService executor = Executors.newFixedThreadPool(threadCount); + + for(int t = 0; t < threadCount; t++) + { + 
final int threadId = t; + executor.submit(() -> + { + try + { + // Wait for all threads to be ready + startLatch.await(); + + final Random random = new Random(1000 + threadId); + + for(int op = 0; op < opsPerThread && !hasError.get(); op++) + { + try + { + final int action = random.nextInt(100); + + if(action < 30) + { + // 30%: ADD + final float[] vector = randomVector(random, dimension); + synchronized(gigaMap) + { + gigaMap.add(new Document( + "t" + threadId + "_" + op, vector + )); + } + } + else if(action < 45) + { + // 15%: UPDATE (set) — target a seed entity + final long targetId = random.nextInt(seedCount); + final float[] vector = randomVector(random, dimension); + synchronized(gigaMap) + { + try + { + gigaMap.set(targetId, new Document( + "updated_" + targetId, vector + )); + } + catch(final Exception e) + { + // Entity may have been removed by another thread — acceptable + } + } + } + else if(action < 55) + { + // 10%: REMOVE — target a seed entity + final long targetId = random.nextInt(seedCount); + synchronized(gigaMap) + { + try + { + gigaMap.removeById(targetId); + } + catch(final Exception e) + { + // Entity may already be removed — acceptable + } + } + } + else + { + // 45%: SEARCH + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 5); + // Result may be empty if all entities were removed — that's fine + assertNotNull(result); + } + + completedOps.incrementAndGet(); + } + catch(final Exception e) + { + errors.add(e); + hasError.set(true); + } + } + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + } + finally + { + doneLatch.countDown(); + } + }); + } + + // Release all threads simultaneously + startLatch.countDown(); + + // Wait for completion + assertTrue(doneLatch.await(60, TimeUnit.SECONDS), + "Threads should complete within timeout for: " + combo.label()); + + executor.shutdown(); + assertTrue(executor.awaitTermination(10, 
TimeUnit.SECONDS)); + + // Report errors + if(!errors.isEmpty()) + { + final StringBuilder sb = new StringBuilder(); + sb.append("Concurrent stress test failed for: ").append(combo.label()); + sb.append("\n").append(errors.size()).append(" error(s):"); + for(final Throwable err : errors) + { + sb.append("\n - ").append(err.getClass().getSimpleName()) + .append(": ").append(err.getMessage()); + } + fail(sb.toString()); + } + + // Verify the index is still consistent — drain and search + if(combo.eventual()) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + if(defaultIndex.indexingManager != null) + { + defaultIndex.indexingManager.drainQueue(); + } + } + + final VectorSearchResult finalResult = index.search( + randomVector(new Random(999), dimension), 5 + ); + assertNotNull(finalResult); + } + finally + { + index.close(); + } + } + + + // ==================== In-Memory Combinations ==================== + + @Test + @Timeout(value = 120, unit = TimeUnit.SECONDS) + void testConcurrentStress_InMemory() + { + final List combos = allCombos().stream() + .filter(c -> !c.onDisk()) + .toList(); + + assertFalse(combos.isEmpty(), "Should have in-memory combos"); + + final List passed = new ArrayList<>(); + for(final ConfigCombo combo : combos) + { + try + { + this.runStressTest(combo, null); + passed.add(combo.label()); + } + catch(final Exception e) + { + fail("Failed for combo: " + combo.label() + " — " + e.getMessage(), e); + } + } + + assertEquals(combos.size(), passed.size(), + "All in-memory combos should pass"); + } + + + // ==================== On-Disk without PQ Combinations ==================== + + @Test + @Timeout(value = 180, unit = TimeUnit.SECONDS) + void testConcurrentStress_OnDisk_NoPQ(@TempDir final Path tempDir) + { + final List combos = allCombos().stream() + .filter(c -> c.onDisk() && !c.pqCompression()) + .toList(); + + assertFalse(combos.isEmpty(), "Should have on-disk no-PQ combos"); + + final List passed = new ArrayList<>(); + 
int comboIndex = 0; + for(final ConfigCombo combo : combos) + { + final Path indexDir = tempDir.resolve("combo_" + comboIndex++); + try + { + this.runStressTest(combo, indexDir); + passed.add(combo.label()); + } + catch(final Exception e) + { + fail("Failed for combo: " + combo.label() + " — " + e.getMessage(), e); + } + } + + assertEquals(combos.size(), passed.size(), + "All on-disk no-PQ combos should pass"); + } + + + // ==================== On-Disk with PQ Combinations ==================== + + @Test + @Timeout(value = 180, unit = TimeUnit.SECONDS) + void testConcurrentStress_OnDisk_WithPQ(@TempDir final Path tempDir) + { + final List combos = allCombos().stream() + .filter(c -> c.onDisk() && c.pqCompression()) + .toList(); + + assertFalse(combos.isEmpty(), "Should have on-disk PQ combos"); + + final List passed = new ArrayList<>(); + int comboIndex = 0; + for(final ConfigCombo combo : combos) + { + final Path indexDir = tempDir.resolve("pq_combo_" + comboIndex++); + try + { + this.runStressTest(combo, indexDir); + passed.add(combo.label()); + } + catch(final Exception e) + { + fail("Failed for combo: " + combo.label() + " — " + e.getMessage(), e); + } + } + + assertEquals(combos.size(), passed.size(), + "All on-disk PQ combos should pass"); + } + + + // ==================== Focused Eventual Indexing Stress ==================== + + /** + * Focused test: heavier load with eventual indexing enabled. + * More operations per thread to stress the background queue. 
+ */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEventualIndexingHeavyConcurrentLoad(@TempDir final Path tempDir) + throws Exception + { + final int dimension = 64; + final int seedCount = 50; + final int opsPerThread = 150; + final int threadCount = 6; + final Path indexDir = tempDir.resolve("heavy"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .eventualIndexing(true) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(10) + .persistenceIntervalMs(500) + .minChangesBetweenPersists(5) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + try + { + // Seed + final Random seedRandom = new Random(42); + for(int i = 0; i < seedCount; i++) + { + gigaMap.add(new Document("seed_" + i, randomVector(seedRandom, dimension))); + } + + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + defaultIndex.indexingManager.drainQueue(); + + final AtomicBoolean hasError = new AtomicBoolean(false); + final List errors = java.util.Collections.synchronizedList(new ArrayList<>()); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(threadCount); + + final ExecutorService executor = Executors.newFixedThreadPool(threadCount); + + for(int t = 0; t < threadCount; t++) + { + final int threadId = t; + executor.submit(() -> + { + try + { + startLatch.await(); + final Random random = new Random(2000 + threadId); + + for(int op = 0; op < opsPerThread && !hasError.get(); op++) + { + try + { + final int action = random.nextInt(100); + + if(action < 25) + { + // ADD + synchronized(gigaMap) + { 
+ gigaMap.add(new Document( + "t" + threadId + "_" + op, + randomVector(random, dimension) + )); + } + } + else if(action < 40) + { + // UPDATE + final long targetId = random.nextInt(seedCount); + synchronized(gigaMap) + { + try + { + gigaMap.set(targetId, new Document( + "upd_" + targetId, + randomVector(random, dimension) + )); + } + catch(final Exception ignored) {} + } + } + else if(action < 50) + { + // REMOVE + final long targetId = random.nextInt(seedCount); + synchronized(gigaMap) + { + try + { + gigaMap.removeById(targetId); + } + catch(final Exception ignored) {} + } + } + else + { + // SEARCH + final VectorSearchResult result = index.search( + randomVector(random, dimension), 5 + ); + assertNotNull(result); + } + } + catch(final Exception e) + { + errors.add(e); + hasError.set(true); + } + } + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + } + finally + { + doneLatch.countDown(); + } + }); + } + + startLatch.countDown(); + + assertTrue(doneLatch.await(60, TimeUnit.SECONDS), + "Heavy concurrent load should complete within timeout"); + + executor.shutdown(); + assertTrue(executor.awaitTermination(10, TimeUnit.SECONDS)); + + if(!errors.isEmpty()) + { + final StringBuilder sb = new StringBuilder("Heavy eventual indexing stress test failed:"); + for(final Throwable err : errors) + { + sb.append("\n - ").append(err.getClass().getSimpleName()) + .append(": ").append(err.getMessage()); + } + fail(sb.toString()); + } + + // Drain and verify final state + defaultIndex.indexingManager.drainQueue(); + + final VectorSearchResult finalResult = index.search( + randomVector(new Random(999), dimension), 5 + ); + assertNotNull(finalResult); + } + finally + { + index.close(); + } + } +} From 0806de71ba8bd8544c792cc9d81a875c96f2b692 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Tue, 17 Feb 2026 17:58:18 +0100 Subject: [PATCH 05/15] Add performance tests for synchronous vs eventual indexing in VectorIndex - Introduced test cases to evaluate 
insertion performance (single and batch adds) between synchronous and eventual indexing modes. - Verified search quality to ensure correctness of deferred graph indexing. - Included detailed metrics on caller-visible speedup and indexing throughput. --- .../jvector/VectorIndexPerformanceTest.java | 369 ++++++++++++++++++ 1 file changed, 369 insertions(+) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java index 27dec828..fe651b85 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java @@ -666,4 +666,373 @@ private static long max(final long[] values) } return result; } + + + // ==================== Eventual Indexing Performance ==================== + + /** + * Performance test comparing mass insertion with and without eventual indexing. + *

+ * Eventual indexing defers HNSW graph construction to a background thread, + * so the caller-visible insertion time should be significantly lower since + * it only pays for vectorStore update + queue enqueue instead of the + * expensive {@code addGraphNode()} call. + *

+ * Both modes are measured with: + *

+ * <ul>
+ *   <li>Single-entity adds via {@code gigaMap.add()}</li>
+ *   <li>Batch adds via {@code gigaMap.addAll()}</li>
+ * </ul>
+ *

+ * After insertion, eventual mode is drained and both indices are verified + * for search quality (recall) to confirm the deferred graph is correct. + */ + @Test + void testEventualVsSynchronousInsertionPerformance() + { + final int vectorCount = 10_000; + final int dimension = 128; + final int searchIterations = 200; + final int k = 10; + final int batchSize = 1_000; + final int iterations = 3; + + System.err.println("=== Eventual vs. Synchronous Indexing Performance ==="); + System.err.println("Vector count: " + vectorCount); + System.err.println("Dimension: " + dimension); + System.err.println("Batch size: " + batchSize); + System.err.println("Iterations: " + iterations); + System.err.println(); + + // Pre-generate all vectors for fair comparison + System.err.print("Generating vectors... "); + final Random random = new Random(42); + final List documents = new ArrayList<>(vectorCount); + for(int i = 0; i < vectorCount; i++) + { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + System.err.println("done."); + + // Pre-generate query vectors + final float[][] queryVectors = new float[searchIterations][]; + final Random queryRandom = new Random(999); + for(int i = 0; i < searchIterations; i++) + { + queryVectors[i] = randomVector(queryRandom, dimension); + } + + // ========== SINGLE ADD: synchronous vs. 
eventual ========== + System.err.println(); + System.err.println("--- Single Add (gigaMap.add) ---"); + + final long[] syncSingleTimes = new long[iterations]; + final long[] eventualSingleTimes = new long[iterations]; + final long[] eventualSingleDrainTimes = new long[iterations]; + + for(int iter = 0; iter < iterations; iter++) + { + // Synchronous + syncSingleTimes[iter] = this.measureSingleAdd(documents, dimension, false); + // Eventual + final long[] eventualResult = this.measureSingleAddEventual(documents, dimension); + eventualSingleTimes[iter] = eventualResult[0]; + eventualSingleDrainTimes[iter] = eventualResult[1]; + } + + System.err.println(); + System.err.println(" Single Add Results:"); + System.err.printf(" Synchronous: avg=%,d ms min=%,d ms max=%,d ms%n", + average(syncSingleTimes), min(syncSingleTimes), max(syncSingleTimes)); + System.err.printf(" Eventual (add): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualSingleTimes), min(eventualSingleTimes), max(eventualSingleTimes)); + System.err.printf(" Eventual (drain): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualSingleDrainTimes), min(eventualSingleDrainTimes), max(eventualSingleDrainTimes)); + + if(average(syncSingleTimes) > 0 && average(eventualSingleTimes) > 0) + { + System.err.printf(" Caller-visible speedup: %.2fx%n", + (double)average(syncSingleTimes) / average(eventualSingleTimes)); + System.err.printf(" Total (add+drain) vs sync: %.2fx%n", + (double)average(syncSingleTimes) / + (average(eventualSingleTimes) + average(eventualSingleDrainTimes))); + } + + // ========== BATCH ADD: synchronous vs. 
eventual ========== + System.err.println(); + System.err.println("--- Batch Add (gigaMap.addAll, batch=" + batchSize + ") ---"); + + final long[] syncBatchTimes = new long[iterations]; + final long[] eventualBatchTimes = new long[iterations]; + final long[] eventualBatchDrainTimes = new long[iterations]; + + for(int iter = 0; iter < iterations; iter++) + { + // Synchronous + syncBatchTimes[iter] = this.measureBatchAdd(documents, dimension, batchSize, false); + // Eventual + final long[] eventualResult = this.measureBatchAddEventual(documents, dimension, batchSize); + eventualBatchTimes[iter] = eventualResult[0]; + eventualBatchDrainTimes[iter] = eventualResult[1]; + } + + System.err.println(); + System.err.println(" Batch Add Results:"); + System.err.printf(" Synchronous: avg=%,d ms min=%,d ms max=%,d ms%n", + average(syncBatchTimes), min(syncBatchTimes), max(syncBatchTimes)); + System.err.printf(" Eventual (add): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualBatchTimes), min(eventualBatchTimes), max(eventualBatchTimes)); + System.err.printf(" Eventual (drain): avg=%,d ms min=%,d ms max=%,d ms%n", + average(eventualBatchDrainTimes), min(eventualBatchDrainTimes), max(eventualBatchDrainTimes)); + + if(average(syncBatchTimes) > 0 && average(eventualBatchTimes) > 0) + { + System.err.printf(" Caller-visible speedup: %.2fx%n", + (double)average(syncBatchTimes) / average(eventualBatchTimes)); + System.err.printf(" Total (add+drain) vs sync: %.2fx%n", + (double)average(syncBatchTimes) / + (average(eventualBatchTimes) + average(eventualBatchDrainTimes))); + } + + // ========== SEARCH QUALITY VERIFICATION ========== + System.err.println(); + System.err.println("--- Search Quality Verification ---"); + + this.verifySearchQuality(documents, dimension, queryVectors, k, false, "Synchronous"); + this.verifySearchQuality(documents, dimension, queryVectors, k, true, "Eventual"); + + System.err.println(); + System.err.println("=== Eventual Indexing Performance Complete 
==="); + } + + /** + * Measures single-entity add time (synchronous). + */ + private long measureSingleAdd( + final List documents, + final int dimension, + final boolean eventual + ) + { + final String mode = eventual ? "eventual" : "sync"; + System.err.printf(" [single/%s] ", mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(eventual) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final long start = System.nanoTime(); + for(final Document doc : documents) + { + gigaMap.add(doc); + } + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms (%,.0f vec/sec)%n", + elapsedMs, documents.size() / (elapsedMs / 1000.0)); + + return elapsedMs; + } + } + + /** + * Measures single-entity add time (eventual). Returns [addTime, drainTime]. 
+ */ + private long[] measureSingleAddEventual( + final List documents, + final int dimension + ) + { + System.err.print(" [single/eventual] "); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(true) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + final long addStart = System.nanoTime(); + for(final Document doc : documents) + { + gigaMap.add(doc); + } + final long addMs = (System.nanoTime() - addStart) / 1_000_000; + + final long drainStart = System.nanoTime(); + defaultIndex.indexingManager.drainQueue(); + final long drainMs = (System.nanoTime() - drainStart) / 1_000_000; + + System.err.printf("add=%,d ms drain=%,d ms total=%,d ms (%,.0f vec/sec add-visible)%n", + addMs, drainMs, addMs + drainMs, + documents.size() / (addMs / 1000.0)); + + return new long[]{addMs, drainMs}; + } + } + + /** + * Measures batch add time (synchronous or eventual). + */ + private long measureBatchAdd( + final List documents, + final int dimension, + final int batchSize, + final boolean eventual + ) + { + final String mode = eventual ? 
"eventual" : "sync"; + System.err.printf(" [batch/%s] ", mode); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(eventual) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final long start = System.nanoTime(); + for(int i = 0; i < documents.size(); i += batchSize) + { + final int end = Math.min(i + batchSize, documents.size()); + gigaMap.addAll(documents.subList(i, end)); + } + final long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + System.err.printf("%,d ms (%,.0f vec/sec)%n", + elapsedMs, documents.size() / (elapsedMs / 1000.0)); + + return elapsedMs; + } + } + + /** + * Measures batch add time (eventual). Returns [addTime, drainTime]. 
+ */ + private long[] measureBatchAddEventual( + final List documents, + final int dimension, + final int batchSize + ) + { + System.err.print(" [batch/eventual] "); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(true) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + + final long addStart = System.nanoTime(); + for(int i = 0; i < documents.size(); i += batchSize) + { + final int end = Math.min(i + batchSize, documents.size()); + gigaMap.addAll(documents.subList(i, end)); + } + final long addMs = (System.nanoTime() - addStart) / 1_000_000; + + final long drainStart = System.nanoTime(); + defaultIndex.indexingManager.drainQueue(); + final long drainMs = (System.nanoTime() - drainStart) / 1_000_000; + + System.err.printf("add=%,d ms drain=%,d ms total=%,d ms (%,.0f vec/sec add-visible)%n", + addMs, drainMs, addMs + drainMs, + documents.size() / (addMs / 1000.0)); + + return new long[]{addMs, drainMs}; + } + } + + /** + * Verifies search quality (recall) for a given mode, to confirm eventual + * indexing produces the same graph quality as synchronous indexing. 
+ */ + private void verifySearchQuality( + final List documents, + final int dimension, + final float[][] queryVectors, + final int k, + final boolean eventual, + final String label + ) + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .eventualIndexing(eventual) + .build(); + + try(final VectorIndex index = vectorIndices.add( + "embeddings", config, new DocumentVectorizer() + )) + { + gigaMap.addAll(documents); + + if(eventual) + { + ((VectorIndex.Default)index).indexingManager.drainQueue(); + } + + int totalResults = 0; + int fullResults = 0; + for(final float[] query : queryVectors) + { + final VectorSearchResult result = index.search(query, k); + totalResults++; + if(result.size() == k) + { + fullResults++; + } + } + + System.err.printf(" %s: %d/%d queries returned full %d results (%.1f%%)%n", + label, fullResults, totalResults, k, + 100.0 * fullResults / totalResults); + } + } } From 556a071b7ff22784d6a17108e5f9c8dbd1904547 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Tue, 17 Feb 2026 18:14:27 +0100 Subject: [PATCH 06/15] Refactor BackgroundIndexingManager to delegate operation handling via execute() - Replaced the applyOperation method with per-operation execute() implementations in IndexingOperation types. - Simplified indexing logic by encapsulating operation-specific behavior within each record. - Improved maintainability and reduced duplication in BackgroundIndexingManager. 
--- .../jvector/BackgroundIndexingManager.java | 69 +++++++++++-------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java index 231642f5..d9db3655 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java @@ -94,25 +94,59 @@ sealed interface IndexingOperation IndexingOperation.Remove, IndexingOperation.DrainBarrier { + public void execute(Callback callback); + + /** * Add a node to the HNSW graph. */ - record Add(int ordinal, float[] vector) implements IndexingOperation {} + record Add(int ordinal, float[] vector) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphAdd(this.ordinal, this.vector); + callback.markDirtyForBackgroundManagers(1); + } + } /** * Update a node in the HNSW graph (delete + re-add). */ - record Update(int ordinal, float[] vector) implements IndexingOperation {} + record Update(int ordinal, float[] vector) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphUpdate(this.ordinal, this.vector); + callback.markDirtyForBackgroundManagers(1); + } + } /** * Remove a node from the HNSW graph. */ - record Remove(int ordinal) implements IndexingOperation {} + record Remove(int ordinal) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphRemove(this.ordinal); + callback.markDirtyForBackgroundManagers(1); + } + } /** * Sentinel operation for drainQueue() — signals the worker to release the latch. 
*/ - record DrainBarrier(CountDownLatch latch) implements IndexingOperation {} + record DrainBarrier(CountDownLatch latch) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + this.latch().countDown(); + } + } } @@ -276,7 +310,7 @@ private void workerLoop() try { final IndexingOperation op = this.queue.take(); - this.applyOperation(op); + op.execute(this.callback); } catch(final InterruptedException e) { @@ -293,31 +327,6 @@ private void workerLoop() } } - /** - * Applies a single indexing operation via the callback. - */ - private void applyOperation(final IndexingOperation op) - { - if(op instanceof IndexingOperation.Add add) - { - this.callback.applyGraphAdd(add.ordinal(), add.vector()); - this.callback.markDirtyForBackgroundManagers(1); - } - else if(op instanceof IndexingOperation.Update update) - { - this.callback.applyGraphUpdate(update.ordinal(), update.vector()); - this.callback.markDirtyForBackgroundManagers(1); - } - else if(op instanceof IndexingOperation.Remove remove) - { - this.callback.applyGraphRemove(remove.ordinal()); - this.callback.markDirtyForBackgroundManagers(1); - } - else if(op instanceof IndexingOperation.DrainBarrier barrier) - { - barrier.latch().countDown(); - } - } } } From 4e253b474d382828da5a9b98f34f5ccdb24bef8c Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 09:46:15 +0100 Subject: [PATCH 07/15] Update default value for parallel on-disk write to false in VectorIndexConfiguration --- .../store/gigamap/jvector/VectorIndexConfiguration.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java index 679cb447..c1fd4428 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java +++ 
b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndexConfiguration.java @@ -478,7 +478,7 @@ public default boolean backgroundOptimization() *

* Only applies when {@link #onDisk()} is true. * - * @return true if parallel on-disk writing is enabled (default: true) + * @return true if parallel on-disk writing is enabled (default: false) * @see #onDisk() */ public boolean parallelOnDiskWrite(); @@ -1023,7 +1023,7 @@ public static class Default implements Builder this.optimizationIntervalMs = 0; // 0 = disabled this.minChangesBetweenOptimizations = 1000; this.optimizeOnShutdown = false; - this.parallelOnDiskWrite = true; + this.parallelOnDiskWrite = false; this.eventualIndexing = false; } From 66fbcadebffc91b0531f1d911e8158ae0b475d6f Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 12:01:49 +0100 Subject: [PATCH 08/15] Update tests for parallel on-disk write default change to false in VectorIndexConfiguration --- .../jvector/VectorIndexConfigurationTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index 03f11058..786b55b7 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -54,7 +54,7 @@ void testBuilderDefaults() assertEquals(0L, config.optimizationIntervalMs()); assertEquals(1000, config.minChangesBetweenOptimizations()); assertFalse(config.optimizeOnShutdown()); - assertTrue(config.parallelOnDiskWrite()); + assertFalse(config.parallelOnDiskWrite()); assertFalse(config.eventualIndexing()); } @@ -274,13 +274,13 @@ void testPqSubspacesZeroMeansAuto(@TempDir final Path tempDir) // ==================== Parallel On-Disk Write Tests ==================== @Test - void testParallelOnDiskWriteDefaultTrue() + void testParallelOnDiskWriteDefaultFalse() { final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() .dimension(64) .build(); - assertTrue(config.parallelOnDiskWrite()); + assertFalse(config.parallelOnDiskWrite()); } @Test @@ -366,18 +366,18 @@ void testParallelVsNonParallelWithCompression(@TempDir final Path tempDir) } @Test - void testFactoryMethodsDefaultToParallel(@TempDir final Path tempDir) + void testFactoryMethodsDefaultToSequential(@TempDir final Path tempDir) { final Path indexDir = tempDir.resolve("vectors"); final VectorIndexConfiguration medium = VectorIndexConfiguration.forMediumDataset(768, indexDir); - assertTrue(medium.parallelOnDiskWrite()); + assertFalse(medium.parallelOnDiskWrite()); final VectorIndexConfiguration large = VectorIndexConfiguration.forLargeDataset(768, indexDir); - assertTrue(large.parallelOnDiskWrite()); + assertFalse(large.parallelOnDiskWrite()); final VectorIndexConfiguration highPrecision = VectorIndexConfiguration.forHighPrecision(768, indexDir); - assertTrue(highPrecision.parallelOnDiskWrite()); + assertFalse(highPrecision.parallelOnDiskWrite()); } @Test From e7a933b76d998eb69b85d678200f2fabae041ef5 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 12:18:56 +0100 Subject: [PATCH 09/15] Refactor VectorIndex synchronization and builder operation handling - Replaced `persistenceLock` with `builderLock` for unified read/write access control over builder operations. - Introduced deferred operation handling for sync-mode mutations during cleanup phases. - Improved thread-safety for concurrent graph updates by coordinating via builder read/write locks. - Added `cleanupInProgress` flag and `deferredBuilderOps` queue to manage in-flight operations during cleanup and persistence tasks. - Removed redundant `synchronized(parentMap)` calls to avoid lock-ordering issues. 
--- .../store/gigamap/jvector/VectorIndex.java | 547 +++++++++++------- 1 file changed, 324 insertions(+), 223 deletions(-) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index 8280402a..56d7d907 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -35,6 +35,7 @@ import java.io.*; import java.util.*; +import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.locks.ReentrantReadWriteLock; /** @@ -638,10 +639,18 @@ static BinaryTypeHandler> provideTypeHandler() // Flag indicating graph was loaded from file (skip rebuild) private transient boolean graphLoadedFromFile; - // Read/write lock for concurrent search during persistence - // Read lock: allows concurrent searches - // Write lock: exclusive access during persistence - private transient ReentrantReadWriteLock persistenceLock; + // Read/write lock for builder operations. + // Read lock: concurrent searches and background-worker mutations + // Write lock: exclusive access for cleanup, persistence, removeAll, close + private transient ReentrantReadWriteLock builderLock; + + // When true, sync-mode mutations defer builder ops to avoid racing with cleanup(). + // cleanup()'s ForkJoinPool workers need the GigaMap monitor (for embedded vectorizers), + // so sync-mode mutations (which hold that monitor) cannot use builderLock — they use + // this flag instead. The synchronized(parentMap) barrier in optimize()/persistToDisk() + // ensures any in-flight mutation completes before cleanup begins. 
+ private transient volatile boolean cleanupInProgress; + private transient ConcurrentLinkedQueue deferredBuilderOps; /////////////////////////////////////////////////////////////////////////// @@ -673,8 +682,9 @@ static BinaryTypeHandler> provideTypeHandler() .build() ; - // Initialize persistence lock early (before ensureIndexInitialized) - this.persistenceLock = new ReentrantReadWriteLock(); + // Initialize builder lock early (before ensureIndexInitialized) + this.builderLock = new ReentrantReadWriteLock(); + this.deferredBuilderOps = new ConcurrentLinkedQueue<>(); this.ensureIndexInitialized(); } @@ -759,10 +769,14 @@ private void initializeIndex() { this.vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport(); - // Initialize persistence lock (always, for consistent locking semantics) - if(this.persistenceLock == null) + // Initialize builder lock (always, for consistent locking semantics) + if(this.builderLock == null) + { + this.builderLock = new ReentrantReadWriteLock(); + } + if(this.deferredBuilderOps == null) { - this.persistenceLock = new ReentrantReadWriteLock(); + this.deferredBuilderOps = new ConcurrentLinkedQueue<>(); } // Initialize PQ manager if compression enabled @@ -1059,36 +1073,34 @@ private io.github.jbellis.jvector.vector.VectorSimilarityFunction jvectorSimilar @Override public void internalAdd(final long entityId, final E entity) { + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
final int ordinal = toOrdinal(entityId); - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + this.ensureIndexInitialized(); - final float[] vector = this.vectorize(entity); + final float[] vector = this.vectorize(entity); - // Store based on vectorizer type - if(!this.isEmbedded()) - { - this.vectorStore.add(new VectorEntry(entityId, vector)); - } + // Store based on vectorizer type + if(!this.isEmbedded()) + { + this.vectorStore.add(new VectorEntry(entityId, vector)); + } - this.markStateChangeChildren(); + this.markStateChangeChildren(); - if(this.indexingManager != null) - { - // Defer graph update to background thread - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add(ordinal, vector)); - } - else - { - // Add to HNSW graph using entity ID as ordinal - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); + if(this.indexingManager != null) + { + // Defer graph update to background thread + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add(ordinal, vector)); + } + else + { + // Add to HNSW graph using entity ID as ordinal + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.executeOrDeferBuilderOp(() -> this.builder.addGraphNode(ordinal, vf)); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); - } + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); } } @@ -1113,39 +1125,38 @@ public void internalAddAll(final long firstEntityId, final Iterable @Override public void internalUpdate(final long entityId, final E replacedEntity, final E entity) { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.ensureIndexInitialized(); - final float[] vector = this.vectorize(entity); + final float[] vector = this.vectorize(entity); - final int ordinal = toOrdinal(entityId); + final int ordinal = toOrdinal(entityId); - // Update based on vectorizer type - if(!this.isEmbedded()) - { - this.vectorStore.set(entityId, new VectorEntry(entityId, vector)); - } + // Update based on vectorizer type + if(!this.isEmbedded()) + { + this.vectorStore.set(entityId, new VectorEntry(entityId, vector)); + } - this.markStateChangeChildren(); + this.markStateChangeChildren(); - if(this.indexingManager != null) - { - // Defer graph update to background thread - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Update(ordinal, vector)); - } - else + if(this.indexingManager != null) + { + // Defer graph update to background thread + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Update(ordinal, vector)); + } + else + { + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.executeOrDeferBuilderOp(() -> { this.builder.markNodeDeleted(ordinal); this.builder.removeDeletedNodes(); - - // Add to HNSW graph using entity ID as ordinal - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); this.builder.addGraphNode(ordinal, vf); + }); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); - } + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); } } @@ -1171,33 +1182,31 @@ private List collectVectors(final long firstEntityId, final Iterabl */ private void addVectorEntries(final List entries) { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.ensureIndexInitialized(); - if(!this.isEmbedded()) - { - this.vectorStore.addAll(entries); - } + if(!this.isEmbedded()) + { + this.vectorStore.addAll(entries); + } - this.markStateChangeChildren(); + this.markStateChangeChildren(); - if(this.indexingManager != null) - { - // Defer graph updates to background thread - entries.forEach(entry -> - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add( - toOrdinal(entry.sourceEntityId), entry.vector - )) - ); - } - else - { - this.addGraphNodesSequential(entries); + if(this.indexingManager != null) + { + // Defer graph updates to background thread + entries.forEach(entry -> + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add( + toOrdinal(entry.sourceEntityId), entry.vector + )) + ); + } + else + { + this.executeOrDeferBuilderOp(() -> this.addGraphNodesSequential(entries)); - // Mark dirty for background managers (with count for debouncing) - this.markDirtyForBackgroundManagers(entries.size()); - } + // Mark dirty for background managers (with count for debouncing) + this.markDirtyForBackgroundManagers(entries.size()); } } @@ -1233,73 +1242,70 @@ private void addGraphNodesSequential(final List entries) @Override public void internalRemove(final long entityId, final E entity) { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.ensureIndexInitialized(); - final int ordinal = toOrdinal(entityId); - if(!this.isEmbedded()) - { - this.vectorStore.removeById(entityId); - } + final int ordinal = toOrdinal(entityId); + if(!this.isEmbedded()) + { + this.vectorStore.removeById(entityId); + } - this.markStateChangeChildren(); + this.markStateChangeChildren(); - if(this.indexingManager != null) - { - // Defer graph update to background thread - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Remove(ordinal)); - } - else - { - this.builder.markNodeDeleted(ordinal); + if(this.indexingManager != null) + { + // Defer graph update to background thread + this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Remove(ordinal)); + } + else + { + this.executeOrDeferBuilderOp(() -> this.builder.markNodeDeleted(ordinal)); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); - } + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); } } @Override public void internalRemoveAll() { - // Acquire write lock to ensure no concurrent persistToDisk() Phase 2 is running. + // Acquire write lock to ensure no concurrent persistToDisk() Phase 2, + // search, or background worker mutation is running. // closeInternalResources() destroys the graph and disk manager, which would - // corrupt a write in progress. - this.persistenceLock.writeLock().lock(); + // corrupt any in-flight operation. + // No synchronized(parentMap) needed — called from GigaMap's synchronized methods. 
+ this.builderLock.writeLock().lock(); try { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + this.ensureIndexInitialized(); - if(!this.isEmbedded()) - { - this.vectorStore.removeAll(); - } + if(!this.isEmbedded()) + { + this.vectorStore.removeAll(); + } - // Discard and shutdown indexing manager (pending ops are stale) - this.shutdownIndexingManager(false); + // Discard and shutdown indexing manager (pending ops are stale) + this.shutdownIndexingManager(false); - // Shutdown optimization manager before closing - this.shutdownOptimizationManager(false); + // Shutdown optimization manager before closing + this.shutdownOptimizationManager(false); - // Shutdown persistence manager before closing - this.shutdownPersistenceManager(false); + // Shutdown persistence manager before closing + this.shutdownPersistenceManager(false); - this.closeInternalResources(); + this.closeInternalResources(); - // Reinitialize the index (this will also restart background managers if configured) - this.initializeIndex(); - this.markStateChangeChildren(); + // Reinitialize the index (this will also restart background managers if configured) + this.initializeIndex(); + this.markStateChangeChildren(); - // Mark dirty for background managers - this.markDirtyForBackgroundManagers(1); - } + // Mark dirty for background managers + this.markDirtyForBackgroundManagers(1); } finally { - this.persistenceLock.writeLock().unlock(); + this.builderLock.writeLock().unlock(); } } @@ -1308,34 +1314,34 @@ public VectorSearchResult search(final float[] queryVector, final int k) { this.validateDimension(queryVector); - // Acquire read lock for concurrent search during persistence - this.persistenceLock.readLock().lock(); + // Acquire read lock — blocks during cleanup/persistence/removeAll/close, + // allows concurrent searches and GigaMap mutations. 
+ // No synchronized(parentMap) — avoids lock-ordering deadlock with + // internalRemoveAll (which holds the GigaMap monitor and needs the write lock). + this.builderLock.readLock().lock(); try { - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); + this.ensureIndexInitialized(); - final VectorFloat query = this.vectorTypeSupport.createFloatVector(queryVector); + final VectorFloat query = this.vectorTypeSupport.createFloatVector(queryVector); - // Choose search strategy based on index mode - final SearchResult result; - final boolean diskLoaded = this.diskManager != null && this.diskManager.isLoaded(); - if(diskLoaded && this.diskManager.getDiskIndex() != null) - { - result = this.searchDiskIndex(query, k); - } - else - { - result = this.searchInMemoryIndex(query, k); - } - - return this.convertSearchResult(result); + // Choose search strategy based on index mode + final SearchResult result; + final boolean diskLoaded = this.diskManager != null && this.diskManager.isLoaded(); + if(diskLoaded && this.diskManager.getDiskIndex() != null) + { + result = this.searchDiskIndex(query, k); + } + else + { + result = this.searchInMemoryIndex(query, k); } + + return this.convertSearchResult(result); } finally { - this.persistenceLock.readLock().unlock(); + this.builderLock.readLock().unlock(); } } @@ -1438,22 +1444,45 @@ public void optimize() } final GraphIndexBuilder capturedBuilder; - synchronized(this.parentMap()) - { - this.ensureIndexInitialized(); - capturedBuilder = this.builder; - } - // cleanup() uses ForkJoinPool internally — must be outside - // synchronized(parentMap) to avoid deadlock with embedded vectorizers - // whose worker threads call parentMap.get(). - if(capturedBuilder != null) + + // Signal sync-mode mutations to defer builder ops during cleanup. + this.cleanupInProgress = true; + try { - capturedBuilder.cleanup(); + // Barrier: any in-flight GigaMap mutation (which holds the GigaMap monitor) + // will complete before we proceed. 
New mutations see the flag and defer. + synchronized(this.parentMap()) + { + this.ensureIndexInitialized(); + capturedBuilder = this.builder; + } + + // cleanup() uses ForkJoinPool internally — must be outside + // synchronized(parentMap) to avoid deadlock with embedded vectorizers + // whose worker threads call parentMap.get(). + if(capturedBuilder != null) + { + // Write lock blocks background worker mutations (readLock) and searches. + this.builderLock.writeLock().lock(); + try + { + capturedBuilder.cleanup(); + } + finally + { + this.builderLock.writeLock().unlock(); + } + } } - synchronized(this.parentMap()) + finally { - this.markStateChangeChildren(); + this.cleanupInProgress = false; } + + // Apply any deferred sync-mode mutations now that cleanup is done. + this.drainDeferredBuilderOps(); + + this.markStateChangeChildren(); } @Override @@ -1470,73 +1499,86 @@ public void persistToDisk() this.indexingManager.drainQueue(); } - // Acquire write lock for exclusive access during persistence. - // This blocks searches and other persist/removeAll/close calls. - this.persistenceLock.writeLock().lock(); + // Signal sync-mode mutations to defer builder ops during cleanup + disk write. + this.cleanupInProgress = true; try { - // Captured references for Phase 2 (disk write outside synchronized block) - final OnHeapGraphIndex capturedIndex ; - final RandomAccessVectorValues capturedRavv ; - final PQCompressionManager capturedPqMgr ; - final DiskIndexManager capturedDiskMgr; - - final GraphIndexBuilder capturedBuilder; - - // Phase 1: Exclusive prep inside synchronized(parentMap). - // Disk manager init and reference capture. - synchronized(this.parentMap()) + // Acquire write lock for exclusive access during persistence. + // This blocks searches, background worker mutations, removeAll, and close. 
+ this.builderLock.writeLock().lock(); + try { - this.ensureIndexInitialized(); - - // If we have an in-memory builder, prepare for disk write - if(this.builder == null || this.index == null) - { - return; - } - - // Initialize disk manager if needed - if(this.diskManager == null) + // Captured references for Phase 2 (disk write outside synchronized block) + final OnHeapGraphIndex capturedIndex ; + final RandomAccessVectorValues capturedRavv ; + final PQCompressionManager capturedPqMgr ; + final DiskIndexManager capturedDiskMgr; + + final GraphIndexBuilder capturedBuilder; + + // Phase 1: Barrier + reference capture inside synchronized(parentMap). + // The barrier ensures any in-flight GigaMap mutation completes. + // New mutations see cleanupInProgress=true and defer. + synchronized(this.parentMap()) { - this.diskManager = new DiskIndexManager.Default( - this, - this.name, - this.configuration.indexDirectory(), - this.configuration.dimension(), - this.configuration.maxDegree(), - this.configuration.parallelOnDiskWrite() + this.ensureIndexInitialized(); + + // If we have an in-memory builder, prepare for disk write + if(this.builder == null || this.index == null) + { + return; + } + + // Initialize disk manager if needed + if(this.diskManager == null) + { + this.diskManager = new DiskIndexManager.Default( + this, + this.name, + this.configuration.indexDirectory(), + this.configuration.dimension(), + this.configuration.maxDegree(), + this.configuration.parallelOnDiskWrite() + ); + } + + // Capture references for use outside the synchronized block. + // The parentMap monitor is released before cleanup and disk write + // so that worker threads (ForkJoinPool in cleanup, disk writer) + // can freely call parentMap.get() without deadlocking. 
+ capturedBuilder = this.builder; + capturedIndex = this.index; + capturedRavv = new NullSafeVectorValues( + this.createVectorValues(), this.configuration.dimension(), this.vectorTypeSupport ); + capturedPqMgr = this.pqManager; + capturedDiskMgr = this.diskManager; } - // Capture references for use outside the synchronized block. - // The parentMap monitor is released before cleanup and disk write - // so that worker threads (ForkJoinPool in cleanup and disk writer) - // can freely call parentMap.get() without deadlocking. - capturedBuilder = this.builder; - capturedIndex = this.index; - capturedRavv = new NullSafeVectorValues( - this.createVectorValues(), this.configuration.dimension(), this.vectorTypeSupport - ); - capturedPqMgr = this.pqManager; - capturedDiskMgr = this.diskManager; + // Phase 2: Cleanup and disk write outside synchronized(parentMap). + // builderLock.writeLock() is still held, blocking searches, + // background worker mutations, removeAll, and close. + // parentMap monitor is released, so ForkJoinPool workers and + // disk writer threads can call parentMap.get() for embedded vectors. + capturedBuilder.cleanup(); + capturedDiskMgr.writeIndex(capturedIndex, capturedRavv, capturedPqMgr); + } + catch(final IOException ioe) + { + throw new IORuntimeException(ioe); + } + finally + { + this.builderLock.writeLock().unlock(); } - - // Phase 2: Cleanup and disk write outside synchronized(parentMap). - // persistenceLock.writeLock() is still held, blocking searches, - // removeAll, and close. But parentMap monitor is released, so - // worker threads (ForkJoinPool in cleanup, disk writer threads) - // can call parentMap.get() for embedded vectors. 
- capturedBuilder.cleanup(); - capturedDiskMgr.writeIndex(capturedIndex, capturedRavv, capturedPqMgr); - } - catch(final IOException ioe) - { - throw new IORuntimeException(ioe); } finally { - this.persistenceLock.writeLock().unlock(); + this.cleanupInProgress = false; } + + // Apply any deferred sync-mode mutations now that cleanup + persistence is done. + this.drainDeferredBuilderOps(); } @Override @@ -1585,19 +1627,16 @@ public void close() // Shutdown persistence manager (may persist pending changes) this.shutdownPersistenceManager(this.configuration.persistOnShutdown()); - // Acquire write lock to ensure no concurrent persistToDisk() Phase 2 is running. + // Acquire write lock to ensure no concurrent search or persistToDisk() is running. // closeInternalResources() destroys the graph and disk manager. - this.persistenceLock.writeLock().lock(); + this.builderLock.writeLock().lock(); try { - synchronized(this.parentMap()) - { - this.closeInternalResources(); - } + this.closeInternalResources(); } finally { - this.persistenceLock.writeLock().unlock(); + this.builderLock.writeLock().unlock(); } } @@ -1771,23 +1810,85 @@ public long getExpectedVectorCount() @Override public void applyGraphAdd(final int ordinal, final float[] vector) { - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); + // Called from the background indexing worker thread (not from GigaMap's + // synchronized methods), so we use builderLock.readLock() to coordinate + // with cleanup (writeLock). 
+ this.builderLock.readLock().lock(); + try + { + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + } + finally + { + this.builderLock.readLock().unlock(); + } } @Override public void applyGraphUpdate(final int ordinal, final float[] vector) { - this.builder.markNodeDeleted(ordinal); - this.builder.removeDeletedNodes(); - final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); - this.builder.addGraphNode(ordinal, vf); + this.builderLock.readLock().lock(); + try + { + this.builder.markNodeDeleted(ordinal); + this.builder.removeDeletedNodes(); + final VectorFloat vf = this.vectorTypeSupport.createFloatVector(vector); + this.builder.addGraphNode(ordinal, vf); + } + finally + { + this.builderLock.readLock().unlock(); + } } @Override public void applyGraphRemove(final int ordinal) { - this.builder.markNodeDeleted(ordinal); + this.builderLock.readLock().lock(); + try + { + this.builder.markNodeDeleted(ordinal); + } + finally + { + this.builderLock.readLock().unlock(); + } + } + + + // ================================================================ + // Builder operation deferral helpers + // ================================================================ + + /** + * Executes a builder operation immediately, or defers it if cleanup is in progress. + * Used by sync-mode mutations (called from GigaMap's synchronized methods) which + * cannot acquire builderLock without risking deadlock with embedded vectorizers. + */ + private void executeOrDeferBuilderOp(final Runnable op) + { + if(this.cleanupInProgress) + { + this.deferredBuilderOps.add(op); + } + else + { + op.run(); + } + } + + /** + * Drains and executes all deferred builder operations. + * Called after cleanup completes (cleanupInProgress is already false). 
+ */ + private void drainDeferredBuilderOps() + { + Runnable op; + while((op = this.deferredBuilderOps.poll()) != null) + { + op.run(); + } } } From 26c001a3ccc6af59308c6547861f894aa8aaeb72 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 12:28:44 +0100 Subject: [PATCH 10/15] Documented `eventualIndexing` and `parallelOnDiskWrite` features in VectorIndexConfiguration with examples and configuration details. --- .../pages/indexing/jvector/configuration.adoc | 53 +++++++++++++++++++ gigamap/jvector/README.md | 41 ++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc b/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc index 8709c8c4..326bc71f 100644 --- a/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc +++ b/docs/modules/gigamap/pages/indexing/jvector/configuration.adoc @@ -141,6 +141,10 @@ For datasets that exceed available memory, enable on-disk storage to use memory- |`pqSubspaces` |`0` |Number of PQ subspaces (0 = auto: dimension/4). + +|`parallelOnDiskWrite` +|`false` +|Use parallel direct buffers and multiple worker threads for on-disk index writing. Speeds up persistence for large indices but uses more resources. Only applies when `onDisk=true`. |=== === Example @@ -157,6 +161,55 @@ VectorIndexConfiguration config = VectorIndexConfiguration.builder() .build(); ---- +== Eventual Indexing + +Enable eventual indexing to defer expensive HNSW graph mutations to a background thread, reducing mutation latency at the cost of eventual search consistency. + +[options="header",cols="1,1,3"] +|=== +|Parameter |Default |Description + +|`eventualIndexing` +|`false` +|Defer HNSW graph mutations (add, update, remove) to a background thread. The vector store is updated synchronously, but graph construction happens asynchronously. Search results may not immediately reflect the most recent mutations. 
+|=== + +When enabled: + +* The vector store is always updated synchronously (no data loss). +* HNSW graph mutations are queued and applied by a single background worker thread. +* The queue is automatically drained before `optimize()`, `persistToDisk()`, and `close()`. + +=== Example + +[source, java] +---- +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + .eventualIndexing(true) + .build(); +---- + +== Parallel On-Disk Writes + +When on-disk storage is enabled, persistence can optionally use parallel direct buffers and multiple worker threads (one per available processor) to write the index concurrently. This can significantly speed up persistence for large indices. + +This is disabled by default, as sequential single-threaded writing is preferred in resource-constrained environments or for smaller indices. + +=== Example + +[source, java] +---- +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(Path.of("/data/vectors")) + .parallelOnDiskWrite(true) + .build(); +---- + == Background Persistence Enable automatic asynchronous persistence to avoid blocking operations during writes. 
diff --git a/gigamap/jvector/README.md b/gigamap/jvector/README.md index 43c4f099..d68ddc59 100644 --- a/gigamap/jvector/README.md +++ b/gigamap/jvector/README.md @@ -10,6 +10,8 @@ A Java library that integrates [JVector](https://github.com/datastax/jvector) (h - **PQ Compression**: Product Quantization for reduced memory footprint - **Background Persistence**: Automatic asynchronous persistence at configurable intervals - **Background Optimization**: Periodic graph cleanup for improved query performance +- **Eventual Indexing**: Deferred graph mutations via background thread for reduced write latency +- **Parallel On-Disk Writes**: Multi-threaded index persistence for large on-disk indices - **Lazy Entity Access**: Search results provide direct access to entities without additional lookups - **Stream API**: Java Stream support for search results - **GigaMap Integration**: Seamlessly integrates with GigaMap's index system @@ -163,6 +165,13 @@ List topDocs = result.stream() | `indexDirectory` | `null` | Directory for index files (required if `onDisk=true`) | | `enablePqCompression` | `false` | Enable Product Quantization compression | | `pqSubspaces` | `0` | Number of PQ subspaces (0 = auto: dimension/4) | +| `parallelOnDiskWrite` | `false` | Use parallel direct buffers and multiple worker threads for on-disk index writing. Speeds up persistence for large indices but uses more resources. Only applies when `onDisk=true` | + +### Eventual Indexing + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `eventualIndexing` | `false` | Defer HNSW graph mutations to a background thread. The vector store is updated synchronously, but graph construction happens asynchronously. 
Reduces mutation latency at the cost of eventual search consistency | ### Background Persistence @@ -223,6 +232,38 @@ VectorIndexConfiguration config = VectorIndexConfiguration.builder() .build(); ``` +### Eventual Indexing + +For high-throughput systems where mutation latency matters more than immediate search consistency: + +```java +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + // Eventual indexing (graph mutations deferred to background thread) + .eventualIndexing(true) + .build(); +``` + +When enabled, the vector store is always updated synchronously (no data loss), but expensive HNSW graph mutations are queued and applied by a background worker thread. Search results may not immediately reflect the most recent mutations. The queue is automatically drained before `optimize()`, `persistToDisk()`, and `close()`. + +### Parallel On-Disk Writes + +For large on-disk indices where persistence speed is critical: + +```java +VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(768) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(Path.of("/data/vectors")) + // Parallel on-disk writing (multiple worker threads) + .parallelOnDiskWrite(true) + .build(); +``` + +When enabled, the on-disk graph writer uses parallel direct buffers and multiple worker threads (one per available processor) to write the index concurrently. This is disabled by default as sequential writing is preferred in resource-constrained environments or for smaller indices. + ### Manual Optimization and Persistence ```java From 52446e22d5fcd779aafeba5e15b052324bedab40 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 13:07:33 +0100 Subject: [PATCH 11/15] Document eventual indexing and parallel on-disk write features with examples and configuration guidance in VectorIndex JavaDoc. 
--- .../store/gigamap/jvector/VectorIndex.java | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index 56d7d907..804a412f 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -54,6 +54,8 @@ *

  • On-Disk Storage - Optional memory-mapped indices for large datasets
  • *
  • PQ Compression - Product Quantization for reduced memory footprint
  • *
  • Background Optimization - Automatic graph cleanup for improved performance
  • + *
  • Eventual Indexing - Deferred graph mutations via background thread for reduced write latency
  • + *
  • Parallel On-Disk Writes - Multi-threaded index persistence for large on-disk indices
  • * * *

    Basic Usage

    @@ -161,6 +163,37 @@ * .build(); * } * + *

    Eventual Indexing

+ * When enabled, expensive HNSW graph mutations (add, update, remove) are deferred to a background + * thread. The vector store is still updated synchronously, so no data is lost, but graph construction + * happens asynchronously. This reduces the latency of mutation operations at the cost of immediate + * consistency — search results may not immediately reflect the most recent mutations. + *

    + * The graph is automatically drained (all pending operations applied) before + * {@code optimize()}, {@code persistToDisk()}, and {@code close()}. + *

    {@code
    + * VectorIndexConfiguration config = VectorIndexConfiguration.builder()
    + *     .dimension(768)
    + *     .similarityFunction(VectorSimilarityFunction.COSINE)
    + *     .eventualIndexing(true)
    + *     .build();
    + * }
    + * + *

    Parallel On-Disk Writes

    + * When on-disk storage is enabled, persistence can optionally use parallel direct buffers and + * multiple worker threads (one per available processor) to write the index concurrently. This can + * significantly speed up persistence for large indices. Disabled by default, as sequential + * single-threaded writing is preferred in resource-constrained environments or for smaller indices. + *
    {@code
    + * VectorIndexConfiguration config = VectorIndexConfiguration.builder()
    + *     .dimension(768)
    + *     .similarityFunction(VectorSimilarityFunction.COSINE)
    + *     .onDisk(true)
    + *     .indexDirectory(Path.of("/data/vectors"))
    + *     .parallelOnDiskWrite(true)
    + *     .build();
    + * }
    + * *

    Search Methods

    *
    {@code
      * // Search by vector
    @@ -227,6 +260,8 @@
      *   
  • Search - Thread-safe, multiple concurrent searches allowed
  • *
  • Add/Remove - Thread-safe via GigaMap synchronization
  • *
  • Optimization - Briefly blocks add/remove/search during cleanup
  • + *
  • Eventual Indexing - Graph mutations are applied sequentially by a single + * background worker thread; vector store updates remain synchronous
  • * * *

    Limitations

    From 46b84c1b30cb907d73cd964cebfbbe214a61dfa1 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 13:09:53 +0100 Subject: [PATCH 12/15] Remove redundant default case check for unsupported similarity function in VectorIndex and add comment --- .../java/org/eclipse/store/gigamap/jvector/VectorIndex.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index 804a412f..f0a80306 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -1096,12 +1096,12 @@ public VectorIndexConfiguration configuration() private io.github.jbellis.jvector.vector.VectorSimilarityFunction jvectorSimilarityFunction() { + // use switch not valueOf(name) to ensure compiler assistance when jvector enum changes return switch(this.configuration.similarityFunction()) { case EUCLIDEAN -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; case DOT_PRODUCT -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; case COSINE -> io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; - default -> throw new IllegalArgumentException("Unsupported similarity function: " + this.configuration.similarityFunction()); }; } From a52a11bca54a5e46fa8d26cf5359e8b463b0a42c Mon Sep 17 00:00:00 2001 From: fh-ms Date: Wed, 18 Feb 2026 13:10:03 +0100 Subject: [PATCH 13/15] Add braces to conditional statements in VectorEntry equals() for consistency --- .../java/org/eclipse/store/gigamap/jvector/VectorEntry.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java index 36be53d4..3c2d8923 100644 --- 
a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorEntry.java @@ -55,9 +55,13 @@ public Condition is(final Long key) public boolean equals(final Object obj) { if (obj == this) + { return true; + } if (obj == null || obj.getClass() != this.getClass()) + { return false; + } final VectorEntry that = (VectorEntry)obj; return this.sourceEntityId == that.sourceEntityId && Arrays.equals(this.vector, that.vector) From 8b66b99b883862c644986332a9275bc5ccbff629 Mon Sep 17 00:00:00 2001 From: fh-ms Date: Thu, 19 Feb 2026 12:03:03 +0100 Subject: [PATCH 14/15] Consolidate BackgroundIndexingManager, BackgroundOptimizationManager, and BackgroundPersistenceManager into unified BackgroundTaskManager. --- .../jvector/BackgroundIndexingManager.java | 332 ----------- .../BackgroundOptimizationManager.java | 239 -------- .../jvector/BackgroundPersistenceManager.java | 211 ------- .../jvector/BackgroundTaskManager.java | 558 ++++++++++++++++++ .../store/gigamap/jvector/VectorIndex.java | 216 +++---- .../VectorIndexConcurrentStressTest.java | 124 ++-- .../gigamap/jvector/VectorIndexDiskTest.java | 22 +- .../VectorIndexEventualIndexingTest.java | 253 +++----- .../jvector/VectorIndexPerformanceTest.java | 6 +- 9 files changed, 791 insertions(+), 1170 deletions(-) delete mode 100644 gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java delete mode 100644 gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java delete mode 100644 gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java create mode 100644 gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java 
b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java deleted file mode 100644 index d9db3655..00000000 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundIndexingManager.java +++ /dev/null @@ -1,332 +0,0 @@ -package org.eclipse.store.gigamap.jvector; - -/*- - * #%L - * EclipseStore GigaMap JVector - * %% - * Copyright (C) 2023 - 2026 MicroStream Software - * %% - * This program and the accompanying materials are made - * available under the terms of the Eclipse Public License 2.0 - * which is available at https://www.eclipse.org/legal/epl-2.0/ - * - * SPDX-License-Identifier: EPL-2.0 - * #L% - */ - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.LinkedBlockingQueue; - -/** - * Manages background graph indexing for a VectorIndex. - *

    - * When eventual indexing is enabled, graph mutations (add/update/remove) are - * queued and applied asynchronously by a single background worker thread. - * The vector store is still updated synchronously so that data is not lost, - * but the expensive HNSW graph operations are deferred. - *

    - * This trades immediate search consistency for reduced latency on mutation - * operations — search results may not immediately reflect the most recent - * mutations (eventual consistency). - *

    - * This manager handles: - *

      - *
    • Queuing of graph indexing operations (add, update, remove)
    • - *
    • Sequential application of operations by a single background thread
    • - *
    • Draining the queue (blocking until all pending operations are applied)
    • - *
    • Graceful shutdown with optional drain
    • - *
    - */ -interface BackgroundIndexingManager -{ - /** - * Enqueues an indexing operation for background processing. - * - * @param operation the operation to enqueue - */ - void enqueue(IndexingOperation operation); - - /** - * Blocks until all currently enqueued operations have been applied. - *

    - * This is used before {@code optimize()} and {@code persistToDisk()} to - * ensure the graph is complete before those operations proceed. - */ - void drainQueue(); - - /** - * Discards all pending operations without applying them. - *

    - * Used during {@code internalRemoveAll()} where pending operations - * refer to stale ordinals that are no longer valid. - */ - void discardQueue(); - - /** - * Shuts down the background indexing manager. - * - * @param drainPending if true, drain all pending operations before shutdown - */ - void shutdown(boolean drainPending); - - /** - * Returns the number of pending operations in the queue. - * Useful for monitoring and testing. - * - * @return the number of pending operations - */ - int getPendingCount(); - - - // ======================================================================== - // Indexing Operations - // ======================================================================== - - /** - * Sealed interface for indexing operations that can be queued. - */ - sealed interface IndexingOperation - permits IndexingOperation.Add, - IndexingOperation.Update, - IndexingOperation.Remove, - IndexingOperation.DrainBarrier - { - public void execute(Callback callback); - - - /** - * Add a node to the HNSW graph. - */ - record Add(int ordinal, float[] vector) implements IndexingOperation - { - @Override - public void execute(final Callback callback) - { - callback.applyGraphAdd(this.ordinal, this.vector); - callback.markDirtyForBackgroundManagers(1); - } - } - - /** - * Update a node in the HNSW graph (delete + re-add). - */ - record Update(int ordinal, float[] vector) implements IndexingOperation - { - @Override - public void execute(final Callback callback) - { - callback.applyGraphUpdate(this.ordinal, this.vector); - callback.markDirtyForBackgroundManagers(1); - } - } - - /** - * Remove a node from the HNSW graph. - */ - record Remove(int ordinal) implements IndexingOperation - { - @Override - public void execute(final Callback callback) - { - callback.applyGraphRemove(this.ordinal); - callback.markDirtyForBackgroundManagers(1); - } - } - - /** - * Sentinel operation for drainQueue() — signals the worker to release the latch. 
- */ - record DrainBarrier(CountDownLatch latch) implements IndexingOperation - { - @Override - public void execute(final Callback callback) - { - this.latch().countDown(); - } - } - } - - - // ======================================================================== - // Callback - // ======================================================================== - - /** - * Callback interface for applying graph operations. - * Implemented by {@code VectorIndex.Default}. - */ - interface Callback - { - /** - * Adds a node to the HNSW graph. - * - * @param ordinal the node ordinal - * @param vector the vector data - */ - void applyGraphAdd(int ordinal, float[] vector); - - /** - * Updates a node in the HNSW graph (delete old + add new). - * - * @param ordinal the node ordinal - * @param vector the new vector data - */ - void applyGraphUpdate(int ordinal, float[] vector); - - /** - * Removes a node from the HNSW graph. - * - * @param ordinal the node ordinal - */ - void applyGraphRemove(int ordinal); - - /** - * Marks dirty for persistence/optimization background managers. - * - * @param count the number of changes - */ - void markDirtyForBackgroundManagers(int count); - } - - - // ======================================================================== - // Default Implementation - // ======================================================================== - - /** - * Default implementation of BackgroundIndexingManager. - *

    - * Uses a single daemon worker thread that continuously takes operations - * from a {@link LinkedBlockingQueue} and applies them via the callback. - */ - static class Default implements BackgroundIndexingManager - { - private static final Logger LOG = LoggerFactory.getLogger(BackgroundIndexingManager.class); - - private final Callback callback; - private final String name ; - private final LinkedBlockingQueue queue ; - private final Thread worker ; - - private volatile boolean shutdown = false; - - Default(final Callback callback, final String name) - { - this.callback = callback; - this.name = name ; - this.queue = new LinkedBlockingQueue<>(); - - this.worker = new Thread(this::workerLoop, "VectorIndex-BackgroundIndexing-" + name); - this.worker.setDaemon(true); - this.worker.start(); - - LOG.info("Background indexing started for '{}'", name); - } - - @Override - public void enqueue(final IndexingOperation operation) - { - this.queue.add(operation); - } - - @Override - public void drainQueue() - { - if(this.shutdown) - { - return; - } - - final CountDownLatch latch = new CountDownLatch(1); - this.queue.add(new IndexingOperation.DrainBarrier(latch)); - - try - { - latch.await(); - } - catch(final InterruptedException e) - { - Thread.currentThread().interrupt(); - LOG.warn("Interrupted while draining indexing queue for '{}'", this.name); - } - } - - @Override - public void discardQueue() - { - final int discarded = this.queue.size(); - this.queue.clear(); - if(discarded > 0) - { - LOG.info("Discarded {} pending indexing operations for '{}'", discarded, this.name); - } - } - - @Override - public void shutdown(final boolean drainPending) - { - if(drainPending) - { - LOG.info("Draining {} pending indexing operations for '{}' before shutdown", - this.queue.size(), this.name); - this.drainQueue(); - } - - this.shutdown = true; - this.worker.interrupt(); - - try - { - this.worker.join(30_000); - if(this.worker.isAlive()) - { - LOG.warn("Background indexing worker did 
not terminate gracefully for '{}'", this.name); - } - } - catch(final InterruptedException e) - { - Thread.currentThread().interrupt(); - } - - LOG.info("Background indexing manager shutdown for '{}'", this.name); - } - - @Override - public int getPendingCount() - { - return this.queue.size(); - } - - /** - * Worker loop that continuously takes operations from the queue and applies them. - */ - private void workerLoop() - { - while(!this.shutdown) - { - try - { - final IndexingOperation op = this.queue.take(); - op.execute(this.callback); - } - catch(final InterruptedException e) - { - if(!this.shutdown) - { - LOG.debug("Background indexing worker interrupted for '{}'", this.name); - } - // Re-check shutdown flag in loop condition - } - catch(final Exception e) - { - LOG.error("Error applying indexing operation for '{}': {}", this.name, e.getMessage(), e); - } - } - } - - } - -} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java deleted file mode 100644 index cace6283..00000000 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundOptimizationManager.java +++ /dev/null @@ -1,239 +0,0 @@ -package org.eclipse.store.gigamap.jvector; - -/*- - * #%L - * EclipseStore GigaMap JVector - * %% - * Copyright (C) 2023 - 2026 MicroStream Software - * %% - * This program and the accompanying materials are made - * available under the terms of the Eclipse Public License 2.0 - * which is available at https://www.eclipse.org/legal/epl-2.0/ - * - * SPDX-License-Identifier: EPL-2.0 - * #L% - */ - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Manages background optimization for a VectorIndex. - *

    - * This manager handles: - *

      - *
    • Dirty tracking with change counting
    • - *
    • Scheduled background optimization at configurable intervals
    • - *
    • Debouncing based on minimum change threshold
    • - *
    • Graceful shutdown with optional final optimization
    • - *
    - */ -interface BackgroundOptimizationManager -{ - /** - * Marks the index as dirty with the specified number of changes. - * - * @param count the number of changes - */ - public void markDirty(int count); - - /** - * Starts the scheduled background optimization task. - */ - public void startScheduledOptimization(); - - /** - * Shuts down the optimization manager. - * - * @param optimizePending if true and there are pending changes, optimize before shutdown - */ - public void shutdown(boolean optimizePending); - - /** - * Returns the number of times optimization has been performed. - * This is useful for testing and monitoring. - * - * @return the optimization count - */ - public long getOptimizationCount(); - - /** - * Returns the current pending change count. - * This is useful for testing and monitoring. - * - * @return the pending change count - */ - public int getPendingChangeCount(); - - - /** - * Callback interface for the optimization manager to perform optimization. - */ - public interface Callback - { - /** - * Performs optimization of the index. - */ - public void optimize(); - } - - - /** - * Default implementation of BackgroundOptimizationManager. 
- */ - public static class Default implements BackgroundOptimizationManager - { - private static final Logger LOG = LoggerFactory.getLogger(BackgroundOptimizationManager.class); - - private final Callback callback ; - private final String name ; - private final long intervalMs; - private final int minChanges; - private final ScheduledExecutorService scheduler ; - - private final AtomicInteger changeCount = new AtomicInteger(0); - private final AtomicLong optimizationCount = new AtomicLong(0); - - private ScheduledFuture scheduledTask; - private volatile boolean shutdown = false; - - Default( - final Callback callback , - final String name , - final long intervalMs, - final int minChanges - ) - { - this.callback = callback; - this.name = name; - this.intervalMs = intervalMs; - this.minChanges = minChanges; - - this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> - { - final Thread t = new Thread(r, "VectorIndex-BackgroundOptimization-" + name); - t.setDaemon(true); - return t; - }); - } - - @Override - public void markDirty(final int count) - { - this.changeCount.addAndGet(count); - } - - @Override - public long getOptimizationCount() - { - return this.optimizationCount.get(); - } - - @Override - public int getPendingChangeCount() - { - return this.changeCount.get(); - } - - @Override - public void startScheduledOptimization() - { - this.scheduledTask = this.scheduler.scheduleAtFixedRate( - this::optimizeNowIfDirty, - this.intervalMs, - this.intervalMs, - TimeUnit.MILLISECONDS - ); - } - - /** - * Optimizes the index if the change threshold has been met. - */ - private void optimizeNowIfDirty() - { - if(this.shutdown) - { - return; - } - - final int currentChanges = this.changeCount.get(); - if(currentChanges < this.minChanges) - { - return; - } - - this.optimizeNow(); - } - - /** - * Optimizes the index immediately. 
- */ - private void optimizeNow() - { - LOG.debug("Background optimizing index '{}' with {} changes", - this.name, this.changeCount.get()); - - try - { - this.callback.optimize(); - - // Reset change count and increment optimization counter after success - this.changeCount.set(0); - this.optimizationCount.incrementAndGet(); - - LOG.debug("Background optimization completed for '{}'", this.name); - } - catch(final Exception e) - { - LOG.error("Background optimization failed for '{}': {}", this.name, e.getMessage(), e); - } - } - - @Override - public void shutdown(final boolean optimizePending) - { - this.shutdown = true; - - // Cancel the scheduled task - if(this.scheduledTask != null) - { - this.scheduledTask.cancel(false); - this.scheduledTask = null; - } - - // Optimize pending changes if requested - final int pendingChanges = this.changeCount.get(); - if(optimizePending && pendingChanges > 0) - { - LOG.info("Optimizing pending changes for '{}' before shutdown ({} changes)", - this.name, pendingChanges); - this.optimizeNow(); - } - - // Shutdown the scheduler - this.scheduler.shutdown(); - try - { - if(!this.scheduler.awaitTermination(30, TimeUnit.SECONDS)) - { - LOG.warn("Background optimization scheduler did not terminate gracefully for '{}'", - this.name); - this.scheduler.shutdownNow(); - } - } - catch(final InterruptedException e) - { - Thread.currentThread().interrupt(); - this.scheduler.shutdownNow(); - } - - LOG.info("Background optimization manager shutdown for '{}'", this.name); - } - - } - -} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java deleted file mode 100644 index 514d8721..00000000 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundPersistenceManager.java +++ /dev/null @@ -1,211 +0,0 @@ -package org.eclipse.store.gigamap.jvector; - -/*- - * #%L - * 
EclipseStore GigaMap JVector - * %% - * Copyright (C) 2023 - 2026 MicroStream Software - * %% - * This program and the accompanying materials are made - * available under the terms of the Eclipse Public License 2.0 - * which is available at https://www.eclipse.org/legal/epl-2.0/ - * - * SPDX-License-Identifier: EPL-2.0 - * #L% - */ - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - -/** - * Manages background persistence for a VectorIndex. - *

    - * This manager handles: - *

      - *
    • Dirty tracking with change counting
    • - *
    • Scheduled background persistence at configurable intervals
    • - *
    • Debouncing based on minimum change threshold
    • - *
    • Graceful shutdown with optional final persistence
    • - *
    - */ -interface BackgroundPersistenceManager -{ - /** - * Marks the index as dirty with the specified number of changes. - * - * @param count the number of changes - */ - public void markDirty(int count); - - /** - * Starts the scheduled background persistence task. - */ - public void startScheduledPersistence(); - - /** - * Shuts down the persistence manager. - * - * @param persistPending if true and there are pending changes, persist before shutdown - */ - public void shutdown(boolean persistPending); - - - /** - * Callback interface for the persistence manager to perform persistence. - */ - public interface Callback - { - /** - * Persists the index to disk. - */ - public void persistToDisk(); - } - - - /** - * Default implementation of BackgroundPersistenceManager. - */ - public static class Default implements BackgroundPersistenceManager - { - private static final Logger LOG = LoggerFactory.getLogger(BackgroundPersistenceManager.class); - - private final Callback callback ; - private final String name ; - private final long intervalMs; - private final int minChanges; - private final ScheduledExecutorService scheduler ; - - private final AtomicInteger changeCount = new AtomicInteger(0); - - private ScheduledFuture scheduledTask; - private volatile boolean shutdown = false; - - Default( - final Callback callback , - final String name , - final long intervalMs, - final int minChanges - ) - { - this.callback = callback ; - this.name = name ; - this.intervalMs = intervalMs; - this.minChanges = minChanges; - - this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> - { - final Thread t = new Thread(r, "VectorIndex-BackgroundPersistence-" + name); - t.setDaemon(true); - return t; - }); - } - - @Override - public void markDirty(final int count) - { - this.changeCount.addAndGet(count); - } - - @Override - public void startScheduledPersistence() - { - this.scheduledTask = this.scheduler.scheduleAtFixedRate( - this::persistNowIfDirty, - this.intervalMs, - 
this.intervalMs, - TimeUnit.MILLISECONDS - ); - } - - /** - * Persists the index if the change threshold has been met. - */ - private void persistNowIfDirty() - { - if(this.shutdown) - { - return; - } - - final int currentChanges = this.changeCount.get(); - if(currentChanges < this.minChanges) - { - return; - } - - this.persistNow(); - } - - /** - * Persists the index immediately, regardless of dirty state or threshold. - */ - private void persistNow() - { - LOG.debug("Background persisting index '{}' with {} changes", - this.name, this.changeCount.get()); - - try - { - this.callback.persistToDisk(); - - // Reset change count after successful persistence - this.changeCount.set(0); - - LOG.debug("Background persistence completed for '{}'", this.name); - } - catch(final Exception e) - { - LOG.error("Background persistence failed for '{}': {}", this.name, e.getMessage(), e); - } - } - - @Override - public void shutdown(final boolean persistPending) - { - this.shutdown = true; - - // Cancel the scheduled task - if(this.scheduledTask != null) - { - this.scheduledTask.cancel(false); - this.scheduledTask = null; - } - - // Persist pending changes if requested - final int pendingChanges = this.changeCount.get(); - if(persistPending && pendingChanges > 0) - { - LOG.info("Persisting pending changes for '{}' before shutdown ({} changes)", - this.name, pendingChanges); - this.persistNow(); - } - - // Shutdown the scheduler - this.scheduler.shutdown(); - try - { - if(!this.scheduler.awaitTermination(30, TimeUnit.SECONDS)) - { - LOG.warn("Background persistence scheduler did not terminate gracefully for '{}'", - this.name); - this.scheduler.shutdownNow(); - } - } - catch(final InterruptedException e) - { - Thread.currentThread().interrupt(); - this.scheduler.shutdownNow(); - } - - LOG.info("Background persistence manager shutdown for '{}'", this.name); - } - - } - -} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java 
b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java new file mode 100644 index 00000000..1f9adf7d --- /dev/null +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/BackgroundTaskManager.java @@ -0,0 +1,558 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Unified background task manager for VectorIndex. + *

    + * Consolidates background indexing, optimization, and persistence into a single + * {@link ScheduledExecutorService} with one daemon thread. All three workloads + * serialize on the same builder write-lock and never do useful work in parallel, + * so a single thread is sufficient. + *

    + * This manager handles: + *

      + *
    • Queuing and batch processing of graph indexing operations (add, update, remove)
    • + *
    • Scheduled background optimization at configurable intervals with debouncing
    • + *
    • Scheduled background persistence at configurable intervals with debouncing
    • + *
    • Graceful shutdown with optional drain, optimize, and persist
    • + *
    + */ +class BackgroundTaskManager +{ + private static final Logger LOG = LoggerFactory.getLogger(BackgroundTaskManager.class); + + // ======================================================================== + // Indexing Operations + // ======================================================================== + + /** + * Sealed interface for indexing operations that can be queued. + */ + sealed interface IndexingOperation + permits IndexingOperation.Add, + IndexingOperation.Update, + IndexingOperation.Remove + { + void execute(Callback callback); + + /** + * Add a node to the HNSW graph. + */ + record Add(int ordinal, float[] vector) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphAdd(this.ordinal, this.vector); + callback.markDirtyForBackgroundManagers(1); + } + } + + /** + * Update a node in the HNSW graph (delete + re-add). + */ + record Update(int ordinal, float[] vector) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphUpdate(this.ordinal, this.vector); + callback.markDirtyForBackgroundManagers(1); + } + } + + /** + * Remove a node from the HNSW graph. + */ + record Remove(int ordinal) implements IndexingOperation + { + @Override + public void execute(final Callback callback) + { + callback.applyGraphRemove(this.ordinal); + callback.markDirtyForBackgroundManagers(1); + } + } + } + + // ======================================================================== + // Callback + // ======================================================================== + + /** + * Callback interface for applying graph operations and core optimization/persistence. + * Implemented by {@code VectorIndex.Default}. 
+ */ + interface Callback + { + void applyGraphAdd(int ordinal, float[] vector); + + void applyGraphUpdate(int ordinal, float[] vector); + + void applyGraphRemove(int ordinal); + + void markDirtyForBackgroundManagers(int count); + + /** + * Core optimization logic without queue drain. + * Called from the executor thread (inline drain already done). + */ + void doOptimize(); + + /** + * Core persistence logic without queue drain. + * Called from the executor thread (inline drain already done). + */ + void doPersistToDisk(); + } + + // ======================================================================== + // Instance fields + // ======================================================================== + + private final Callback callback ; + private final String name ; + private final ScheduledExecutorService executor ; + + // Indexing queue and dedup flag + private final ConcurrentLinkedQueue indexingQueue ; + private final AtomicBoolean indexingTaskScheduled ; + + // Optimization state + private final AtomicInteger optimizationChangeCount; + private final AtomicLong optimizationCount ; + private final int optimizationMinChanges ; + private ScheduledFuture optimizationTask ; + + // Persistence state + private final AtomicInteger persistenceChangeCount; + private final int persistenceMinChanges ; + private ScheduledFuture persistenceTask ; + + private volatile boolean shutdown = false; + + // ======================================================================== + // Constructor + // ======================================================================== + + BackgroundTaskManager( + final Callback callback, + final String name, + final boolean eventualIndexing, + final boolean backgroundOptimization, + final long optimizationIntervalMs, + final int optimizationMinChanges, + final boolean backgroundPersistence, + final long persistenceIntervalMs, + final int persistenceMinChanges + ) + { + this.callback = callback; + this.name = name ; + + this.executor = 
Executors.newSingleThreadScheduledExecutor(r -> + { + final Thread t = new Thread(r, "VectorIndex-Background-" + name); + t.setDaemon(true); + return t; + }); + + // Indexing + this.indexingQueue = new ConcurrentLinkedQueue<>(); + this.indexingTaskScheduled = new AtomicBoolean(false); + + // Optimization + this.optimizationChangeCount = new AtomicInteger(0); + this.optimizationCount = new AtomicLong(0); + this.optimizationMinChanges = optimizationMinChanges; + + // Persistence + this.persistenceChangeCount = new AtomicInteger(0); + this.persistenceMinChanges = persistenceMinChanges; + + // Start scheduled tasks + if(backgroundOptimization) + { + this.optimizationTask = this.executor.scheduleAtFixedRate( + this::runOptimizationIfDirty, + optimizationIntervalMs, + optimizationIntervalMs, + TimeUnit.MILLISECONDS + ); + LOG.info("Background optimization started for index '{}' with interval {}ms", + name, optimizationIntervalMs); + } + + if(backgroundPersistence) + { + this.persistenceTask = this.executor.scheduleAtFixedRate( + this::runPersistenceIfDirty, + persistenceIntervalMs, + persistenceIntervalMs, + TimeUnit.MILLISECONDS + ); + LOG.info("Background persistence started for index '{}' with interval {}ms", + name, persistenceIntervalMs); + } + + if(eventualIndexing) + { + LOG.info("Eventual indexing enabled for index '{}'", name); + } + } + + // ======================================================================== + // Indexing queue methods + // ======================================================================== + + /** + * Enqueues an indexing operation for background processing. + */ + void enqueue(final IndexingOperation op) + { + this.indexingQueue.add(op); + if(this.indexingTaskScheduled.compareAndSet(false, true)) + { + this.executor.submit(this::processIndexingBatch); + } + } + + /** + * Blocks until all currently enqueued indexing operations have been applied. + * Called from user threads (not the executor thread) before optimize/persistToDisk. 
+ */ + void drainQueue() + { + if(this.shutdown) + { + return; + } + + try + { + this.executor.submit(this::processAllPendingIndexingOps).get(); + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + LOG.warn("Interrupted while draining indexing queue for '{}'", this.name); + } + catch(final ExecutionException e) + { + LOG.error("Error while draining indexing queue for '{}': {}", this.name, e.getMessage(), e); + } + } + + /** + * Discards all pending indexing operations without applying them. + * Used during {@code internalRemoveAll()} where pending operations + * refer to stale ordinals that are no longer valid. + */ + void discardQueue() + { + final int discarded = this.indexingQueue.size(); + this.indexingQueue.clear(); + this.indexingTaskScheduled.set(false); + if(discarded > 0) + { + LOG.info("Discarded {} pending indexing operations for '{}'", discarded, this.name); + } + } + + /** + * Returns the number of pending indexing operations in the queue. + */ + int getPendingIndexingCount() + { + return this.indexingQueue.size(); + } + + // ======================================================================== + // Optimization monitoring + // ======================================================================== + + /** + * Marks dirty for optimization and persistence tracking. + */ + void markDirty(final int count) + { + this.optimizationChangeCount.addAndGet(count); + this.persistenceChangeCount.addAndGet(count); + } + + /** + * Returns the number of times optimization has been performed. + */ + long getOptimizationCount() + { + return this.optimizationCount.get(); + } + + /** + * Returns the current pending change count for optimization. 
+ */ + int getOptimizationPendingChangeCount() + { + return this.optimizationChangeCount.get(); + } + + // ======================================================================== + // Shutdown + // ======================================================================== + + /** + * Shuts down the background task manager. + * + * @param drainPending if true, drain all pending indexing operations + * @param optimizePending if true and there are pending changes, optimize before shutdown + * @param persistPending if true and there are pending changes, persist before shutdown + */ + void shutdown(final boolean drainPending, final boolean optimizePending, final boolean persistPending) + { + this.shutdown = true; + + // Cancel scheduled tasks + if(this.optimizationTask != null) + { + this.optimizationTask.cancel(false); + this.optimizationTask = null; + } + if(this.persistenceTask != null) + { + this.persistenceTask.cancel(false); + this.persistenceTask = null; + } + + // Perform final work if requested + if(drainPending || optimizePending || persistPending) + { + try + { + this.executor.submit(() -> this.finalShutdownWork(drainPending, optimizePending, persistPending)) + .get(30, TimeUnit.SECONDS); + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + } + catch(final ExecutionException e) + { + LOG.error("Error during shutdown work for '{}': {}", this.name, e.getMessage(), e); + } + catch(final TimeoutException e) + { + LOG.warn("Shutdown work timed out for '{}'", this.name); + } + } + + // Shutdown the executor + this.executor.shutdown(); + try + { + if(!this.executor.awaitTermination(30, TimeUnit.SECONDS)) + { + LOG.warn("Background task executor did not terminate gracefully for '{}'", this.name); + this.executor.shutdownNow(); + } + } + catch(final InterruptedException e) + { + Thread.currentThread().interrupt(); + this.executor.shutdownNow(); + } + + LOG.info("Background task manager shutdown for '{}'", this.name); + } + + // 
======================================================================== + // Internal methods — all run on the executor thread + // ======================================================================== + + /** + * Processes all pending indexing ops in a batch. + * Called via {@code executor.submit()} when ops are enqueued. + */ + private void processIndexingBatch() + { + try + { + this.processAllPendingIndexingOps(); + } + finally + { + this.indexingTaskScheduled.set(false); + // Re-check: if new ops were added after we polled the last one + // but before we reset the flag, schedule another batch. + if(!this.indexingQueue.isEmpty()) + { + if(this.indexingTaskScheduled.compareAndSet(false, true)) + { + this.executor.submit(this::processIndexingBatch); + } + } + } + } + + /** + * Polls and executes all currently queued indexing operations. + * Safe to call from the executor thread (inline) or via {@code Future.get()} from user threads. + */ + private void processAllPendingIndexingOps() + { + IndexingOperation op; + while((op = this.indexingQueue.poll()) != null) + { + try + { + op.execute(this.callback); + } + catch(final Exception e) + { + LOG.error("Error applying indexing operation for '{}': {}", this.name, e.getMessage(), e); + } + } + } + + /** + * Runs optimization if the change threshold has been met. + * Called by the scheduled optimization task on the executor thread. 
+ */ + private void runOptimizationIfDirty() + { + if(this.shutdown) + { + return; + } + + if(this.optimizationChangeCount.get() < this.optimizationMinChanges) + { + return; + } + + LOG.debug("Background optimizing index '{}' with {} changes", + this.name, this.optimizationChangeCount.get()); + + try + { + // Drain pending indexing ops inline (same thread, no deadlock) + this.processAllPendingIndexingOps(); + + this.callback.doOptimize(); + + this.optimizationChangeCount.set(0); + this.optimizationCount.incrementAndGet(); + + LOG.debug("Background optimization completed for '{}'", this.name); + } + catch(final Exception e) + { + LOG.error("Background optimization failed for '{}': {}", this.name, e.getMessage(), e); + } + } + + /** + * Runs persistence if the change threshold has been met. + * Called by the scheduled persistence task on the executor thread. + */ + private void runPersistenceIfDirty() + { + if(this.shutdown) + { + return; + } + + if(this.persistenceChangeCount.get() < this.persistenceMinChanges) + { + return; + } + + LOG.debug("Background persisting index '{}' with {} changes", + this.name, this.persistenceChangeCount.get()); + + try + { + // Drain pending indexing ops inline (same thread, no deadlock) + this.processAllPendingIndexingOps(); + + this.callback.doPersistToDisk(); + + this.persistenceChangeCount.set(0); + + LOG.debug("Background persistence completed for '{}'", this.name); + } + catch(final Exception e) + { + LOG.error("Background persistence failed for '{}': {}", this.name, e.getMessage(), e); + } + } + + /** + * Performs final shutdown work on the executor thread. 
+ */ + private void finalShutdownWork( + final boolean drainPending, + final boolean optimizePending, + final boolean persistPending + ) + { + if(drainPending) + { + LOG.info("Draining {} pending indexing operations for '{}' before shutdown", + this.indexingQueue.size(), this.name); + this.processAllPendingIndexingOps(); + } + + if(optimizePending && this.optimizationChangeCount.get() > 0) + { + LOG.info("Optimizing pending changes for '{}' before shutdown ({} changes)", + this.name, this.optimizationChangeCount.get()); + try + { + this.callback.doOptimize(); + this.optimizationChangeCount.set(0); + this.optimizationCount.incrementAndGet(); + } + catch(final Exception e) + { + LOG.error("Shutdown optimization failed for '{}': {}", this.name, e.getMessage(), e); + } + } + + if(persistPending && this.persistenceChangeCount.get() > 0) + { + LOG.info("Persisting pending changes for '{}' before shutdown ({} changes)", + this.name, this.persistenceChangeCount.get()); + try + { + this.callback.doPersistToDisk(); + this.persistenceChangeCount.set(0); + } + catch(final Exception e) + { + LOG.error("Shutdown persistence failed for '{}': {}", this.name, e.getMessage(), e); + } + } + } + +} diff --git a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java index f0a80306..1f9e48b5 100644 --- a/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java +++ b/gigamap/jvector/src/main/java/org/eclipse/store/gigamap/jvector/VectorIndex.java @@ -629,9 +629,7 @@ public interface Internal extends VectorIndex public static class Default extends AbstractStateChangeFlagged implements VectorIndex.Internal, - BackgroundPersistenceManager.Callback, - BackgroundOptimizationManager.Callback, - BackgroundIndexingManager.Callback, + BackgroundTaskManager.Callback, PQCompressionManager.VectorProvider, DiskIndexManager.IndexStateProvider { @@ -662,11 +660,9 @@ 
static BinaryTypeHandler> provideTypeHandler() private transient OnHeapGraphIndex index ; // Managers (transient - recreated on load) - private transient DiskIndexManager diskManager ; - private transient PQCompressionManager pqManager ; - private transient BackgroundPersistenceManager persistenceManager ; - transient BackgroundOptimizationManager optimizationManager; - transient BackgroundIndexingManager indexingManager ; + private transient DiskIndexManager diskManager ; + private transient PQCompressionManager pqManager ; + transient BackgroundTaskManager backgroundTaskManager; // GraphSearcher pool for thread-local reuse private transient ExplicitThreadLocal searcherPool; @@ -862,72 +858,41 @@ private void initializeIndex() } /** - * Starts background persistence and optimization managers if configured. + * Starts the unified background task manager if any background feature is enabled. */ private void startBackgroundManagersIfEnabled() { - this.startBackgroundIndexingIfEnabled(); - this.startBackgroundPersistenceIfEnabled(); - this.startBackgroundOptimizationIfEnabled(); - } + final boolean eventualIndexing = this.configuration.eventualIndexing(); + final boolean backgroundOptimization = this.configuration.backgroundOptimization(); + final boolean backgroundPersistence = this.configuration.onDisk() && this.configuration.backgroundPersistence(); - /** - * Starts the background indexing manager if eventual indexing is configured. - */ - private void startBackgroundIndexingIfEnabled() - { - if(this.configuration.eventualIndexing()) + if(eventualIndexing || backgroundOptimization || backgroundPersistence) { - if(this.indexingManager == null) + if(this.backgroundTaskManager == null) { - this.indexingManager = new BackgroundIndexingManager.Default(this, this.name); - LOG.info("Eventual indexing enabled for index '{}'", this.name); - } - } - } - - /** - * Starts the background persistence manager if configured. 
- */ - private void startBackgroundPersistenceIfEnabled() - { - if(this.configuration.onDisk() && this.configuration.backgroundPersistence()) - { - if(this.persistenceManager == null) - { - this.persistenceManager = new BackgroundPersistenceManager.Default( + this.backgroundTaskManager = new BackgroundTaskManager( this, this.name, + eventualIndexing, + backgroundOptimization, + this.configuration.optimizationIntervalMs(), + this.configuration.minChangesBetweenOptimizations(), + backgroundPersistence, this.configuration.persistenceIntervalMs(), this.configuration.minChangesBetweenPersists() ); - this.persistenceManager.startScheduledPersistence(); - LOG.info("Background persistence started for index '{}' with interval {}ms", - this.name, this.configuration.persistenceIntervalMs()); } } } /** - * Starts the background optimization manager if configured. + * Returns whether eventual indexing is active (background task manager exists + * AND eventualIndexing is configured). The manager may exist for optimization + * or persistence alone. 
*/ - private void startBackgroundOptimizationIfEnabled() + private boolean isEventualIndexing() { - if(this.configuration.backgroundOptimization()) - { - if(this.optimizationManager == null) - { - this.optimizationManager = new BackgroundOptimizationManager.Default( - this, - this.name, - this.configuration.optimizationIntervalMs(), - this.configuration.minChangesBetweenOptimizations() - ); - this.optimizationManager.startScheduledOptimization(); - LOG.info("Background optimization started for index '{}' with interval {}ms", - this.name, this.configuration.optimizationIntervalMs()); - } - } + return this.backgroundTaskManager != null && this.configuration.eventualIndexing(); } /** @@ -1123,10 +1088,10 @@ public void internalAdd(final long entityId, final E entity) this.markStateChangeChildren(); - if(this.indexingManager != null) + if(this.isEventualIndexing()) { // Defer graph update to background thread - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add(ordinal, vector)); + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Add(ordinal, vector)); } else { @@ -1175,10 +1140,10 @@ public void internalUpdate(final long entityId, final E replacedEntity, final E this.markStateChangeChildren(); - if(this.indexingManager != null) + if(this.isEventualIndexing()) { // Defer graph update to background thread - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Update(ordinal, vector)); + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Update(ordinal, vector)); } else { @@ -1227,11 +1192,11 @@ private void addVectorEntries(final List entries) this.markStateChangeChildren(); - if(this.indexingManager != null) + if(this.isEventualIndexing()) { // Defer graph updates to background thread entries.forEach(entry -> - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Add( + this.backgroundTaskManager.enqueue(new 
BackgroundTaskManager.IndexingOperation.Add( toOrdinal(entry.sourceEntityId), entry.vector )) ); @@ -1251,13 +1216,9 @@ private void addVectorEntries(final List entries) @Override public void markDirtyForBackgroundManagers(final int count) { - if(this.persistenceManager != null) - { - this.persistenceManager.markDirty(count); - } - if(this.optimizationManager != null) + if(this.backgroundTaskManager != null) { - this.optimizationManager.markDirty(count); + this.backgroundTaskManager.markDirty(count); } } @@ -1288,10 +1249,10 @@ public void internalRemove(final long entityId, final E entity) this.markStateChangeChildren(); - if(this.indexingManager != null) + if(this.isEventualIndexing()) { // Defer graph update to background thread - this.indexingManager.enqueue(new BackgroundIndexingManager.IndexingOperation.Remove(ordinal)); + this.backgroundTaskManager.enqueue(new BackgroundTaskManager.IndexingOperation.Remove(ordinal)); } else { @@ -1320,14 +1281,8 @@ public void internalRemoveAll() this.vectorStore.removeAll(); } - // Discard and shutdown indexing manager (pending ops are stale) - this.shutdownIndexingManager(false); - - // Shutdown optimization manager before closing - this.shutdownOptimizationManager(false); - - // Shutdown persistence manager before closing - this.shutdownPersistenceManager(false); + // Shutdown background task manager (discard pending ops — they're stale) + this.shutdownBackgroundTaskManager(false, false, false); this.closeInternalResources(); @@ -1473,11 +1428,22 @@ private VectorSearchResult convertSearchResult(final SearchResult result) public void optimize() { // Drain pending indexing operations to ensure graph is complete - if(this.indexingManager != null) + if(this.isEventualIndexing()) { - this.indexingManager.drainQueue(); + this.backgroundTaskManager.drainQueue(); } + this.doOptimize(); + } + + /** + * Core optimization logic without queue drain. 
+ * Called directly from the background task manager's executor thread + * (where inline drain is already done) and from the public optimize() method. + */ + @Override + public void doOptimize() + { final GraphIndexBuilder capturedBuilder; // Signal sync-mode mutations to defer builder ops during cleanup. @@ -1529,9 +1495,25 @@ public void persistToDisk() } // Drain pending indexing operations to ensure graph is complete - if(this.indexingManager != null) + if(this.isEventualIndexing()) { - this.indexingManager.drainQueue(); + this.backgroundTaskManager.drainQueue(); + } + + this.doPersistToDisk(); + } + + /** + * Core persistence logic without queue drain. + * Called directly from the background task manager's executor thread + * (where inline drain is already done) and from the public persistToDisk() method. + */ + @Override + public void doPersistToDisk() + { + if(!this.configuration.onDisk()) + { + return; // No-op for in-memory indices } // Signal sync-mode mutations to defer builder ops during cleanup + disk write. @@ -1653,14 +1635,12 @@ protected void clearChildrenStateChangeMarkers() @Override public void close() { - // Shutdown indexing manager first — drain all pending graph operations - this.shutdownIndexingManager(true); - - // Shutdown optimization manager (may optimize pending changes) - this.shutdownOptimizationManager(this.configuration.optimizeOnShutdown()); - - // Shutdown persistence manager (may persist pending changes) - this.shutdownPersistenceManager(this.configuration.persistOnShutdown()); + // Shutdown background task manager — drain indexing, optionally optimize and persist + this.shutdownBackgroundTaskManager( + true, + this.configuration.optimizeOnShutdown(), + this.configuration.persistOnShutdown() + ); // Acquire write lock to ensure no concurrent search or persistToDisk() is running. // closeInternalResources() destroys the graph and disk manager. 
@@ -1676,48 +1656,26 @@ public void close() } /** - * Shuts down the background optimization manager. - * - * @param optimizePending if true, optimize pending changes before shutdown - */ - private void shutdownOptimizationManager(final boolean optimizePending) - { - if(this.optimizationManager != null) - { - this.optimizationManager.shutdown(optimizePending); - this.optimizationManager = null; - } - } - - /** - * Shuts down the background persistence manager. - * - * @param persistPending if true, persist pending changes before shutdown - */ - private void shutdownPersistenceManager(final boolean persistPending) - { - if(this.persistenceManager != null) - { - this.persistenceManager.shutdown(persistPending); - this.persistenceManager = null; - } - } - - /** - * Shuts down the background indexing manager. + * Shuts down the background task manager. * - * @param drainPending if true, drain all pending operations before shutdown + * @param drainPending if true, drain all pending indexing operations + * @param optimizePending if true and there are pending changes, optimize before shutdown + * @param persistPending if true and there are pending changes, persist before shutdown */ - private void shutdownIndexingManager(final boolean drainPending) + private void shutdownBackgroundTaskManager( + final boolean drainPending, + final boolean optimizePending, + final boolean persistPending + ) { - if(this.indexingManager != null) + if(this.backgroundTaskManager != null) { if(!drainPending) { - this.indexingManager.discardQueue(); + this.backgroundTaskManager.discardQueue(); } - this.indexingManager.shutdown(drainPending); - this.indexingManager = null; + this.backgroundTaskManager.shutdown(drainPending, optimizePending, persistPending); + this.backgroundTaskManager = null; } } @@ -1788,12 +1746,6 @@ private RandomAccessVectorValues createVectorValues() // callback interface implementations // //////////////////////////////////////// - // Note: 
BackgroundPersistenceManager.Callback.persistToDisk() is implemented - // by the public persistToDisk() method above. - - // Note: BackgroundOptimizationManager.Callback.optimize() is implemented - // by the public optimize() method above. - // PQCompressionManager.VectorProvider @Override @@ -1839,7 +1791,7 @@ public long getExpectedVectorCount() } // ================================================================ - // BackgroundIndexingManager.Callback implementation + // BackgroundTaskManager.Callback implementation // ================================================================ @Override diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java index 5c6d8016..e0255df0 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConcurrentStressTest.java @@ -237,30 +237,28 @@ private void runStressTest(final ConfigCombo combo, final Path indexDir) throws final VectorIndexConfiguration config = buildConfig(combo, dimension, indexDir); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { // Seed the index with initial entities so updates/removes have targets final Random seedRandom = new Random(42); - for(int i = 0; i < seedCount; i++) + for (int i = 0; i < seedCount; i++) { gigaMap.add(new Document("seed_" + i, randomVector(seedRandom, dimension))); } // For eventual indexing, drain the seed operations - if(combo.eventual()) + if (combo.eventual()) { - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + 
defaultIndex.backgroundTaskManager.drainQueue(); } // If PQ compression, train before concurrent access - if(combo.pqCompression()) + if (combo.pqCompression()) { - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); } // Shared state for coordinating threads @@ -273,7 +271,7 @@ private void runStressTest(final ConfigCombo combo, final Path indexDir) throws final ExecutorService executor = Executors.newFixedThreadPool(threadCount); - for(int t = 0; t < threadCount; t++) + for (int t = 0; t < threadCount; t++) { final int threadId = t; executor.submit(() -> @@ -285,59 +283,54 @@ private void runStressTest(final ConfigCombo combo, final Path indexDir) throws final Random random = new Random(1000 + threadId); - for(int op = 0; op < opsPerThread && !hasError.get(); op++) + for (int op = 0; op < opsPerThread && !hasError.get(); op++) { try { final int action = random.nextInt(100); - if(action < 30) + if (action < 30) { // 30%: ADD final float[] vector = randomVector(random, dimension); - synchronized(gigaMap) + synchronized (gigaMap) { gigaMap.add(new Document( "t" + threadId + "_" + op, vector )); } - } - else if(action < 45) + } else if (action < 45) { // 15%: UPDATE (set) — target a seed entity final long targetId = random.nextInt(seedCount); final float[] vector = randomVector(random, dimension); - synchronized(gigaMap) + synchronized (gigaMap) { try { gigaMap.set(targetId, new Document( "updated_" + targetId, vector )); - } - catch(final Exception e) + } catch (final Exception e) { // Entity may have been removed by another thread — acceptable } } - } - else if(action < 55) + } else if (action < 55) { // 10%: REMOVE — target a seed entity final long targetId = random.nextInt(seedCount); - synchronized(gigaMap) + synchronized (gigaMap) { try { gigaMap.removeById(targetId); - } - catch(final Exception e) + } catch (final Exception e) { // Entity may already be removed — acceptable } } - } - else + } 
else { // 45%: SEARCH final float[] queryVector = randomVector(random, dimension); @@ -347,19 +340,16 @@ else if(action < 55) } completedOps.incrementAndGet(); - } - catch(final Exception e) + } catch (final Exception e) { errors.add(e); hasError.set(true); } } - } - catch(final InterruptedException e) + } catch (final InterruptedException e) { Thread.currentThread().interrupt(); - } - finally + } finally { doneLatch.countDown(); } @@ -377,26 +367,26 @@ else if(action < 55) assertTrue(executor.awaitTermination(10, TimeUnit.SECONDS)); // Report errors - if(!errors.isEmpty()) + if (!errors.isEmpty()) { final StringBuilder sb = new StringBuilder(); sb.append("Concurrent stress test failed for: ").append(combo.label()); sb.append("\n").append(errors.size()).append(" error(s):"); - for(final Throwable err : errors) + for (final Throwable err : errors) { sb.append("\n - ").append(err.getClass().getSimpleName()) - .append(": ").append(err.getMessage()); + .append(": ").append(err.getMessage()); } fail(sb.toString()); } // Verify the index is still consistent — drain and search - if(combo.eventual()) + if (combo.eventual()) { - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - if(defaultIndex.indexingManager != null) + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + if (defaultIndex.backgroundTaskManager != null) { - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); } } @@ -405,10 +395,6 @@ else if(action < 55) ); assertNotNull(finalResult); } - finally - { - index.close(); - } } @@ -543,21 +529,19 @@ void testEventualIndexingHeavyConcurrentLoad(@TempDir final Path tempDir) .minChangesBetweenPersists(5) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { // Seed final Random seedRandom = new Random(42); - for(int i = 0; i < seedCount; i++) + for (int i = 
0; i < seedCount; i++) { gigaMap.add(new Document("seed_" + i, randomVector(seedRandom, dimension))); } - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); final AtomicBoolean hasError = new AtomicBoolean(false); final List errors = java.util.Collections.synchronizedList(new ArrayList<>()); @@ -566,7 +550,7 @@ void testEventualIndexingHeavyConcurrentLoad(@TempDir final Path tempDir) final ExecutorService executor = Executors.newFixedThreadPool(threadCount); - for(int t = 0; t < threadCount; t++) + for (int t = 0; t < threadCount; t++) { final int threadId = t; executor.submit(() -> @@ -576,16 +560,16 @@ void testEventualIndexingHeavyConcurrentLoad(@TempDir final Path tempDir) startLatch.await(); final Random random = new Random(2000 + threadId); - for(int op = 0; op < opsPerThread && !hasError.get(); op++) + for (int op = 0; op < opsPerThread && !hasError.get(); op++) { try { final int action = random.nextInt(100); - if(action < 25) + if (action < 25) { // ADD - synchronized(gigaMap) + synchronized (gigaMap) { gigaMap.add(new Document( "t" + threadId + "_" + op, @@ -593,11 +577,11 @@ void testEventualIndexingHeavyConcurrentLoad(@TempDir final Path tempDir) )); } } - else if(action < 40) + else if (action < 40) { // UPDATE final long targetId = random.nextInt(seedCount); - synchronized(gigaMap) + synchronized (gigaMap) { try { @@ -606,23 +590,25 @@ else if(action < 40) randomVector(random, dimension) )); } - catch(final Exception ignored) {} + catch(final Exception ignored) + { + } } - } - else if(action < 50) + } else if (action < 50) { // REMOVE final long targetId = random.nextInt(seedCount); - synchronized(gigaMap) + synchronized (gigaMap) { try { gigaMap.removeById(targetId); } - catch(final Exception ignored) {} + catch(final Exception ignored) + { + } } - } - else + } else { 
// SEARCH final VectorSearchResult result = index.search( @@ -657,28 +643,24 @@ else if(action < 50) executor.shutdown(); assertTrue(executor.awaitTermination(10, TimeUnit.SECONDS)); - if(!errors.isEmpty()) + if (!errors.isEmpty()) { final StringBuilder sb = new StringBuilder("Heavy eventual indexing stress test failed:"); - for(final Throwable err : errors) + for (final Throwable err : errors) { sb.append("\n - ").append(err.getClass().getSimpleName()) - .append(": ").append(err.getMessage()); + .append(": ").append(err.getMessage()); } fail(sb.toString()); } // Drain and verify final state - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); final VectorSearchResult finalResult = index.search( randomVector(new Random(999), dimension), 5 ); assertNotNull(finalResult); } - finally - { - index.close(); - } } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index 504cd3ea..ad8c6c8a 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -2002,7 +2002,7 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Initially, optimization count should be 0 - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization count should be 0 initially"); // Add vectors to trigger dirty state above threshold @@ -2012,18 +2012,18 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final } // Verify pending changes are tracked - assertTrue(defaultIndex.optimizationManager.getPendingChangeCount() > 0, + 
assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, "Pending changes should be tracked"); // Wait for background optimization to run Thread.sleep(800); // Verify optimization was actually performed - assertTrue(defaultIndex.optimizationManager.getOptimizationCount() >= 1, + assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, "Optimization should have been performed at least once"); // Verify pending changes were reset - assertEquals(0, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should be reset after optimization"); // Verify search still works @@ -2076,18 +2076,18 @@ void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throw } // Verify pending changes are tracked - assertEquals(50, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should be 50"); // Wait for multiple optimization intervals Thread.sleep(600); // Verify optimization was NOT performed (below threshold) - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization should NOT have been performed (below threshold)"); // Verify pending changes are still tracked (not reset) - assertEquals(50, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should still be 50 (not reset)"); // Search should still work @@ -2139,11 +2139,11 @@ void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exc } // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(vectorCount, 
defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should equal vector count"); // Verify no optimization has run yet - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization count should be 0 before close"); // Verify search works before close @@ -2197,11 +2197,11 @@ void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws E } // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.optimizationManager.getPendingChangeCount(), + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), "Pending changes should equal vector count"); // Verify no optimization has run yet - assertEquals(0, defaultIndex.optimizationManager.getOptimizationCount(), + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), "Optimization count should be 0 before close"); // Close the index (should NOT trigger optimize) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java index f32f1116..25462827 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexEventualIndexingTest.java @@ -93,19 +93,17 @@ void testAddAndSearchWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - try + )) { gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); // Drain 
queue to ensure all graph operations are applied - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // Search should find all 3 documents final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); @@ -114,10 +112,6 @@ void testAddAndSearchWithEventualIndexing() // The closest match should be doc1 assertEquals("doc1", result.toList().get(0).entity().content()); } - finally - { - index.close(); - } } @Test @@ -133,27 +127,21 @@ void testAddAndSearchWithComputedVectorizer() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); assertEquals(3, result.size()); assertEquals("doc1", result.toList().get(0).entity().content()); } - finally - { - index.close(); - } } // ==================== Bulk Add Tests ==================== @@ -175,29 +163,23 @@ void testBulkAddWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < vectorCount; i++) + for (int i = 0; i < vectorCount; i++) { gigaMap.add(new Document("doc_" + i, 
randomVector(random, dimension))); } - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); final VectorSearchResult result = index.search( randomVector(new Random(99), dimension), 10 ); assertEquals(10, result.size()); } - finally - { - index.close(); - } } // ==================== Update Tests ==================== @@ -215,34 +197,28 @@ void testUpdateWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - try + )) { final Document doc1 = new Document("doc1", new float[]{1.0f, 0.0f, 0.0f}); final Document doc2 = new Document("doc2", new float[]{0.0f, 1.0f, 0.0f}); gigaMap.add(doc1); gigaMap.add(doc2); - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // Update doc1's vector to be close to doc2 final Document updatedDoc1 = new Document("doc1_updated", new float[]{0.1f, 0.9f, 0.0f}); gigaMap.set(0L, updatedDoc1); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); // Search for doc2-like vector: updated doc1 should now be close final VectorSearchResult result = index.search(new float[]{0.0f, 1.0f, 0.0f}, 2); assertEquals(2, result.size()); } - finally - { - index.close(); - } } // ==================== Remove Tests ==================== @@ -260,32 +236,26 @@ void testRemoveWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - try + )) { 
gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // Remove doc1 gigaMap.removeById(0L); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); // Search should only return 2 documents final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); assertEquals(2, result.size()); } - finally - { - index.close(); - } } @Test @@ -305,27 +275,25 @@ void testRemoveMultipleWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < vectorCount; i++) + for (int i = 0; i < vectorCount; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // Remove first 10 entities - for(int i = 0; i < 10; i++) + for (int i = 0; i < 10; i++) { gigaMap.removeById(i); } - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); // Should have 40 remaining final VectorSearchResult result = index.search( @@ -333,10 +301,6 @@ void testRemoveMultipleWithEventualIndexing() ); assertEquals(40, result.size()); } - finally - { - index.close(); - } } // ==================== RemoveAll Tests ==================== @@ -354,17 +318,15 @@ void testRemoveAllDiscardsQueueAndResets() 
.eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - try + )) { gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // RemoveAll — this discards pending operations and shuts down manager gigaMap.removeAll(); @@ -377,15 +339,11 @@ void testRemoveAllDiscardsQueueAndResets() gigaMap.add(new Document("new_doc", new float[]{1.0f, 0.0f, 0.0f})); // Drain the new indexing manager - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); final VectorSearchResult result2 = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); assertEquals(1, result2.size()); } - finally - { - index.close(); - } } // ==================== Optimize Drains Queue Tests ==================== @@ -406,13 +364,11 @@ void testOptimizeDrainsQueueBeforeCleanup() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < 50; i++) + for (int i = 0; i < 50; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } @@ -426,10 +382,6 @@ void testOptimizeDrainsQueueBeforeCleanup() ); assertEquals(10, result.size()); } - finally - { - index.close(); - } } // ==================== PersistToDisk Drains Queue Tests ==================== @@ -453,13 +405,11 @@ void testPersistToDiskDrainsQueueBeforeWrite(@TempDir final Path tempDir) .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex 
index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < 50; i++) + for (int i = 0; i < 50; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } @@ -477,10 +427,6 @@ void testPersistToDiskDrainsQueueBeforeWrite(@TempDir final Path tempDir) ); assertEquals(10, result.size()); } - finally - { - index.close(); - } } // ==================== Close Drains Queue Tests ==================== @@ -526,28 +472,22 @@ void testPendingCountTracksQueuedOperations() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - try + )) { - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; // Initially empty - assertEquals(0, defaultIndex.indexingManager.getPendingCount()); + assertEquals(0, defaultIndex.backgroundTaskManager.getPendingIndexingCount()); // After drain, count should be 0 gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); - assertEquals(0, defaultIndex.indexingManager.getPendingCount()); - } - finally - { - index.close(); + assertEquals(0, defaultIndex.backgroundTaskManager.getPendingIndexingCount()); } } @@ -572,20 +512,18 @@ void testLargeDataSetWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { // Add random vectors - for(int i = 0; i < vectorCount; i++) + for (int i = 0; i < vectorCount; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } - final VectorIndex.Default 
defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // Search should return correct number of results final VectorSearchResult result = index.search( @@ -594,16 +532,12 @@ void testLargeDataSetWithEventualIndexing() assertEquals(10, result.size()); // All results should have valid scores - for(final VectorSearchResult.Entry entry : result) + for (final VectorSearchResult.Entry entry : result) { assertTrue(entry.score() > 0, "Score should be positive"); assertNotNull(entry.entity()); } } - finally - { - index.close(); - } } // ==================== On-Disk with Eventual Indexing ==================== @@ -628,13 +562,11 @@ void testOnDiskWithEventualIndexing(@TempDir final Path tempDir) .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < vectorCount; i++) + for (int i = 0; i < vectorCount; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } @@ -649,10 +581,6 @@ void testOnDiskWithEventualIndexing(@TempDir final Path tempDir) ); assertEquals(10, result.size()); } - finally - { - index.close(); - } } // ==================== Background Persistence + Eventual Indexing ==================== @@ -678,13 +606,11 @@ void testBackgroundPersistenceWithEventualIndexing(@TempDir final Path tempDir) .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < 20; i++) + for (int i = 0; i < 20; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } @@ -694,10 +620,6 @@ void testBackgroundPersistenceWithEventualIndexing(@TempDir final 
Path tempDir) assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); } - finally - { - index.close(); - } } // ==================== Disabled by Default Tests ==================== @@ -713,20 +635,21 @@ void testEventualIndexingDisabledByDefault() .similarityFunction(VectorSimilarityFunction.COSINE) .build(); - final VectorIndex index = vectorIndices.add( + try(final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + )) + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; - // Indexing manager should be null when eventualIndexing is false - assertNull(defaultIndex.indexingManager); + // Background task manager should be null when no background features are enabled + assertNull(defaultIndex.backgroundTaskManager); - // Synchronous indexing should still work - gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + // Synchronous indexing should still work + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); - final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 1); - assertEquals(1, result.size()); + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + assertEquals(1, result.size()); + } } // ==================== Combined Operations Tests ==================== @@ -744,30 +667,28 @@ void testAddUpdateRemoveSequenceWithEventualIndexing() .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new EmbeddedDocumentVectorizer() - ); - - try + )) { - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; // Add 3 documents gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); 
gigaMap.add(new Document("doc3", new float[]{0.0f, 0.0f, 1.0f})); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); // Update doc2 gigaMap.set(1L, new Document("doc2_updated", new float[]{0.9f, 0.1f, 0.0f})); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); // Remove doc3 gigaMap.removeById(2L); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); // Search: should find 2 documents final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 3); @@ -777,10 +698,6 @@ void testAddUpdateRemoveSequenceWithEventualIndexing() assertEquals("doc1", result.toList().get(0).entity().content()); assertEquals("doc2_updated", result.toList().get(1).entity().content()); } - finally - { - index.close(); - } } // ==================== Background Optimization + Eventual Indexing ==================== @@ -803,25 +720,23 @@ void testBackgroundOptimizationWithEventualIndexing() throws Exception .eventualIndexing(true) .build(); - final VectorIndex index = vectorIndices.add( + try (final VectorIndex index = vectorIndices.add( "embeddings", config, new ComputedDocumentVectorizer() - ); - - try + )) { - for(int i = 0; i < 50; i++) + for (int i = 0; i < 50; i++) { gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - defaultIndex.indexingManager.drainQueue(); + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + defaultIndex.backgroundTaskManager.drainQueue(); // Wait for background optimization to run Thread.sleep(800); // Optimization should have run at least once - assertTrue(defaultIndex.optimizationManager.getOptimizationCount() >= 1); + assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1); // Search should still work final VectorSearchResult result = index.search( @@ -829,9 +744,5 @@ void 
testBackgroundOptimizationWithEventualIndexing() throws Exception ); assertEquals(10, result.size()); } - finally - { - index.close(); - } } } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java index fe651b85..7578e8c0 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexPerformanceTest.java @@ -881,7 +881,7 @@ private long[] measureSingleAddEventual( final long addMs = (System.nanoTime() - addStart) / 1_000_000; final long drainStart = System.nanoTime(); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); final long drainMs = (System.nanoTime() - drainStart) / 1_000_000; System.err.printf("add=%,d ms drain=%,d ms total=%,d ms (%,.0f vec/sec add-visible)%n", @@ -972,7 +972,7 @@ private long[] measureBatchAddEventual( final long addMs = (System.nanoTime() - addStart) / 1_000_000; final long drainStart = System.nanoTime(); - defaultIndex.indexingManager.drainQueue(); + defaultIndex.backgroundTaskManager.drainQueue(); final long drainMs = (System.nanoTime() - drainStart) / 1_000_000; System.err.printf("add=%,d ms drain=%,d ms total=%,d ms (%,.0f vec/sec add-visible)%n", @@ -1015,7 +1015,7 @@ private void verifySearchQuality( if(eventual) { - ((VectorIndex.Default)index).indexingManager.drainQueue(); + ((VectorIndex.Default)index).backgroundTaskManager.drainQueue(); } int totalResults = 0; From 174e3ee5f331e2fbe006493e9ce6b347ce9eff59 Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Sat, 21 Feb 2026 15:10:19 +0100 Subject: [PATCH 15/15] Gigamap jvector update tests update (#569) * move configuration tests to VICT file, remove duplicate tests.
* remove test, duplicate of testBackgroundOptimizationTriggersAfterIntervalAndThreshold * test refactoring * change indents - to see diff in PR * reverts original formats * add vector indices unit tests * remove dulicite tests --- gigamap/jvector/pom.xml | 6 + .../jvector/VectorIndexConfigurationTest.java | 202 +++- .../gigamap/jvector/VectorIndexDiskTest.java | 1077 ++++------------- .../gigamap/jvector/VectorIndicesTest.java | 379 ++++++ 4 files changed, 776 insertions(+), 888 deletions(-) create mode 100644 gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java diff --git a/gigamap/jvector/pom.xml b/gigamap/jvector/pom.xml index 00f7e8f3..ab5cf765 100644 --- a/gigamap/jvector/pom.xml +++ b/gigamap/jvector/pom.xml @@ -44,6 +44,12 @@ junit-jupiter-engine test + + org.awaitility + awaitility + 4.2.2 + test + diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index 786b55b7..901ce3df 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -123,30 +123,6 @@ void testBuilderRequiresNonNegativePqSubspaces() ); } - @Test - void testBuilderRequiresNonNegativePersistenceIntervalMs() - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .persistenceIntervalMs(0) - .build(); - assertEquals(0L, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).persistenceIntervalMs(-1).build() - ); - } - - @Test - void testBuilderRequiresNonNegativeMinChangesBetweenPersists() - { - assertThrows(IllegalArgumentException.class, () -> - 
VectorIndexConfiguration.builder().dimension(64).minChangesBetweenPersists(-1).build() - ); - } - @Test void testBuilderRequiresNonNegativeOptimizationIntervalMs() { @@ -192,19 +168,6 @@ void testOnDiskRequiresIndexDirectory() ); } - @Test - void testOnDiskWithIndexDirectorySucceeds(@TempDir final Path tempDir) - { - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .onDisk(true) - .indexDirectory(tempDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(tempDir, config.indexDirectory()); - } - @Test void testCompressionRequiresOnDisk() { @@ -1014,4 +977,169 @@ void testFactoryMethodsDefaultEventualIndexingFalse(@TempDir final Path tempDir) assertFalse(VectorIndexConfiguration.forLargeDataset(64, tempDir).eventualIndexing()); assertFalse(VectorIndexConfiguration.forHighPrecision(64).eventualIndexing()); } + + /** + * Test on-disk configuration builder. + */ + @Test + void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(tempDir) + .build(); + + assertTrue(config.onDisk()); + assertEquals(tempDir, config.indexDirectory()); + assertFalse(config.enablePqCompression()); + assertEquals(0, config.pqSubspaces()); + } + + /** + * Test on-disk configuration with compression. + * FusedPQ requires maxDegree=32, so it should be auto-set. 
+ */ + @Test + void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) // Will be overridden to 32 for FusedPQ + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(32) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.enablePqCompression()); + assertEquals(32, config.pqSubspaces()); + assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); + } + + /** + * Test that maxDegree is auto-set to 32 when compression is enabled. + */ + @Test + void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) + { + // Try to set maxDegree to 64 with compression enabled + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .maxDegree(64) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .build(); + + // Should be overridden to 32 + assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); + } + + /** + * Test background persistence configuration builder. + */ + @Test + void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) + .persistOnShutdown(true) + .minChangesBetweenPersists(50) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundPersistence()); + assertEquals(60_000, config.persistenceIntervalMs()); + assertTrue(config.persistOnShutdown()); + assertEquals(50, config.minChangesBetweenPersists()); + } + + /** + * Test validation: persistenceIntervalMs must be non-negative. 
+ */ + @Test + void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) + { + // 0 is valid (means disabled) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(0) + .build(); + assertEquals(0, config.persistenceIntervalMs()); + assertFalse(config.backgroundPersistence()); + + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(-1000) + .build() + ); + } + + /** + * Test validation: minChangesBetweenPersists must be non-negative. + */ + @Test + void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) + { + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(-1) + .build() + ); + + // Zero should be allowed (persist on every interval) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(0) + .build(); + assertEquals(0, config.minChangesBetweenPersists()); + } + + /** + * Test background optimization configuration builder. 
+ */ + @Test + void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(120_000) + .minChangesBetweenOptimizations(500) + .optimizeOnShutdown(true) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundOptimization()); + assertEquals(120_000, config.optimizationIntervalMs()); + assertEquals(500, config.minChangesBetweenOptimizations()); + assertTrue(config.optimizeOnShutdown()); + } + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index ad8c6c8a..3d94f254 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -9,17 +9,14 @@ * This program and the accompanying materials are made * available under the terms of the Eclipse Public License 2.0 * which is available at https://www.eclipse.org/legal/epl-2.0/ - * + * * SPDX-License-Identifier: EPL-2.0 * #L% */ -import org.eclipse.store.gigamap.types.GigaMap; -import org.eclipse.store.storage.embedded.types.EmbeddedStorage; -import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.io.TempDir; +import static java.time.Duration.ofMillis; +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; @@ -33,8 +30,14 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import 
java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; -import static org.junit.jupiter.api.Assertions.*; +import org.eclipse.store.gigamap.types.GigaMap; +import org.eclipse.store.storage.embedded.types.EmbeddedStorage; +import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; /** * Tests for on-disk VectorIndex functionality and Product Quantization. @@ -97,120 +100,33 @@ private static float[] randomVector(final Random random, final int dimension) } /** - * Test on-disk configuration builder. - */ - @Test - void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(indexDir, config.indexDirectory()); - assertFalse(config.enablePqCompression()); - assertEquals(0, config.pqSubspaces()); - } - - /** - * Test on-disk configuration with compression. - * FusedPQ requires maxDegree=32, so it should be auto-set. 
- */ - @Test - void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) // Will be overridden to 32 for FusedPQ - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(32) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.enablePqCompression()); - assertEquals(32, config.pqSubspaces()); - assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); - } - - /** - * Test that maxDegree is auto-set to 32 when compression is enabled. - */ - @Test - void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - // Try to set maxDegree to 64 with compression enabled - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .maxDegree(64) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .build(); - - // Should be overridden to 32 - assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); - } - - /** - * Test validation: onDisk requires indexDirectory. + * Helper to add multiple documents with random vectors to a GigaMap. */ - @Test - void testOnDiskRequiresIndexDirectory() + private static void addRandomDocuments( + final GigaMap gigaMap, + final Random random, + final int dimension, + final int count, + final String prefix + ) { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - // indexDirectory not set - .build() - ); + IntStream.range(0, count) + .forEach(i -> gigaMap.add(new Document(prefix + i, randomVector(random, dimension)))); } /** - * Test validation: compression requires onDisk. + * Helper to add multiple documents from a list of pre-generated vectors. 
*/ - @Test - void testCompressionRequiresOnDisk() + private static void addDocumentsFromVectors( + final GigaMap gigaMap, + final List vectors, + final String prefix + ) { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .enablePqCompression(true) - // onDisk not set - .build() - ); + IntStream.range(0, vectors.size()) + .forEach(i -> gigaMap.add(new Document(prefix + i, vectors.get(i)))); } - /** - * Test validation: pqSubspaces must divide dimension evenly. - */ - @Test - void testPqSubspacesMustDivideDimension(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(33) // 100 is not divisible by 33 - .build() - ); - } /** * Test creating an on-disk index and persisting it. @@ -260,10 +176,7 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I assertFalse(index.isPqCompressionEnabled()); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Search and record expected results final VectorSearchResult result = index.search(queryVector, 10); @@ -294,7 +207,6 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); // Search and compare results @@ -346,10 +258,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertTrue(index.isPqCompressionEnabled()); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, 
dimension, vectorCount, "doc_"); // Train compression ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -361,11 +270,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertEquals(10, result.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); // Persist to disk index.persistToDisk(); @@ -406,10 +311,7 @@ void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException ); // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); // Add a one-hot "needle" vector that randomVector() cannot produce, // since randomVector() populates all dimensions with non-zero values. 
@@ -459,10 +361,7 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("phase1_doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 100, "phase1_doc_"); assertEquals(100, gigaMap.size()); storage.storeRoot(); @@ -484,10 +383,7 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep assertEquals(10, result.size()); // Add more vectors - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("phase2_doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "phase2_doc_"); assertEquals(150, gigaMap.size()); storage.storeRoot(); @@ -549,10 +445,7 @@ void testPqCompressionSearchQuality(@TempDir final Path tempDir) ); // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); // Add a one-hot "needle" vector that randomVector() cannot produce, // since randomVector() populates all dimensions with non-zero values. 
@@ -634,10 +527,7 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Train and search ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -668,7 +558,6 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); @@ -686,11 +575,7 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(expectedIds.size(), actualIds.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); } } } @@ -726,10 +611,7 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -737,10 +619,7 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -774,10 +653,7 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 
0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -785,10 +661,7 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -824,10 +697,7 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -835,11 +705,7 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); } /** @@ -874,10 +740,7 @@ void testPqCompressionWithRemoval(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -935,10 +798,7 @@ void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Excep new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, 
randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -1016,10 +876,7 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) ); // Add initial vectors - for(int i = 0; i < initialCount; i++) - { - gigaMap.add(new Document("initial_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); // Train PQ ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -1030,10 +887,7 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) assertEquals(10, resultBefore.size()); // Add more vectors after training - for(int i = 0; i < additionalCount; i++) - { - gigaMap.add(new Document("additional_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, additionalCount, "additional_"); assertEquals(initialCount + additionalCount, gigaMap.size()); @@ -1087,10 +941,7 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc new ComputedDocumentVectorizer() ); - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); @@ -1114,7 +965,6 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc assertEquals(500, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); @@ -1123,11 +973,8 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc assertEquals(10, result.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - 
assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + } } @@ -1180,10 +1027,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) ); // Initial population - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("old_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 500, "old_"); assertEquals(500, gigaMap.size()); @@ -1192,10 +1036,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) assertEquals(0, gigaMap.size()); // Repopulate - for(int i = 0; i < 600; i++) - { - gigaMap.add(new Document("new_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 600, "new_"); assertEquals(600, gigaMap.size()); @@ -1209,11 +1050,8 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); assertEquals(20, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertTrue(entry.entity().content().startsWith("new_"), - "All results should be from new population"); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("new_"))); + } /** @@ -1246,10 +1084,7 @@ void testInMemoryIndexStillWorks() assertFalse(index.isOnDisk()); // Add vectors - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 100, "doc_"); // Search should work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); @@ -1261,118 +1096,6 @@ void testInMemoryIndexStillWorks() // Background Persistence Tests // ======================================================================== - /** - * Test background persistence configuration builder. 
- */ - @Test - void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) - .persistOnShutdown(true) - .minChangesBetweenPersists(50) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.backgroundPersistence()); - assertEquals(60_000, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(50, config.minChangesBetweenPersists()); - } - - /** - * Test background persistence configuration defaults. - */ - @Test - void testBackgroundPersistenceConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background persistence should be disabled by default - assertFalse(config.backgroundPersistence()); - assertEquals(0, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(100, config.minChangesBetweenPersists()); - } - - /** - * Test validation: background persistence requires onDisk. - */ - @Test - void testBackgroundPersistenceRequiresOnDisk() - { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .persistenceIntervalMs(30_000) - // onDisk not set - .build() - ); - } - - /** - * Test validation: persistenceIntervalMs must be non-negative. 
- */ - @Test - void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(0) - .build(); - assertEquals(0, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenPersists must be non-negative. - */ - @Test - void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(-1) - .build() - ); - - // Zero should be allowed (persist on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(0) - .build(); - assertEquals(0, config.minChangesBetweenPersists()); - } - /** * Test that background persistence triggers after the configured interval. 
*/ @@ -1405,23 +1128,22 @@ void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) try { // Add vectors to trigger dirty state - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Initially, files should not exist (not yet persisted) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), "Graph file should not exist immediately after adding"); // Wait for background persistence to trigger (interval + some buffer) - Thread.sleep(1500); + await() + .atMost(ofMillis(1500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after background persistence"), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after background persistence"))); - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after background persistence"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after background persistence"); } finally { @@ -1461,10 +1183,7 @@ void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir try { // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Run concurrent searches while background persistence may be running final int numSearches = 50; @@ -1547,10 +1266,7 @@ void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exce ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Files should 
not exist yet (interval hasn't triggered) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), @@ -1597,10 +1313,7 @@ void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Ex ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Close the index (should NOT trigger persist) index.close(); @@ -1642,13 +1355,10 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception try { // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold // Wait for multiple persistence intervals - Thread.sleep(800); + Thread.sleep(500); // Files should NOT exist because change count is below threshold assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), @@ -1660,12 +1370,11 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } - // Wait for persistence to trigger - Thread.sleep(500); - - // Now files should exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist when changes exceed threshold"); + await() + .atMost(ofMillis(500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist when changes exceed threshold")); } finally { @@ -1712,14 +1421,12 @@ void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception gigaMap.addAll(documents); // Wait for persistence - Thread.sleep(800); - - // Files should exist because bulk add counted as 150 changes (> 100 threshold) - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should 
exist after bulk add exceeds threshold"); - } - finally - { + await() + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after bulk add exceeds threshold")); + } finally { index.close(); } } @@ -1764,10 +1471,7 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify search works final VectorSearchResult result = index.search(queryVector, expectedK); @@ -1798,7 +1502,6 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); // Search should still work after reload @@ -1843,10 +1546,7 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Files should not exist yet assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); @@ -1872,130 +1572,33 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD // ======================================================================== /** - * Test background optimization configuration builder. + * Test that background optimization runs after the configured interval and threshold. 
*/ @Test - void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception { + final int dimension = 32; + final Random random = new Random(42); final Path indexDir = tempDir.resolve("index"); + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval and low threshold for testing final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) + .dimension(dimension) .similarityFunction(VectorSimilarityFunction.COSINE) .onDisk(true) .indexDirectory(indexDir) - .optimizationIntervalMs(120_000) - .minChangesBetweenOptimizations(500) - .optimizeOnShutdown(true) + .optimizationIntervalMs(300) // 300ms for fast test + .minChangesBetweenOptimizations(10) // Low threshold .build(); - assertTrue(config.onDisk()); - assertTrue(config.backgroundOptimization()); - assertEquals(120_000, config.optimizationIntervalMs()); - assertEquals(500, config.minChangesBetweenOptimizations()); - assertTrue(config.optimizeOnShutdown()); - } - - /** - * Test background optimization configuration defaults. - */ - @Test - void testBackgroundOptimizationConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background optimization should be disabled by default - assertFalse(config.backgroundOptimization()); - assertEquals(0, config.optimizationIntervalMs()); - assertEquals(1000, config.minChangesBetweenOptimizations()); - assertFalse(config.optimizeOnShutdown()); - } - - /** - * Test validation: optimizationIntervalMs must be non-negative. 
- */ - @Test - void testOptimizationIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(0) - .build(); - assertEquals(0, config.optimizationIntervalMs()); - assertFalse(config.backgroundOptimization()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenOptimizations must be non-negative. - */ - @Test - void testMinChangesBetweenOptimizationsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(-1) - .build() - ); - - // Zero should be allowed (optimize on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(0) - .build(); - assertEquals(0, config.minChangesBetweenOptimizations()); - } - - /** - * Test that background optimization runs after the configured interval and threshold. 
- */ - @Test - void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval and low threshold for testing - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) // 300ms for fast test - .minChangesBetweenOptimizations(10) // Low threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); try { @@ -2006,21 +1609,19 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final "Optimization count should be 0 initially"); // Add vectors to trigger dirty state above threshold - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Verify pending changes are tracked assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, "Pending changes should be tracked"); - // Wait for background optimization to run - Thread.sleep(800); - // Verify optimization was actually performed - assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, - "Optimization should have been performed at least once"); + await() + .atLeast(ofMillis(300)) + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, + "Optimization 
should have been performed at least once")); // Verify pending changes were reset assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2070,10 +1671,7 @@ void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throw final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold // Verify pending changes are tracked assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2132,11 +1730,7 @@ void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exc final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2191,10 +1785,7 @@ void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws E final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2246,10 +1837,7 @@ void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDi try { // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, 
dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Run concurrent searches while background optimization may be running final int numSearches = 50; @@ -2340,7 +1928,7 @@ void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) th gigaMap.addAll(documents); // Wait for optimization - Thread.sleep(800); + Thread.sleep(500); // Search should still work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); @@ -2384,10 +1972,7 @@ void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tem try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Manually trigger optimization index.optimize(); @@ -2431,18 +2016,15 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Wait for both background tasks to run Thread.sleep(1000); @@ -2461,305 +2043,11 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi } } - /** - * Test that in-memory index can also use background optimization. 
- */ - @Test - void testInMemoryIndexWithBackgroundOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 150; - final Random random = new Random(42); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // In-memory index with background optimization only - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .optimizationIntervalMs(200) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) - .build(); - - assertFalse(config.onDisk(), "Should be in-memory index"); - assertTrue(config.backgroundOptimization(), "Background optimization should be enabled"); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Wait for optimization to run - Thread.sleep(600); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - // ======================================================================== // Parallel vs Non-Parallel On-Disk Write Tests // ======================================================================== - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index without PQ compression. - * Both modes should produce identical graph files that yield the same search quality. 
- */ - @Test - void testParallelVsNonParallelOnDiskWrite(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - // --- Parallel mode --- - final List parallelIds; - final List parallelScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - parallelIds = new ArrayList<>(); - parallelScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); - } - - // --- Sequential mode --- - final List sequentialIds; - final List sequentialScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = 
gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - sequentialIds = new ArrayList<>(); - sequentialScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); - } - - // --- Compare results --- - assertEquals(k, parallelIds.size()); - assertEquals(k, sequentialIds.size()); - - // Both indices were built from the same data with the same HNSW parameters, - // so search results must be identical. - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential on-disk writes should produce identical search scores"); - } - - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index with PQ compression enabled. - * This exercises the FusedPQ write path which is the primary target of the parallel mode setting. 
- */ - @Test - void testParallelVsNonParallelOnDiskWriteWithCompression(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int pqSubspaces = 16; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - // --- Parallel mode with PQ --- - final List parallelIds; - final List parallelScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - parallelIds = new ArrayList<>(); - parallelScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); - } - - // --- Sequential mode with PQ --- - 
final List sequentialIds; - final List sequentialScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - sequentialIds = new ArrayList<>(); - sequentialScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); - } - - // --- Compare results --- - assertEquals(k, parallelIds.size()); - assertEquals(k, sequentialIds.size()); - - // Both indices were built from the same data with identical HNSW parameters and PQ training, - // so search results must be identical. 
- assertEquals(parallelIds, sequentialIds, - "Parallel and sequential PQ-compressed on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential PQ-compressed on-disk writes should produce identical search scores"); - } /** * Test that parallel and non-parallel on-disk writes both support persist-and-reload @@ -2806,7 +2094,6 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); final VectorSearchResult result = index.search(queryVector, k); @@ -2832,7 +2119,6 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); final VectorSearchResult result = index.search(queryVector, k); @@ -2888,10 +2174,7 @@ private void buildAndPersistIndex( "embeddings", config, new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectors.size(); i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); @@ -2948,28 +2231,23 @@ void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) new EmbeddedDocumentVectorizer() ); - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // This would deadlock before the fix index.persistToDisk(); // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + assertAll( + () -> 
assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); // Verify search still works after persist final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -3011,10 +2289,7 @@ void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path temp ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Train PQ compression ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -3023,12 +2298,112 @@ void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path temp index.persistToDisk(); // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); // Verify search still works final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. 
 + */
+	@Test
+	void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException
+	{
+		final int vectorCount = 2000;
+		final int dimension = 64;
+		final int k = 20;
+		final Random random = new Random(42);
+
+		// Generate shared vectors and query
+		final List vectors = new ArrayList<>();
+		for (int i = 0; i < vectorCount; i++) {
+			vectors.add(randomVector(random, dimension));
+		}
+		final float[] queryVector = randomVector(new Random(999), dimension);
+
+		final Path parallelIndexDir = tempDir.resolve("parallel");
+		final Path sequentialIndexDir = tempDir.resolve("sequential");
+
+		final List parallelIds = new ArrayList<>();
+		final List parallelScores = new ArrayList<>();
+		final List sequentialIds = new ArrayList<>();
+		final List sequentialScores = new ArrayList<>();
+
+		// --- Parallel config
+		final GigaMap gigaMap = GigaMap.New();
+		final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category());
+
+		final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder()
+			.dimension(dimension)
+			.similarityFunction(VectorSimilarityFunction.COSINE)
+			.maxDegree(16)
+			.beamWidth(100)
+			.onDisk(true)
+			.indexDirectory(parallelIndexDir)
+			.parallelOnDiskWrite(true)
+			.build();
+
+		final VectorIndex index = vectorIndices.add(
+			"embeddings", configParallel, new ComputedDocumentVectorizer()
+		);
+
+		// --- Sequential config
+		final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder()
+			.dimension(dimension)
+			.similarityFunction(VectorSimilarityFunction.COSINE)
+			.maxDegree(16)
+			// no PQ compression: must mirror the parallel config for result equality
+			.beamWidth(100)
+			.onDisk(true)
+			.indexDirectory(sequentialIndexDir)
+			.parallelOnDiskWrite(false)
+			.build();
+
+		final VectorIndex indexSequential = vectorIndices.add(
+			"embeddingsSequential", configSequential, new ComputedDocumentVectorizer()
+		);
+
+		addDocumentsFromVectors(gigaMap, vectors, "doc_");
+
+		index.persistToDisk();
+		indexSequential.persistToDisk();
+
+		//parallel
+ final VectorSearchResult result = index.search(queryVector, k); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + //sequential + final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); + for (final VectorSearchResult.Entry entry : resultSequential) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertAll( + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) + ); + + // Both indices were built from the same data with the same HNSW parameters, + // so search results must be identical. + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); + } + + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java new file mode 100644 index 00000000..5a6351f6 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java @@ -0,0 +1,379 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import 
org.eclipse.store.gigamap.types.GigaMap; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for {@link VectorIndices}. + *

    + * Tests the core functionality of vector index management: + * - Index registration and retrieval + * - Index name validation + * - Lifecycle management + */ +class VectorIndicesTest +{ + record Document(String content, float[] embedding) {} + + static class DocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + @Test + void testAddIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("test-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("test-index", index.name()); + } + + @Test + void testAddDuplicateIndexThrows() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("duplicate", config, new DocumentVectorizer()); + + assertThrows(RuntimeException.class, () -> + vectorIndices.add("duplicate", config, new DocumentVectorizer()) + ); + } + + @Test + void testGetExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex created = vectorIndices.add("my-index", config, new DocumentVectorizer()); + final VectorIndex retrieved = vectorIndices.get("my-index"); + + assertSame(created, retrieved); + } + + @Test + 
void testGetNonExistentIndexReturnsNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + assertNull(vectorIndices.get("non-existent")); + } + + @Test + void testEnsureCreatesNewIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.ensure("new-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("new-index", index.name()); + } + + @Test + void testEnsureReturnsExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex first = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + final VectorIndex second = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + + assertSame(first, second); + } + + @Test + void testValidateIndexNameNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(null, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameEmpty() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final 
VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithSlash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid/name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithBackslash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid\\name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameTooLong() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final String tooLong = "a".repeat(201); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(tooLong, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithValidCharacters() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertDoesNotThrow(() -> + vectorIndices.add("valid-index_name.123", config, new DocumentVectorizer()) + ); + } + + @Test + void testInternalAddPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(1, result1.size()); + assertEquals(1, result2.size()); + } + + @Test + void testInternalRemovePropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + gigaMap.removeById(0); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final 
VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(0, result1.size()); + assertEquals(0, result2.size()); + } + + @Test + void testInternalRemoveAllPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + + gigaMap.add(new Document("test1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("test2", new float[]{0.0f, 1.0f, 0.0f})); + + gigaMap.removeAll(); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorSearchResult result = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(0, result.size()); + } + + @Test + void testIterateIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + vectorIndices.add("index3", config, new DocumentVectorizer()); + + final int[] count = {0}; + vectorIndices.iterate(index -> count[0]++); + + assertEquals(3, count[0]); + } + + @Test + void testAccessIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", 
config, new DocumentVectorizer()); + + vectorIndices.accessIndices(table -> { + assertNotNull(table.get("index1")); + assertNotNull(table.get("index2")); + assertNull(table.get("non-existent")); + }); + } + + @Test + void testIndexAutoPopulatesExistingEntities() + { + final GigaMap gigaMap = GigaMap.New(); + + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("new-index", config, new DocumentVectorizer()); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(2, result.size(), "Index should auto-populate with existing entities"); + } +} +