diff --git a/gigamap/jvector/pom.xml b/gigamap/jvector/pom.xml index 00f7e8f3..ab5cf765 100644 --- a/gigamap/jvector/pom.xml +++ b/gigamap/jvector/pom.xml @@ -44,6 +44,12 @@ <artifactId>junit-jupiter-engine</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>org.awaitility</groupId> + <artifactId>awaitility</artifactId> + <version>4.2.2</version> + <scope>test</scope> + </dependency> diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index 786b55b7..901ce3df 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -123,30 +123,6 @@ void testBuilderRequiresNonNegativePqSubspaces() ); } - @Test - void testBuilderRequiresNonNegativePersistenceIntervalMs() - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .persistenceIntervalMs(0) - .build(); - assertEquals(0L, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).persistenceIntervalMs(-1).build() - ); - } - - @Test - void testBuilderRequiresNonNegativeMinChangesBetweenPersists() - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).minChangesBetweenPersists(-1).build() - ); - } - @Test void testBuilderRequiresNonNegativeOptimizationIntervalMs() { @@ -192,19 +168,6 @@ void testOnDiskRequiresIndexDirectory() ); } - @Test - void testOnDiskWithIndexDirectorySucceeds(@TempDir final Path tempDir) - { - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .onDisk(true) - .indexDirectory(tempDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(tempDir, config.indexDirectory()); - } - @Test void testCompressionRequiresOnDisk() { @@ 
-1014,4 +977,169 @@ void testFactoryMethodsDefaultEventualIndexingFalse(@TempDir final Path tempDir) assertFalse(VectorIndexConfiguration.forLargeDataset(64, tempDir).eventualIndexing()); assertFalse(VectorIndexConfiguration.forHighPrecision(64).eventualIndexing()); } + + /** + * Test on-disk configuration builder. + */ + @Test + void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(tempDir) + .build(); + + assertTrue(config.onDisk()); + assertEquals(tempDir, config.indexDirectory()); + assertFalse(config.enablePqCompression()); + assertEquals(0, config.pqSubspaces()); + } + + /** + * Test on-disk configuration with compression. + * FusedPQ requires maxDegree=32, so it should be auto-set. + */ + @Test + void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) // Will be overridden to 32 for FusedPQ + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(32) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.enablePqCompression()); + assertEquals(32, config.pqSubspaces()); + assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); + } + + /** + * Test that maxDegree is auto-set to 32 when compression is enabled. 
+ */ + @Test + void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) + { + // Try to set maxDegree to 64 with compression enabled + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .maxDegree(64) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .build(); + + // Should be overridden to 32 + assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); + } + + /** + * Test background persistence configuration builder. + */ + @Test + void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) + .persistOnShutdown(true) + .minChangesBetweenPersists(50) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundPersistence()); + assertEquals(60_000, config.persistenceIntervalMs()); + assertTrue(config.persistOnShutdown()); + assertEquals(50, config.minChangesBetweenPersists()); + } + + /** + * Test validation: persistenceIntervalMs must be non-negative. + */ + @Test + void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) + { + // 0 is valid (means disabled) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(0) + .build(); + assertEquals(0, config.persistenceIntervalMs()); + assertFalse(config.backgroundPersistence()); + + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(-1000) + .build() + ); + } + + /** + * Test validation: minChangesBetweenPersists must be non-negative. 
+ */ + @Test + void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) + { + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(-1) + .build() + ); + + // Zero should be allowed (persist on every interval) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(0) + .build(); + assertEquals(0, config.minChangesBetweenPersists()); + } + + /** + * Test background optimization configuration builder. + */ + @Test + void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(120_000) + .minChangesBetweenOptimizations(500) + .optimizeOnShutdown(true) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundOptimization()); + assertEquals(120_000, config.optimizationIntervalMs()); + assertEquals(500, config.minChangesBetweenOptimizations()); + assertTrue(config.optimizeOnShutdown()); + } + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index ad8c6c8a..3d94f254 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -9,17 +9,14 @@ * This program and the accompanying materials are made * available under the terms of the Eclipse Public License 2.0 * which is available at https://www.eclipse.org/legal/epl-2.0/ - * + * * 
SPDX-License-Identifier: EPL-2.0 * #L% */ -import org.eclipse.store.gigamap.types.GigaMap; -import org.eclipse.store.storage.embedded.types.EmbeddedStorage; -import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.io.TempDir; +import static java.time.Duration.ofMillis; +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; @@ -33,8 +30,14 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; -import static org.junit.jupiter.api.Assertions.*; +import org.eclipse.store.gigamap.types.GigaMap; +import org.eclipse.store.storage.embedded.types.EmbeddedStorage; +import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; /** * Tests for on-disk VectorIndex functionality and Product Quantization. @@ -97,120 +100,33 @@ private static float[] randomVector(final Random random, final int dimension) } /** - * Test on-disk configuration builder. - */ - @Test - void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(indexDir, config.indexDirectory()); - assertFalse(config.enablePqCompression()); - assertEquals(0, config.pqSubspaces()); - } - - /** - * Test on-disk configuration with compression. - * FusedPQ requires maxDegree=32, so it should be auto-set. 
- */ - @Test - void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) // Will be overridden to 32 for FusedPQ - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(32) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.enablePqCompression()); - assertEquals(32, config.pqSubspaces()); - assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); - } - - /** - * Test that maxDegree is auto-set to 32 when compression is enabled. - */ - @Test - void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - // Try to set maxDegree to 64 with compression enabled - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .maxDegree(64) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .build(); - - // Should be overridden to 32 - assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); - } - - /** - * Test validation: onDisk requires indexDirectory. + * Helper to add multiple documents with random vectors to a GigaMap. */ - @Test - void testOnDiskRequiresIndexDirectory() + private static void addRandomDocuments( + final GigaMap<Document> gigaMap, + final Random random, + final int dimension, + final int count, + final String prefix + ) { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - // indexDirectory not set - .build() - ); + IntStream.range(0, count) + .forEach(i -> gigaMap.add(new Document(prefix + i, randomVector(random, dimension)))); } /** - * Test validation: compression requires onDisk. + * Helper to add multiple documents from a list of pre-generated vectors. 
+ */ - @Test - void testCompressionRequiresOnDisk() + private static void addDocumentsFromVectors( + final GigaMap<Document> gigaMap, + final List<float[]> vectors, + final String prefix + ) { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .enablePqCompression(true) - // onDisk not set - .build() - ); + IntStream.range(0, vectors.size()) + .forEach(i -> gigaMap.add(new Document(prefix + i, vectors.get(i)))); } - /** - * Test validation: pqSubspaces must divide dimension evenly. - */ - @Test - void testPqSubspacesMustDivideDimension(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(33) // 100 is not divisible by 33 - .build() - ); - } /** * Test creating an on-disk index and persisting it. @@ -260,10 +176,7 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I assertFalse(index.isPqCompressionEnabled()); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Search and record expected results final VectorSearchResult result = index.search(queryVector, 10); @@ -294,7 +207,6 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); // Search and compare results @@ -346,10 +258,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertTrue(index.isPqCompressionEnabled()); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, 
dimension, vectorCount, "doc_"); // Train compression ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -361,11 +270,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertEquals(10, result.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); // Persist to disk index.persistToDisk(); @@ -406,10 +311,7 @@ void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException ); // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); // Add a one-hot "needle" vector that randomVector() cannot produce, // since randomVector() populates all dimensions with non-zero values. 
@@ -459,10 +361,7 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("phase1_doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 100, "phase1_doc_"); assertEquals(100, gigaMap.size()); storage.storeRoot(); @@ -484,10 +383,7 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep assertEquals(10, result.size()); // Add more vectors - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("phase2_doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "phase2_doc_"); assertEquals(150, gigaMap.size()); storage.storeRoot(); @@ -549,10 +445,7 @@ void testPqCompressionSearchQuality(@TempDir final Path tempDir) ); // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); // Add a one-hot "needle" vector that randomVector() cannot produce, // since randomVector() populates all dimensions with non-zero values. 
@@ -634,10 +527,7 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Train and search ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -668,7 +558,6 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); @@ -686,11 +575,7 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(expectedIds.size(), actualIds.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); } } } @@ -726,10 +611,7 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -737,10 +619,7 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -774,10 +653,7 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 
0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -785,10 +661,7 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -824,10 +697,7 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -835,11 +705,7 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); } /** @@ -874,10 +740,7 @@ void testPqCompressionWithRemoval(@TempDir final Path tempDir) new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -935,10 +798,7 @@ void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Excep new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, 
randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -1016,10 +876,7 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) ); // Add initial vectors - for(int i = 0; i < initialCount; i++) - { - gigaMap.add(new Document("initial_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); // Train PQ ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -1030,10 +887,7 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) assertEquals(10, resultBefore.size()); // Add more vectors after training - for(int i = 0; i < additionalCount; i++) - { - gigaMap.add(new Document("additional_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, additionalCount, "additional_"); assertEquals(initialCount + additionalCount, gigaMap.size()); @@ -1087,10 +941,7 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc new ComputedDocumentVectorizer() ); - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); @@ -1114,7 +965,6 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc assertEquals(500, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); assertTrue(index.isPqCompressionEnabled()); @@ -1123,11 +973,8 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc assertEquals(10, result.size()); // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - 
assertTrue(entry.entity().content().startsWith("doc_")); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + } } @@ -1180,10 +1027,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) ); // Initial population - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("old_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 500, "old_"); assertEquals(500, gigaMap.size()); @@ -1192,10 +1036,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) assertEquals(0, gigaMap.size()); // Repopulate - for(int i = 0; i < 600; i++) - { - gigaMap.add(new Document("new_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 600, "new_"); assertEquals(600, gigaMap.size()); @@ -1209,11 +1050,8 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); assertEquals(20, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertTrue(entry.entity().content().startsWith("new_"), - "All results should be from new population"); - } + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("new_"))); + } /** @@ -1246,10 +1084,7 @@ void testInMemoryIndexStillWorks() assertFalse(index.isOnDisk()); // Add vectors - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 100, "doc_"); // Search should work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); @@ -1261,118 +1096,6 @@ void testInMemoryIndexStillWorks() // Background Persistence Tests // ======================================================================== - /** - * Test background persistence configuration builder. 
- */ - @Test - void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) - .persistOnShutdown(true) - .minChangesBetweenPersists(50) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.backgroundPersistence()); - assertEquals(60_000, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(50, config.minChangesBetweenPersists()); - } - - /** - * Test background persistence configuration defaults. - */ - @Test - void testBackgroundPersistenceConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background persistence should be disabled by default - assertFalse(config.backgroundPersistence()); - assertEquals(0, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(100, config.minChangesBetweenPersists()); - } - - /** - * Test validation: background persistence requires onDisk. - */ - @Test - void testBackgroundPersistenceRequiresOnDisk() - { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .persistenceIntervalMs(30_000) - // onDisk not set - .build() - ); - } - - /** - * Test validation: persistenceIntervalMs must be non-negative. 
- */ - @Test - void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(0) - .build(); - assertEquals(0, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenPersists must be non-negative. - */ - @Test - void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(-1) - .build() - ); - - // Zero should be allowed (persist on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(0) - .build(); - assertEquals(0, config.minChangesBetweenPersists()); - } - /** * Test that background persistence triggers after the configured interval. 
*/ @@ -1405,23 +1128,22 @@ void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) try { // Add vectors to trigger dirty state - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Initially, files should not exist (not yet persisted) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), "Graph file should not exist immediately after adding"); // Wait for background persistence to trigger (interval + some buffer) - Thread.sleep(1500); + await() + .atMost(ofMillis(1500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after background persistence"), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after background persistence"))); - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after background persistence"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after background persistence"); } finally { @@ -1461,10 +1183,7 @@ void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir try { // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Run concurrent searches while background persistence may be running final int numSearches = 50; @@ -1547,10 +1266,7 @@ void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exce ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Files should 
not exist yet (interval hasn't triggered) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), @@ -1597,10 +1313,7 @@ void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Ex ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Close the index (should NOT trigger persist) index.close(); @@ -1642,13 +1355,10 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception try { // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold // Wait for multiple persistence intervals - Thread.sleep(800); + Thread.sleep(500); // Files should NOT exist because change count is below threshold assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), @@ -1660,12 +1370,11 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); } - // Wait for persistence to trigger - Thread.sleep(500); - - // Now files should exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist when changes exceed threshold"); + await() + .atMost(ofMillis(500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist when changes exceed threshold")); } finally { @@ -1712,14 +1421,12 @@ void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception gigaMap.addAll(documents); // Wait for persistence - Thread.sleep(800); - - // Files should exist because bulk add counted as 150 changes (> 100 threshold) - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should 
exist after bulk add exceeds threshold"); - } - finally - { + await() + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after bulk add exceeds threshold")); + } finally { index.close(); } } @@ -1764,10 +1471,7 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify search works final VectorSearchResult result = index.search(queryVector, expectedK); @@ -1798,7 +1502,6 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); // Search should still work after reload @@ -1843,10 +1546,7 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Files should not exist yet assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); @@ -1872,130 +1572,33 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD // ======================================================================== /** - * Test background optimization configuration builder. + * Test that background optimization runs after the configured interval and threshold. 
*/ @Test - void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception { + final int dimension = 32; + final Random random = new Random(42); final Path indexDir = tempDir.resolve("index"); + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval and low threshold for testing final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) + .dimension(dimension) .similarityFunction(VectorSimilarityFunction.COSINE) .onDisk(true) .indexDirectory(indexDir) - .optimizationIntervalMs(120_000) - .minChangesBetweenOptimizations(500) - .optimizeOnShutdown(true) + .optimizationIntervalMs(300) // 300ms for fast test + .minChangesBetweenOptimizations(10) // Low threshold .build(); - assertTrue(config.onDisk()); - assertTrue(config.backgroundOptimization()); - assertEquals(120_000, config.optimizationIntervalMs()); - assertEquals(500, config.minChangesBetweenOptimizations()); - assertTrue(config.optimizeOnShutdown()); - } - - /** - * Test background optimization configuration defaults. - */ - @Test - void testBackgroundOptimizationConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background optimization should be disabled by default - assertFalse(config.backgroundOptimization()); - assertEquals(0, config.optimizationIntervalMs()); - assertEquals(1000, config.minChangesBetweenOptimizations()); - assertFalse(config.optimizeOnShutdown()); - } - - /** - * Test validation: optimizationIntervalMs must be non-negative. 
- */ - @Test - void testOptimizationIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(0) - .build(); - assertEquals(0, config.optimizationIntervalMs()); - assertFalse(config.backgroundOptimization()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenOptimizations must be non-negative. - */ - @Test - void testMinChangesBetweenOptimizationsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(-1) - .build() - ); - - // Zero should be allowed (optimize on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(0) - .build(); - assertEquals(0, config.minChangesBetweenOptimizations()); - } - - /** - * Test that background optimization runs after the configured interval and threshold. 
- */ - @Test - void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval and low threshold for testing - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) // 300ms for fast test - .minChangesBetweenOptimizations(10) // Low threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); try { @@ -2006,21 +1609,19 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final "Optimization count should be 0 initially"); // Add vectors to trigger dirty state above threshold - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Verify pending changes are tracked assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, "Pending changes should be tracked"); - // Wait for background optimization to run - Thread.sleep(800); - // Verify optimization was actually performed - assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, - "Optimization should have been performed at least once"); + await() + .atLeast(ofMillis(300)) + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, + "Optimization 
should have been performed at least once")); // Verify pending changes were reset assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2070,10 +1671,7 @@ void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throw final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold // Verify pending changes are tracked assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2132,11 +1730,7 @@ void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exc final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2191,10 +1785,7 @@ void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws E final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), @@ -2246,10 +1837,7 @@ void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDi try { // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, 
dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Run concurrent searches while background optimization may be running final int numSearches = 50; @@ -2340,7 +1928,7 @@ void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) th gigaMap.addAll(documents); // Wait for optimization - Thread.sleep(800); + Thread.sleep(500); // Search should still work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); @@ -2384,10 +1972,7 @@ void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tem try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Manually trigger optimization index.optimize(); @@ -2431,18 +2016,15 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); try { // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Wait for both background tasks to run Thread.sleep(1000); @@ -2461,305 +2043,11 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi } } - /** - * Test that in-memory index can also use background optimization. 
- */ - @Test - void testInMemoryIndexWithBackgroundOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 150; - final Random random = new Random(42); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // In-memory index with background optimization only - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .optimizationIntervalMs(200) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) - .build(); - - assertFalse(config.onDisk(), "Should be in-memory index"); - assertTrue(config.backgroundOptimization(), "Background optimization should be enabled"); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Wait for optimization to run - Thread.sleep(600); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - // ======================================================================== // Parallel vs Non-Parallel On-Disk Write Tests // ======================================================================== - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index without PQ compression. - * Both modes should produce identical graph files that yield the same search quality. 
- */ - @Test - void testParallelVsNonParallelOnDiskWrite(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - // --- Parallel mode --- - final List parallelIds; - final List parallelScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - parallelIds = new ArrayList<>(); - parallelScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); - } - - // --- Sequential mode --- - final List sequentialIds; - final List sequentialScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = 
gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - sequentialIds = new ArrayList<>(); - sequentialScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); - } - - // --- Compare results --- - assertEquals(k, parallelIds.size()); - assertEquals(k, sequentialIds.size()); - - // Both indices were built from the same data with the same HNSW parameters, - // so search results must be identical. - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential on-disk writes should produce identical search scores"); - } - - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index with PQ compression enabled. - * This exercises the FusedPQ write path which is the primary target of the parallel mode setting. 
- */ - @Test - void testParallelVsNonParallelOnDiskWriteWithCompression(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int pqSubspaces = 16; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - // --- Parallel mode with PQ --- - final List parallelIds; - final List parallelScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - parallelIds = new ArrayList<>(); - parallelScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); - } - - // --- Sequential mode with PQ --- - 
final List sequentialIds; - final List sequentialScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - sequentialIds = new ArrayList<>(); - sequentialScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); - } - - // --- Compare results --- - assertEquals(k, parallelIds.size()); - assertEquals(k, sequentialIds.size()); - - // Both indices were built from the same data with identical HNSW parameters and PQ training, - // so search results must be identical. 
- assertEquals(parallelIds, sequentialIds, - "Parallel and sequential PQ-compressed on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential PQ-compressed on-disk writes should produce identical search scores"); - } /** * Test that parallel and non-parallel on-disk writes both support persist-and-reload @@ -2806,7 +2094,6 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); final VectorSearchResult result = index.search(queryVector, k); @@ -2832,7 +2119,6 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro assertEquals(vectorCount, gigaMap.size()); final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); assertTrue(index.isOnDisk()); final VectorSearchResult result = index.search(queryVector, k); @@ -2888,10 +2174,7 @@ private void buildAndPersistIndex( "embeddings", config, new ComputedDocumentVectorizer() ); - for(int i = 0; i < vectors.size(); i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } + addDocumentsFromVectors(gigaMap, vectors, "doc_"); ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); @@ -2948,28 +2231,23 @@ void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) new EmbeddedDocumentVectorizer() ); - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // This would deadlock before the fix index.persistToDisk(); // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + assertAll( + () -> 
assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); // Verify search still works after persist final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } + result.forEach(entry -> assertNotNull(entry.entity())); } /** @@ -3011,10 +2289,7 @@ void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path temp ); // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Train PQ compression ((VectorIndex.Internal)index).trainCompressionIfNeeded(); @@ -3023,12 +2298,112 @@ void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path temp index.persistToDisk(); // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); // Verify search still works final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. 
+ */ + @Test + void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + + // --- Parallel config + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(parallelIndexDir) + .parallelOnDiskWrite(true) + .build(); + + // --- Sequential config + final VectorIndex index = vectorIndices.add( + "embeddings", configParallel, new ComputedDocumentVectorizer() + ); + + final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .enablePqCompression(true) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex indexSequential = vectorIndices.add( + "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + index.persistToDisk(); + indexSequential.persistToDisk(); + + //parallel 
+ final VectorSearchResult result = index.search(queryVector, k); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + //sequential + final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); + for (final VectorSearchResult.Entry entry : resultSequential) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertAll( + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) + ); + + // Both indices were built from the same data with the same HNSW parameters, + // so search results must be identical. + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); + } + + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java new file mode 100644 index 00000000..5a6351f6 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java @@ -0,0 +1,379 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import 
org.eclipse.store.gigamap.types.GigaMap; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for {@link VectorIndices}. + *

+ * Tests the core functionality of vector index management: + * - Index registration and retrieval + * - Index name validation + * - Lifecycle management + */ +class VectorIndicesTest +{ + record Document(String content, float[] embedding) {} + + static class DocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + @Test + void testAddIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("test-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("test-index", index.name()); + } + + @Test + void testAddDuplicateIndexThrows() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("duplicate", config, new DocumentVectorizer()); + + assertThrows(RuntimeException.class, () -> + vectorIndices.add("duplicate", config, new DocumentVectorizer()) + ); + } + + @Test + void testGetExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex created = vectorIndices.add("my-index", config, new DocumentVectorizer()); + final VectorIndex retrieved = vectorIndices.get("my-index"); + + assertSame(created, retrieved); + } + + @Test + void 
testGetNonExistentIndexReturnsNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + assertNull(vectorIndices.get("non-existent")); + } + + @Test + void testEnsureCreatesNewIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.ensure("new-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("new-index", index.name()); + } + + @Test + void testEnsureReturnsExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex first = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + final VectorIndex second = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + + assertSame(first, second); + } + + @Test + void testValidateIndexNameNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(null, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameEmpty() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration 
config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithSlash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid/name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithBackslash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid\\name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameTooLong() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final String tooLong = "a".repeat(201); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(tooLong, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithValidCharacters() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertDoesNotThrow(() -> + vectorIndices.add("valid-index_name.123", config, new DocumentVectorizer()) + ); + } + + @Test + void testInternalAddPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(1, result1.size()); + assertEquals(1, result2.size()); + } + + @Test + void testInternalRemovePropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + gigaMap.removeById(0); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final 
VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(0, result1.size()); + assertEquals(0, result2.size()); + } + + @Test + void testInternalRemoveAllPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + + gigaMap.add(new Document("test1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("test2", new float[]{0.0f, 1.0f, 0.0f})); + + gigaMap.removeAll(); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorSearchResult result = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(0, result.size()); + } + + @Test + void testIterateIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + vectorIndices.add("index3", config, new DocumentVectorizer()); + + final int[] count = {0}; + vectorIndices.iterate(index -> count[0]++); + + assertEquals(3, count[0]); + } + + @Test + void testAccessIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", 
config, new DocumentVectorizer()); + + vectorIndices.accessIndices(table -> { + assertNotNull(table.get("index1")); + assertNotNull(table.get("index2")); + assertNull(table.get("non-existent")); + }); + } + + @Test + void testIndexAutoPopulatesExistingEntities() + { + final GigaMap gigaMap = GigaMap.New(); + + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("new-index", config, new DocumentVectorizer()); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(2, result.size(), "Index should auto-populate with existing entities"); + } +} +