From 19821a80c374160756ec70a4955ee31a3da1e5cb Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Fri, 20 Feb 2026 14:07:28 +0100 Subject: [PATCH 1/7] move configuration tests to VICT file, remove duplicite tests. --- .../jvector/VectorIndexConfigurationTest.java | 161 +++++ .../gigamap/jvector/VectorIndexDiskTest.java | 572 ++++-------------- 2 files changed, 262 insertions(+), 471 deletions(-) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index 786b55b7..ebb4f451 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -1014,4 +1014,165 @@ void testFactoryMethodsDefaultEventualIndexingFalse(@TempDir final Path tempDir) assertFalse(VectorIndexConfiguration.forLargeDataset(64, tempDir).eventualIndexing()); assertFalse(VectorIndexConfiguration.forHighPrecision(64).eventualIndexing()); } + + /** + * Test on-disk configuration builder. + */ + @Test + void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(tempDir) + .build(); + + assertTrue(config.onDisk()); + assertEquals(tempDir, config.indexDirectory()); + assertFalse(config.enablePqCompression()); + assertEquals(0, config.pqSubspaces()); + } + + /** + * Test on-disk configuration with compression. + * FusedPQ requires maxDegree=32, so it should be auto-set. 
+ */ + @Test + void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) + { + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) // Will be overridden to 32 for FusedPQ + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .pqSubspaces(32) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.enablePqCompression()); + assertEquals(32, config.pqSubspaces()); + assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); + } + + /** + * Test that maxDegree is auto-set to 32 when compression is enabled. + */ + @Test + void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) + { + // Try to set maxDegree to 64 with compression enabled + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .maxDegree(64) + .onDisk(true) + .indexDirectory(tempDir) + .enablePqCompression(true) + .build(); + + // Should be overridden to 32 + assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); + } + + /** + * Test background persistence configuration builder. + */ + @Test + void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) + .persistOnShutdown(true) + .minChangesBetweenPersists(50) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundPersistence()); + assertEquals(60_000, config.persistenceIntervalMs()); + assertTrue(config.persistOnShutdown()); + assertEquals(50, config.minChangesBetweenPersists()); + } + + /** + * Test background persistence configuration defaults. 
+ */ + @Test + void testBackgroundPersistenceConfigurationDefaults(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + // Background persistence should be disabled by default + assertFalse(config.backgroundPersistence()); + assertEquals(0, config.persistenceIntervalMs()); + assertTrue(config.persistOnShutdown()); + assertEquals(100, config.minChangesBetweenPersists()); + } + + /** + * Test validation: persistenceIntervalMs must be non-negative. + */ + @Test + void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) + { + // 0 is valid (means disabled) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(0) + .build(); + assertEquals(0, config.persistenceIntervalMs()); + assertFalse(config.backgroundPersistence()); + + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .persistenceIntervalMs(-1000) + .build() + ); + } + + /** + * Test validation: minChangesBetweenPersists must be non-negative. 
+ */ + @Test + void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) + { + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(-1) + .build() + ); + + // Zero should be allowed (persist on every interval) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenPersists(0) + .build(); + assertEquals(0, config.minChangesBetweenPersists()); + } + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index ad8c6c8a..db8b2875 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -96,121 +96,6 @@ private static float[] randomVector(final Random random, final int dimension) return vector; } - /** - * Test on-disk configuration builder. - */ - @Test - void testOnDiskConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(indexDir, config.indexDirectory()); - assertFalse(config.enablePqCompression()); - assertEquals(0, config.pqSubspaces()); - } - - /** - * Test on-disk configuration with compression. - * FusedPQ requires maxDegree=32, so it should be auto-set. 
- */ - @Test - void testOnDiskConfigurationWithCompression(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) // Will be overridden to 32 for FusedPQ - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(32) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.enablePqCompression()); - assertEquals(32, config.pqSubspaces()); - assertEquals(32, config.maxDegree(), "FusedPQ requires maxDegree=32"); - } - - /** - * Test that maxDegree is auto-set to 32 when compression is enabled. - */ - @Test - void testFusedPQRequiresMaxDegree32(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - // Try to set maxDegree to 64 with compression enabled - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .maxDegree(64) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .build(); - - // Should be overridden to 32 - assertEquals(32, config.maxDegree(), "FusedPQ should enforce maxDegree=32"); - } - - /** - * Test validation: onDisk requires indexDirectory. - */ - @Test - void testOnDiskRequiresIndexDirectory() - { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - // indexDirectory not set - .build() - ); - } - - /** - * Test validation: compression requires onDisk. - */ - @Test - void testCompressionRequiresOnDisk() - { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .enablePqCompression(true) - // onDisk not set - .build() - ); - } - - /** - * Test validation: pqSubspaces must divide dimension evenly. 
- */ - @Test - void testPqSubspacesMustDivideDimension(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(33) // 100 is not divisible by 33 - .build() - ); - } /** * Test creating an on-disk index and persisting it. @@ -500,11 +385,10 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep { @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + VectorIndex index = gigaMap.index().get(VectorIndices.Category()).get("embeddings"); assertEquals(150, gigaMap.size()); - final VectorIndex index = vectorIndices.get("embeddings"); final VectorSearchResult result = index.search(randomVector(random, dimension), 30); assertEquals(30, result.size()); } @@ -1261,118 +1145,6 @@ void testInMemoryIndexStillWorks() // Background Persistence Tests // ======================================================================== - /** - * Test background persistence configuration builder. - */ - @Test - void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) - .persistOnShutdown(true) - .minChangesBetweenPersists(50) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.backgroundPersistence()); - assertEquals(60_000, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(50, config.minChangesBetweenPersists()); - } - - /** - * Test background persistence configuration defaults. 
- */ - @Test - void testBackgroundPersistenceConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background persistence should be disabled by default - assertFalse(config.backgroundPersistence()); - assertEquals(0, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(100, config.minChangesBetweenPersists()); - } - - /** - * Test validation: background persistence requires onDisk. - */ - @Test - void testBackgroundPersistenceRequiresOnDisk() - { - assertThrows(IllegalStateException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .persistenceIntervalMs(30_000) - // onDisk not set - .build() - ); - } - - /** - * Test validation: persistenceIntervalMs must be non-negative. - */ - @Test - void testPersistenceIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(0) - .build(); - assertEquals(0, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .persistenceIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenPersists must be non-negative. 
- */ - @Test - void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(-1) - .build() - ); - - // Zero should be allowed (persist on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenPersists(0) - .build(); - assertEquals(0, config.minChangesBetweenPersists()); - } - /** * Test that background persistence triggers after the configured interval. */ @@ -2518,248 +2290,6 @@ void testInMemoryIndexWithBackgroundOptimization(@TempDir final Path tempDir) th // Parallel vs Non-Parallel On-Disk Write Tests // ======================================================================== - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index without PQ compression. - * Both modes should produce identical graph files that yield the same search quality. 
- */ - @Test - void testParallelVsNonParallelOnDiskWrite(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - // --- Parallel mode --- - final List parallelIds; - final List parallelScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - parallelIds = new ArrayList<>(); - parallelScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); - } - - // --- Sequential mode --- - final List sequentialIds; - final List sequentialScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = 
gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - sequentialIds = new ArrayList<>(); - sequentialScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); - } - - // --- Compare results --- - assertEquals(k, parallelIds.size()); - assertEquals(k, sequentialIds.size()); - - // Both indices were built from the same data with the same HNSW parameters, - // so search results must be identical. - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential on-disk writes should produce identical search scores"); - } - - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index with PQ compression enabled. - * This exercises the FusedPQ write path which is the primary target of the parallel mode setting. 
- */ - @Test - void testParallelVsNonParallelOnDiskWriteWithCompression(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int pqSubspaces = 16; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - // --- Parallel mode with PQ --- - final List parallelIds; - final List parallelScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - parallelIds = new ArrayList<>(); - parallelScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))); - } - - // --- Sequential mode with PQ --- - 
final List sequentialIds; - final List sequentialScores; - { - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - final VectorSearchResult result = index.search(queryVector, k); - sequentialIds = new ArrayList<>(); - sequentialScores = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(sequentialIndexDir.resolve("embeddings.meta"))); - } - - // --- Compare results --- - assertEquals(k, parallelIds.size()); - assertEquals(k, sequentialIds.size()); - - // Both indices were built from the same data with identical HNSW parameters and PQ training, - // so search results must be identical. 
- assertEquals(parallelIds, sequentialIds, - "Parallel and sequential PQ-compressed on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential PQ-compressed on-disk writes should produce identical search scores"); - } /** * Test that parallel and non-parallel on-disk writes both support persist-and-reload @@ -3031,4 +2561,104 @@ void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path temp final VectorSearchResult result = index.search(queryVector, 10); assertEquals(10, result.size()); } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. + */ + @Test + void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + + // --- Parallel config + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + 
.indexDirectory(parallelIndexDir) + .parallelOnDiskWrite(true) + .build(); + + // --- Sequential config + final VectorIndex index = vectorIndices.add( + "embeddings", configParallel, new ComputedDocumentVectorizer() + ); + + final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .enablePqCompression(true) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex indexSequential = vectorIndices.add( + "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() + ); + + for (int i = 0; i < vectorCount; i++) { + gigaMap.add(new Document("doc_" + i, vectors.get(i))); + } + + index.persistToDisk(); + indexSequential.persistToDisk(); + + //parallel + final VectorSearchResult result = index.search(queryVector, k); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + //sequential + final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); + for (final VectorSearchResult.Entry entry : resultSequential) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertAll( + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) + ); + + // Both indices were built from the same data with the same HNSW parameters, + // so search results must be identical. 
+ assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); + } + + } From f274dbc9182b6b9e1a51e487f0ada5df2bbf9bb2 Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Fri, 20 Feb 2026 14:08:33 +0100 Subject: [PATCH 2/7] remove test, duplicate of testBackgroundOptimizationTriggersAfterIntervalAndThreshold --- .../gigamap/jvector/VectorIndexDiskTest.java | 52 ------------------- 1 file changed, 52 deletions(-) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index db8b2875..0f2b6435 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -2233,58 +2233,6 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi } } - /** - * Test that in-memory index can also use background optimization. 
- */ - @Test - void testInMemoryIndexWithBackgroundOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 150; - final Random random = new Random(42); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // In-memory index with background optimization only - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .optimizationIntervalMs(200) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) - .build(); - - assertFalse(config.onDisk(), "Should be in-memory index"); - assertTrue(config.backgroundOptimization(), "Background optimization should be enabled"); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Wait for optimization to run - Thread.sleep(600); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - // ======================================================================== // Parallel vs Non-Parallel On-Disk Write Tests From 4e1246d734123ae137c979f7e9134d27c42165cf Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Fri, 20 Feb 2026 19:13:59 +0100 Subject: [PATCH 3/7] test refactoring --- gigamap/jvector/pom.xml | 6 + .../jvector/VectorIndexConfigurationTest.java | 97 + .../gigamap/jvector/VectorIndexDiskTest.java | 4837 ++++++++--------- 3 files changed, 2373 insertions(+), 2567 deletions(-) diff --git a/gigamap/jvector/pom.xml b/gigamap/jvector/pom.xml index 00f7e8f3..ab5cf765 100644 --- a/gigamap/jvector/pom.xml +++ 
b/gigamap/jvector/pom.xml @@ -44,6 +44,12 @@ junit-jupiter-engine test + + org.awaitility + awaitility + 4.2.2 + test + diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index ebb4f451..fcaf437c 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -1175,4 +1175,101 @@ void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) assertEquals(0, config.minChangesBetweenPersists()); } + /** + * Test validation: optimizationIntervalMs must be non-negative. + */ + @Test + void testOptimizationIntervalMsMustBeNonNegative(@TempDir final Path tempDir) + { + // 0 is valid (means disabled) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .optimizationIntervalMs(0) + .build(); + assertEquals(0, config.optimizationIntervalMs()); + assertFalse(config.backgroundOptimization()); + + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .optimizationIntervalMs(-1000) + .build() + ); + } + + /** + * Test background optimization configuration defaults. 
+ */ + @Test + void testBackgroundOptimizationConfigurationDefaults(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + // Background optimization should be disabled by default + assertFalse(config.backgroundOptimization()); + assertEquals(0, config.optimizationIntervalMs()); + assertEquals(1000, config.minChangesBetweenOptimizations()); + assertFalse(config.optimizeOnShutdown()); + } + + /** + * Test validation: minChangesBetweenOptimizations must be non-negative. + */ + @Test + void testMinChangesBetweenOptimizationsMustBeNonNegative(@TempDir final Path tempDir) + { + assertThrows(IllegalArgumentException.class, () -> + VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenOptimizations(-1) + .build() + ); + + // Zero should be allowed (optimize on every interval) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .onDisk(true) + .indexDirectory(tempDir) + .minChangesBetweenOptimizations(0) + .build(); + assertEquals(0, config.minChangesBetweenOptimizations()); + } + + /** + * Test background optimization configuration builder. 
+ */ + @Test + void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) + { + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(128) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(120_000) + .minChangesBetweenOptimizations(500) + .optimizeOnShutdown(true) + .build(); + + assertTrue(config.onDisk()); + assertTrue(config.backgroundOptimization()); + assertEquals(120_000, config.optimizationIntervalMs()); + assertEquals(500, config.minChangesBetweenOptimizations()); + assertTrue(config.optimizeOnShutdown()); + } + } diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index 0f2b6435..ad9765e4 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -9,7 +9,7 @@ * This program and the accompanying materials are made * available under the terms of the Eclipse Public License 2.0 * which is available at https://www.eclipse.org/legal/epl-2.0/ - * + * * SPDX-License-Identifier: EPL-2.0 * #L% */ @@ -17,6 +17,8 @@ import org.eclipse.store.gigamap.types.GigaMap; import org.eclipse.store.storage.embedded.types.EmbeddedStorage; import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.RepeatedTest; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.io.TempDir; @@ -33,7 +35,11 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; +import static 
java.time.Duration.ofMillis; +import static java.time.Duration.ofSeconds; +import static org.awaitility.Awaitility.await; import static org.junit.jupiter.api.Assertions.*; /** @@ -41,2572 +47,2269 @@ */ class VectorIndexDiskTest { - /** - * Simple entity with an embedding vector. - */ - record Document(String content, float[] embedding) {} - - /** - * Computed vectorizer - simulates externally computed vectors. - */ - static class ComputedDocumentVectorizer extends Vectorizer - { - @Override - public float[] vectorize(final Document entity) - { - return entity.embedding(); - } - } - - /** - * Embedded vectorizer - vectors are part of the entity, not stored separately. - */ - static class EmbeddedDocumentVectorizer extends Vectorizer - { - @Override - public float[] vectorize(final Document entity) - { - return entity.embedding(); - } - - @Override - public boolean isEmbedded() - { - return true; - } - } - - /** - * Helper to generate a random normalized vector. - */ - private static float[] randomVector(final Random random, final int dimension) - { - final float[] vector = new float[dimension]; - float norm = 0; - for(int i = 0; i < dimension; i++) - { - vector[i] = random.nextFloat() * 2 - 1; - norm += vector[i] * vector[i]; - } - norm = (float)Math.sqrt(norm); - for(int i = 0; i < dimension; i++) - { - vector[i] /= norm; - } - return vector; - } - - - /** - * Test creating an on-disk index and persisting it. 
- */ - @Test - void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 500; - final int dimension = 64; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - // Generate vectors - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - - final float[] queryVector = randomVector(new Random(999), dimension); - final List expectedIds = new ArrayList<>(); - - // Phase 1: Create index and persist - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertTrue(index.isOnDisk()); - assertFalse(index.isPqCompressionEnabled()); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - // Search and record expected results - final VectorSearchResult result = index.search(queryVector, 10); - for(final VectorSearchResult.Entry entry : result) - { - expectedIds.add(entry.entityId()); - } - - // Persist index to disk - index.persistToDisk(); - - // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - storage.storeRoot(); - } - } - - // Phase 2: Reload and verify - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - 
@SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); - assertTrue(index.isOnDisk()); - - // Search and compare results - final VectorSearchResult result = index.search(queryVector, 10); - final List actualIds = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - actualIds.add(entry.entityId()); - } - - // Results should match (or at least be very similar due to HNSW nature) - assertEquals(expectedIds.size(), actualIds.size()); - } - } - } - - /** - * Test on-disk index with compression (PQ). - */ - @Test - void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; // 64 / 16 = 4 dimensions per subspace - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Train compression - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - // Search should work - final float[] queryVector = randomVector(random, 
dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - - // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } - - // Persist to disk - index.persistToDisk(); - - // Verify graph file was created (FusedPQ is embedded in graph, no separate .pq file) - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - assertFalse(Files.exists(indexDir.resolve("embeddings.pq")), - "FusedPQ should be embedded in graph file, not in separate .pq file"); - } - - /** - * Test search quality with on-disk index - verify exact match is found first. - */ - @Test - void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 1000; - final int dimension = 64; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } - - // Add a one-hot "needle" vector that randomVector() cannot produce, - // since randomVector() populates all dimensions with non-zero values. 
- final float[] needleVector = new float[dimension]; - needleVector[0] = 1.0f; - - gigaMap.add(new Document("needle", needleVector)); - - // Persist index - index.persistToDisk(); - - // Search for the needle vector - it should be the first result - final VectorSearchResult result = index.search(needleVector, 5); - - assertEquals(5, result.size()); - final VectorSearchResult.Entry firstResult = result.iterator().next(); - assertEquals("needle", firstResult.entity().content(), "Exact match should be first result"); - assertTrue(firstResult.score() > 0.99f, "Exact match should have score close to 1.0"); - } - - /** - * Test multiple restarts with on-disk index. - */ - @Test - void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOException - { - final int dimension = 32; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - // Phase 1: Create with 100 vectors - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); - - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("phase1_doc_" + i, randomVector(random, dimension))); - } - - assertEquals(100, gigaMap.size()); - storage.storeRoot(); - } - } - - // Phase 2: Restart and add 50 more vectors - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = 
gigaMap.index().get(VectorIndices.Category()); - - assertEquals(100, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - - // Add more vectors - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("phase2_doc_" + i, randomVector(random, dimension))); - } - - assertEquals(150, gigaMap.size()); - storage.storeRoot(); - } - } - - // Phase 3: Final verification - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - VectorIndex index = gigaMap.index().get(VectorIndices.Category()).get("embeddings"); - - assertEquals(150, gigaMap.size()); - - final VectorSearchResult result = index.search(randomVector(random, dimension), 30); - assertEquals(30, result.size()); - } - } - } - - // ======================================================================== - // PQ Compression Search Tests - // ======================================================================== - - /** - * Test search quality with PQ compression enabled. - * Verifies that an exact match (needle) is found in the top results - * despite quantization loss from Product Quantization. 
- */ - @Test - void testPqCompressionSearchQuality(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add random vectors - for(int i = 0; i < vectorCount - 1; i++) - { - gigaMap.add(new Document("random_" + i, randomVector(random, dimension))); - } - - // Add a one-hot "needle" vector that randomVector() cannot produce, - // since randomVector() populates all dimensions with non-zero values. 
- final float[] needleVector = new float[dimension]; - needleVector[0] = 1.0f; - - gigaMap.add(new Document("needle", needleVector)); - - // Train PQ compression - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - // Search for the needle vector - it should be in the top results - final VectorSearchResult result = index.search(needleVector, 5); - - assertEquals(5, result.size()); - final VectorSearchResult.Entry firstResult = result.iterator().next(); - assertEquals("needle", firstResult.entity().content(), - "Exact match should be first result even with PQ compression"); - assertTrue(firstResult.score() > 0.99f, - "Exact match should have score close to 1.0"); - - // Verify results are ordered by score - float prevScore = Float.MAX_VALUE; - for(final VectorSearchResult.Entry entry : result) - { - assertTrue(entry.score() <= prevScore, "Results should be ordered by score"); - prevScore = entry.score(); - } - } - - /** - * Test PQ-compressed disk index persistence and reload with search verification. - * Verifies that search still works correctly after saving and reloading - * a PQ-compressed index. 
- */ - @Test - void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - - final float[] queryVector = randomVector(new Random(999), dimension); - final List expectedIds = new ArrayList<>(); - - // Phase 1: Create index with PQ, populate, search, persist - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - // Train and search - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - final VectorSearchResult result = index.search(queryVector, 10); - for(final VectorSearchResult.Entry entry : result) - { - expectedIds.add(entry.entityId()); - } - - // Persist - index.persistToDisk(); - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - storage.storeRoot(); - } - } - - // Phase 2: Reload and verify search results - { - 
try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - // Search after reload - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - final List actualIds = new ArrayList<>(); - for(final VectorSearchResult.Entry entry : result) - { - actualIds.add(entry.entityId()); - } - - // Results should match (or at least overlap significantly) - assertEquals(expectedIds.size(), actualIds.size()); - - // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } - } - } - } - - /** - * Test PQ-compressed disk index with DOT_PRODUCT similarity function. 
- */ - @Test - void testPqCompressionWithDotProduct(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.DOT_PRODUCT) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } - } - - /** - * Test PQ-compressed disk index with EUCLIDEAN similarity function. 
- */ - @Test - void testPqCompressionWithEuclidean(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.EUCLIDEAN) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } - } - - /** - * Test PQ compression with default subspaces (auto-calculated as dimension/4). 
- */ - @Test - void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 128; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - // pqSubspaces not set - should default to dimension/4 = 32 - .build(); - - assertEquals(0, config.pqSubspaces(), - "pqSubspaces should be 0 (auto-calculated at runtime)"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } - } - - /** - * Test removing entities from a PQ-compressed disk index. - * Verifies that removed entities do not appear in search results. 
- */ - @Test - void testPqCompressionWithRemoval(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - // Remove every other entity (even IDs) - for(int i = 0; i < vectorCount; i += 2) - { - gigaMap.removeById(i); - } - - assertEquals(vectorCount / 2, gigaMap.size()); - - // Search should only return remaining entities - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - final String content = entry.entity().content(); - final int docNum = Integer.parseInt(content.replace("doc_", "")); - assertTrue(docNum % 2 != 0, - "Only odd-numbered documents should remain, found: " + content); - } - } - - /** - * Test concurrent search with PQ compression enabled. - * Verifies thread safety of PQ-compressed search. 
- */ - @Test - void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Exception - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - // Run concurrent searches - final int numSearches = 50; - final AtomicInteger successfulSearches = new AtomicInteger(0); - final AtomicBoolean hasError = new AtomicBoolean(false); - final CountDownLatch latch = new CountDownLatch(numSearches); - final ExecutorService executor = Executors.newFixedThreadPool(4); - - for(int i = 0; i < numSearches; i++) - { - final float[] queryVector = randomVector(new Random(i), dimension); - executor.submit(() -> - { - try - { - final VectorSearchResult result = index.search(queryVector, 10); - if(result.size() == 10) - { - successfulSearches.incrementAndGet(); - } - } - catch(final Exception e) - { - hasError.set(true); - e.printStackTrace(); - } - finally - { - latch.countDown(); - } - }); - } - - assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); - executor.shutdown(); - - assertFalse(hasError.get(), "No errors should occur during concurrent PQ search"); - assertEquals(numSearches, successfulSearches.get(), - "All 
concurrent PQ searches should return expected results"); - } - - /** - * Test adding vectors after PQ training. - * Verifies that search still works after adding more vectors post-training. - */ - @Test - void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) - { - final int initialCount = 500; - final int additionalCount = 200; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add initial vectors - for(int i = 0; i < initialCount; i++) - { - gigaMap.add(new Document("initial_" + i, randomVector(random, dimension))); - } - - // Train PQ - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - // Search before adding more - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult resultBefore = index.search(queryVector, 10); - assertEquals(10, resultBefore.size()); - - // Add more vectors after training - for(int i = 0; i < additionalCount; i++) - { - gigaMap.add(new Document("additional_" + i, randomVector(random, dimension))); - } - - assertEquals(initialCount + additionalCount, gigaMap.size()); - - // Search should still work and may include newly added vectors - final VectorSearchResult resultAfter = index.search(queryVector, 10); - assertEquals(10, resultAfter.size()); - - for(final VectorSearchResult.Entry entry : resultAfter) - { - assertNotNull(entry.entity()); - } - } - - /** - * Test 
PQ-compressed disk index with multiple restarts. - * Verifies that search works correctly after persisting a PQ-compressed - * index to disk and reloading it across multiple restart cycles. - */ - @Test - void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOException - { - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - final float[] queryVector = randomVector(new Random(999), dimension); - - // Phase 1: Create with 500 vectors and PQ, persist to disk - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - // Verify search works before restart - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - storage.storeRoot(); - } - } - - // Phase 2: Restart and verify search works from loaded disk index - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(500, 
gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - // Search should work after reload - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - // Verify all entities are accessible - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - assertTrue(entry.entity().content().startsWith("doc_")); - } - } - } - - // Phase 3: Second restart - verify search still works - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(500, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - final VectorSearchResult result = index.search(queryVector, 20); - assertEquals(20, result.size()); - } - } - } - - /** - * Test PQ-compressed disk index with removeAll and repopulation. - * Verifies the index can be cleared and rebuilt with PQ compression. 
- */ - @Test - void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) - { - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Initial population - for(int i = 0; i < 500; i++) - { - gigaMap.add(new Document("old_" + i, randomVector(random, dimension))); - } - - assertEquals(500, gigaMap.size()); - - // Clear all - gigaMap.removeAll(); - assertEquals(0, gigaMap.size()); - - // Repopulate - for(int i = 0; i < 600; i++) - { - gigaMap.add(new Document("new_" + i, randomVector(random, dimension))); - } - - assertEquals(600, gigaMap.size()); - - final VectorIndices vectorIndicesAfter = gigaMap.index().get(VectorIndices.Category()); - final VectorIndex indexAfter = vectorIndicesAfter.get("embeddings"); - - // Train PQ on new data - ((VectorIndex.Internal)indexAfter).trainCompressionIfNeeded(); - - // Search should find only new documents - final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); - assertEquals(20, result.size()); - - for(final VectorSearchResult.Entry entry : result) - { - assertTrue(entry.entity().content().startsWith("new_"), - "All results should be from new population"); - } - } - - /** - * Test that in-memory index (default) still works as expected. 
- */ - @Test - void testInMemoryIndexStillWorks() - { - final int dimension = 32; - final Random random = new Random(42); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Default configuration (in-memory) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .build(); - - assertFalse(config.onDisk()); - assertNull(config.indexDirectory()); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertFalse(index.isOnDisk()); - - // Add vectors - for(int i = 0; i < 100; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Search should work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - - - // ======================================================================== - // Background Persistence Tests - // ======================================================================== - - /** - * Test that background persistence triggers after the configured interval. 
- */ - @Test - void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval for testing - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(500) // 500ms for fast test - .minChangesBetweenPersists(1) // Persist on any change - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors to trigger dirty state - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Initially, files should not exist (not yet persisted) - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist immediately after adding"); - - // Wait for background persistence to trigger (interval + some buffer) - Thread.sleep(1500); - - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after background persistence"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after background persistence"); - } - finally - { - index.close(); - } - } - - /** - * Test that search works concurrently during background persistence. 
- */ - @Test - void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 200; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(200) // Short interval to trigger during test - .minChangesBetweenPersists(1) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Run concurrent searches while background persistence may be running - final int numSearches = 50; - final AtomicInteger successfulSearches = new AtomicInteger(0); - final AtomicBoolean hasError = new AtomicBoolean(false); - final CountDownLatch latch = new CountDownLatch(numSearches); - final ExecutorService executor = Executors.newFixedThreadPool(4); - - for(int i = 0; i < numSearches; i++) - { - final float[] queryVector = randomVector(new Random(i), dimension); - executor.submit(() -> - { - try - { - final VectorSearchResult result = index.search(queryVector, 10); - if(result.size() == 10) - { - successfulSearches.incrementAndGet(); - } - } - catch(final Exception e) - { - hasError.set(true); - e.printStackTrace(); - } - finally - { - latch.countDown(); - } - }); - - // Small delay to spread searches over time - Thread.sleep(20); - } - - // Wait for all searches to complete - assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); - 
executor.shutdown(); - - // Verify all searches succeeded - assertFalse(hasError.get(), "No errors should occur during concurrent search"); - assertEquals(numSearches, successfulSearches.get(), - "All searches should return expected number of results"); - } - finally - { - index.close(); - } - } - - /** - * Test that shutdown persists pending changes when persistOnShutdown is true. - */ - @Test - void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenPersists(1) - .persistOnShutdown(true) // Should persist on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Files should not exist yet (interval hasn't triggered) - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist before close"); - - // Close the index (should trigger persist due to persistOnShutdown=true) - index.close(); - - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after close with persistOnShutdown=true"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after close with persistOnShutdown=true"); - } - - /** - * Test that 
shutdown does NOT persist when persistOnShutdown is false. - */ - @Test - void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenPersists(1) - .persistOnShutdown(false) // Should NOT persist on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Close the index (should NOT trigger persist) - index.close(); - - // Files should NOT exist - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist after close with persistOnShutdown=false"); - } - - /** - * Test debouncing: persistence is skipped when change count is below threshold. 
- */ - @Test - void testDebouncing(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with high threshold that won't be met - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(200) // Short interval - .minChangesBetweenPersists(500) // High threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Wait for multiple persistence intervals - Thread.sleep(800); - - // Files should NOT exist because change count is below threshold - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist when changes below threshold"); - - // Now add more vectors to exceed the threshold - for(int i = 50; i < 600; i++) // Total now 600 > 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Wait for persistence to trigger - Thread.sleep(500); - - // Now files should exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist when changes exceed threshold"); - } - finally - { - index.close(); - } - } - - /** - * Test that adding vectors in bulk correctly tracks change count. 
- */ - @Test - void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(300) - .minChangesBetweenPersists(100) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Bulk add documents - final List documents = new ArrayList<>(); - for(int i = 0; i < 150; i++) - { - documents.add(new Document("doc_" + i, randomVector(random, dimension))); - } - gigaMap.addAll(documents); - - // Wait for persistence - Thread.sleep(800); - - // Files should exist because bulk add counted as 150 changes (> 100 threshold) - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after bulk add exceeds threshold"); - } - finally - { - index.close(); - } - } - - /** - * Test that background persistence can be reloaded after restart. 
- */ - @Test - void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 200; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - final float[] queryVector = randomVector(new Random(999), dimension); - final int expectedK = 10; - - // Phase 1: Create index with background persistence and add vectors - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(100) - .minChangesBetweenPersists(1) - .persistOnShutdown(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Verify search works - final VectorSearchResult result = index.search(queryVector, expectedK); - assertEquals(expectedK, result.size()); - - storage.storeRoot(); - - // Explicitly close the index to trigger persistOnShutdown - // (EmbeddedStorageManager doesn't auto-close VectorIndex) - index.close(); - } - } - - // Verify files were persisted - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after close"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after close"); - - // Phase 2: Reload and verify - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - 
@SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); - assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); - - // Search should still work after reload - final VectorSearchResult result = index.search(queryVector, expectedK); - assertEquals(expectedK, result.size()); - - // Clean up - index.close(); - } - } - } - - /** - * Test that manual persistToDisk still works with background persistence enabled. - */ - @Test - void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger - .minChangesBetweenPersists(1000) // High threshold - won't trigger - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Files should not exist yet - assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); - - // Manually trigger persistence - index.persistToDisk(); - - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after manual 
persistToDisk"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after manual persistToDisk"); - } - finally - { - index.close(); - } - } - - - // ======================================================================== - // Background Optimization Tests - // ======================================================================== - - /** - * Test background optimization configuration builder. - */ - @Test - void testBackgroundOptimizationConfigurationBuilder(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(120_000) - .minChangesBetweenOptimizations(500) - .optimizeOnShutdown(true) - .build(); - - assertTrue(config.onDisk()); - assertTrue(config.backgroundOptimization()); - assertEquals(120_000, config.optimizationIntervalMs()); - assertEquals(500, config.minChangesBetweenOptimizations()); - assertTrue(config.optimizeOnShutdown()); - } - - /** - * Test background optimization configuration defaults. - */ - @Test - void testBackgroundOptimizationConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background optimization should be disabled by default - assertFalse(config.backgroundOptimization()); - assertEquals(0, config.optimizationIntervalMs()); - assertEquals(1000, config.minChangesBetweenOptimizations()); - assertFalse(config.optimizeOnShutdown()); - } - - /** - * Test validation: optimizationIntervalMs must be non-negative. 
- */ - @Test - void testOptimizationIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(0) - .build(); - assertEquals(0, config.optimizationIntervalMs()); - assertFalse(config.backgroundOptimization()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(-1000) - .build() - ); - } - - /** - * Test validation: minChangesBetweenOptimizations must be non-negative. - */ - @Test - void testMinChangesBetweenOptimizationsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(-1) - .build() - ); - - // Zero should be allowed (optimize on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(0) - .build(); - assertEquals(0, config.minChangesBetweenOptimizations()); - } - - /** - * Test that background optimization runs after the configured interval and threshold. 
- */ - @Test - void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval and low threshold for testing - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) // 300ms for fast test - .minChangesBetweenOptimizations(10) // Low threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - - // Initially, optimization count should be 0 - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 initially"); - - // Add vectors to trigger dirty state above threshold - for(int i = 0; i < 50; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Verify pending changes are tracked - assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, - "Pending changes should be tracked"); - - // Wait for background optimization to run - Thread.sleep(800); - - // Verify optimization was actually performed - assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, - "Optimization should have been performed at least once"); - - // Verify pending changes were reset - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should be reset after optimization"); - - // Verify search still works - final VectorSearchResult result = 
index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - - /** - * Test that optimization is skipped when change count is below threshold. - */ - @Test - void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with high threshold that won't be met - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(200) // Short interval - .minChangesBetweenOptimizations(500) // High threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - - // Add fewer vectors than the threshold - for(int i = 0; i < 50; i++) // 50 < 500 threshold - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Verify pending changes are tracked - assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should be 50"); - - // Wait for multiple optimization intervals - Thread.sleep(600); - - // Verify optimization was NOT performed (below threshold) - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization should NOT have been performed (below threshold)"); - - // Verify pending changes are still tracked (not reset) - assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should still be 50 (not reset)"); - - // Search should still 
work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - - /** - * Test that shutdown optimizes pending changes when optimizeOnShutdown is true. - */ - @Test - void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenOptimizations(1) - .optimizeOnShutdown(true) // Should optimize on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should equal vector count"); - - // Verify no optimization has run yet - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 before close"); - - // Verify search works before close - final VectorSearchResult resultBefore = index.search(randomVector(random, dimension), 10); - assertEquals(10, resultBefore.size()); - - // Close the index (should trigger optimize due to optimizeOnShutdown=true) - index.close(); - - // 
Note: After close(), we can't verify the count changed because the manager is shutdown. - // But we verified above that pending changes existed and the interval hadn't triggered. - // The fact that close() completed without error indicates optimization was attempted. - } - - /** - * Test that shutdown does NOT optimize when optimizeOnShutdown is false. - */ - @Test - void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenOptimizations(1) - .optimizeOnShutdown(false) // Should NOT optimize on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should equal vector count"); - - // Verify no optimization has run yet - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 before close"); - - // Close the index (should NOT trigger optimize) - index.close(); - - // Note: After close(), we can't access the manager. But we verified: - // 1. 
Pending changes existed - // 2. No background optimization had run - // 3. optimizeOnShutdown=false was set - // So the pending changes should remain unoptimized. - } - - /** - * Test that search works concurrently during background optimization. - */ - @Test - void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 200; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(150) // Short interval to trigger during test - .minChangesBetweenOptimizations(1) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add initial vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Run concurrent searches while background optimization may be running - final int numSearches = 50; - final AtomicInteger successfulSearches = new AtomicInteger(0); - final AtomicBoolean hasError = new AtomicBoolean(false); - final CountDownLatch latch = new CountDownLatch(numSearches); - final ExecutorService executor = Executors.newFixedThreadPool(4); - - for(int i = 0; i < numSearches; i++) - { - final float[] queryVector = randomVector(new Random(i), dimension); - executor.submit(() -> - { - try - { - final VectorSearchResult result = index.search(queryVector, 10); - if(result.size() == 10) - { - successfulSearches.incrementAndGet(); - } - } - catch(final Exception e) - { - hasError.set(true); - e.printStackTrace(); - } - finally - { - 
latch.countDown(); - } - }); - - // Small delay to spread searches over time - Thread.sleep(15); - } - - // Wait for all searches to complete - assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); - executor.shutdown(); - - // Verify all searches succeeded - assertFalse(hasError.get(), "No errors should occur during concurrent search with optimization"); - assertEquals(numSearches, successfulSearches.get(), - "All searches should return expected number of results"); - } - finally - { - index.close(); - } - } - - /** - * Test that bulk add correctly tracks change count for optimization. - */ - @Test - void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) - .minChangesBetweenOptimizations(100) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Bulk add documents that exceeds the threshold - final List documents = new ArrayList<>(); - for(int i = 0; i < 150; i++) - { - documents.add(new Document("doc_" + i, randomVector(random, dimension))); - } - gigaMap.addAll(documents); - - // Wait for optimization - Thread.sleep(800); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - - /** - * Test that manual optimize() method still works with background optimization enabled. 
- */ - @Test - void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger - .minChangesBetweenOptimizations(1000) // High threshold - won't trigger - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Manually trigger optimization - index.optimize(); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - finally - { - index.close(); - } - } - - /** - * Test that both background persistence and optimization can be enabled together. 
- */ - @Test - void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 150; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Enable both background persistence and optimization - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(300) - .minChangesBetweenPersists(10) - .persistOnShutdown(true) - .optimizationIntervalMs(400) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try - { - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Wait for both background tasks to run - Thread.sleep(1000); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - - // Files should exist from background persistence - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist from background persistence"); - } - finally - { - index.close(); - } - } - - - // ======================================================================== - // Parallel vs Non-Parallel On-Disk Write Tests - // ======================================================================== - - - /** - * Test that parallel and non-parallel on-disk writes both support persist-and-reload - * for a large PQ-compressed index. 
- * Verifies that the graph files produced by both modes can be loaded correctly - * and yield equivalent search results after restart. - */ - @Test - void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int pqSubspaces = 16; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for(int i = 0; i < vectorCount; i++) - { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel-index"); - final Path parallelStorageDir = tempDir.resolve("parallel-storage"); - final Path sequentialIndexDir = tempDir.resolve("sequential-index"); - final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); - - // --- Build and persist both modes --- - buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, parallelIndexDir, parallelStorageDir, true); - buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, sequentialIndexDir, sequentialStorageDir, false); - - // --- Reload both and compare search results --- - final List parallelIds = new ArrayList<>(); - final List parallelScores = new ArrayList<>(); - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); - assertTrue(index.isOnDisk()); - - final VectorSearchResult result = index.search(queryVector, k); - assertEquals(k, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - parallelIds.add(entry.entityId()); - 
parallelScores.add(entry.score()); - assertNotNull(entry.entity()); - } - } - } - - final List sequentialIds = new ArrayList<>(); - final List sequentialScores = new ArrayList<>(); - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) - { - @SuppressWarnings("unchecked") - final GigaMap gigaMap = (GigaMap)storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertNotNull(index); - assertTrue(index.isOnDisk()); - - final VectorSearchResult result = index.search(queryVector, k); - assertEquals(k, result.size()); - for(final VectorSearchResult.Entry entry : result) - { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - assertNotNull(entry.entity()); - } - } - } - - // Both modes should produce equivalent results after reload - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential modes should produce identical search results after reload"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential modes should produce identical search scores after reload"); - } - - /** - * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
- */ - private void buildAndPersistIndex( - final List vectors , - final float[] queryVector , - final int dimension , - final int pqSubspaces , - final Path indexDir , - final Path storageDir , - final boolean parallel - ) throws IOException - { - try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) - { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(parallel) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - for(int i = 0; i < vectors.size(); i++) - { - gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - index.persistToDisk(); - - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - storage.storeRoot(); - } - } - - - // ======================================================================== - // Embedded Vectorizer + On-Disk Tests - // ======================================================================== - - /** - * Test that an embedded vectorizer with parallel on-disk write completes without deadlock. - *

- * This is a regression test for a deadlock where {@code persistToDisk()} held - * {@code synchronized(parentMap)} for the entire disk write. The disk writer uses - * internal worker threads (ForkJoinPool for PQ encoding, parallel graph writer) - * that call {@code parentMap.get()} — which also synchronizes on the same monitor. - *

- * The fix restructures locking: Phase 1 (prep) runs inside {@code synchronized(parentMap)}, - * Phase 2 (disk write) runs outside it but still holds {@code persistenceLock.writeLock()}. - *

- * Uses {@code @Timeout} to fail fast if a deadlock occurs instead of hanging indefinitely. - */ - @Test - @Timeout(value = 60, unit = TimeUnit.SECONDS) - void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new EmbeddedDocumentVectorizer() - ); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // This would deadlock before the fix - index.persistToDisk(); - - // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - // Verify search still works after persist - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - for(final VectorSearchResult.Entry entry : result) - { - assertNotNull(entry.entity()); - } - } - - /** - * Test that an embedded vectorizer with PQ compression and parallel on-disk write - * completes without deadlock. - *

- * This is the most deadlock-prone scenario: FusedPQ encoding uses a ForkJoinPool - * that calls {@code getVector()} on worker threads, plus the parallel graph writer - * also calls {@code getVector()} from its own thread pool. - */ - @Test - @Timeout(value = 60, unit = TimeUnit.SECONDS) - void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new EmbeddedDocumentVectorizer() - ); - - // Add vectors - for(int i = 0; i < vectorCount; i++) - { - gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); - } - - // Train PQ compression - ((VectorIndex.Internal)index).trainCompressionIfNeeded(); - - // This would deadlock before the fix - index.persistToDisk(); - - // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - // Verify search still works - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - } - - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index without PQ compression. 
- * Both modes should produce identical graph files that yield the same search quality. - */ - @Test - void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - final List parallelIds = new ArrayList<>(); - final List parallelScores = new ArrayList<>(); - final List sequentialIds = new ArrayList<>(); - final List sequentialScores = new ArrayList<>(); - - // --- Parallel config - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .parallelOnDiskWrite(true) - .build(); - - // --- Sequential config - final VectorIndex index = vectorIndices.add( - "embeddings", configParallel, new ComputedDocumentVectorizer() - ); - - final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .enablePqCompression(true) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex indexSequential = vectorIndices.add( - "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() - ); - - for (int i = 0; i < vectorCount; i++) { 
- gigaMap.add(new Document("doc_" + i, vectors.get(i))); - } - - index.persistToDisk(); - indexSequential.persistToDisk(); - - //parallel - final VectorSearchResult result = index.search(queryVector, k); - for (final VectorSearchResult.Entry entry : result) { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - //sequential - final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); - for (final VectorSearchResult.Entry entry : resultSequential) { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertAll( - () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), - () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), - () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), - () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) - ); - - // Both indices were built from the same data with the same HNSW parameters, - // so search results must be identical. - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential on-disk writes should produce identical search scores"); - } + /** + * Simple entity with an embedding vector. + */ + record Document(String content, float[] embedding) + { + } + + /** + * Computed vectorizer - simulates externally computed vectors. + */ + static class ComputedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + /** + * Embedded vectorizer - vectors are part of the entity, not stored separately. 
+ */ + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + + /** + * Helper to generate a random normalized vector. + */ + private static float[] randomVector(final Random random, final int dimension) + { + final float[] vector = new float[dimension]; + float norm = 0; + for (int i = 0; i < dimension; i++) { + vector[i] = random.nextFloat() * 2 - 1; + norm += vector[i] * vector[i]; + } + norm = (float) Math.sqrt(norm); + for (int i = 0; i < dimension; i++) { + vector[i] /= norm; + } + return vector; + } + + /** + * Helper to add multiple documents with random vectors to a GigaMap. + */ + private static void addRandomDocuments( + final GigaMap gigaMap, + final Random random, + final int dimension, + final int count, + final String prefix + ) + { + IntStream.range(0, count) + .forEach(i -> gigaMap.add(new Document(prefix + i, randomVector(random, dimension)))); + } + + /** + * Helper to add multiple documents from a list of pre-generated vectors. + */ + private static void addDocumentsFromVectors( + final GigaMap gigaMap, + final List vectors, + final String prefix + ) + { + IntStream.range(0, vectors.size()) + .forEach(i -> gigaMap.add(new Document(prefix + i, vectors.get(i)))); + } + + + /** + * Test creating an on-disk index and persisting it. 
+ */ + @Test + void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 500; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + // Generate vectors + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + + final float[] queryVector = randomVector(new Random(999), dimension); + final List expectedIds = new ArrayList<>(); + + // Phase 1: Create index and persist + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertTrue(index.isOnDisk()); + assertFalse(index.isPqCompressionEnabled()); + + // Add vectors + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + // Search and record expected results + final VectorSearchResult result = index.search(queryVector, 10); + for (final VectorSearchResult.Entry entry : result) { + expectedIds.add(entry.entityId()); + } + + // Persist index to disk + index.persistToDisk(); + + // Verify files were created + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + // Phase 2: Reload and verify + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = 
(GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + // Search and compare results + final VectorSearchResult result = index.search(queryVector, 10); + final List actualIds = new ArrayList<>(); + for (final VectorSearchResult.Entry entry : result) { + actualIds.add(entry.entityId()); + } + + // Results should match (or at least be very similar due to HNSW nature) + assertEquals(expectedIds.size(), actualIds.size()); + } + } + } + + /** + * Test on-disk index with compression (PQ). + */ + @Test + void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; // 64 / 16 = 4 dimensions per subspace + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Train compression + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Search should work + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + + // Verify all 
entities are accessible + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + + // Persist to disk + index.persistToDisk(); + + // Verify graph file was created (FusedPQ is embedded in graph, no separate .pq file) + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + assertFalse(Files.exists(indexDir.resolve("embeddings.pq")), + "FusedPQ should be embedded in graph file, not in separate .pq file"); + } + + /** + * Test search quality with on-disk index - verify exact match is found first. + */ + @Test + void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 1000; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add random vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); + + // Add a one-hot "needle" vector that randomVector() cannot produce, + // since randomVector() populates all dimensions with non-zero values. 
+ final float[] needleVector = new float[dimension]; + needleVector[0] = 1.0f; + + gigaMap.add(new Document("needle", needleVector)); + + // Persist index + index.persistToDisk(); + + // Search for the needle vector - it should be the first result + final VectorSearchResult result = index.search(needleVector, 5); + + assertEquals(5, result.size()); + final VectorSearchResult.Entry firstResult = result.iterator().next(); + assertEquals("needle", firstResult.entity().content(), "Exact match should be first result"); + assertTrue(firstResult.score() > 0.99f, "Exact match should have score close to 1.0"); + } + + /** + * Test multiple restarts with on-disk index. + */ + @Test + void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOException + { + final int dimension = 32; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + // Phase 1: Create with 100 vectors + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); + + addRandomDocuments(gigaMap, random, dimension, 100, "phase1_doc_"); + + assertEquals(100, gigaMap.size()); + storage.storeRoot(); + } + } + + // Phase 2: Restart and add 50 more vectors + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(100, 
gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + + // Add more vectors + addRandomDocuments(gigaMap, random, dimension, 50, "phase2_doc_"); + + assertEquals(150, gigaMap.size()); + storage.storeRoot(); + } + } + + // Phase 3: Final verification + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + VectorIndex index = gigaMap.index().get(VectorIndices.Category()).get("embeddings"); + + assertEquals(150, gigaMap.size()); + + final VectorSearchResult result = index.search(randomVector(random, dimension), 30); + assertEquals(30, result.size()); + } + } + } + + // ======================================================================== + // PQ Compression Search Tests + // ======================================================================== + + /** + * Test search quality with PQ compression enabled. + * Verifies that an exact match (needle) is found in the top results + * despite quantization loss from Product Quantization. 
+ */ + @Test + void testPqCompressionSearchQuality(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add random vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); + + // Add a one-hot "needle" vector that randomVector() cannot produce, + // since randomVector() populates all dimensions with non-zero values. 
+ final float[] needleVector = new float[dimension]; + needleVector[0] = 1.0f; + + gigaMap.add(new Document("needle", needleVector)); + + // Train PQ compression + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Search for the needle vector - it should be in the top results + final VectorSearchResult result = index.search(needleVector, 5); + + assertEquals(5, result.size()); + final VectorSearchResult.Entry firstResult = result.iterator().next(); + assertEquals("needle", firstResult.entity().content(), + "Exact match should be first result even with PQ compression"); + assertTrue(firstResult.score() > 0.99f, + "Exact match should have score close to 1.0"); + + // Verify results are ordered by score + float prevScore = Float.MAX_VALUE; + for (final VectorSearchResult.Entry entry : result) { + assertTrue(entry.score() <= prevScore, "Results should be ordered by score"); + prevScore = entry.score(); + } + } + + /** + * Test PQ-compressed disk index persistence and reload with search verification. + * Verifies that search still works correctly after saving and reloading + * a PQ-compressed index. 
+ */ + @Test + void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + + final float[] queryVector = randomVector(new Random(999), dimension); + final List expectedIds = new ArrayList<>(); + + // Phase 1: Create index with PQ, populate, search, persist + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + // Train and search + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final VectorSearchResult result = index.search(queryVector, 10); + for (final VectorSearchResult.Entry entry : result) { + expectedIds.add(entry.entityId()); + } + + // Persist + index.persistToDisk(); + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + // Phase 2: Reload and verify search results + { + try (final EmbeddedStorageManager storage = 
EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + // Search after reload + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + final List actualIds = new ArrayList<>(); + for (final VectorSearchResult.Entry entry : result) { + actualIds.add(entry.entityId()); + } + + // Results should match (or at least overlap significantly) + assertEquals(expectedIds.size(), actualIds.size()); + + // Verify all entities are accessible + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + } + } + } + + /** + * Test PQ-compressed disk index with DOT_PRODUCT similarity function. + */ + @Test + void testPqCompressionWithDotProduct(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.DOT_PRODUCT) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final float[] queryVector = randomVector(random, dimension); + final 
VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test PQ-compressed disk index with EUCLIDEAN similarity function. + */ + @Test + void testPqCompressionWithEuclidean(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.EUCLIDEAN) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test PQ compression with default subspaces (auto-calculated as dimension/4). 
+ */ + @Test + void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 128; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + // pqSubspaces not set - should default to dimension/4 = 32 + .build(); + + assertEquals(0, config.pqSubspaces(), + "pqSubspaces should be 0 (auto-calculated at runtime)"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + } + + /** + * Test removing entities from a PQ-compressed disk index. + * Verifies that removed entities do not appear in search results. 
+ */ + @Test + void testPqCompressionWithRemoval(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Remove every other entity (even IDs) + for (int i = 0; i < vectorCount; i += 2) { + gigaMap.removeById(i); + } + + assertEquals(vectorCount / 2, gigaMap.size()); + + // Search should only return remaining entities + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + + for (final VectorSearchResult.Entry entry : result) { + final String content = entry.entity().content(); + final int docNum = Integer.parseInt(content.replace("doc_", "")); + assertTrue(docNum % 2 != 0, + "Only odd-numbered documents should remain, found: " + content); + } + } + + /** + * Test concurrent search with PQ compression enabled. + * Verifies thread safety of PQ-compressed search. 
+ */ + @Test + void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Exception + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Run concurrent searches + final int numSearches = 50; + final AtomicInteger successfulSearches = new AtomicInteger(0); + final AtomicBoolean hasError = new AtomicBoolean(false); + final CountDownLatch latch = new CountDownLatch(numSearches); + final ExecutorService executor = Executors.newFixedThreadPool(4); + + for (int i = 0; i < numSearches; i++) { + final float[] queryVector = randomVector(new Random(i), dimension); + executor.submit(() -> + { + try { + final VectorSearchResult result = index.search(queryVector, 10); + if (result.size() == 10) { + successfulSearches.incrementAndGet(); + } + } catch (final Exception e) { + hasError.set(true); + e.printStackTrace(); + } finally { + latch.countDown(); + } + }); + } + + assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); + executor.shutdown(); + + assertFalse(hasError.get(), "No errors should occur during concurrent PQ search"); + assertEquals(numSearches, successfulSearches.get(), + "All concurrent PQ searches should return expected results"); + } 
+ + /** + * Test adding vectors after PQ training. + * Verifies that search still works after adding more vectors post-training. + */ + @Test + void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) + { + final int initialCount = 500; + final int additionalCount = 200; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add initial vectors + addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); + + // Train PQ + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Search before adding more + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult resultBefore = index.search(queryVector, 10); + assertEquals(10, resultBefore.size()); + + // Add more vectors after training + addRandomDocuments(gigaMap, random, dimension, additionalCount, "additional_"); + + assertEquals(initialCount + additionalCount, gigaMap.size()); + + // Search should still work and may include newly added vectors + final VectorSearchResult resultAfter = index.search(queryVector, 10); + assertEquals(10, resultAfter.size()); + + resultAfter.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test PQ-compressed disk index with multiple restarts. + * Verifies that search works correctly after persisting a PQ-compressed + * index to disk and reloading it across multiple restart cycles. 
+ */ + @Test + void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOException + { + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + final float[] queryVector = randomVector(new Random(999), dimension); + + // Phase 1: Create with 500 vectors and PQ, persist to disk + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + index.persistToDisk(); + + // Verify search works before restart + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + storage.storeRoot(); + } + } + + // Phase 2: Restart and verify search works from loaded disk index + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(500, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + // Search should work after reload + final VectorSearchResult result = 
index.search(queryVector, 10); + assertEquals(10, result.size()); + + // Verify all entities are accessible + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + + } + } + + // Phase 3: Second restart - verify search still works + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(500, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + final VectorSearchResult result = index.search(queryVector, 20); + assertEquals(20, result.size()); + } + } + } + + /** + * Test PQ-compressed disk index with removeAll and repopulation. + * Verifies the index can be cleared and rebuilt with PQ compression. + */ + @Test + void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) + { + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Initial population + addRandomDocuments(gigaMap, random, dimension, 500, "old_"); + + assertEquals(500, gigaMap.size()); + + // Clear all + gigaMap.removeAll(); + assertEquals(0, gigaMap.size()); + + // Repopulate + addRandomDocuments(gigaMap, random, dimension, 600, "new_"); + + assertEquals(600, gigaMap.size()); + + final VectorIndices vectorIndicesAfter = 
gigaMap.index().get(VectorIndices.Category()); + final VectorIndex indexAfter = vectorIndicesAfter.get("embeddings"); + + // Train PQ on new data + ((VectorIndex.Internal) indexAfter).trainCompressionIfNeeded(); + + // Search should find only new documents + final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); + assertEquals(20, result.size()); + + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("new_"))); + + } + + /** + * Test that in-memory index (default) still works as expected. + */ + @Test + void testInMemoryIndexStillWorks() + { + final int dimension = 32; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Default configuration (in-memory) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertFalse(config.onDisk()); + assertNull(config.indexDirectory()); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertFalse(index.isOnDisk()); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, 100, "doc_"); + + // Search should work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } + + + // ======================================================================== + // Background Persistence Tests + // ======================================================================== + + /** + * Test that background persistence triggers after the configured interval. 
+ */ + @Test + void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval for testing + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(500) // 500ms for fast test + .minChangesBetweenPersists(1) // Persist on any change + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors to trigger dirty state + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); + + // Initially, files should not exist (not yet persisted) + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist immediately after adding"); + + // Wait for background persistence to trigger (interval + some buffer) + await() + .atMost(ofMillis(1500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after background persistence"), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after background persistence"))); + + } finally { + index.close(); + } + } + + /** + * Test that search works concurrently during background persistence. 
+ */ + @Test + void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 200; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(200) // Short interval to trigger during test + .minChangesBetweenPersists(1) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add initial vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Run concurrent searches while background persistence may be running + final int numSearches = 50; + final AtomicInteger successfulSearches = new AtomicInteger(0); + final AtomicBoolean hasError = new AtomicBoolean(false); + final CountDownLatch latch = new CountDownLatch(numSearches); + final ExecutorService executor = Executors.newFixedThreadPool(4); + + for (int i = 0; i < numSearches; i++) { + final float[] queryVector = randomVector(new Random(i), dimension); + executor.submit(() -> + { + try { + final VectorSearchResult result = index.search(queryVector, 10); + if (result.size() == 10) { + successfulSearches.incrementAndGet(); + } + } catch (final Exception e) { + hasError.set(true); + e.printStackTrace(); + } finally { + latch.countDown(); + } + }); + + // Small delay to spread searches over time + Thread.sleep(20); + } + + // Wait for all searches to complete + assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); + executor.shutdown(); + + // Verify all searches succeeded + 
assertFalse(hasError.get(), "No errors should occur during concurrent search"); + assertEquals(numSearches, successfulSearches.get(), + "All searches should return expected number of results"); + } finally { + index.close(); + } + } + + /** + * Test that shutdown persists pending changes when persistOnShutdown is true. + */ + @Test + void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenPersists(1) + .persistOnShutdown(true) // Should persist on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Files should not exist yet (interval hasn't triggered) + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist before close"); + + // Close the index (should trigger persist due to persistOnShutdown=true) + index.close(); + + // Files should now exist + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after close with persistOnShutdown=true"); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after close with persistOnShutdown=true"); + } + + /** + * Test that shutdown does NOT persist when persistOnShutdown is false. 
+ */ + @Test + void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenPersists(1) + .persistOnShutdown(false) // Should NOT persist on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Close the index (should NOT trigger persist) + index.close(); + + // Files should NOT exist + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist after close with persistOnShutdown=false"); + } + + /** + * Test debouncing: persistence is skipped when change count is below threshold. 
+ */ + @Test + void testDebouncing(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with high threshold that won't be met + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(100) // Short interval + .minChangesBetweenPersists(500) // High threshold + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add fewer vectors than the threshold + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold + + // Wait for multiple persistence intervals + Thread.sleep(500); + + // Files should NOT exist because change count is below threshold + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist when changes below threshold"); + + // Now add more vectors to exceed the threshold + IntStream.range(50, 600) // Total now 600 > 500 threshold + .forEach(i -> gigaMap.add(new Document("doc_" + i, randomVector(random, dimension)))); + + await() + .atMost(ofMillis(500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist when changes exceed threshold")); + } finally { + index.close(); + } + } + + /** + * Test that adding vectors in bulk correctly tracks change count. 
+ */ + @Test + void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(300) + .minChangesBetweenPersists(100) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Bulk add documents + final List documents = new ArrayList<>(); + for (int i = 0; i < 150; i++) { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + gigaMap.addAll(documents); + + // Wait for persistence + await() + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after bulk add exceeds threshold")); + } finally { + index.close(); + } + } + + /** + * Test that background persistence can be reloaded after restart. 
+ */ + @Test + void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 200; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + final float[] queryVector = randomVector(new Random(999), dimension); + final int expectedK = 10; + + // Phase 1: Create index with background persistence and add vectors + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(100) + .minChangesBetweenPersists(1) + .persistOnShutdown(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Verify search works + final VectorSearchResult result = index.search(queryVector, expectedK); + assertEquals(expectedK, result.size()); + + storage.storeRoot(); + + // Explicitly close the index to trigger persistOnShutdown + // (EmbeddedStorageManager doesn't auto-close VectorIndex) + index.close(); + } + } + + // Verify files were persisted + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after close"); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after close"); + + // Phase 2: Reload and verify + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) 
storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); + + // Search should still work after reload + final VectorSearchResult result = index.search(queryVector, expectedK); + assertEquals(expectedK, result.size()); + + // Clean up + index.close(); + } + } + } + + /** + * Test that manual persistToDisk still works with background persistence enabled. + */ + @Test + void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger + .minChangesBetweenPersists(1000) // High threshold - won't trigger + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Files should not exist yet + assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); + + // Manually trigger persistence + index.persistToDisk(); + + // Files should now exist + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after manual persistToDisk"); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after manual persistToDisk"); + } finally { + 
index.close(); + } + } + + + // ======================================================================== + // Background Optimization Tests + // ======================================================================== + + /** + * Test that background optimization runs after the configured interval and threshold. + */ + @Test + void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval and low threshold for testing + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(300) // 300ms for fast test + .minChangesBetweenOptimizations(10) // Low threshold + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Initially, optimization count should be 0 + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization count should be 0 initially"); + + // Add vectors to trigger dirty state above threshold + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); + + // Verify pending changes are tracked + assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, + "Pending changes should be tracked"); + + // Verify optimization was actually performed + await() + .atLeast(ofMillis(300)) + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, + "Optimization 
should have been performed at least once")); + + // Verify pending changes were reset + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should be reset after optimization"); + + // Verify search still works + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that optimization is skipped when change count is below threshold. + */ + @Test + void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with high threshold that won't be met + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(200) // Short interval + .minChangesBetweenOptimizations(500) // High threshold + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Add fewer vectors than the threshold + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold + + // Verify pending changes are tracked + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should be 50"); + + // Wait for multiple optimization intervals + Thread.sleep(600); + + // Verify optimization was NOT performed (below threshold) + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization should NOT have been performed 
(below threshold)"); + + // Verify pending changes are still tracked (not reset) + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should still be 50 (not reset)"); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that shutdown optimizes pending changes when optimizeOnShutdown is true. + */ + @Test + void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenOptimizations(1) + .optimizeOnShutdown(true) // Should optimize on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Verify pending changes are tracked + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should equal vector count"); + + // Verify no optimization has run yet + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization count should be 0 before close"); + + // Verify search works before close + final VectorSearchResult resultBefore = 
index.search(randomVector(random, dimension), 10); + assertEquals(10, resultBefore.size()); + + // Close the index (should trigger optimize due to optimizeOnShutdown=true) + index.close(); + + // Note: After close(), we can't verify the count changed because the manager is shutdown. + // But we verified above that pending changes existed and the interval hadn't triggered. + // The fact that close() completed without error indicates optimization was attempted. + } + + /** + * Test that shutdown does NOT optimize when optimizeOnShutdown is false. + */ + @Test + void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenOptimizations(1) + .optimizeOnShutdown(false) // Should NOT optimize on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Verify pending changes are tracked + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should equal vector count"); + + // Verify no optimization has run yet + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization count should be 0 before close"); + + // Close the index 
(should NOT trigger optimize) + index.close(); + + // Note: After close(), we can't access the manager. But we verified: + // 1. Pending changes existed + // 2. No background optimization had run + // 3. optimizeOnShutdown=false was set + // So the pending changes should remain unoptimized. + } + + /** + * Test that search works concurrently during background optimization. + */ + @Test + void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 200; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(150) // Short interval to trigger during test + .minChangesBetweenOptimizations(1) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add initial vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Run concurrent searches while background optimization may be running + final int numSearches = 50; + final AtomicInteger successfulSearches = new AtomicInteger(0); + final AtomicBoolean hasError = new AtomicBoolean(false); + final CountDownLatch latch = new CountDownLatch(numSearches); + final ExecutorService executor = Executors.newFixedThreadPool(4); + + for (int i = 0; i < numSearches; i++) { + final float[] queryVector = randomVector(new Random(i), dimension); + executor.submit(() -> + { + try { + final VectorSearchResult result = index.search(queryVector, 10); + if (result.size() == 10) { + successfulSearches.incrementAndGet(); + } + } catch (final Exception 
e) { + hasError.set(true); + e.printStackTrace(); + } finally { + latch.countDown(); + } + }); + + // Small delay to spread searches over time + Thread.sleep(15); + } + + // Wait for all searches to complete + assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); + executor.shutdown(); + + // Verify all searches succeeded + assertFalse(hasError.get(), "No errors should occur during concurrent search with optimization"); + assertEquals(numSearches, successfulSearches.get(), + "All searches should return expected number of results"); + } finally { + index.close(); + } + } + + /** + * Test that bulk add correctly tracks change count for optimization. + */ + @Test + void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(100) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Bulk add documents that exceeds the threshold + final List documents = new ArrayList<>(); + for (int i = 0; i < 150; i++) { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + gigaMap.addAll(documents); + + // Wait for optimization + Thread.sleep(500); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that manual optimize() method still 
works with background optimization enabled. + */ + @Test + void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger + .minChangesBetweenOptimizations(1000) // High threshold - won't trigger + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Manually trigger optimization + index.optimize(); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that both background persistence and optimization can be enabled together. 
+ */ + @Test + void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 150; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Enable both background persistence and optimization + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(300) + .minChangesBetweenPersists(10) + .persistOnShutdown(true) + .optimizationIntervalMs(400) + .minChangesBetweenOptimizations(10) + .optimizeOnShutdown(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Wait for both background tasks to run + Thread.sleep(1000); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + + // Files should exist from background persistence + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist from background persistence"); + } finally { + index.close(); + } + } + + + // ======================================================================== + // Parallel vs Non-Parallel On-Disk Write Tests + // ======================================================================== + + + /** + * Test that parallel and non-parallel on-disk writes both support persist-and-reload + * for a large PQ-compressed index. + * Verifies that the graph files produced by both modes can be loaded correctly + * and yield equivalent search results after restart. 
+ */ + @Test + void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int pqSubspaces = 16; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel-index"); + final Path parallelStorageDir = tempDir.resolve("parallel-storage"); + final Path sequentialIndexDir = tempDir.resolve("sequential-index"); + final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); + + // --- Build and persist both modes --- + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, parallelIndexDir, parallelStorageDir, true); + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, sequentialIndexDir, sequentialStorageDir, false); + + // --- Reload both and compare search results --- + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new 
ArrayList<>(); + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for (final VectorSearchResult.Entry entry : result) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + // Both modes should produce equivalent results after reload + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential modes should produce identical search results after reload"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential modes should produce identical search scores after reload"); + } + + /** + * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
+ */ + private void buildAndPersistIndex( + final List vectors, + final float[] queryVector, + final int dimension, + final int pqSubspaces, + final Path indexDir, + final Path storageDir, + final boolean parallel + ) throws IOException + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(parallel) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + index.persistToDisk(); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + + // ======================================================================== + // Embedded Vectorizer + On-Disk Tests + // ======================================================================== + + /** + * Test that an embedded vectorizer with parallel on-disk write completes without deadlock. + *

+ * This is a regression test for a deadlock where {@code persistToDisk()} held + * {@code synchronized(parentMap)} for the entire disk write. The disk writer uses + * internal worker threads (ForkJoinPool for PQ encoding, parallel graph writer) + * that call {@code parentMap.get()} — which also synchronizes on the same monitor. + *

+ * The fix restructures locking: Phase 1 (prep) runs inside {@code synchronized(parentMap)}, + * Phase 2 (disk write) runs outside it but still holds {@code persistenceLock.writeLock()}. + *

+ * Uses {@code @Timeout} to fail fast if a deadlock occurs instead of hanging indefinitely. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); + + // Verify search still works after persist + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test that an embedded vectorizer with PQ compression and parallel on-disk write + * completes without deadlock. + *

+ * This is the most deadlock-prone scenario: FusedPQ encoding uses a ForkJoinPool + * that calls {@code getVector()} on worker threads, plus the parallel graph writer + * also calls {@code getVector()} from its own thread pool. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Train PQ compression + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); + + // Verify search still works + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. 
+ */ + @Test + void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + + // --- Parallel config + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(parallelIndexDir) + .parallelOnDiskWrite(true) + .build(); + + // --- Sequential config + final VectorIndex index = vectorIndices.add( + "embeddings", configParallel, new ComputedDocumentVectorizer() + ); + + final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .enablePqCompression(true) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex indexSequential = vectorIndices.add( + "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + index.persistToDisk(); + indexSequential.persistToDisk(); + + //parallel 
+ final VectorSearchResult result = index.search(queryVector, k); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + //sequential + final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); + for (final VectorSearchResult.Entry entry : resultSequential) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertAll( + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) + ); + + // Both indices were built from the same data with the same HNSW parameters, + // so search results must be identical. + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); + } } From f9d82e780516053f8458c389d0ae7c8a47527086 Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Fri, 20 Feb 2026 19:20:09 +0100 Subject: [PATCH 4/7] change indents - to see diff in PR --- .../gigamap/jvector/VectorIndexDiskTest.java | 4547 ++++++++--------- 1 file changed, 2272 insertions(+), 2275 deletions(-) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index ad9765e4..df8bcac4 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -14,14 +14,9 @@ * #L% */ -import 
org.eclipse.store.gigamap.types.GigaMap; -import org.eclipse.store.storage.embedded.types.EmbeddedStorage; -import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.RepeatedTest; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.junit.jupiter.api.io.TempDir; +import static java.time.Duration.ofMillis; +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; @@ -37,2279 +32,2281 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; -import static java.time.Duration.ofMillis; -import static java.time.Duration.ofSeconds; -import static org.awaitility.Awaitility.await; -import static org.junit.jupiter.api.Assertions.*; +import org.eclipse.store.gigamap.types.GigaMap; +import org.eclipse.store.storage.embedded.types.EmbeddedStorage; +import org.eclipse.store.storage.embedded.types.EmbeddedStorageManager; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.api.io.TempDir; /** * Tests for on-disk VectorIndex functionality and Product Quantization. */ class VectorIndexDiskTest { - /** - * Simple entity with an embedding vector. - */ - record Document(String content, float[] embedding) - { - } - - /** - * Computed vectorizer - simulates externally computed vectors. - */ - static class ComputedDocumentVectorizer extends Vectorizer - { - @Override - public float[] vectorize(final Document entity) - { - return entity.embedding(); - } - } - - /** - * Embedded vectorizer - vectors are part of the entity, not stored separately. 
- */ - static class EmbeddedDocumentVectorizer extends Vectorizer - { - @Override - public float[] vectorize(final Document entity) - { - return entity.embedding(); - } - - @Override - public boolean isEmbedded() - { - return true; - } - } - - /** - * Helper to generate a random normalized vector. - */ - private static float[] randomVector(final Random random, final int dimension) - { - final float[] vector = new float[dimension]; - float norm = 0; - for (int i = 0; i < dimension; i++) { - vector[i] = random.nextFloat() * 2 - 1; - norm += vector[i] * vector[i]; - } - norm = (float) Math.sqrt(norm); - for (int i = 0; i < dimension; i++) { - vector[i] /= norm; - } - return vector; - } - - /** - * Helper to add multiple documents with random vectors to a GigaMap. - */ - private static void addRandomDocuments( - final GigaMap gigaMap, - final Random random, - final int dimension, - final int count, - final String prefix - ) - { - IntStream.range(0, count) - .forEach(i -> gigaMap.add(new Document(prefix + i, randomVector(random, dimension)))); - } - - /** - * Helper to add multiple documents from a list of pre-generated vectors. - */ - private static void addDocumentsFromVectors( - final GigaMap gigaMap, - final List vectors, - final String prefix - ) - { - IntStream.range(0, vectors.size()) - .forEach(i -> gigaMap.add(new Document(prefix + i, vectors.get(i)))); - } - - - /** - * Test creating an on-disk index and persisting it. 
- */ - @Test - void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 500; - final int dimension = 64; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - // Generate vectors - final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { - vectors.add(randomVector(random, dimension)); - } - - final float[] queryVector = randomVector(new Random(999), dimension); - final List expectedIds = new ArrayList<>(); - - // Phase 1: Create index and persist - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertTrue(index.isOnDisk()); - assertFalse(index.isPqCompressionEnabled()); - - // Add vectors - addDocumentsFromVectors(gigaMap, vectors, "doc_"); - - // Search and record expected results - final VectorSearchResult result = index.search(queryVector, 10); - for (final VectorSearchResult.Entry entry : result) { - expectedIds.add(entry.entityId()); - } - - // Persist index to disk - index.persistToDisk(); - - // Verify files were created - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - storage.storeRoot(); - } - } - - // Phase 2: Reload and verify - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = 
(GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertTrue(index.isOnDisk()); - - // Search and compare results - final VectorSearchResult result = index.search(queryVector, 10); - final List actualIds = new ArrayList<>(); - for (final VectorSearchResult.Entry entry : result) { - actualIds.add(entry.entityId()); - } - - // Results should match (or at least be very similar due to HNSW nature) - assertEquals(expectedIds.size(), actualIds.size()); - } - } - } - - /** - * Test on-disk index with compression (PQ). - */ - @Test - void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; // 64 / 16 = 4 dimensions per subspace - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Train compression - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - // Search should work - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - - // Verify all 
entities are accessible - result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); - - // Persist to disk - index.persistToDisk(); - - // Verify graph file was created (FusedPQ is embedded in graph, no separate .pq file) - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - assertFalse(Files.exists(indexDir.resolve("embeddings.pq")), - "FusedPQ should be embedded in graph file, not in separate .pq file"); - } - - /** - * Test search quality with on-disk index - verify exact match is found first. - */ - @Test - void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 1000; - final int dimension = 64; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add random vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); - - // Add a one-hot "needle" vector that randomVector() cannot produce, - // since randomVector() populates all dimensions with non-zero values. 
- final float[] needleVector = new float[dimension]; - needleVector[0] = 1.0f; - - gigaMap.add(new Document("needle", needleVector)); - - // Persist index - index.persistToDisk(); - - // Search for the needle vector - it should be the first result - final VectorSearchResult result = index.search(needleVector, 5); - - assertEquals(5, result.size()); - final VectorSearchResult.Entry firstResult = result.iterator().next(); - assertEquals("needle", firstResult.entity().content(), "Exact match should be first result"); - assertTrue(firstResult.score() > 0.99f, "Exact match should have score close to 1.0"); - } - - /** - * Test multiple restarts with on-disk index. - */ - @Test - void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOException - { - final int dimension = 32; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - // Phase 1: Create with 100 vectors - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); - - addRandomDocuments(gigaMap, random, dimension, 100, "phase1_doc_"); - - assertEquals(100, gigaMap.size()); - storage.storeRoot(); - } - } - - // Phase 2: Restart and add 50 more vectors - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(100, 
gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - - // Add more vectors - addRandomDocuments(gigaMap, random, dimension, 50, "phase2_doc_"); - - assertEquals(150, gigaMap.size()); - storage.storeRoot(); - } - } - - // Phase 3: Final verification - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - VectorIndex index = gigaMap.index().get(VectorIndices.Category()).get("embeddings"); - - assertEquals(150, gigaMap.size()); - - final VectorSearchResult result = index.search(randomVector(random, dimension), 30); - assertEquals(30, result.size()); - } - } - } - - // ======================================================================== - // PQ Compression Search Tests - // ======================================================================== - - /** - * Test search quality with PQ compression enabled. - * Verifies that an exact match (needle) is found in the top results - * despite quantization loss from Product Quantization. 
- */ - @Test - void testPqCompressionSearchQuality(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add random vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); - - // Add a one-hot "needle" vector that randomVector() cannot produce, - // since randomVector() populates all dimensions with non-zero values. 
- final float[] needleVector = new float[dimension]; - needleVector[0] = 1.0f; - - gigaMap.add(new Document("needle", needleVector)); - - // Train PQ compression - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - // Search for the needle vector - it should be in the top results - final VectorSearchResult result = index.search(needleVector, 5); - - assertEquals(5, result.size()); - final VectorSearchResult.Entry firstResult = result.iterator().next(); - assertEquals("needle", firstResult.entity().content(), - "Exact match should be first result even with PQ compression"); - assertTrue(firstResult.score() > 0.99f, - "Exact match should have score close to 1.0"); - - // Verify results are ordered by score - float prevScore = Float.MAX_VALUE; - for (final VectorSearchResult.Entry entry : result) { - assertTrue(entry.score() <= prevScore, "Results should be ordered by score"); - prevScore = entry.score(); - } - } - - /** - * Test PQ-compressed disk index persistence and reload with search verification. - * Verifies that search still works correctly after saving and reloading - * a PQ-compressed index. 
- */ - @Test - void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { - vectors.add(randomVector(random, dimension)); - } - - final float[] queryVector = randomVector(new Random(999), dimension); - final List expectedIds = new ArrayList<>(); - - // Phase 1: Create index with PQ, populate, search, persist - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - addDocumentsFromVectors(gigaMap, vectors, "doc_"); - - // Train and search - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - final VectorSearchResult result = index.search(queryVector, 10); - for (final VectorSearchResult.Entry entry : result) { - expectedIds.add(entry.entityId()); - } - - // Persist - index.persistToDisk(); - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - storage.storeRoot(); - } - } - - // Phase 2: Reload and verify search results - { - try (final EmbeddedStorageManager storage = 
EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - // Search after reload - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - final List actualIds = new ArrayList<>(); - for (final VectorSearchResult.Entry entry : result) { - actualIds.add(entry.entityId()); - } - - // Results should match (or at least overlap significantly) - assertEquals(expectedIds.size(), actualIds.size()); - - // Verify all entities are accessible - result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); - } - } - } - - /** - * Test PQ-compressed disk index with DOT_PRODUCT similarity function. - */ - @Test - void testPqCompressionWithDotProduct(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.DOT_PRODUCT) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - final float[] queryVector = randomVector(random, dimension); - final 
VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - result.forEach(entry -> assertNotNull(entry.entity())); - } - - /** - * Test PQ-compressed disk index with EUCLIDEAN similarity function. - */ - @Test - void testPqCompressionWithEuclidean(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.EUCLIDEAN) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - result.forEach(entry -> assertNotNull(entry.entity())); - } - - /** - * Test PQ compression with default subspaces (auto-calculated as dimension/4). 
- */ - @Test - void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 128; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - // pqSubspaces not set - should default to dimension/4 = 32 - .build(); - - assertEquals(0, config.pqSubspaces(), - "pqSubspaces should be 0 (auto-calculated at runtime)"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - - assertEquals(10, result.size()); - result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); - } - - /** - * Test removing entities from a PQ-compressed disk index. - * Verifies that removed entities do not appear in search results. 
- */ - @Test - void testPqCompressionWithRemoval(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - // Remove every other entity (even IDs) - for (int i = 0; i < vectorCount; i += 2) { - gigaMap.removeById(i); - } - - assertEquals(vectorCount / 2, gigaMap.size()); - - // Search should only return remaining entities - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - - for (final VectorSearchResult.Entry entry : result) { - final String content = entry.entity().content(); - final int docNum = Integer.parseInt(content.replace("doc_", "")); - assertTrue(docNum % 2 != 0, - "Only odd-numbered documents should remain, found: " + content); - } - } - - /** - * Test concurrent search with PQ compression enabled. - * Verifies thread safety of PQ-compressed search. 
- */ - @Test - void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Exception - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - // Run concurrent searches - final int numSearches = 50; - final AtomicInteger successfulSearches = new AtomicInteger(0); - final AtomicBoolean hasError = new AtomicBoolean(false); - final CountDownLatch latch = new CountDownLatch(numSearches); - final ExecutorService executor = Executors.newFixedThreadPool(4); - - for (int i = 0; i < numSearches; i++) { - final float[] queryVector = randomVector(new Random(i), dimension); - executor.submit(() -> - { - try { - final VectorSearchResult result = index.search(queryVector, 10); - if (result.size() == 10) { - successfulSearches.incrementAndGet(); - } - } catch (final Exception e) { - hasError.set(true); - e.printStackTrace(); - } finally { - latch.countDown(); - } - }); - } - - assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); - executor.shutdown(); - - assertFalse(hasError.get(), "No errors should occur during concurrent PQ search"); - assertEquals(numSearches, successfulSearches.get(), - "All concurrent PQ searches should return expected results"); - } 
- - /** - * Test adding vectors after PQ training. - * Verifies that search still works after adding more vectors post-training. - */ - @Test - void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) - { - final int initialCount = 500; - final int additionalCount = 200; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add initial vectors - addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); - - // Train PQ - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - // Search before adding more - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult resultBefore = index.search(queryVector, 10); - assertEquals(10, resultBefore.size()); - - // Add more vectors after training - addRandomDocuments(gigaMap, random, dimension, additionalCount, "additional_"); - - assertEquals(initialCount + additionalCount, gigaMap.size()); - - // Search should still work and may include newly added vectors - final VectorSearchResult resultAfter = index.search(queryVector, 10); - assertEquals(10, resultAfter.size()); - - resultAfter.forEach(entry -> assertNotNull(entry.entity())); - } - - /** - * Test PQ-compressed disk index with multiple restarts. - * Verifies that search works correctly after persisting a PQ-compressed - * index to disk and reloading it across multiple restart cycles. 
- */ - @Test - void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOException - { - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - final float[] queryVector = randomVector(new Random(999), dimension); - - // Phase 1: Create with 500 vectors and PQ, persist to disk - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - index.persistToDisk(); - - // Verify search works before restart - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - storage.storeRoot(); - } - } - - // Phase 2: Restart and verify search works from loaded disk index - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(500, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertTrue(index.isOnDisk()); - assertTrue(index.isPqCompressionEnabled()); - - // Search should work after reload - final VectorSearchResult result = 
index.search(queryVector, 10); - assertEquals(10, result.size()); - - // Verify all entities are accessible - result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); - - } - } - - // Phase 3: Second restart - verify search still works - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(500, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - final VectorSearchResult result = index.search(queryVector, 20); - assertEquals(20, result.size()); - } - } - } - - /** - * Test PQ-compressed disk index with removeAll and repopulation. - * Verifies the index can be cleared and rebuilt with PQ compression. - */ - @Test - void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) - { - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); - - vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Initial population - addRandomDocuments(gigaMap, random, dimension, 500, "old_"); - - assertEquals(500, gigaMap.size()); - - // Clear all - gigaMap.removeAll(); - assertEquals(0, gigaMap.size()); - - // Repopulate - addRandomDocuments(gigaMap, random, dimension, 600, "new_"); - - assertEquals(600, gigaMap.size()); - - final VectorIndices vectorIndicesAfter = 
gigaMap.index().get(VectorIndices.Category()); - final VectorIndex indexAfter = vectorIndicesAfter.get("embeddings"); - - // Train PQ on new data - ((VectorIndex.Internal) indexAfter).trainCompressionIfNeeded(); - - // Search should find only new documents - final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); - assertEquals(20, result.size()); - - result.forEach(entry -> assertTrue(entry.entity().content().startsWith("new_"))); - - } - - /** - * Test that in-memory index (default) still works as expected. - */ - @Test - void testInMemoryIndexStillWorks() - { - final int dimension = 32; - final Random random = new Random(42); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Default configuration (in-memory) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .build(); - - assertFalse(config.onDisk()); - assertNull(config.indexDirectory()); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - assertFalse(index.isOnDisk()); - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, 100, "doc_"); - - // Search should work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } - - - // ======================================================================== - // Background Persistence Tests - // ======================================================================== - - /** - * Test that background persistence triggers after the configured interval. 
- */ - @Test - void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval for testing - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(500) // 500ms for fast test - .minChangesBetweenPersists(1) // Persist on any change - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add vectors to trigger dirty state - addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); - - // Initially, files should not exist (not yet persisted) - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist immediately after adding"); - - // Wait for background persistence to trigger (interval + some buffer) - await() - .atMost(ofMillis(1500)) - .pollInterval(ofMillis(100)) - .untilAsserted(() -> assertAll( - () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after background persistence"), - () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after background persistence"))); - - } finally { - index.close(); - } - } - - /** - * Test that search works concurrently during background persistence. 
- */ - @Test - void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 200; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(200) // Short interval to trigger during test - .minChangesBetweenPersists(1) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add initial vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Run concurrent searches while background persistence may be running - final int numSearches = 50; - final AtomicInteger successfulSearches = new AtomicInteger(0); - final AtomicBoolean hasError = new AtomicBoolean(false); - final CountDownLatch latch = new CountDownLatch(numSearches); - final ExecutorService executor = Executors.newFixedThreadPool(4); - - for (int i = 0; i < numSearches; i++) { - final float[] queryVector = randomVector(new Random(i), dimension); - executor.submit(() -> - { - try { - final VectorSearchResult result = index.search(queryVector, 10); - if (result.size() == 10) { - successfulSearches.incrementAndGet(); - } - } catch (final Exception e) { - hasError.set(true); - e.printStackTrace(); - } finally { - latch.countDown(); - } - }); - - // Small delay to spread searches over time - Thread.sleep(20); - } - - // Wait for all searches to complete - assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); - executor.shutdown(); - - // Verify all searches succeeded - 
assertFalse(hasError.get(), "No errors should occur during concurrent search"); - assertEquals(numSearches, successfulSearches.get(), - "All searches should return expected number of results"); - } finally { - index.close(); - } - } - - /** - * Test that shutdown persists pending changes when persistOnShutdown is true. - */ - @Test - void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenPersists(1) - .persistOnShutdown(true) // Should persist on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Files should not exist yet (interval hasn't triggered) - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist before close"); - - // Close the index (should trigger persist due to persistOnShutdown=true) - index.close(); - - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after close with persistOnShutdown=true"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after close with persistOnShutdown=true"); - } - - /** - * Test that shutdown does NOT persist when persistOnShutdown is false. 
- */ - @Test - void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenPersists(1) - .persistOnShutdown(false) // Should NOT persist on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Close the index (should NOT trigger persist) - index.close(); - - // Files should NOT exist - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist after close with persistOnShutdown=false"); - } - - /** - * Test debouncing: persistence is skipped when change count is below threshold. 
- */ - @Test - void testDebouncing(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with high threshold that won't be met - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(100) // Short interval - .minChangesBetweenPersists(500) // High threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add fewer vectors than the threshold - addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold - - // Wait for multiple persistence intervals - Thread.sleep(500); - - // Files should NOT exist because change count is below threshold - assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist when changes below threshold"); - - // Now add more vectors to exceed the threshold - IntStream.range(50, 600) // Total now 600 > 500 threshold - .forEach(i -> gigaMap.add(new Document("doc_" + i, randomVector(random, dimension)))); - - await() - .atMost(ofMillis(500)) - .pollInterval(ofMillis(100)) - .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist when changes exceed threshold")); - } finally { - index.close(); - } - } - - /** - * Test that adding vectors in bulk correctly tracks change count. 
- */ - @Test - void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(300) - .minChangesBetweenPersists(100) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Bulk add documents - final List documents = new ArrayList<>(); - for (int i = 0; i < 150; i++) { - documents.add(new Document("doc_" + i, randomVector(random, dimension))); - } - gigaMap.addAll(documents); - - // Wait for persistence - await() - .atMost(ofMillis(800)) - .pollInterval(ofMillis(100)) - .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after bulk add exceeds threshold")); - } finally { - index.close(); - } - } - - /** - * Test that background persistence can be reloaded after restart. 
- */ - @Test - void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 200; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - final Path storageDir = tempDir.resolve("storage"); - - final float[] queryVector = randomVector(new Random(999), dimension); - final int expectedK = 10; - - // Phase 1: Create index with background persistence and add vectors - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(100) - .minChangesBetweenPersists(1) - .persistOnShutdown(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Verify search works - final VectorSearchResult result = index.search(queryVector, expectedK); - assertEquals(expectedK, result.size()); - - storage.storeRoot(); - - // Explicitly close the index to trigger persistOnShutdown - // (EmbeddedStorageManager doesn't auto-close VectorIndex) - index.close(); - } - } - - // Verify files were persisted - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after close"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after close"); - - // Phase 2: Reload and verify - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) 
storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); - - // Search should still work after reload - final VectorSearchResult result = index.search(queryVector, expectedK); - assertEquals(expectedK, result.size()); - - // Clean up - index.close(); - } - } - } - - /** - * Test that manual persistToDisk still works with background persistence enabled. - */ - @Test - void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger - .minChangesBetweenPersists(1000) // High threshold - won't trigger - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Files should not exist yet - assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); - - // Manually trigger persistence - index.persistToDisk(); - - // Files should now exist - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after manual persistToDisk"); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after manual persistToDisk"); - } finally { - 
index.close(); - } - } - - - // ======================================================================== - // Background Optimization Tests - // ======================================================================== - - /** - * Test that background optimization runs after the configured interval and threshold. - */ - @Test - void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with short interval and low threshold for testing - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) // 300ms for fast test - .minChangesBetweenOptimizations(10) // Low threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; - - // Initially, optimization count should be 0 - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 initially"); - - // Add vectors to trigger dirty state above threshold - addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); - - // Verify pending changes are tracked - assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, - "Pending changes should be tracked"); - - // Verify optimization was actually performed - await() - .atLeast(ofMillis(300)) - .atMost(ofMillis(800)) - .pollInterval(ofMillis(100)) - .untilAsserted(() -> assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, - "Optimization 
should have been performed at least once")); - - // Verify pending changes were reset - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should be reset after optimization"); - - // Verify search still works - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } finally { - index.close(); - } - } - - /** - * Test that optimization is skipped when change count is below threshold. - */ - @Test - void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Configure with high threshold that won't be met - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(200) // Short interval - .minChangesBetweenOptimizations(500) // High threshold - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; - - // Add fewer vectors than the threshold - addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold - - // Verify pending changes are tracked - assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should be 50"); - - // Wait for multiple optimization intervals - Thread.sleep(600); - - // Verify optimization was NOT performed (below threshold) - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization should NOT have been performed 
(below threshold)"); - - // Verify pending changes are still tracked (not reset) - assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should still be 50 (not reset)"); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } finally { - index.close(); - } - } - - /** - * Test that shutdown optimizes pending changes when optimizeOnShutdown is true. - */ - @Test - void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenOptimizations(1) - .optimizeOnShutdown(true) // Should optimize on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should equal vector count"); - - // Verify no optimization has run yet - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 before close"); - - // Verify search works before close - final VectorSearchResult resultBefore = 
index.search(randomVector(random, dimension), 10); - assertEquals(10, resultBefore.size()); - - // Close the index (should trigger optimize due to optimizeOnShutdown=true) - index.close(); - - // Note: After close(), we can't verify the count changed because the manager is shutdown. - // But we verified above that pending changes existed and the interval hadn't triggered. - // The fact that close() completed without error indicates optimization was attempted. - } - - /** - * Test that shutdown does NOT optimize when optimizeOnShutdown is false. - */ - @Test - void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenOptimizations(1) - .optimizeOnShutdown(false) // Should NOT optimize on close - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Verify pending changes are tracked - assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should equal vector count"); - - // Verify no optimization has run yet - assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 before close"); - - // Close the index 
(should NOT trigger optimize) - index.close(); - - // Note: After close(), we can't access the manager. But we verified: - // 1. Pending changes existed - // 2. No background optimization had run - // 3. optimizeOnShutdown=false was set - // So the pending changes should remain unoptimized. - } - - /** - * Test that search works concurrently during background optimization. - */ - @Test - void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 200; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(150) // Short interval to trigger during test - .minChangesBetweenOptimizations(1) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add initial vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Run concurrent searches while background optimization may be running - final int numSearches = 50; - final AtomicInteger successfulSearches = new AtomicInteger(0); - final AtomicBoolean hasError = new AtomicBoolean(false); - final CountDownLatch latch = new CountDownLatch(numSearches); - final ExecutorService executor = Executors.newFixedThreadPool(4); - - for (int i = 0; i < numSearches; i++) { - final float[] queryVector = randomVector(new Random(i), dimension); - executor.submit(() -> - { - try { - final VectorSearchResult result = index.search(queryVector, 10); - if (result.size() == 10) { - successfulSearches.incrementAndGet(); - } - } catch (final Exception 
e) { - hasError.set(true); - e.printStackTrace(); - } finally { - latch.countDown(); - } - }); - - // Small delay to spread searches over time - Thread.sleep(15); - } - - // Wait for all searches to complete - assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); - executor.shutdown(); - - // Verify all searches succeeded - assertFalse(hasError.get(), "No errors should occur during concurrent search with optimization"); - assertEquals(numSearches, successfulSearches.get(), - "All searches should return expected number of results"); - } finally { - index.close(); - } - } - - /** - * Test that bulk add correctly tracks change count for optimization. - */ - @Test - void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) - .minChangesBetweenOptimizations(100) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Bulk add documents that exceeds the threshold - final List documents = new ArrayList<>(); - for (int i = 0; i < 150; i++) { - documents.add(new Document("doc_" + i, randomVector(random, dimension))); - } - gigaMap.addAll(documents); - - // Wait for optimization - Thread.sleep(500); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } finally { - index.close(); - } - } - - /** - * Test that manual optimize() method still 
works with background optimization enabled. - */ - @Test - void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 100; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger - .minChangesBetweenOptimizations(1000) // High threshold - won't trigger - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Manually trigger optimization - index.optimize(); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - } finally { - index.close(); - } - } - - /** - * Test that both background persistence and optimization can be enabled together. 
- */ - @Test - void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDir) throws Exception - { - final int dimension = 32; - final int vectorCount = 150; - final Random random = new Random(42); - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - // Enable both background persistence and optimization - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(300) - .minChangesBetweenPersists(10) - .persistOnShutdown(true) - .optimizationIntervalMs(400) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() - ); - - try { - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Wait for both background tasks to run - Thread.sleep(1000); - - // Search should still work - final VectorSearchResult result = index.search(randomVector(random, dimension), 10); - assertEquals(10, result.size()); - - // Files should exist from background persistence - assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist from background persistence"); - } finally { - index.close(); - } - } - - - // ======================================================================== - // Parallel vs Non-Parallel On-Disk Write Tests - // ======================================================================== - - - /** - * Test that parallel and non-parallel on-disk writes both support persist-and-reload - * for a large PQ-compressed index. - * Verifies that the graph files produced by both modes can be loaded correctly - * and yield equivalent search results after restart. 
- */ - @Test - void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int pqSubspaces = 16; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel-index"); - final Path parallelStorageDir = tempDir.resolve("parallel-storage"); - final Path sequentialIndexDir = tempDir.resolve("sequential-index"); - final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); - - // --- Build and persist both modes --- - buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, parallelIndexDir, parallelStorageDir, true); - buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, sequentialIndexDir, sequentialStorageDir, false); - - // --- Reload both and compare search results --- - final List parallelIds = new ArrayList<>(); - final List parallelScores = new ArrayList<>(); - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertTrue(index.isOnDisk()); - - final VectorSearchResult result = index.search(queryVector, k); - assertEquals(k, result.size()); - for (final VectorSearchResult.Entry entry : result) { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - assertNotNull(entry.entity()); - } - } - } - - final List sequentialIds = new ArrayList<>(); - final List sequentialScores = new 
ArrayList<>(); - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); - - assertEquals(vectorCount, gigaMap.size()); - - final VectorIndex index = vectorIndices.get("embeddings"); - assertTrue(index.isOnDisk()); - - final VectorSearchResult result = index.search(queryVector, k); - assertEquals(k, result.size()); - for (final VectorSearchResult.Entry entry : result) { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - assertNotNull(entry.entity()); - } - } - } - - // Both modes should produce equivalent results after reload - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential modes should produce identical search results after reload"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential modes should produce identical search scores after reload"); - } - - /** - * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
- */ - private void buildAndPersistIndex( - final List vectors, - final float[] queryVector, - final int dimension, - final int pqSubspaces, - final Path indexDir, - final Path storageDir, - final boolean parallel - ) throws IOException - { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - final GigaMap gigaMap = GigaMap.New(); - storage.setRoot(gigaMap); - - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(parallel) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() - ); - - addDocumentsFromVectors(gigaMap, vectors, "doc_"); - - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - index.persistToDisk(); - - assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); - assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); - - storage.storeRoot(); - } - } - - - // ======================================================================== - // Embedded Vectorizer + On-Disk Tests - // ======================================================================== - - /** - * Test that an embedded vectorizer with parallel on-disk write completes without deadlock. - *

- * This is a regression test for a deadlock where {@code persistToDisk()} held - * {@code synchronized(parentMap)} for the entire disk write. The disk writer uses - * internal worker threads (ForkJoinPool for PQ encoding, parallel graph writer) - * that call {@code parentMap.get()} — which also synchronizes on the same monitor. - *

- * The fix restructures locking: Phase 1 (prep) runs inside {@code synchronized(parentMap)}, - * Phase 2 (disk write) runs outside it but still holds {@code persistenceLock.writeLock()}. - *

- * Uses {@code @Timeout} to fail fast if a deadlock occurs instead of hanging indefinitely. - */ - @Test - @Timeout(value = 60, unit = TimeUnit.SECONDS) - void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new EmbeddedDocumentVectorizer() - ); - - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // This would deadlock before the fix - index.persistToDisk(); - - // Verify files were created - assertAll( - () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), - () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) - ); - - // Verify search still works after persist - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - - result.forEach(entry -> assertNotNull(entry.entity())); - } - - /** - * Test that an embedded vectorizer with PQ compression and parallel on-disk write - * completes without deadlock. - *

- * This is the most deadlock-prone scenario: FusedPQ encoding uses a ForkJoinPool - * that calls {@code getVector()} on worker threads, plus the parallel graph writer - * also calls {@code getVector()} from its own thread pool. - */ - @Test - @Timeout(value = 60, unit = TimeUnit.SECONDS) - void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path tempDir) - { - final int vectorCount = 500; - final int dimension = 64; - final int pqSubspaces = 16; - final Random random = new Random(42); - - final Path indexDir = tempDir.resolve("index"); - - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(true) - .build(); - - final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new EmbeddedDocumentVectorizer() - ); - - // Add vectors - addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - - // Train PQ compression - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); - - // This would deadlock before the fix - index.persistToDisk(); - - // Verify files were created - assertAll( - () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), - () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) - ); - - // Verify search still works - final float[] queryVector = randomVector(random, dimension); - final VectorSearchResult result = index.search(queryVector, 10); - assertEquals(10, result.size()); - } - - /** - * Test that parallel and non-parallel on-disk writes produce equivalent search results - * for a large index without PQ compression. - * Both modes should produce identical graph files that yield the same search quality. 
- */ - @Test - void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException - { - final int vectorCount = 2000; - final int dimension = 64; - final int k = 20; - final Random random = new Random(42); - - // Generate shared vectors and query - final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { - vectors.add(randomVector(random, dimension)); - } - final float[] queryVector = randomVector(new Random(999), dimension); - - final Path parallelIndexDir = tempDir.resolve("parallel"); - final Path sequentialIndexDir = tempDir.resolve("sequential"); - - final List parallelIds = new ArrayList<>(); - final List parallelScores = new ArrayList<>(); - final List sequentialIds = new ArrayList<>(); - final List sequentialScores = new ArrayList<>(); - - // --- Parallel config - final GigaMap gigaMap = GigaMap.New(); - final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); - - final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .beamWidth(100) - .onDisk(true) - .indexDirectory(parallelIndexDir) - .parallelOnDiskWrite(true) - .build(); - - // --- Sequential config - final VectorIndex index = vectorIndices.add( - "embeddings", configParallel, new ComputedDocumentVectorizer() - ); - - final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(16) - .enablePqCompression(true) - .beamWidth(100) - .onDisk(true) - .indexDirectory(sequentialIndexDir) - .parallelOnDiskWrite(false) - .build(); - - final VectorIndex indexSequential = vectorIndices.add( - "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() - ); - - addDocumentsFromVectors(gigaMap, vectors, "doc_"); - - index.persistToDisk(); - indexSequential.persistToDisk(); - - //parallel 
- final VectorSearchResult result = index.search(queryVector, k); - for (final VectorSearchResult.Entry entry : result) { - parallelIds.add(entry.entityId()); - parallelScores.add(entry.score()); - } - - //sequential - final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); - for (final VectorSearchResult.Entry entry : resultSequential) { - sequentialIds.add(entry.entityId()); - sequentialScores.add(entry.score()); - } - - assertAll( - () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), - () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), - () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), - () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) - ); - - // Both indices were built from the same data with the same HNSW parameters, - // so search results must be identical. - assertEquals(parallelIds, sequentialIds, - "Parallel and sequential on-disk writes should produce identical search results"); - assertEquals(parallelScores, sequentialScores, - "Parallel and sequential on-disk writes should produce identical search scores"); - } + /** + * Simple entity with an embedding vector. + */ + record Document(String content, float[] embedding) + { + } + + /** + * Computed vectorizer - simulates externally computed vectors. + */ + static class ComputedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + /** + * Embedded vectorizer - vectors are part of the entity, not stored separately. + */ + static class EmbeddedDocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + + @Override + public boolean isEmbedded() + { + return true; + } + } + + /** + * Helper to generate a random normalized vector. 
+ */ + private static float[] randomVector(final Random random, final int dimension) + { + final float[] vector = new float[dimension]; + float norm = 0; + for (int i = 0; i < dimension; i++) { + vector[i] = random.nextFloat() * 2 - 1; + norm += vector[i] * vector[i]; + } + norm = (float) Math.sqrt(norm); + for (int i = 0; i < dimension; i++) { + vector[i] /= norm; + } + return vector; + } + + /** + * Helper to add multiple documents with random vectors to a GigaMap. + */ + private static void addRandomDocuments( + final GigaMap gigaMap, + final Random random, + final int dimension, + final int count, + final String prefix + ) + { + IntStream.range(0, count) + .forEach(i -> gigaMap.add(new Document(prefix + i, randomVector(random, dimension)))); + } + + /** + * Helper to add multiple documents from a list of pre-generated vectors. + */ + private static void addDocumentsFromVectors( + final GigaMap gigaMap, + final List vectors, + final String prefix + ) + { + IntStream.range(0, vectors.size()) + .forEach(i -> gigaMap.add(new Document(prefix + i, vectors.get(i)))); + } + + + /** + * Test creating an on-disk index and persisting it. 
+ */ + @Test + void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 500; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + // Generate vectors + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + + final float[] queryVector = randomVector(new Random(999), dimension); + final List expectedIds = new ArrayList<>(); + + // Phase 1: Create index and persist + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertTrue(index.isOnDisk()); + assertFalse(index.isPqCompressionEnabled()); + + // Add vectors + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + // Search and record expected results + final VectorSearchResult result = index.search(queryVector, 10); + for (final VectorSearchResult.Entry entry : result) { + expectedIds.add(entry.entityId()); + } + + // Persist index to disk + index.persistToDisk(); + + // Verify files were created + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + // Phase 2: Reload and verify + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = 
(GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + // Search and compare results + final VectorSearchResult result = index.search(queryVector, 10); + final List actualIds = new ArrayList<>(); + for (final VectorSearchResult.Entry entry : result) { + actualIds.add(entry.entityId()); + } + + // Results should match (or at least be very similar due to HNSW nature) + assertEquals(expectedIds.size(), actualIds.size()); + } + } + } + + /** + * Test on-disk index with compression (PQ). + */ + @Test + void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; // 64 / 16 = 4 dimensions per subspace + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Train compression + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Search should work + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + + // Verify all 
entities are accessible + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + + // Persist to disk + index.persistToDisk(); + + // Verify graph file was created (FusedPQ is embedded in graph, no separate .pq file) + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + assertFalse(Files.exists(indexDir.resolve("embeddings.pq")), + "FusedPQ should be embedded in graph file, not in separate .pq file"); + } + + /** + * Test search quality with on-disk index - verify exact match is found first. + */ + @Test + void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 1000; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add random vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); + + // Add a one-hot "needle" vector that randomVector() cannot produce, + // since randomVector() populates all dimensions with non-zero values. 
+ final float[] needleVector = new float[dimension]; + needleVector[0] = 1.0f; + + gigaMap.add(new Document("needle", needleVector)); + + // Persist index + index.persistToDisk(); + + // Search for the needle vector - it should be the first result + final VectorSearchResult result = index.search(needleVector, 5); + + assertEquals(5, result.size()); + final VectorSearchResult.Entry firstResult = result.iterator().next(); + assertEquals("needle", firstResult.entity().content(), "Exact match should be first result"); + assertTrue(firstResult.score() > 0.99f, "Exact match should have score close to 1.0"); + } + + /** + * Test multiple restarts with on-disk index. + */ + @Test + void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOException + { + final int dimension = 32; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + // Phase 1: Create with 100 vectors + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); + + vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); + + addRandomDocuments(gigaMap, random, dimension, 100, "phase1_doc_"); + + assertEquals(100, gigaMap.size()); + storage.storeRoot(); + } + } + + // Phase 2: Restart and add 50 more vectors + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(100, 
gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + + // Add more vectors + addRandomDocuments(gigaMap, random, dimension, 50, "phase2_doc_"); + + assertEquals(150, gigaMap.size()); + storage.storeRoot(); + } + } + + // Phase 3: Final verification + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + VectorIndex index = gigaMap.index().get(VectorIndices.Category()).get("embeddings"); + + assertEquals(150, gigaMap.size()); + + final VectorSearchResult result = index.search(randomVector(random, dimension), 30); + assertEquals(30, result.size()); + } + } + } + + // ======================================================================== + // PQ Compression Search Tests + // ======================================================================== + + /** + * Test search quality with PQ compression enabled. + * Verifies that an exact match (needle) is found in the top results + * despite quantization loss from Product Quantization. 
+ */ + @Test + void testPqCompressionSearchQuality(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add random vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount - 1, "random_"); + + // Add a one-hot "needle" vector that randomVector() cannot produce, + // since randomVector() populates all dimensions with non-zero values. 
+ final float[] needleVector = new float[dimension]; + needleVector[0] = 1.0f; + + gigaMap.add(new Document("needle", needleVector)); + + // Train PQ compression + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Search for the needle vector - it should be in the top results + final VectorSearchResult result = index.search(needleVector, 5); + + assertEquals(5, result.size()); + final VectorSearchResult.Entry firstResult = result.iterator().next(); + assertEquals("needle", firstResult.entity().content(), + "Exact match should be first result even with PQ compression"); + assertTrue(firstResult.score() > 0.99f, + "Exact match should have score close to 1.0"); + + // Verify results are ordered by score + float prevScore = Float.MAX_VALUE; + for (final VectorSearchResult.Entry entry : result) { + assertTrue(entry.score() <= prevScore, "Results should be ordered by score"); + prevScore = entry.score(); + } + } + + /** + * Test PQ-compressed disk index persistence and reload with search verification. + * Verifies that search still works correctly after saving and reloading + * a PQ-compressed index. 
+ */ + @Test + void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + + final float[] queryVector = randomVector(new Random(999), dimension); + final List expectedIds = new ArrayList<>(); + + // Phase 1: Create index with PQ, populate, search, persist + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + // Train and search + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final VectorSearchResult result = index.search(queryVector, 10); + for (final VectorSearchResult.Entry entry : result) { + expectedIds.add(entry.entityId()); + } + + // Persist + index.persistToDisk(); + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + // Phase 2: Reload and verify search results + { + try (final EmbeddedStorageManager storage = 
EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + // Search after reload + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + final List actualIds = new ArrayList<>(); + for (final VectorSearchResult.Entry entry : result) { + actualIds.add(entry.entityId()); + } + + // Results should match (or at least overlap significantly) + assertEquals(expectedIds.size(), actualIds.size()); + + // Verify all entities are accessible + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + } + } + } + + /** + * Test PQ-compressed disk index with DOT_PRODUCT similarity function. + */ + @Test + void testPqCompressionWithDotProduct(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.DOT_PRODUCT) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final float[] queryVector = randomVector(random, dimension); + final 
VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test PQ-compressed disk index with EUCLIDEAN similarity function. + */ + @Test + void testPqCompressionWithEuclidean(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.EUCLIDEAN) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test PQ compression with default subspaces (auto-calculated as dimension/4). 
+ */ + @Test + void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 128; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + // pqSubspaces not set - should default to dimension/4 = 32 + .build(); + + assertEquals(0, config.pqSubspaces(), + "pqSubspaces should be 0 (auto-calculated at runtime)"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + + assertEquals(10, result.size()); + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + } + + /** + * Test removing entities from a PQ-compressed disk index. + * Verifies that removed entities do not appear in search results. 
+ */ + @Test + void testPqCompressionWithRemoval(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Remove every other entity (even IDs) + for (int i = 0; i < vectorCount; i += 2) { + gigaMap.removeById(i); + } + + assertEquals(vectorCount / 2, gigaMap.size()); + + // Search should only return remaining entities + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + + for (final VectorSearchResult.Entry entry : result) { + final String content = entry.entity().content(); + final int docNum = Integer.parseInt(content.replace("doc_", "")); + assertTrue(docNum % 2 != 0, + "Only odd-numbered documents should remain, found: " + content); + } + } + + /** + * Test concurrent search with PQ compression enabled. + * Verifies thread safety of PQ-compressed search. 
+ */ + @Test + void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Exception + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Run concurrent searches + final int numSearches = 50; + final AtomicInteger successfulSearches = new AtomicInteger(0); + final AtomicBoolean hasError = new AtomicBoolean(false); + final CountDownLatch latch = new CountDownLatch(numSearches); + final ExecutorService executor = Executors.newFixedThreadPool(4); + + for (int i = 0; i < numSearches; i++) { + final float[] queryVector = randomVector(new Random(i), dimension); + executor.submit(() -> + { + try { + final VectorSearchResult result = index.search(queryVector, 10); + if (result.size() == 10) { + successfulSearches.incrementAndGet(); + } + } catch (final Exception e) { + hasError.set(true); + e.printStackTrace(); + } finally { + latch.countDown(); + } + }); + } + + assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); + executor.shutdown(); + + assertFalse(hasError.get(), "No errors should occur during concurrent PQ search"); + assertEquals(numSearches, successfulSearches.get(), + "All concurrent PQ searches should return expected results"); + } 
+ + /** + * Test adding vectors after PQ training. + * Verifies that search still works after adding more vectors post-training. + */ + @Test + void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) + { + final int initialCount = 500; + final int additionalCount = 200; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add initial vectors + addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); + + // Train PQ + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // Search before adding more + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult resultBefore = index.search(queryVector, 10); + assertEquals(10, resultBefore.size()); + + // Add more vectors after training + addRandomDocuments(gigaMap, random, dimension, additionalCount, "additional_"); + + assertEquals(initialCount + additionalCount, gigaMap.size()); + + // Search should still work and may include newly added vectors + final VectorSearchResult resultAfter = index.search(queryVector, 10); + assertEquals(10, resultAfter.size()); + + resultAfter.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test PQ-compressed disk index with multiple restarts. + * Verifies that search works correctly after persisting a PQ-compressed + * index to disk and reloading it across multiple restart cycles. 
+ */ + @Test + void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOException + { + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + final float[] queryVector = randomVector(new Random(999), dimension); + + // Phase 1: Create with 500 vectors and PQ, persist to disk + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + index.persistToDisk(); + + // Verify search works before restart + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + storage.storeRoot(); + } + } + + // Phase 2: Restart and verify search works from loaded disk index + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(500, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + assertTrue(index.isPqCompressionEnabled()); + + // Search should work after reload + final VectorSearchResult result = 
index.search(queryVector, 10); + assertEquals(10, result.size()); + + // Verify all entities are accessible + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("doc_"))); + + } + } + + // Phase 3: Second restart - verify search still works + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(500, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + final VectorSearchResult result = index.search(queryVector, 20); + assertEquals(20, result.size()); + } + } + } + + /** + * Test PQ-compressed disk index with removeAll and repopulation. + * Verifies the index can be cleared and rebuilt with PQ compression. + */ + @Test + void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) + { + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); + + vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Initial population + addRandomDocuments(gigaMap, random, dimension, 500, "old_"); + + assertEquals(500, gigaMap.size()); + + // Clear all + gigaMap.removeAll(); + assertEquals(0, gigaMap.size()); + + // Repopulate + addRandomDocuments(gigaMap, random, dimension, 600, "new_"); + + assertEquals(600, gigaMap.size()); + + final VectorIndices vectorIndicesAfter = 
gigaMap.index().get(VectorIndices.Category()); + final VectorIndex indexAfter = vectorIndicesAfter.get("embeddings"); + + // Train PQ on new data + ((VectorIndex.Internal) indexAfter).trainCompressionIfNeeded(); + + // Search should find only new documents + final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); + assertEquals(20, result.size()); + + result.forEach(entry -> assertTrue(entry.entity().content().startsWith("new_"))); + + } + + /** + * Test that in-memory index (default) still works as expected. + */ + @Test + void testInMemoryIndexStillWorks() + { + final int dimension = 32; + final Random random = new Random(42); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Default configuration (in-memory) + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertFalse(config.onDisk()); + assertNull(config.indexDirectory()); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + assertFalse(index.isOnDisk()); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, 100, "doc_"); + + // Search should work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } + + + // ======================================================================== + // Background Persistence Tests + // ======================================================================== + + /** + * Test that background persistence triggers after the configured interval. 
+ */ + @Test + void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval for testing + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(500) // 500ms for fast test + .minChangesBetweenPersists(1) // Persist on any change + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors to trigger dirty state + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); + + // Initially, files should not exist (not yet persisted) + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist immediately after adding"); + + // Wait for background persistence to trigger (interval + some buffer) + await() + .atMost(ofMillis(1500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after background persistence"), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after background persistence"))); + + } finally { + index.close(); + } + } + + /** + * Test that search works concurrently during background persistence. 
+ */ + @Test + void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 200; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(200) // Short interval to trigger during test + .minChangesBetweenPersists(1) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add initial vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Run concurrent searches while background persistence may be running + final int numSearches = 50; + final AtomicInteger successfulSearches = new AtomicInteger(0); + final AtomicBoolean hasError = new AtomicBoolean(false); + final CountDownLatch latch = new CountDownLatch(numSearches); + final ExecutorService executor = Executors.newFixedThreadPool(4); + + for (int i = 0; i < numSearches; i++) { + final float[] queryVector = randomVector(new Random(i), dimension); + executor.submit(() -> + { + try { + final VectorSearchResult result = index.search(queryVector, 10); + if (result.size() == 10) { + successfulSearches.incrementAndGet(); + } + } catch (final Exception e) { + hasError.set(true); + e.printStackTrace(); + } finally { + latch.countDown(); + } + }); + + // Small delay to spread searches over time + Thread.sleep(20); + } + + // Wait for all searches to complete + assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); + executor.shutdown(); + + // Verify all searches succeeded + 
assertFalse(hasError.get(), "No errors should occur during concurrent search"); + assertEquals(numSearches, successfulSearches.get(), + "All searches should return expected number of results"); + } finally { + index.close(); + } + } + + /** + * Test that shutdown persists pending changes when persistOnShutdown is true. + */ + @Test + void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenPersists(1) + .persistOnShutdown(true) // Should persist on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Files should not exist yet (interval hasn't triggered) + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist before close"); + + // Close the index (should trigger persist due to persistOnShutdown=true) + index.close(); + + // Files should now exist + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after close with persistOnShutdown=true"); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after close with persistOnShutdown=true"); + } + + /** + * Test that shutdown does NOT persist when persistOnShutdown is false. 
+ */ + @Test + void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenPersists(1) + .persistOnShutdown(false) // Should NOT persist on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Close the index (should NOT trigger persist) + index.close(); + + // Files should NOT exist + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist after close with persistOnShutdown=false"); + } + + /** + * Test debouncing: persistence is skipped when change count is below threshold. 
+ */ + @Test + void testDebouncing(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with high threshold that won't be met + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(100) // Short interval + .minChangesBetweenPersists(500) // High threshold + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add fewer vectors than the threshold + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold + + // Wait for multiple persistence intervals + Thread.sleep(500); + + // Files should NOT exist because change count is below threshold + assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should not exist when changes below threshold"); + + // Now add more vectors to exceed the threshold + IntStream.range(50, 600) // Total now 600 > 500 threshold + .forEach(i -> gigaMap.add(new Document("doc_" + i, randomVector(random, dimension)))); + + await() + .atMost(ofMillis(500)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist when changes exceed threshold")); + } finally { + index.close(); + } + } + + /** + * Test that adding vectors in bulk correctly tracks change count. 
+ */ + @Test + void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(300) + .minChangesBetweenPersists(100) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Bulk add documents + final List documents = new ArrayList<>(); + for (int i = 0; i < 150; i++) { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + gigaMap.addAll(documents); + + // Wait for persistence + await() + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after bulk add exceeds threshold")); + } finally { + index.close(); + } + } + + /** + * Test that background persistence can be reloaded after restart. 
+ */ + @Test + void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 200; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + final Path storageDir = tempDir.resolve("storage"); + + final float[] queryVector = randomVector(new Random(999), dimension); + final int expectedK = 10; + + // Phase 1: Create index with background persistence and add vectors + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(100) + .minChangesBetweenPersists(1) + .persistOnShutdown(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Verify search works + final VectorSearchResult result = index.search(queryVector, expectedK); + assertEquals(expectedK, result.size()); + + storage.storeRoot(); + + // Explicitly close the index to trigger persistOnShutdown + // (EmbeddedStorageManager doesn't auto-close VectorIndex) + index.close(); + } + } + + // Verify files were persisted + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after close"); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after close"); + + // Phase 2: Reload and verify + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) 
storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk(), "Index should be on-disk after reload"); + + // Search should still work after reload + final VectorSearchResult result = index.search(queryVector, expectedK); + assertEquals(expectedK, result.size()); + + // Clean up + index.close(); + } + } + } + + /** + * Test that manual persistToDisk still works with background persistence enabled. + */ + @Test + void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger + .minChangesBetweenPersists(1000) // High threshold - won't trigger + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Files should not exist yet + assertFalse(Files.exists(indexDir.resolve("embeddings.graph"))); + + // Manually trigger persistence + index.persistToDisk(); + + // Files should now exist + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist after manual persistToDisk"); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), + "Meta file should exist after manual persistToDisk"); + } finally { + 
index.close(); + } + } + + + // ======================================================================== + // Background Optimization Tests + // ======================================================================== + + /** + * Test that background optimization runs after the configured interval and threshold. + */ + @Test + void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with short interval and low threshold for testing + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(300) // 300ms for fast test + .minChangesBetweenOptimizations(10) // Low threshold + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Initially, optimization count should be 0 + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization count should be 0 initially"); + + // Add vectors to trigger dirty state above threshold + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); + + // Verify pending changes are tracked + assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, + "Pending changes should be tracked"); + + // Verify optimization was actually performed + await() + .atLeast(ofMillis(300)) + .atMost(ofMillis(800)) + .pollInterval(ofMillis(100)) + .untilAsserted(() -> assertTrue(defaultIndex.backgroundTaskManager.getOptimizationCount() >= 1, + "Optimization 
should have been performed at least once")); + + // Verify pending changes were reset + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should be reset after optimization"); + + // Verify search still works + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that optimization is skipped when change count is below threshold. + */ + @Test + void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Configure with high threshold that won't be met + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(200) // Short interval + .minChangesBetweenOptimizations(500) // High threshold + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Add fewer vectors than the threshold + addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold + + // Verify pending changes are tracked + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should be 50"); + + // Wait for multiple optimization intervals + Thread.sleep(600); + + // Verify optimization was NOT performed (below threshold) + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization should NOT have been performed 
(below threshold)"); + + // Verify pending changes are still tracked (not reset) + assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should still be 50 (not reset)"); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that shutdown optimizes pending changes when optimizeOnShutdown is true. + */ + @Test + void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenOptimizations(1) + .optimizeOnShutdown(true) // Should optimize on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Verify pending changes are tracked + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should equal vector count"); + + // Verify no optimization has run yet + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization count should be 0 before close"); + + // Verify search works before close + final VectorSearchResult resultBefore = 
index.search(randomVector(random, dimension), 10); + assertEquals(10, resultBefore.size()); + + // Close the index (should trigger optimize due to optimizeOnShutdown=true) + index.close(); + + // Note: After close(), we can't verify the count changed because the manager is shutdown. + // But we verified above that pending changes existed and the interval hadn't triggered. + // The fact that close() completed without error indicates optimization was attempted. + } + + /** + * Test that shutdown does NOT optimize when optimizeOnShutdown is false. + */ + @Test + void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenOptimizations(1) + .optimizeOnShutdown(false) // Should NOT optimize on close + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Verify pending changes are tracked + assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), + "Pending changes should equal vector count"); + + // Verify no optimization has run yet + assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), + "Optimization count should be 0 before close"); + + // Close the index 
(should NOT trigger optimize) + index.close(); + + // Note: After close(), we can't access the manager. But we verified: + // 1. Pending changes existed + // 2. No background optimization had run + // 3. optimizeOnShutdown=false was set + // So the pending changes should remain unoptimized. + } + + /** + * Test that search works concurrently during background optimization. + */ + @Test + void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 200; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(150) // Short interval to trigger during test + .minChangesBetweenOptimizations(1) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add initial vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Run concurrent searches while background optimization may be running + final int numSearches = 50; + final AtomicInteger successfulSearches = new AtomicInteger(0); + final AtomicBoolean hasError = new AtomicBoolean(false); + final CountDownLatch latch = new CountDownLatch(numSearches); + final ExecutorService executor = Executors.newFixedThreadPool(4); + + for (int i = 0; i < numSearches; i++) { + final float[] queryVector = randomVector(new Random(i), dimension); + executor.submit(() -> + { + try { + final VectorSearchResult result = index.search(queryVector, 10); + if (result.size() == 10) { + successfulSearches.incrementAndGet(); + } + } catch (final Exception 
e) { + hasError.set(true); + e.printStackTrace(); + } finally { + latch.countDown(); + } + }); + + // Small delay to spread searches over time + Thread.sleep(15); + } + + // Wait for all searches to complete + assertTrue(latch.await(30, TimeUnit.SECONDS), "Searches should complete within timeout"); + executor.shutdown(); + + // Verify all searches succeeded + assertFalse(hasError.get(), "No errors should occur during concurrent search with optimization"); + assertEquals(numSearches, successfulSearches.get(), + "All searches should return expected number of results"); + } finally { + index.close(); + } + } + + /** + * Test that bulk add correctly tracks change count for optimization. + */ + @Test + void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(100) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Bulk add documents that exceeds the threshold + final List documents = new ArrayList<>(); + for (int i = 0; i < 150; i++) { + documents.add(new Document("doc_" + i, randomVector(random, dimension))); + } + gigaMap.addAll(documents); + + // Wait for optimization + Thread.sleep(500); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that manual optimize() method still 
works with background optimization enabled. + */ + @Test + void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 100; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger + .minChangesBetweenOptimizations(1000) // High threshold - won't trigger + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Manually trigger optimization + index.optimize(); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + } finally { + index.close(); + } + } + + /** + * Test that both background persistence and optimization can be enabled together. 
+ */ + @Test + void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDir) throws Exception + { + final int dimension = 32; + final int vectorCount = 150; + final Random random = new Random(42); + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + // Enable both background persistence and optimization + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(300) + .minChangesBetweenPersists(10) + .persistOnShutdown(true) + .optimizationIntervalMs(400) + .minChangesBetweenOptimizations(10) + .optimizeOnShutdown(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new ComputedDocumentVectorizer() + ); + + try { + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Wait for both background tasks to run + Thread.sleep(1000); + + // Search should still work + final VectorSearchResult result = index.search(randomVector(random, dimension), 10); + assertEquals(10, result.size()); + + // Files should exist from background persistence + assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), + "Graph file should exist from background persistence"); + } finally { + index.close(); + } + } + + + // ======================================================================== + // Parallel vs Non-Parallel On-Disk Write Tests + // ======================================================================== + + + /** + * Test that parallel and non-parallel on-disk writes both support persist-and-reload + * for a large PQ-compressed index. + * Verifies that the graph files produced by both modes can be loaded correctly + * and yield equivalent search results after restart. 
+ */ + @Test + void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int pqSubspaces = 16; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel-index"); + final Path parallelStorageDir = tempDir.resolve("parallel-storage"); + final Path sequentialIndexDir = tempDir.resolve("sequential-index"); + final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); + + // --- Build and persist both modes --- + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, parallelIndexDir, parallelStorageDir, true); + buildAndPersistIndex(vectors, queryVector, dimension, pqSubspaces, sequentialIndexDir, sequentialStorageDir, false); + + // --- Reload both and compare search results --- + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new 
ArrayList<>(); + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) { + @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); + + assertEquals(vectorCount, gigaMap.size()); + + final VectorIndex index = vectorIndices.get("embeddings"); + assertTrue(index.isOnDisk()); + + final VectorSearchResult result = index.search(queryVector, k); + assertEquals(k, result.size()); + for (final VectorSearchResult.Entry entry : result) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + assertNotNull(entry.entity()); + } + } + } + + // Both modes should produce equivalent results after reload + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential modes should produce identical search results after reload"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential modes should produce identical search scores after reload"); + } + + /** + * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
+ */ + private void buildAndPersistIndex( + final List vectors, + final float[] queryVector, + final int dimension, + final int pqSubspaces, + final Path indexDir, + final Path storageDir, + final boolean parallel + ) throws IOException + { + try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + final GigaMap gigaMap = GigaMap.New(); + storage.setRoot(gigaMap); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(parallel) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", config, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + index.persistToDisk(); + + assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); + assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); + + storage.storeRoot(); + } + } + + + // ======================================================================== + // Embedded Vectorizer + On-Disk Tests + // ======================================================================== + + /** + * Test that an embedded vectorizer with parallel on-disk write completes without deadlock. + *
<p>
+ * This is a regression test for a deadlock where {@code persistToDisk()} held + * {@code synchronized(parentMap)} for the entire disk write. The disk writer uses + * internal worker threads (ForkJoinPool for PQ encoding, parallel graph writer) + * that call {@code parentMap.get()} — which also synchronizes on the same monitor. + *
<p>
+ * The fix restructures locking: Phase 1 (prep) runs inside {@code synchronized(parentMap)}, + * Phase 2 (disk write) runs outside it but still holds {@code persistenceLock.writeLock()}. + *
<p>
+ * Uses {@code @Timeout} to fail fast if a deadlock occurs instead of hanging indefinitely. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); + + // Verify search still works after persist + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + + result.forEach(entry -> assertNotNull(entry.entity())); + } + + /** + * Test that an embedded vectorizer with PQ compression and parallel on-disk write + * completes without deadlock. + *
<p>
+ * This is the most deadlock-prone scenario: FusedPQ encoding uses a ForkJoinPool + * that calls {@code getVector()} on worker threads, plus the parallel graph writer + * also calls {@code getVector()} from its own thread pool. + */ + @Test + @Timeout(value = 60, unit = TimeUnit.SECONDS) + void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path tempDir) + { + final int vectorCount = 500; + final int dimension = 64; + final int pqSubspaces = 16; + final Random random = new Random(42); + + final Path indexDir = tempDir.resolve("index"); + + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(true) + .build(); + + final VectorIndex index = vectorIndices.add( + "embeddings", + config, + new EmbeddedDocumentVectorizer() + ); + + // Add vectors + addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); + + // Train PQ compression + ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + + // This would deadlock before the fix + index.persistToDisk(); + + // Verify files were created + assertAll( + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))) + ); + + // Verify search still works + final float[] queryVector = randomVector(random, dimension); + final VectorSearchResult result = index.search(queryVector, 10); + assertEquals(10, result.size()); + } + + /** + * Test that parallel and non-parallel on-disk writes produce equivalent search results + * for a large index without PQ compression. + * Both modes should produce identical graph files that yield the same search quality. 
+ */ + @Test + void testParallelVsSequentialOnDiskWrite(@TempDir final Path tempDir) throws IOException + { + final int vectorCount = 2000; + final int dimension = 64; + final int k = 20; + final Random random = new Random(42); + + // Generate shared vectors and query + final List vectors = new ArrayList<>(); + for (int i = 0; i < vectorCount; i++) { + vectors.add(randomVector(random, dimension)); + } + final float[] queryVector = randomVector(new Random(999), dimension); + + final Path parallelIndexDir = tempDir.resolve("parallel"); + final Path sequentialIndexDir = tempDir.resolve("sequential"); + + final List parallelIds = new ArrayList<>(); + final List parallelScores = new ArrayList<>(); + final List sequentialIds = new ArrayList<>(); + final List sequentialScores = new ArrayList<>(); + + // --- Parallel config + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration configParallel = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .beamWidth(100) + .onDisk(true) + .indexDirectory(parallelIndexDir) + .parallelOnDiskWrite(true) + .build(); + + // --- Sequential config + final VectorIndex index = vectorIndices.add( + "embeddings", configParallel, new ComputedDocumentVectorizer() + ); + + final VectorIndexConfiguration configSequential = VectorIndexConfiguration.builder() + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(16) + .enablePqCompression(true) + .beamWidth(100) + .onDisk(true) + .indexDirectory(sequentialIndexDir) + .parallelOnDiskWrite(false) + .build(); + + final VectorIndex indexSequential = vectorIndices.add( + "embeddingsSequential", configSequential, new ComputedDocumentVectorizer() + ); + + addDocumentsFromVectors(gigaMap, vectors, "doc_"); + + index.persistToDisk(); + indexSequential.persistToDisk(); + + //parallel 
+ final VectorSearchResult result = index.search(queryVector, k); + for (final VectorSearchResult.Entry entry : result) { + parallelIds.add(entry.entityId()); + parallelScores.add(entry.score()); + } + + //sequential + final VectorSearchResult resultSequential = indexSequential.search(queryVector, k); + for (final VectorSearchResult.Entry entry : resultSequential) { + sequentialIds.add(entry.entityId()); + sequentialScores.add(entry.score()); + } + + assertAll( + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.graph"))), + () -> assertTrue(Files.exists(parallelIndexDir.resolve("embeddings.meta"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.graph"))), + () -> assertTrue(Files.exists(sequentialIndexDir.resolve("embeddingsSequential.meta"))) + ); + + // Both indices were built from the same data with the same HNSW parameters, + // so search results must be identical. + assertEquals(parallelIds, sequentialIds, + "Parallel and sequential on-disk writes should produce identical search results"); + assertEquals(parallelScores, sequentialScores, + "Parallel and sequential on-disk writes should produce identical search scores"); + } } From a83a6b521c2b4aedb85a531d295043333bf2ac4e Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Fri, 20 Feb 2026 19:33:42 +0100 Subject: [PATCH 5/7] reverts original formats --- .../gigamap/jvector/VectorIndexDiskTest.java | 1049 +++++++++-------- 1 file changed, 573 insertions(+), 476 deletions(-) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java index df8bcac4..3d94f254 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexDiskTest.java @@ -47,9 +47,7 @@ class VectorIndexDiskTest /** * Simple entity with an embedding 
vector. */ - record Document(String content, float[] embedding) - { - } + record Document(String content, float[] embedding) {} /** * Computed vectorizer - simulates externally computed vectors. @@ -88,12 +86,14 @@ private static float[] randomVector(final Random random, final int dimension) { final float[] vector = new float[dimension]; float norm = 0; - for (int i = 0; i < dimension; i++) { + for(int i = 0; i < dimension; i++) + { vector[i] = random.nextFloat() * 2 - 1; norm += vector[i] * vector[i]; } - norm = (float) Math.sqrt(norm); - for (int i = 0; i < dimension; i++) { + norm = (float)Math.sqrt(norm); + for(int i = 0; i < dimension; i++) + { vector[i] /= norm; } return vector; @@ -143,7 +143,8 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I // Generate vectors final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { + for(int i = 0; i < vectorCount; i++) + { vectors.add(randomVector(random, dimension)); } @@ -152,22 +153,23 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I // Phase 1: Create index and persist { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { final GigaMap gigaMap = GigaMap.New(); storage.setRoot(gigaMap); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); assertTrue(index.isOnDisk()); @@ 
-178,7 +180,8 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I // Search and record expected results final VectorSearchResult result = index.search(queryVector, 10); - for (final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { expectedIds.add(entry.entityId()); } @@ -195,8 +198,10 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I // Phase 2: Reload and verify { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(vectorCount, gigaMap.size()); @@ -207,7 +212,8 @@ void testOnDiskIndexCreationAndPersistence(@TempDir final Path tempDir) throws I // Search and compare results final VectorSearchResult result = index.search(queryVector, 10); final List actualIds = new ArrayList<>(); - for (final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { actualIds.add(entry.entityId()); } @@ -234,18 +240,18 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex 
index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); assertTrue(index.isOnDisk()); @@ -255,7 +261,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Train compression - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); // Search should work final float[] queryVector = randomVector(random, dimension); @@ -273,7 +279,7 @@ void testOnDiskIndexWithCompression(@TempDir final Path tempDir) throws IOExcept assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); assertTrue(Files.exists(indexDir.resolve("embeddings.meta"))); assertFalse(Files.exists(indexDir.resolve("embeddings.pq")), - "FusedPQ should be embedded in graph file, not in separate .pq file"); + "FusedPQ should be embedded in graph file, not in separate .pq file"); } /** @@ -292,16 +298,16 @@ void testOnDiskSearchQuality(@TempDir final Path tempDir) throws IOException final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Add random vectors @@ -340,17 +346,18 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep // Phase 1: Create with 100 vectors { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + try(final 
EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { final GigaMap gigaMap = GigaMap.New(); storage.setRoot(gigaMap); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .build(); vectorIndices.add("embeddings", config, new ComputedDocumentVectorizer()); @@ -363,8 +370,10 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep // Phase 2: Restart and add 50 more vectors { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(100, gigaMap.size()); @@ -383,12 +392,15 @@ void testOnDiskIndexMultipleRestarts(@TempDir final Path tempDir) throws IOExcep // Phase 3: Final verification { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); - VectorIndex index = gigaMap.index().get(VectorIndices.Category()).get("embeddings"); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); + final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(150, gigaMap.size()); + final VectorIndex index = vectorIndices.get("embeddings"); final 
VectorSearchResult result = index.search(randomVector(random, dimension), 30); assertEquals(30, result.size()); } @@ -418,18 +430,18 @@ void testPqCompressionSearchQuality(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Add random vectors @@ -443,7 +455,7 @@ void testPqCompressionSearchQuality(@TempDir final Path tempDir) gigaMap.add(new Document("needle", needleVector)); // Train PQ compression - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); // Search for the needle vector - it should be in the top results final VectorSearchResult result = index.search(needleVector, 5); @@ -451,13 +463,14 @@ void testPqCompressionSearchQuality(@TempDir final Path tempDir) assertEquals(5, result.size()); final VectorSearchResult.Entry firstResult = result.iterator().next(); assertEquals("needle", firstResult.entity().content(), - "Exact match should be first result even with PQ compression"); + "Exact match should be first result even with PQ compression"); assertTrue(firstResult.score() > 0.99f, - "Exact match should have score close to 1.0"); + "Exact match should have score close to 1.0"); // Verify results are ordered by score float prevScore = Float.MAX_VALUE; - for (final VectorSearchResult.Entry entry : result) { + 
for(final VectorSearchResult.Entry entry : result) + { assertTrue(entry.score() <= prevScore, "Results should be ordered by score"); prevScore = entry.score(); } @@ -480,7 +493,8 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc final Path storageDir = tempDir.resolve("storage"); final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { + for(int i = 0; i < vectorCount; i++) + { vectors.add(randomVector(random, dimension)); } @@ -489,24 +503,25 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc // Phase 1: Create index with PQ, populate, search, persist { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { final GigaMap gigaMap = GigaMap.New(); storage.setRoot(gigaMap); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); assertTrue(index.isOnDisk()); @@ -515,10 +530,11 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc addDocumentsFromVectors(gigaMap, vectors, "doc_"); // Train and search - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); final VectorSearchResult result = index.search(queryVector, 10); - for 
(final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { expectedIds.add(entry.entityId()); } @@ -533,8 +549,10 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc // Phase 2: Reload and verify search results { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(vectorCount, gigaMap.size()); @@ -548,7 +566,8 @@ void testPqCompressionPersistAndReload(@TempDir final Path tempDir) throws IOExc assertEquals(10, result.size()); final List actualIds = new ArrayList<>(); - for (final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { actualIds.add(entry.entityId()); } @@ -578,23 +597,23 @@ void testPqCompressionWithDotProduct(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.DOT_PRODUCT) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.DOT_PRODUCT) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - ((VectorIndex.Internal) 
index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); @@ -620,23 +639,23 @@ void testPqCompressionWithEuclidean(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.EUCLIDEAN) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.EUCLIDEAN) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); @@ -658,29 +677,29 @@ void testPqCompressionWithDefaultSubspaces(@TempDir final Path tempDir) final Path indexDir = tempDir.resolve("index"); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - // pqSubspaces not set - should default to dimension/4 = 32 - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + // pqSubspaces not set - should default to dimension/4 
= 32 + .build(); assertEquals(0, config.pqSubspaces(), - "pqSubspaces should be 0 (auto-calculated at runtime)"); + "pqSubspaces should be 0 (auto-calculated at runtime)"); final GigaMap gigaMap = GigaMap.New(); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); final float[] queryVector = randomVector(random, dimension); final VectorSearchResult result = index.search(queryVector, 10); @@ -707,26 +726,27 @@ void testPqCompressionWithRemoval(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); // Remove every other entity (even IDs) - for (int i = 0; i < vectorCount; i += 2) { + for(int i = 0; i < vectorCount; i += 2) + { gigaMap.removeById(i); } @@ -736,11 +756,13 @@ void testPqCompressionWithRemoval(@TempDir 
final Path tempDir) final VectorSearchResult result = index.search(randomVector(random, dimension), 10); assertEquals(10, result.size()); - for (final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { + assertNotNull(entry.entity()); final String content = entry.entity().content(); final int docNum = Integer.parseInt(content.replace("doc_", "")); assertTrue(docNum % 2 != 0, - "Only odd-numbered documents should remain, found: " + content); + "Only odd-numbered documents should remain, found: " + content); } } @@ -762,23 +784,23 @@ void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Excep final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); // Run concurrent searches final int numSearches = 50; @@ -787,19 +809,26 @@ void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Excep final CountDownLatch latch = new CountDownLatch(numSearches); final ExecutorService executor = Executors.newFixedThreadPool(4); - for (int i = 0; i < numSearches; i++) { + for(int i = 0; i < numSearches; i++) + { final float[] queryVector = randomVector(new 
Random(i), dimension); executor.submit(() -> { - try { + try + { final VectorSearchResult result = index.search(queryVector, 10); - if (result.size() == 10) { + if(result.size() == 10) + { successfulSearches.incrementAndGet(); } - } catch (final Exception e) { + } + catch(final Exception e) + { hasError.set(true); e.printStackTrace(); - } finally { + } + finally + { latch.countDown(); } }); @@ -810,7 +839,7 @@ void testPqCompressionConcurrentSearch(@TempDir final Path tempDir) throws Excep assertFalse(hasError.get(), "No errors should occur during concurrent PQ search"); assertEquals(numSearches, successfulSearches.get(), - "All concurrent PQ searches should return expected results"); + "All concurrent PQ searches should return expected results"); } /** @@ -832,25 +861,25 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Add initial vectors addRandomDocuments(gigaMap, random, dimension, initialCount, "initial_"); // Train PQ - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); // Search before adding more final float[] queryVector = randomVector(random, dimension); @@ -866,7 +895,10 @@ void testPqCompressionAddAfterTraining(@TempDir final Path tempDir) final 
VectorSearchResult resultAfter = index.search(queryVector, 10); assertEquals(10, resultAfter.size()); - resultAfter.forEach(entry -> assertNotNull(entry.entity())); + for(final VectorSearchResult.Entry entry : resultAfter) + { + assertNotNull(entry.entity()); + } } /** @@ -888,29 +920,30 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc // Phase 1: Create with 500 vectors and PQ, persist to disk { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { final GigaMap gigaMap = GigaMap.New(); storage.setRoot(gigaMap); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, 500, "doc_"); - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); // Verify search works before restart @@ -923,8 +956,10 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc // Phase 2: Restart and verify search works from loaded disk index { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager 
storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(500, gigaMap.size()); @@ -945,8 +980,10 @@ void testPqCompressionMultipleRestarts(@TempDir final Path tempDir) throws IOExc // Phase 3: Second restart - verify search still works { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(500, gigaMap.size()); @@ -975,18 +1012,18 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .build(); vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Initial population @@ -1007,7 +1044,7 @@ void testPqCompressionRemoveAllAndRepopulate(@TempDir final Path tempDir) final VectorIndex indexAfter = vectorIndicesAfter.get("embeddings"); // Train PQ on new data - ((VectorIndex.Internal) indexAfter).trainCompressionIfNeeded(); + ((VectorIndex.Internal)indexAfter).trainCompressionIfNeeded(); // 
Search should find only new documents final VectorSearchResult result = indexAfter.search(randomVector(random, dimension), 20); @@ -1031,17 +1068,17 @@ void testInMemoryIndexStillWorks() // Default configuration (in-memory) final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); assertFalse(config.onDisk()); assertNull(config.indexDirectory()); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); assertFalse(index.isOnDisk()); @@ -1074,27 +1111,28 @@ void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) // Configure with short interval for testing final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(500) // 500ms for fast test - .minChangesBetweenPersists(1) // Persist on any change - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(500) // 500ms for fast test + .minChangesBetweenPersists(1) // Persist on any change + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Add vectors to trigger dirty state addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Initially, files should not exist (not yet persisted) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist immediately after adding"); + "Graph file should not exist immediately after adding"); // Wait 
for background persistence to trigger (interval + some buffer) await() @@ -1106,7 +1144,9 @@ void testBackgroundPersistenceTriggersAfterInterval(@TempDir final Path tempDir) () -> assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), "Meta file should exist after background persistence"))); - } finally { + } + finally + { index.close(); } } @@ -1126,21 +1166,22 @@ void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(200) // Short interval to trigger during test - .minChangesBetweenPersists(1) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(200) // Short interval to trigger during test + .minChangesBetweenPersists(1) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Add initial vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); @@ -1151,19 +1192,26 @@ void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir final CountDownLatch latch = new CountDownLatch(numSearches); final ExecutorService executor = Executors.newFixedThreadPool(4); - for (int i = 0; i < numSearches; i++) { + for(int i = 0; i < numSearches; i++) + { final float[] queryVector = randomVector(new Random(i), dimension); executor.submit(() -> { - try { + try + { final VectorSearchResult result = index.search(queryVector, 10); - if (result.size() == 10) { + if(result.size() == 10) + { successfulSearches.incrementAndGet(); } - } catch (final Exception e) { 
+ } + catch(final Exception e) + { hasError.set(true); e.printStackTrace(); - } finally { + } + finally + { latch.countDown(); } }); @@ -1179,8 +1227,10 @@ void testConcurrentSearchDuringBackgroundPersistence(@TempDir final Path tempDir // Verify all searches succeeded assertFalse(hasError.get(), "No errors should occur during concurrent search"); assertEquals(numSearches, successfulSearches.get(), - "All searches should return expected number of results"); - } finally { + "All searches should return expected number of results"); + } + finally + { index.close(); } } @@ -1200,19 +1250,19 @@ void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exce final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenPersists(1) - .persistOnShutdown(true) // Should persist on close - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenPersists(1) + .persistOnShutdown(true) // Should persist on close + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Add vectors @@ -1220,16 +1270,16 @@ void testShutdownPersistsPendingChanges(@TempDir final Path tempDir) throws Exce // Files should not exist yet (interval hasn't triggered) assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist before close"); + "Graph file should not exist before close"); // Close the index (should trigger persist due to 
persistOnShutdown=true) index.close(); // Files should now exist assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after close with persistOnShutdown=true"); + "Graph file should exist after close with persistOnShutdown=true"); assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after close with persistOnShutdown=true"); + "Meta file should exist after close with persistOnShutdown=true"); } /** @@ -1247,19 +1297,19 @@ void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Ex final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenPersists(1) - .persistOnShutdown(false) // Should NOT persist on close - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenPersists(1) + .persistOnShutdown(false) // Should NOT persist on close + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Add vectors @@ -1270,7 +1320,7 @@ void testShutdownSkipsPersistWhenDisabled(@TempDir final Path tempDir) throws Ex // Files should NOT exist assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist after close with persistOnShutdown=false"); + "Graph file should not exist after close with persistOnShutdown=false"); } /** @@ -1288,21 +1338,22 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception // Configure with high 
threshold that won't be met final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(100) // Short interval - .minChangesBetweenPersists(500) // High threshold - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(200) // Short interval + .minChangesBetweenPersists(500) // High threshold + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Add fewer vectors than the threshold addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // 50 < 500 threshold @@ -1311,18 +1362,22 @@ void testDebouncing(@TempDir final Path tempDir) throws Exception // Files should NOT exist because change count is below threshold assertFalse(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should not exist when changes below threshold"); + "Graph file should not exist when changes below threshold"); // Now add more vectors to exceed the threshold - IntStream.range(50, 600) // Total now 600 > 500 threshold - .forEach(i -> gigaMap.add(new Document("doc_" + i, randomVector(random, dimension)))); + for(int i = 50; i < 600; i++) // Total now 600 > 500 threshold + { + gigaMap.add(new Document("doc_" + i, randomVector(random, dimension))); + } await() .atMost(ofMillis(500)) .pollInterval(ofMillis(100)) .untilAsserted(() -> assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), "Graph file should exist when changes exceed threshold")); - } finally { + } + finally + { index.close(); } } @@ -1341,24 +1396,26 @@ void testBulkAddTracksChangeCount(@TempDir final Path tempDir) throws Exception final VectorIndices vectorIndices = 
gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(300) - .minChangesBetweenPersists(100) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(300) + .minChangesBetweenPersists(100) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Bulk add documents final List documents = new ArrayList<>(); - for (int i = 0; i < 150; i++) { + for(int i = 0; i < 150; i++) + { documents.add(new Document("doc_" + i, randomVector(random, dimension))); } gigaMap.addAll(documents); @@ -1391,25 +1448,26 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex // Phase 1: Create index with background persistence and add vectors { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { final GigaMap gigaMap = GigaMap.New(); storage.setRoot(gigaMap); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(100) - .minChangesBetweenPersists(1) - .persistOnShutdown(true) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(100) + .minChangesBetweenPersists(1) + .persistOnShutdown(true) + .build(); final VectorIndex index = 
vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); // Add vectors @@ -1429,14 +1487,16 @@ void testBackgroundPersistenceWithRestart(@TempDir final Path tempDir) throws Ex // Verify files were persisted assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after close"); + "Graph file should exist after close"); assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after close"); + "Meta file should exist after close"); // Phase 2: Reload and verify { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(vectorCount, gigaMap.size()); @@ -1469,21 +1529,22 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(60_000) // Long interval - won't trigger - .minChangesBetweenPersists(1000) // High threshold - won't trigger - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(60_000) // Long interval - won't trigger + .minChangesBetweenPersists(1000) // High threshold - won't trigger + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + 
"embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Add vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); @@ -1495,10 +1556,12 @@ void testManualPersistWithBackgroundPersistenceEnabled(@TempDir final Path tempD // Files should now exist assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist after manual persistToDisk"); + "Graph file should exist after manual persistToDisk"); assertTrue(Files.exists(indexDir.resolve("embeddings.meta")), - "Meta file should exist after manual persistToDisk"); - } finally { + "Meta file should exist after manual persistToDisk"); + } + finally + { index.close(); } } @@ -1523,33 +1586,34 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final // Configure with short interval and low threshold for testing final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) // 300ms for fast test - .minChangesBetweenOptimizations(10) // Low threshold - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(300) // 300ms for fast test + .minChangesBetweenOptimizations(10) // Low threshold + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + try + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Initially, optimization count should be 0 assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 initially"); + "Optimization count should be 0 initially"); // Add vectors to trigger 
dirty state above threshold addRandomDocuments(gigaMap, random, dimension, 50, "doc_"); // Verify pending changes are tracked assertTrue(defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount() > 0, - "Pending changes should be tracked"); + "Pending changes should be tracked"); // Verify optimization was actually performed await() @@ -1561,12 +1625,14 @@ void testBackgroundOptimizationTriggersAfterIntervalAndThreshold(@TempDir final // Verify pending changes were reset assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should be reset after optimization"); + "Pending changes should be reset after optimization"); // Verify search still works final VectorSearchResult result = index.search(randomVector(random, dimension), 10); assertEquals(10, result.size()); - } finally { + } + finally + { index.close(); } } @@ -1586,45 +1652,48 @@ void testOptimizationDebouncingBelowThreshold(@TempDir final Path tempDir) throw // Configure with high threshold that won't be met final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(200) // Short interval - .minChangesBetweenOptimizations(500) // High threshold - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(200) // Short interval + .minChangesBetweenOptimizations(500) // High threshold + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + try + { + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add fewer vectors than the threshold addRandomDocuments(gigaMap, 
random, dimension, 50, "doc_"); // 50 < 500 threshold // Verify pending changes are tracked assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should be 50"); + "Pending changes should be 50"); // Wait for multiple optimization intervals Thread.sleep(600); // Verify optimization was NOT performed (below threshold) assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization should NOT have been performed (below threshold)"); + "Optimization should NOT have been performed (below threshold)"); // Verify pending changes are still tracked (not reset) assertEquals(50, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should still be 50 (not reset)"); + "Pending changes should still be 50 (not reset)"); // Search should still work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); assertEquals(10, result.size()); - } finally { + } + finally + { index.close(); } } @@ -1644,32 +1713,32 @@ void testShutdownOptimizesPendingChanges(@TempDir final Path tempDir) throws Exc final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenOptimizations(1) - .optimizeOnShutdown(true) // Should optimize on close - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenOptimizations(1) + .optimizeOnShutdown(true) // Should optimize on close + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new 
ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - final VectorIndex.Default defaultIndex = (VectorIndex.Default) index; + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should equal vector count"); + "Pending changes should equal vector count"); // Verify no optimization has run yet assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 before close"); + "Optimization count should be 0 before close"); // Verify search works before close final VectorSearchResult resultBefore = index.search(randomVector(random, dimension), 10); @@ -1698,33 +1767,33 @@ void testShutdownSkipsOptimizeWhenDisabled(@TempDir final Path tempDir) throws E final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger during test - .minChangesBetweenOptimizations(1) - .optimizeOnShutdown(false) // Should NOT optimize on close - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger during test + .minChangesBetweenOptimizations(1) + .optimizeOnShutdown(false) // Should NOT optimize on close + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - final VectorIndex.Default defaultIndex = 
(VectorIndex.Default) index; + final VectorIndex.Default defaultIndex = (VectorIndex.Default)index; // Add vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Verify pending changes are tracked assertEquals(vectorCount, defaultIndex.backgroundTaskManager.getOptimizationPendingChangeCount(), - "Pending changes should equal vector count"); + "Pending changes should equal vector count"); // Verify no optimization has run yet assertEquals(0, defaultIndex.backgroundTaskManager.getOptimizationCount(), - "Optimization count should be 0 before close"); + "Optimization count should be 0 before close"); // Close the index (should NOT trigger optimize) index.close(); @@ -1751,21 +1820,22 @@ void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDi final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(150) // Short interval to trigger during test - .minChangesBetweenOptimizations(1) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(150) // Short interval to trigger during test + .minChangesBetweenOptimizations(1) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Add initial vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); @@ -1776,19 +1846,26 @@ void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDi final CountDownLatch latch = new CountDownLatch(numSearches); final ExecutorService executor = Executors.newFixedThreadPool(4); - for (int i = 0; i < numSearches; 
i++) { + for(int i = 0; i < numSearches; i++) + { final float[] queryVector = randomVector(new Random(i), dimension); executor.submit(() -> { - try { + try + { final VectorSearchResult result = index.search(queryVector, 10); - if (result.size() == 10) { + if(result.size() == 10) + { successfulSearches.incrementAndGet(); } - } catch (final Exception e) { + } + catch(final Exception e) + { hasError.set(true); e.printStackTrace(); - } finally { + } + finally + { latch.countDown(); } }); @@ -1804,8 +1881,10 @@ void testConcurrentSearchDuringBackgroundOptimization(@TempDir final Path tempDi // Verify all searches succeeded assertFalse(hasError.get(), "No errors should occur during concurrent search with optimization"); assertEquals(numSearches, successfulSearches.get(), - "All searches should return expected number of results"); - } finally { + "All searches should return expected number of results"); + } + finally + { index.close(); } } @@ -1824,24 +1903,26 @@ void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) th final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(300) - .minChangesBetweenOptimizations(100) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(300) + .minChangesBetweenOptimizations(100) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Bulk add documents that exceeds the threshold final List documents = new ArrayList<>(); - for (int i = 0; i < 150; i++) { + for(int i = 0; i < 150; i++) + { documents.add(new 
Document("doc_" + i, randomVector(random, dimension))); } gigaMap.addAll(documents); @@ -1852,7 +1933,9 @@ void testBulkAddTracksChangeCountForOptimization(@TempDir final Path tempDir) th // Search should still work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); assertEquals(10, result.size()); - } finally { + } + finally + { index.close(); } } @@ -1872,21 +1955,22 @@ void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tem final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .optimizationIntervalMs(60_000) // Long interval - won't trigger - .minChangesBetweenOptimizations(1000) // High threshold - won't trigger - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .optimizationIntervalMs(60_000) // Long interval - won't trigger + .minChangesBetweenOptimizations(1000) // High threshold - won't trigger + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new ComputedDocumentVectorizer() + "embeddings", + config, + new ComputedDocumentVectorizer() ); - try { + try + { // Add vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); @@ -1896,7 +1980,9 @@ void testManualOptimizeWithBackgroundOptimizationEnabled(@TempDir final Path tem // Search should still work final VectorSearchResult result = index.search(randomVector(random, dimension), 10); assertEquals(10, result.size()); - } finally { + } + finally + { index.close(); } } @@ -1917,17 +2003,17 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi // Enable both background persistence and optimization final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .persistenceIntervalMs(300) - .minChangesBetweenPersists(10) - .persistOnShutdown(true) - .optimizationIntervalMs(400) - .minChangesBetweenOptimizations(10) - .optimizeOnShutdown(true) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .persistenceIntervalMs(300) + .minChangesBetweenPersists(10) + .persistOnShutdown(true) + .optimizationIntervalMs(400) + .minChangesBetweenOptimizations(10) + .optimizeOnShutdown(true) + .build(); final VectorIndex index = vectorIndices.add( "embeddings", @@ -1935,7 +2021,8 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi new ComputedDocumentVectorizer() ); - try { + try + { // Add vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); @@ -1948,8 +2035,10 @@ void testBackgroundPersistenceAndOptimizationTogether(@TempDir final Path tempDi // Files should exist from background persistence assertTrue(Files.exists(indexDir.resolve("embeddings.graph")), - "Graph file should exist from background persistence"); - } finally { + "Graph file should exist from background persistence"); + } + finally + { index.close(); } } @@ -1977,14 +2066,15 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro // Generate shared vectors and query final List vectors = new ArrayList<>(); - for (int i = 0; i < vectorCount; i++) { + for(int i = 0; i < vectorCount; i++) + { vectors.add(randomVector(random, dimension)); } final float[] queryVector = randomVector(new Random(999), dimension); - final Path parallelIndexDir = tempDir.resolve("parallel-index"); - final Path parallelStorageDir = tempDir.resolve("parallel-storage"); - final Path sequentialIndexDir = tempDir.resolve("sequential-index"); + final Path parallelIndexDir = 
tempDir.resolve("parallel-index"); + final Path parallelStorageDir = tempDir.resolve("parallel-storage"); + final Path sequentialIndexDir = tempDir.resolve("sequential-index"); final Path sequentialStorageDir = tempDir.resolve("sequential-storage"); // --- Build and persist both modes --- @@ -1995,8 +2085,10 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro final List parallelIds = new ArrayList<>(); final List parallelScores = new ArrayList<>(); { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(parallelStorageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = gigaMap.index().get(VectorIndices.Category()); assertEquals(vectorCount, gigaMap.size()); @@ -2006,7 +2098,8 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro final VectorSearchResult result = index.search(queryVector, k); assertEquals(k, result.size()); - for (final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { parallelIds.add(entry.entityId()); parallelScores.add(entry.score()); assertNotNull(entry.entity()); @@ -2017,8 +2110,10 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro final List sequentialIds = new ArrayList<>(); final List sequentialScores = new ArrayList<>(); { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) { - @SuppressWarnings("unchecked") final GigaMap gigaMap = (GigaMap) storage.root(); + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(sequentialStorageDir)) + { + @SuppressWarnings("unchecked") + final GigaMap gigaMap = (GigaMap)storage.root(); final VectorIndices vectorIndices = 
gigaMap.index().get(VectorIndices.Category()); assertEquals(vectorCount, gigaMap.size()); @@ -2028,7 +2123,8 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro final VectorSearchResult result = index.search(queryVector, k); assertEquals(k, result.size()); - for (final VectorSearchResult.Entry entry : result) { + for(final VectorSearchResult.Entry entry : result) + { sequentialIds.add(entry.entityId()); sequentialScores.add(entry.score()); assertNotNull(entry.entity()); @@ -2038,48 +2134,49 @@ void testParallelVsNonParallelPersistAndReload(@TempDir final Path tempDir) thro // Both modes should produce equivalent results after reload assertEquals(parallelIds, sequentialIds, - "Parallel and sequential modes should produce identical search results after reload"); + "Parallel and sequential modes should produce identical search results after reload"); assertEquals(parallelScores, sequentialScores, - "Parallel and sequential modes should produce identical search scores after reload"); + "Parallel and sequential modes should produce identical search scores after reload"); } /** * Helper to build, populate, train PQ, persist, and store a PQ-compressed index. 
*/ private void buildAndPersistIndex( - final List vectors, - final float[] queryVector, - final int dimension, - final int pqSubspaces, - final Path indexDir, - final Path storageDir, - final boolean parallel + final List vectors , + final float[] queryVector , + final int dimension , + final int pqSubspaces , + final Path indexDir , + final Path storageDir , + final boolean parallel ) throws IOException { - try (final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) { + try(final EmbeddedStorageManager storage = EmbeddedStorage.start(storageDir)) + { final GigaMap gigaMap = GigaMap.New(); storage.setRoot(gigaMap); final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .maxDegree(32) - .beamWidth(100) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(parallel) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .maxDegree(32) + .beamWidth(100) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(parallel) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", config, new ComputedDocumentVectorizer() + "embeddings", config, new ComputedDocumentVectorizer() ); addDocumentsFromVectors(gigaMap, vectors, "doc_"); - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); index.persistToDisk(); assertTrue(Files.exists(indexDir.resolve("embeddings.graph"))); @@ -2121,17 +2218,17 @@ void testEmbeddedVectorizerWithParallelOnDiskWrite(@TempDir final Path tempDir) final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .parallelOnDiskWrite(true) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .parallelOnDiskWrite(true) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new EmbeddedDocumentVectorizer() + "embeddings", + config, + new EmbeddedDocumentVectorizer() ); addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); @@ -2176,26 +2273,26 @@ void testEmbeddedVectorizerWithPqAndParallelOnDiskWrite(@TempDir final Path temp final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(dimension) - .similarityFunction(VectorSimilarityFunction.COSINE) - .onDisk(true) - .indexDirectory(indexDir) - .enablePqCompression(true) - .pqSubspaces(pqSubspaces) - .parallelOnDiskWrite(true) - .build(); + .dimension(dimension) + .similarityFunction(VectorSimilarityFunction.COSINE) + .onDisk(true) + .indexDirectory(indexDir) + .enablePqCompression(true) + .pqSubspaces(pqSubspaces) + .parallelOnDiskWrite(true) + .build(); final VectorIndex index = vectorIndices.add( - "embeddings", - config, - new EmbeddedDocumentVectorizer() + "embeddings", + config, + new EmbeddedDocumentVectorizer() ); // Add vectors addRandomDocuments(gigaMap, random, dimension, vectorCount, "doc_"); // Train PQ compression - ((VectorIndex.Internal) index).trainCompressionIfNeeded(); + ((VectorIndex.Internal)index).trainCompressionIfNeeded(); // This would deadlock before the fix index.persistToDisk(); From 9b35b5565b5f9ccba9c5e382a890a1d9050395e2 Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Sat, 21 Feb 2026 10:04:29 +0100 Subject: [PATCH 6/7] add vector indices unit tests --- .../gigamap/jvector/VectorIndicesTest.java | 
379 ++++++++++++++++++ 1 file changed, 379 insertions(+) create mode 100644 gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java new file mode 100644 index 00000000..5a6351f6 --- /dev/null +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndicesTest.java @@ -0,0 +1,379 @@ +package org.eclipse.store.gigamap.jvector; + +/*- + * #%L + * EclipseStore GigaMap JVector + * %% + * Copyright (C) 2023 - 2026 MicroStream Software + * %% + * This program and the accompanying materials are made + * available under the terms of the Eclipse Public License 2.0 + * which is available at https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * #L% + */ + +import org.eclipse.store.gigamap.types.GigaMap; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Unit tests for {@link VectorIndices}. + *

+ * Tests the core functionality of vector index management: + * - Index registration and retrieval + * - Index name validation + * - Lifecycle management + */ +class VectorIndicesTest +{ + record Document(String content, float[] embedding) {} + + static class DocumentVectorizer extends Vectorizer + { + @Override + public float[] vectorize(final Document entity) + { + return entity.embedding(); + } + } + + @Test + void testAddIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("test-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("test-index", index.name()); + } + + @Test + void testAddDuplicateIndexThrows() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("duplicate", config, new DocumentVectorizer()); + + assertThrows(RuntimeException.class, () -> + vectorIndices.add("duplicate", config, new DocumentVectorizer()) + ); + } + + @Test + void testGetExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex created = vectorIndices.add("my-index", config, new DocumentVectorizer()); + final VectorIndex retrieved = vectorIndices.get("my-index"); + + assertSame(created, retrieved); + } + + @Test + void 
testGetNonExistentIndexReturnsNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + assertNull(vectorIndices.get("non-existent")); + } + + @Test + void testEnsureCreatesNewIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.ensure("new-index", config, new DocumentVectorizer()); + + assertNotNull(index); + assertEquals("new-index", index.name()); + } + + @Test + void testEnsureReturnsExistingIndex() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex first = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + final VectorIndex second = vectorIndices.ensure("existing", config, new DocumentVectorizer()); + + assertSame(first, second); + } + + @Test + void testValidateIndexNameNull() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(null, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameEmpty() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration 
config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithSlash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid/name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithBackslash() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add("invalid\\name", config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameTooLong() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final String tooLong = "a".repeat(201); + + assertThrows(IllegalArgumentException.class, () -> + vectorIndices.add(tooLong, config, new DocumentVectorizer()) + ); + } + + @Test + void testValidateIndexNameWithValidCharacters() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = 
VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + assertDoesNotThrow(() -> + vectorIndices.add("valid-index_name.123", config, new DocumentVectorizer()) + ); + } + + @Test + void testInternalAddPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(1, result1.size()); + assertEquals(1, result2.size()); + } + + @Test + void testInternalRemovePropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + + final Document doc = new Document("test", new float[]{1.0f, 0.0f, 0.0f}); + gigaMap.add(doc); + gigaMap.removeById(0); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorIndex index2 = vectorIndices.get("index2"); + + final VectorSearchResult result1 = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + final 
VectorSearchResult result2 = index2.search(new float[]{1.0f, 0.0f, 0.0f}, 1); + + assertEquals(0, result1.size()); + assertEquals(0, result2.size()); + } + + @Test + void testInternalRemoveAllPropagates() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + + gigaMap.add(new Document("test1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("test2", new float[]{0.0f, 1.0f, 0.0f})); + + gigaMap.removeAll(); + + final VectorIndex index1 = vectorIndices.get("index1"); + final VectorSearchResult result = index1.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(0, result.size()); + } + + @Test + void testIterateIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", config, new DocumentVectorizer()); + vectorIndices.add("index3", config, new DocumentVectorizer()); + + final int[] count = {0}; + vectorIndices.iterate(index -> count[0]++); + + assertEquals(3, count[0]); + } + + @Test + void testAccessIndices() + { + final GigaMap gigaMap = GigaMap.New(); + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + vectorIndices.add("index1", config, new DocumentVectorizer()); + vectorIndices.add("index2", 
config, new DocumentVectorizer()); + + vectorIndices.accessIndices(table -> { + assertNotNull(table.get("index1")); + assertNotNull(table.get("index2")); + assertNull(table.get("non-existent")); + }); + } + + @Test + void testIndexAutoPopulatesExistingEntities() + { + final GigaMap gigaMap = GigaMap.New(); + + gigaMap.add(new Document("doc1", new float[]{1.0f, 0.0f, 0.0f})); + gigaMap.add(new Document("doc2", new float[]{0.0f, 1.0f, 0.0f})); + + final VectorIndices vectorIndices = gigaMap.index().register(VectorIndices.Category()); + + final VectorIndexConfiguration config = VectorIndexConfiguration.builder() + .dimension(3) + .similarityFunction(VectorSimilarityFunction.COSINE) + .build(); + + final VectorIndex index = vectorIndices.add("new-index", config, new DocumentVectorizer()); + + final VectorSearchResult result = index.search(new float[]{1.0f, 0.0f, 0.0f}, 10); + + assertEquals(2, result.size(), "Index should auto-populate with existing entities"); + } +} + From 9b104c4ab2a23836a9eb0c32acc35502f63d4e3f Mon Sep 17 00:00:00 2001 From: Zdenek Jonas Date: Sat, 21 Feb 2026 10:15:19 +0100 Subject: [PATCH 7/7] remove duplicate tests --- .../jvector/VectorIndexConfigurationTest.java | 130 ------------------ 1 file changed, 130 deletions(-) diff --git a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java index fcaf437c..901ce3df 100644 --- a/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java +++ b/gigamap/jvector/src/test/java/org/eclipse/store/gigamap/jvector/VectorIndexConfigurationTest.java @@ -123,30 +123,6 @@ void testBuilderRequiresNonNegativePqSubspaces() ); } - @Test - void testBuilderRequiresNonNegativePersistenceIntervalMs() - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - 
.persistenceIntervalMs(0) - .build(); - assertEquals(0L, config.persistenceIntervalMs()); - assertFalse(config.backgroundPersistence()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).persistenceIntervalMs(-1).build() - ); - } - - @Test - void testBuilderRequiresNonNegativeMinChangesBetweenPersists() - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder().dimension(64).minChangesBetweenPersists(-1).build() - ); - } - @Test void testBuilderRequiresNonNegativeOptimizationIntervalMs() { @@ -192,19 +168,6 @@ void testOnDiskRequiresIndexDirectory() ); } - @Test - void testOnDiskWithIndexDirectorySucceeds(@TempDir final Path tempDir) - { - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(64) - .onDisk(true) - .indexDirectory(tempDir) - .build(); - - assertTrue(config.onDisk()); - assertEquals(tempDir, config.indexDirectory()); - } - @Test void testCompressionRequiresOnDisk() { @@ -1103,27 +1066,6 @@ void testBackgroundPersistenceConfigurationBuilder(@TempDir final Path tempDir) assertEquals(50, config.minChangesBetweenPersists()); } - /** - * Test background persistence configuration defaults. - */ - @Test - void testBackgroundPersistenceConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background persistence should be disabled by default - assertFalse(config.backgroundPersistence()); - assertEquals(0, config.persistenceIntervalMs()); - assertTrue(config.persistOnShutdown()); - assertEquals(100, config.minChangesBetweenPersists()); - } - /** * Test validation: persistenceIntervalMs must be non-negative. 
*/ @@ -1175,78 +1117,6 @@ void testMinChangesBetweenPersistsMustBeNonNegative(@TempDir final Path tempDir) assertEquals(0, config.minChangesBetweenPersists()); } - /** - * Test validation: optimizationIntervalMs must be non-negative. - */ - @Test - void testOptimizationIntervalMsMustBeNonNegative(@TempDir final Path tempDir) - { - // 0 is valid (means disabled) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(0) - .build(); - assertEquals(0, config.optimizationIntervalMs()); - assertFalse(config.backgroundOptimization()); - - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .optimizationIntervalMs(-1000) - .build() - ); - } - - /** - * Test background optimization configuration defaults. - */ - @Test - void testBackgroundOptimizationConfigurationDefaults(@TempDir final Path tempDir) - { - final Path indexDir = tempDir.resolve("index"); - - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(indexDir) - .build(); - - // Background optimization should be disabled by default - assertFalse(config.backgroundOptimization()); - assertEquals(0, config.optimizationIntervalMs()); - assertEquals(1000, config.minChangesBetweenOptimizations()); - assertFalse(config.optimizeOnShutdown()); - } - - /** - * Test validation: minChangesBetweenOptimizations must be non-negative. 
- */ - @Test - void testMinChangesBetweenOptimizationsMustBeNonNegative(@TempDir final Path tempDir) - { - assertThrows(IllegalArgumentException.class, () -> - VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(-1) - .build() - ); - - // Zero should be allowed (optimize on every interval) - final VectorIndexConfiguration config = VectorIndexConfiguration.builder() - .dimension(128) - .onDisk(true) - .indexDirectory(tempDir) - .minChangesBetweenOptimizations(0) - .build(); - assertEquals(0, config.minChangesBetweenOptimizations()); - } - /** * Test background optimization configuration builder. */