From 79030ce60039f0fce507903be27ff1cfbf11321a Mon Sep 17 00:00:00 2001 From: Malini Mahalakshmi Venkatachari Date: Fri, 29 May 2026 00:06:34 -0700 Subject: [PATCH] Revert all commits after v0.5.417 (restore v0.5.417 tree state) Reverts the 17 commits that landed on main after v0.5.417, bringing the tree back to exactly the v0.5.417 state. Squashed into a single revert commit for reviewability and to allow reinstating everything as one unit (revert this commit to bring all 17 changes back). Reverted commits (v0.5.417..main, newest first): - Revert #579 (HTS fields in table list api) (#610) - feat(optimizer): [3/N] Analyzer (#533) - [DataLoader] Handle Cast(Literal, TIMESTAMP/DATE/TIME) in scan optimizer (#569 follow-up) (#583) - Skip metadata.json parse in drop path (#589) - feat(optimizer): [2/N] Optimizer REST Service and Controller (#531) - [BDP-102028] feat(optimizer): [1/N] Optimizer Database (#530) - [RTAS]: Fix bug - remove fs scheme from tableLocation in commit (cont) (#594) - Trigger ELR process (#593) - [BDP-102028] feat(optimizer): [0/N] Optimizer API and internal model (#527) - Fail retention app when the columnPattern mismatch partition spec (#552) - [DataLoader] Drop OpenTelemetry minimum version to 1.38.0 (#590) - [DataLoader] Emit OpenTelemetry metrics for read operations (#582) - Cache iceberg metadata to reduce redundant requests to storage (#509) - bump iceberg 1.2 version to 1.2.0.17 (#587) - Support returning HTS fields in table list api (#579) - [DataLoader] Add unique id property to OpenHouseDataLoader (#580) - [DataLoader] Add OpenTelemetry metrics support (#575) Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/optimizer/analyzerapp/build.gradle | 14 - .../analyzer/AnalyzerApplication.java | 29 -- .../src/main/resources/application.properties | 8 - .../openhouse/jobs/spark/Operations.java | 12 - .../openhouse/catalog/e2e/RTASJavaTest.java | 4 +- .../openhouse/jobs/spark/OperationsTest.java | 48 --- build.gradle | 5 +- .../openhouse/internalcatalog/build.gradle | 2 - .../InternalCatalogMetricsConstant.java | 3 - .../catalog/OpenHouseInternalCatalog.java | 63 +-- .../OpenHouseInternalTableOperations.java | 18 +- .../catalog/cache/CacheConfiguration.java | 71 ---- .../cache/SpringTableMetadataCache.java | 29 -- .../catalog/cache/TableMetadataCache.java | 11 - .../config/InternalCatalogSettings.java | 21 - .../catalog/OpenHouseInternalCatalogTest.java | 140 +------ .../OpenHouseInternalTableOperationsTest.java | 140 +------ .../catalog/cache/CacheConfigurationTest.java | 131 ------ integrations/python/dataloader/pyproject.toml | 19 +- .../dataloader/_table_scan_context.py | 8 +- .../src/openhouse/dataloader/data_loader.py | 110 +---- .../openhouse/dataloader/data_loader_split.py | 167 ++------ .../src/openhouse/dataloader/filters.py | 16 +- .../openhouse/dataloader/metrics/__init__.py | 17 - .../dataloader/tests/integration_tests.py | 78 ---- .../dataloader/tests/test_data_loader.py | 12 - .../python/dataloader/tests/test_filters.py | 23 +- .../python/dataloader/tests/test_metrics.py | 379 ------------------ .../dataloader/tests/test_scan_optimizer.py | 25 -- integrations/python/dataloader/uv.lock | 70 +--- .../openhouse/spark/catalogtest/RTASTest.java | 20 +- services/common/build.gradle | 1 + services/optimizer/analyzer/build.gradle | 34 -- .../optimizer/analyzer/AnalyzerRunner.java | 172 -------- ...denceBasedOrphanFilesDeletionAnalyzer.java | 83 ---- .../optimizer/analyzer/CadencePolicy.java | 72 ---- .../optimizer/analyzer/OperationAnalyzer.java | 41 -- .../analyzer/AnalyzerRunnerTest.java | 218 ---------- ...eBasedOrphanFilesDeletionAnalyzerTest.java | 197 --------- services/optimizer/build.gradle | 17 - .../OptimizerServiceApplication.java | 13 - .../controller/TableOperationsController.java | 125 ------ .../TableOperationsHistoryController.java | 58 --- .../api/controller/TableStatsController.java | 111 ----- .../optimizer/api/spec/HistoryStatus.java | 21 - .../optimizer/api/spec/OperationStatus.java | 32 -- .../optimizer/api/spec/OperationType.java | 17 - .../optimizer/api/spec/TableOperations.java | 76 ---- .../api/spec/TableOperationsHistory.java | 66 --- .../optimizer/api/spec/TableStats.java | 70 ---- .../optimizer/api/spec/TableStatsHistory.java | 61 --- .../optimizer/api/spec/TableStatsPayload.java | 137 ------- .../api/spec/UpdateOperationRequest.java | 47 --- .../api/spec/UpsertTableStatsRequest.java | 52 --- .../optimizer/db/CommitDeltaMetrics.java | 28 -- .../openhouse/optimizer/db/HistoryStatus.java | 15 - .../optimizer/db/OperationStatus.java | 21 - .../openhouse/optimizer/db/OperationType.java | 14 - .../optimizer/db/SnapshotMetrics.java | 28 -- .../db/TableOperationsHistoryRow.java | 75 ---- .../optimizer/db/TableOperationsRow.java | 85 ---- .../optimizer/db/TableStatsHistoryRow.java | 74 ---- .../openhouse/optimizer/db/TableStatsRow.java | 64 --- .../optimizer/model/HistoryStatusDto.java | 27 -- .../optimizer/model/OperationStatusDto.java | 33 -- .../optimizer/model/OperationTypeDto.java | 22 - .../openhouse/optimizer/model/TableDto.java | 75 ---- .../optimizer/model/TableOperationDto.java | 104 ----- .../model/TableOperationsHistoryDto.java | 79 ---- .../optimizer/model/TableStatsDto.java | 194 --------- .../optimizer/model/TableStatsHistoryDto.java | 67 ---- .../TableOperationsHistoryRepository.java | 45 --- .../repository/TableOperationsRepository.java | 146 ------- .../TableStatsHistoryRepository.java | 34 -- .../repository/TableStatsRepository.java | 48 --- .../service/OptimizerDataService.java | 94 ----- .../service/OptimizerDataServiceImpl.java | 175 -------- .../src/main/resources/application.properties | 25 -- .../main/resources/db/optimizer-schema.sql | 54 --- .../OptimizerServiceContextTest.java | 25 -- .../ControllerErrorHandlingTest.java | 124 ------ .../TableOperationsHistoryRepositoryTest.java | 130 ------ .../TableOperationsRepositoryTest.java | 312 -------------- .../TableStatsHistoryRepositoryTest.java | 148 ------- .../repository/TableStatsRepositoryTest.java | 129 ------ .../service/OptimizerDataServiceImplTest.java | 173 -------- .../resources/application-test.properties | 12 - services/tables/build.gradle | 1 - .../tables/config/InternalCatalogBeans.java | 31 -- .../config/InternalCatalogProperties.java | 23 -- .../OpenHouseInternalRepository.java | 11 - .../impl/OpenHouseInternalRepositoryImpl.java | 24 +- .../tables/services/TablesServiceImpl.java | 14 +- .../config/InternalCatalogBeansTest.java | 105 ----- .../RepositoryTestWithSettableComponents.java | 42 +- .../tables/e2e/h2/TablesServiceTest.java | 39 -- .../OpenHouseInternalRepositoryImplTest.java | 57 --- settings.gradle | 3 - .../tablestest/OpenHouseSparkITest.java | 7 + 99 files changed, 129 insertions(+), 6321 deletions(-) delete mode 100644 apps/optimizer/analyzerapp/build.gradle delete mode 100644 apps/optimizer/analyzerapp/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerApplication.java delete mode 100644 apps/optimizer/analyzerapp/src/main/resources/application.properties delete mode 100644 iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfiguration.java delete mode 100644 iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/SpringTableMetadataCache.java delete mode 100644 iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/TableMetadataCache.java delete mode 100644 iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/config/InternalCatalogSettings.java delete mode 100644 iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfigurationTest.java delete mode 100644 integrations/python/dataloader/src/openhouse/dataloader/metrics/__init__.py delete mode 100644 integrations/python/dataloader/tests/test_metrics.py delete mode 100644 services/optimizer/analyzer/build.gradle delete mode 100644 services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunner.java delete mode 100644 services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java delete mode 100644 services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadencePolicy.java delete mode 100644 services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/OperationAnalyzer.java delete mode 100644 services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunnerTest.java delete mode 100644 services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzerTest.java delete mode 100644 services/optimizer/build.gradle delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/OptimizerServiceApplication.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsHistoryController.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/HistoryStatus.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationStatus.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationType.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperations.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistory.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStats.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsHistory.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsPayload.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpdateOperationRequest.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpsertTableStatsRequest.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/CommitDeltaMetrics.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/HistoryStatus.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationStatus.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationType.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/SnapshotMetrics.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsRow.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsHistoryRow.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsRow.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/HistoryStatusDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationStatusDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationTypeDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistoryDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsHistoryDto.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepository.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepository.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepository.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepository.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java delete mode 100644 services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java delete mode 100644 services/optimizer/src/main/resources/application.properties delete mode 100644 services/optimizer/src/main/resources/db/optimizer-schema.sql delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/OptimizerServiceContextTest.java delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/api/controller/ControllerErrorHandlingTest.java delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepositoryTest.java delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepositoryTest.java delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepositoryTest.java delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepositoryTest.java delete mode 100644 services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java delete mode 100644 services/optimizer/src/test/resources/application-test.properties delete mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogBeans.java delete mode 100644 services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogProperties.java delete mode 100644 services/tables/src/test/java/com/linkedin/openhouse/tables/config/InternalCatalogBeansTest.java diff --git a/apps/optimizer/analyzerapp/build.gradle b/apps/optimizer/analyzerapp/build.gradle deleted file mode 100644 index 15947754c..000000000 --- a/apps/optimizer/analyzerapp/build.gradle +++ /dev/null @@ -1,14 +0,0 @@ -plugins { - id 'openhouse.springboot-ext-conventions' - id 'org.springframework.boot' version '2.7.8' -} - -// Deployable Spring Boot wrapper around the analyzer library. Holds AnalyzerApplication (the -// @SpringBootApplication entry point) and application.properties; the analysis logic lives in -// :services:optimizer:analyzer. -dependencies { - implementation project(':services:optimizer:analyzer') - implementation 'org.springframework.boot:spring-boot-starter:2.7.8' - implementation 'org.springframework.boot:spring-boot-starter-data-jpa:2.7.8' - runtimeOnly 'mysql:mysql-connector-java:8.0.33' -} diff --git a/apps/optimizer/analyzerapp/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerApplication.java b/apps/optimizer/analyzerapp/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerApplication.java deleted file mode 100644 index 220ccb9fa..000000000 --- a/apps/optimizer/analyzerapp/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerApplication.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import java.util.List; -import org.springframework.boot.CommandLineRunner; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.boot.autoconfigure.domain.EntityScan; -import org.springframework.context.annotation.Bean; -import org.springframework.data.jpa.repository.config.EnableJpaRepositories; - -/** Entry point for the Optimizer Analyzer application. */ -@SpringBootApplication -@EntityScan(basePackages = "com.linkedin.openhouse.optimizer.db") -@EnableJpaRepositories(basePackages = "com.linkedin.openhouse.optimizer.repository") -public class AnalyzerApplication { - - public static void main(String[] args) { - SpringApplication.run(AnalyzerApplication.class, args); - } - - /** - * Runs the analyzer once per registered {@link OperationAnalyzer} per process invocation. Each - * call is scoped to one operation type; the runner iterates databases internally. - */ - @Bean - public CommandLineRunner run(AnalyzerRunner runner, List analyzers) { - return args -> analyzers.forEach(a -> runner.analyze(a.getOperationType())); - } -} diff --git a/apps/optimizer/analyzerapp/src/main/resources/application.properties b/apps/optimizer/analyzerapp/src/main/resources/application.properties deleted file mode 100644 index d0e70622a..000000000 --- a/apps/optimizer/analyzerapp/src/main/resources/application.properties +++ /dev/null @@ -1,8 +0,0 @@ -spring.application.name=openhouse-optimizer-analyzer -spring.main.web-application-type=none -spring.datasource.url=${OPTIMIZER_DB_URL:jdbc:h2:mem:analyzerdb;DB_CLOSE_DELAY=-1;MODE=MySQL} -spring.datasource.username=${OPTIMIZER_DB_USER:sa} -spring.datasource.password=${OPTIMIZER_DB_PASSWORD:} -spring.jpa.hibernate.ddl-auto=none -ofd.success-retry-hours=16 -ofd.failure-retry-hours=1 diff --git a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java index bd301c729..53b1297a5 100644 --- a/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java +++ b/apps/spark/src/main/java/com/linkedin/openhouse/jobs/spark/Operations.java @@ -50,7 +50,6 @@ import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.spark.actions.SparkActions; @@ -340,17 +339,6 @@ private Map> prepareBackupDataManifests( TableScan scan = table.newScan().filter(filter); try (CloseableIterable filesIterable = scan.planFiles()) { List filesList = Lists.newArrayList(filesIterable); - filesList.stream() - .filter(task -> !Expressions.alwaysTrue().isEquivalentTo(task.residual())) - .findFirst() - .ifPresent( - task -> { - throw new IllegalStateException( - String.format( - "Retention with backup enabled requires a metadata-only delete for table %s, " - + "but file %s has residual filter %s, which would require a row-level rewrite.", - fqtn, task.file().path(), task.residual())); - }); return filesList.stream() .collect( Collectors.groupingBy( diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/catalog/e2e/RTASJavaTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/catalog/e2e/RTASJavaTest.java index 23cd7ae8d..09e75fcfd 100644 --- a/apps/spark/src/test/java/com/linkedin/openhouse/catalog/e2e/RTASJavaTest.java +++ b/apps/spark/src/test/java/com/linkedin/openhouse/catalog/e2e/RTASJavaTest.java @@ -141,8 +141,8 @@ private void verifyReplacedTable( Table replacedTable = catalog.loadTable(TABLE_IDENT); assertEquals( - originalLocation, - replacedTable.location(), + stripPathScheme(originalLocation), + stripPathScheme(replacedTable.location()), "Table location should be preserved after replace"); assertEquals( REPLACE_SCHEMA.asStruct(), diff --git a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsTest.java b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsTest.java index 2e44c9263..3646323b8 100644 --- a/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsTest.java +++ b/apps/spark/src/test/java/com/linkedin/openhouse/jobs/spark/OperationsTest.java @@ -309,54 +309,6 @@ public void testRetentionDataManifestWithTimestampPartitionedTable() throws Exce } } - @Test - public void testRetentionWithBackupFailsWhenColumnPatternMismatchesPartition() throws Exception { - final String tableName = "db.test_retention_backup_pattern_mismatch"; - try (Operations ops = Operations.withCatalog(getSparkSession(), otelEmitter)) { - // The table is partitioned on `datepartition`, but retention will filter - // on `time_col` using a pattern unrelated to the partitioning. For each - // file's per-file min/max to actually straddle the cutoff (and produce - // a non-trivial residual), both `time_col` values within a partition - // must live in the same data file — so we force a single writer task - // via the COALESCE(1) hint. - ops.spark().sql(String.format("DROP TABLE IF EXISTS %s", tableName)).show(); - ops.spark() - .sql( - String.format( - "CREATE TABLE %s (data string, datepartition string, time_col string) " - + "PARTITIONED BY (datepartition)", - tableName)) - .show(); - ops.spark() - .sql( - "SELECT data, datepartition, time_col FROM VALUES " - + "('a', '2024-01', '2020-01-01-00'), " - + "('b', '2024-01', '2030-01-01-00'), " - + "('c', '2024-02', '2020-01-01-00'), " - + "('d', '2024-02', '2030-01-01-00') " - + "AS t(data, datepartition, time_col)") - .coalesce(1) - .writeTo(tableName) - .append(); - - // Fix `now` so the cutoff (now - 1 day, formatted yyyy-MM-dd-HH) falls - // strictly between each file's min ("2020-01-01-00") and max - // ("2030-01-01-00") — forcing a non-trivial residual on every file. - ZonedDateTime now = ZonedDateTime.of(2025, 6, 15, 10, 0, 0, 0, ZoneOffset.UTC); - IllegalStateException ex = - Assertions.assertThrows( - IllegalStateException.class, - () -> - ops.runRetention( - tableName, "time_col", "yyyy-MM-dd-HH", "day", 1, true, ".backup", now)); - Assertions.assertTrue( - ex.getMessage().contains("metadata-only delete"), - "Expected metadata-only delete error, got: " + ex.getMessage()); - // DELETE should not have executed: all 4 rows remain. - verifyRowCount(ops, tableName, 4); - } - } - @Test public void testOrphanFilesDeletionJavaAPI() throws Exception { final String tableName = "db.test_ofd_java"; diff --git a/build.gradle b/build.gradle index ec75fd89d..aee358d7c 100644 --- a/build.gradle +++ b/build.gradle @@ -30,7 +30,7 @@ ext { spark_version = "3.1.1" ok_http3_version = "4.11.0" junit_version = "5.11.0" - iceberg_1_2_version = "1.2.0.17" + iceberg_1_2_version = "1.2.0.16" iceberg_1_5_version = "1.5.2.11" otel_agent_version = "2.12.0" // Bundles OTel SDK 1.47.0 otel_annotations_version = "2.12.0" // Match agent version @@ -177,7 +177,6 @@ tasks.register('CopyGitHooksTask', Copy) { // tables-service.Dockerfile -> :services:tables:bootJar // housetables-service.Dockerfile -> :services:housetables:bootJar // jobs-service.Dockerfile -> :services:jobs:bootJar -// optimizer-service.Dockerfile -> :services:optimizer:bootJar // jobs-scheduler.Dockerfile -> :apps:openhouse-spark-apps_2.12:shadowJar (uber JAR) // spark-base-hadoop2.8.dockerfile -> // :integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:shadowJar (uber JAR) @@ -197,7 +196,6 @@ tasks.register('dockerPrereqs') { dependsOn ':services:tables:bootJar' dependsOn ':services:housetables:bootJar' dependsOn ':services:jobs:bootJar' - dependsOn ':services:optimizer:bootJar' // Spark runtime uber JARs (shadowJar) dependsOn ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:shadowJar' @@ -221,7 +219,6 @@ tasks.register('dockerPrereqs') { println ' build/tables/libs/tables.jar' println ' build/housetables/libs/housetables.jar' println ' build/jobs/libs/jobs.jar' - println ' build/optimizer/libs/optimizer.jar' println ' build/openhouse-spark-runtime_2.12/libs/openhouse-spark-runtime_2.12-uber.jar' println ' build/openhouse-spark-3.5-runtime_2.12/libs/openhouse-spark-3.5-runtime_2.12-uber.jar' println ' build/openhouse-spark-apps_2.12/libs/openhouse-spark-apps_2.12-uber.jar' diff --git a/iceberg/openhouse/internalcatalog/build.gradle b/iceberg/openhouse/internalcatalog/build.gradle index 64c16c60d..3002c63ad 100644 --- a/iceberg/openhouse/internalcatalog/build.gradle +++ b/iceberg/openhouse/internalcatalog/build.gradle @@ -11,9 +11,7 @@ plugins { dependencies { implementation 'com.github.spotbugs:spotbugs-annotations:4.8.1' - implementation 'com.github.ben-manes.caffeine:caffeine:2.8.8' api 'org.springframework.retry:spring-retry:1.3.3' - implementation 'org.springframework:spring-context-support:5.3.18' implementation "io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations:${otel_annotations_version}" api 'io.opentelemetry:opentelemetry-api:1.47.0' api project(':client:hts') diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/InternalCatalogMetricsConstant.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/InternalCatalogMetricsConstant.java index f40374d1d..d3f1bb4ef 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/InternalCatalogMetricsConstant.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/InternalCatalogMetricsConstant.java @@ -19,10 +19,7 @@ private InternalCatalogMetricsConstant() {} static final String METADATA_UPDATE_LATENCY = "metadata_update_latency"; static final String METADATA_RETRIEVAL_LATENCY = "metadata_retrieval_latency"; - public static final String METADATA_CACHE_REMOVAL_CTR = "metadata_cache_removal"; - // Tag constants for metric dimensions static final String DATABASE_TAG = "database"; static final String TABLE_TAG = "table"; - public static final String CACHE_REMOVAL_CAUSE_TAG = "cause"; } diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java index 1039125b7..7f71a0d1a 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java @@ -10,7 +10,6 @@ import com.linkedin.openhouse.common.exception.AlreadyExistsException; import com.linkedin.openhouse.common.exception.NoSuchSoftDeletedUserTableException; import com.linkedin.openhouse.common.utils.NamespaceUtil; -import com.linkedin.openhouse.internal.catalog.cache.TableMetadataCache; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -26,7 +25,6 @@ import java.util.stream.Collectors; import java.util.stream.StreamSupport; import lombok.extern.slf4j.Slf4j; -import org.apache.hadoop.fs.Path; import org.apache.iceberg.BaseMetastoreCatalog; import org.apache.iceberg.Table; import org.apache.iceberg.TableOperations; @@ -34,7 +32,6 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.SupportsPrefixOperations; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -66,8 +63,6 @@ public class OpenHouseInternalCatalog extends BaseMetastoreCatalog { @Autowired MeterRegistry meterRegistry; - @Autowired TableMetadataCache tableMetadataCache; - @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { FileIO fileIO = resolveFileIO(tableIdentifier); @@ -79,8 +74,7 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { houseTableMapper, tableIdentifier, metricsReporter, - fileIOManager, - tableMetadataCache); + fileIOManager); } @Override @@ -125,42 +119,17 @@ public Page listTables(Namespace namespace, Pageable pageable) .map(houseTable -> TableIdentifier.of(houseTable.getDatabaseId(), houseTable.getTableId())); } - /** - * Direct HTS lookup that returns the {@link HouseTable} row without parsing metadata.json. Use - * this when only HTS-resident columns (e.g. tableUUID, tableLocation) are needed — for example, - * to authorize a drop without loading the full Iceberg table, which is important when the - * underlying metadata is corrupted and {@link #loadTable} would throw. - */ - public Optional findHouseTable(TableIdentifier identifier) { - HouseTablePrimaryKey primaryKey = - HouseTablePrimaryKey.builder() - .databaseId(identifier.namespace().toString()) - .tableId(identifier.name()) - .build(); - try { - return houseTableRepository.findById(primaryKey); - } catch (HouseTableNotFoundException e) { - return Optional.empty(); - } - } - @Override public boolean dropTable(TableIdentifier identifier, boolean purge) { - // Look up the HouseTable row directly instead of calling loadTable(), so drop works even when - // the table's metadata.json is corrupted and cannot be parsed by TableMetadataParser. - HouseTable houseTable = - findHouseTable(identifier) - .orElseThrow(() -> new NoSuchTableException("Table does not exist: %s", identifier)); - - HouseTablePrimaryKey primaryKey = - HouseTablePrimaryKey.builder() - .databaseId(identifier.namespace().toString()) - .tableId(identifier.name()) - .build(); - String tableLocation = getTableBaseLocation(houseTable, identifier); + String tableLocation = loadTable(identifier).location(); FileIO fileIO = resolveFileIO(identifier); log.debug("Dropping table {}, purge:{}", tableLocation, purge); try { + HouseTablePrimaryKey primaryKey = + HouseTablePrimaryKey.builder() + .databaseId(identifier.namespace().toString()) + .tableId(identifier.name()) + .build(); houseTableRepository.deleteById(primaryKey, purge); } catch (HouseTableRepositoryException houseTableRepositoryException) { throw new RuntimeException( @@ -168,6 +137,7 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { houseTableRepositoryException); } if (purge) { + // Delete data and metadata files from storage. if (fileIO instanceof SupportsPrefixOperations) { log.debug("Deleting files for table {}", tableLocation); ((SupportsPrefixOperations) fileIO).deletePrefix(tableLocation); @@ -182,23 +152,6 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { return true; } - /** - * Returns the table base directory derived from the HouseTable's metadata location. OpenHouse - * writes metadata.json directly under the table base subdir, so the parent of the metadata.json - * path is the same value that {@link org.apache.iceberg.Table#location()} would return. - */ - private static String getTableBaseLocation(HouseTable houseTable, TableIdentifier identifier) { - String metadataLocation = houseTable.getTableLocation(); - // Defensive check to avoid any unintentional deletion - if (!metadataLocation.endsWith(".metadata.json")) { - throw new IllegalStateException( - String.format( - "Refusing to drop %s: metadata_location does not look like a metadata.json file: %s", - identifier, metadataLocation)); - } - return new Path(metadataLocation).getParent().toString(); - } - @Override public void renameTable(TableIdentifier from, TableIdentifier to) { Table fromTable = loadTable(from); diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 35c58ff84..d1636bda9 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -13,7 +13,6 @@ import com.linkedin.openhouse.cluster.storage.hdfs.HdfsStorageClient; import com.linkedin.openhouse.cluster.storage.local.LocalStorageClient; import com.linkedin.openhouse.common.exception.InvalidTableMetadataException; -import com.linkedin.openhouse.internal.catalog.cache.TableMetadataCache; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; @@ -86,8 +85,6 @@ public class OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio FileIOManager fileIOManager; - TableMetadataCache tableMetadataCache; - private static final Gson GSON = new Gson(); private static final Cache CACHE = @@ -136,10 +133,7 @@ protected void doRefresh() { protected void refreshMetadata(final String metadataLoc) { long startTime = System.currentTimeMillis(); boolean needToReload = !Objects.equal(currentMetadataLocation(), metadataLoc); - Runnable r = - () -> - super.refreshFromMetadataLocation( - metadataLoc, null, 20, this::loadTableMetadataWithCache); + Runnable r = () -> super.refreshFromMetadataLocation(metadataLoc); try { if (needToReload) { metricsReporter.executeWithStats( @@ -361,7 +355,6 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { updatedMtDataRef, io().newOutputFile(newMetadataLocation)), InternalCatalogMetricsConstant.METADATA_UPDATE_LATENCY, getCatalogMetricTags()); - tableMetadataCache.seed(newMetadataLocation, updatedMtDataRef); log.info( "updateMetadata to location {} succeeded, took {} ms", newMetadataLocation, @@ -379,7 +372,7 @@ updatedMtDataRef, io().newOutputFile(newMetadataLocation)), writeSpan.end(); } - houseTable = houseTableMapper.toHouseTable(updatedMtDataRef, fileIO); + houseTable = houseTableMapper.toHouseTable(metadataToCommit, fileIO); if (base != null && (properties.containsKey(CatalogConstants.OPENHOUSE_TABLEID_KEY) && !properties @@ -412,7 +405,7 @@ updatedMtDataRef, io().newOutputFile(newMetadataLocation)), * "forced refresh" in {@link OpenHouseInternalTableOperations#commit(TableMetadata, * TableMetadata)} */ - refreshMetadata(newMetadataLocation); + refreshFromMetadataLocation(newMetadataLocation); } if (isReplicatedTableCreate(properties)) { updateMetadataFieldForTable(metadata, newMetadataLocation); @@ -793,9 +786,4 @@ private List getIntermediateSchemasFromProps(TableMetadata metadata) { .create() .fromJson(serializedNewIntermediateSchemas, new TypeToken>() {}.getType()); } - - private TableMetadata loadTableMetadataWithCache(String metadataLocation) { - return tableMetadataCache.load( - metadataLocation, () -> TableMetadataParser.read(io(), metadataLocation)); - } } diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfiguration.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfiguration.java deleted file mode 100644 index de55061a4..000000000 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfiguration.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.linkedin.openhouse.internal.catalog.cache; - -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.RemovalListener; -import com.github.benmanes.caffeine.cache.Weigher; -import com.linkedin.openhouse.internal.catalog.InternalCatalogMetricsConstant; -import com.linkedin.openhouse.internal.catalog.config.InternalCatalogSettings; -import io.micrometer.core.instrument.MeterRegistry; -import java.util.List; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableMetadataParser; -import org.springframework.beans.factory.ObjectProvider; -import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; -import org.springframework.cache.CacheManager; -import org.springframework.cache.annotation.EnableCaching; -import org.springframework.cache.caffeine.CaffeineCacheManager; -import org.springframework.cache.support.NoOpCacheManager; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Configuration; - -@Configuration -@EnableCaching -public class CacheConfiguration { - - @Bean - @ConditionalOnMissingBean(InternalCatalogSettings.class) - public InternalCatalogSettings internalCatalogSettings() { - return new InternalCatalogSettings(); - } - - @Bean - public CacheManager internalCatalogCacheManager( - InternalCatalogSettings settings, ObjectProvider meterRegistry) { - if (!settings.getMetadataCache().isEnabled()) { - return new NoOpCacheManager(); - } - CaffeineCacheManager cacheManager = new CaffeineCacheManager(); - cacheManager.setAllowNullValues(false); - cacheManager.setCacheNames(List.of("tableMetadata")); - cacheManager.setCaffeine( - Caffeine.newBuilder() - .expireAfterWrite(settings.getMetadataCache().getTtl()) - .maximumWeight(settings.getMetadataCache().getMaxWeight().toBytes()) - .weigher(tableMetadataWeigher()) - .removalListener(removalListener(meterRegistry)) - .recordStats()); - return cacheManager; - } - - private static Weigher tableMetadataWeigher() { - return (key, value) -> { - if (value instanceof TableMetadata) { - return TableMetadataParser.toJson((TableMetadata) value).length(); - } - return 1; - }; - } - - private static RemovalListener removalListener( - ObjectProvider meterRegistry) { - return (key, value, cause) -> - meterRegistry.ifAvailable( - registry -> - registry - .counter( - InternalCatalogMetricsConstant.METADATA_CACHE_REMOVAL_CTR, - InternalCatalogMetricsConstant.CACHE_REMOVAL_CAUSE_TAG, - cause.name()) - .increment()); - } -} diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/SpringTableMetadataCache.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/SpringTableMetadataCache.java deleted file mode 100644 index f29504d32..000000000 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/SpringTableMetadataCache.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.linkedin.openhouse.internal.catalog.cache; - -import java.util.function.Supplier; -import org.apache.iceberg.TableMetadata; -import org.springframework.cache.annotation.CachePut; -import org.springframework.cache.annotation.Cacheable; -import org.springframework.stereotype.Component; - -@Component -public class SpringTableMetadataCache implements TableMetadataCache { - - @Override - @Cacheable( - cacheManager = "internalCatalogCacheManager", - cacheNames = "tableMetadata", - key = "#metadataLocation") - public TableMetadata load(String metadataLocation, Supplier metadataLoader) { - return metadataLoader.get(); - } - - @Override - @CachePut( - cacheManager = "internalCatalogCacheManager", - cacheNames = "tableMetadata", - key = "#metadataLocation") - public TableMetadata seed(String metadataLocation, TableMetadata tableMetadata) { - return tableMetadata; - } -} diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/TableMetadataCache.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/TableMetadataCache.java deleted file mode 100644 index 1f10aefda..000000000 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/cache/TableMetadataCache.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.linkedin.openhouse.internal.catalog.cache; - -import java.util.function.Supplier; -import org.apache.iceberg.TableMetadata; - -public interface TableMetadataCache { - - TableMetadata load(String metadataLocation, Supplier metadataLoader); - - TableMetadata seed(String metadataLocation, TableMetadata tableMetadata); -} diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/config/InternalCatalogSettings.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/config/InternalCatalogSettings.java deleted file mode 100644 index 50bd06c8f..000000000 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/config/InternalCatalogSettings.java +++ /dev/null @@ -1,21 +0,0 @@ -package com.linkedin.openhouse.internal.catalog.config; - -import java.time.Duration; -import lombok.Getter; -import lombok.Setter; -import org.springframework.util.unit.DataSize; - -@Getter -@Setter -public class InternalCatalogSettings { - - private MetadataCache metadataCache = new MetadataCache(); - - @Getter - @Setter - public static class MetadataCache { - private boolean enabled = false; - private Duration ttl = Duration.ofMinutes(10); - private DataSize maxWeight = DataSize.ofMegabytes(512); - } -} diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalogTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalogTest.java index e9a8ae910..cdb8ef04a 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalogTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalogTest.java @@ -1,35 +1,11 @@ package com.linkedin.openhouse.internal.catalog; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyBoolean; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; -import static org.mockito.Mockito.withSettings; - -import com.linkedin.openhouse.internal.catalog.model.HouseTable; -import com.linkedin.openhouse.internal.catalog.model.HouseTablePrimaryKey; -import com.linkedin.openhouse.internal.catalog.repository.HouseTableRepository; -import com.linkedin.openhouse.internal.catalog.repository.exception.HouseTableNotFoundException; -import java.util.Optional; import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.SupportsPrefixOperations; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class OpenHouseInternalCatalogTest { - private static final String DB = "test_db"; - private static final String TABLE = "test_table"; - private static final TableIdentifier IDENTIFIER = TableIdentifier.of(DB, TABLE); - private static final String METADATA_LOCATION = - "/data/openhouse/test_db/test_table-uuid/00001-aaa.metadata.json"; - private static final String EXPECTED_BASE = "/data/openhouse/test_db/test_table-uuid"; - @Test void testIsValidIdentifierRequiresDatabaseTableShape() { TestOpenHouseInternalCatalog catalog = new TestOpenHouseInternalCatalog(); @@ -41,122 +17,8 @@ void testIsValidIdentifierRequiresDatabaseTableShape() { catalog.isValidBaseIdentifier(TableIdentifier.of("db", "table", "partitions"))); } - @Test - void findHouseTableReturnsRowWhenPresent() { - HouseTableRepository repo = mock(HouseTableRepository.class); - HouseTable row = HouseTable.builder().databaseId(DB).tableId(TABLE).tableUUID("uuid").build(); - when(repo.findById(any(HouseTablePrimaryKey.class))).thenReturn(Optional.of(row)); - OpenHouseInternalCatalog catalog = new OpenHouseInternalCatalog(); - catalog.houseTableRepository = repo; - - Optional result = catalog.findHouseTable(IDENTIFIER); - - Assertions.assertTrue(result.isPresent()); - Assertions.assertEquals("uuid", result.get().getTableUUID()); - } - - @Test - void findHouseTableReturnsEmptyOnNotFoundException() { - HouseTableRepository repo = mock(HouseTableRepository.class); - when(repo.findById(any(HouseTablePrimaryKey.class))) - .thenThrow(new HouseTableNotFoundException("missing", new RuntimeException())); - OpenHouseInternalCatalog catalog = new OpenHouseInternalCatalog(); - catalog.houseTableRepository = repo; - - Assertions.assertFalse(catalog.findHouseTable(IDENTIFIER).isPresent()); - } - - @Test - void dropTableThrowsNoSuchTableWhenHouseTableMissing() { - HouseTableRepository repo = mock(HouseTableRepository.class); - when(repo.findById(any(HouseTablePrimaryKey.class))).thenReturn(Optional.empty()); - FileIO fileIO = - mock(FileIO.class, withSettings().extraInterfaces(SupportsPrefixOperations.class)); - OpenHouseInternalCatalog catalog = new FixedFileIOCatalog(fileIO); - catalog.houseTableRepository = repo; - - Assertions.assertThrows(NoSuchTableException.class, () -> catalog.dropTable(IDENTIFIER, true)); - verify(repo, never()).deleteById(any(), anyBoolean()); - verify((SupportsPrefixOperations) fileIO, never()).deletePrefix(any()); - } - - @Test - void dropTableWithPurgeDeletesHtsRowAndPrefix() { - HouseTableRepository repo = mock(HouseTableRepository.class); - HouseTable row = - HouseTable.builder() - .databaseId(DB) - .tableId(TABLE) - .tableUUID("uuid") - .tableLocation(METADATA_LOCATION) - .build(); - when(repo.findById(any(HouseTablePrimaryKey.class))).thenReturn(Optional.of(row)); - FileIO fileIO = - mock(FileIO.class, withSettings().extraInterfaces(SupportsPrefixOperations.class)); - OpenHouseInternalCatalog catalog = new FixedFileIOCatalog(fileIO); - catalog.houseTableRepository = repo; - - Assertions.assertTrue(catalog.dropTable(IDENTIFIER, true)); - - verify(repo).deleteById(any(HouseTablePrimaryKey.class), eq(true)); - verify((SupportsPrefixOperations) fileIO).deletePrefix(EXPECTED_BASE); - } - - @Test - void dropTableRefusesWhenMetadataLocationIsNotAMetadataJsonFile() { - // Defensive: if metadata_location somehow points at a directory (bad migration, manual - // MySQL edit, future regression), the derived parent would be a level too high — e.g. the - // whole database directory — which deletePrefix would happily wipe. Refuse instead. - HouseTableRepository repo = mock(HouseTableRepository.class); - HouseTable row = - HouseTable.builder() - .databaseId(DB) - .tableId(TABLE) - .tableLocation("/data/openhouse/test_db/test_table-uuid") // directory, not file - .build(); - when(repo.findById(any(HouseTablePrimaryKey.class))).thenReturn(Optional.of(row)); - FileIO fileIO = - mock(FileIO.class, withSettings().extraInterfaces(SupportsPrefixOperations.class)); - OpenHouseInternalCatalog catalog = new FixedFileIOCatalog(fileIO); - catalog.houseTableRepository = repo; - - Assertions.assertThrows(IllegalStateException.class, () -> catalog.dropTable(IDENTIFIER, true)); - verify(repo, never()).deleteById(any(), anyBoolean()); - verify((SupportsPrefixOperations) fileIO, never()).deletePrefix(any()); - } - - @Test - void dropTableWithoutPurgeSkipsPrefixDelete() { - HouseTableRepository repo = mock(HouseTableRepository.class); - HouseTable row = - HouseTable.builder().databaseId(DB).tableId(TABLE).tableLocation(METADATA_LOCATION).build(); - when(repo.findById(any(HouseTablePrimaryKey.class))).thenReturn(Optional.of(row)); - FileIO fileIO = - mock(FileIO.class, withSettings().extraInterfaces(SupportsPrefixOperations.class)); - OpenHouseInternalCatalog catalog = new FixedFileIOCatalog(fileIO); - catalog.houseTableRepository = repo; - - Assertions.assertTrue(catalog.dropTable(IDENTIFIER, false)); - - verify(repo).deleteById(any(HouseTablePrimaryKey.class), eq(false)); - verify((SupportsPrefixOperations) fileIO, never()).deletePrefix(any()); - } - - /** Test subclass that bypasses the real {@link OpenHouseInternalCatalog#resolveFileIO} wiring. */ - private static class FixedFileIOCatalog extends OpenHouseInternalCatalog { - private final FileIO fileIO; - - FixedFileIOCatalog(FileIO fileIO) { - this.fileIO = fileIO; - } - - @Override - protected FileIO resolveFileIO(TableIdentifier identifier) { - return fileIO; - } - } - private static class TestOpenHouseInternalCatalog extends OpenHouseInternalCatalog { + boolean isValidBaseIdentifier(TableIdentifier identifier) { return isValidIdentifier(identifier); } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index 26336b818..57aef978c 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -9,7 +9,6 @@ import com.linkedin.openhouse.cluster.storage.local.LocalStorage; import com.linkedin.openhouse.cluster.storage.local.LocalStorageClient; import com.linkedin.openhouse.common.exception.InvalidTableMetadataException; -import com.linkedin.openhouse.internal.catalog.cache.TableMetadataCache; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -36,10 +35,7 @@ import java.util.Optional; import java.util.Set; import java.util.UUID; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; -import java.util.function.Supplier; import java.util.stream.Collectors; import lombok.SneakyThrows; import org.apache.commons.compress.utils.Lists; @@ -101,7 +97,6 @@ public class OpenHouseInternalTableOperationsTest { @Mock private FSDataInputStream mockFSDataInputStream; @Mock private FSDataOutputStream mockFSDataOutputStream; - private TableMetadataCache tableMetadataCache; private OpenHouseInternalTableOperations openHouseInternalTableOperations; private OpenHouseInternalTableOperations openHouseInternalTableOperationsWithMockMetrics; @@ -113,7 +108,6 @@ private static String getTempLocation() { @BeforeEach void setup() { MockitoAnnotations.openMocks(this); - tableMetadataCache = new InMemoryTableMetadataCache(); Mockito.when(mockHouseTableMapper.toHouseTable(Mockito.any(TableMetadata.class), Mockito.any())) .thenReturn(mockHouseTable); HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); @@ -126,8 +120,7 @@ void setup() { mockHouseTableMapper, TEST_TABLE_IDENTIFIER, metricsReporter, - fileIOManager, - tableMetadataCache); + fileIOManager); // Create a separate instance with mock metrics reporter for testing metrics openHouseInternalTableOperationsWithMockMetrics = @@ -137,8 +130,7 @@ void setup() { mockHouseTableMapper, TEST_TABLE_IDENTIFIER, mockMetricsReporter, - fileIOManager, - tableMetadataCache); + fileIOManager); LocalStorage localStorage = mock(LocalStorage.class); when(fileIOManager.getStorage(fileIO)).thenReturn(localStorage); @@ -1077,113 +1069,6 @@ void testCommitMetadataUpdateLatencyHasHistogramBuckets() { this::executeCommitMetadata); } - @Test - void testRefreshReusesCachedMetadataAcrossOperations() { - HouseTablePrimaryKey primaryKey = - HouseTablePrimaryKey.builder() - .databaseId(TEST_TABLE_IDENTIFIER.namespace().toString()) - .tableId(TEST_TABLE_IDENTIFIER.name()) - .build(); - when(mockHouseTableRepository.findById(primaryKey)).thenReturn(Optional.of(mockHouseTable)); - when(mockHouseTable.getTableLocation()).thenReturn("test_metadata_location"); - - OpenHouseInternalTableOperations secondOperations = - new OpenHouseInternalTableOperations( - mockHouseTableRepository, - new HadoopFileIO(new Configuration()), - mockHouseTableMapper, - TEST_TABLE_IDENTIFIER, - new MetricsReporter(new SimpleMeterRegistry(), "TEST_CATALOG", Lists.newArrayList()), - fileIOManager, - tableMetadataCache); - - try (MockedStatic parserMock = - Mockito.mockStatic(TableMetadataParser.class, Mockito.CALLS_REAL_METHODS)) { - parserMock - .when( - () -> - TableMetadataParser.read( - Mockito.any(FileIO.class), Mockito.eq("test_metadata_location"))) - .thenReturn(BASE_TABLE_METADATA); - - openHouseInternalTableOperations.refresh(); - secondOperations.refresh(); - - parserMock.verify( - () -> - TableMetadataParser.read( - Mockito.any(FileIO.class), Mockito.eq("test_metadata_location")), - times(1)); - } - } - - @Test - void testCommitSeedsCacheForSubsequentRefresh() { - AtomicReference savedHouseTable = new AtomicReference<>(); - HouseTablePrimaryKey primaryKey = - HouseTablePrimaryKey.builder() - .databaseId(TEST_TABLE_IDENTIFIER.namespace().toString()) - .tableId(TEST_TABLE_IDENTIFIER.name()) - .build(); - when(mockHouseTableMapper.toHouseTable(Mockito.any(TableMetadata.class), Mockito.any())) - .thenAnswer( - invocation -> { - TableMetadata tableMetadata = invocation.getArgument(0); - HouseTable mappedHouseTable = - HouseTable.builder() - .databaseId(TEST_TABLE_IDENTIFIER.namespace().toString()) - .tableId(TEST_TABLE_IDENTIFIER.name()) - .tableLocation( - tableMetadata.properties().get(getCanonicalFieldName("tableLocation"))) - .build(); - savedHouseTable.set(mappedHouseTable); - return mappedHouseTable; - }); - when(mockHouseTableRepository.save(Mockito.any(HouseTable.class))) - .thenAnswer( - invocation -> { - HouseTable houseTable = invocation.getArgument(0); - savedHouseTable.set(houseTable); - return houseTable; - }); - when(mockHouseTableRepository.findById(primaryKey)) - .thenAnswer(invocation -> Optional.ofNullable(savedHouseTable.get())); - - OpenHouseInternalTableOperations refreshedOperations = - new OpenHouseInternalTableOperations( - mockHouseTableRepository, - new HadoopFileIO(new Configuration()), - mockHouseTableMapper, - TEST_TABLE_IDENTIFIER, - new MetricsReporter(new SimpleMeterRegistry(), "TEST_CATALOG", Lists.newArrayList()), - fileIOManager, - tableMetadataCache); - - Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); - properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); - TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); - - try (MockedStatic parserMock = - Mockito.mockStatic(TableMetadataParser.class, Mockito.CALLS_REAL_METHODS)) { - parserMock - .when( - () -> - TableMetadataParser.write( - Mockito.any(TableMetadata.class), - Mockito.any(org.apache.iceberg.io.OutputFile.class))) - .thenAnswer(invocation -> null); - - openHouseInternalTableOperations.doCommit(BASE_TABLE_METADATA, metadata); - refreshedOperations.refresh(); - - String committedLocation = savedHouseTable.get().getTableLocation(); - Assertions.assertEquals(committedLocation, refreshedOperations.currentMetadataLocation()); - parserMock.verify( - () -> TableMetadataParser.read(Mockito.any(FileIO.class), Mockito.eq(committedLocation)), - never()); - } - } - /** * Common test method for verifying metrics exclude both database and table tags. * @@ -1211,8 +1096,7 @@ private void testMetricExcludesDatabaseTag( mockHouseTableMapper, TEST_TABLE_IDENTIFIER, realMetricsReporter, - fileIOManager, - tableMetadataCache); + fileIOManager); // Setup test-specific mocks setupFunction.accept(operationsWithRealMetrics); @@ -1274,8 +1158,7 @@ private void testMetricHasHistogramBuckets( mockHouseTableMapper, TEST_TABLE_IDENTIFIER, realMetricsReporter, - fileIOManager, - tableMetadataCache); + fileIOManager); // Setup test-specific mocks setupFunction.accept(operationsWithRealMetrics); @@ -1980,21 +1863,6 @@ void testDoCommitCreatesOtelSpans() { } } - private static final class InMemoryTableMetadataCache implements TableMetadataCache { - private final Map cache = new ConcurrentHashMap<>(); - - @Override - public TableMetadata load(String metadataLocation, Supplier metadataLoader) { - return cache.computeIfAbsent(metadataLocation, ignored -> metadataLoader.get()); - } - - @Override - public TableMetadata seed(String metadataLocation, TableMetadata tableMetadata) { - cache.put(metadataLocation, tableMetadata); - return tableMetadata; - } - } - /** * Simulates the real-world bug where a table's metadata file references a schema ID that doesn't * exist in the schemas list. Iceberg's TableMetadataParser throws IllegalArgumentException: diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfigurationTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfigurationTest.java deleted file mode 100644 index a89004bfd..000000000 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/cache/CacheConfigurationTest.java +++ /dev/null @@ -1,131 +0,0 @@ -package com.linkedin.openhouse.internal.catalog.cache; - -import com.linkedin.openhouse.internal.catalog.config.InternalCatalogSettings; -import io.micrometer.core.instrument.MeterRegistry; -import io.micrometer.core.instrument.simple.SimpleMeterRegistry; -import java.time.Duration; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.iceberg.TableMetadata; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; -import org.springframework.boot.test.context.assertj.AssertableApplicationContext; -import org.springframework.boot.test.context.runner.ApplicationContextRunner; -import org.springframework.cache.CacheManager; -import org.springframework.cache.caffeine.CaffeineCache; -import org.springframework.cache.caffeine.CaffeineCacheManager; -import org.springframework.cache.support.NoOpCacheManager; -import org.springframework.util.unit.DataSize; - -class CacheConfigurationTest { - - private final ApplicationContextRunner contextRunner = - new ApplicationContextRunner().withUserConfiguration(CacheConfiguration.class); - - private final ApplicationContextRunner tableMetadataCacheContextRunner = - new ApplicationContextRunner() - .withUserConfiguration(CacheConfiguration.class) - .withBean(SpringTableMetadataCache.class, SpringTableMetadataCache::new) - .withBean(MeterRegistry.class, SimpleMeterRegistry::new); - - @Test - public void testMetadataCacheDisabledByDefault() { - contextRunner - .withBean(InternalCatalogSettings.class, InternalCatalogSettings::new) - .run( - context -> { - Assertions.assertNull(context.getStartupFailure()); - Assertions.assertFalse( - context.getBean(InternalCatalogSettings.class).getMetadataCache().isEnabled()); - Assertions.assertTrue( - context.getBean("internalCatalogCacheManager", CacheManager.class) - instanceof NoOpCacheManager); - }); - } - - @Test - public void testEnabledMetadataCacheConfiguration() { - contextRunner - .withBean( - InternalCatalogSettings.class, - () -> buildEnabledSettings(Duration.ofMinutes(7), DataSize.ofMegabytes(42))) - .run( - context -> - assertMetadataCacheConfiguration( - context, Duration.ofMinutes(7), DataSize.ofMegabytes(42))); - } - - @Test - public void testSpringTableMetadataCacheUsesConfiguredTableMetadataCache() { - tableMetadataCacheContextRunner - .withBean( - InternalCatalogSettings.class, - () -> buildEnabledSettings(Duration.ofMinutes(7), DataSize.ofMegabytes(42))) - .run( - context -> { - CaffeineCache tableMetadataCache = - assertMetadataCacheConfiguration( - context, Duration.ofMinutes(7), DataSize.ofMegabytes(42)); - TableMetadataCache cache = context.getBean(TableMetadataCache.class); - String metadataLocation = "metadata-location"; - TableMetadata seededMetadata = Mockito.mock(TableMetadata.class); - AtomicInteger loadCount = new AtomicInteger(); - - cache.seed(metadataLocation, seededMetadata); - TableMetadata loadedMetadata = - cache.load( - metadataLocation, - () -> { - loadCount.incrementAndGet(); - return Mockito.mock(TableMetadata.class); - }); - - Assertions.assertSame(seededMetadata, loadedMetadata); - Assertions.assertEquals(0, loadCount.get()); - Assertions.assertSame( - seededMetadata, tableMetadataCache.get(metadataLocation, TableMetadata.class)); - }); - } - - private InternalCatalogSettings buildEnabledSettings(Duration ttl, DataSize maxWeight) { - InternalCatalogSettings settings = new InternalCatalogSettings(); - settings.getMetadataCache().setEnabled(true); - settings.getMetadataCache().setTtl(ttl); - settings.getMetadataCache().setMaxWeight(maxWeight); - return settings; - } - - private CaffeineCache assertMetadataCacheConfiguration( - AssertableApplicationContext context, Duration expectedTtl, DataSize expectedMaxWeight) { - Assertions.assertNull(context.getStartupFailure()); - - InternalCatalogSettings settings = context.getBean(InternalCatalogSettings.class); - Assertions.assertTrue(settings.getMetadataCache().isEnabled()); - Assertions.assertEquals(expectedTtl, settings.getMetadataCache().getTtl()); - Assertions.assertEquals(expectedMaxWeight, settings.getMetadataCache().getMaxWeight()); - - CaffeineCacheManager cacheManager = - context.getBean("internalCatalogCacheManager", CaffeineCacheManager.class); - Assertions.assertFalse(cacheManager.isAllowNullValues()); - Assertions.assertEquals(List.of("tableMetadata"), List.copyOf(cacheManager.getCacheNames())); - - CaffeineCache tableMetadataCache = (CaffeineCache) cacheManager.getCache("tableMetadata"); - Assertions.assertNotNull(tableMetadataCache); - - com.github.benmanes.caffeine.cache.Cache nativeCache = - tableMetadataCache.getNativeCache(); - Assertions.assertEquals( - expectedTtl.toNanos(), - nativeCache - .policy() - .expireAfterWrite() - .orElseThrow() - .getExpiresAfter(TimeUnit.NANOSECONDS)); - Assertions.assertEquals( - expectedMaxWeight.toBytes(), nativeCache.policy().eviction().orElseThrow().getMaximum()); - Assertions.assertTrue(nativeCache.policy().eviction().orElseThrow().isWeighted()); - return tableMetadataCache; - } -} diff --git a/integrations/python/dataloader/pyproject.toml b/integrations/python/dataloader/pyproject.toml index 40c36b2f5..e157fe613 100644 --- a/integrations/python/dataloader/pyproject.toml +++ b/integrations/python/dataloader/pyproject.toml @@ -10,14 +10,7 @@ readme = "README.md" requires-python = ">=3.10" license = {text = "BSD-2-Clause"} keywords = ["openhouse", "data-loader", "lakehouse", "iceberg", "datafusion"] -dependencies = [ - "datafusion==53.0.0", - "li-pyiceberg==0.11.5", - "requests>=2.31.0", - "sqlglot>=29.0.0", - "tenacity>=8.0.0", - "opentelemetry-api>=1.38.0", -] +dependencies = ["datafusion==53.0.0", "li-pyiceberg==0.11.5", "requests>=2.31.0", "sqlglot>=29.0.0", "tenacity>=8.0.0"] [[tool.uv.index]] url = "https://linkedin.jfrog.io/artifactory/api/pypi/openhouse-pypi/simple/" @@ -27,15 +20,7 @@ name = "openhouse-pypi" li-pyiceberg = { index = "openhouse-pypi" } [project.optional-dependencies] -dev = [ - "responses>=0.25.0", - "ruff>=0.9.0", - "pytest>=8.0.0", - "twine>=6.0.0", - "mypy>=1.14.0", - "types-requests>=2.31.0", - "opentelemetry-sdk>=1.38.0", -] +dev = ["responses>=0.25.0", "ruff>=0.9.0", "pytest>=8.0.0", "twine>=6.0.0", "mypy>=1.14.0", "types-requests>=2.31.0"] [tool.hatch.version] source = "vcs" diff --git a/integrations/python/dataloader/src/openhouse/dataloader/_table_scan_context.py b/integrations/python/dataloader/src/openhouse/dataloader/_table_scan_context.py index 1b7d3ac33..ae20bd9c5 100644 --- a/integrations/python/dataloader/src/openhouse/dataloader/_table_scan_context.py +++ b/integrations/python/dataloader/src/openhouse/dataloader/_table_scan_context.py @@ -1,7 +1,6 @@ from __future__ import annotations -from collections.abc import Mapping -from dataclasses import dataclass, field +from dataclasses import dataclass from pyiceberg.expressions import AlwaysTrue, BooleanExpression from pyiceberg.io import FileIO, load_file_io @@ -18,7 +17,6 @@ def _unpickle_scan_context( row_filter: BooleanExpression, table_id: TableIdentifier, worker_jvm_args: str | None = None, - metric_attributes: Mapping[str, str] | None = None, ) -> TableScanContext: return TableScanContext( table_metadata=table_metadata, @@ -27,7 +25,6 @@ def _unpickle_scan_context( row_filter=row_filter, table_id=table_id, worker_jvm_args=worker_jvm_args, - metric_attributes=metric_attributes if metric_attributes is not None else {}, ) @@ -45,7 +42,6 @@ class TableScanContext: table_id: Identifier for the table being scanned row_filter: Row-level filter expression pushed down to the scan worker_jvm_args: JVM arguments applied when the JNI JVM is created in worker processes - metric_attributes: Attributes attached to every metric emitted while iterating splits. """ table_metadata: TableMetadata @@ -54,7 +50,6 @@ class TableScanContext: table_id: TableIdentifier row_filter: BooleanExpression = AlwaysTrue() worker_jvm_args: str | None = None - metric_attributes: Mapping[str, str] = field(default_factory=dict) def __reduce__(self) -> tuple: return ( @@ -66,6 +61,5 @@ def __reduce__(self) -> tuple: self.row_filter, self.table_id, self.worker_jvm_args, - dict(self.metric_attributes), ), ) diff --git a/integrations/python/dataloader/src/openhouse/dataloader/data_loader.py b/integrations/python/dataloader/src/openhouse/dataloader/data_loader.py index da9a3e943..26424d52c 100644 --- a/integrations/python/dataloader/src/openhouse/dataloader/data_loader.py +++ b/integrations/python/dataloader/src/openhouse/dataloader/data_loader.py @@ -1,8 +1,6 @@ from __future__ import annotations import logging -import time -import uuid from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from dataclasses import dataclass from functools import cached_property @@ -10,7 +8,6 @@ from types import MappingProxyType from typing import TypeVar -from opentelemetry.metrics import Counter, Histogram, get_meter from pyiceberg.catalog import Catalog from pyiceberg.table import Table from pyiceberg.table.snapshots import Snapshot @@ -30,7 +27,6 @@ _to_pyiceberg, always_true, ) -from openhouse.dataloader.metrics import METER_NAME from openhouse.dataloader.scan_optimizer import optimize_scan from openhouse.dataloader.table_identifier import TableIdentifier from openhouse.dataloader.table_transformer import TableTransformer @@ -38,39 +34,6 @@ logger = logging.getLogger(__name__) -_meter = get_meter(METER_NAME) - -_load_table_duration = _meter.create_histogram( - name="OpenHouse.DataLoader.LoadTableTime", - unit="s", - description="Time spent loading the Iceberg table from the catalog.", -) -_load_table_success = _meter.create_counter( - name="OpenHouse.DataLoader.LoadTableSuccess", - unit="1", - description="Successful loads of the Iceberg table from the catalog.", -) -_load_table_failure = _meter.create_counter( - name="OpenHouse.DataLoader.LoadTableFailure", - unit="1", - description="Failed loads of the Iceberg table from the catalog.", -) -_plan_files_duration = _meter.create_histogram( - name="OpenHouse.DataLoader.PlanFilesTime", - unit="s", - description="Time spent planning which files to scan.", -) -_plan_files_success = _meter.create_counter( - name="OpenHouse.DataLoader.PlanFilesSuccess", - unit="1", - description="Successful file-planning operations for the scan.", -) -_plan_files_failure = _meter.create_counter( - name="OpenHouse.DataLoader.PlanFilesFailure", - unit="1", - description="Failed file-planning operations for the scan.", -) - def _is_transient(exc: BaseException) -> bool: """Return True if the exception is transient and worth retrying.""" @@ -88,38 +51,22 @@ def _batched(iterable: Iterable[_T], n: int) -> Iterator[tuple[_T, ...]]: yield batch -def _retry( - fn: Callable[[], _T], - label: str, - max_attempts: int, - duration_histogram: Histogram, - success_counter: Counter, - failure_counter: Counter, - attributes: Mapping[str, str], -) -> _T: - """Call *fn* with retry logic, logging the duration and recording the outcome. +def _retry(fn: Callable[[], _T], label: str, max_attempts: int) -> _T: + """Call *fn* with retry logic, logging duration of each attempt. Retries on ``OSError`` (transient network/storage I/O failures), except ``HTTPError`` which is only retried for 5xx status codes. Uses exponential backoff with up to *max_attempts* total attempts. """ - overall_start = time.monotonic() - succeeded = False - try: - for attempt in Retrying( - retry=retry_if_exception(_is_transient), - stop=stop_after_attempt(max_attempts), - wait=wait_exponential(), - reraise=True, - ): - with attempt, log_duration(logger, "%s (attempt %d)", label, attempt.retry_state.attempt_number): - result = fn() - succeeded = True - return result - raise AssertionError("unreachable") # pragma: no cover - finally: - duration_histogram.record(time.monotonic() - overall_start, attributes) - (success_counter if succeeded else failure_counter).add(1, attributes) + for attempt in Retrying( + retry=retry_if_exception(_is_transient), + stop=stop_after_attempt(max_attempts), + wait=wait_exponential(), + reraise=True, + ): + with attempt, log_duration(logger, "%s (attempt %d)", label, attempt.retry_state.attempt_number): + return fn() + raise AssertionError("unreachable") # pragma: no cover @dataclass(frozen=True) @@ -154,7 +101,6 @@ class DataLoaderContext: Args: execution_context: Dictionary of execution context information (e.g. tenant, environment) - metric_attribute_keys: Keys from ``execution_context`` to attach as dimensions on emitted metrics. table_transformer: Transformation to apply to the table before loading (e.g. column masking) udf_registry: UDFs required for the table transformation jvm_config: JVM configuration for JNI-based storage access. Currently only HDFS is supported @@ -162,7 +108,6 @@ class DataLoaderContext: """ execution_context: Mapping[str, str] | None = None - metric_attribute_keys: Sequence[str] | None = None table_transformer: TableTransformer | None = None udf_registry: UDFRegistry | None = None jvm_config: JvmConfig | None = None @@ -218,42 +163,18 @@ def __init__( self._max_attempts = max_attempts self._batch_size = batch_size self._files_per_split = files_per_split - self._id = f"dataloader-{uuid.uuid4()}" if self._context.jvm_config is not None and self._context.jvm_config.planner_args is not None: apply_libhdfs_opts(self._context.jvm_config.planner_args) - @cached_property - def _resolved_metric_attributes(self) -> Mapping[str, str]: - attrs: dict[str, str] = { - "OpenHouse.Database": self._table_id.database, - "OpenHouse.Table": self._table_id.table, - } - keys = self._context.metric_attribute_keys - if keys: - execution_context = self._context.execution_context or {} - for k in keys: - if k in execution_context: - attrs[k] = execution_context[k] - return attrs - @cached_property def _iceberg_table(self) -> Table: return _retry( lambda: self._catalog.load_table((self._table_id.database, self._table_id.table)), label=f"load_table {self._table_id}", max_attempts=self._max_attempts, - duration_histogram=_load_table_duration, - success_counter=_load_table_success, - failure_counter=_load_table_failure, - attributes=self._resolved_metric_attributes, ) - @property - def id(self) -> str: - """Unique identifier for this data loader instance, generated at construction.""" - return self._id - @property def table_properties(self) -> Mapping[str, str]: """Properties of the table being loaded""" @@ -355,19 +276,12 @@ def __iter__(self) -> Iterator[DataLoaderSplit]: row_filter=row_filter, table_id=self._table_id, worker_jvm_args=self._context.jvm_config.worker_args if self._context.jvm_config else None, - metric_attributes=self._resolved_metric_attributes, ) # plan_files() materializes all tasks at once (PyIceberg doesn't support streaming) # Manifests are read in parallel with one thread per manifest scan_tasks = _retry( - lambda: scan.plan_files(), - label=f"plan_files {self._table_id}", - max_attempts=self._max_attempts, - duration_histogram=_plan_files_duration, - success_counter=_plan_files_success, - failure_counter=_plan_files_failure, - attributes=self._resolved_metric_attributes, + lambda: scan.plan_files(), label=f"plan_files {self._table_id}", max_attempts=self._max_attempts ) for chunk in _batched(scan_tasks, self._files_per_split): diff --git a/integrations/python/dataloader/src/openhouse/dataloader/data_loader_split.py b/integrations/python/dataloader/src/openhouse/dataloader/data_loader_split.py index e9bd9ae82..77e8c1d81 100644 --- a/integrations/python/dataloader/src/openhouse/dataloader/data_loader_split.py +++ b/integrations/python/dataloader/src/openhouse/dataloader/data_loader_split.py @@ -10,7 +10,6 @@ from datafusion import SessionConfig from datafusion.context import SessionContext -from opentelemetry.metrics import get_meter from pyarrow import RecordBatch from pyiceberg.io.pyarrow import ArrowScan from pyiceberg.table import ArrivalOrder, FileScanTask @@ -19,70 +18,11 @@ from openhouse.dataloader._table_scan_context import TableScanContext from openhouse.dataloader._timer import log_duration from openhouse.dataloader.filters import _quote_identifier -from openhouse.dataloader.metrics import METER_NAME from openhouse.dataloader.table_identifier import TableIdentifier from openhouse.dataloader.udf_registry import NoOpRegistry, UDFRegistry logger = logging.getLogger(__name__) -_meter = get_meter(METER_NAME) - -_split_duration = _meter.create_histogram( - name="OpenHouse.DataLoader.SplitTime", - unit="s", - description="Time spent iterating a split.", -) -_split_files = _meter.create_histogram( - name="OpenHouse.DataLoader.SplitFiles", - unit="1", - description="Number of files in a split.", -) -_split_rows = _meter.create_histogram( - name="OpenHouse.DataLoader.SplitRows", - unit="1", - description="Rows yielded by a split.", -) -_split_bytes = _meter.create_histogram( - name="OpenHouse.DataLoader.SplitBytes", - unit="By", - description="Bytes yielded by a split.", -) -_split_batches = _meter.create_histogram( - name="OpenHouse.DataLoader.SplitBatches", - unit="1", - description="Record batches yielded by a split.", -) -_split_errors = _meter.create_counter( - name="OpenHouse.DataLoader.SplitErrors", - unit="1", - description="Errors raised while iterating a split.", -) -_batch_duration = _meter.create_histogram( - name="OpenHouse.DataLoader.BatchTime", - unit="s", - description="Time spent reading a record batch.", -) -_batch_rows = _meter.create_histogram( - name="OpenHouse.DataLoader.BatchRows", - unit="1", - description="Rows in a record batch.", -) -_batch_bytes = _meter.create_histogram( - name="OpenHouse.DataLoader.BatchBytes", - unit="By", - description="Bytes in a record batch.", -) -_batch_errors = _meter.create_counter( - name="OpenHouse.DataLoader.BatchErrors", - unit="1", - description="Errors raised while reading a record batch.", -) -_transform_duration = _meter.create_histogram( - name="OpenHouse.DataLoader.TransformTime", - unit="s", - description="Time spent applying the transform to a record batch.", -) - def to_sql_identifier(table_id: TableIdentifier) -> str: """Return the quoted DataFusion SQL identifier, e.g. ``"db"."tbl"``.""" @@ -117,21 +57,12 @@ def _bind_batch_table(session: SessionContext, table_id: TableIdentifier, batch: class _TimedBatchIter: - """Wraps a RecordBatch iterator to log and emit metrics for each ``next()`` call.""" + """Wraps a RecordBatch iterator to log the wall-clock time of each ``next()`` call.""" - def __init__( - self, - inner: Iterator[RecordBatch], - split_id: str, - attributes: Mapping[str, str], - ) -> None: + def __init__(self, inner: Iterator[RecordBatch], split_id: str) -> None: self._inner = inner self._split_id = split_id - self._attributes = attributes self._idx = 0 - self.total_rows = 0 - self.total_bytes = 0 - self.batch_count = 0 def __iter__(self) -> _TimedBatchIter: return self @@ -143,20 +74,11 @@ def __next__(self) -> RecordBatch: except StopIteration: raise except Exception: - elapsed = time.monotonic() - start - logger.warning("record_batch %s [%d] failed after %.3fs", self._split_id, self._idx, elapsed) - _batch_errors.add(1, self._attributes) + logger.warning( + "record_batch %s [%d] failed after %.3fs", self._split_id, self._idx, time.monotonic() - start + ) raise - elapsed = time.monotonic() - start - logger.info("record_batch %s [%d] in %.3fs", self._split_id, self._idx, elapsed) - rows = batch.num_rows - nbytes = batch.nbytes - _batch_duration.record(elapsed, self._attributes) - _batch_rows.record(rows, self._attributes) - _batch_bytes.record(nbytes, self._attributes) - self.total_rows += rows - self.total_bytes += nbytes - self.batch_count += 1 + logger.info("record_batch %s [%d] in %.3fs", self._split_id, self._idx, time.monotonic() - start) self._idx += 1 return batch @@ -166,16 +88,11 @@ def _timed_transform( split_id: str, session: SessionContext, apply_fn: Callable[[SessionContext, RecordBatch], Iterator[RecordBatch]], - attributes: Mapping[str, str], ) -> Iterator[RecordBatch]: - """Apply a transform to each batch, logging and recording the wall-clock time of each.""" + """Apply a transform to each batch, logging the wall-clock time of each.""" for idx, batch in enumerate(batches): - transform_start = time.monotonic() - try: - with log_duration(logger, "transform_batch %s [%d]", split_id, idx): - transformed = list(apply_fn(session, batch)) - finally: - _transform_duration.record(time.monotonic() - transform_start, attributes) + with log_duration(logger, "transform_batch %s [%d]", split_id, idx): + transformed = list(apply_fn(session, batch)) yield from transformed @@ -223,48 +140,34 @@ def __iter__(self) -> Iterator[RecordBatch]: ctx = self._scan_context if ctx.worker_jvm_args is not None: apply_libhdfs_opts(ctx.worker_jvm_args) - attributes = ctx.metric_attributes - split_start = time.monotonic() - timed: _TimedBatchIter | None = None - try: - arrow_scan = ArrowScan( - table_metadata=ctx.table_metadata, - io=ctx.io, - projected_schema=ctx.projected_schema, - row_filter=ctx.row_filter, + arrow_scan = ArrowScan( + table_metadata=ctx.table_metadata, + io=ctx.io, + projected_schema=ctx.projected_schema, + row_filter=ctx.row_filter, + ) + + split_id = self.id[:12] + + with log_duration(logger, "setup_scan %s", split_id): + batches = arrow_scan.to_record_batches( + self._file_scan_tasks, + order=ArrivalOrder(concurrent_streams=len(self._file_scan_tasks), batch_size=self._batch_size), ) - split_id = self.id[:12] - - with log_duration(logger, "setup_scan %s", split_id): - batches = arrow_scan.to_record_batches( - self._file_scan_tasks, - order=ArrivalOrder(concurrent_streams=len(self._file_scan_tasks), batch_size=self._batch_size), - ) - - timed = _TimedBatchIter(iter(batches), split_id, attributes) - - if self._transform_sql is None: - yield from timed - else: - # Materialize the first batch before creating the transform session - # so that the HDFS JVM starts (and picks up worker_jvm_args) before - # any UDF registration code can trigger JNI. - first = next(timed, None) - if first is None: - return - session = _create_transform_session(self._scan_context.table_id, self._udf_registry, self._batch_size) - yield from _timed_transform(chain([first], timed), split_id, session, self._apply_transform, attributes) - except BaseException: - _split_errors.add(1, attributes) - raise - finally: - _split_duration.record(time.monotonic() - split_start, attributes) - _split_files.record(len(self._file_scan_tasks), attributes) - if timed is not None: - _split_rows.record(timed.total_rows, attributes) - _split_bytes.record(timed.total_bytes, attributes) - _split_batches.record(timed.batch_count, attributes) + timed = _TimedBatchIter(iter(batches), split_id) + + if self._transform_sql is None: + yield from timed + else: + # Materialize the first batch before creating the transform session + # so that the HDFS JVM starts (and picks up worker_jvm_args) before + # any UDF registration code can trigger JNI. + first = next(timed, None) + if first is None: + return + session = _create_transform_session(self._scan_context.table_id, self._udf_registry, self._batch_size) + yield from _timed_transform(chain([first], timed), split_id, session, self._apply_transform) def _apply_transform(self, session: SessionContext, batch: RecordBatch) -> Iterator[RecordBatch]: """Execute the transform SQL against a single RecordBatch.""" diff --git a/integrations/python/dataloader/src/openhouse/dataloader/filters.py b/integrations/python/dataloader/src/openhouse/dataloader/filters.py index a725eaa5e..248012c2c 100644 --- a/integrations/python/dataloader/src/openhouse/dataloader/filters.py +++ b/integrations/python/dataloader/src/openhouse/dataloader/filters.py @@ -326,27 +326,25 @@ def _escape_like(value: str) -> str: def _literal_to_sql(value: object) -> str: - """Convert a Python literal to a SQL literal string using sqlglot. - - Datetime/date/time values are emitted as plain string literals (ISO format). - DataFusion implicitly coerces string literals to the column type at execution, - and PyIceberg promotes StringLiteral to the matching typed literal during expression binding. - """ + """Convert a Python literal to a SQL literal string using sqlglot.""" if isinstance(value, str): return exp.Literal.string(value).sql() if isinstance(value, bool): return exp.Boolean(this=True).sql() if value else exp.Boolean(this=False).sql() if isinstance(value, datetime): - return exp.Literal.string(value.isoformat()).sql() + lit = exp.Literal.string(value.strftime("%Y-%m-%d %H:%M:%S.%f%z")) + return exp.Cast(this=lit, to=exp.DataType.build("TIMESTAMP")).sql() if isinstance(value, date): - return exp.Literal.string(value.isoformat()).sql() + lit = exp.Literal.string(value.isoformat()) + return exp.Cast(this=lit, to=exp.DataType.build("DATE")).sql() if isinstance(value, time): if value.tzinfo is not None: raise TypeError( "DataFusion does not support timezones for time data types. " "The time should match the timezone used in the dataset." ) - return exp.Literal.string(value.isoformat()).sql() + lit = exp.Literal.string(value.strftime("%H:%M:%S.%f")) + return exp.Cast(this=lit, to=exp.DataType.build("TIME")).sql() if isinstance(value, (int, float)): if isinstance(value, float) and not math.isfinite(value): return exp.Cast(this=exp.Literal.string(str(value)), to=exp.DataType.build("DOUBLE")).sql() diff --git a/integrations/python/dataloader/src/openhouse/dataloader/metrics/__init__.py b/integrations/python/dataloader/src/openhouse/dataloader/metrics/__init__.py deleted file mode 100644 index d41e2aff1..000000000 --- a/integrations/python/dataloader/src/openhouse/dataloader/metrics/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""OpenTelemetry metrics infrastructure for the dataloader. - -This package depends only on ``opentelemetry-api``, which provides a no-op -fallback when no SDK is configured. The *application* (not this library) -is responsible for installing an SDK and configuring exporters. - -Call sites should obtain a ``Meter`` via the OTEL API directly:: - - from opentelemetry.metrics import get_meter - from openhouse.dataloader.metrics import METER_NAME - - meter = get_meter(METER_NAME) -""" - -METER_NAME = "OpenHouse.DataLoader" - -__all__ = ["METER_NAME"] diff --git a/integrations/python/dataloader/tests/integration_tests.py b/integrations/python/dataloader/tests/integration_tests.py index bb183a99c..538e4dd62 100644 --- a/integrations/python/dataloader/tests/integration_tests.py +++ b/integrations/python/dataloader/tests/integration_tests.py @@ -5,7 +5,6 @@ network as the oh-hadoop-spark Docker Compose services. """ -import datetime as _dt import logging import multiprocessing import os @@ -20,9 +19,7 @@ from openhouse.dataloader import DataLoaderContext, JvmConfig, OpenHouseDataLoader from openhouse.dataloader.catalog import OpenHouseCatalog -from openhouse.dataloader.data_loader_split import to_sql_identifier from openhouse.dataloader.filters import col -from openhouse.dataloader.table_transformer import TableTransformer BASE_URL = "http://openhouse-tables:8080" LIVY_URL = "http://spark-livy:8998" @@ -329,81 +326,6 @@ def read_token() -> str: ) print(f"PASS: worker_jvm_args honored by child JVM (MaxHeapSize={worker_heap})") - # 9. Day-partitioned table: datetime filters must prune partitions. - part_table = "t_part_itest" - part_fqtn = f"openhouse.{DATABASE_ID}.{part_table}" - livy.execute(f"CREATE TABLE {part_fqtn} (id BIGINT, ts TIMESTAMP) USING iceberg PARTITIONED BY (days(ts))") - try: - livy.execute( - f"INSERT INTO {part_fqtn} VALUES " - f"(1, TIMESTAMP '2026-05-02 00:00:00'), " - f"(2, TIMESTAMP '2026-05-03 00:00:00'), " - f"(3, TIMESTAMP '2026-05-08 00:00:00')" - ) - - # A trivial passthrough transformer forces OpenHouseDataLoader into the - # SQL roundtrip path (filters -> DataFusion SQL -> sqlglot -> scan_optimizer -> - # PyIceberg expression). Without a transformer, _build_query() returns None - # and the loader skips that path entirely, which would mean a CAST(literal, - # TIMESTAMP) regression in _literal_to_sql / scan_optimizer would go unnoticed. - class _PartPassthroughTransformer(TableTransformer): - def __init__(self): - super().__init__(dialect="datafusion") - - def transform(self, table, context): - return f'SELECT "id", "ts" FROM {to_sql_identifier(table)}' - - part_ctx = DataLoaderContext(table_transformer=_PartPassthroughTransformer()) - - loader = OpenHouseDataLoader(catalog=catalog, database=DATABASE_ID, table=part_table, context=part_ctx) - assert _read_all(loader).num_rows == 3 - print("PASS: partitioned table read all 3 rows with no filter") - - # Assert on split count, not just final rows. DataFusion's WHERE clause - # still filters correctly even if manifest pruning is silently dropped, so - # row-count assertions miss a CAST-handling regression. Split count is the - # direct signal that PyIceberg saw the predicate and pruned partition files. - range_filter = (col("ts") >= _dt.datetime(2026, 5, 2, tzinfo=_dt.timezone.utc)) & ( - col("ts") < _dt.datetime(2026, 5, 4, tzinfo=_dt.timezone.utc) - ) - range_loader = OpenHouseDataLoader( - catalog=catalog, - database=DATABASE_ID, - table=part_table, - filters=range_filter, - context=part_ctx, - ) - range_splits = list(range_loader) - assert len(range_splits) == 2, ( - f"Expected 2 splits from datetime range filter (5/2 + 5/3 partitions pruned to 5/2 + 5/3 splits), " - f"got {len(range_splits)}" - ) - result = _read_all(range_loader) - assert result.column("id").to_pylist() == [1, 2], ( - f"Expected ids [1, 2] from datetime range filter, got {result.column('id').to_pylist()}" - ) - print(f"PASS: datetime range filter returned {result.num_rows} rows from {len(range_splits)} splits") - - tight_filter = col("ts") >= _dt.datetime(2026, 5, 8, tzinfo=_dt.timezone.utc) - tight_loader = OpenHouseDataLoader( - catalog=catalog, - database=DATABASE_ID, - table=part_table, - filters=tight_filter, - context=part_ctx, - ) - tight_splits = list(tight_loader) - assert len(tight_splits) == 1, ( - f"Expected 1 split from tight datetime filter (only 5/8 partition survives), got {len(tight_splits)}" - ) - result = _read_all(tight_loader) - assert result.column("id").to_pylist() == [3], ( - f"Expected id [3] from tight datetime filter, got {result.column('id').to_pylist()}" - ) - print(f"PASS: tight datetime filter returned {result.num_rows} row from {len(tight_splits)} split") - finally: - livy.execute(f"DROP TABLE IF EXISTS {part_fqtn}") - print("All integration tests passed") finally: livy.close() diff --git a/integrations/python/dataloader/tests/test_data_loader.py b/integrations/python/dataloader/tests/test_data_loader.py index a66ee29ca..c120d2ba7 100644 --- a/integrations/python/dataloader/tests/test_data_loader.py +++ b/integrations/python/dataloader/tests/test_data_loader.py @@ -125,18 +125,6 @@ def test_table_properties_returns_metadata_properties(tmp_path): assert loader.table_properties["custom.key"] == "myvalue" -def test_id_is_unique_per_loader_instance(tmp_path): - catalog = _make_real_catalog(tmp_path) - - loader_a = OpenHouseDataLoader(catalog=catalog, database="db", table="tbl") - loader_b = OpenHouseDataLoader(catalog=catalog, database="db", table="tbl") - - assert isinstance(loader_a.id, str) - assert loader_a.id.startswith("dataloader-") - assert loader_a.id == loader_a.id - assert loader_a.id != loader_b.id - - def test_snapshot_id_returns_current_snapshot_id(tmp_path): catalog = _make_real_catalog(tmp_path) diff --git a/integrations/python/dataloader/tests/test_filters.py b/integrations/python/dataloader/tests/test_filters.py index 2bc74e612..67d2d3f08 100644 --- a/integrations/python/dataloader/tests/test_filters.py +++ b/integrations/python/dataloader/tests/test_filters.py @@ -356,55 +356,58 @@ class TestDataFusionLiteralConversion: def test_datetime_greater_than_or_equal(self): dt = datetime(2026, 4, 27, tzinfo=UTC) result = _to_datafusion_sql(col("datepartition") >= dt) - assert result == "\"datepartition\" >= '2026-04-27T00:00:00+00:00'" + assert result == "\"datepartition\" >= CAST('2026-04-27 00:00:00.000000+0000' AS TIMESTAMP)" def test_datetime_equal(self): dt = datetime(2026, 4, 27, 12, 30, 45, tzinfo=UTC) result = _to_datafusion_sql(col("ts") == dt) - assert result == "\"ts\" = '2026-04-27T12:30:45+00:00'" + assert result == "\"ts\" = CAST('2026-04-27 12:30:45.000000+0000' AS TIMESTAMP)" def test_datetime_with_microseconds(self): dt = datetime(2026, 4, 27, 12, 30, 45, 123456, tzinfo=UTC) result = _to_datafusion_sql(col("ts") == dt) - assert result == "\"ts\" = '2026-04-27T12:30:45.123456+00:00'" + assert result == "\"ts\" = CAST('2026-04-27 12:30:45.123456+0000' AS TIMESTAMP)" def test_datetime_non_utc_timezone_preserved(self): dt = datetime(2026, 4, 27, 12, 0, 0, tzinfo=timezone(timedelta(hours=5))) result = _to_datafusion_sql(col("ts") >= dt) - assert result == "\"ts\" >= '2026-04-27T12:00:00+05:00'" + assert result == "\"ts\" >= CAST('2026-04-27 12:00:00.000000+0500' AS TIMESTAMP)" def test_datetime_naive_no_offset(self): dt = datetime(2026, 4, 27, 12, 0, 0) result = _to_datafusion_sql(col("ts") >= dt) - assert result == "\"ts\" >= '2026-04-27T12:00:00'" + assert result == "\"ts\" >= CAST('2026-04-27 12:00:00.000000' AS TIMESTAMP)" def test_date_greater_than_or_equal(self): d = date(2026, 4, 27) result = _to_datafusion_sql(col("datepartition") >= d) - assert result == "\"datepartition\" >= '2026-04-27'" + assert result == "\"datepartition\" >= CAST('2026-04-27' AS DATE)" def test_datetime_between(self): dt1 = datetime(2026, 4, 27, tzinfo=UTC) dt2 = datetime(2026, 5, 1, tzinfo=UTC) result = _to_datafusion_sql(col("ts").between(dt1, dt2)) - assert result == "\"ts\" BETWEEN '2026-04-27T00:00:00+00:00' AND '2026-05-01T00:00:00+00:00'" + assert result == ( + "\"ts\" BETWEEN CAST('2026-04-27 00:00:00.000000+0000' AS TIMESTAMP)" + " AND CAST('2026-05-01 00:00:00.000000+0000' AS TIMESTAMP)" + ) def test_datetime_in_compound_filter(self): dt = datetime(2026, 4, 27, tzinfo=UTC) f = (col("datepartition") >= dt) & (col("status") == "active") result = _to_datafusion_sql(f) - assert "'2026-04-27T00:00:00+00:00'" in result + assert "CAST('2026-04-27 00:00:00.000000+0000' AS TIMESTAMP)" in result assert "\"status\" = 'active'" in result def test_time_equal(self): t = time(14, 30, 0) result = _to_datafusion_sql(col("event_time") == t) - assert result == "\"event_time\" = '14:30:00'" + assert result == "\"event_time\" = CAST('14:30:00.000000' AS TIME)" def test_time_with_microseconds(self): t = time(14, 30, 0, 500000) result = _to_datafusion_sql(col("event_time") == t) - assert result == "\"event_time\" = '14:30:00.500000'" + assert result == "\"event_time\" = CAST('14:30:00.500000' AS TIME)" def test_time_with_timezone_rejected(self): t = time(14, 30, 0, tzinfo=timezone(timedelta(hours=5))) diff --git a/integrations/python/dataloader/tests/test_metrics.py b/integrations/python/dataloader/tests/test_metrics.py deleted file mode 100644 index 6f074f127..000000000 --- a/integrations/python/dataloader/tests/test_metrics.py +++ /dev/null @@ -1,379 +0,0 @@ -"""Tests for the OpenTelemetry metrics emitted by the dataloader.""" - -from __future__ import annotations - -import os -import pickle -from collections.abc import Iterator -from unittest.mock import MagicMock - -import pyarrow as pa -import pyarrow.parquet as pq -import pytest -from opentelemetry import metrics as otel_metrics -from opentelemetry.metrics import Meter, get_meter -from opentelemetry.metrics import _internal as otel_metrics_internal -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import InMemoryMetricReader -from pyiceberg.io import load_file_io -from pyiceberg.manifest import DataFile, FileFormat -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC -from pyiceberg.schema import Schema -from pyiceberg.table import FileScanTask -from pyiceberg.table.metadata import new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER -from pyiceberg.types import LongType, NestedField - -from openhouse.dataloader import DataLoaderContext, OpenHouseDataLoader -from openhouse.dataloader._table_scan_context import TableScanContext -from openhouse.dataloader.data_loader import ( - _load_table_duration, - _load_table_failure, - _load_table_success, - _plan_files_duration, - _plan_files_failure, - _plan_files_success, - _retry, -) -from openhouse.dataloader.data_loader_split import DataLoaderSplit -from openhouse.dataloader.metrics import METER_NAME -from openhouse.dataloader.table_identifier import TableIdentifier - -# --- Meter / METER_NAME basics --- - - -def test_meter_name_is_stable(): - assert METER_NAME == "OpenHouse.DataLoader" - - -def test_get_meter_with_meter_name_returns_a_meter(): - assert isinstance(get_meter(METER_NAME), Meter) - - -# --- DataLoaderContext.metric_attribute_keys resolution --- - - -def _loader(context: DataLoaderContext) -> OpenHouseDataLoader: - return OpenHouseDataLoader(catalog=MagicMock(), database="db", table="tbl", context=context) - - -_BASE_ATTRS = {"OpenHouse.Database": "db", "OpenHouse.Table": "tbl"} - - -def test_resolved_metric_attributes_includes_table_identifier_only_by_default(): - loader = _loader(DataLoaderContext()) - assert dict(loader._resolved_metric_attributes) == _BASE_ATTRS - - -def test_resolved_metric_attributes_picks_whitelisted_keys(): - loader = _loader( - DataLoaderContext( - execution_context={"tenant": "t1", "env": "prod", "user_id": "u-42"}, - metric_attribute_keys=["tenant", "env"], - ) - ) - assert dict(loader._resolved_metric_attributes) == {**_BASE_ATTRS, "tenant": "t1", "env": "prod"} - - -def test_resolved_metric_attributes_skips_missing_keys(): - loader = _loader( - DataLoaderContext( - execution_context={"tenant": "t1"}, - metric_attribute_keys=["tenant", "env"], - ) - ) - assert dict(loader._resolved_metric_attributes) == {**_BASE_ATTRS, "tenant": "t1"} - - -def test_resolved_metric_attributes_no_extras_when_no_keys_configured(): - loader = _loader(DataLoaderContext(execution_context={"tenant": "t1"})) - assert dict(loader._resolved_metric_attributes) == _BASE_ATTRS - - -def test_resolved_metric_attributes_no_extras_when_execution_context_missing(): - loader = _loader(DataLoaderContext(metric_attribute_keys=["tenant"])) - assert dict(loader._resolved_metric_attributes) == _BASE_ATTRS - - -# --- InMemoryMetricReader harness --- - - -@pytest.fixture -def metrics_reader() -> Iterator[InMemoryMetricReader]: - """Install an SDK MeterProvider with an InMemoryMetricReader for the test. - - Resets the one-shot ``_METER_PROVIDER_SET_ONCE`` guard and restores the - prior MeterProvider on exit so other tests are not affected. - """ - reader = InMemoryMetricReader() - provider = MeterProvider(metric_readers=[reader]) - once = otel_metrics_internal._METER_PROVIDER_SET_ONCE - prior_provider = otel_metrics_internal._METER_PROVIDER - prior_done = once._done - once._done = False - otel_metrics.set_meter_provider(provider) - try: - yield reader - finally: - otel_metrics_internal._METER_PROVIDER = prior_provider - once._done = prior_done - - -def _data_points(reader: InMemoryMetricReader, metric_name: str) -> list: - """Collect and return all data points for *metric_name* across scopes. - - ``metric_name`` must be the lowercase form stored by the SDK — the - OpenTelemetry SDK lowercases instrument names at registration time - (``opentelemetry/sdk/metrics/_internal/instrument.py``), even though - the declared names are PascalCase. - """ - data = reader.get_metrics_data() - points: list = [] - if data is None: - return points - for resource_metric in data.resource_metrics: - for scope_metric in resource_metric.scope_metrics: - for metric in scope_metric.metrics: - if metric.name == metric_name: - points.extend(metric.data.data_points) - return points - - -def _attrs(point) -> dict: - return dict(point.attributes) - - -# --- _retry success / failure / duration --- - - -def test_retry_emits_success_and_duration_on_first_try(metrics_reader): - attrs = {"OpenHouse.Database": "db", "OpenHouse.Table": "tbl"} - result = _retry( - lambda: "ok", - label="load_table db.tbl", - max_attempts=3, - duration_histogram=_load_table_duration, - success_counter=_load_table_success, - failure_counter=_load_table_failure, - attributes=attrs, - ) - assert result == "ok" - - successes = _data_points(metrics_reader, "openhouse.dataloader.loadtablesuccess") - assert len(successes) == 1 - assert _attrs(successes[0]) == attrs - assert successes[0].value == 1 - - assert _data_points(metrics_reader, "openhouse.dataloader.loadtablefailure") == [] - - durations = _data_points(metrics_reader, "openhouse.dataloader.loadtabletime") - assert len(durations) == 1 - assert _attrs(durations[0]) == attrs - - -def test_retry_emits_single_success_after_transient_retry(metrics_reader): - attrs = {"OpenHouse.Database": "db", "OpenHouse.Table": "tbl", "Tenant": "t1"} - calls = {"n": 0} - - def fn(): - calls["n"] += 1 - if calls["n"] == 1: - raise OSError("transient") - return "ok" - - result = _retry( - fn, - label="plan_files db.tbl", - max_attempts=3, - duration_histogram=_plan_files_duration, - success_counter=_plan_files_success, - failure_counter=_plan_files_failure, - attributes=attrs, - ) - assert result == "ok" - assert calls["n"] == 2 - - successes = _data_points(metrics_reader, "openhouse.dataloader.planfilessuccess") - assert len(successes) == 1 - assert successes[0].value == 1 - assert _attrs(successes[0])["Tenant"] == "t1" - - assert _data_points(metrics_reader, "openhouse.dataloader.planfilesfailure") == [] - - durations = _data_points(metrics_reader, "openhouse.dataloader.planfilestime") - assert len(durations) == 1 - - -def test_retry_emits_failure_and_duration_on_permanent_failure(metrics_reader): - attrs = {"OpenHouse.Database": "db", "OpenHouse.Table": "tbl"} - - class _NonTransient(Exception): - pass - - def fn(): - raise _NonTransient("nope") - - with pytest.raises(_NonTransient): - _retry( - fn, - label="load_table", - max_attempts=3, - duration_histogram=_load_table_duration, - success_counter=_load_table_success, - failure_counter=_load_table_failure, - attributes=attrs, - ) - - failures = _data_points(metrics_reader, "openhouse.dataloader.loadtablefailure") - assert len(failures) == 1 - assert failures[0].value == 1 - - assert _data_points(metrics_reader, "openhouse.dataloader.loadtablesuccess") == [] - - durations = _data_points(metrics_reader, "openhouse.dataloader.loadtabletime") - assert len(durations) == 1 - - -# --- DataLoaderSplit instrumentation --- - -_SPLIT_SCHEMA = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) -_SPLIT_TABLE_ID = TableIdentifier("db", "tbl") - - -def _make_split( - tmp_path, - metric_attributes: dict | None = None, - transform_sql: str | None = None, -) -> DataLoaderSplit: - file_path = str(tmp_path / "data.parquet") - table = pa.table({"id": pa.array([1, 2, 3], type=pa.int64())}) - fields = [field.with_metadata({b"PARQUET:field_id": str(i + 1).encode()}) for i, field in enumerate(table.schema)] - pq.write_table(table.cast(pa.schema(fields)), file_path) - - metadata = new_table_metadata( - schema=_SPLIT_SCHEMA, - partition_spec=UNPARTITIONED_PARTITION_SPEC, - sort_order=UNSORTED_SORT_ORDER, - location=str(tmp_path), - ) - scan_context = TableScanContext( - table_metadata=metadata, - io=load_file_io(properties={}, location=file_path), - projected_schema=_SPLIT_SCHEMA, - table_id=_SPLIT_TABLE_ID, - metric_attributes=metric_attributes or {}, - ) - data_file = DataFile.from_args( - file_path=file_path, - file_format=FileFormat.PARQUET, - record_count=table.num_rows, - file_size_in_bytes=os.path.getsize(file_path), - ) - data_file._spec_id = 0 - task = FileScanTask(data_file=data_file) - return DataLoaderSplit(file_scan_tasks=[task], scan_context=scan_context, transform_sql=transform_sql) - - -def test_split_emits_per_split_and_per_batch_metrics(tmp_path, metrics_reader): - expected_attrs = {**_BASE_ATTRS, "Tenant": "t1"} - split = _make_split(tmp_path, metric_attributes=expected_attrs) - batches = list(split) - assert sum(b.num_rows for b in batches) == 3 - - split_duration = _data_points(metrics_reader, "openhouse.dataloader.splittime") - assert len(split_duration) == 1 - assert _attrs(split_duration[0]) == expected_attrs - - split_files = _data_points(metrics_reader, "openhouse.dataloader.splitfiles") - assert len(split_files) == 1 - assert split_files[0].sum == 1 - - split_rows = _data_points(metrics_reader, "openhouse.dataloader.splitrows") - assert len(split_rows) == 1 - assert split_rows[0].sum == 3 - - split_bytes = _data_points(metrics_reader, "openhouse.dataloader.splitbytes") - assert len(split_bytes) == 1 - assert split_bytes[0].sum > 0 - - split_batches = _data_points(metrics_reader, "openhouse.dataloader.splitbatches") - assert len(split_batches) == 1 - assert split_batches[0].sum >= 1 - - batch_duration = _data_points(metrics_reader, "openhouse.dataloader.batchtime") - assert len(batch_duration) == 1 - assert _attrs(batch_duration[0]) == expected_attrs - - batch_rows = _data_points(metrics_reader, "openhouse.dataloader.batchrows") - assert len(batch_rows) == 1 - assert batch_rows[0].sum == 3 - - -def test_batch_read_failure_bumps_error_counters(tmp_path, monkeypatch, metrics_reader): - split = _make_split(tmp_path) - - class _ReaderError(Exception): - pass - - def _fake_to_record_batches(self, scan_tasks, **kwargs): - def _gen(): - raise _ReaderError("boom") - yield # pragma: no cover -- makes this a generator - - return _gen() - - monkeypatch.setattr( - "openhouse.dataloader.data_loader_split.ArrowScan.to_record_batches", - _fake_to_record_batches, - ) - - with pytest.raises(_ReaderError): - list(split) - - batch_errors = _data_points(metrics_reader, "openhouse.dataloader.batcherrors") - assert len(batch_errors) == 1 - assert batch_errors[0].value == 1 - - split_errors = _data_points(metrics_reader, "openhouse.dataloader.spliterrors") - assert len(split_errors) == 1 - assert split_errors[0].value == 1 - - # split.duration is still recorded on failure - split_duration = _data_points(metrics_reader, "openhouse.dataloader.splittime") - assert len(split_duration) == 1 - - -def test_split_with_transform_emits_transform_time(tmp_path, metrics_reader): - expected_attrs = {**_BASE_ATTRS, "Tenant": "t1"} - split = _make_split( - tmp_path, - metric_attributes=expected_attrs, - transform_sql='SELECT id FROM "db"."tbl"', - ) - list(split) - - transform_times = _data_points(metrics_reader, "openhouse.dataloader.transformtime") - assert len(transform_times) == 1 - assert _attrs(transform_times[0]) == expected_attrs - assert transform_times[0].sum > 0 - - -def test_split_without_transform_does_not_emit_transform_time(tmp_path, metrics_reader): - split = _make_split(tmp_path) - list(split) - - assert _data_points(metrics_reader, "openhouse.dataloader.transformtime") == [] - - -# --- TableScanContext.metric_attributes --- - - -def test_table_scan_context_default_metric_attributes_is_empty(tmp_path): - split = _make_split(tmp_path) - assert dict(split._scan_context.metric_attributes) == {} - - -def test_table_scan_context_pickle_preserves_metric_attributes(tmp_path): - split = _make_split(tmp_path, metric_attributes={"Tenant": "t1"}) - restored = pickle.loads(pickle.dumps(split._scan_context)) - assert dict(restored.metric_attributes) == {"Tenant": "t1"} diff --git a/integrations/python/dataloader/tests/test_scan_optimizer.py b/integrations/python/dataloader/tests/test_scan_optimizer.py index a1047bd12..576e903c4 100644 --- a/integrations/python/dataloader/tests/test_scan_optimizer.py +++ b/integrations/python/dataloader/tests/test_scan_optimizer.py @@ -169,31 +169,6 @@ def test_comparison_types(): assert plan.row_filter == expected_filter, f"row_filter mismatch for: {where_clause}" -def test_datetime_string_literals_pushed_as_strings(): - """`filters._literal_to_sql()` emits plain string literals for datetime/date/time - (see PR #569 + follow-up). The scan optimizer treats them as ordinary string - literals; PyIceberg promotes them to typed literals during expression binding - against the table schema, restoring partition pruning. - """ - cases = [ - ( - "\"x\" >= '2026-05-02T00:00:00+00:00'", - GreaterThanOrEqual("x", "2026-05-02T00:00:00+00:00"), - ), - ( - "\"x\" < '2026-05-04T00:00:00'", - LessThan("x", "2026-05-04T00:00:00"), - ), - ( - "\"x\" = '2026-05-02'", - EqualTo("x", "2026-05-02"), - ), - ] - for where_clause, expected_filter in cases: - plan = optimize_scan(f'SELECT "a" FROM "db"."tbl" WHERE {where_clause}') - assert plan.row_filter == expected_filter, f"row_filter mismatch for: {where_clause}; got {plan.row_filter!r}" - - def test_non_convertible_predicates_not_pushed(): """Predicates with functions or column-vs-column are not pushed.""" cases = [ diff --git a/integrations/python/dataloader/uv.lock b/integrations/python/dataloader/uv.lock index 888379e87..797a76158 100644 --- a/integrations/python/dataloader/uv.lock +++ b/integrations/python/dataloader/uv.lock @@ -53,43 +53,31 @@ sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8 wheels = [ { url = "https://files.pythonhosted.org/packages/50/bd/b1a6362b80628111e6653c961f987faa55262b4002fcec42308cad1db680/cffi-2.0.0-cp310-cp310-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c", size = 208811, upload-time = "2025-09-08T23:22:12.267Z" }, { url = "https://files.pythonhosted.org/packages/4f/27/6933a8b2562d7bd1fb595074cf99cc81fc3789f6a6c05cdabb46284a3188/cffi-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb", size = 216402, upload-time = "2025-09-08T23:22:13.455Z" }, - { url = "https://files.pythonhosted.org/packages/05/eb/b86f2a2645b62adcfff53b0dd97e8dfafb5c8aa864bd0d9a2c2049a0d551/cffi-2.0.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0", size = 203217, upload-time = "2025-09-08T23:22:14.596Z" }, - { url = "https://files.pythonhosted.org/packages/9f/e0/6cbe77a53acf5acc7c08cc186c9928864bd7c005f9efd0d126884858a5fe/cffi-2.0.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4", size = 203079, upload-time = "2025-09-08T23:22:15.769Z" }, { url = "https://files.pythonhosted.org/packages/98/29/9b366e70e243eb3d14a5cb488dfd3a0b6b2f1fb001a203f653b93ccfac88/cffi-2.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453", size = 216475, upload-time = "2025-09-08T23:22:17.427Z" }, { url = "https://files.pythonhosted.org/packages/21/7a/13b24e70d2f90a322f2900c5d8e1f14fa7e2a6b3332b7309ba7b2ba51a5a/cffi-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495", size = 218829, upload-time = "2025-09-08T23:22:19.069Z" }, { url = "https://files.pythonhosted.org/packages/60/99/c9dc110974c59cc981b1f5b66e1d8af8af764e00f0293266824d9c4254bc/cffi-2.0.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5", size = 211211, upload-time = "2025-09-08T23:22:20.588Z" }, { url = "https://files.pythonhosted.org/packages/49/72/ff2d12dbf21aca1b32a40ed792ee6b40f6dc3a9cf1644bd7ef6e95e0ac5e/cffi-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb", size = 218036, upload-time = "2025-09-08T23:22:22.143Z" }, { url = "https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" }, { url = "https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" }, - { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" }, - { url = "https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" }, { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" }, { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" }, { url = "https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" }, { url = "https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" }, { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, - { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, - { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" }, { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, - { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, - { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, - { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, - { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, - { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, - { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, @@ -218,33 +206,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/91/874b8910903159043b5c6a123b7e79c4559ddd1896e38967567942635778/cryptography-46.0.4-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f14fba5bf6f4390d7ff8f086c566454bff0411f6d8aa7af79c88b6f9267aecc", size = 4275871, upload-time = "2026-01-28T00:23:09.439Z" }, { url = "https://files.pythonhosted.org/packages/c0/35/690e809be77896111f5b195ede56e4b4ed0435b428c2f2b6d35046fbb5e8/cryptography-46.0.4-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47bcd19517e6389132f76e2d5303ded6cf3f78903da2158a671be8de024f4cd0", size = 4423124, upload-time = "2026-01-28T00:23:11.529Z" }, { url = "https://files.pythonhosted.org/packages/1a/5b/a26407d4f79d61ca4bebaa9213feafdd8806dc69d3d290ce24996d3cfe43/cryptography-46.0.4-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:01df4f50f314fbe7009f54046e908d1754f19d0c6d3070df1e6268c5a4af09fa", size = 4277090, upload-time = "2026-01-28T00:23:13.123Z" }, - { url = "https://files.pythonhosted.org/packages/0c/d8/4bb7aec442a9049827aa34cee1aa83803e528fa55da9a9d45d01d1bb933e/cryptography-46.0.4-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5aa3e463596b0087b3da0dbe2b2487e9fc261d25da85754e30e3b40637d61f81", size = 4947652, upload-time = "2026-01-28T00:23:14.554Z" }, { url = "https://files.pythonhosted.org/packages/2b/08/f83e2e0814248b844265802d081f2fac2f1cbe6cd258e72ba14ff006823a/cryptography-46.0.4-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0a9ad24359fee86f131836a9ac3bffc9329e956624a2d379b613f8f8abaf5255", size = 4455157, upload-time = "2026-01-28T00:23:16.443Z" }, { url = "https://files.pythonhosted.org/packages/0a/05/19d849cf4096448779d2dcc9bb27d097457dac36f7273ffa875a93b5884c/cryptography-46.0.4-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:dc1272e25ef673efe72f2096e92ae39dea1a1a450dd44918b15351f72c5a168e", size = 3981078, upload-time = "2026-01-28T00:23:17.838Z" }, { url = "https://files.pythonhosted.org/packages/e6/89/f7bac81d66ba7cde867a743ea5b37537b32b5c633c473002b26a226f703f/cryptography-46.0.4-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:de0f5f4ec8711ebc555f54735d4c673fc34b65c44283895f1a08c2b49d2fd99c", size = 4276213, upload-time = "2026-01-28T00:23:19.257Z" }, - { url = "https://files.pythonhosted.org/packages/da/9f/7133e41f24edd827020ad21b068736e792bc68eecf66d93c924ad4719fb3/cryptography-46.0.4-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:eeeb2e33d8dbcccc34d64651f00a98cb41b2dc69cef866771a5717e6734dfa32", size = 4912190, upload-time = "2026-01-28T00:23:21.244Z" }, { url = "https://files.pythonhosted.org/packages/a6/f7/6d43cbaddf6f65b24816e4af187d211f0bc536a29961f69faedc48501d8e/cryptography-46.0.4-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:3d425eacbc9aceafd2cb429e42f4e5d5633c6f873f5e567077043ef1b9bbf616", size = 4454641, upload-time = "2026-01-28T00:23:22.866Z" }, { url = "https://files.pythonhosted.org/packages/9e/4f/ebd0473ad656a0ac912a16bd07db0f5d85184924e14fc88feecae2492834/cryptography-46.0.4-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:91627ebf691d1ea3976a031b61fb7bac1ccd745afa03602275dda443e11c8de0", size = 4405159, upload-time = "2026-01-28T00:23:25.278Z" }, { url = "https://files.pythonhosted.org/packages/d1/f7/7923886f32dc47e27adeff8246e976d77258fd2aa3efdd1754e4e323bf49/cryptography-46.0.4-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2d08bc22efd73e8854b0b7caff402d735b354862f1145d7be3b9c0f740fef6a0", size = 4666059, upload-time = "2026-01-28T00:23:26.766Z" }, { url = "https://files.pythonhosted.org/packages/f8/f5/559c25b77f40b6bf828eabaf988efb8b0e17b573545edb503368ca0a2a03/cryptography-46.0.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:078e5f06bd2fa5aea5a324f2a09f914b1484f1d0c2a4d6a8a28c74e72f65f2da", size = 4264508, upload-time = "2026-01-28T00:23:34.264Z" }, { url = "https://files.pythonhosted.org/packages/49/a1/551fa162d33074b660dc35c9bc3616fefa21a0e8c1edd27b92559902e408/cryptography-46.0.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dce1e4f068f03008da7fa51cc7abc6ddc5e5de3e3d1550334eaf8393982a5829", size = 4409080, upload-time = "2026-01-28T00:23:35.793Z" }, { url = "https://files.pythonhosted.org/packages/b0/6a/4d8d129a755f5d6df1bbee69ea2f35ebfa954fa1847690d1db2e8bca46a5/cryptography-46.0.4-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:2067461c80271f422ee7bdbe79b9b4be54a5162e90345f86a23445a0cf3fd8a2", size = 4270039, upload-time = "2026-01-28T00:23:37.263Z" }, - { url = "https://files.pythonhosted.org/packages/4c/f5/ed3fcddd0a5e39321e595e144615399e47e7c153a1fb8c4862aec3151ff9/cryptography-46.0.4-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:c92010b58a51196a5f41c3795190203ac52edfd5dc3ff99149b4659eba9d2085", size = 4926748, upload-time = "2026-01-28T00:23:38.884Z" }, { url = "https://files.pythonhosted.org/packages/43/ae/9f03d5f0c0c00e85ecb34f06d3b79599f20630e4db91b8a6e56e8f83d410/cryptography-46.0.4-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:829c2b12bbc5428ab02d6b7f7e9bbfd53e33efd6672d21341f2177470171ad8b", size = 4442307, upload-time = "2026-01-28T00:23:40.56Z" }, { url = "https://files.pythonhosted.org/packages/8b/22/e0f9f2dae8040695103369cf2283ef9ac8abe4d51f68710bec2afd232609/cryptography-46.0.4-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:62217ba44bf81b30abaeda1488686a04a702a261e26f87db51ff61d9d3510abd", size = 3959253, upload-time = "2026-01-28T00:23:42.827Z" }, { url = "https://files.pythonhosted.org/packages/01/5b/6a43fcccc51dae4d101ac7d378a8724d1ba3de628a24e11bf2f4f43cba4d/cryptography-46.0.4-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:9c2da296c8d3415b93e6053f5a728649a87a48ce084a9aaf51d6e46c87c7f2d2", size = 4269372, upload-time = "2026-01-28T00:23:44.655Z" }, - { url = "https://files.pythonhosted.org/packages/17/b7/0f6b8c1dd0779df2b526e78978ff00462355e31c0a6f6cff8a3e99889c90/cryptography-46.0.4-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:9b34d8ba84454641a6bf4d6762d15847ecbd85c1316c0a7984e6e4e9f748ec2e", size = 4891908, upload-time = "2026-01-28T00:23:46.48Z" }, { url = "https://files.pythonhosted.org/packages/83/17/259409b8349aa10535358807a472c6a695cf84f106022268d31cea2b6c97/cryptography-46.0.4-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:df4a817fa7138dd0c96c8c8c20f04b8aaa1fac3bbf610913dcad8ea82e1bfd3f", size = 4441254, upload-time = "2026-01-28T00:23:48.403Z" }, { url = "https://files.pythonhosted.org/packages/9c/fe/e4a1b0c989b00cee5ffa0764401767e2d1cf59f45530963b894129fd5dce/cryptography-46.0.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b1de0ebf7587f28f9190b9cb526e901bf448c9e6a99655d2b07fff60e8212a82", size = 4396520, upload-time = "2026-01-28T00:23:50.26Z" }, { url = "https://files.pythonhosted.org/packages/b3/81/ba8fd9657d27076eb40d6a2f941b23429a3c3d2f56f5a921d6b936a27bc9/cryptography-46.0.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9b4d17bc7bd7cdd98e3af40b441feaea4c68225e2eb2341026c84511ad246c0c", size = 4651479, upload-time = "2026-01-28T00:23:51.674Z" }, { url = "https://files.pythonhosted.org/packages/d8/cc/8f3224cbb2a928de7298d6ed4790f5ebc48114e02bdc9559196bfb12435d/cryptography-46.0.4-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8bf75b0259e87fa70bddc0b8b4078b76e7fd512fd9afae6c1193bcf440a4dbef", size = 4275419, upload-time = "2026-01-28T00:23:58.364Z" }, { url = "https://files.pythonhosted.org/packages/17/43/4a18faa7a872d00e4264855134ba82d23546c850a70ff209e04ee200e76f/cryptography-46.0.4-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3c268a3490df22270955966ba236d6bc4a8f9b6e4ffddb78aac535f1a5ea471d", size = 4419058, upload-time = "2026-01-28T00:23:59.867Z" }, { url = "https://files.pythonhosted.org/packages/ee/64/6651969409821d791ba12346a124f55e1b76f66a819254ae840a965d4b9c/cryptography-46.0.4-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:812815182f6a0c1d49a37893a303b44eaac827d7f0d582cecfc81b6427f22973", size = 4278151, upload-time = "2026-01-28T00:24:01.731Z" }, - { url = "https://files.pythonhosted.org/packages/20/0b/a7fce65ee08c3c02f7a8310cc090a732344066b990ac63a9dfd0a655d321/cryptography-46.0.4-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:a90e43e3ef65e6dcf969dfe3bb40cbf5aef0d523dff95bfa24256be172a845f4", size = 4939441, upload-time = "2026-01-28T00:24:03.175Z" }, { url = "https://files.pythonhosted.org/packages/db/a7/20c5701e2cd3e1dfd7a19d2290c522a5f435dd30957d431dcb531d0f1413/cryptography-46.0.4-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a05177ff6296644ef2876fce50518dffb5bcdf903c85250974fc8bc85d54c0af", size = 4451617, upload-time = "2026-01-28T00:24:05.403Z" }, { url = "https://files.pythonhosted.org/packages/00/dc/3e16030ea9aa47b63af6524c354933b4fb0e352257c792c4deeb0edae367/cryptography-46.0.4-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:daa392191f626d50f1b136c9b4cf08af69ca8279d110ea24f5c2700054d2e263", size = 3977774, upload-time = "2026-01-28T00:24:06.851Z" }, { url = "https://files.pythonhosted.org/packages/42/c8/ad93f14118252717b465880368721c963975ac4b941b7ef88f3c56bf2897/cryptography-46.0.4-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e07ea39c5b048e085f15923511d8121e4a9dc45cee4e3b970ca4f0d338f23095", size = 4277008, upload-time = "2026-01-28T00:24:08.926Z" }, - { url = "https://files.pythonhosted.org/packages/00/cf/89c99698151c00a4631fbfcfcf459d308213ac29e321b0ff44ceeeac82f1/cryptography-46.0.4-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:d5a45ddc256f492ce42a4e35879c5e5528c09cd9ad12420828c972951d8e016b", size = 4903339, upload-time = "2026-01-28T00:24:12.009Z" }, { url = "https://files.pythonhosted.org/packages/03/c3/c90a2cb358de4ac9309b26acf49b2a100957e1ff5cc1e98e6c4996576710/cryptography-46.0.4-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:6bb5157bf6a350e5b28aee23beb2d84ae6f5be390b2f8ee7ea179cda077e1019", size = 4451216, upload-time = "2026-01-28T00:24:13.975Z" }, { url = "https://files.pythonhosted.org/packages/96/2c/8d7f4171388a10208671e181ca43cdc0e596d8259ebacbbcfbd16de593da/cryptography-46.0.4-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dd5aba870a2c40f87a3af043e0dee7d9eb02d4aff88a797b48f2b43eff8c3ab4", size = 4404299, upload-time = "2026-01-28T00:24:16.169Z" }, { url = "https://files.pythonhosted.org/packages/e9/23/cbb2036e450980f65c6e0a173b73a56ff3bccd8998965dea5cc9ddd424a5/cryptography-46.0.4-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:93d8291da8d71024379ab2cb0b5c57915300155ad42e07f76bea6ad838d7e59b", size = 4664837, upload-time = "2026-01-28T00:24:17.629Z" }, @@ -324,14 +306,14 @@ wheels = [ [[package]] name = "importlib-metadata" -version = "8.7.1" +version = "9.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp" }, + { name = "zipp", marker = "python_full_version < '3.14'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/01/15bb152d77b21318514a96f43af312635eb2500c96b55398d020c93d86ea/importlib_metadata-9.0.0.tar.gz", hash = "sha256:a4f57ab599e6a2e3016d7595cfd72eb4661a5106e787a95bcc90c7105b831efc", size = 56405, upload-time = "2026-03-20T06:42:56.999Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, + { url = "https://files.pythonhosted.org/packages/38/3d/2d244233ac4f76e38533cfcb2991c9eb4c7bf688ae0a036d30725b8faafe/importlib_metadata-9.0.0-py3-none-any.whl", hash = "sha256:2d21d1cc5a017bd0559e36150c21c830ab1dc304dedd1b7ea85d20f45ef3edd7", size = 27789, upload-time = "2026-03-20T06:42:55.665Z" }, ] [[package]] @@ -777,7 +759,6 @@ source = { editable = "." } dependencies = [ { name = "datafusion" }, { name = "li-pyiceberg" }, - { name = "opentelemetry-api" }, { name = "requests" }, { name = "sqlglot" }, { name = "tenacity" }, @@ -786,7 +767,6 @@ dependencies = [ [package.optional-dependencies] dev = [ { name = "mypy" }, - { name = "opentelemetry-sdk" }, { name = "pytest" }, { name = "responses" }, { name = "ruff" }, @@ -799,8 +779,6 @@ requires-dist = [ { name = "datafusion", specifier = "==53.0.0" }, { name = "li-pyiceberg", specifier = "==0.11.5", index = "https://linkedin.jfrog.io/artifactory/api/pypi/openhouse-pypi/simple/" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.14.0" }, - { name = "opentelemetry-api", specifier = ">=1.38.0" }, - { name = "opentelemetry-sdk", marker = "extra == 'dev'", specifier = ">=1.38.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, { name = "requests", specifier = ">=2.31.0" }, { name = "responses", marker = "extra == 'dev'", specifier = ">=0.25.0" }, @@ -812,46 +790,6 @@ requires-dist = [ ] provides-extras = ["dev"] -[[package]] -name = "opentelemetry-api" -version = "1.38.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/08/d8/0f354c375628e048bd0570645b310797299754730079853095bf000fba69/opentelemetry_api-1.38.0.tar.gz", hash = "sha256:f4c193b5e8acb0912b06ac5b16321908dd0843d75049c091487322284a3eea12", size = 65242, upload-time = "2025-10-16T08:35:50.25Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/a2/d86e01c28300bd41bab8f18afd613676e2bd63515417b77636fc1add426f/opentelemetry_api-1.38.0-py3-none-any.whl", hash = "sha256:2891b0197f47124454ab9f0cf58f3be33faca394457ac3e09daba13ff50aa582", size = 65947, upload-time = "2025-10-16T08:35:30.23Z" }, -] - -[[package]] -name = "opentelemetry-sdk" -version = "1.38.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "opentelemetry-api" }, - { name = "opentelemetry-semantic-conventions" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/85/cb/f0eee1445161faf4c9af3ba7b848cc22a50a3d3e2515051ad8628c35ff80/opentelemetry_sdk-1.38.0.tar.gz", hash = "sha256:93df5d4d871ed09cb4272305be4d996236eedb232253e3ab864c8620f051cebe", size = 171942, upload-time = "2025-10-16T08:36:02.257Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2f/2e/e93777a95d7d9c40d270a371392b6d6f1ff170c2a3cb32d6176741b5b723/opentelemetry_sdk-1.38.0-py3-none-any.whl", hash = "sha256:1c66af6564ecc1553d72d811a01df063ff097cdc82ce188da9951f93b8d10f6b", size = 132349, upload-time = "2025-10-16T08:35:46.995Z" }, -] - -[[package]] -name = "opentelemetry-semantic-conventions" -version = "0.59b0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "opentelemetry-api" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/40/bc/8b9ad3802cd8ac6583a4eb7de7e5d7db004e89cb7efe7008f9c8a537ee75/opentelemetry_semantic_conventions-0.59b0.tar.gz", hash = "sha256:7a6db3f30d70202d5bf9fa4b69bc866ca6a30437287de6c510fb594878aed6b0", size = 129861, upload-time = "2025-10-16T08:36:03.346Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" }, -] - [[package]] name = "packaging" version = "26.0" diff --git a/integrations/spark/spark-3.1/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/RTASTest.java b/integrations/spark/spark-3.1/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/RTASTest.java index e831ca706..147ee8cba 100644 --- a/integrations/spark/spark-3.1/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/RTASTest.java +++ b/integrations/spark/spark-3.1/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/RTASTest.java @@ -83,7 +83,10 @@ public void testRTAS() throws Exception { Table rtasTable = catalog.loadTable(tableIdent); // verify table location is unchanged - assertEquals(expectedTableLocation, rtasTable.location(), "Should have same table location"); + assertEquals( + stripPathScheme(expectedTableLocation), + stripPathScheme(rtasTable.location()), + "Should have same table location"); // verify schema and spec are changed assertEquals( expectedSchema.asStruct(), @@ -148,7 +151,10 @@ public void testCreateRTAS() throws Exception { Table rtasTable = catalog.loadTable(tableIdent); // verify table location is unchanged - assertEquals(expectedTableLocation, rtasTable.location(), "Should have same table location"); + assertEquals( + stripPathScheme(expectedTableLocation), + stripPathScheme(rtasTable.location()), + "Should have same table location"); // verify schema and spec are changed assertEquals( expectedSchema.asStruct(), @@ -195,7 +201,10 @@ public void testDataFrameV2Replace() throws Exception { Table rtasTable = catalog.loadTable(tableIdent); // verify table location is unchanged - assertEquals(expectedTableLocation, rtasTable.location(), "Should have same table location"); + assertEquals( + stripPathScheme(expectedTableLocation), + stripPathScheme(rtasTable.location()), + "Should have same table location"); // verify schema and spec are changed assertEquals( expectedSchema.asStruct(), @@ -256,7 +265,10 @@ public void testDataFrameV2CreateOrReplace() throws Exception { Table rtasTable = catalog.loadTable(tableIdent); // verify table location is unchanged - assertEquals(expectedTableLocation, rtasTable.location(), "Should have same table location"); + assertEquals( + stripPathScheme(expectedTableLocation), + stripPathScheme(rtasTable.location()), + "Should have same table location"); // verify schema and spec are changed assertEquals( expectedSchema.asStruct(), diff --git a/services/common/build.gradle b/services/common/build.gradle index f005a8c6c..83c54bfde 100644 --- a/services/common/build.gradle +++ b/services/common/build.gradle @@ -36,6 +36,7 @@ dependencies { implementation 'io.opentelemetry:opentelemetry-sdk:1.47.0' implementation 'io.opentelemetry:opentelemetry-semconv:1.14.0-alpha' implementation 'org.apache.commons:commons-lang3:3.12.0' + // version chosen to be consistent with the transitive dependency // from the springboot framework's version in other modules. testImplementation 'commons-io:commons-io:2.4' diff --git a/services/optimizer/analyzer/build.gradle b/services/optimizer/analyzer/build.gradle deleted file mode 100644 index c49951de3..000000000 --- a/services/optimizer/analyzer/build.gradle +++ /dev/null @@ -1,34 +0,0 @@ -plugins { - id 'openhouse.springboot-ext-conventions' - id 'org.springframework.boot' version '2.7.8' -} - -// Library jar — the @SpringBootApplication entry point lives in :apps:optimizer:analyzerapp. -// Disable bootJar so we don't try to assemble a runnable jar from a library that has no main -// class; keep jar enabled so consumers (the apps wrapper) get a normal library artifact. -bootJar { - enabled = false -} - -jar { - enabled = true - archiveClassifier = '' -} - -dependencies { - // api: the analyzer's public types (e.g. OperationAnalyzer's signature, OperationTypeDto) come - // from :services:optimizer, so consumers of this library see them on their compile classpath. - api project(':services:optimizer') - implementation 'org.springframework.boot:spring-boot-starter:2.7.8' - implementation 'org.springframework.boot:spring-boot-starter-webflux:2.7.8' - implementation 'org.springframework.boot:spring-boot-starter-data-jpa:2.7.8' - implementation 'org.springframework.boot:spring-boot-starter-aop:2.7.8' - runtimeOnly 'mysql:mysql-connector-java:8.0.33' - testImplementation 'org.springframework.boot:spring-boot-starter-test:2.7.8' - testImplementation 'com.squareup.okhttp3:mockwebserver:4.10.0' - testRuntimeOnly 'com.h2database:h2' -} - -test { - useJUnitPlatform() -} diff --git a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunner.java b/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunner.java deleted file mode 100644 index c1a023736..000000000 --- a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunner.java +++ /dev/null @@ -1,172 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import com.linkedin.openhouse.optimizer.repository.TableOperationsHistoryRepository; -import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository; -import com.linkedin.openhouse.optimizer.repository.TableStatsRepository; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.springframework.data.domain.Pageable; -import org.springframework.stereotype.Component; -import org.springframework.transaction.annotation.Transactional; - -/** - * Core analysis loop. For one operation type per call, iterates databases and evaluates each table - * in a database against the matching {@link OperationAnalyzer}. - * - *

Both sides of the join — current operations and latest history per (table, type) — are loaded - * into maps once per database before the table loop. This is correct at small scale (≤~100k - * tables); past that the per-db query shape and projection need further tuning. - * - *

The per-db working-set upper bound is not yet empirically validated. - */ -@Slf4j -@Component -@RequiredArgsConstructor -public class AnalyzerRunner { - - private final List analyzers; - private final TableStatsRepository statsRepo; - private final TableOperationsRepository operationsRepo; - private final TableOperationsHistoryRepository historyRepo; - - /** - * Run the analysis loop for {@code operationType} across all databases, with no filters. - * Equivalent to {@link #analyze(OperationTypeDto, Optional, Optional, Optional)} with all-empty - * filters. - */ - public void analyze(OperationTypeDto operationType) { - analyze(operationType, Optional.empty(), Optional.empty(), Optional.empty()); - } - - /** - * Run the analysis loop for the given operation type, optionally scoped to a single database, - * table name, or table UUID. Iterates databases one at a time so the working set is bounded by - * tables-per-db, not tables-total. - */ - public void analyze( - OperationTypeDto operationType, - Optional databaseName, - Optional tableName, - Optional tableUuid) { - OperationAnalyzer analyzer = - analyzers.stream() - .filter(a -> a.getOperationType() == operationType) - .findFirst() - .orElseThrow( - () -> - new IllegalStateException( - "No analyzer registered for operation type " + operationType)); - List dbs = databaseName.map(List::of).orElseGet(statsRepo::findDistinctDatabaseNames); - log.info("Analyzing {} across {} database(s)", operationType, dbs.size()); - dbs.forEach(db -> analyzeDatabase(analyzer, db, tableName, tableUuid)); - log.info("Analysis complete for {}", operationType); - } - - @Transactional - void analyzeDatabase( - OperationAnalyzer analyzer, - String databaseName, - Optional tableName, - Optional tableUuid) { - - // Load the three join inputs unbounded for this database. Aligned page-by-page pagination on - // these maps would leave keys in one map's page mismatched with the others' — a table whose - // op/history happens to fall in a different page would be misread as "no current op / no - // history" and trigger duplicate scheduling. Correctness requires the maps to be complete - // relative to the tables being processed; the working set is bounded by tables-in-db, not by - // any per-cycle cap. - Map currentOps = - operationsRepo - .find( - Optional.of(analyzer.getOperationType().toDb()), - Optional.empty(), - tableUuid, - Optional.of(databaseName), - tableName, - Optional.empty(), - Optional.empty(), - Pageable.unpaged()) - .stream() - .filter(e -> e.getTableUuid() != null) - .map(TableOperationDto::fromRow) - .collect( - Collectors.toMap( - TableOperationDto::getTableUuid, op -> op, TableOperationDto::mostRecent)); - - Map latestHistory = - historyRepo.findLatest(analyzer.getOperationType().toDb(), Pageable.unpaged()).stream() - .filter(r -> r.getTableUuid() != null) - .map(TableOperationsHistoryDto::fromRow) - .collect( - Collectors.toMap( - TableOperationsHistoryDto::getTableUuid, - h -> h, - TableOperationsHistoryDto::after)); - - List tables = - statsRepo.find(Optional.of(databaseName), tableName, tableUuid, Pageable.unpaged()).stream() - .filter(row -> row.getTableUuid() != null) - .map(TableDto::fromRow) - .collect(Collectors.toList()); - - /* - * For each table in this database, decide whether to create a new PENDING operation. - * - * 1. Skip tables not opted in to this operation type. - * 2. Look up the table's current active operation (if any) and its most recent completed - * history entry from the maps loaded above. - * 3. Delegate the schedule-or-not decision to the analyzer's shouldSchedule — strategy - * encapsulates cadence, retry policy, and any future per-operation signals. - * 4. On true, persist a new PENDING operation. The scheduler picks it up on its next pass. - */ - int created = 0; - int failed = 0; - for (TableDto table : tables) { - if (!analyzer.isEnabled(table)) { - continue; - } - Optional currentOp = - Optional.ofNullable(currentOps.get(table.getTableUuid())); - Optional entry = - Optional.ofNullable(latestHistory.get(table.getTableUuid())); - if (!analyzer.shouldSchedule(table, currentOp, entry)) { - continue; - } - try { - TableOperationDto op = TableOperationDto.pending(table, analyzer.getOperationType()); - operationsRepo.save(op.toRow()); - log.debug( - "Created PENDING {} operation for table {}.{}", - analyzer.getOperationType(), - table.getDatabaseName(), - table.getTableId()); - created++; - } catch (RuntimeException e) { - // One bad table should not abort the rest of the database. Log and continue; the next - // analyzer pass will retry for any table whose save failed here. - log.error( - "Failed to create PENDING {} operation for table {}.{}: {}", - analyzer.getOperationType(), - table.getDatabaseName(), - table.getTableId(), - e.toString(), - e); - failed++; - } - } - log.info( - "Finished analyzing Database {}: created {} PENDING {} operation(s) ({} failed)", - databaseName, - created, - analyzer.getOperationType(), - failed); - } -} diff --git a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java b/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java deleted file mode 100644 index 1f4b31542..000000000 --- a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzer.java +++ /dev/null @@ -1,83 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import java.time.Duration; -import java.util.Optional; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; - -/** - * Decides when to schedule an Orphan-Files-Deletion (OFD) run for a table. - * - *

OFD removes data files in the table's storage directory that are no longer referenced by any - * Iceberg snapshot — left-over output from failed writes, expired snapshots, or interrupted - * compactions. Running it too often wastes compute; running it too rarely lets orphan files - * accumulate and bloats storage cost. This analyzer balances the two on a per-table cadence. - * - *

When OFD fires for a table

- * - * All of the following must be true: - * - *
    - *
  1. Opt-in. The table sets {@code maintenance.optimizer.ofd.enabled=true} in its table - * properties. Without this flag, the analyzer ignores the table entirely. - *
  2. No active operation already in flight. If the table has a non-CANCELED operation row - * (PENDING, SCHEDULING, or SCHEDULED), the scheduler already owns it and the analyzer stays - * out. A CANCELED row does not block — it is treated as if no operation exists. - *
  3. Cadence elapsed since the last completed run. - *
      - *
    • If the table has no prior history, schedule immediately. - *
    • If the most recent history entry is {@code SUCCESS}, wait {@code - * ofd.success-retry-hours} (default 16h) after its {@code completedAt} before - * scheduling again. Set below 24h so that even when a run lands at an unlucky time of - * day, at least one re-evaluation is guaranteed within any rolling 24-hour window. - *
    • If the most recent history entry is {@code FAILED}, wait {@code - * ofd.failure-retry-hours} (default 1h) before retrying — shorter than the success - * interval so transient failures recover quickly. - *
    - *
- * - *

The two retry intervals are configurable via {@code application.properties} and can be tuned - * per environment. The opt-in property is per-table and managed through the standard table- - * properties API. - */ -@Component -public class CadenceBasedOrphanFilesDeletionAnalyzer implements OperationAnalyzer { - - static final String OFD_ENABLED_PROPERTY = "maintenance.optimizer.ofd.enabled"; - - private final CadencePolicy cadencePolicy; - - public CadenceBasedOrphanFilesDeletionAnalyzer( - @Value("${ofd.success-retry-hours:16}") long successRetryHours, - @Value("${ofd.failure-retry-hours:1}") long failureRetryHours) { - this.cadencePolicy = - new CadencePolicy(Duration.ofHours(successRetryHours), Duration.ofHours(failureRetryHours)); - } - - /** Package-private for tests that supply a pre-built {@link CadencePolicy}. */ - CadenceBasedOrphanFilesDeletionAnalyzer(CadencePolicy cadencePolicy) { - this.cadencePolicy = cadencePolicy; - } - - @Override - public OperationTypeDto getOperationType() { - return OperationTypeDto.ORPHAN_FILES_DELETION; - } - - @Override - public boolean isEnabled(TableDto table) { - return "true".equals(table.getTableProperties().get(OFD_ENABLED_PROPERTY)); - } - - @Override - public boolean shouldSchedule( - TableDto table, - Optional currentOp, - Optional latestHistory) { - return cadencePolicy.shouldSchedule(currentOp, latestHistory); - } -} diff --git a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadencePolicy.java b/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadencePolicy.java deleted file mode 100644 index 2cea1e2f8..000000000 --- a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/CadencePolicy.java +++ /dev/null @@ -1,72 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import com.linkedin.openhouse.optimizer.model.HistoryStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationStatusDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import java.time.Duration; -import java.time.Instant; -import java.util.Optional; -import lombok.RequiredArgsConstructor; - -/** - * Time-based scheduling policy. An analyzer delegates to {@link CadencePolicy} to decide whether to - * re-issue a recommendation for a table. - * - *

The analyzer stays out of any table that already has a non-CANCELED active operation — those - * belong to the scheduler. For tables with no active operation (or only a CANCELED one), the - * decision is based on the most recent completed-history entry: re-evaluate after {@code - * successRetryInterval} on success, or after {@code failureRetryInterval} on failure. - */ -@RequiredArgsConstructor -public class CadencePolicy { - - /** - * How long to wait after a successful operation before re-evaluating the table. For example, if - * set to 16 hours and OFD succeeded at 10:00 AM Monday, the table becomes eligible again at 2:00 - * AM Tuesday. Configured below 24h so that at least one re-evaluation is guaranteed within any - * rolling 24-hour window regardless of when the prior run landed. - */ - private final Duration successRetryInterval; - - /** - * How long to wait after a failed operation before retrying. Shorter than the success interval to - * allow quick recovery. For example, if set to 1 hour and OFD failed at 2:00 PM, the table - * becomes eligible for retry at 3:00 PM. - */ - private final Duration failureRetryInterval; - - /** - * Returns {@code true} if a new or refreshed operation record should be upserted. - * - * @param currentOp the existing active operation record, or empty if none exists - * @param latestHistory the most recent history entry for this (table, type), or empty - */ - public boolean shouldSchedule( - Optional currentOp, Optional latestHistory) { - if (currentOp.isPresent() && currentOp.get().getStatus() != OperationStatusDto.CANCELED) { - return false; - } - return latestHistory.map(this::readyAfterHistoryEntry).orElse(true); - } - - private boolean readyAfterHistoryEntry(TableOperationsHistoryDto entry) { - return Duration.between(entry.getCompletedAt(), Instant.now()) - .compareTo(intervalFor(entry.getStatus())) - > 0; - } - - private Duration intervalFor(HistoryStatusDto status) { - // Explicit per-status mapping. Adding a new HistoryStatusDto value forces this switch to - // grow a case; the default throws so an un-handled value surfaces at runtime rather than - // silently falling into the success bucket. - switch (status) { - case SUCCESS: - return successRetryInterval; - case FAILED: - return failureRetryInterval; - default: - throw new IllegalStateException("Unhandled HistoryStatusDto value: " + status); - } - } -} diff --git a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/OperationAnalyzer.java b/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/OperationAnalyzer.java deleted file mode 100644 index 84fa11b67..000000000 --- a/services/optimizer/analyzer/src/main/java/com/linkedin/openhouse/optimizer/analyzer/OperationAnalyzer.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import java.util.Optional; - -/** - * Strategy interface for a single operation type. Each implementation decides whether a given table - * needs an operation recommendation upserted in the Optimizer Service. - * - *

TODO(circuit-breaker): a chronically-failing table currently produces a new PENDING row on - * every Analyzer pass. Add a circuit breaker that suppresses scheduling for a (table, type) after N - * consecutive FAILED history entries. Requirements: configurable threshold per operation type, - * automatic reset via exponential backoff so tables can recover, and an operator-visible signal - * (metric or query) so tripped breakers are diagnosable. - */ -public interface OperationAnalyzer { - - /** The operation type this analyzer handles. */ - OperationTypeDto getOperationType(); - - /** - * Returns {@code true} if this operation is opted-in for the given table. Tables that return - * {@code false} are skipped entirely — no upsert is issued. - */ - boolean isEnabled(TableDto table); - - /** - * Returns {@code true} if a new or refreshed operation record should be upserted. - * - * @param table the table entry - * @param currentOp the existing active operation record, or empty if none exists - * @param latestHistory the most recent history entry for this (table, type), or empty - */ - boolean shouldSchedule( - TableDto table, - Optional currentOp, - Optional latestHistory); -} diff --git a/services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunnerTest.java b/services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunnerTest.java deleted file mode 100644 index 4731b5fb7..000000000 --- a/services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/AnalyzerRunnerTest.java +++ /dev/null @@ -1,218 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - -import com.linkedin.openhouse.optimizer.db.TableOperationsRow; -import com.linkedin.openhouse.optimizer.db.TableStatsRow; -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.repository.TableOperationsHistoryRepository; -import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository; -import com.linkedin.openhouse.optimizer.repository.TableStatsRepository; -import java.time.Instant; -import java.util.Collections; -import java.util.List; -import java.util.Optional; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.ArgumentCaptor; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; - -@ExtendWith(MockitoExtension.class) -class AnalyzerRunnerTest { - - private static final OperationTypeDto OFD_TYPE = OperationTypeDto.ORPHAN_FILES_DELETION; - private static final com.linkedin.openhouse.optimizer.db.OperationType OFD_DB = - com.linkedin.openhouse.optimizer.db.OperationType.ORPHAN_FILES_DELETION; - private static final String DB = "db1"; - - @Mock private TableStatsRepository statsRepo; - @Mock private TableOperationsRepository operationsRepo; - @Mock private TableOperationsHistoryRepository historyRepo; - @Mock private OperationAnalyzer analyzer; - - private AnalyzerRunner runner; - - @BeforeEach - void setUp() { - runner = new AnalyzerRunner(List.of(analyzer), statsRepo, operationsRepo, historyRepo); - when(analyzer.getOperationType()).thenReturn(OFD_TYPE); - when(statsRepo.findDistinctDatabaseNames()).thenReturn(List.of(DB)); - } - - @Test - void analyze_insertsNewRow_forEligibleTableWithNoExistingOp() { - TableStatsRow statsEntity = - TableStatsRow.builder().tableUuid("uuid-1").databaseName(DB).tableName("tbl1").build(); - - TableDto expectedTable = TableDto.fromRow(statsEntity); - - when(statsRepo.find(eq(Optional.of(DB)), eq(Optional.empty()), eq(Optional.empty()), any())) - .thenReturn(List.of(statsEntity)); - when(operationsRepo.find( - eq(Optional.of(OFD_DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.of(DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.empty()), - any())) - .thenReturn(Collections.emptyList()); - when(historyRepo.findLatest(eq(OFD_DB), any())).thenReturn(Collections.emptyList()); - when(analyzer.isEnabled(expectedTable)).thenReturn(true); - when(analyzer.shouldSchedule(expectedTable, Optional.empty(), Optional.empty())) - .thenReturn(true); - - runner.analyze(OFD_TYPE); - - ArgumentCaptor captor = ArgumentCaptor.forClass(TableOperationsRow.class); - verify(operationsRepo).save(captor.capture()); - TableOperationsRow saved = captor.getValue(); - assertThat(saved.getTableUuid()).isEqualTo("uuid-1"); - assertThat(saved.getDatabaseName()).isEqualTo(DB); - assertThat(saved.getTableName()).isEqualTo("tbl1"); - assertThat(saved.getOperationType()).isEqualTo(OFD_DB); - assertThat(saved.getStatus()) - .isEqualTo(com.linkedin.openhouse.optimizer.db.OperationStatus.PENDING); - assertThat(saved.getId()).isNotNull(); - } - - @Test - void analyze_noOp_whenCadencePolicyReturnsFalseForPending() { - TableStatsRow statsEntity = - TableStatsRow.builder().tableUuid("uuid-1").databaseName(DB).tableName("tbl1").build(); - - TableDto expectedTable = TableDto.fromRow(statsEntity); - - TableOperationsRow existingEntity = - TableOperationsRow.builder() - .id("existing-op-id") - .status(com.linkedin.openhouse.optimizer.db.OperationStatus.PENDING) - .tableUuid("uuid-1") - .operationType(OFD_DB) - .createdAt(Instant.now()) - .build(); - - when(statsRepo.find(eq(Optional.of(DB)), eq(Optional.empty()), eq(Optional.empty()), any())) - .thenReturn(List.of(statsEntity)); - when(operationsRepo.find( - eq(Optional.of(OFD_DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.of(DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.empty()), - any())) - .thenReturn(List.of(existingEntity)); - when(historyRepo.findLatest(eq(OFD_DB), any())).thenReturn(Collections.emptyList()); - when(analyzer.isEnabled(expectedTable)).thenReturn(true); - - TableOperationDto existingOp = TableOperationDto.fromRow(existingEntity); - when(analyzer.shouldSchedule(expectedTable, Optional.of(existingOp), Optional.empty())) - .thenReturn(false); - - runner.analyze(OFD_TYPE); - - verify(operationsRepo, never()).save(any()); - } - - @Test - void analyze_skipsTable_whenNotEnabled() { - TableStatsRow statsEntity = - TableStatsRow.builder().tableUuid("uuid-1").databaseName(DB).build(); - - TableDto expectedTable = TableDto.fromRow(statsEntity); - - when(statsRepo.find(eq(Optional.of(DB)), eq(Optional.empty()), eq(Optional.empty()), any())) - .thenReturn(List.of(statsEntity)); - when(operationsRepo.find( - eq(Optional.of(OFD_DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.of(DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.empty()), - any())) - .thenReturn(Collections.emptyList()); - when(historyRepo.findLatest(eq(OFD_DB), any())).thenReturn(Collections.emptyList()); - when(analyzer.isEnabled(expectedTable)).thenReturn(false); - - runner.analyze(OFD_TYPE); - - verify(operationsRepo, never()).save(any()); - } - - @Test - void analyze_skipsTable_whenShouldScheduleReturnsFalse() { - TableStatsRow statsEntity = - TableStatsRow.builder().tableUuid("uuid-1").databaseName(DB).build(); - - TableDto expectedTable = TableDto.fromRow(statsEntity); - - TableOperationsRow scheduled = - TableOperationsRow.builder() - .id("op-id") - .status(com.linkedin.openhouse.optimizer.db.OperationStatus.SCHEDULED) - .tableUuid("uuid-1") - .operationType(OFD_DB) - .createdAt(Instant.now()) - .build(); - - when(statsRepo.find(eq(Optional.of(DB)), eq(Optional.empty()), eq(Optional.empty()), any())) - .thenReturn(List.of(statsEntity)); - when(operationsRepo.find( - eq(Optional.of(OFD_DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.of(DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.empty()), - any())) - .thenReturn(List.of(scheduled)); - when(historyRepo.findLatest(eq(OFD_DB), any())).thenReturn(Collections.emptyList()); - when(analyzer.isEnabled(expectedTable)).thenReturn(true); - - TableOperationDto scheduledOp = TableOperationDto.fromRow(scheduled); - when(analyzer.shouldSchedule(expectedTable, Optional.of(scheduledOp), Optional.empty())) - .thenReturn(false); - - runner.analyze(OFD_TYPE); - - verify(operationsRepo, never()).save(any()); - } - - @Test - void analyze_skipsTable_whenTableUuidIsNull() { - TableStatsRow statsEntity = TableStatsRow.builder().databaseName(DB).build(); - - when(statsRepo.find(eq(Optional.of(DB)), eq(Optional.empty()), eq(Optional.empty()), any())) - .thenReturn(List.of(statsEntity)); - when(operationsRepo.find( - eq(Optional.of(OFD_DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.of(DB)), - eq(Optional.empty()), - eq(Optional.empty()), - eq(Optional.empty()), - any())) - .thenReturn(Collections.emptyList()); - when(historyRepo.findLatest(any(), any())).thenReturn(Collections.emptyList()); - - runner.analyze(OFD_TYPE); - - verify(operationsRepo, never()).save(any()); - } -} diff --git a/services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzerTest.java b/services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzerTest.java deleted file mode 100644 index d1b62c009..000000000 --- a/services/optimizer/analyzer/src/test/java/com/linkedin/openhouse/optimizer/analyzer/CadenceBasedOrphanFilesDeletionAnalyzerTest.java +++ /dev/null @@ -1,197 +0,0 @@ -package com.linkedin.openhouse.optimizer.analyzer; - -import static org.assertj.core.api.Assertions.assertThat; - -import com.linkedin.openhouse.optimizer.model.HistoryStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationStatusDto; -import com.linkedin.openhouse.optimizer.model.TableDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import java.time.Duration; -import java.time.Instant; -import java.util.Map; -import java.util.Optional; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -class CadenceBasedOrphanFilesDeletionAnalyzerTest { - - private static final Duration TEST_SUCCESS_INTERVAL = Duration.ofHours(24); - private static final Duration TEST_FAILURE_INTERVAL = Duration.ofHours(1); - - private CadenceBasedOrphanFilesDeletionAnalyzer analyzer; - - @BeforeEach - void setUp() { - analyzer = - new CadenceBasedOrphanFilesDeletionAnalyzer( - new CadencePolicy(TEST_SUCCESS_INTERVAL, TEST_FAILURE_INTERVAL)); - } - - // --- isEnabled --- - - @Test - void isEnabled_returnsTrue_whenPropertySet() { - assertThat(analyzer.isEnabled(tableWithProperty(true))).isTrue(); - } - - @Test - void isEnabled_returnsFalse_whenPropertyFalse() { - assertThat(analyzer.isEnabled(tableWithProperty(false))).isFalse(); - } - - @Test - void isEnabled_returnsFalse_whenTablePropertiesEmpty() { - TableDto table = TableDto.builder().tableUuid("uuid").build(); - assertThat(analyzer.isEnabled(table)).isFalse(); - } - - // --- shouldSchedule: no existing op --- - - @Test - void shouldSchedule_noOp_noHistory_returnsTrue() { - assertThat(analyzer.shouldSchedule(tableWithProperty(true), Optional.empty(), Optional.empty())) - .isTrue(); - } - - @Test - void shouldSchedule_noOp_successHistoryAfterCooldown_returnsTrue() { - Instant longAgo = Instant.now().minus(TEST_SUCCESS_INTERVAL).minusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.empty(), - Optional.of(historyWithStatus(HistoryStatusDto.SUCCESS, longAgo)))) - .isTrue(); - } - - @Test - void shouldSchedule_noOp_successHistoryBeforeCooldown_returnsFalse() { - Instant recent = Instant.now().minus(TEST_SUCCESS_INTERVAL).plusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.empty(), - Optional.of(historyWithStatus(HistoryStatusDto.SUCCESS, recent)))) - .isFalse(); - } - - @Test - void shouldSchedule_noOp_failedHistoryAfterRetry_returnsTrue() { - Instant longAgo = Instant.now().minus(TEST_FAILURE_INTERVAL).minusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.empty(), - Optional.of(historyWithStatus(HistoryStatusDto.FAILED, longAgo)))) - .isTrue(); - } - - @Test - void shouldSchedule_noOp_failedHistoryBeforeRetry_returnsFalse() { - Instant recent = Instant.now().minus(TEST_FAILURE_INTERVAL).plusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.empty(), - Optional.of(historyWithStatus(HistoryStatusDto.FAILED, recent)))) - .isFalse(); - } - - // --- shouldSchedule: active op (non-CANCELED) → analyzer stays out --- - - @Test - void shouldSchedule_pending_returnsFalse() { - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.of(opWithStatus(OperationStatusDto.PENDING)), - Optional.empty())) - .isFalse(); - } - - @Test - void shouldSchedule_scheduling_returnsFalse() { - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.of(opWithStatus(OperationStatusDto.SCHEDULING)), - Optional.empty())) - .isFalse(); - } - - @Test - void shouldSchedule_scheduled_returnsFalse_regardlessOfHistory() { - Instant historyAt = Instant.now().minus(TEST_SUCCESS_INTERVAL).minusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.of(opWithStatus(OperationStatusDto.SCHEDULED)), - Optional.of(historyWithStatus(HistoryStatusDto.SUCCESS, historyAt)))) - .isFalse(); - } - - // --- shouldSchedule: CANCELED → cadence on history --- - - @Test - void shouldSchedule_canceled_successHistoryAfterCooldown_returnsTrue() { - Instant longAgo = Instant.now().minus(TEST_SUCCESS_INTERVAL).minusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.of(opWithStatus(OperationStatusDto.CANCELED)), - Optional.of(historyWithStatus(HistoryStatusDto.SUCCESS, longAgo)))) - .isTrue(); - } - - @Test - void shouldSchedule_canceled_successHistoryBeforeCooldown_returnsFalse() { - Instant recent = Instant.now().minus(TEST_SUCCESS_INTERVAL).plusSeconds(60); - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.of(opWithStatus(OperationStatusDto.CANCELED)), - Optional.of(historyWithStatus(HistoryStatusDto.SUCCESS, recent)))) - .isFalse(); - } - - @Test - void shouldSchedule_canceled_noHistory_returnsTrue() { - assertThat( - analyzer.shouldSchedule( - tableWithProperty(true), - Optional.of(opWithStatus(OperationStatusDto.CANCELED)), - Optional.empty())) - .isTrue(); - } - - // --- helpers --- - - private TableDto tableWithProperty(boolean enabled) { - return TableDto.builder() - .tableUuid("test-uuid") - .databaseName("db1") - .tableId("tbl1") - .tableProperties( - Map.of( - CadenceBasedOrphanFilesDeletionAnalyzer.OFD_ENABLED_PROPERTY, - Boolean.toString(enabled))) - .build(); - } - - private TableOperationDto opWithStatus(OperationStatusDto status) { - return TableOperationDto.builder().status(status).build(); - } - - private TableOperationsHistoryDto historyWithStatus( - HistoryStatusDto status, Instant completedAt) { - return TableOperationsHistoryDto.builder() - .id("hist-id") - .tableUuid("test-uuid") - .operationType( - com.linkedin.openhouse.optimizer.model.OperationTypeDto.ORPHAN_FILES_DELETION) - .completedAt(completedAt) - .status(status) - .build(); - } -} diff --git a/services/optimizer/build.gradle b/services/optimizer/build.gradle deleted file mode 100644 index c05c7f9c3..000000000 --- a/services/optimizer/build.gradle +++ /dev/null @@ -1,17 +0,0 @@ -plugins { - id 'openhouse.springboot-ext-conventions' - id 'org.springframework.boot' version '2.7.8' -} - -dependencies { - implementation 'org.springframework.boot:spring-boot-starter-data-jpa:2.7.8' - implementation 'com.vladmihalcea:hibernate-types-55:2.21.1' - implementation 'org.springframework.boot:spring-boot-starter-web:2.7.8' - implementation 'mysql:mysql-connector-java:8.+' - testImplementation 'com.h2database:h2:2.2.224' - testImplementation 'org.springframework.boot:spring-boot-starter-test:2.7.8' -} - -test { - useJUnitPlatform() -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/OptimizerServiceApplication.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/OptimizerServiceApplication.java deleted file mode 100644 index 38eb363a8..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/OptimizerServiceApplication.java +++ /dev/null @@ -1,13 +0,0 @@ -package com.linkedin.openhouse.optimizer; - -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; - -/** Spring Boot entry point for the Optimizer Service. */ -@SpringBootApplication -public class OptimizerServiceApplication { - - public static void main(String[] args) { - SpringApplication.run(OptimizerServiceApplication.class, args); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java deleted file mode 100644 index 2ee40802f..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsController.java +++ /dev/null @@ -1,125 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.controller; - -import com.linkedin.openhouse.optimizer.api.spec.OperationStatus; -import com.linkedin.openhouse.optimizer.api.spec.OperationType; -import com.linkedin.openhouse.optimizer.api.spec.TableOperations; -import com.linkedin.openhouse.optimizer.api.spec.TableOperationsHistory; -import com.linkedin.openhouse.optimizer.api.spec.UpdateOperationRequest; -import com.linkedin.openhouse.optimizer.service.OptimizerDataService; -import io.swagger.v3.oas.annotations.responses.ApiResponse; -import io.swagger.v3.oas.annotations.responses.ApiResponses; -import java.util.List; -import java.util.Objects; -import java.util.Optional; -import java.util.stream.Collectors; -import lombok.RequiredArgsConstructor; -import org.springframework.http.HttpStatus; -import org.springframework.http.ResponseEntity; -import org.springframework.util.StringUtils; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; -import org.springframework.web.bind.annotation.PostMapping; -import org.springframework.web.bind.annotation.RequestBody; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.server.ResponseStatusException; - -/** REST controller for {@code table_operations}. */ -@RestController -@RequestMapping("/v1/optimizer/operations") -@RequiredArgsConstructor -public class TableOperationsController { - - private final OptimizerDataService service; - - /** - * Report an update to an operation. {@code id} comes from the URL; the body's {@code operationId} - * must match (the controller rejects mismatched requests with 400). The backend looks up the - * operation row, writes a history entry with the operation's table metadata, and returns 201 - * Created with the history row, or 404 if the operation does not exist. - */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "201", description = "Operation UPDATE: CREATED"), - @ApiResponse(responseCode = "400", description = "Operation UPDATE: BAD_REQUEST"), - @ApiResponse(responseCode = "404", description = "Operation UPDATE: NOT_FOUND") - }) - @PostMapping("/{id}/update") - public ResponseEntity updateOperation( - @PathVariable String id, @RequestBody UpdateOperationRequest request) { - if (!StringUtils.hasText(request.getOperationId())) { - throw new ResponseStatusException(HttpStatus.BAD_REQUEST, "operationId is required"); - } - if (!Objects.equals(id, request.getOperationId())) { - throw new ResponseStatusException( - HttpStatus.BAD_REQUEST, - String.format( - "operationId in body (%s) does not match path id (%s)", - request.getOperationId(), id)); - } - if (request.getStatus() == null) { - throw new ResponseStatusException(HttpStatus.BAD_REQUEST, "status is required"); - } - return service - .updateOperation(id, request.getStatus().toModel()) - .map( - history -> - ResponseEntity.status(HttpStatus.CREATED) - .body(TableOperationsHistory.fromModel(history))) - .orElseThrow( - () -> - new ResponseStatusException( - HttpStatus.NOT_FOUND, String.format("no operation with id %s", id))); - } - - /** Fetch a single operation row by its ID, regardless of status. Returns 404 if not found. */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "200", description = "Operation GET: OK"), - @ApiResponse(responseCode = "404", description = "Operation GET: NOT_FOUND") - }) - @GetMapping("/{id}") - public ResponseEntity getTableOperation(@PathVariable String id) { - return service - .getTableOperation(id) - .map(TableOperations::fromModel) - .map(ResponseEntity::ok) - .orElseThrow( - () -> - new ResponseStatusException( - HttpStatus.NOT_FOUND, String.format("no operation with id %s", id))); - } - - /** - * List operations matching the given filters, capped at {@code limit} rows. Every filter is - * optional; {@code limit} is required so callers always state how much they want back. - */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "200", description = "Operation SEARCH: OK"), - @ApiResponse(responseCode = "400", description = "Operation SEARCH: BAD_REQUEST") - }) - @GetMapping - public ResponseEntity> listTableOperations( - @RequestParam(required = false) OperationType operationType, - @RequestParam(required = false) OperationStatus status, - @RequestParam(required = false) String databaseName, - @RequestParam(required = false) String tableName, - @RequestParam(required = false) String tableUuid, - @RequestParam int limit) { - List result = - service - .listTableOperations( - Optional.ofNullable(operationType).map(OperationType::toModel), - Optional.ofNullable(status).map(OperationStatus::toModel), - Optional.ofNullable(databaseName), - Optional.ofNullable(tableName), - Optional.ofNullable(tableUuid), - limit) - .stream() - .map(TableOperations::fromModel) - .collect(Collectors.toList()); - return ResponseEntity.ok(result); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsHistoryController.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsHistoryController.java deleted file mode 100644 index 873d51d2e..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableOperationsHistoryController.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.controller; - -import com.linkedin.openhouse.optimizer.api.spec.TableOperationsHistory; -import com.linkedin.openhouse.optimizer.service.OptimizerDataService; -import io.swagger.v3.oas.annotations.responses.ApiResponse; -import io.swagger.v3.oas.annotations.responses.ApiResponses; -import java.util.List; -import java.util.stream.Collectors; -import lombok.RequiredArgsConstructor; -import org.springframework.http.HttpStatus; -import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; -import org.springframework.web.bind.annotation.PostMapping; -import org.springframework.web.bind.annotation.RequestBody; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; - -/** REST controller for {@code table_operations_history}. */ -@RestController -@RequestMapping("/v1/optimizer/operations-history") -@RequiredArgsConstructor -public class TableOperationsHistoryController { - - private final OptimizerDataService service; - - /** Append a completed-job result. Called by the SparkJob after each run (success or failure). */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "201", description = "OperationsHistory CREATE: CREATED") - }) - @PostMapping - public ResponseEntity appendHistory( - @RequestBody TableOperationsHistory dto) { - return ResponseEntity.status(HttpStatus.CREATED) - .body(TableOperationsHistory.fromModel(service.appendHistory(dto.toModel()))); - } - - /** - * Return the most recent history for a table, newest first, capped at {@code limit} rows. {@code - * limit} is required. - */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "200", description = "OperationsHistory GET: OK"), - @ApiResponse(responseCode = "400", description = "OperationsHistory GET: BAD_REQUEST") - }) - @GetMapping("/{tableUuid}") - public ResponseEntity> getHistory( - @PathVariable String tableUuid, @RequestParam int limit) { - List result = - service.getHistory(tableUuid, limit).stream() - .map(TableOperationsHistory::fromModel) - .collect(Collectors.toList()); - return ResponseEntity.ok(result); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java deleted file mode 100644 index b119dd1c7..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/controller/TableStatsController.java +++ /dev/null @@ -1,111 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.controller; - -import com.linkedin.openhouse.optimizer.api.spec.TableStats; -import com.linkedin.openhouse.optimizer.api.spec.TableStatsHistory; -import com.linkedin.openhouse.optimizer.api.spec.UpsertTableStatsRequest; -import com.linkedin.openhouse.optimizer.service.OptimizerDataService; -import io.swagger.v3.oas.annotations.responses.ApiResponse; -import io.swagger.v3.oas.annotations.responses.ApiResponses; -import java.time.Instant; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; -import lombok.RequiredArgsConstructor; -import org.springframework.http.HttpStatus; -import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; -import org.springframework.web.bind.annotation.PutMapping; -import org.springframework.web.bind.annotation.RequestBody; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.server.ResponseStatusException; - -/** REST controller for managing per-table stats in the optimizer DB. */ -@RestController -@RequestMapping("/v1/optimizer/stats") -@RequiredArgsConstructor -public class TableStatsController { - - private final OptimizerDataService service; - - /** - * Create or overwrite the stats row for {@code tableUuid}. Called by the Tables Service on every - * Iceberg commit. Idempotent. - */ - @ApiResponses(value = {@ApiResponse(responseCode = "200", description = "Stats PUT: OK")}) - @PutMapping("/{tableUuid}") - public ResponseEntity upsertTableStats( - @PathVariable String tableUuid, @RequestBody UpsertTableStatsRequest request) { - return ResponseEntity.ok( - TableStats.fromModel(service.upsertTableStats(request.toModel(tableUuid)))); - } - - /** Fetch the stats row for {@code tableUuid}. Returns 404 if no stats have been written yet. */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "200", description = "Stats GET: OK"), - @ApiResponse(responseCode = "404", description = "Stats GET: NOT_FOUND") - }) - @GetMapping("/{tableUuid}") - public ResponseEntity getTableStats(@PathVariable String tableUuid) { - return service - .getTableStats(tableUuid) - .map(TableStats::fromModel) - .map(ResponseEntity::ok) - .orElseThrow( - () -> - new ResponseStatusException( - HttpStatus.NOT_FOUND, String.format("no stats for tableUuid %s", tableUuid))); - } - - /** - * List stats rows matching the given filters, capped at {@code limit} rows. Every filter is - * optional; {@code limit} is required so callers always state how much they want back. - */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "200", description = "Stats SEARCH: OK"), - @ApiResponse(responseCode = "400", description = "Stats SEARCH: BAD_REQUEST") - }) - @GetMapping - public ResponseEntity> listTableStats( - @RequestParam(required = false) String databaseName, - @RequestParam(required = false) String tableName, - @RequestParam(required = false) String tableUuid, - @RequestParam int limit) { - List result = - service - .listTableStats( - Optional.ofNullable(databaseName), - Optional.ofNullable(tableName), - Optional.ofNullable(tableUuid), - limit) - .stream() - .map(TableStats::fromModel) - .collect(Collectors.toList()); - return ResponseEntity.ok(result); - } - - /** - * Return per-commit stats history for {@code tableUuid}, newest first, capped at {@code limit} - * rows. Optional {@code since} filter (inclusive). {@code limit} is required. - */ - @ApiResponses( - value = { - @ApiResponse(responseCode = "200", description = "StatsHistory GET: OK"), - @ApiResponse(responseCode = "400", description = "StatsHistory GET: BAD_REQUEST") - }) - @GetMapping("/{tableUuid}/history") - public ResponseEntity> getStatsHistory( - @PathVariable String tableUuid, - @RequestParam(required = false) Instant since, - @RequestParam int limit) { - List result = - service.getStatsHistory(tableUuid, Optional.ofNullable(since), limit).stream() - .map(TableStatsHistory::fromModel) - .collect(Collectors.toList()); - return ResponseEntity.ok(result); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/HistoryStatus.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/HistoryStatus.java deleted file mode 100644 index 1d799818f..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/HistoryStatus.java +++ /dev/null @@ -1,21 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -/** Terminal states for a completed Spark maintenance job. */ -public enum HistoryStatus { - - /** The Spark job for this operation completed successfully. */ - SUCCESS, - - /** The Spark job for this operation failed. */ - FAILED; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.HistoryStatusDto toModel() { - return com.linkedin.openhouse.optimizer.model.HistoryStatusDto.valueOf(name()); - } - - /** Build the api-layer enum from the internal-model counterpart. */ - public static HistoryStatus fromModel(com.linkedin.openhouse.optimizer.model.HistoryStatusDto v) { - return v == null ? null : HistoryStatus.valueOf(v.name()); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationStatus.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationStatus.java deleted file mode 100644 index b1cbe42b0..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationStatus.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -/** Lifecycle states for a table operation recommendation. */ -public enum OperationStatus { - - /** Recommended by the Analyzer but not yet claimed by the Scheduler. */ - PENDING, - - /** Claimed by the Scheduler; waiting for the Jobs Service to return a job ID. */ - SCHEDULING, - - /** Job submitted to the Jobs Service; the row now carries a {@code jobId}. */ - SCHEDULED, - - /** - * Marked by the Scheduler when it detects duplicate PENDING rows for the same {@code (table_uuid, - * operation_type)}. Only the most-recent PENDING row is claimed; older duplicates are CANCELED - * before the claim step. - */ - CANCELED; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.OperationStatusDto toModel() { - return com.linkedin.openhouse.optimizer.model.OperationStatusDto.valueOf(name()); - } - - /** Build the api-layer enum from the internal-model counterpart. */ - public static OperationStatus fromModel( - com.linkedin.openhouse.optimizer.model.OperationStatusDto v) { - return v == null ? null : OperationStatus.valueOf(v.name()); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationType.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationType.java deleted file mode 100644 index ea6d2797c..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/OperationType.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -/** Maintenance operation types supported by the continuous optimizer. */ -public enum OperationType { - /** Removes orphaned data files no longer referenced by table metadata. */ - ORPHAN_FILES_DELETION; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.OperationTypeDto toModel() { - return com.linkedin.openhouse.optimizer.model.OperationTypeDto.valueOf(name()); - } - - /** Build the api-layer enum from the internal-model counterpart. */ - public static OperationType fromModel(com.linkedin.openhouse.optimizer.model.OperationTypeDto v) { - return v == null ? null : OperationType.valueOf(v.name()); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperations.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperations.java deleted file mode 100644 index 0bca95734..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperations.java +++ /dev/null @@ -1,76 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import java.time.Instant; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** DTO for {@code table_operations} — Analyzer recommendations read by the Scheduler. */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class TableOperations { - - /** Client-generated UUID identifying this specific operation recommendation. */ - private String id; - - /** Stable table identity from the Tables Service. */ - private String tableUuid; - - /** Denormalized database name for display; not part of the primary key. */ - private String databaseName; - - /** Denormalized table name for display; not part of the primary key. */ - private String tableName; - - /** The type of maintenance operation (e.g. ORPHAN_FILES_DELETION). */ - private OperationType operationType; - - /** {@code PENDING} or {@code SCHEDULED}. Defaults to {@code PENDING} on creation. */ - private OperationStatus status; - - /** Server-set when the row is first created by the Analyzer. */ - private Instant createdAt; - - /** Set by the Scheduler when claiming; {@code null} while PENDING. */ - private Instant scheduledAt; - - /** Job ID returned by the Jobs Service after successful submission. */ - private String jobId; - - /** Convert to the internal-model counterpart. */ - public TableOperationDto toModel() { - return TableOperationDto.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .operationType(operationType == null ? null : operationType.toModel()) - .status(status == null ? null : status.toModel()) - .createdAt(createdAt) - .scheduledAt(scheduledAt) - .jobId(jobId) - .build(); - } - - /** Build a wire DTO from the internal-model counterpart. */ - public static TableOperations fromModel(TableOperationDto op) { - if (op == null) { - return null; - } - return TableOperations.builder() - .id(op.getId()) - .tableUuid(op.getTableUuid()) - .databaseName(op.getDatabaseName()) - .tableName(op.getTableName()) - .operationType(OperationType.fromModel(op.getOperationType())) - .status(OperationStatus.fromModel(op.getStatus())) - .createdAt(op.getCreatedAt()) - .scheduledAt(op.getScheduledAt()) - .jobId(op.getJobId()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistory.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistory.java deleted file mode 100644 index 7a000f840..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableOperationsHistory.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import java.time.Instant; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** DTO for {@code table_operations_history} — append-only operation results. */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class TableOperationsHistory { - - /** Same UUID as the originating {@code table_operations.id}; supplied by the caller. */ - private String id; - - /** Stable table identity from the Tables Service. */ - private String tableUuid; - - /** Denormalized database name for display. */ - private String databaseName; - - /** Denormalized table name for display. */ - private String tableName; - - /** The type of maintenance operation this history row records. */ - private OperationType operationType; - - /** When the operation completed, as recorded by the complete endpoint. */ - private Instant completedAt; - - /** {@code SUCCESS} or {@code FAILED}. */ - private HistoryStatus status; - - /** Convert to the internal-model counterpart. */ - public TableOperationsHistoryDto toModel() { - return TableOperationsHistoryDto.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .operationType(operationType == null ? null : operationType.toModel()) - .completedAt(completedAt) - .status(status == null ? null : status.toModel()) - .build(); - } - - /** Build a wire DTO from the internal-model counterpart. */ - public static TableOperationsHistory fromModel(TableOperationsHistoryDto h) { - if (h == null) { - return null; - } - return TableOperationsHistory.builder() - .id(h.getId()) - .tableUuid(h.getTableUuid()) - .databaseName(h.getDatabaseName()) - .tableName(h.getTableName()) - .operationType(OperationType.fromModel(h.getOperationType())) - .completedAt(h.getCompletedAt()) - .status(HistoryStatus.fromModel(h.getStatus())) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStats.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStats.java deleted file mode 100644 index 41f44f763..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStats.java +++ /dev/null @@ -1,70 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import java.time.Instant; -import java.util.Collections; -import java.util.Map; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** DTO for {@code table_stats} — used for response payloads. */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class TableStats { - - /** Stable Iceberg table UUID. Primary key of the stats row. */ - private String tableUuid; - - /** Denormalized database name for display. */ - private String databaseName; - - /** Denormalized table name for display. */ - private String tableName; - - /** Combined snapshot + delta stats payload, stored as JSON. */ - private TableStatsPayload stats; - - /** Current table properties snapshot (e.g. maintenance opt-in flags). */ - private Map tableProperties; - - /** When this row was last written. Used for staleness monitoring. */ - private Instant updatedAt; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.TableStatsDto toModel() { - com.linkedin.openhouse.optimizer.model.TableStatsDto payload = - stats == null - ? new com.linkedin.openhouse.optimizer.model.TableStatsDto() - : stats.toModel(); - return payload - .toBuilder() - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .tableProperties(tableProperties != null ? tableProperties : Collections.emptyMap()) - .updatedAt(updatedAt) - .build(); - } - - /** Build a wire DTO from the internal-model counterpart. */ - public static TableStats fromModel(com.linkedin.openhouse.optimizer.model.TableStatsDto m) { - if (m == null) { - return null; - } - return TableStats.builder() - .tableUuid(m.getTableUuid()) - .databaseName(m.getDatabaseName()) - .tableName(m.getTableName()) - .stats( - TableStatsPayload.builder() - .snapshot(TableStatsPayload.SnapshotMetricsDto.fromModel(m.getSnapshot())) - .delta(TableStatsPayload.CommitDeltaDto.fromModel(m.getDelta())) - .build()) - .tableProperties(m.getTableProperties()) - .updatedAt(m.getUpdatedAt()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsHistory.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsHistory.java deleted file mode 100644 index 5508aca27..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsHistory.java +++ /dev/null @@ -1,61 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import com.linkedin.openhouse.optimizer.model.TableStatsHistoryDto; -import java.time.Instant; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** DTO for {@code table_stats_history} — used for response payloads. */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class TableStatsHistory { - - /** UUID primary key set by the caller. */ - private String id; - - /** Stable Iceberg table UUID. */ - private String tableUuid; - - /** Denormalized database name for display. */ - private String databaseName; - - /** Denormalized table name for display. */ - private String tableName; - - /** Snapshot + delta stats from this commit event. */ - private TableStatsPayload stats; - - /** When this history row was recorded. */ - private Instant recordedAt; - - /** Convert to the internal-model counterpart. */ - public TableStatsHistoryDto toModel() { - return TableStatsHistoryDto.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .stats(stats == null ? null : stats.toModel()) - .recordedAt(recordedAt) - .build(); - } - - /** Build a wire DTO from the internal-model counterpart. */ - public static TableStatsHistory fromModel(TableStatsHistoryDto h) { - if (h == null) { - return null; - } - return TableStatsHistory.builder() - .id(h.getId()) - .tableUuid(h.getTableUuid()) - .databaseName(h.getDatabaseName()) - .tableName(h.getTableName()) - .stats(TableStatsPayload.fromModel(h.getStats())) - .recordedAt(h.getRecordedAt()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsPayload.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsPayload.java deleted file mode 100644 index c347bf385..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/TableStatsPayload.java +++ /dev/null @@ -1,137 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * Combined stats payload exposed on the optimizer wire API. - * - *

API-layer copy of the stats payload — self-contained, evolved only when the wire contract - * changes. - */ -@Data -@Builder(toBuilder = true) -@NoArgsConstructor -@AllArgsConstructor -@JsonIgnoreProperties(ignoreUnknown = true) -public class TableStatsPayload { - - /** Snapshot fields — overwritten on every upsert. */ - private SnapshotMetricsDto snapshot; - - /** Delta fields — accumulated across commit events. */ - private CommitDeltaDto delta; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.TableStatsDto toModel() { - return com.linkedin.openhouse.optimizer.model.TableStatsDto.builder() - .snapshot(snapshot == null ? null : snapshot.toModel()) - .delta(delta == null ? null : delta.toModel()) - .build(); - } - - /** Build the api-layer payload from the internal-model counterpart. */ - public static TableStatsPayload fromModel( - com.linkedin.openhouse.optimizer.model.TableStatsDto m) { - if (m == null) { - return null; - } - return TableStatsPayload.builder() - .snapshot(SnapshotMetricsDto.fromModel(m.getSnapshot())) - .delta(CommitDeltaDto.fromModel(m.getDelta())) - .build(); - } - - /** Point-in-time metadata read from Iceberg at scan time. */ - @Data - @Builder(toBuilder = true) - @NoArgsConstructor - @AllArgsConstructor - @JsonIgnoreProperties(ignoreUnknown = true) - public static class SnapshotMetricsDto { - - /** Iceberg metadata version pointer for this snapshot. */ - private String tableVersion; - - /** Filesystem path (or URI) of the table's storage root. */ - private String tableLocation; - - /** Total on-disk size of the table at this snapshot, in bytes. */ - private Long tableSizeBytes; - - /** Total number of data files as of the latest snapshot — used for bin-packing. */ - private Long numCurrentFiles; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.TableStatsDto.SnapshotMetrics toModel() { - return com.linkedin.openhouse.optimizer.model.TableStatsDto.SnapshotMetrics.builder() - .tableVersion(tableVersion) - .tableLocation(tableLocation) - .tableSizeBytes(tableSizeBytes) - .numCurrentFiles(numCurrentFiles) - .build(); - } - - /** Build the api-layer inner object from the internal-model counterpart. */ - public static SnapshotMetricsDto fromModel( - com.linkedin.openhouse.optimizer.model.TableStatsDto.SnapshotMetrics m) { - if (m == null) { - return null; - } - return SnapshotMetricsDto.builder() - .tableVersion(m.getTableVersion()) - .tableLocation(m.getTableLocation()) - .tableSizeBytes(m.getTableSizeBytes()) - .numCurrentFiles(m.getNumCurrentFiles()) - .build(); - } - } - - /** Per-commit incremental counters; accumulated across all recorded commit events. */ - @Data - @Builder(toBuilder = true) - @NoArgsConstructor - @AllArgsConstructor - @JsonIgnoreProperties(ignoreUnknown = true) - public static class CommitDeltaDto { - - /** Number of data files this commit added to the table. */ - private Long numFilesAdded; - - /** Number of data files this commit removed from the table. */ - private Long numFilesDeleted; - - /** Total bytes added by this commit. */ - private Long addedSizeBytes; - - /** Total bytes removed by this commit. */ - private Long deletedSizeBytes; - - /** Convert to the internal-model counterpart. */ - public com.linkedin.openhouse.optimizer.model.TableStatsDto.CommitDelta toModel() { - return com.linkedin.openhouse.optimizer.model.TableStatsDto.CommitDelta.builder() - .numFilesAdded(numFilesAdded) - .numFilesDeleted(numFilesDeleted) - .addedSizeBytes(addedSizeBytes) - .deletedSizeBytes(deletedSizeBytes) - .build(); - } - - /** Build the api-layer inner object from the internal-model counterpart. */ - public static CommitDeltaDto fromModel( - com.linkedin.openhouse.optimizer.model.TableStatsDto.CommitDelta m) { - if (m == null) { - return null; - } - return CommitDeltaDto.builder() - .numFilesAdded(m.getNumFilesAdded()) - .numFilesDeleted(m.getNumFilesDeleted()) - .addedSizeBytes(m.getAddedSizeBytes()) - .deletedSizeBytes(m.getDeletedSizeBytes()) - .build(); - } - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpdateOperationRequest.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpdateOperationRequest.java deleted file mode 100644 index a216e9db3..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpdateOperationRequest.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * Request body for {@code POST /v1/table-operations/update}. - * - *

Reports the outcome of a single operation update. The service looks up the operation row by - * {@link #operationId} and writes a history entry for it. - * - *

A single Spark job typically processes N tables and yields N independent (status) outcomes — - * one per operation. Callers issue one update request per operation; the service does not - * bulk-update by job. - * - *

The remaining fields ({@link #tableUuid}, {@link #databaseName}, {@link #tableName}, {@link - * #operationType}) are debug-only echo information. The server does not key off them; they are - * preserved on log lines and traces so an operator looking at a failing update call can see which - * (db, table, operation) the caller believed it was updating without joining back to the operation - * row. - */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class UpdateOperationRequest { - - /** Operation row's UUID — the primary lookup key. */ - private String operationId; - - /** Terminal outcome for this single operation. */ - private HistoryStatus status; - - /** Debug echo: stable table identity the caller believed it was completing. */ - private String tableUuid; - - /** Debug echo: database name. */ - private String databaseName; - - /** Debug echo: table name. */ - private String tableName; - - /** Debug echo: operation type. */ - private OperationType operationType; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpsertTableStatsRequest.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpsertTableStatsRequest.java deleted file mode 100644 index d1b4a5fe2..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/api/spec/UpsertTableStatsRequest.java +++ /dev/null @@ -1,52 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.spec; - -import java.util.Collections; -import java.util.Map; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * Request body for {@code PUT /v1/table-stats/{tableUuid}}. - * - *

{@code tableUuid} comes from the path variable. {@code databaseName} and {@code tableName} are - * denormalized display columns carried in the body. - */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class UpsertTableStatsRequest { - - /** Denormalized database name for display. */ - private String databaseName; - - /** Denormalized table name for display. */ - private String tableName; - - /** Combined snapshot + delta stats payload from this commit. */ - private TableStatsPayload stats; - - /** Current table properties snapshot (e.g. maintenance opt-in flags). */ - private Map tableProperties; - - /** - * Build the internal-model {@link com.linkedin.openhouse.optimizer.model.TableStatsDto} described - * by this request. {@code tableUuid} comes from the URL path, not the body. {@code updatedAt} is - * left {@code null}; the service stamps it server-side at write time. - */ - public com.linkedin.openhouse.optimizer.model.TableStatsDto toModel(String tableUuid) { - com.linkedin.openhouse.optimizer.model.TableStatsDto payload = - stats == null - ? new com.linkedin.openhouse.optimizer.model.TableStatsDto() - : stats.toModel(); - return payload - .toBuilder() - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .tableProperties(tableProperties != null ? tableProperties : Collections.emptyMap()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/CommitDeltaMetrics.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/CommitDeltaMetrics.java deleted file mode 100644 index 5a30c9afd..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/CommitDeltaMetrics.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** Per-commit incremental counters. Serialized as JSON into the {@code delta} column. */ -@Data -@Builder(toBuilder = true) -@NoArgsConstructor -@AllArgsConstructor -@JsonIgnoreProperties(ignoreUnknown = true) -public class CommitDeltaMetrics { - - /** Number of data files this commit added to the table. */ - private Long numFilesAdded; - - /** Number of data files this commit removed from the table. */ - private Long numFilesDeleted; - - /** Total bytes added by this commit. */ - private Long addedSizeBytes; - - /** Total bytes removed by this commit. */ - private Long deletedSizeBytes; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/HistoryStatus.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/HistoryStatus.java deleted file mode 100644 index 3680735f4..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/HistoryStatus.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -/** - * DB-layer enum for the {@code status} column of {@code table_operations_history}. - * - *

Self-contained: no references to api/ or model/ types. - */ -public enum HistoryStatus { - - /** The Spark job for this operation completed successfully. */ - SUCCESS, - - /** The Spark job for this operation failed. */ - FAILED -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationStatus.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationStatus.java deleted file mode 100644 index 0a2e07483..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationStatus.java +++ /dev/null @@ -1,21 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -/** - * DB-layer enum for the {@code status} column of {@code table_operations}. - * - *

Self-contained: no references to api/ or model/ types. - */ -public enum OperationStatus { - - /** Analyzer has written the row; not yet claimed by the scheduler. */ - PENDING, - - /** Scheduler has claimed the row and is launching a job; jobId not yet recorded. */ - SCHEDULING, - - /** Job has been submitted to the Jobs Service; the row carries a {@code jobId}. */ - SCHEDULED, - - /** Scheduler marked this row as a duplicate of another PENDING row; not claimable. */ - CANCELED -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationType.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationType.java deleted file mode 100644 index e4caf549b..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/OperationType.java +++ /dev/null @@ -1,14 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -/** - * DB-layer enum for the operation types persisted in {@code table_operations.operation_type} and - * {@code table_operations_history.operation_type}. - * - *

Self-contained: no references to api/ or model/ types. JPA binds this via - * {@code @Enumerated(EnumType.STRING)}. - */ -public enum OperationType { - - /** Removes orphaned data files no longer referenced by table metadata. */ - ORPHAN_FILES_DELETION -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/SnapshotMetrics.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/SnapshotMetrics.java deleted file mode 100644 index 452b35097..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/SnapshotMetrics.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** Point-in-time snapshot fields. Serialized as JSON into the {@code snapshot} column. */ -@Data -@Builder(toBuilder = true) -@NoArgsConstructor -@AllArgsConstructor -@JsonIgnoreProperties(ignoreUnknown = true) -public class SnapshotMetrics { - - /** Iceberg metadata version pointer for this snapshot. */ - private String tableVersion; - - /** Filesystem path (or URI) of the table's storage root. */ - private String tableLocation; - - /** Total on-disk size of the table at this snapshot, in bytes. */ - private Long tableSizeBytes; - - /** Total number of data files as of the latest snapshot — used for bin-packing. */ - private Long numCurrentFiles; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java deleted file mode 100644 index 5f4a598d9..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsHistoryRow.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -import java.time.Instant; -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.EnumType; -import javax.persistence.Enumerated; -import javax.persistence.Id; -import javax.persistence.Index; -import javax.persistence.Table; -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.NoArgsConstructor; - -/** - * Append-only record of a completed maintenance operation. - * - *

Written when the operation-complete endpoint is called. The {@code id} is the same UUID as the - * originating live-operations row, tying each history entry back to the operation cycle that - * produced it. Multiple runs of the same operation on the same table produce multiple rows. - * - *

Self-contained DB-layer type: enums are {@link OperationType} / {@link HistoryStatus} from the - * same package, JPA-bound as strings. - */ -@Entity -@Table( - name = "table_operations_history", - indexes = { - @Index(name = "idx_table_uuid_hist", columnList = "table_uuid"), - @Index(name = "idx_op_type_hist", columnList = "operation_type"), - @Index(name = "idx_completed_at", columnList = "completed_at"), - @Index(name = "idx_status_hist", columnList = "status"), - @Index(name = "idx_toph_db_table", columnList = "database_name, table_name") - }) -@Getter -@EqualsAndHashCode -@Builder(toBuilder = true) -@NoArgsConstructor(access = AccessLevel.PROTECTED) -@AllArgsConstructor(access = AccessLevel.PROTECTED) -public class TableOperationsHistoryRow { - - /** Same UUID as the originating live-operations row. Set by the caller; not generated. */ - @Id - @Column(name = "id", nullable = false, length = 36) - private String id; - - /** Stable table identity from the Tables Service. */ - @Column(name = "table_uuid", nullable = false, length = 36) - private String tableUuid; - - /** Denormalized database name. */ - @Column(name = "database_name", nullable = false, length = 128) - private String databaseName; - - /** Denormalized table name. */ - @Column(name = "table_name", nullable = false, length = 128) - private String tableName; - - /** The type of maintenance operation this history row records. */ - @Enumerated(EnumType.STRING) - @Column(name = "operation_type", nullable = false, length = 50) - private OperationType operationType; - - /** When the operation completed, as recorded by the complete endpoint. */ - @Column(name = "completed_at", nullable = false) - private Instant completedAt; - - /** Terminal outcome: {@link HistoryStatus#SUCCESS} or {@link HistoryStatus#FAILED}. */ - @Enumerated(EnumType.STRING) - @Column(name = "status", nullable = false, length = 20) - private HistoryStatus status; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsRow.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsRow.java deleted file mode 100644 index dfe40d402..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableOperationsRow.java +++ /dev/null @@ -1,85 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -import java.time.Instant; -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.EnumType; -import javax.persistence.Enumerated; -import javax.persistence.Id; -import javax.persistence.Index; -import javax.persistence.Table; -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.NoArgsConstructor; - -/** - * JPA entity representing an Analyzer recommendation for a table maintenance operation. - * - *

Each row is identified by a client-generated UUID ({@code id}). The Analyzer creates a new row - * when it first recommends an operation for a table, or when re-recommending after a prior terminal - * state. {@code table_uuid} is the stable identity for the table (survives renames; rotates on - * drop+recreate). The application enforces one active (PENDING / SCHEDULING / SCHEDULED) row per - * {@code (table_uuid, operation_type)} at a time. - * - *

Self-contained DB-layer type: enums are {@link OperationType} / {@link OperationStatus} from - * the same package, JPA-bound as strings. - */ -@Entity -@Table( - name = "table_operations", - indexes = { - @Index(name = "idx_table_uuid", columnList = "table_uuid"), - @Index(name = "idx_op_type", columnList = "operation_type"), - @Index(name = "idx_status", columnList = "status"), - @Index(name = "idx_created_at", columnList = "created_at"), - @Index(name = "idx_scheduled_at", columnList = "scheduled_at") - }) -@Getter -@EqualsAndHashCode -@Builder(toBuilder = true) -@NoArgsConstructor(access = AccessLevel.PROTECTED) -@AllArgsConstructor(access = AccessLevel.PROTECTED) -public class TableOperationsRow { - - /** Client-generated UUID identifying this specific operation recommendation. */ - @Id - @Column(name = "id", nullable = false, length = 36) - private String id; - - /** Stable table identity from the Tables Service. Survives renames; rotates on drop+recreate. */ - @Column(name = "table_uuid", nullable = false, length = 36) - private String tableUuid; - - /** Denormalized database name. */ - @Column(name = "database_name", nullable = false, length = 128) - private String databaseName; - - /** Denormalized table name. */ - @Column(name = "table_name", nullable = false, length = 128) - private String tableName; - - /** The type of maintenance operation this row recommends. */ - @Enumerated(EnumType.STRING) - @Column(name = "operation_type", nullable = false, length = 50) - private OperationType operationType; - - /** Lifecycle state — drives the scheduler's CAS claim and the analyzer's eligibility check. */ - @Enumerated(EnumType.STRING) - @Column(name = "status", nullable = false, length = 20) - private OperationStatus status; - - /** When the analyzer first created this row. Set on insert; never updated. */ - @Column(name = "created_at", nullable = false) - private Instant createdAt; - - /** When the scheduler last submitted a job for this row. {@code null} while {@code PENDING}. */ - @Column(name = "scheduled_at") - private Instant scheduledAt; - - /** Spark job ID written by the scheduler at claim time. Internal-only; never exposed on wire. */ - @Column(name = "job_id", length = 255) - private String jobId; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsHistoryRow.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsHistoryRow.java deleted file mode 100644 index 4eaee2a6f..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsHistoryRow.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -import com.vladmihalcea.hibernate.type.json.JsonStringType; -import java.time.Instant; -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Index; -import javax.persistence.Table; -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.NoArgsConstructor; -import org.hibernate.annotations.Type; -import org.hibernate.annotations.TypeDef; - -/** - * Append-only record of per-commit stats reported by the Tables Service. - * - *

Each Iceberg commit produces one row. Consumers can query this table to reconstruct change - * rates over arbitrary time windows. - * - *

Self-contained DB-layer type. The stats payload is split across two JSON columns — {@link - * SnapshotMetrics} (point-in-time fields at commit time) and {@link CommitDeltaMetrics} (per-commit - * counters). - */ -@TypeDef(name = "json", typeClass = JsonStringType.class) -@Entity -@Table( - name = "table_stats_history", - indexes = { - @Index(name = "idx_tsh_table_uuid", columnList = "table_uuid"), - @Index(name = "idx_tsh_recorded_at", columnList = "recorded_at") - }) -@Getter -@EqualsAndHashCode -@Builder(toBuilder = true) -@NoArgsConstructor(access = AccessLevel.PROTECTED) -@AllArgsConstructor(access = AccessLevel.PROTECTED) -public class TableStatsHistoryRow { - - /** UUID primary key — set by the caller, not generated server-side. */ - @Id - @Column(name = "id", nullable = false, length = 36) - private String id; - - /** Stable Iceberg table UUID. */ - @Column(name = "table_uuid", nullable = false, length = 36) - private String tableUuid; - - /** Denormalized database name. */ - @Column(name = "database_name", nullable = false, length = 128) - private String databaseName; - - /** Denormalized table name. */ - @Column(name = "table_name", nullable = false, length = 128) - private String tableName; - - /** Snapshot fields at commit time. Stored as a JSON blob in the {@code snapshot} column. */ - @Type(type = "json") - @Column(name = "snapshot", columnDefinition = "TEXT") - private SnapshotMetrics snapshot; - - /** Per-commit delta counters. Stored as a JSON blob in the {@code delta} column. */ - @Type(type = "json") - @Column(name = "delta", columnDefinition = "TEXT") - private CommitDeltaMetrics delta; - - /** When this history row was recorded (commit time). */ - @Column(name = "recorded_at", nullable = false) - private Instant recordedAt; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsRow.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsRow.java deleted file mode 100644 index 165247b6a..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/db/TableStatsRow.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.linkedin.openhouse.optimizer.db; - -import com.vladmihalcea.hibernate.type.json.JsonStringType; -import java.time.Instant; -import java.util.Map; -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.NoArgsConstructor; -import org.hibernate.annotations.Type; -import org.hibernate.annotations.TypeDef; - -/** - * JPA entity representing a per-table stats snapshot in the optimizer DB. - * - *

Written by the Tables Service on every Iceberg commit. Read by the Analyzer directly via JPA - * to enumerate tables and check scheduling eligibility. - * - *

Self-contained DB-layer type. Holds only the point-in-time {@link SnapshotMetrics} — - * per-commit deltas live exclusively on {@link TableStatsHistoryRow} and are not aggregated here. - */ -@TypeDef(name = "json", typeClass = JsonStringType.class) -@Entity -@Table(name = "table_stats") -@Getter -@EqualsAndHashCode -@Builder(toBuilder = true) -@NoArgsConstructor(access = AccessLevel.PROTECTED) -@AllArgsConstructor(access = AccessLevel.PROTECTED) -public class TableStatsRow { - - /** Stable Iceberg table UUID. Primary key. */ - @Id - @Column(name = "table_uuid", nullable = false, length = 36) - private String tableUuid; - - /** Denormalized database name. */ - @Column(name = "database_name", nullable = false, length = 128) - private String databaseName; - - /** Denormalized table name. */ - @Column(name = "table_name", nullable = false, length = 128) - private String tableName; - - /** Latest snapshot fields. Stored as a JSON blob in the {@code snapshot} column. */ - @Type(type = "json") - @Column(name = "snapshot", columnDefinition = "TEXT") - private SnapshotMetrics snapshot; - - /** Current table-property map (e.g. maintenance opt-in flags). Stored as JSON. */ - @Type(type = "json") - @Column(name = "table_properties", columnDefinition = "TEXT") - private Map tableProperties; - - /** Set on every upsert. Used for stats pipeline staleness monitoring. */ - @Column(name = "updated_at", nullable = false) - private Instant updatedAt; -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/HistoryStatusDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/HistoryStatusDto.java deleted file mode 100644 index af622d3ce..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/HistoryStatusDto.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -/** - * Internal lifecycle outcomes for a completed operation. Mirrors the values written to {@code - * table_operations_history.status}; parsed at the boundary so callers switch on a typed value - * instead of comparing strings. - * - *

Intentionally separate from the wire-API and DB representations. - */ -public enum HistoryStatusDto { - - /** The operation completed successfully. */ - SUCCESS, - - /** The operation failed. */ - FAILED; - - /** Convert to the DB-layer counterpart. */ - public com.linkedin.openhouse.optimizer.db.HistoryStatus toDb() { - return com.linkedin.openhouse.optimizer.db.HistoryStatus.valueOf(name()); - } - - /** Build the internal-model enum from the DB-layer counterpart. */ - public static HistoryStatusDto fromDb(com.linkedin.openhouse.optimizer.db.HistoryStatus v) { - return v == null ? null : HistoryStatusDto.valueOf(v.name()); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationStatusDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationStatusDto.java deleted file mode 100644 index 2963f120f..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationStatusDto.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -/** - * Internal lifecycle states for an operation. The analyzer writes {@link #PENDING}; the scheduler - * transitions through {@link #SCHEDULING} and {@link #SCHEDULED}. {@link #CANCELED} marks - * deduplicated PENDING rows. - * - *

Intentionally separate from the wire-API and DB representations. - */ -public enum OperationStatusDto { - - /** Analyzer has written the row; not yet claimed by the scheduler. */ - PENDING, - - /** Scheduler has claimed the row and is launching a job; jobId not yet recorded. */ - SCHEDULING, - - /** Job has been submitted to the Jobs Service; the row carries a {@code jobId}. */ - SCHEDULED, - - /** Scheduler marked this row as a duplicate of another PENDING row; not claimable. */ - CANCELED; - - /** Convert to the DB-layer counterpart. */ - public com.linkedin.openhouse.optimizer.db.OperationStatus toDb() { - return com.linkedin.openhouse.optimizer.db.OperationStatus.valueOf(name()); - } - - /** Build the internal-model enum from the DB-layer counterpart. */ - public static OperationStatusDto fromDb(com.linkedin.openhouse.optimizer.db.OperationStatus v) { - return v == null ? null : OperationStatusDto.valueOf(v.name()); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationTypeDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationTypeDto.java deleted file mode 100644 index e2eb1158b..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/OperationTypeDto.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -/** - * Internal enum for the operation types the analyzer and scheduler know about. Intentionally - * separate from the wire-API and DB representations so the internal model can evolve its set of - * supported operations without churning either boundary. - */ -public enum OperationTypeDto { - - /** Removes orphaned data files no longer referenced by table metadata. */ - ORPHAN_FILES_DELETION; - - /** Convert to the DB-layer counterpart. */ - public com.linkedin.openhouse.optimizer.db.OperationType toDb() { - return com.linkedin.openhouse.optimizer.db.OperationType.valueOf(name()); - } - - /** Build the internal-model enum from the DB-layer counterpart. */ - public static OperationTypeDto fromDb(com.linkedin.openhouse.optimizer.db.OperationType v) { - return v == null ? null : OperationTypeDto.valueOf(v.name()); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableDto.java deleted file mode 100644 index db68fb3c1..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableDto.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -import com.linkedin.openhouse.optimizer.db.TableStatsRow; -import java.time.Instant; -import java.util.Collections; -import java.util.Map; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * An OpenHouse table enriched with stats and properties, built by combining data sources. Consumed - * by the analyzer (decides whether to produce a {@link TableOperationDto}) and the scheduler (reads - * stats for bin-packing). - * - *

Conversion methods cross into the DB layer one-way; the inverse lives on the api side. db/ - * types know nothing about model/ or api/. - */ -@Data -@Builder(toBuilder = true) -@NoArgsConstructor -@AllArgsConstructor -public class TableDto { - - /** Stable table identity from the Tables Service. Survives renames; rotates on drop+recreate. */ - private String tableUuid; - - /** Database the table lives in. */ - private String databaseName; - - /** Iceberg table identifier (table name, not UUID). */ - private String tableId; - - /** Current table-property map (e.g. maintenance opt-in flags). Never null. */ - @Builder.Default private Map tableProperties = Collections.emptyMap(); - - /** Latest snapshot stats for this table. Delta is null when read from the current-state row. */ - private TableStatsDto stats; - - /** When the current snapshot was last written. Stamped server-side on every upsert. */ - private Instant updatedAt; - - /** - * Project to the current-state DB row. {@code table_stats} carries the snapshot only — per-commit - * deltas live on {@code table_stats_history} (see {@link TableStatsHistoryDto#toRow()}). - */ - public TableStatsRow toRow() { - return TableStatsRow.builder() - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableId) - .snapshot(stats == null ? null : stats.toSnapshotRow()) - .tableProperties(tableProperties) - .updatedAt(updatedAt) - .build(); - } - - /** Build a {@link TableDto} from a current-state DB row. */ - public static TableDto fromRow(TableStatsRow row) { - if (row == null) { - return null; - } - return TableDto.builder() - .tableUuid(row.getTableUuid()) - .databaseName(row.getDatabaseName()) - .tableId(row.getTableName()) - .tableProperties( - row.getTableProperties() != null ? row.getTableProperties() : Collections.emptyMap()) - // table_stats holds only the snapshot — deltas live on the history table. - .stats(TableStatsDto.fromRows(row.getSnapshot(), null)) - .updatedAt(row.getUpdatedAt()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationDto.java deleted file mode 100644 index 18d57ce66..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationDto.java +++ /dev/null @@ -1,104 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -import com.linkedin.openhouse.optimizer.db.TableOperationsRow; -import java.time.Instant; -import java.util.Comparator; -import java.util.UUID; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * An operation the analyzer has decided to schedule for a table, and that the scheduler later picks - * up and submits. - * - *

Conversion methods cross into the DB layer one-way; the inverse lives on the api side. db/ - * types know nothing about model/ or api/. - */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class TableOperationDto { - - /** Unique operation ID (UUID). */ - private String id; - - /** The table this operation targets. */ - private String tableUuid; - - /** Database name. */ - private String databaseName; - - /** Table name. */ - private String tableName; - - /** Operation type. */ - private OperationTypeDto operationType; - - /** Current lifecycle status. */ - private OperationStatusDto status; - - /** When this operation record was created. */ - private Instant createdAt; - - /** When the scheduler last submitted a job for this operation. */ - private Instant scheduledAt; - - /** Job ID returned by the Jobs Service after the scheduler submitted; null until SCHEDULED. */ - private String jobId; - - /** Create a new PENDING operation for the given table and operation type. */ - public static TableOperationDto pending(TableDto table, OperationTypeDto operationType) { - return TableOperationDto.builder() - .id(UUID.randomUUID().toString()) - .tableUuid(table.getTableUuid()) - .databaseName(table.getDatabaseName()) - .tableName(table.getTableId()) - .operationType(operationType) - .status(OperationStatusDto.PENDING) - .createdAt(Instant.now()) - .build(); - } - - /** Return the more recently created of two operations. */ - public static TableOperationDto mostRecent(TableOperationDto a, TableOperationDto b) { - Comparator byCreatedAt = - Comparator.comparing(r -> r.getCreatedAt() != null ? r.getCreatedAt() : Instant.EPOCH); - return byCreatedAt.compare(a, b) >= 0 ? a : b; - } - - /** Convert to the corresponding DB row. */ - public TableOperationsRow toRow() { - return TableOperationsRow.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .operationType(operationType == null ? null : operationType.toDb()) - .status(status == null ? null : status.toDb()) - .createdAt(createdAt) - .scheduledAt(scheduledAt) - .jobId(jobId) - .build(); - } - - /** Build a {@link TableOperationDto} from a DB row. */ - public static TableOperationDto fromRow(TableOperationsRow row) { - if (row == null) { - return null; - } - return TableOperationDto.builder() - .id(row.getId()) - .tableUuid(row.getTableUuid()) - .databaseName(row.getDatabaseName()) - .tableName(row.getTableName()) - .operationType(OperationTypeDto.fromDb(row.getOperationType())) - .status(OperationStatusDto.fromDb(row.getStatus())) - .createdAt(row.getCreatedAt()) - .scheduledAt(row.getScheduledAt()) - .jobId(row.getJobId()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistoryDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistoryDto.java deleted file mode 100644 index 74922e7b0..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableOperationsHistoryDto.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -import com.linkedin.openhouse.optimizer.db.TableOperationsHistoryRow; -import java.time.Instant; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * Internal-model view of a completed operation history record. - * - *

Mirrors the field set of the underlying history row but in internal types only. Used by - * components that need to reason about completed operations (e.g., scheduling-cadence analyzers). - */ -@Data -@Builder(toBuilder = true) -@NoArgsConstructor -@AllArgsConstructor -public class TableOperationsHistoryDto { - - /** Same UUID as the originating live-operations row. */ - private String id; - - /** Stable table identity from the Tables Service. */ - private String tableUuid; - - /** Denormalized database name. */ - private String databaseName; - - /** Denormalized table name. */ - private String tableName; - - /** Operation type for this completed run. */ - private OperationTypeDto operationType; - - /** When the operation completed, as recorded by the complete endpoint. */ - private Instant completedAt; - - /** Terminal outcome: {@link HistoryStatusDto#SUCCESS} or {@link HistoryStatusDto#FAILED}. */ - private HistoryStatusDto status; - - /** Convert to the corresponding DB row. */ - public TableOperationsHistoryRow toRow() { - return TableOperationsHistoryRow.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .operationType(operationType == null ? null : operationType.toDb()) - .completedAt(completedAt) - .status(status == null ? null : status.toDb()) - .build(); - } - - /** Build a {@link TableOperationsHistoryDto} from a DB row. */ - public static TableOperationsHistoryDto fromRow(TableOperationsHistoryRow row) { - if (row == null) { - return null; - } - return TableOperationsHistoryDto.builder() - .id(row.getId()) - .tableUuid(row.getTableUuid()) - .databaseName(row.getDatabaseName()) - .tableName(row.getTableName()) - .operationType(OperationTypeDto.fromDb(row.getOperationType())) - .completedAt(row.getCompletedAt()) - .status(HistoryStatusDto.fromDb(row.getStatus())) - .build(); - } - - /** - * Return whichever of {@code this} and {@code other} completed later (or {@code this} on tie). - * Shaped for use as a {@link java.util.function.BinaryOperator} in stream collectors. - */ - public TableOperationsHistoryDto after(TableOperationsHistoryDto other) { - return this.completedAt.isBefore(other.completedAt) ? other : this; - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsDto.java deleted file mode 100644 index 6dc52492c..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsDto.java +++ /dev/null @@ -1,194 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import java.time.Instant; -import java.util.Collections; -import java.util.Map; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * Self-describing per-table stats record. Carries the table's identity and metadata alongside the - * snapshot + delta payload so consumers don't need an outer wrapper to know which table the stats - * belong to. - * - *

Identity ({@link #tableUuid}, {@link #databaseName}, {@link #tableName}) and metadata ({@link - * #tableProperties}, {@link #updatedAt}) are populated when read from a current-state row. When - * this record is built from a per-commit history row, {@link #delta} is populated and {@link - * #tableProperties} / {@link #updatedAt} are typically {@code null}. - */ -@Data -@Builder(toBuilder = true) -@NoArgsConstructor -@AllArgsConstructor -@JsonIgnoreProperties(ignoreUnknown = true) -public class TableStatsDto { - - /** Stable table identity from the Tables Service. Survives renames; rotates on drop+recreate. */ - private String tableUuid; - - /** Database the table lives in. */ - private String databaseName; - - /** Iceberg table name (the human-readable identifier, not the UUID). */ - private String tableName; - - /** Current table-property map (e.g. maintenance opt-in flags). Never null. */ - @Builder.Default private Map tableProperties = Collections.emptyMap(); - - /** Snapshot fields — overwritten on every upsert. */ - private SnapshotMetrics snapshot; - - /** Delta fields — accumulated across commit events. Null when read from a current-state row. */ - private CommitDelta delta; - - /** When the current snapshot was last written. Stamped server-side on every upsert. */ - private Instant updatedAt; - - /** - * Project to the current-state {@code table_stats} row. Snapshot only; deltas live on history. - */ - public com.linkedin.openhouse.optimizer.db.TableStatsRow toRow() { - return com.linkedin.openhouse.optimizer.db.TableStatsRow.builder() - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .snapshot(snapshot == null ? null : snapshot.toDb()) - .tableProperties(tableProperties != null ? tableProperties : Collections.emptyMap()) - .updatedAt(updatedAt) - .build(); - } - - /** - * Build a {@link TableStatsDto} from a current-state DB row. {@link #delta} is left {@code null}. - */ - public static TableStatsDto fromRow(com.linkedin.openhouse.optimizer.db.TableStatsRow row) { - if (row == null) { - return null; - } - return TableStatsDto.builder() - .tableUuid(row.getTableUuid()) - .databaseName(row.getDatabaseName()) - .tableName(row.getTableName()) - .tableProperties( - row.getTableProperties() != null ? row.getTableProperties() : Collections.emptyMap()) - .snapshot(SnapshotMetrics.fromDb(row.getSnapshot())) - .updatedAt(row.getUpdatedAt()) - .build(); - } - - /** Project to the DB-layer {@link com.linkedin.openhouse.optimizer.db.SnapshotMetrics} object. */ - public com.linkedin.openhouse.optimizer.db.SnapshotMetrics toSnapshotRow() { - return snapshot == null ? null : snapshot.toDb(); - } - - /** - * Project to the DB-layer {@link com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics} object. - */ - public com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics toDeltaRow() { - return delta == null ? null : delta.toDb(); - } - - /** Join the two DB-side columns back into a single internal-model {@link TableStatsDto}. */ - public static TableStatsDto fromRows( - com.linkedin.openhouse.optimizer.db.SnapshotMetrics dbSnapshot, - com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics dbDelta) { - if (dbSnapshot == null && dbDelta == null) { - return null; - } - return TableStatsDto.builder() - .snapshot(SnapshotMetrics.fromDb(dbSnapshot)) - .delta(CommitDelta.fromDb(dbDelta)) - .build(); - } - - /** Point-in-time metadata read from Iceberg at scan time. */ - @Data - @Builder(toBuilder = true) - @NoArgsConstructor - @AllArgsConstructor - @JsonIgnoreProperties(ignoreUnknown = true) - public static class SnapshotMetrics { - - /** Iceberg metadata version pointer for this snapshot. */ - private String tableVersion; - - /** Filesystem path (or URI) of the table's storage root. */ - private String tableLocation; - - /** Total on-disk size of the table at this snapshot, in bytes. */ - private Long tableSizeBytes; - - /** Total number of data files as of the latest snapshot — used for bin-packing. */ - private Long numCurrentFiles; - - /** Convert to the DB-layer counterpart. */ - public com.linkedin.openhouse.optimizer.db.SnapshotMetrics toDb() { - return com.linkedin.openhouse.optimizer.db.SnapshotMetrics.builder() - .tableVersion(tableVersion) - .tableLocation(tableLocation) - .tableSizeBytes(tableSizeBytes) - .numCurrentFiles(numCurrentFiles) - .build(); - } - - /** Build the internal-model inner object from the DB-layer counterpart. */ - public static SnapshotMetrics fromDb(com.linkedin.openhouse.optimizer.db.SnapshotMetrics v) { - if (v == null) { - return null; - } - return SnapshotMetrics.builder() - .tableVersion(v.getTableVersion()) - .tableLocation(v.getTableLocation()) - .tableSizeBytes(v.getTableSizeBytes()) - .numCurrentFiles(v.getNumCurrentFiles()) - .build(); - } - } - - /** Per-commit incremental counters; accumulated across all recorded commit events. */ - @Data - @Builder(toBuilder = true) - @NoArgsConstructor - @AllArgsConstructor - @JsonIgnoreProperties(ignoreUnknown = true) - public static class CommitDelta { - - /** Number of data files this commit added to the table. */ - private Long numFilesAdded; - - /** Number of data files this commit removed from the table. */ - private Long numFilesDeleted; - - /** Total bytes added by this commit. */ - private Long addedSizeBytes; - - /** Total bytes removed by this commit. */ - private Long deletedSizeBytes; - - /** Convert to the DB-layer counterpart. */ - public com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics toDb() { - return com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics.builder() - .numFilesAdded(numFilesAdded) - .numFilesDeleted(numFilesDeleted) - .addedSizeBytes(addedSizeBytes) - .deletedSizeBytes(deletedSizeBytes) - .build(); - } - - /** Build the internal-model inner object from the DB-layer counterpart. */ - public static CommitDelta fromDb(com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics v) { - if (v == null) { - return null; - } - return CommitDelta.builder() - .numFilesAdded(v.getNumFilesAdded()) - .numFilesDeleted(v.getNumFilesDeleted()) - .addedSizeBytes(v.getAddedSizeBytes()) - .deletedSizeBytes(v.getDeletedSizeBytes()) - .build(); - } - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsHistoryDto.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsHistoryDto.java deleted file mode 100644 index 069944e59..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/model/TableStatsHistoryDto.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.linkedin.openhouse.optimizer.model; - -import com.linkedin.openhouse.optimizer.db.TableStatsHistoryRow; -import java.time.Instant; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -/** - * Internal-model view of an append-only per-commit stats history record. - * - *

One per Iceberg commit. {@link #stats} carries both the snapshot at commit time and the commit - * delta — consumers can reconstruct change rates over arbitrary time windows. - */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class TableStatsHistoryDto { - - /** UUID primary key — set by the caller, not generated server-side. */ - private String id; - - /** Stable table identity from the Tables Service. */ - private String tableUuid; - - /** Denormalized database name for display. */ - private String databaseName; - - /** Denormalized table name for display. */ - private String tableName; - - /** Snapshot + delta for this commit event. */ - private TableStatsDto stats; - - /** When this history row was recorded. */ - private Instant recordedAt; - - /** Convert to the corresponding DB row. */ - public TableStatsHistoryRow toRow() { - return TableStatsHistoryRow.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .snapshot(stats == null ? null : stats.toSnapshotRow()) - .delta(stats == null ? null : stats.toDeltaRow()) - .recordedAt(recordedAt) - .build(); - } - - /** Build a {@link TableStatsHistoryDto} from a DB row. */ - public static TableStatsHistoryDto fromRow(TableStatsHistoryRow row) { - if (row == null) { - return null; - } - return TableStatsHistoryDto.builder() - .id(row.getId()) - .tableUuid(row.getTableUuid()) - .databaseName(row.getDatabaseName()) - .tableName(row.getTableName()) - .stats(TableStatsDto.fromRows(row.getSnapshot(), row.getDelta())) - .recordedAt(row.getRecordedAt()) - .build(); - } -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepository.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepository.java deleted file mode 100644 index 6c08f844a..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepository.java +++ /dev/null @@ -1,45 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import com.linkedin.openhouse.optimizer.db.OperationType; -import com.linkedin.openhouse.optimizer.db.TableOperationsHistoryRow; -import java.util.List; -import org.springframework.data.domain.Pageable; -import org.springframework.data.jpa.repository.JpaRepository; -import org.springframework.data.jpa.repository.Query; -import org.springframework.data.repository.query.Param; - -/** Repository for reading {@code table_operations_history}. */ -public interface TableOperationsHistoryRepository - extends JpaRepository { - - /** - * Return history rows for a single {@code tableUuid}, newest first. {@code pageable} is required; - * callers pick the row cap (default limit lives in {@code optimizer.repo.default-limit}). - */ - @Query( - "SELECT r FROM TableOperationsHistoryRow r " - + "WHERE r.tableUuid = :tableUuid " - + "ORDER BY r.completedAt DESC") - List find(@Param("tableUuid") String tableUuid, Pageable pageable); - - /** - * Return the most-recent history row per {@code (table_uuid, operation_type)}, filtered to a - * single operation type. Used by the analyzer to evaluate cadence without materializing every - * historical row. - * - *

The correlated subquery is portable across MySQL and H2 (MySQL mode). Backed by index {@code - * idx_toph_optype_uuid_completed (operation_type, table_uuid, completed_at)} on {@code - * table_operations_history}, the subquery becomes an index-only lookup per outer row. - * - *

Ties on {@code completed_at} for the same {@code (table_uuid, operation_type)} return all - * tied rows; callers should dedupe in memory. - */ - @Query( - "SELECT r FROM TableOperationsHistoryRow r " - + "WHERE r.operationType = :operationType " - + "AND r.completedAt = (" - + " SELECT MAX(r2.completedAt) FROM TableOperationsHistoryRow r2 " - + " WHERE r2.tableUuid = r.tableUuid AND r2.operationType = r.operationType)") - List findLatest( - @Param("operationType") OperationType operationType, Pageable pageable); -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepository.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepository.java deleted file mode 100644 index e0df2cd21..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepository.java +++ /dev/null @@ -1,146 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import com.linkedin.openhouse.optimizer.db.OperationStatus; -import com.linkedin.openhouse.optimizer.db.OperationType; -import com.linkedin.openhouse.optimizer.db.TableOperationsRow; -import java.time.Instant; -import java.util.List; -import java.util.Optional; -import org.springframework.data.domain.Pageable; -import org.springframework.data.jpa.repository.JpaRepository; -import org.springframework.data.jpa.repository.Modifying; -import org.springframework.data.jpa.repository.Query; -import org.springframework.data.repository.query.Param; - -/** Spring Data JPA repository for {@code table_operations} rows in the optimizer DB. */ -public interface TableOperationsRepository extends JpaRepository { - - /** - * Find operation rows matching the given filters. Every filter is optional ({@link - * Optional#empty()} to skip). {@code pageable} is required; callers pick the row cap (default - * limit lives in {@code optimizer.repo.default-limit}). - */ - default List find( - Optional operationType, - Optional status, - Optional tableUuid, - Optional databaseName, - Optional tableName, - Optional scheduledAt, - Optional> ids, - Pageable pageable) { - // List parameters can't share an :ids IS NULL pattern with the IN clause — - // Hibernate expands the list inline and the IS NULL check turns ungrammatical. - // Two internal queries; dispatch by presence. - if (ids.isPresent()) { - return findInternalWithIds( - operationType.orElse(null), - status.orElse(null), - tableUuid.orElse(null), - databaseName.orElse(null), - tableName.orElse(null), - scheduledAt.orElse(null), - ids.get(), - pageable); - } - return findInternal( - operationType.orElse(null), - status.orElse(null), - tableUuid.orElse(null), - databaseName.orElse(null), - tableName.orElse(null), - scheduledAt.orElse(null), - pageable); - } - - /** - * Batch CAS: transition rows from {@code fromStatus} to {@code toStatus} for every id in {@code - * ids} that is still in {@code fromStatus}. Rows in a different status are skipped silently. - * Returns the number of rows transitioned. - * - *

Side-effect columns use COALESCE — {@link Optional#empty()} means "leave unchanged". The - * underlying transitions are: - * - *

    - *
  • PENDING → SCHEDULING: pass {@code scheduledAt = Optional.of(claimedAt)}; the watermark - * lets {@link #find} resolve the precise set of rows this caller claimed. - *
  • SCHEDULING → SCHEDULED: pass {@code jobId = Optional.of(...)}. - *
  • SCHEDULING → PENDING: pass both empty; {@code scheduledAt} stays at the prior claim's - * watermark (overwritten on the next claim) and {@code jobId} stays null. - *
- */ - default int updateBatch( - List ids, - OperationStatus fromStatus, - OperationStatus toStatus, - Optional scheduledAt, - Optional jobId) { - return updateBatchInternal( - ids, fromStatus, toStatus, scheduledAt.orElse(null), jobId.orElse(null)); - } - - /** - * Delete the specified rows, but only if they are still {@code PENDING}. The status gate is - * defensive — never drop a row another instance has claimed. Returns the number of rows actually - * removed. - */ - @Modifying(flushAutomatically = true, clearAutomatically = true) - @Query( - "DELETE FROM TableOperationsRow r " - + "WHERE r.id IN :ids " - + "AND r.status = com.linkedin.openhouse.optimizer.db.OperationStatus.PENDING") - int cancel(@Param("ids") List ids); - - // ---- Internals. Use the Optional-typed default methods above. ---- - - @Query( - "SELECT r FROM TableOperationsRow r " - + "WHERE (:operationType IS NULL OR r.operationType = :operationType) " - + "AND (:status IS NULL OR r.status = :status) " - + "AND (:tableUuid IS NULL OR r.tableUuid = :tableUuid) " - + "AND (:databaseName IS NULL OR r.databaseName = :databaseName) " - + "AND (:tableName IS NULL OR r.tableName = :tableName) " - + "AND (:scheduledAt IS NULL OR r.scheduledAt = :scheduledAt)") - List findInternal( - @Param("operationType") OperationType operationType, - @Param("status") OperationStatus status, - @Param("tableUuid") String tableUuid, - @Param("databaseName") String databaseName, - @Param("tableName") String tableName, - @Param("scheduledAt") Instant scheduledAt, - Pageable pageable); - - @Query( - "SELECT r FROM TableOperationsRow r " - + "WHERE (:operationType IS NULL OR r.operationType = :operationType) " - + "AND (:status IS NULL OR r.status = :status) " - + "AND (:tableUuid IS NULL OR r.tableUuid = :tableUuid) " - + "AND (:databaseName IS NULL OR r.databaseName = :databaseName) " - + "AND (:tableName IS NULL OR r.tableName = :tableName) " - + "AND (:scheduledAt IS NULL OR r.scheduledAt = :scheduledAt) " - + "AND r.id IN :ids") - List findInternalWithIds( - @Param("operationType") OperationType operationType, - @Param("status") OperationStatus status, - @Param("tableUuid") String tableUuid, - @Param("databaseName") String databaseName, - @Param("tableName") String tableName, - @Param("scheduledAt") Instant scheduledAt, - @Param("ids") List ids, - Pageable pageable); - - @Modifying(flushAutomatically = true, clearAutomatically = true) - @Query( - "UPDATE TableOperationsRow r " - + "SET r.status = :toStatus, " - + " r.scheduledAt = COALESCE(:scheduledAt, r.scheduledAt), " - + " r.jobId = COALESCE(:jobId, r.jobId) " - + "WHERE r.id IN :ids " - + "AND r.status = :fromStatus") - int updateBatchInternal( - @Param("ids") List ids, - @Param("fromStatus") OperationStatus fromStatus, - @Param("toStatus") OperationStatus toStatus, - @Param("scheduledAt") Instant scheduledAt, - @Param("jobId") String jobId); -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepository.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepository.java deleted file mode 100644 index 9b603f265..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepository.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import com.linkedin.openhouse.optimizer.db.TableStatsHistoryRow; -import java.time.Instant; -import java.util.List; -import java.util.Optional; -import org.springframework.data.domain.Pageable; -import org.springframework.data.jpa.repository.JpaRepository; -import org.springframework.data.jpa.repository.Query; -import org.springframework.data.repository.query.Param; - -/** Append-only repository for per-commit stats history rows. */ -public interface TableStatsHistoryRepository extends JpaRepository { - - /** - * Return history rows for a table, newest first. {@code since} is optional ({@link - * Optional#empty()} to skip the time filter). {@code pageable} is required; callers pick the row - * cap (default limit lives in {@code optimizer.repo.default-limit}). - */ - default List find( - String tableUuid, Optional since, Pageable pageable) { - return findInternal(tableUuid, since.orElse(null), pageable); - } - - // ---- Internals. Use the Optional-typed default method above. ---- - - @Query( - "SELECT r FROM TableStatsHistoryRow r " - + "WHERE r.tableUuid = :tableUuid " - + "AND (:since IS NULL OR r.recordedAt >= :since) " - + "ORDER BY r.recordedAt DESC") - List findInternal( - @Param("tableUuid") String tableUuid, @Param("since") Instant since, Pageable pageable); -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepository.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepository.java deleted file mode 100644 index 1123c0e7a..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepository.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import com.linkedin.openhouse.optimizer.db.TableStatsRow; -import java.util.List; -import java.util.Optional; -import org.springframework.data.domain.Pageable; -import org.springframework.data.jpa.repository.JpaRepository; -import org.springframework.data.jpa.repository.Query; -import org.springframework.data.repository.query.Param; - -/** Spring Data JPA repository for {@code table_stats} rows in the optimizer DB. */ -public interface TableStatsRepository extends JpaRepository { - - /** - * Return stats rows matching the given filters. Every filter is optional ({@link - * Optional#empty()} to skip). {@code pageable} is required; callers pick the row cap (default - * limit lives in {@code optimizer.repo.default-limit}). - */ - default List find( - Optional databaseName, - Optional tableName, - Optional tableUuid, - Pageable pageable) { - return findInternal( - databaseName.orElse(null), tableName.orElse(null), tableUuid.orElse(null), pageable); - } - - /** - * Return the distinct {@code database_name} values present in {@code table_stats}. Used by the - * Analyzer to enumerate databases when iterating per-db; the result set size is bounded by the - * number of databases (small even at million-table scale). - */ - @Query("SELECT DISTINCT r.databaseName FROM TableStatsRow r") - List findDistinctDatabaseNames(); - - // ---- Internals. Use the Optional-typed default methods above. ---- - - @Query( - "SELECT r FROM TableStatsRow r " - + "WHERE (:databaseName IS NULL OR r.databaseName = :databaseName) " - + "AND (:tableName IS NULL OR r.tableName = :tableName) " - + "AND (:tableUuid IS NULL OR r.tableUuid = :tableUuid)") - List findInternal( - @Param("databaseName") String databaseName, - @Param("tableName") String tableName, - @Param("tableUuid") String tableUuid, - Pageable pageable); -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java deleted file mode 100644 index c20ae7bf2..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataService.java +++ /dev/null @@ -1,94 +0,0 @@ -package com.linkedin.openhouse.optimizer.service; - -import com.linkedin.openhouse.optimizer.model.HistoryStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import com.linkedin.openhouse.optimizer.model.TableStatsDto; -import com.linkedin.openhouse.optimizer.model.TableStatsHistoryDto; -import java.time.Instant; -import java.util.List; -import java.util.Optional; - -/** - * Service interface for optimizer data operations. - * - *

The service is the boundary between the wire-API surface and the database. Inputs and outputs - * are internal-model types only — callers (controllers, future CLI, in-process consumers) - * convert at their own edge. No api/-package types appear here. - */ -public interface OptimizerDataService { - - // --- TableOperations --- - - /** - * List operations matching the given filters, capped at {@code limit} rows. Every filter - * parameter is optional — pass {@link Optional#empty()} to skip that filter. - */ - List listTableOperations( - Optional operationType, - Optional status, - Optional databaseName, - Optional tableName, - Optional tableUuid, - int limit); - - /** - * Update an operation by writing a history entry. Looks up the operation row by {@code - * operationId}, copies its table metadata into a new history row with the supplied terminal - * {@code status}, and saves it. Returns the history record, or empty if the operation does not - * exist. - */ - Optional updateOperation(String operationId, HistoryStatusDto status); - - /** - * Return the operation row for {@code id} regardless of status, or empty if it does not exist. - * Used to poll a specific operation (e.g. waiting for SUCCESS after a Spark job completes). - */ - Optional getTableOperation(String id); - - // --- TableStatsDto --- - - /** - * Create or update the stats row for {@code stats.getTableUuid()}. Fully idempotent: the same - * call overwrites the previous snapshot with the latest commit values. The service stamps {@link - * TableStatsDto#getUpdatedAt()} server-side and returns the resulting {@link TableStatsDto}. - */ - TableStatsDto upsertTableStats(TableStatsDto stats); - - /** Return the stats row for {@code tableUuid}, or empty if none exists. */ - Optional getTableStats(String tableUuid); - - /** - * List stats rows matching the given filters, capped at {@code limit} rows. Every filter - * parameter is optional — pass {@link Optional#empty()} to skip that filter. - */ - List listTableStats( - Optional databaseName, - Optional tableName, - Optional tableUuid, - int limit); - - /** - * Return per-commit stats history for {@code tableUuid}, newest first. - * - * @param tableUuid the stable table UUID - * @param since if present, only return rows recorded at or after this instant - * @param limit maximum number of rows to return - */ - List getStatsHistory(String tableUuid, Optional since, int limit); - - // --- TableOperationsHistoryDto --- - - /** Append a completed-job result record. */ - TableOperationsHistoryDto appendHistory(TableOperationsHistoryDto history); - - /** - * Return the most recent history rows for a table UUID, newest first. - * - * @param tableUuid the stable table UUID - * @param limit maximum number of rows to return - */ - List getHistory(String tableUuid, int limit); -} diff --git a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java b/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java deleted file mode 100644 index 29fd0eeee..000000000 --- a/services/optimizer/src/main/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImpl.java +++ /dev/null @@ -1,175 +0,0 @@ -package com.linkedin.openhouse.optimizer.service; - -import com.linkedin.openhouse.optimizer.db.TableStatsHistoryRow; -import com.linkedin.openhouse.optimizer.db.TableStatsRow; -import com.linkedin.openhouse.optimizer.model.HistoryStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableOperationDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import com.linkedin.openhouse.optimizer.model.TableStatsDto; -import com.linkedin.openhouse.optimizer.model.TableStatsHistoryDto; -import com.linkedin.openhouse.optimizer.repository.TableOperationsHistoryRepository; -import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository; -import com.linkedin.openhouse.optimizer.repository.TableStatsHistoryRepository; -import com.linkedin.openhouse.optimizer.repository.TableStatsRepository; -import java.time.Instant; -import java.util.List; -import java.util.Optional; -import java.util.UUID; -import java.util.stream.Collectors; -import lombok.RequiredArgsConstructor; -import org.springframework.data.domain.PageRequest; -import org.springframework.stereotype.Service; -import org.springframework.transaction.annotation.Transactional; - -/** - * Implementation of {@link OptimizerDataService}. - * - *

Operates purely on model/ and db/ types. Conversion happens via the {@code toRow()} / {@code - * fromRow(...)} methods on the model types themselves — no injected mapper. No api/-package types - * appear in this class. - */ -@Service -@RequiredArgsConstructor -public class OptimizerDataServiceImpl implements OptimizerDataService { - - private final TableOperationsRepository operationsRepository; - private final TableOperationsHistoryRepository historyRepository; - private final TableStatsRepository statsRepository; - private final TableStatsHistoryRepository statsHistoryRepository; - - // --- TableOperations --- - - @Override - public List listTableOperations( - Optional operationType, - Optional status, - Optional databaseName, - Optional tableName, - Optional tableUuid, - int limit) { - return operationsRepository - .find( - operationType.map(OperationTypeDto::toDb), - status.map(OperationStatusDto::toDb), - tableUuid, - databaseName, - tableName, - Optional.empty(), - Optional.empty(), - PageRequest.of(0, limit)) - .stream() - .map(TableOperationDto::fromRow) - .collect(Collectors.toList()); - } - - @Override - @Transactional - public Optional updateOperation( - String operationId, HistoryStatusDto status) { - return operationsRepository - .findById(operationId) - .map( - row -> - TableOperationsHistoryDto.builder() - .id(row.getId()) - .tableUuid(row.getTableUuid()) - .databaseName(row.getDatabaseName()) - .tableName(row.getTableName()) - .operationType(OperationTypeDto.fromDb(row.getOperationType())) - .completedAt(Instant.now()) - .status(status) - .build()) - .map(history -> TableOperationsHistoryDto.fromRow(historyRepository.save(history.toRow()))); - } - - @Override - public Optional getTableOperation(String id) { - return operationsRepository.findById(id).map(TableOperationDto::fromRow); - } - - // --- TableStatsDto --- - - @Override - @Transactional - public TableStatsDto upsertTableStats(TableStatsDto stats) { - Instant now = Instant.now(); - String tableUuid = stats.getTableUuid(); - - TableStatsRow row = - statsRepository - .findById(tableUuid) - .map( - existing -> - existing - .toBuilder() - .databaseName(stats.getDatabaseName()) - .tableName(stats.getTableName()) - .snapshot(stats.toSnapshotRow()) - .tableProperties(stats.getTableProperties()) - .updatedAt(now) - .build()) - .orElse(stats.toBuilder().updatedAt(now).build().toRow()); - TableStatsRow saved = statsRepository.save(row); - - statsHistoryRepository.save( - TableStatsHistoryRow.builder() - .id(UUID.randomUUID().toString()) - .tableUuid(tableUuid) - .databaseName(stats.getDatabaseName()) - .tableName(stats.getTableName()) - .snapshot(stats.toSnapshotRow()) - .delta(stats.toDeltaRow()) - .recordedAt(now) - .build()); - - return TableStatsDto.fromRow(saved); - } - - @Override - public Optional getTableStats(String tableUuid) { - return statsRepository.findById(tableUuid).map(TableStatsDto::fromRow); - } - - @Override - public List listTableStats( - Optional databaseName, - Optional tableName, - Optional tableUuid, - int limit) { - return statsRepository.find(databaseName, tableName, tableUuid, PageRequest.of(0, limit)) - .stream() - .map(TableStatsDto::fromRow) - .collect(Collectors.toList()); - } - - @Override - public List getStatsHistory( - String tableUuid, Optional since, int limit) { - return statsHistoryRepository.find(tableUuid, since, PageRequest.of(0, limit)).stream() - .map(TableStatsHistoryDto::fromRow) - .collect(Collectors.toList()); - } - - // --- TableOperationsHistoryDto --- - - @Override - @Transactional - public TableOperationsHistoryDto appendHistory(TableOperationsHistoryDto history) { - TableOperationsHistoryDto toWrite = - history - .toBuilder() - .completedAt( - history.getCompletedAt() != null ? history.getCompletedAt() : Instant.now()) - .build(); - return TableOperationsHistoryDto.fromRow(historyRepository.save(toWrite.toRow())); - } - - @Override - public List getHistory(String tableUuid, int limit) { - return historyRepository.find(tableUuid, PageRequest.of(0, limit)).stream() - .map(TableOperationsHistoryDto::fromRow) - .collect(Collectors.toList()); - } -} diff --git a/services/optimizer/src/main/resources/application.properties b/services/optimizer/src/main/resources/application.properties deleted file mode 100644 index e7f082b47..000000000 --- a/services/optimizer/src/main/resources/application.properties +++ /dev/null @@ -1,25 +0,0 @@ -spring.application.name=openhouse-optimizer-service -server.port=8080 - -spring.jpa.hibernate.ddl-auto=none -spring.sql.init.mode=always -spring.jpa.defer-datasource-initialization=true -spring.sql.init.schema-locations=classpath:db/optimizer-schema.sql - -spring.jpa.properties.hibernate.dialect=org.hibernate.dialect.MySQL8Dialect -spring.jpa.properties.hibernate.show_sql=false -spring.jpa.properties.hibernate.physical_naming_strategy=org.hibernate.boot.model.naming.PhysicalNamingStrategyStandardImpl - -spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver -spring.datasource.url=${OPTIMIZER_DB_URL:jdbc:mysql://localhost:3306/oh_db} -spring.datasource.username=${OPTIMIZER_DB_USER:oh_user} -spring.datasource.password=${OPTIMIZER_DB_PASSWORD:oh_password} -spring.datasource.hikari.maximum-pool-size=20 - -management.endpoints.web.exposure.include=health,prometheus -management.endpoint.health.enabled=true - -# Include ResponseStatusException.reason in the default error response body. Without this, Spring -# Boot 2.7 omits the `message` field, and the human-readable detail from a thrown -# ResponseStatusException never reaches the caller. -server.error.include-message=always diff --git a/services/optimizer/src/main/resources/db/optimizer-schema.sql b/services/optimizer/src/main/resources/db/optimizer-schema.sql deleted file mode 100644 index 892c1c55f..000000000 --- a/services/optimizer/src/main/resources/db/optimizer-schema.sql +++ /dev/null @@ -1,54 +0,0 @@ --- Optimizer Service Schema --- Compatible with MySQL (production) and H2 in MySQL mode (tests). -CREATE TABLE IF NOT EXISTS table_operations ( - id VARCHAR(36) NOT NULL, - table_uuid VARCHAR(36) NOT NULL, - database_name VARCHAR(128) NOT NULL, - table_name VARCHAR(128) NOT NULL, - operation_type VARCHAR(50) NOT NULL, - status VARCHAR(20) NOT NULL, - created_at TIMESTAMP(6) NOT NULL, - scheduled_at TIMESTAMP(6), - job_id VARCHAR(255), - -- TODO: per-operation metric columns will be added as operations are onboarded. - PRIMARY KEY (id) -); - -CREATE TABLE IF NOT EXISTS table_stats ( - table_uuid VARCHAR(36) NOT NULL, - database_name VARCHAR(128) NOT NULL, - table_name VARCHAR(128) NOT NULL, - snapshot TEXT, - table_properties TEXT, - updated_at TIMESTAMP(6) NOT NULL, - PRIMARY KEY (table_uuid) -); - -CREATE TABLE IF NOT EXISTS table_stats_history ( - id VARCHAR(36) NOT NULL, - table_uuid VARCHAR(36) NOT NULL, - database_name VARCHAR(128) NOT NULL, - table_name VARCHAR(128) NOT NULL, - snapshot TEXT, - delta TEXT, - recorded_at TIMESTAMP(6) NOT NULL, - PRIMARY KEY (id), - INDEX idx_tsh_table_uuid (table_uuid), - INDEX idx_tsh_recorded_at (recorded_at) -); - -CREATE TABLE IF NOT EXISTS table_operations_history ( - id VARCHAR(36) NOT NULL, - table_uuid VARCHAR(36) NOT NULL, - database_name VARCHAR(128) NOT NULL, - table_name VARCHAR(128) NOT NULL, - operation_type VARCHAR(50) NOT NULL, - completed_at TIMESTAMP(6) NOT NULL, - status VARCHAR(20) NOT NULL, - PRIMARY KEY (id), - INDEX idx_toph_db_table (database_name, table_name), - -- Drives TableOperationHistoryRepository.findLatestPerTable: the correlated - -- MAX(completed_at) subquery becomes an index-only lookup per (operation_type, - -- table_uuid) instead of an O(N²) scan. - INDEX idx_toph_optype_uuid_completed (operation_type, table_uuid, completed_at) -); diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/OptimizerServiceContextTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/OptimizerServiceContextTest.java deleted file mode 100644 index fa373c57d..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/OptimizerServiceContextTest.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.linkedin.openhouse.optimizer; - -import static org.assertj.core.api.Assertions.assertThat; - -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.context.ApplicationContext; -import org.springframework.test.context.ActiveProfiles; - -/** - * Validates that the Spring application context loads successfully against the H2 schema. This test - * exercises schema-SQL-init, JPA entity scanning, and repository wiring. - */ -@SpringBootTest -@ActiveProfiles("test") -class OptimizerServiceContextTest { - - @Autowired ApplicationContext context; - - @Test - void contextLoads() { - assertThat(context).isNotNull(); - } -} diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/api/controller/ControllerErrorHandlingTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/api/controller/ControllerErrorHandlingTest.java deleted file mode 100644 index b9c8dc3dc..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/api/controller/ControllerErrorHandlingTest.java +++ /dev/null @@ -1,124 +0,0 @@ -package com.linkedin.openhouse.optimizer.api.controller; - -import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get; -import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post; -import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath; -import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; - -import com.linkedin.openhouse.optimizer.db.OperationType; -import com.linkedin.openhouse.optimizer.db.TableOperationsRow; -import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository; -import java.time.Instant; -import java.util.UUID; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.http.MediaType; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.test.web.servlet.MockMvc; -import org.springframework.transaction.annotation.Transactional; - -/** - * Exercises what the controllers own: server-side validation on {@code updateOperation} (path/body - * mismatch, missing fields) and 404s on missing rows. Assertions are status-code-only: MockMvc does - * not trigger Spring's error-dispatch to {@code BasicErrorController}, so the response body of a - * {@link org.springframework.web.server.ResponseStatusException} is empty in tests even though it - * is populated in production (with {@code server.error.include-message=always}). Framework-level - * 4xx (missing query param, malformed JSON, etc.) is left to Spring's defaults and not asserted. - */ -@SpringBootTest -@AutoConfigureMockMvc -@ActiveProfiles("test") -@Transactional -class ControllerErrorHandlingTest { - - @Autowired MockMvc mockMvc; - @Autowired TableOperationsRepository operationsRepository; - - @Test - void updateOperation_notFound_returns404() throws Exception { - String id = UUID.randomUUID().toString(); - String body = String.format("{\"operationId\":\"%s\",\"status\":\"SUCCESS\"}", id); - mockMvc - .perform( - post("/v1/optimizer/operations/" + id + "/update") - .contentType(MediaType.APPLICATION_JSON) - .content(body)) - .andExpect(status().isNotFound()); - } - - @Test - void updateOperation_pathBodyMismatch_returns400() throws Exception { - String pathId = UUID.randomUUID().toString(); - String bodyId = UUID.randomUUID().toString(); - String body = String.format("{\"operationId\":\"%s\",\"status\":\"SUCCESS\"}", bodyId); - mockMvc - .perform( - post("/v1/optimizer/operations/" + pathId + "/update") - .contentType(MediaType.APPLICATION_JSON) - .content(body)) - .andExpect(status().isBadRequest()); - } - - @Test - void updateOperation_missingOperationId_returns400() throws Exception { - String pathId = UUID.randomUUID().toString(); - String body = "{\"status\":\"SUCCESS\"}"; - mockMvc - .perform( - post("/v1/optimizer/operations/" + pathId + "/update") - .contentType(MediaType.APPLICATION_JSON) - .content(body)) - .andExpect(status().isBadRequest()); - } - - @Test - void updateOperation_missingStatus_returns400() throws Exception { - String id = UUID.randomUUID().toString(); - String body = String.format("{\"operationId\":\"%s\"}", id); - mockMvc - .perform( - post("/v1/optimizer/operations/" + id + "/update") - .contentType(MediaType.APPLICATION_JSON) - .content(body)) - .andExpect(status().isBadRequest()); - } - - @Test - void getTableOperation_notFound_returns404() throws Exception { - String id = UUID.randomUUID().toString(); - mockMvc.perform(get("/v1/optimizer/operations/" + id)).andExpect(status().isNotFound()); - } - - @Test - void getTableStats_notFound_returns404() throws Exception { - String uuid = UUID.randomUUID().toString(); - mockMvc.perform(get("/v1/optimizer/stats/" + uuid)).andExpect(status().isNotFound()); - } - - @Test - void updateOperation_happyPath_returns201() throws Exception { - String id = UUID.randomUUID().toString(); - operationsRepository.save( - TableOperationsRow.builder() - .id(id) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .status(com.linkedin.openhouse.optimizer.db.OperationStatus.SCHEDULED) - .createdAt(Instant.now()) - .scheduledAt(Instant.now()) - .jobId("job-x") - .build()); - String body = String.format("{\"operationId\":\"%s\",\"status\":\"SUCCESS\"}", id); - mockMvc - .perform( - post("/v1/optimizer/operations/" + id + "/update") - .contentType(MediaType.APPLICATION_JSON) - .content(body)) - .andExpect(status().isCreated()) - .andExpect(jsonPath("$.status").value("SUCCESS")); - } -} diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepositoryTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepositoryTest.java deleted file mode 100644 index 9f1de0c0c..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsHistoryRepositoryTest.java +++ /dev/null @@ -1,130 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import static org.assertj.core.api.Assertions.assertThat; - -import com.linkedin.openhouse.optimizer.db.HistoryStatus; -import com.linkedin.openhouse.optimizer.db.OperationType; -import com.linkedin.openhouse.optimizer.db.TableOperationsHistoryRow; -import java.time.Instant; -import java.util.List; -import java.util.UUID; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.data.domain.PageRequest; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.transaction.annotation.Transactional; - -@SpringBootTest -@ActiveProfiles("test") -@Transactional -class TableOperationsHistoryRepositoryTest { - - @Autowired TableOperationsHistoryRepository repository; - - @Test - void findByTableUuid_returnsRowsNewestFirst() { - Instant t1 = Instant.parse("2024-01-01T10:00:00Z"); - Instant t2 = Instant.parse("2024-01-02T10:00:00Z"); - String tableUuid = UUID.randomUUID().toString(); - String idOlder = UUID.randomUUID().toString(); - String idNewer = UUID.randomUUID().toString(); - - repository.save( - TableOperationsHistoryRow.builder() - .id(idOlder) - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .completedAt(t1) - .status(HistoryStatus.SUCCESS) - .build()); - - repository.save( - TableOperationsHistoryRow.builder() - .id(idNewer) - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .completedAt(t2) - .status(HistoryStatus.FAILED) - .build()); - - List rows = repository.find(tableUuid, PageRequest.of(0, 10)); - - assertThat(rows).hasSize(2); - assertThat(rows.get(0).getId()).isEqualTo(idNewer); - assertThat(rows.get(1).getId()).isEqualTo(idOlder); - } - - @Test - void findByTableUuid_respectsLimit() { - Instant now = Instant.now(); - String tableUuid = UUID.randomUUID().toString(); - for (int i = 0; i < 5; i++) { - repository.save( - TableOperationsHistoryRow.builder() - .id(UUID.randomUUID().toString()) - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl3") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .completedAt(now.plusSeconds(i)) - .status(HistoryStatus.SUCCESS) - .build()); - } - - List rows = repository.find(tableUuid, PageRequest.of(0, 3)); - assertThat(rows).hasSize(3); - } - - @Test - void findLatestPerTable_returnsOneRowPerTableUuid() { - Instant t1 = Instant.parse("2024-01-01T10:00:00Z"); - Instant t2 = Instant.parse("2024-02-01T10:00:00Z"); - String tableUuid = UUID.randomUUID().toString(); - String otherUuid = UUID.randomUUID().toString(); - - repository.save( - TableOperationsHistoryRow.builder() - .id(UUID.randomUUID().toString()) - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .completedAt(t1) - .status(HistoryStatus.SUCCESS) - .build()); - repository.save( - TableOperationsHistoryRow.builder() - .id(UUID.randomUUID().toString()) - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .completedAt(t2) - .status(HistoryStatus.FAILED) - .build()); - repository.save( - TableOperationsHistoryRow.builder() - .id(UUID.randomUUID().toString()) - .tableUuid(otherUuid) - .databaseName("db1") - .tableName("tbl2") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .completedAt(t1) - .status(HistoryStatus.SUCCESS) - .build()); - - List latest = - repository.findLatest(OperationType.ORPHAN_FILES_DELETION, PageRequest.of(0, 10_000)); - - assertThat(latest).hasSize(2); - TableOperationsHistoryRow forTarget = - latest.stream().filter(r -> r.getTableUuid().equals(tableUuid)).findFirst().orElseThrow(); - assertThat(forTarget.getCompletedAt()).isEqualTo(t2); - assertThat(forTarget.getStatus()).isEqualTo(HistoryStatus.FAILED); - } -} diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepositoryTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepositoryTest.java deleted file mode 100644 index 072be5fd9..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableOperationsRepositoryTest.java +++ /dev/null @@ -1,312 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import static org.assertj.core.api.Assertions.assertThat; - -import com.linkedin.openhouse.optimizer.db.OperationStatus; -import com.linkedin.openhouse.optimizer.db.OperationType; -import com.linkedin.openhouse.optimizer.db.TableOperationsRow; -import java.time.Instant; -import java.time.temporal.ChronoUnit; -import java.util.List; -import java.util.Optional; -import java.util.UUID; -import java.util.stream.Collectors; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.data.domain.PageRequest; -import org.springframework.data.domain.Pageable; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.transaction.annotation.Transactional; - -@SpringBootTest -@ActiveProfiles("test") -@Transactional -class TableOperationsRepositoryTest { - - private static final Pageable PAGE = PageRequest.of(0, 10_000); - - @Autowired TableOperationsRepository repository; - - @Test - void saveAndFindById() { - String id = UUID.randomUUID().toString(); - - repository.save(pendingRow(id, "tbl1")); - - Optional found = repository.findById(id); - assertThat(found).isPresent(); - assertThat(found.get().getStatus()).isEqualTo(OperationStatus.PENDING); - } - - @Test - void find_noFilters_returnsAll() { - repository.save(pendingRow(UUID.randomUUID().toString(), "tbl1")); - repository.save(scheduledRow(UUID.randomUUID().toString(), "tbl2")); - - List rows = - repository.find( - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - PAGE); - assertThat(rows).hasSize(2); - } - - @Test - void find_byStatus() { - repository.save(pendingRow(UUID.randomUUID().toString(), "tbl1")); - repository.save(scheduledRow(UUID.randomUUID().toString(), "tbl2")); - - List pending = - repository.find( - Optional.empty(), - Optional.of(OperationStatus.PENDING), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - PAGE); - assertThat(pending).hasSize(1); - assertThat(pending.get(0).getStatus()).isEqualTo(OperationStatus.PENDING); - - List scheduled = - repository.find( - Optional.empty(), - Optional.of(OperationStatus.SCHEDULED), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - PAGE); - assertThat(scheduled).hasSize(1); - assertThat(scheduled.get(0).getStatus()).isEqualTo(OperationStatus.SCHEDULED); - } - - @Test - void find_byDatabaseAndTable() { - repository.save(pendingRow(UUID.randomUUID().toString(), "tbl1", "db1")); - repository.save(pendingRow(UUID.randomUUID().toString(), "tbl2", "db2")); - - assertThat( - repository.find( - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.of("db1"), - Optional.empty(), - Optional.empty(), - Optional.empty(), - PAGE)) - .hasSize(1); - assertThat( - repository.find( - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.of("db2"), - Optional.of("tbl2"), - Optional.empty(), - Optional.empty(), - PAGE)) - .hasSize(1); - assertThat( - repository.find( - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.of("db1"), - Optional.of("tbl2"), - Optional.empty(), - Optional.empty(), - PAGE)) - .isEmpty(); - } - - @Test - void find_byScheduledAtAndIds_resolvesClaimedSubset() { - String idA = UUID.randomUUID().toString(); - String idB = UUID.randomUUID().toString(); - String idC = UUID.randomUUID().toString(); - repository.save(pendingRow(idA, "tbl_a")); - repository.save(pendingRow(idB, "tbl_b")); - // idC is already SCHEDULING with an older watermark — must NOT appear. - repository.save( - TableOperationsRow.builder() - .id(idC) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl_c") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .status(OperationStatus.SCHEDULING) - .createdAt(Instant.now()) - .scheduledAt(Instant.now().minusSeconds(60)) - .build()); - - // Truncate to microseconds — MySQL TIMESTAMP(6) (and H2 in MySQL mode) stores microseconds, - // so a nano-precision now() round-trips lossily. On Linux CI Instant.now() carries nanos; - // truncating here keeps the watermark comparison exact across platforms. - Instant now = Instant.now().truncatedTo(ChronoUnit.MICROS); - int transitioned = - repository.updateBatch( - List.of(idA, idB, idC), - OperationStatus.PENDING, - OperationStatus.SCHEDULING, - Optional.of(now), - Optional.empty()); - assertThat(transitioned).isEqualTo(2); - - List claimedIds = - repository - .find( - Optional.empty(), - Optional.of(OperationStatus.SCHEDULING), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.of(now), - Optional.of(List.of(idA, idB, idC)), - PAGE) - .stream() - .map(TableOperationsRow::getId) - .collect(Collectors.toList()); - assertThat(claimedIds).containsExactlyInAnyOrder(idA, idB); - } - - @Test - void updateBatch_schedulingToScheduled_setsJobIdAndPreservesScheduledAt() { - String id = UUID.randomUUID().toString(); - Instant claimedAt = Instant.parse("2026-05-20T16:42:43Z"); - repository.save( - TableOperationsRow.builder() - .id(id) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .status(OperationStatus.SCHEDULING) - .createdAt(Instant.now()) - .scheduledAt(claimedAt) - .build()); - - int updated = - repository.updateBatch( - List.of(id), - OperationStatus.SCHEDULING, - OperationStatus.SCHEDULED, - Optional.empty(), - Optional.of("job-123")); - assertThat(updated).isEqualTo(1); - - TableOperationsRow row = repository.findById(id).orElseThrow(); - assertThat(row.getStatus()).isEqualTo(OperationStatus.SCHEDULED); - assertThat(row.getJobId()).isEqualTo("job-123"); - assertThat(row.getScheduledAt()).isEqualTo(claimedAt); - } - - @Test - void updateBatch_schedulingToPending_leavesScheduledAtUntouched() { - // scheduledAt is intentionally NOT cleared on revert. Status is the source of truth; the - // stale watermark gets overwritten on the next PENDING → SCHEDULING transition. - String id = UUID.randomUUID().toString(); - Instant claimedAt = Instant.parse("2026-05-20T16:42:43Z"); - repository.save( - TableOperationsRow.builder() - .id(id) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl1") - .operationType(OperationType.ORPHAN_FILES_DELETION) - .status(OperationStatus.SCHEDULING) - .createdAt(Instant.now()) - .scheduledAt(claimedAt) - .build()); - - int reverted = - repository.updateBatch( - List.of(id), - OperationStatus.SCHEDULING, - OperationStatus.PENDING, - Optional.empty(), - Optional.empty()); - assertThat(reverted).isEqualTo(1); - - TableOperationsRow row = repository.findById(id).orElseThrow(); - assertThat(row.getStatus()).isEqualTo(OperationStatus.PENDING); - assertThat(row.getScheduledAt()).isEqualTo(claimedAt); - } - - @Test - void updateBatch_skipsRowsNotInFromStatus() { - String pendingId = UUID.randomUUID().toString(); - String scheduledId = UUID.randomUUID().toString(); - repository.save(pendingRow(pendingId, "tbl_a")); - repository.save(scheduledRow(scheduledId, "tbl_b")); - - int transitioned = - repository.updateBatch( - List.of(pendingId, scheduledId), - OperationStatus.PENDING, - OperationStatus.SCHEDULING, - Optional.of(Instant.now()), - Optional.empty()); - assertThat(transitioned).isEqualTo(1); - - assertThat(repository.findById(pendingId).orElseThrow().getStatus()) - .isEqualTo(OperationStatus.SCHEDULING); - assertThat(repository.findById(scheduledId).orElseThrow().getStatus()) - .isEqualTo(OperationStatus.SCHEDULED); - } - - @Test - void cancel_deletesOnlyPendingRows() { - String pendingId = UUID.randomUUID().toString(); - String scheduledId = UUID.randomUUID().toString(); - repository.save(pendingRow(pendingId, "tbl_p")); - repository.save(scheduledRow(scheduledId, "tbl_s")); - - int deleted = repository.cancel(List.of(pendingId, scheduledId)); - assertThat(deleted).isEqualTo(1); - - assertThat(repository.findById(pendingId)).isEmpty(); - assertThat(repository.findById(scheduledId)).isPresent(); - } - - // --- helpers --- - - private TableOperationsRow pendingRow(String id, String tableName) { - return pendingRow(id, tableName, "db1"); - } - - private TableOperationsRow pendingRow(String id, String tableName, String databaseName) { - return TableOperationsRow.builder() - .id(id) - .tableUuid(UUID.randomUUID().toString()) - .databaseName(databaseName) - .tableName(tableName) - .operationType(OperationType.ORPHAN_FILES_DELETION) - .status(OperationStatus.PENDING) - .createdAt(Instant.now()) - .build(); - } - - private TableOperationsRow scheduledRow(String id, String tableName) { - return TableOperationsRow.builder() - .id(id) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName(tableName) - .operationType(OperationType.ORPHAN_FILES_DELETION) - .status(OperationStatus.SCHEDULED) - .createdAt(Instant.now()) - .scheduledAt(Instant.now()) - .jobId("job-" + id) - .build(); - } -} diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepositoryTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepositoryTest.java deleted file mode 100644 index cddec50c9..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsHistoryRepositoryTest.java +++ /dev/null @@ -1,148 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import static org.assertj.core.api.Assertions.assertThat; - -import com.linkedin.openhouse.optimizer.db.CommitDeltaMetrics; -import com.linkedin.openhouse.optimizer.db.SnapshotMetrics; -import com.linkedin.openhouse.optimizer.db.TableStatsHistoryRow; -import java.time.Instant; -import java.time.temporal.ChronoUnit; -import java.util.List; -import java.util.Optional; -import java.util.UUID; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.data.domain.PageRequest; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.transaction.annotation.Transactional; - -@SpringBootTest -@ActiveProfiles("test") -@Transactional -class TableStatsHistoryRepositoryTest { - - @Autowired TableStatsHistoryRepository repository; - - @Test - void saveAndFind() { - String tableUuid = UUID.randomUUID().toString(); - Instant now = Instant.now(); - - repository.save(buildRow(tableUuid, "db1", "tbl1", 10L, 2L, now.minus(2, ChronoUnit.HOURS))); - repository.save(buildRow(tableUuid, "db1", "tbl1", 5L, 1L, now.minus(1, ChronoUnit.HOURS))); - repository.save(buildRow(tableUuid, "db1", "tbl1", 3L, 0L, now)); - - List rows = - repository.find(tableUuid, Optional.empty(), PageRequest.of(0, 100)); - - assertThat(rows).hasSize(3); - // newest first - assertThat(rows.get(0).getDelta().getNumFilesAdded()).isEqualTo(3L); - assertThat(rows.get(2).getDelta().getNumFilesAdded()).isEqualTo(10L); - } - - @Test - void find_respectsLimit() { - String tableUuid = UUID.randomUUID().toString(); - Instant now = Instant.now(); - - for (int i = 0; i < 5; i++) { - repository.save(buildRow(tableUuid, "db1", "tbl1", i, 0L, now.minus(i, ChronoUnit.HOURS))); - } - - List rows = - repository.find(tableUuid, Optional.empty(), PageRequest.of(0, 3)); - - assertThat(rows).hasSize(3); - } - - @Test - void find_withSince_filtersOlderRows() { - String tableUuid = UUID.randomUUID().toString(); - Instant now = Instant.now(); - Instant cutoff = now.minus(90, ChronoUnit.MINUTES); - - repository.save(buildRow(tableUuid, "db1", "tbl1", 10L, 2L, now.minus(2, ChronoUnit.HOURS))); - repository.save(buildRow(tableUuid, "db1", "tbl1", 5L, 1L, now.minus(1, ChronoUnit.HOURS))); - repository.save(buildRow(tableUuid, "db1", "tbl1", 3L, 0L, now)); - - List rows = - repository.find(tableUuid, Optional.of(cutoff), PageRequest.of(0, 100)); - - // only the 2 rows within the last 90 minutes - assertThat(rows).hasSize(2); - assertThat(rows.get(0).getDelta().getNumFilesAdded()).isEqualTo(3L); - } - - @Test - void find_isolatesByTableUuid() { - String uuid1 = UUID.randomUUID().toString(); - String uuid2 = UUID.randomUUID().toString(); - Instant now = Instant.now(); - - repository.save(buildRow(uuid1, "db1", "tbl1", 10L, 0L, now)); - repository.save(buildRow(uuid2, "db2", "tbl2", 20L, 0L, now)); - - assertThat(repository.find(uuid1, Optional.empty(), PageRequest.of(0, 100))).hasSize(1); - assertThat(repository.find(uuid2, Optional.empty(), PageRequest.of(0, 100))).hasSize(1); - } - - @Test - void callerSetIdIsPreserved() { - String tableUuid = UUID.randomUUID().toString(); - String id1 = UUID.randomUUID().toString(); - String id2 = UUID.randomUUID().toString(); - Instant now = Instant.now(); - - TableStatsHistoryRow row1 = - repository.save(buildRow(id1, tableUuid, "db1", "tbl1", 1L, 0L, now)); - TableStatsHistoryRow row2 = - repository.save(buildRow(id2, tableUuid, "db1", "tbl1", 2L, 0L, now)); - - assertThat(row1.getId()).isEqualTo(id1); - assertThat(row2.getId()).isEqualTo(id2); - assertThat(repository.findById(id1)).isPresent(); - assertThat(repository.findById(id2)).isPresent(); - } - - private static TableStatsHistoryRow buildRow( - String tableUuid, - String databaseName, - String tableName, - long numFilesAdded, - long numFilesDeleted, - Instant recordedAt) { - return buildRow( - UUID.randomUUID().toString(), - tableUuid, - databaseName, - tableName, - numFilesAdded, - numFilesDeleted, - recordedAt); - } - - private static TableStatsHistoryRow buildRow( - String id, - String tableUuid, - String databaseName, - String tableName, - long numFilesAdded, - long numFilesDeleted, - Instant recordedAt) { - return TableStatsHistoryRow.builder() - .id(id) - .tableUuid(tableUuid) - .databaseName(databaseName) - .tableName(tableName) - .snapshot(SnapshotMetrics.builder().tableSizeBytes(1024L).build()) - .delta( - CommitDeltaMetrics.builder() - .numFilesAdded(numFilesAdded) - .numFilesDeleted(numFilesDeleted) - .build()) - .recordedAt(recordedAt) - .build(); - } -} diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepositoryTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepositoryTest.java deleted file mode 100644 index e73ac0cb4..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/repository/TableStatsRepositoryTest.java +++ /dev/null @@ -1,129 +0,0 @@ -package com.linkedin.openhouse.optimizer.repository; - -import static org.assertj.core.api.Assertions.assertThat; - -import com.linkedin.openhouse.optimizer.db.SnapshotMetrics; -import com.linkedin.openhouse.optimizer.db.TableStatsRow; -import java.time.Instant; -import java.util.Map; -import java.util.Optional; -import java.util.UUID; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.data.domain.PageRequest; -import org.springframework.data.domain.Pageable; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.transaction.annotation.Transactional; - -@SpringBootTest -@ActiveProfiles("test") -@Transactional -class TableStatsRepositoryTest { - - private static final Pageable PAGE = PageRequest.of(0, 10_000); - - @Autowired TableStatsRepository repository; - - @Test - void saveAndFindById() { - String tableUuid = UUID.randomUUID().toString(); - SnapshotMetrics snapshot = SnapshotMetrics.builder().tableSizeBytes(1024L).build(); - - repository.save( - TableStatsRow.builder() - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .snapshot(snapshot) - .tableProperties(Map.of("maintenance.optimizer.ofd.enabled", "true")) - .updatedAt(Instant.now()) - .build()); - - Optional found = repository.findById(tableUuid); - assertThat(found).isPresent(); - assertThat(found.get().getDatabaseName()).isEqualTo("db1"); - assertThat(found.get().getSnapshot().getTableSizeBytes()).isEqualTo(1024L); - assertThat(found.get().getTableProperties()) - .containsEntry("maintenance.optimizer.ofd.enabled", "true"); - } - - @Test - void upsert_overwritesPreviousStats() { - String tableUuid = UUID.randomUUID().toString(); - - repository.save( - TableStatsRow.builder() - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .snapshot(SnapshotMetrics.builder().tableSizeBytes(100L).build()) - .updatedAt(Instant.now()) - .build()); - - repository.save( - TableStatsRow.builder() - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .snapshot(SnapshotMetrics.builder().tableSizeBytes(200L).build()) - .updatedAt(Instant.now()) - .build()); - - assertThat(repository.findAll()).hasSize(1); - assertThat(repository.findById(tableUuid).get().getSnapshot().getTableSizeBytes()) - .isEqualTo(200L); - } - - @Test - void find_noParams_returnsAll() { - repository.save( - TableStatsRow.builder() - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl1") - .snapshot(SnapshotMetrics.builder().tableSizeBytes(100L).build()) - .updatedAt(Instant.now()) - .build()); - repository.save( - TableStatsRow.builder() - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db2") - .tableName("tbl2") - .snapshot(SnapshotMetrics.builder().tableSizeBytes(200L).build()) - .updatedAt(Instant.now()) - .build()); - - assertThat(repository.find(Optional.empty(), Optional.empty(), Optional.empty(), PAGE)) - .hasSize(2); - } - - @Test - void find_byDatabase() { - repository.save( - TableStatsRow.builder() - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl1") - .snapshot(SnapshotMetrics.builder().tableSizeBytes(100L).build()) - .updatedAt(Instant.now()) - .build()); - repository.save( - TableStatsRow.builder() - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db2") - .tableName("tbl2") - .snapshot(SnapshotMetrics.builder().tableSizeBytes(200L).build()) - .updatedAt(Instant.now()) - .build()); - - assertThat(repository.find(Optional.of("db1"), Optional.empty(), Optional.empty(), PAGE)) - .hasSize(1); - assertThat( - repository - .find(Optional.of("db1"), Optional.empty(), Optional.empty(), PAGE) - .get(0) - .getDatabaseName()) - .isEqualTo("db1"); - } -} diff --git a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java b/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java deleted file mode 100644 index 2a3c1e676..000000000 --- a/services/optimizer/src/test/java/com/linkedin/openhouse/optimizer/service/OptimizerDataServiceImplTest.java +++ /dev/null @@ -1,173 +0,0 @@ -package com.linkedin.openhouse.optimizer.service; - -import static org.assertj.core.api.Assertions.assertThat; - -import com.linkedin.openhouse.optimizer.db.TableOperationsRow; -import com.linkedin.openhouse.optimizer.db.TableStatsHistoryRow; -import com.linkedin.openhouse.optimizer.model.HistoryStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationStatusDto; -import com.linkedin.openhouse.optimizer.model.OperationTypeDto; -import com.linkedin.openhouse.optimizer.model.TableOperationsHistoryDto; -import com.linkedin.openhouse.optimizer.model.TableStatsDto; -import com.linkedin.openhouse.optimizer.repository.TableOperationsRepository; -import com.linkedin.openhouse.optimizer.repository.TableStatsHistoryRepository; -import com.linkedin.openhouse.optimizer.repository.TableStatsRepository; -import java.time.Instant; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.UUID; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.data.domain.PageRequest; -import org.springframework.test.context.ActiveProfiles; -import org.springframework.transaction.annotation.Transactional; - -@SpringBootTest -@ActiveProfiles("test") -@Transactional -class OptimizerDataServiceImplTest { - - @Autowired OptimizerDataService service; - @Autowired TableOperationsRepository operationsRepository; - @Autowired TableStatsRepository statsRepository; - @Autowired TableStatsHistoryRepository statsHistoryRepository; - - // --- updateOperation --- - - @Test - void completeOperation_writesHistoryFromOperationRow() { - String operationId = UUID.randomUUID().toString(); - String tableUuid = UUID.randomUUID().toString(); - operationsRepository.save( - TableOperationsRow.builder() - .id(operationId) - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .operationType(com.linkedin.openhouse.optimizer.db.OperationType.ORPHAN_FILES_DELETION) - .status(com.linkedin.openhouse.optimizer.db.OperationStatus.SCHEDULED) - .createdAt(Instant.now()) - .scheduledAt(Instant.now()) - .jobId("spark-job-123") - .build()); - - Optional result = - service.updateOperation(operationId, HistoryStatusDto.SUCCESS); - - assertThat(result).isPresent(); - assertThat(result.get().getStatus()).isEqualTo(HistoryStatusDto.SUCCESS); - assertThat(result.get().getTableUuid()).isEqualTo(tableUuid); - assertThat(result.get().getOperationType()).isEqualTo(OperationTypeDto.ORPHAN_FILES_DELETION); - assertThat(result.get().getDatabaseName()).isEqualTo("db1"); - assertThat(result.get().getCompletedAt()).isNotNull(); - } - - @Test - void completeOperation_notFound_returnsEmpty() { - Optional result = - service.updateOperation(UUID.randomUUID().toString(), HistoryStatusDto.FAILED); - - assertThat(result).isEmpty(); - } - - // --- upsertTableStats --- - - @Test - void upsertTableStats_createsNewRow() { - String tableUuid = UUID.randomUUID().toString(); - TableStatsDto input = - TableStatsDto.builder() - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .tableProperties(Map.of("maintenance.optimizer.ofd.enabled", "true")) - .snapshot(TableStatsDto.SnapshotMetrics.builder().tableSizeBytes(1024L).build()) - .build(); - - TableStatsDto result = service.upsertTableStats(input); - - assertThat(result.getTableUuid()).isEqualTo(tableUuid); - assertThat(result.getDatabaseName()).isEqualTo("db1"); - assertThat(result.getSnapshot().getTableSizeBytes()).isEqualTo(1024L); - assertThat(result.getTableProperties()) - .containsEntry("maintenance.optimizer.ofd.enabled", "true"); - assertThat(result.getUpdatedAt()).isNotNull(); - assertThat(statsRepository.findById(tableUuid)).isPresent(); - } - - @Test - void upsertTableStats_updatesExistingRow_andAppendsHistory() { - String tableUuid = UUID.randomUUID().toString(); - TableStatsDto first = - TableStatsDto.builder() - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .snapshot(TableStatsDto.SnapshotMetrics.builder().tableSizeBytes(100L).build()) - .delta( - TableStatsDto.CommitDelta.builder().numFilesAdded(5L).numFilesDeleted(1L).build()) - .build(); - TableStatsDto second = - TableStatsDto.builder() - .tableUuid(tableUuid) - .databaseName("db1") - .tableName("tbl1") - .snapshot(TableStatsDto.SnapshotMetrics.builder().tableSizeBytes(200L).build()) - .delta( - TableStatsDto.CommitDelta.builder().numFilesAdded(3L).numFilesDeleted(0L).build()) - .build(); - - service.upsertTableStats(first); - TableStatsDto result = service.upsertTableStats(second); - - assertThat(result.getSnapshot().getTableSizeBytes()).isEqualTo(200L); - assertThat(statsRepository.findAll()).hasSize(1); - - List history = - statsHistoryRepository.find(tableUuid, Optional.empty(), PageRequest.of(0, 100)); - assertThat(history).hasSize(2); - assertThat(history.get(0).getDelta().getNumFilesAdded()).isEqualTo(3L); - assertThat(history.get(1).getDelta().getNumFilesAdded()).isEqualTo(5L); - } - - // --- list filters touch the operations enum mapping path --- - - @Test - void listTableOperations_filtersByOperationTypeAndStatus() { - String pendingId = UUID.randomUUID().toString(); - String scheduledId = UUID.randomUUID().toString(); - operationsRepository.save( - TableOperationsRow.builder() - .id(pendingId) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl1") - .operationType(com.linkedin.openhouse.optimizer.db.OperationType.ORPHAN_FILES_DELETION) - .status(com.linkedin.openhouse.optimizer.db.OperationStatus.PENDING) - .createdAt(Instant.now()) - .build()); - operationsRepository.save( - TableOperationsRow.builder() - .id(scheduledId) - .tableUuid(UUID.randomUUID().toString()) - .databaseName("db1") - .tableName("tbl2") - .operationType(com.linkedin.openhouse.optimizer.db.OperationType.ORPHAN_FILES_DELETION) - .status(com.linkedin.openhouse.optimizer.db.OperationStatus.SCHEDULED) - .createdAt(Instant.now()) - .build()); - - assertThat( - service.listTableOperations( - Optional.of(OperationTypeDto.ORPHAN_FILES_DELETION), - Optional.of(OperationStatusDto.PENDING), - Optional.empty(), - Optional.empty(), - Optional.empty(), - 100)) - .extracting(op -> op.getId()) - .containsExactly(pendingId); - } -} diff --git a/services/optimizer/src/test/resources/application-test.properties b/services/optimizer/src/test/resources/application-test.properties deleted file mode 100644 index 97b7841dc..000000000 --- a/services/optimizer/src/test/resources/application-test.properties +++ /dev/null @@ -1,12 +0,0 @@ -spring.datasource.url=jdbc:h2:mem:optimizer_test;MODE=MySQL;DATABASE_TO_LOWER=TRUE;DB_CLOSE_DELAY=-1 -spring.datasource.driver-class-name=org.h2.Driver -spring.datasource.username=sa -spring.datasource.password= - -spring.jpa.hibernate.ddl-auto=none -spring.sql.init.mode=always -spring.jpa.defer-datasource-initialization=true -spring.jpa.properties.hibernate.dialect=org.hibernate.dialect.H2Dialect -spring.jpa.properties.hibernate.physical_naming_strategy=org.hibernate.boot.model.naming.PhysicalNamingStrategyStandardImpl - -spring.sql.init.schema-locations=classpath:db/optimizer-schema.sql diff --git a/services/tables/build.gradle b/services/tables/build.gradle index c85a57131..054464a9c 100644 --- a/services/tables/build.gradle +++ b/services/tables/build.gradle @@ -43,7 +43,6 @@ dependencies { implementation 'com.cronutils:cron-utils:9.2.0' testImplementation 'org.junit.jupiter:junit-jupiter-engine:' + junit_version testImplementation 'org.springframework.security:spring-security-test:5.7.3' - testImplementation 'org.springframework:spring-context-support:5.3.18' testImplementation(testFixtures(project(':services:common'))) testImplementation (project(':tables-test-fixtures:tables-test-fixtures_2.12')) { exclude group: 'com.linkedin.iceberg' diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogBeans.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogBeans.java deleted file mode 100644 index add58ddc8..000000000 --- a/services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogBeans.java +++ /dev/null @@ -1,31 +0,0 @@ -package com.linkedin.openhouse.tables.config; - -import com.linkedin.openhouse.internal.catalog.config.InternalCatalogSettings; -import org.springframework.boot.context.properties.EnableConfigurationProperties; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Configuration; - -@Configuration -@EnableConfigurationProperties(InternalCatalogProperties.class) -public class InternalCatalogBeans { - - @Bean - public InternalCatalogSettings internalCatalogSettings(InternalCatalogProperties properties) { - InternalCatalogSettings settings = new InternalCatalogSettings(); - InternalCatalogProperties.MetadataCache metadataCacheOverrides = properties.getMetadataCache(); - - if (metadataCacheOverrides != null) { - if (metadataCacheOverrides.getEnabled() != null) { - settings.getMetadataCache().setEnabled(metadataCacheOverrides.getEnabled()); - } - if (metadataCacheOverrides.getTtl() != null) { - settings.getMetadataCache().setTtl(metadataCacheOverrides.getTtl()); - } - if (metadataCacheOverrides.getMaxWeight() != null) { - settings.getMetadataCache().setMaxWeight(metadataCacheOverrides.getMaxWeight()); - } - } - - return settings; - } -} diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogProperties.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogProperties.java deleted file mode 100644 index 0b080edda..000000000 --- a/services/tables/src/main/java/com/linkedin/openhouse/tables/config/InternalCatalogProperties.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.linkedin.openhouse.tables.config; - -import java.time.Duration; -import lombok.Getter; -import lombok.Setter; -import org.springframework.boot.context.properties.ConfigurationProperties; -import org.springframework.util.unit.DataSize; - -@Getter -@Setter -@ConfigurationProperties(prefix = "cluster.iceberg.tables") -public class InternalCatalogProperties { - - private MetadataCache metadataCache = new MetadataCache(); - - @Getter - @Setter - public static class MetadataCache { - private Boolean enabled; - private Duration ttl; - private DataSize maxWeight; - } -} diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/OpenHouseInternalRepository.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/OpenHouseInternalRepository.java index a47826f8f..349953916 100644 --- a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/OpenHouseInternalRepository.java +++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/OpenHouseInternalRepository.java @@ -5,7 +5,6 @@ import com.linkedin.openhouse.tables.model.TableDto; import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey; import java.util.List; -import java.util.Optional; import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; import org.springframework.data.repository.PagingAndSortingRepository; @@ -18,16 +17,6 @@ @Repository public interface OpenHouseInternalRepository extends PagingAndSortingRepository { - - /** - * Returns a lightweight {@link TableDto} that acts as a catalog-level reference to the table — - * populated only with the identifiers, tableUUID, and the metadata.json location. Unlike {@link - * #findById}, this does not parse the table's metadata.json, so it succeeds even when the - * metadata is corrupted. Intended for paths (e.g. drop) that need only the reference, not the - * full table state. - */ - Optional findTableRefById(TableDtoPrimaryKey tableDtoPrimaryKey); - List findAllIds(); Page findAllIds(Pageable pageable); diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImpl.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImpl.java index eb1e6486e..9a4b236af 100644 --- a/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImpl.java +++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImpl.java @@ -2,7 +2,6 @@ import static com.linkedin.openhouse.internal.catalog.CatalogConstants.*; import static com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.*; -import static com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.getCanonicalFieldName; import static com.linkedin.openhouse.tables.repository.impl.InternalRepositoryUtils.*; import com.google.common.annotations.VisibleForTesting; @@ -157,8 +156,8 @@ public TableDto save(TableDto tableDto) { Map tableProps = computePropsForTableCreation(tableDto); tablePolicyManager.managePoliciesOnCreateIfNeeded(tableDto); SortOrder sortOrder = getIcebergSortOrder(tableDto, writeSchema); - String metadataLocation = tableProps.get(getCanonicalFieldName("tableLocation")); - String tableLocation = metadataLocation.substring(0, metadataLocation.lastIndexOf("/")); + String tableLocation = + tableDto.getTableVersion().substring(0, tableDto.getTableVersion().lastIndexOf("/")); table = replaceTable( tableIdentifier, writeSchema, partitionSpec, tableLocation, tableProps, sortOrder); @@ -678,25 +677,6 @@ public Optional findById(TableDtoPrimaryKey tableDtoPrimaryKey) { table, fileIOManager, partitionSpecMapper, policiesMapper, tableTypeMapper)); } - @Override - public Optional findTableRefById(TableDtoPrimaryKey tableDtoPrimaryKey) { - if (!(catalog instanceof OpenHouseInternalCatalog)) { - throw new UnsupportedOperationException( - "findTableRefById is not supported for catalog type: " + catalog.getClass().getName()); - } - return ((OpenHouseInternalCatalog) catalog) - .findHouseTable( - TableIdentifier.of(tableDtoPrimaryKey.getDatabaseId(), tableDtoPrimaryKey.getTableId())) - .map( - houseTable -> - TableDto.builder() - .databaseId(houseTable.getDatabaseId()) - .tableId(houseTable.getTableId()) - .tableUUID(houseTable.getTableUUID()) - .tableLocation(houseTable.getTableLocation()) - .build()); - } - // FIXME: Likely need a cache layer to avoid expensive tableScan. @Timed(metricKey = MetricsConstant.REPO_TABLE_EXISTS_TIME) @Override diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/TablesServiceImpl.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/TablesServiceImpl.java index 2c6bf611b..7d9c39559 100644 --- a/services/tables/src/main/java/com/linkedin/openhouse/tables/services/TablesServiceImpl.java +++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/services/TablesServiceImpl.java @@ -206,14 +206,12 @@ public void deleteTable(String databaseId, String tableId, String actingPrincipa TableDtoPrimaryKey tableDtoPrimaryKey = TableDtoPrimaryKey.builder().databaseId(databaseId).tableId(tableId).build(); - // Table-ref lookup (no metadata.json parse) is enough here — drop only needs identifiers + - // tableUUID for the ACL check. Lets us drop tables whose metadata.json is corrupted. - TableDto tableDto = - openHouseInternalRepository - .findTableRefById(tableDtoPrimaryKey) - .orElseThrow(() -> new NoSuchUserTableException(databaseId, tableId)); - - authorizationUtils.checkTableDropPrivilege(tableDto, actingPrincipal, Privileges.DELETE_TABLE); + Optional tableDto = openHouseInternalRepository.findById(tableDtoPrimaryKey); + if (!tableDto.isPresent()) { + throw new NoSuchUserTableException(databaseId, tableId); + } + authorizationUtils.checkTableDropPrivilege( + tableDto.get(), actingPrincipal, Privileges.DELETE_TABLE); openHouseInternalRepository.deleteById(tableDtoPrimaryKey); } diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/config/InternalCatalogBeansTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/config/InternalCatalogBeansTest.java deleted file mode 100644 index 7fa711b04..000000000 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/config/InternalCatalogBeansTest.java +++ /dev/null @@ -1,105 +0,0 @@ -package com.linkedin.openhouse.tables.config; - -import com.linkedin.openhouse.internal.catalog.cache.CacheConfiguration; -import com.linkedin.openhouse.internal.catalog.config.InternalCatalogSettings; -import java.time.Duration; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.springframework.boot.test.context.assertj.AssertableApplicationContext; -import org.springframework.boot.test.context.runner.ApplicationContextRunner; -import org.springframework.cache.CacheManager; -import org.springframework.cache.caffeine.CaffeineCacheManager; -import org.springframework.cache.support.NoOpCacheManager; -import org.springframework.util.unit.DataSize; - -class InternalCatalogBeansTest { - - private final ApplicationContextRunner contextRunner = - new ApplicationContextRunner().withUserConfiguration(InternalCatalogBeans.class); - - private final ApplicationContextRunner crossModuleContextRunner = - new ApplicationContextRunner() - .withUserConfiguration(InternalCatalogBeans.class, CacheConfiguration.class); - - @Test - public void testDefaultInternalCatalogSettings() { - contextRunner.run( - context -> { - assertMetadataCacheOverrides(context, null, null, null); - assertMetadataCacheSettings( - context, false, Duration.ofMinutes(10), DataSize.ofMegabytes(512)); - Assertions.assertFalse(context.containsBean("internalCatalogCacheManager")); - }); - } - - @Test - public void testOverriddenInternalCatalogSettings() { - contextRunner - .withPropertyValues( - "cluster.iceberg.tables.metadata-cache.enabled=true", - "cluster.iceberg.tables.metadata-cache.ttl=7m", - "cluster.iceberg.tables.metadata-cache.max-weight=42MB") - .run( - context -> { - assertMetadataCacheOverrides( - context, true, Duration.ofMinutes(7), DataSize.ofMegabytes(42)); - assertMetadataCacheSettings( - context, true, Duration.ofMinutes(7), DataSize.ofMegabytes(42)); - Assertions.assertFalse(context.containsBean("internalCatalogCacheManager")); - }); - } - - @Test - public void testDisabledMetadataCachePropagatesAsNoOp() { - crossModuleContextRunner.run( - context -> { - assertMetadataCacheSettings( - context, false, Duration.ofMinutes(10), DataSize.ofMegabytes(512)); - Assertions.assertTrue(context.getBean(CacheManager.class) instanceof NoOpCacheManager); - }); - } - - @Test - public void testEnabledMetadataCachePropagatesToCacheConfiguration() { - crossModuleContextRunner - .withPropertyValues( - "cluster.iceberg.tables.metadata-cache.enabled=true", - "cluster.iceberg.tables.metadata-cache.ttl=7m", - "cluster.iceberg.tables.metadata-cache.max-weight=42MB") - .run( - context -> { - assertMetadataCacheOverrides( - context, true, Duration.ofMinutes(7), DataSize.ofMegabytes(42)); - assertMetadataCacheSettings( - context, true, Duration.ofMinutes(7), DataSize.ofMegabytes(42)); - Assertions.assertTrue( - context.getBean(CacheManager.class) instanceof CaffeineCacheManager); - }); - } - - private void assertMetadataCacheOverrides( - AssertableApplicationContext context, - Boolean expectedEnabled, - Duration expectedTtl, - DataSize expectedMaxWeight) { - Assertions.assertNull(context.getStartupFailure()); - - InternalCatalogProperties properties = context.getBean(InternalCatalogProperties.class); - Assertions.assertEquals(expectedEnabled, properties.getMetadataCache().getEnabled()); - Assertions.assertEquals(expectedTtl, properties.getMetadataCache().getTtl()); - Assertions.assertEquals(expectedMaxWeight, properties.getMetadataCache().getMaxWeight()); - } - - private void assertMetadataCacheSettings( - AssertableApplicationContext context, - boolean expectedEnabled, - Duration expectedTtl, - DataSize expectedMaxWeight) { - Assertions.assertNull(context.getStartupFailure()); - - InternalCatalogSettings settings = context.getBean(InternalCatalogSettings.class); - Assertions.assertEquals(expectedEnabled, settings.getMetadataCache().isEnabled()); - Assertions.assertEquals(expectedTtl, settings.getMetadataCache().getTtl()); - Assertions.assertEquals(expectedMaxWeight, settings.getMetadataCache().getMaxWeight()); - } -} diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java index dcc5b4b04..1ce1cae1d 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java @@ -8,7 +8,6 @@ import com.linkedin.openhouse.cluster.storage.StorageManager; import com.linkedin.openhouse.common.test.cluster.PropertyOverrideContextInitializer; import com.linkedin.openhouse.internal.catalog.OpenHouseInternalTableOperations; -import com.linkedin.openhouse.internal.catalog.cache.TableMetadataCache; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -27,12 +26,9 @@ import java.util.HashSet; import java.util.Map; import java.util.Optional; -import java.util.concurrent.ConcurrentHashMap; -import java.util.function.Supplier; import javax.annotation.PostConstruct; import org.apache.iceberg.BaseTable; import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableProperties; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; @@ -92,22 +88,6 @@ private HouseTableRepository provideFailedHtsRepoWhenSave(String tableLocation) return htsRepo; } - private TableMetadataCache newTableMetadataCache() { - Map cache = new ConcurrentHashMap<>(); - return new TableMetadataCache() { - @Override - public TableMetadata load(String metadataLocation, Supplier metadataLoader) { - return cache.computeIfAbsent(metadataLocation, ignored -> metadataLoader.get()); - } - - @Override - public TableMetadata seed(String metadataLocation, TableMetadata tableMetadata) { - cache.put(metadataLocation, tableMetadata); - return tableMetadata; - } - }; - } - @Test void testNoRetryInternalRepo() throws Exception { TableIdentifier tableIdentifier = @@ -124,8 +104,7 @@ void testNoRetryInternalRepo() throws Exception { houseTableMapper, tableIdentifier, metricsReporter, - fileIOManager, - newTableMetadataCache()); + fileIOManager); ((SettableCatalogForTest) catalog).setOperation(actualOps); TableDto creationDTO = TABLE_DTO.toBuilder().tableVersion(INITIAL_TABLE_VERSION).build(); creationDTO = openHouseInternalRepository.save(creationDTO); @@ -141,13 +120,7 @@ void testNoRetryInternalRepo() throws Exception { new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); OpenHouseInternalTableOperations mockOps = new OpenHouseInternalTableOperations( - htsRepo, - fileIO, - houseTableMapper, - tableIdentifier, - metricsReporter2, - fileIOManager, - newTableMetadataCache()); + htsRepo, fileIO, houseTableMapper, tableIdentifier, metricsReporter2, fileIOManager); OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps); BaseTable spyOptsMockedTable = Mockito.spy(new BaseTable(spyOperations, realTable.name())); @@ -219,8 +192,7 @@ void testSaveClearsTransientCommitPropertiesDuringTransaction() throws Exception houseTableMapper, tableIdentifier, metricsReporter, - fileIOManager, - newTableMetadataCache()); + fileIOManager); ((SettableCatalogForTest) catalog).setOperation(actualOps); TableDto creationDTO = TABLE_DTO.toBuilder().tableVersion(INITIAL_TABLE_VERSION).build(); @@ -279,13 +251,7 @@ void testFailedHtsRepoWhenGet() { new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); OpenHouseInternalTableOperations mockOps = new OpenHouseInternalTableOperations( - htsRepo, - fileIO, - houseTableMapper, - tableIdentifier, - metricsReporter, - fileIOManager, - newTableMetadataCache()); + htsRepo, fileIO, houseTableMapper, tableIdentifier, metricsReporter, fileIOManager); OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps); BaseTable spyOptsMockedTable = Mockito.spy( diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/TablesServiceTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/TablesServiceTest.java index 3a5998ebb..eec75c7d5 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/TablesServiceTest.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/TablesServiceTest.java @@ -29,8 +29,6 @@ import com.linkedin.openhouse.tables.services.TablesService; import com.linkedin.openhouse.tables.utils.AuthorizationUtils; import java.io.IOException; -import java.net.URI; -import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.UUID; @@ -268,43 +266,6 @@ public void testTableDeleteAlreadyDeleted() { TABLE_DTO.getDatabaseId(), TABLE_DTO.getTableId(), TEST_USER)); } - /** - * Regression test for the corrupted-metadata drop path: even when metadata.json cannot be parsed - * (loadTable would throw), deleteTable must still succeed because it goes through the HTS-only - * findTableRefById lookup and avoids loadTable entirely. - */ - @Test - public void testTableDeleteSucceedsWhenMetadataJsonIsCorrupted() throws IOException { - TableDto created = verifyPutTableRequest(TABLE_DTO, null, true); - - // tableLocation on TableDto is the metadata.json path (file://.metadata.json). - Path metadataPath = Paths.get(URI.create(created.getTableLocation())); - Assertions.assertTrue( - Files.exists(metadataPath), - "metadata.json should exist on disk after create: " + metadataPath); - - // Corrupt the file so TableMetadataParser.read fails. - Files.write(metadataPath, "{\"not\":\"valid iceberg metadata\"}".getBytes()); - - // Sanity check: reading the table now fails because loadTable parses metadata.json. - Assertions.assertThrows( - Exception.class, - () -> tablesService.getTable(TABLE_DTO.getDatabaseId(), TABLE_DTO.getTableId(), TEST_USER)); - - // Drop should still succeed despite the corruption. - Assertions.assertDoesNotThrow( - () -> - tablesService.deleteTable( - TABLE_DTO.getDatabaseId(), TABLE_DTO.getTableId(), TEST_USER)); - - // Verify HTS row is gone — a second delete should now hit the not-found path. - Assertions.assertThrows( - NoSuchUserTableException.class, - () -> - tablesService.deleteTable( - TABLE_DTO.getDatabaseId(), TABLE_DTO.getTableId(), TEST_USER)); - } - @Test public void testTimePartitioning() { Schema schema = diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImplTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImplTest.java index e8e4ad585..29b16ac5b 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImplTest.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/repository/impl/OpenHouseInternalRepositoryImplTest.java @@ -6,22 +6,16 @@ import static org.mockito.Mockito.when; import com.linkedin.openhouse.cluster.configs.ClusterProperties; -import com.linkedin.openhouse.internal.catalog.OpenHouseInternalCatalog; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils; -import com.linkedin.openhouse.internal.catalog.model.HouseTable; import com.linkedin.openhouse.tables.common.TableType; import com.linkedin.openhouse.tables.dto.mapper.iceberg.PoliciesSpecMapper; import com.linkedin.openhouse.tables.model.TableDto; -import com.linkedin.openhouse.tables.model.TableDtoPrimaryKey; import com.linkedin.openhouse.tables.repository.PreservedKeyChecker; import io.micrometer.core.instrument.Counter; import io.micrometer.core.instrument.MeterRegistry; import java.util.HashMap; import java.util.Map; -import java.util.Optional; import org.apache.iceberg.TableProperties; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.TableIdentifier; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -40,7 +34,6 @@ public class OpenHouseInternalRepositoryImplTest { @Mock private MeterRegistry meterRegistry; @Mock private ClusterProperties clusterProperties; @Mock private PreservedKeyChecker preservedKeyChecker; - @Mock private OpenHouseInternalCatalog catalog; @InjectMocks private OpenHouseInternalRepositoryImpl openHouseInternalRepository; @@ -110,56 +103,6 @@ void testComputePropsForTableCreation_tableLocation() { actualProps.get(HouseTableSerdeUtils.getCanonicalFieldName("tableLocation"))); } - @Test - void findTableRefByIdReturnsPartialTableDto() { - HouseTable row = - HouseTable.builder() - .databaseId(DB_ID) - .tableId(TABLE_ID) - .tableUUID("uuid-1") - .tableLocation("/base/db/table-uuid-1/00001-x.metadata.json") - .build(); - when(catalog.findHouseTable(TableIdentifier.of(DB_ID, TABLE_ID))).thenReturn(Optional.of(row)); - - Optional result = - openHouseInternalRepository.findTableRefById( - TableDtoPrimaryKey.builder().databaseId(DB_ID).tableId(TABLE_ID).build()); - - Assertions.assertTrue(result.isPresent()); - TableDto dto = result.get(); - Assertions.assertEquals(DB_ID, dto.getDatabaseId()); - Assertions.assertEquals(TABLE_ID, dto.getTableId()); - Assertions.assertEquals("uuid-1", dto.getTableUUID()); - Assertions.assertEquals("/base/db/table-uuid-1/00001-x.metadata.json", dto.getTableLocation()); - // Fields not populated by the table-ref lookup should be null/default. - Assertions.assertNull(dto.getSchema()); - Assertions.assertNull(dto.getTableCreator()); - } - - @Test - void findTableRefByIdReturnsEmptyWhenHouseTableMissing() { - when(catalog.findHouseTable(any(TableIdentifier.class))).thenReturn(Optional.empty()); - - Optional result = - openHouseInternalRepository.findTableRefById( - TableDtoPrimaryKey.builder().databaseId(DB_ID).tableId(TABLE_ID).build()); - - Assertions.assertFalse(result.isPresent()); - } - - @Test - void findTableRefByIdThrowsWhenCatalogIsNotOpenHouseInternalCatalog() { - // Build a fresh impl with a non-OpenHouseInternal Catalog wired in. - OpenHouseInternalRepositoryImpl impl = new OpenHouseInternalRepositoryImpl(); - impl.catalog = mock(Catalog.class); - - Assertions.assertThrows( - UnsupportedOperationException.class, - () -> - impl.findTableRefById( - TableDtoPrimaryKey.builder().databaseId(DB_ID).tableId(TABLE_ID).build())); - } - private TableDto createTableDto(Map properties) { return TableDto.builder() .databaseId(DB_ID) diff --git a/settings.gradle b/settings.gradle index 810ecd643..035e54349 100644 --- a/settings.gradle +++ b/settings.gradle @@ -49,9 +49,6 @@ include ':libs:datalayout' include ':services:common' include ':services:housetables' include ':services:jobs' -include ':services:optimizer' -include ':services:optimizer:analyzer' -include ':apps:optimizer:analyzerapp' include ':services:tables' include ':tables-test-fixtures:tables-test-fixtures-iceberg-1.2' include ':tables-test-fixtures:tables-test-fixtures-iceberg-1.5' diff --git a/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/OpenHouseSparkITest.java b/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/OpenHouseSparkITest.java index 91050dda0..056ae03f3 100644 --- a/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/OpenHouseSparkITest.java +++ b/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/OpenHouseSparkITest.java @@ -131,4 +131,11 @@ protected Catalog getOpenHouseCatalog(SparkSession spark) { catalogProperties, spark.sparkContext().hadoopConfiguration()); } + + /** + * Getting rid of "file:" part if needed for ease of comparison of tableLocation / tableVersion + */ + protected String stripPathScheme(String path) { + return path.startsWith("file:") ? path.split("file:")[1] : path; + } }