From abbbaee99662f3c937b3720175042cd145f22bd6 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 26 Jan 2026 02:08:56 -0800 Subject: [PATCH 01/45] first commit --- common/config/src/main/resources/storage.conf | 13 +++- .../amber/config/EnvironmentalVariable.scala | 1 + .../texera/amber/config/StorageConfig.scala | 3 +- common/workflow-core/build.sbt | 5 ++ .../core/storage/IcebergCatalogInstance.scala | 2 +- .../result/iceberg/IcebergTableWriter.scala | 13 +++- .../texera/amber/util/IcebergUtil.scala | 59 +++++++++++++------ .../ComputingUnitManagingResource.scala | 2 + 8 files changed, 73 insertions(+), 25 deletions(-) diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 85a62b77a3b..5a4af15dffd 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -23,12 +23,19 @@ storage { # Configuration for Apache Iceberg, used for storing the workflow results & stats iceberg { catalog { - type = postgres # either hadoop, rest, or postgres + type = rest # either hadoop, rest, or postgres type = ${?STORAGE_ICEBERG_CATALOG_TYPE} rest-uri = "" rest-uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} # the uri of the rest catalog, not needed unless using REST catalog + rest { + uri = "http://localhost:8181/catalog/" + uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} # the uri of the rest catalog, not needed unless using REST catalog + warehouse-name = "texeraExecutionsRR" + warehouse-name = ${?STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME} # warehouse name for Lakekeeper, e.g., "taxera-execution" + } + postgres { # do not include scheme in the uri as Python and Java use different schemes uri-without-scheme = "localhost:5432/texera_iceberg_catalog" @@ -131,10 +138,10 @@ storage { url-for-test-cases = "jdbc:postgresql://localhost:5432/texera_db_for_test_cases?currentSchema=texera_db,public" url-for-test-cases = ${?STORAGE_JDBC_URL_FOR_TEST_CASES} - username = "postgres" + username = "wangmeng" username = ${?STORAGE_JDBC_USERNAME} - password = "postgres" + password = "" password = ${?STORAGE_JDBC_PASSWORD} } } diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala b/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala index 099e12260d2..efb1e80a1cf 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala +++ b/common/config/src/main/scala/org/apache/texera/amber/config/EnvironmentalVariable.scala @@ -57,6 +57,7 @@ object EnvironmentalVariable { // Iceberg Catalog val ENV_ICEBERG_CATALOG_TYPE = "STORAGE_ICEBERG_CATALOG_TYPE" val ENV_ICEBERG_CATALOG_REST_URI = "STORAGE_ICEBERG_CATALOG_REST_URI" + val ENV_ICEBERG_CATALOG_REST_WAREHOUSE_NAME = "STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME" // Iceberg Postgres Catalog val ENV_ICEBERG_CATALOG_POSTGRES_URI_WITHOUT_SCHEME = diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala index c5bd3302862..d136142fab6 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala +++ b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala @@ -39,7 +39,8 @@ object StorageConfig { // Iceberg specifics val icebergCatalogType: String = conf.getString("storage.iceberg.catalog.type") - val icebergRESTCatalogUri: String = conf.getString("storage.iceberg.catalog.rest-uri") + val icebergRESTCatalogUri: String = conf.getString("storage.iceberg.catalog.rest.uri") + val icebergRESTCatalogWarehouseName: String = conf.getString("storage.iceberg.catalog.rest.warehouse-name") // Iceberg Postgres specifics val icebergPostgresCatalogUriWithoutScheme: String = diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index db916685138..7920c5a24d1 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -167,6 +167,10 @@ libraryDependencies ++= Seq( excludeJackson, excludeJacksonModule ), + "org.apache.iceberg" % "iceberg-aws" % "1.7.1" excludeAll( + excludeJackson, + excludeJacksonModule + ), "org.apache.hadoop" % "hadoop-common" % "3.3.1" excludeAll( excludeXmlBind, excludeGlassfishJersey, @@ -210,4 +214,5 @@ libraryDependencies ++= Seq( ), "software.amazon.awssdk" % "auth" % "2.29.51", "software.amazon.awssdk" % "regions" % "2.29.51", + "software.amazon.awssdk" % "sts" % "2.29.51", ) \ No newline at end of file diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/IcebergCatalogInstance.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/IcebergCatalogInstance.scala index e3512874c9b..bb9f2d8bf2d 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/IcebergCatalogInstance.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/IcebergCatalogInstance.scala @@ -52,7 +52,7 @@ object IcebergCatalogInstance { case "rest" => IcebergUtil.createRestCatalog( "texera_iceberg", - StorageConfig.fileStorageDirectoryPath + StorageConfig.icebergRESTCatalogWarehouseName ) case "postgres" => IcebergUtil.createPostgresCatalog( diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala index 549cb4b9d17..dd2e40bc30d 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/core/storage/result/iceberg/IcebergTableWriter.scala @@ -107,10 +107,19 @@ private[storage] class IcebergTableWriter[T]( private def flushBuffer(): Unit = { if (buffer.nonEmpty) { // Create a unique file path using the writer's identifier and the filename index - val filepath = Paths.get(table.location()).resolve(s"${writerIdentifier}_${filenameIdx}") + // Handle S3 URIs (s3://) differently from local file paths to preserve URI format + val location = table.location() + val filepathString = if (location.startsWith("s3://")) { + // For S3 URIs, append path component directly as string to preserve s3:// format + val basePath = if (location.endsWith("/")) location else s"$location/" + s"$basePath${writerIdentifier}_${filenameIdx}" + } else { + // For local file paths, use Paths.get() for proper path resolution + Paths.get(location).resolve(s"${writerIdentifier}_${filenameIdx}").toString + } // Increment the filename index by 1 filenameIdx += 1 - val outputFile: OutputFile = table.io().newOutputFile(filepath.toString) + val outputFile: OutputFile = table.io().newOutputFile(filepathString) // Create a Parquet data writer to write a new file val dataWriter: DataWriter[Record] = Parquet .writeData(outputFile) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala index ad6ac07c1ff..fe2db42214e 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala @@ -22,9 +22,10 @@ package org.apache.texera.amber.util import org.apache.texera.amber.config.StorageConfig import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, LargeBinary, Schema, Tuple} import org.apache.hadoop.conf.Configuration -import org.apache.iceberg.catalog.{Catalog, TableIdentifier} +import org.apache.iceberg.catalog.{Catalog, SupportsNamespaces, TableIdentifier} import org.apache.iceberg.data.parquet.GenericParquetReaders import org.apache.iceberg.data.{GenericRecord, Record} +import org.apache.iceberg.aws.s3.S3FileIO import org.apache.iceberg.hadoop.{HadoopCatalog, HadoopFileIO} import org.apache.iceberg.io.{CloseableIterable, InputFile} import org.apache.iceberg.jdbc.JdbcCatalog @@ -32,14 +33,9 @@ import org.apache.iceberg.parquet.{Parquet, ParquetValueReader} import org.apache.iceberg.rest.RESTCatalog import org.apache.iceberg.types.Type.PrimitiveType import org.apache.iceberg.types.Types -import org.apache.iceberg.{ - CatalogProperties, - DataFile, - PartitionSpec, - Table, - TableProperties, - Schema => IcebergSchema -} +import org.apache.iceberg.{CatalogProperties, DataFile, PartitionSpec, Table, TableProperties, Schema => IcebergSchema} +import org.apache.iceberg.catalog.Namespace +import org.apache.iceberg.exceptions.AlreadyExistsException import java.nio.ByteBuffer import java.nio.file.Path @@ -96,22 +92,35 @@ object IcebergUtil { * TODO: Add authentication support, such as OAuth2, using `OAuth2Properties`. * * @param catalogName the name of the catalog. - * @param warehouse the root path for the warehouse where the tables are stored. + * @param warehouse the warehouse identifier (path for standard REST catalog, name for Lakekeeper). * @return the initialized RESTCatalog instance. */ def createRestCatalog( catalogName: String, - warehouse: Path + warehouse: String ): RESTCatalog = { val catalog = new RESTCatalog() - catalog.initialize( - catalogName, - Map( - "warehouse" -> warehouse.toString, - CatalogProperties.URI -> StorageConfig.icebergRESTCatalogUri, - CatalogProperties.FILE_IO_IMPL -> classOf[HadoopFileIO].getName - ).asJava + + // Build base properties map + var properties = Map( + "warehouse" -> warehouse, + CatalogProperties.URI -> StorageConfig.icebergRESTCatalogUri ) + + properties = properties ++ Map( + CatalogProperties.FILE_IO_IMPL -> classOf[S3FileIO].getName, + // S3FileIO configuration for MinIO + "s3.endpoint" -> StorageConfig.s3Endpoint, + "s3.access-key-id" -> StorageConfig.s3Username, + "s3.secret-access-key" -> StorageConfig.s3Password, + "s3.region" -> StorageConfig.s3Region, + "s3.path-style-access" -> "true", + ) + + println(s"[IcebergUtil] effective s3.endpoint = ${properties.get("s3.endpoint")}, io.s3.endpoint = ${properties.get("io.s3.endpoint")}") + println(s"[IcebergUtil] StorageConfig.s3Endpoint = ${StorageConfig.s3Endpoint}") + + catalog.initialize(catalogName, properties.asJava) catalog } @@ -165,6 +174,20 @@ object IcebergUtil { TableProperties.COMMIT_MIN_RETRY_WAIT_MS -> StorageConfig.icebergTableCommitMinRetryWaitMs.toString ) +// val namespace = Namespace.of(tableNamespace) +// +// catalog match { +// case nsCatalog: SupportsNamespaces => +// try nsCatalog.createNamespace(namespace, Map.empty[String, String].asJava) +// catch { +// case _: AlreadyExistsException => () +// } +// case _ => +// throw new IllegalArgumentException( +// s"Catalog ${catalog.getClass.getName} does not support namespaces" +// ) +// } + val identifier = TableIdentifier.of(tableNamespace, tableName) if (catalog.tableExists(identifier) && overrideIfExists) { catalog.dropTable(identifier) diff --git a/computing-unit-managing-service/src/main/scala/org/apache/texera/service/resource/ComputingUnitManagingResource.scala b/computing-unit-managing-service/src/main/scala/org/apache/texera/service/resource/ComputingUnitManagingResource.scala index 1249d067835..9b214b9755c 100644 --- a/computing-unit-managing-service/src/main/scala/org/apache/texera/service/resource/ComputingUnitManagingResource.scala +++ b/computing-unit-managing-service/src/main/scala/org/apache/texera/service/resource/ComputingUnitManagingResource.scala @@ -69,6 +69,8 @@ object ComputingUnitManagingResource { private lazy val computingUnitEnvironmentVariables: Map[String, Any] = Map( // Variables for saving results to Iceberg EnvironmentalVariable.ENV_ICEBERG_CATALOG_TYPE -> StorageConfig.icebergCatalogType, + EnvironmentalVariable.ENV_ICEBERG_CATALOG_REST_URI -> StorageConfig.icebergRESTCatalogUri, + EnvironmentalVariable.ENV_ICEBERG_CATALOG_REST_WAREHOUSE_NAME -> StorageConfig.icebergRESTCatalogWarehouseName, EnvironmentalVariable.ENV_ICEBERG_CATALOG_POSTGRES_URI_WITHOUT_SCHEME -> StorageConfig.icebergPostgresCatalogUriWithoutScheme, EnvironmentalVariable.ENV_ICEBERG_CATALOG_POSTGRES_USERNAME -> StorageConfig.icebergPostgresCatalogUsername, EnvironmentalVariable.ENV_ICEBERG_CATALOG_POSTGRES_PASSWORD -> StorageConfig.icebergPostgresCatalogPassword, From 330dbd0b39b58fc3e7e769ef0c5391c8a79127b5 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 26 Jan 2026 02:56:35 -0800 Subject: [PATCH 02/45] for python, need test --- .../iceberg/iceberg_catalog_instance.py | 33 +++++++++++----- .../core/storage/iceberg/iceberg_utils.py | 38 +++++++++++++++++++ .../storage/iceberg/test_iceberg_document.py | 7 ++++ .../python/core/storage/storage_config.py | 21 ++++++++++ .../main/python/texera_run_python_worker.py | 14 +++++++ .../pythonworker/PythonWorkflowWorker.scala | 7 ++++ 6 files changed, 111 insertions(+), 9 deletions(-) diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py index b1478fadf03..e394f773566 100644 --- a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py +++ b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py @@ -18,14 +18,14 @@ from pyiceberg.catalog import Catalog from typing import Optional -from core.storage.iceberg.iceberg_utils import create_postgres_catalog +from core.storage.iceberg.iceberg_utils import create_postgres_catalog, create_rest_catalog from core.storage.storage_config import StorageConfig class IcebergCatalogInstance: """ IcebergCatalogInstance is a singleton that manages the Iceberg catalog instance. - Currently only postgres SQL catalog is supported. + Supports postgres SQL catalog and REST catalog. - Provides a single shared catalog for all Iceberg table-related operations. - Lazily initializes the catalog on first access. - Supports replacing the catalog instance for testing or reconfiguration. @@ -39,16 +39,31 @@ def get_instance(cls): Retrieves the singleton Iceberg catalog instance. - If the catalog is not initialized, it is lazily created using the configured properties. + - Supports "postgres" and "rest" catalog types. :return: the Iceberg catalog instance. """ if cls._instance is None: - cls._instance = create_postgres_catalog( - "texera_iceberg", - StorageConfig.ICEBERG_FILE_STORAGE_DIRECTORY_PATH, - StorageConfig.ICEBERG_POSTGRES_CATALOG_URI_WITHOUT_SCHEME, - StorageConfig.ICEBERG_POSTGRES_CATALOG_USERNAME, - StorageConfig.ICEBERG_POSTGRES_CATALOG_PASSWORD, - ) + catalog_type = StorageConfig.ICEBERG_CATALOG_TYPE + if catalog_type == "postgres": + cls._instance = create_postgres_catalog( + "texera_iceberg", + StorageConfig.ICEBERG_FILE_STORAGE_DIRECTORY_PATH, + StorageConfig.ICEBERG_POSTGRES_CATALOG_URI_WITHOUT_SCHEME, + StorageConfig.ICEBERG_POSTGRES_CATALOG_USERNAME, + StorageConfig.ICEBERG_POSTGRES_CATALOG_PASSWORD, + ) + elif catalog_type == "rest": + cls._instance = create_rest_catalog( + "texera_iceberg", + StorageConfig.ICEBERG_REST_CATALOG_WAREHOUSE_NAME, + StorageConfig.ICEBERG_REST_CATALOG_URI, + StorageConfig.S3_ENDPOINT, + StorageConfig.S3_REGION, + StorageConfig.S3_USERNAME, + StorageConfig.S3_PASSWORD, + ) + else: + raise ValueError(f"Unsupported catalog type: {catalog_type}") return cls._instance @classmethod diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py index 9e17b2e0e82..1e096c42869 100644 --- a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py +++ b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py @@ -18,6 +18,7 @@ import pyarrow as pa import pyiceberg.table from pyiceberg.catalog import Catalog +from pyiceberg.catalog.rest import RESTCatalog from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.expressions import AlwaysTrue from pyiceberg.io.pyarrow import ArrowScan @@ -153,6 +154,43 @@ def create_postgres_catalog( ) +def create_rest_catalog( + catalog_name: str, + warehouse_name: str, + rest_uri: str, + s3_endpoint: str, + s3_region: str, + s3_username: str, + s3_password: str, +) -> RESTCatalog: + """ + Creates a REST catalog instance by connecting to a REST endpoint. + - Configures the catalog to interact with a REST endpoint. + - The warehouse_name parameter specifies the warehouse identifier (name for Lakekeeper). + - Configures S3FileIO for MinIO/S3 storage backend. + :param catalog_name: the name of the catalog. + :param warehouse_name: the warehouse identifier (name for Lakekeeper). + :param rest_uri: the URI of the REST catalog endpoint. + :param s3_endpoint: the S3 endpoint URL. + :param s3_region: the S3 region. + :param s3_username: the S3 access key ID. + :param s3_password: the S3 secret access key. + :return: a RESTCatalog instance. + """ + return RESTCatalog( + name=catalog_name, + properties={ + "uri": rest_uri, + "warehouse": warehouse_name, + "s3.endpoint": s3_endpoint, + "s3.access-key-id": s3_username, + "s3.secret-access-key": s3_password, + "s3.region": s3_region, + "s3.path-style-access": "true", + }, + ) + + def create_table( catalog: Catalog, table_namespace: str, diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index 34711beb652..8886514493c 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -37,9 +37,16 @@ # Hardcoded storage config only for test purposes. StorageConfig.initialize( + catalog_type="postgres", postgres_uri_without_scheme="localhost:5432/texera_iceberg_catalog", postgres_username="texera", postgres_password="password", + rest_catalog_uri="", + rest_catalog_warehouse_name="", + s3_endpoint="", + s3_region="", + s3_username="", + s3_password="", table_result_namespace="operator-port-result", directory_path="../../../../../../amber/user-resources/workflow-results", commit_batch_size=4096, diff --git a/amber/src/main/python/core/storage/storage_config.py b/amber/src/main/python/core/storage/storage_config.py index c55495ea14c..a85d76146b6 100644 --- a/amber/src/main/python/core/storage/storage_config.py +++ b/amber/src/main/python/core/storage/storage_config.py @@ -25,9 +25,16 @@ class StorageConfig: _initialized = False + ICEBERG_CATALOG_TYPE = None ICEBERG_POSTGRES_CATALOG_URI_WITHOUT_SCHEME = None ICEBERG_POSTGRES_CATALOG_USERNAME = None ICEBERG_POSTGRES_CATALOG_PASSWORD = None + ICEBERG_REST_CATALOG_URI = None + ICEBERG_REST_CATALOG_WAREHOUSE_NAME = None + S3_ENDPOINT = None + S3_REGION = None + S3_USERNAME = None + S3_PASSWORD = None ICEBERG_TABLE_RESULT_NAMESPACE = None ICEBERG_FILE_STORAGE_DIRECTORY_PATH = None ICEBERG_TABLE_COMMIT_BATCH_SIZE = None @@ -41,9 +48,16 @@ class StorageConfig: @classmethod def initialize( cls, + catalog_type, postgres_uri_without_scheme, postgres_username, postgres_password, + rest_catalog_uri, + rest_catalog_warehouse_name, + s3_endpoint, + s3_region, + s3_username, + s3_password, table_result_namespace, directory_path, commit_batch_size, @@ -57,9 +71,16 @@ def initialize( "Storage config has already been initialized and cannot be modified." ) + cls.ICEBERG_CATALOG_TYPE = catalog_type cls.ICEBERG_POSTGRES_CATALOG_URI_WITHOUT_SCHEME = postgres_uri_without_scheme cls.ICEBERG_POSTGRES_CATALOG_USERNAME = postgres_username cls.ICEBERG_POSTGRES_CATALOG_PASSWORD = postgres_password + cls.ICEBERG_REST_CATALOG_URI = rest_catalog_uri + cls.ICEBERG_REST_CATALOG_WAREHOUSE_NAME = rest_catalog_warehouse_name + cls.S3_ENDPOINT = s3_endpoint + cls.S3_REGION = s3_region + cls.S3_USERNAME = s3_username + cls.S3_PASSWORD = s3_password cls.ICEBERG_TABLE_RESULT_NAMESPACE = table_result_namespace cls.ICEBERG_FILE_STORAGE_DIRECTORY_PATH = directory_path cls.ICEBERG_TABLE_COMMIT_BATCH_SIZE = int(commit_batch_size) diff --git a/amber/src/main/python/texera_run_python_worker.py b/amber/src/main/python/texera_run_python_worker.py index 3ebf81c201f..5377395eaa0 100644 --- a/amber/src/main/python/texera_run_python_worker.py +++ b/amber/src/main/python/texera_run_python_worker.py @@ -45,9 +45,16 @@ def init_loguru_logger(stream_log_level) -> None: output_port, logger_level, r_path, + iceberg_catalog_type, iceberg_postgres_catalog_uri_without_scheme, iceberg_postgres_catalog_username, iceberg_postgres_catalog_password, + iceberg_rest_catalog_uri, + iceberg_rest_catalog_warehouse_name, + s3_endpoint, + s3_region, + s3_username, + s3_password, iceberg_table_namespace, iceberg_file_storage_directory_path, iceberg_table_commit_batch_size, @@ -58,9 +65,16 @@ def init_loguru_logger(stream_log_level) -> None: ) = sys.argv init_loguru_logger(logger_level) StorageConfig.initialize( + iceberg_catalog_type, iceberg_postgres_catalog_uri_without_scheme, iceberg_postgres_catalog_username, iceberg_postgres_catalog_password, + iceberg_rest_catalog_uri, + iceberg_rest_catalog_warehouse_name, + s3_endpoint, + s3_region, + s3_username, + s3_password, iceberg_table_namespace, iceberg_file_storage_directory_path, iceberg_table_commit_batch_size, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala index 558b99c9b7b..8276415ca80 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala @@ -178,9 +178,16 @@ class PythonWorkflowWorker( Integer.toString(pythonProxyServer.getPortNumber.get()), UdfConfig.pythonLogStreamHandlerLevel, RENVPath, + StorageConfig.icebergCatalogType, StorageConfig.icebergPostgresCatalogUriWithoutScheme, StorageConfig.icebergPostgresCatalogUsername, StorageConfig.icebergPostgresCatalogPassword, + StorageConfig.icebergRESTCatalogUri, + StorageConfig.icebergRESTCatalogWarehouseName, + StorageConfig.s3Endpoint, + StorageConfig.s3Region, + StorageConfig.s3Username, + StorageConfig.s3Password, StorageConfig.icebergTableResultNamespace, StorageConfig.fileStorageDirectoryPath.toString, StorageConfig.icebergTableCommitBatchSize.toString, From 603b373db758de1b642a0347dc4f88f15e8f0008 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 28 Jan 2026 03:34:04 -0800 Subject: [PATCH 03/45] for python, 2 --- .../python/core/storage/iceberg/iceberg_utils.py | 14 +++++++------- .../core/storage/iceberg/test_iceberg_document.py | 2 -- .../src/main/python/core/storage/storage_config.py | 7 +------ amber/src/main/python/texera_run_python_worker.py | 4 ---- .../pythonworker/PythonWorkflowWorker.scala | 2 -- 5 files changed, 8 insertions(+), 21 deletions(-) diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py index 1e096c42869..c1f9df2e403 100644 --- a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py +++ b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py @@ -17,8 +17,7 @@ import pyarrow as pa import pyiceberg.table -from pyiceberg.catalog import Catalog -from pyiceberg.catalog.rest import RESTCatalog +from pyiceberg.catalog import Catalog, load_catalog from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.expressions import AlwaysTrue from pyiceberg.io.pyarrow import ArrowScan @@ -162,7 +161,7 @@ def create_rest_catalog( s3_region: str, s3_username: str, s3_password: str, -) -> RESTCatalog: +) -> Catalog: """ Creates a REST catalog instance by connecting to a REST endpoint. - Configures the catalog to interact with a REST endpoint. @@ -175,11 +174,12 @@ def create_rest_catalog( :param s3_region: the S3 region. :param s3_username: the S3 access key ID. :param s3_password: the S3 secret access key. - :return: a RESTCatalog instance. + :return: a Catalog instance (REST catalog). """ - return RESTCatalog( - name=catalog_name, - properties={ + return load_catalog( + catalog_name, + **{ + "type": "rest", "uri": rest_uri, "warehouse": warehouse_name, "s3.endpoint": s3_endpoint, diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index 8886514493c..54f0bb86d17 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -43,8 +43,6 @@ postgres_password="password", rest_catalog_uri="", rest_catalog_warehouse_name="", - s3_endpoint="", - s3_region="", s3_username="", s3_password="", table_result_namespace="operator-port-result", diff --git a/amber/src/main/python/core/storage/storage_config.py b/amber/src/main/python/core/storage/storage_config.py index a85d76146b6..aeb182f751a 100644 --- a/amber/src/main/python/core/storage/storage_config.py +++ b/amber/src/main/python/core/storage/storage_config.py @@ -31,8 +31,6 @@ class StorageConfig: ICEBERG_POSTGRES_CATALOG_PASSWORD = None ICEBERG_REST_CATALOG_URI = None ICEBERG_REST_CATALOG_WAREHOUSE_NAME = None - S3_ENDPOINT = None - S3_REGION = None S3_USERNAME = None S3_PASSWORD = None ICEBERG_TABLE_RESULT_NAMESPACE = None @@ -54,8 +52,6 @@ def initialize( postgres_password, rest_catalog_uri, rest_catalog_warehouse_name, - s3_endpoint, - s3_region, s3_username, s3_password, table_result_namespace, @@ -77,8 +73,7 @@ def initialize( cls.ICEBERG_POSTGRES_CATALOG_PASSWORD = postgres_password cls.ICEBERG_REST_CATALOG_URI = rest_catalog_uri cls.ICEBERG_REST_CATALOG_WAREHOUSE_NAME = rest_catalog_warehouse_name - cls.S3_ENDPOINT = s3_endpoint - cls.S3_REGION = s3_region + cls.S3_USERNAME = s3_username cls.S3_PASSWORD = s3_password cls.ICEBERG_TABLE_RESULT_NAMESPACE = table_result_namespace diff --git a/amber/src/main/python/texera_run_python_worker.py b/amber/src/main/python/texera_run_python_worker.py index 5377395eaa0..4d3a6cf1a5e 100644 --- a/amber/src/main/python/texera_run_python_worker.py +++ b/amber/src/main/python/texera_run_python_worker.py @@ -51,8 +51,6 @@ def init_loguru_logger(stream_log_level) -> None: iceberg_postgres_catalog_password, iceberg_rest_catalog_uri, iceberg_rest_catalog_warehouse_name, - s3_endpoint, - s3_region, s3_username, s3_password, iceberg_table_namespace, @@ -71,8 +69,6 @@ def init_loguru_logger(stream_log_level) -> None: iceberg_postgres_catalog_password, iceberg_rest_catalog_uri, iceberg_rest_catalog_warehouse_name, - s3_endpoint, - s3_region, s3_username, s3_password, iceberg_table_namespace, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala index 8276415ca80..5badac45e20 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala @@ -184,8 +184,6 @@ class PythonWorkflowWorker( StorageConfig.icebergPostgresCatalogPassword, StorageConfig.icebergRESTCatalogUri, StorageConfig.icebergRESTCatalogWarehouseName, - StorageConfig.s3Endpoint, - StorageConfig.s3Region, StorageConfig.s3Username, StorageConfig.s3Password, StorageConfig.icebergTableResultNamespace, From 8efbb73d9926f4897d25d0e3d8756582e70c1695 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 28 Jan 2026 03:37:27 -0800 Subject: [PATCH 04/45] update storage.conf --- common/config/src/main/resources/storage.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 5a4af15dffd..667c47e5d03 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -31,7 +31,7 @@ storage { rest { uri = "http://localhost:8181/catalog/" - uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} # the uri of the rest catalog, not needed unless using REST catalog + uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} warehouse-name = "texeraExecutionsRR" warehouse-name = ${?STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME} # warehouse name for Lakekeeper, e.g., "taxera-execution" } @@ -138,10 +138,10 @@ storage { url-for-test-cases = "jdbc:postgresql://localhost:5432/texera_db_for_test_cases?currentSchema=texera_db,public" url-for-test-cases = ${?STORAGE_JDBC_URL_FOR_TEST_CASES} - username = "wangmeng" + username = "postgres" username = ${?STORAGE_JDBC_USERNAME} - password = "" + password = "postgres" password = ${?STORAGE_JDBC_PASSWORD} } } From 720f5e14d1377aeb8c2e66993036fe2111d6d27f Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:59:53 -0800 Subject: [PATCH 05/45] script --- bin/bootstrap-lakekeeper-warehouse.sh | 445 ++++++++++++++++++++++++++ bin/parse-storage-config.py | 83 +++++ 2 files changed, 528 insertions(+) create mode 100755 bin/bootstrap-lakekeeper-warehouse.sh create mode 100755 bin/parse-storage-config.py diff --git a/bin/bootstrap-lakekeeper-warehouse.sh b/bin/bootstrap-lakekeeper-warehouse.sh new file mode 100755 index 00000000000..ef952cbc2e3 --- /dev/null +++ b/bin/bootstrap-lakekeeper-warehouse.sh @@ -0,0 +1,445 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Bootstrap script to start Lakekeeper and create warehouse (idempotent). +# This script does three things: +# 1. Starts Lakekeeper if it's not already running +# 2. Checks if MinIO bucket exists (and creates it if needed) +# 3. Checks and creates the warehouse if it doesn't exist +# +# +# Usage: +# ./bin/bootstrap-lakekeeper-warehouse.sh + +set -e + +# Read configuration from storage.conf or environment variables +# Priority: environment variable > storage.conf > default value + +# Find storage.conf path +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -n "$TEXERA_HOME" ]; then + STORAGE_CONF_PATH="$TEXERA_HOME/common/config/src/main/resources/storage.conf" +else + STORAGE_CONF_PATH="$SCRIPT_DIR/../common/config/src/main/resources/storage.conf" +fi + +# Extract values from storage.conf using pyhocon for proper HOCON parsing +# pyhocon handles environment variable substitution correctly +if [ -f "$STORAGE_CONF_PATH" ]; then + # Check if pyhocon is available + if ! command -v python3 >/dev/null 2>&1; then + echo "✗ Error: python3 is required to parse storage.conf" + echo " Please install Python 3" + exit 1 + fi + + if ! python3 -c "import pyhocon" 2>/dev/null; then + echo "✗ Error: pyhocon is required to parse storage.conf" + echo " Install it with: pip install pyhocon" + exit 1 + fi + + # Use pyhocon for proper HOCON parsing (handles environment variable substitution) + REST_URI_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.uri" 2>/dev/null | sed 's|/catalog/*$||' || echo "") + WAREHOUSE_NAME_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.warehouse-name" 2>/dev/null || echo "") + S3_BUCKET_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.s3-bucket" 2>/dev/null || echo "") + S3_ENDPOINT_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.endpoint" 2>/dev/null || echo "") + S3_USERNAME_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.auth.username" 2>/dev/null || echo "") + S3_PASSWORD_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.auth.password" 2>/dev/null || echo "") + + echo "Configuration read from storage.conf:" + echo " REST_URI_FROM_CONF=$REST_URI_FROM_CONF" + echo " WAREHOUSE_NAME_FROM_CONF=$WAREHOUSE_NAME_FROM_CONF" + echo " S3_BUCKET_FROM_CONF=$S3_BUCKET_FROM_CONF" + echo " S3_ENDPOINT_FROM_CONF=$S3_ENDPOINT_FROM_CONF" + echo " S3_USERNAME_FROM_CONF=$S3_USERNAME_FROM_CONF" + echo " S3_PASSWORD_FROM_CONF=$S3_PASSWORD_FROM_CONF" + echo "" +else + REST_URI_FROM_CONF="" + WAREHOUSE_NAME_FROM_CONF="" + S3_BUCKET_FROM_CONF="" + S3_ENDPOINT_FROM_CONF="" + S3_USERNAME_FROM_CONF="" + S3_PASSWORD_FROM_CONF="" + echo "storage.conf not found, using environment variables or defaults" + echo "" +fi + +# Use values from storage.conf with defaults +LAKEKEEPER_BASE_URI="${REST_URI_FROM_CONF:-http://localhost:8181}" +WAREHOUSE_NAME="${WAREHOUSE_NAME_FROM_CONF:-texera-executions}" +S3_BUCKET="${S3_BUCKET_FROM_CONF:-texera-iceberg}" +S3_ENDPOINT="${S3_ENDPOINT_FROM_CONF:-http://localhost:9000}" +S3_USERNAME="${S3_USERNAME_FROM_CONF:-texera_minio}" +S3_PASSWORD="${S3_PASSWORD_FROM_CONF:-password}" +STORAGE_PATH="s3://${S3_BUCKET}/iceberg/${WAREHOUSE_NAME}" + +echo "==========================================" +echo "Lakekeeper Bootstrap and Warehouse Setup" +echo "==========================================" +echo "Lakekeeper Base URI: $LAKEKEEPER_BASE_URI" +echo "Lakekeeper Binary: ${LAKEKEEPER_BINARY_PATH:-lakekeeper}" +echo "Warehouse Name: $WAREHOUSE_NAME" +echo "S3 Endpoint: $S3_ENDPOINT" +echo "S3 Bucket: $S3_BUCKET" +echo "Storage Path: $STORAGE_PATH" +echo "" + +# Function to check if Lakekeeper is running +check_lakekeeper_running() { + local health_url="${LAKEKEEPER_BASE_URI}/health" + if curl -s -f "$health_url" > /dev/null 2>&1; then + return 0 # Running + else + return 1 # Not running + fi +} + +## Function to check if MinIO bucket exists +check_minio_bucket() { + local bucket_name="$1" + local endpoint="$2" + local username="$3" + local password="$4" + + # Use AWS CLI if available (preferred method) + if command -v aws >/dev/null 2>&1; then + # Check if bucket exists using AWS CLI (set env vars inline to avoid polluting global env) + if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ + aws --endpoint-url="$endpoint" s3 ls "s3://${bucket_name}/" >/dev/null 2>&1; then + return 0 # Bucket exists + else + return 1 # Bucket doesn't exist or error + fi + else + # Fallback: Use curl to check bucket via MinIO API + # MinIO ListObjects API: GET /bucket-name?list-type=2 + local check_url="${endpoint}/${bucket_name}?list-type=2" + local http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + -u "${username}:${password}" \ + "$check_url" 2>/dev/null || echo "000") + + if [ "$http_code" = "200" ]; then + return 0 # Bucket exists + else + return 1 # Bucket doesn't exist or error + fi + fi +} + +# Function to create MinIO bucket +create_minio_bucket() { + local bucket_name="$1" + local endpoint="$2" + local username="$3" + local password="$4" + + # Use AWS CLI if available (preferred method) + if command -v aws >/dev/null 2>&1; then + # Create bucket using AWS CLI (set env vars inline to avoid polluting global env) + if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ + aws --endpoint-url="$endpoint" s3 mb "s3://${bucket_name}" >/dev/null 2>&1; then + return 0 # Success + else + return 1 # Failed + fi + else + # Fallback: Use curl to create bucket via MinIO API + # MinIO MakeBucket API: PUT /bucket-name + local create_url="${endpoint}/${bucket_name}" + local http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + -X PUT \ + -u "${username}:${password}" \ + "$create_url" 2>/dev/null || echo "000") + + if [ "$http_code" = "200" ]; then + return 0 # Success + else + return 1 # Failed + fi + fi +} + +# Function to start Lakekeeper +start_lakekeeper() { + export LAKEKEEPER__METRICS_PORT=9091 +# export LAKEKEEPER__PG_DATABASE_URL_READ= +# export LAKEKEEPER__PG_DATABASE_URL_WRITE= +# export LAKEKEEPER__PG_ENCRYPTION_KEY= +# local binary_path="" + + echo "Starting Lakekeeper..." + + # Check if LAKEKEEPER_BINARY_PATH is set + if [ -z "${LAKEKEEPER_BINARY_PATH:-}" ]; then + echo "⚠ Warning: LAKEKEEPER_BINARY_PATH environment variable is not set." + echo " Skipping Lakekeeper startup. Assuming it's already running or will be started separately." + return 1 + fi + + # Check if the binary file exists and is executable + if [ ! -x "$LAKEKEEPER_BINARY_PATH" ]; then + echo "⚠ Warning: Lakekeeper binary not found or not executable at '$LAKEKEEPER_BINARY_PATH'" + echo " Please ensure LAKEKEEPER_BINARY_PATH points to a valid executable file." + echo " Skipping Lakekeeper startup. Assuming it's already running or will be started separately." + return 1 + fi + + local binary_path="$LAKEKEEPER_BINARY_PATH" + + # Check required environment variables + if [ -z "$LAKEKEEPER__PG_DATABASE_URL_READ" ] || [ -z "$LAKEKEEPER__PG_DATABASE_URL_WRITE" ] || [ -z "$LAKEKEEPER__PG_ENCRYPTION_KEY" ]; then + echo "⚠ Warning: Required Lakekeeper database environment variables not set:" + echo " - LAKEKEEPER__PG_DATABASE_URL_READ" + echo " - LAKEKEEPER__PG_DATABASE_URL_WRITE" + echo " - LAKEKEEPER__PG_ENCRYPTION_KEY" + echo " Skipping Lakekeeper startup. Assuming it's already running or will be started separately." + return 1 + fi + + # Run migration first + echo "Running Lakekeeper migration..." + if ! "$binary_path" migrate; then + echo "✗ Failed to run Lakekeeper migration" + return 1 + fi + + # Start Lakekeeper in background + echo "Starting Lakekeeper server..." + nohup "$binary_path" serve > /tmp/lakekeeper.log 2>&1 & + local lakekeeper_pid=$! + echo "Lakekeeper started with PID: $lakekeeper_pid" + + # Wait for Lakekeeper to be ready + echo "Waiting for Lakekeeper to be ready..." + local max_attempts=30 + local attempt=1 + while [ $attempt -le $max_attempts ]; do + if check_lakekeeper_running; then + echo "✓ Lakekeeper is ready!" + return 0 + fi + if [ $attempt -eq $max_attempts ]; then + echo "✗ Lakekeeper did not become ready after $max_attempts attempts" + echo " Check logs at /tmp/lakekeeper.log" + return 1 + fi + echo " Waiting for Lakekeeper... ($attempt/$max_attempts)" + sleep 2 + attempt=$((attempt + 1)) + done +} + +# Function to check if warehouse exists +# Returns: 0=exists, 1=not found, 2=connection error +check_warehouse_exists() { + local warehouse_name="$1" + local base_uri="$2" + + # Get list of all warehouses and check if the name exists + # API: GET /management/v1/warehouse returns list of warehouses + local list_url="${base_uri}/management/v1/warehouse" + + echo "Checking if warehouse '$warehouse_name' exists..." + echo " URL: $list_url" + + # Get warehouse list + local temp_response=$(mktemp) + local http_code=$(curl -s -o "$temp_response" -w "%{http_code}" "$list_url" 2>/dev/null || echo "000") + + if [ "$http_code" = "000" ]; then + rm -f "$temp_response" + echo "✗ Failed to connect to Lakekeeper at $list_url" + echo " Please ensure Lakekeeper is running and accessible." + return 2 # Connection error + fi + + if [ "$http_code" != "200" ]; then + rm -f "$temp_response" + echo "⚠ Warning: Unexpected HTTP status $http_code when listing warehouses" + return 1 # Treat as not found, will attempt to create + fi + + # Check if warehouse name exists in the list using jq or grep + # The response format: {"warehouses":[{"name":"...",...},...]} + if command -v jq >/dev/null 2>&1; then + # Use jq if available (more reliable) + if jq -e ".warehouses[] | select(.name == \"$warehouse_name\")" "$temp_response" >/dev/null 2>&1; then + rm -f "$temp_response" + return 0 # Exists + else + rm -f "$temp_response" + return 1 # Not found + fi + else + # Fallback: use grep to check if name exists in JSON + if grep -q "\"name\"[[:space:]]*:[[:space:]]*\"$warehouse_name\"" "$temp_response" 2>/dev/null; then + rm -f "$temp_response" + return 0 # Exists + else + rm -f "$temp_response" + return 1 # Not found + fi + fi +} + +# Function to create warehouse +# Returns: 0=success, 1=failure +create_warehouse() { + local warehouse_name="$1" + local base_uri="$2" + local storage_path="$3" + local temp_response="$4" + + # NOTE: According to Lakekeeper 0.7.x Management API docs: + # https://docs.lakekeeper.io/docs/0.7.x/api/management/#tag/warehouse + # POST /management/v1/warehouse (singular) to create a warehouse + # Request body uses "storage-profile" with "bucket" and "key-prefix" fields + local create_url="${base_uri}/management/v1/warehouse" + + # Parse storage_path: s3://bucket/path -> bucket and key-prefix + # Example: s3://texera-iceberg/iceberg/texera-executions + # -> bucket: texera-iceberg + # -> key-prefix: iceberg/texera-executions + local bucket="${S3_BUCKET}" + local region="" + + # Request body format according to Lakekeeper API + local create_payload=$(cat < 1: + key_path = sys.argv[1] + config = parse_storage_config() + value = get_value(config, key_path) + if value is None: + print(f"Key '{key_path}' not found", file=sys.stderr) + sys.exit(1) + print(value) + else: + # Print all storage config + config = parse_storage_config() + print(config.get("storage", {})) + + +if __name__ == "__main__": + main() From 2050396473162c3c883af72948d61910f480eb4d Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Feb 2026 17:00:22 -0800 Subject: [PATCH 06/45] script --- .../scala/org/apache/texera/amber/config/StorageConfig.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala index d136142fab6..58f1e88c881 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala +++ b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala @@ -41,6 +41,7 @@ object StorageConfig { val icebergCatalogType: String = conf.getString("storage.iceberg.catalog.type") val icebergRESTCatalogUri: String = conf.getString("storage.iceberg.catalog.rest.uri") val icebergRESTCatalogWarehouseName: String = conf.getString("storage.iceberg.catalog.rest.warehouse-name") + val icebergRESTCatalogS3Bucket: String = conf.getString("storage.iceberg.catalog.rest.s3-bucket") // Iceberg Postgres specifics val icebergPostgresCatalogUriWithoutScheme: String = From 227395c601725a35904572438fe831339facb026 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Feb 2026 22:07:36 -0800 Subject: [PATCH 07/45] 1 --- bin/bootstrap-lakekeeper-warehouse.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/bootstrap-lakekeeper-warehouse.sh b/bin/bootstrap-lakekeeper-warehouse.sh index ef952cbc2e3..f7f398bad46 100755 --- a/bin/bootstrap-lakekeeper-warehouse.sh +++ b/bin/bootstrap-lakekeeper-warehouse.sh @@ -443,3 +443,4 @@ if create_warehouse "$WAREHOUSE_NAME" "$LAKEKEEPER_BASE_URI" "$STORAGE_PATH" "$T else exit 1 fi + From ec9d4beb5f2d7a48c9f5b465d1013f991a1ebbd6 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 3 Feb 2026 13:32:34 -0800 Subject: [PATCH 08/45] update script, will refine later --- bin/bootstrap-lakekeeper-warehouse.sh | 79 ++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/bin/bootstrap-lakekeeper-warehouse.sh b/bin/bootstrap-lakekeeper-warehouse.sh index f7f398bad46..ad167bc14b6 100755 --- a/bin/bootstrap-lakekeeper-warehouse.sh +++ b/bin/bootstrap-lakekeeper-warehouse.sh @@ -58,14 +58,17 @@ if [ -f "$STORAGE_CONF_PATH" ]; then # Use pyhocon for proper HOCON parsing (handles environment variable substitution) REST_URI_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.uri" 2>/dev/null | sed 's|/catalog/*$||' || echo "") WAREHOUSE_NAME_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.warehouse-name" 2>/dev/null || echo "") + REST_REGION_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.region" 2>/dev/null || echo "") S3_BUCKET_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.s3-bucket" 2>/dev/null || echo "") S3_ENDPOINT_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.endpoint" 2>/dev/null || echo "") S3_USERNAME_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.auth.username" 2>/dev/null || echo "") S3_PASSWORD_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.auth.password" 2>/dev/null || echo "") - + + echo "Configuration read from storage.conf:" echo " REST_URI_FROM_CONF=$REST_URI_FROM_CONF" echo " WAREHOUSE_NAME_FROM_CONF=$WAREHOUSE_NAME_FROM_CONF" + echo " REST_REGION_FROM_CONF=$REST_REGION_FROM_CONF" echo " S3_BUCKET_FROM_CONF=$S3_BUCKET_FROM_CONF" echo " S3_ENDPOINT_FROM_CONF=$S3_ENDPOINT_FROM_CONF" echo " S3_USERNAME_FROM_CONF=$S3_USERNAME_FROM_CONF" @@ -74,6 +77,7 @@ if [ -f "$STORAGE_CONF_PATH" ]; then else REST_URI_FROM_CONF="" WAREHOUSE_NAME_FROM_CONF="" + REST_REGION_FROM_CONF="" S3_BUCKET_FROM_CONF="" S3_ENDPOINT_FROM_CONF="" S3_USERNAME_FROM_CONF="" @@ -85,6 +89,7 @@ fi # Use values from storage.conf with defaults LAKEKEEPER_BASE_URI="${REST_URI_FROM_CONF:-http://localhost:8181}" WAREHOUSE_NAME="${WAREHOUSE_NAME_FROM_CONF:-texera-executions}" +S3_REGION="${REST_REGION_FROM_CONF:-us-west-2}" S3_BUCKET="${S3_BUCKET_FROM_CONF:-texera-iceberg}" S3_ENDPOINT="${S3_ENDPOINT_FROM_CONF:-http://localhost:9000}" S3_USERNAME="${S3_USERNAME_FROM_CONF:-texera_minio}" @@ -261,48 +266,68 @@ check_warehouse_exists() { echo " URL: $list_url" # Get warehouse list - local temp_response=$(mktemp) - local http_code=$(curl -s -o "$temp_response" -w "%{http_code}" "$list_url" 2>/dev/null || echo "000") + local temp_response + temp_response=$(mktemp) || { + echo "✗ Failed to create temporary file" + return 2 + } + + local http_code + http_code=$(curl -s -o "$temp_response" -w "%{http_code}" "$list_url" 2>/dev/null || echo "000") + echo " HTTP status: $http_code" if [ "$http_code" = "000" ]; then - rm -f "$temp_response" + rm -f "$temp_response" || true echo "✗ Failed to connect to Lakekeeper at $list_url" echo " Please ensure Lakekeeper is running and accessible." return 2 # Connection error fi if [ "$http_code" != "200" ]; then - rm -f "$temp_response" echo "⚠ Warning: Unexpected HTTP status $http_code when listing warehouses" + echo " Response body:" + cat "$temp_response" 2>/dev/null | sed 's/^/ /' || true + rm -f "$temp_response" || true return 1 # Treat as not found, will attempt to create fi + echo " Checking response for warehouse name..." # Check if warehouse name exists in the list using jq or grep # The response format: {"warehouses":[{"name":"...",...},...]} if command -v jq >/dev/null 2>&1; then + echo " Using jq to parse response..." # Use jq if available (more reliable) if jq -e ".warehouses[] | select(.name == \"$warehouse_name\")" "$temp_response" >/dev/null 2>&1; then - rm -f "$temp_response" + echo " Warehouse found in list" + rm -f "$temp_response" 2>/dev/null || true return 0 # Exists else - rm -f "$temp_response" + echo " Warehouse not found in list" + rm -f "$temp_response" 2>/dev/null || true + echo " About to return 1 from check_warehouse_exists (jq path)" return 1 # Not found fi else + echo " Using grep to parse response (jq not available)..." # Fallback: use grep to check if name exists in JSON if grep -q "\"name\"[[:space:]]*:[[:space:]]*\"$warehouse_name\"" "$temp_response" 2>/dev/null; then - rm -f "$temp_response" + echo " Warehouse found in list" + rm -f "$temp_response" || true return 0 # Exists else - rm -f "$temp_response" + echo " Warehouse not found in list" + rm -f "$temp_response" 2>/dev/null || true + echo " About to return 1 from check_warehouse_exists (grep path)" return 1 # Not found fi fi + echo " Function check_warehouse_exists completed" } # Function to create warehouse # Returns: 0=success, 1=failure create_warehouse() { + echo "1123" local warehouse_name="$1" local base_uri="$2" local storage_path="$3" @@ -319,7 +344,8 @@ create_warehouse() { # -> bucket: texera-iceberg # -> key-prefix: iceberg/texera-executions local bucket="${S3_BUCKET}" - local region="" + local region="${S3_REGION}" + local endpoint="${S3_ENDPOINT}" # Request body format according to Lakekeeper API local create_payload=$(cat < Date: Sun, 15 Feb 2026 04:49:26 -0800 Subject: [PATCH 09/45] single node --- bin/single-node/.env | 12 ++- bin/single-node/docker-compose.yml | 143 +++++++++++++++++++++++++++++ sql/texera_lakekeeper.sql | 20 ++++ 3 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 sql/texera_lakekeeper.sql diff --git a/bin/single-node/.env b/bin/single-node/.env index 0b9c4916af9..cf72dc58fa4 100644 --- a/bin/single-node/.env +++ b/bin/single-node/.env @@ -51,4 +51,14 @@ FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT=http://texera-file-service:9092/api/data FILE_SERVICE_UPLOAD_ONE_FILE_TO_DATASET_ENDPOINT=http://texera-file-service:9092/api/dataset/did/upload STORAGE_ICEBERG_CATALOG_POSTGRES_URI_WITHOUT_SCHEME=texera-postgres:5432/texera_iceberg_catalog STORAGE_ICEBERG_CATALOG_POSTGRES_USERNAME=texera -STORAGE_ICEBERG_CATALOG_POSTGRES_PASSWORD=password \ No newline at end of file +STORAGE_ICEBERG_CATALOG_POSTGRES_PASSWORD=password + +LAKEKEEPER__PG_DATABASE_URL_READ=postgres://texera:password@postgres:5432/texera_lakekeeper +LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://texera:password@postgres:5432/texera_lakekeeper +LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key +LAKEKEEPER_BASE_URI=http://lakekeeper:8181 +WAREHOUSE_NAME=texera-executions +S3_BUCKET=texera-iceberg +S3_USERNAME=texera_minio +S3_PASSWORD=password +S3_REGION=us-west-2 diff --git a/bin/single-node/docker-compose.yml b/bin/single-node/docker-compose.yml index dd7425c345c..4ecba2e2615 100644 --- a/bin/single-node/docker-compose.yml +++ b/bin/single-node/docker-compose.yml @@ -75,6 +75,145 @@ services: timeout: 5s retries: 10 + # Lakekeeper migration init container + # This runs once to migrate the database before the lakekeeper server starts + lakekeeper-migrate: + image: vakamo/lakekeeper:v0.11.0 + container_name: texera-lakekeeper-migrate + depends_on: + postgres: + condition: service_healthy + env_file: + - .env + restart: "no" + entrypoint: ["/home/nonroot/lakekeeper"] + command: ["migrate"] + + # Lakekeeper is the Iceberg REST catalog service for workflow result storage + # NOTE: You need to provide a Lakekeeper docker image or build one from source. + # The image should contain the Lakekeeper binary and run it with 'serve' command. + # Example: If using a custom image, set LAKEKEEPER_IMAGE environment variable in .env + lakekeeper: + image: vakamo/lakekeeper:v0.11.0 + container_name: texera-lakekeeper + restart: always + depends_on: + postgres: + condition: service_healthy + minio: + condition: service_started + lakekeeper-migrate: + condition: service_completed_successfully + env_file: + - .env + entrypoint: ["/home/nonroot/lakekeeper"] + command: ["serve"] + ports: + - "8181:8181" + healthcheck: + test: ["CMD", "/home/nonroot/lakekeeper", "healthcheck"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 10s + + lakekeeper-init: + image: alpine:3.19 + container_name: texera-lakekeeper-init + depends_on: + lakekeeper: + condition: service_healthy + minio: + condition: service_started + env_file: + - .env + restart: "no" + entrypoint: [ "/bin/sh", "-c" ] + command: + - | + set -e + + echo "Installing dependencies..." + apk add --no-cache curl ca-certificates + + echo "Installing MinIO Client..." + wget -q https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc + chmod +x /usr/local/bin/mc + + check_status() { + if [ "$$1" -ge 200 ] && [ "$$1" -lt 300 ]; then + echo "Created $$2 successfully (HTTP $$1)." + elif [ "$$1" -eq 409 ]; then + echo "$$2 already exists (HTTP 409). Treating as success." + else + echo "Failed to create $$2. HTTP Code: $$1" + echo "ERROR RESPONSE:" + if [ -f /tmp/response.txt ]; then cat /tmp/response.txt; fi + echo "" + exit 1 + fi + } + + echo "Step 1: Initializing MinIO bucket '$$S3_BUCKET'..." + mc alias set minio "$$STORAGE_S3_ENDPOINT" "$$S3_USERNAME" "$$S3_PASSWORD" || true + if mc ls minio/$$S3_BUCKET > /dev/null 2>&1; then + echo "MinIO bucket '$$S3_BUCKET' already exists." + else + mc mb minio/$$S3_BUCKET || { + echo "Failed to create MinIO bucket '$$S3_BUCKET'" + exit 1 + } + echo "MinIO bucket '$$S3_BUCKET' created successfully." + fi + + + echo "Step 2: Initializing Default Project..." + PROJECT_PAYLOAD='{"project-id": "00000000-0000-0000-0000-000000000000", "project-name": "default"}' + + PROJECT_CODE=$$(curl -s -o /tmp/response.txt -w "%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -d "$$PROJECT_PAYLOAD" \ + "$$LAKEKEEPER_BASE_URI/management/v1/project" || echo "000") + + check_status "$$PROJECT_CODE" "Default Project" + + + echo "Step 3: Initializing Warehouse '$$WAREHOUSE_NAME'..." + CREATE_PAYLOAD=$$(cat < Date: Sun, 15 Feb 2026 04:52:47 -0800 Subject: [PATCH 10/45] maybe merge to s3 (region, and s3-bucket) --- common/config/src/main/resources/storage.conf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 667c47e5d03..3742ca465ce 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -34,6 +34,10 @@ storage { uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} warehouse-name = "texeraExecutionsRR" warehouse-name = ${?STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME} # warehouse name for Lakekeeper, e.g., "taxera-execution" + region = "us-west-2" + region = ${?STORAGE_ICEBERG_CATALOG_REST_REGION} + s3-bucket = "texera-iceberg" + s3-bucket = ${?STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET} # S3 bucket name for Iceberg storage } postgres { @@ -138,10 +142,10 @@ storage { url-for-test-cases = "jdbc:postgresql://localhost:5432/texera_db_for_test_cases?currentSchema=texera_db,public" url-for-test-cases = ${?STORAGE_JDBC_URL_FOR_TEST_CASES} - username = "postgres" + username = "wangmeng" username = ${?STORAGE_JDBC_USERNAME} - password = "postgres" + password = "" password = ${?STORAGE_JDBC_PASSWORD} } } From 955e8b4d1b4822743f14fc9a591a8ae2c359a1ff Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Sun, 15 Feb 2026 05:42:19 -0800 Subject: [PATCH 11/45] single node fix --- bin/single-node/.env | 13 ++++++++----- bin/single-node/docker-compose.yml | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/bin/single-node/.env b/bin/single-node/.env index cf72dc58fa4..f19cb6ec37d 100644 --- a/bin/single-node/.env +++ b/bin/single-node/.env @@ -57,8 +57,11 @@ LAKEKEEPER__PG_DATABASE_URL_READ=postgres://texera:password@postgres:5432/texera LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://texera:password@postgres:5432/texera_lakekeeper LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key LAKEKEEPER_BASE_URI=http://lakekeeper:8181 -WAREHOUSE_NAME=texera-executions -S3_BUCKET=texera-iceberg -S3_USERNAME=texera_minio -S3_PASSWORD=password -S3_REGION=us-west-2 +STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME=texera-executions +STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET=texera-iceberg +STORAGE_S3_AUTH_USERNAME=texera_minio +STORAGE_S3_AUTH_PASSWORD=password +STORAGE_ICEBERG_CATALOG_REST_REGION=us-west-2 + +STORAGE_ICEBERG_CATALOG_REST_URI=http://texera-lakekeeper:8181/catalog +STORAGE_ICEBERG_CATALOG_TYPE=rest \ No newline at end of file diff --git a/bin/single-node/docker-compose.yml b/bin/single-node/docker-compose.yml index 4ecba2e2615..efc77fd671a 100644 --- a/bin/single-node/docker-compose.yml +++ b/bin/single-node/docker-compose.yml @@ -154,16 +154,16 @@ services: fi } - echo "Step 1: Initializing MinIO bucket '$$S3_BUCKET'..." - mc alias set minio "$$STORAGE_S3_ENDPOINT" "$$S3_USERNAME" "$$S3_PASSWORD" || true - if mc ls minio/$$S3_BUCKET > /dev/null 2>&1; then - echo "MinIO bucket '$$S3_BUCKET' already exists." + echo "Step 1: Initializing MinIO bucket '$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET'..." + mc alias set minio "$$STORAGE_S3_ENDPOINT" "$$STORAGE_S3_AUTH_USERNAME" "$$STORAGE_S3_AUTH_PASSWORD" || true + if mc ls minio/$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET > /dev/null 2>&1; then + echo "MinIO bucket '$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET' already exists." else - mc mb minio/$$S3_BUCKET || { - echo "Failed to create MinIO bucket '$$S3_BUCKET'" + mc mb minio/$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET || { + echo "Failed to create MinIO bucket '$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET'" exit 1 } - echo "MinIO bucket '$$S3_BUCKET' created successfully." + echo "MinIO bucket '$$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET' created successfully." fi @@ -179,15 +179,15 @@ services: check_status "$$PROJECT_CODE" "Default Project" - echo "Step 3: Initializing Warehouse '$$WAREHOUSE_NAME'..." + echo "Step 3: Initializing Warehouse '$$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME'..." CREATE_PAYLOAD=$$(cat < Date: Sun, 15 Feb 2026 05:43:38 -0800 Subject: [PATCH 12/45] IcebergUtil.scala namespace init, needed --- .../texera/amber/util/IcebergUtil.scala | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala index fe2db42214e..a779caaf864 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala @@ -174,19 +174,19 @@ object IcebergUtil { TableProperties.COMMIT_MIN_RETRY_WAIT_MS -> StorageConfig.icebergTableCommitMinRetryWaitMs.toString ) -// val namespace = Namespace.of(tableNamespace) -// -// catalog match { -// case nsCatalog: SupportsNamespaces => -// try nsCatalog.createNamespace(namespace, Map.empty[String, String].asJava) -// catch { -// case _: AlreadyExistsException => () -// } -// case _ => -// throw new IllegalArgumentException( -// s"Catalog ${catalog.getClass.getName} does not support namespaces" -// ) -// } + val namespace = Namespace.of(tableNamespace) + + catalog match { + case nsCatalog: SupportsNamespaces => + try nsCatalog.createNamespace(namespace, Map.empty[String, String].asJava) + catch { + case _: AlreadyExistsException => () + } + case _ => + throw new IllegalArgumentException( + s"Catalog ${catalog.getClass.getName} does not support namespaces" + ) + } val identifier = TableIdentifier.of(tableNamespace, tableName) if (catalog.tableExists(identifier) && overrideIfExists) { From 2c31f4c5333ebd4b34c97d9cb126f323ac541933 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 17 Feb 2026 08:23:56 -0800 Subject: [PATCH 13/45] k8s set up, will test again, able to run in pods, but didnt test with workflows yet --- bin/k8s/Chart.yaml | 5 + bin/k8s/files/texera_lakekeeper.sql | 20 +++ bin/k8s/templates/lakekeeper-init-job.yaml | 137 ++++++++++++++++++ .../postgresql-init-script-config.yaml | 6 + bin/k8s/templates/webserver-deployment.yaml | 11 ++ ...low-computing-unit-manager-deployment.yaml | 38 +++-- bin/k8s/values.yaml | 32 ++++ 7 files changed, 239 insertions(+), 10 deletions(-) create mode 100644 bin/k8s/files/texera_lakekeeper.sql create mode 100644 bin/k8s/templates/lakekeeper-init-job.yaml diff --git a/bin/k8s/Chart.yaml b/bin/k8s/Chart.yaml index 1e6dcfbef83..9f6122fc3fc 100644 --- a/bin/k8s/Chart.yaml +++ b/bin/k8s/Chart.yaml @@ -59,6 +59,11 @@ dependencies: repository: oci://docker.io/envoyproxy alias: envoy-gateway + - name: lakekeeper + version: 0.9.0 + repository: https://lakekeeper.github.io/lakekeeper-charts/ + condition: lakekeeper.enabled + - name: metrics-server version: 3.12.2 repository: https://kubernetes-sigs.github.io/metrics-server/ diff --git a/bin/k8s/files/texera_lakekeeper.sql b/bin/k8s/files/texera_lakekeeper.sql new file mode 100644 index 00000000000..6fede36bcc7 --- /dev/null +++ b/bin/k8s/files/texera_lakekeeper.sql @@ -0,0 +1,20 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +\c postgres + +DROP DATABASE IF EXISTS texera_lakekeeper; +CREATE DATABASE texera_lakekeeper; diff --git a/bin/k8s/templates/lakekeeper-init-job.yaml b/bin/k8s/templates/lakekeeper-init-job.yaml new file mode 100644 index 00000000000..0a3540b6055 --- /dev/null +++ b/bin/k8s/templates/lakekeeper-init-job.yaml @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +{{- if .Values.lakekeeperInit.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-lakekeeper-init + namespace: {{ .Release.Namespace }} +spec: + backoffLimit: 3 + template: + metadata: + name: {{ .Release.Name }}-lakekeeper-init + spec: + restartPolicy: Never + containers: + - name: lakekeeper-init + image: alpine:3.19 + env: + - name: STORAGE_S3_ENDPOINT + value: http://{{ .Release.Name }}-minio:9000 + - name: STORAGE_S3_AUTH_USERNAME + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-minio + key: root-user + - name: STORAGE_S3_AUTH_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-minio + key: root-password + - name: STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET + value: {{ .Values.lakekeeperInit.warehouse.s3Bucket | quote }} + - name: STORAGE_ICEBERG_CATALOG_REST_REGION + value: {{ .Values.lakekeeperInit.warehouse.region | quote }} + - name: STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME + value: {{ .Values.lakekeeperInit.warehouse.name | quote }} + - name: LAKEKEEPER_BASE_URI + value: http://{{ .Release.Name }}-lakekeeper:{{ .Values.lakekeeper.catalog.service.externalPort }} + - name: LAKEKEEPER_PROJECT_ID + value: {{ .Values.lakekeeperInit.defaultProject.id | quote }} + - name: LAKEKEEPER_PROJECT_NAME + value: {{ .Values.lakekeeperInit.defaultProject.name | quote }} + command: + - /bin/sh + - -c + - | + set -e + + apk add --no-cache curl ca-certificates wget + wget -q https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc + chmod +x /usr/local/bin/mc + + check_status() { + if [ "$1" -ge 200 ] && [ "$1" -lt 300 ]; then + echo "Created $2 successfully (HTTP $1)." + elif [ "$1" -eq 409 ]; then + echo "$2 already exists (HTTP 409). Treating as success." + else + echo "Failed to create $2. HTTP Code: $1" + echo "ERROR RESPONSE:" + if [ -f /tmp/response.txt ]; then cat /tmp/response.txt; fi + echo "" + exit 1 + fi + } + + echo "Waiting for Lakekeeper health endpoint..." + until curl -s -f "${LAKEKEEPER_BASE_URI}/health" > /dev/null 2>&1; do + sleep 3 + done + + echo "Step 1: Initializing MinIO bucket '${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET}'..." + mc alias set minio "${STORAGE_S3_ENDPOINT}" "${STORAGE_S3_AUTH_USERNAME}" "${STORAGE_S3_AUTH_PASSWORD}" || true + if mc ls minio/${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET} > /dev/null 2>&1; then + echo "MinIO bucket '${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET}' already exists." + else + mc mb minio/${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET} + echo "MinIO bucket '${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET}' created successfully." + fi + + echo "Step 2: Initializing default project..." + PROJECT_PAYLOAD="{\"project-id\":\"${LAKEKEEPER_PROJECT_ID}\",\"project-name\":\"${LAKEKEEPER_PROJECT_NAME}\"}" + PROJECT_CODE=$(curl -s -o /tmp/response.txt -w "%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -d "${PROJECT_PAYLOAD}" \ + "${LAKEKEEPER_BASE_URI}/management/v1/project" || echo "000") + check_status "${PROJECT_CODE}" "Default Project" + + echo "Step 3: Initializing warehouse '${STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME}'..." + CREATE_PAYLOAD=$(cat < /tmp/texera_lakekeeper.sql +{{ .Files.Get "files/texera_lakekeeper.sql" | indent 6 }} + EOF + psql -U postgres -f /tmp/texera_lakekeeper.sql + echo "Initializing Texera database..." cat <<'EOF' > /tmp/texera_ddl.sql {{ .Files.Get "files/texera_ddl.sql" | indent 6 }} diff --git a/bin/k8s/templates/webserver-deployment.yaml b/bin/k8s/templates/webserver-deployment.yaml index 0c8656bfe8f..32249ca0b16 100644 --- a/bin/k8s/templates/webserver-deployment.yaml +++ b/bin/k8s/templates/webserver-deployment.yaml @@ -60,6 +60,17 @@ spec: secretKeyRef: name: {{ .Release.Name }}-lakefs-secret key: secret_key + # Workflow Result (Lakekeeper REST catalog) + - name: STORAGE_ICEBERG_CATALOG_TYPE + value: rest + - name: STORAGE_ICEBERG_CATALOG_REST_URI + value: http://{{ .Release.Name }}-lakekeeper:{{ .Values.lakekeeper.catalog.service.externalPort }}/catalog + - name: STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME + value: {{ .Values.lakekeeperInit.warehouse.name | quote }} + - name: STORAGE_ICEBERG_CATALOG_REST_REGION + value: {{ .Values.lakekeeperInit.warehouse.region | quote }} + - name: STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET + value: {{ .Values.lakekeeperInit.warehouse.s3Bucket | quote }} {{- range .Values.texeraEnvVars }} - name: {{ .name }} value: "{{ .value }}" diff --git a/bin/k8s/templates/workflow-computing-unit-manager-deployment.yaml b/bin/k8s/templates/workflow-computing-unit-manager-deployment.yaml index 75e5e76b30a..7c9bb7338aa 100644 --- a/bin/k8s/templates/workflow-computing-unit-manager-deployment.yaml +++ b/bin/k8s/templates/workflow-computing-unit-manager-deployment.yaml @@ -33,6 +33,25 @@ spec: app: {{ .Release.Name }}-{{ .Values.workflowComputingUnitManager.name }} spec: serviceAccountName: {{ .Values.workflowComputingUnitManager.serviceAccountName }} + initContainers: + - name: wait-lakekeeper + image: curlimages/curl:latest + command: + - /bin/sh + - -c + - | + set -e + LAKEKEEPER_BASE_URI="http://{{ .Release.Name }}-lakekeeper:{{ .Values.lakekeeper.catalog.service.externalPort }}" + WAREHOUSE_NAME="{{ .Values.lakekeeperInit.warehouse.name }}" + echo "Waiting for Lakekeeper to become healthy..." + until curl -s -f "${LAKEKEEPER_BASE_URI}/health" > /dev/null 2>&1; do + sleep 1 + done + echo "Waiting for warehouse '${WAREHOUSE_NAME}' to exist..." + until curl -s "${LAKEKEEPER_BASE_URI}/management/v1/warehouse" | grep -q "\"name\"[[:space:]]*:[[:space:]]*\"${WAREHOUSE_NAME}\""; do + sleep 1 + done + echo "Lakekeeper warehouse is ready." containers: - name: {{ .Values.workflowComputingUnitManager.name }} image: {{ .Values.global.imageRegistry }}/{{ .Values.workflowComputingUnitManager.imageName }}:{{ .Values.global.imageTag }} @@ -88,16 +107,15 @@ spec: key: secret_key # Workflow Result - name: STORAGE_ICEBERG_CATALOG_TYPE - value: postgres - - name: STORAGE_ICEBERG_CATALOG_POSTGRES_URI_WITHOUT_SCHEME - value: {{ .Release.Name }}-postgresql:5432/texera_iceberg_catalog - - name: STORAGE_ICEBERG_CATALOG_POSTGRES_USERNAME - value: postgres - - name: STORAGE_ICEBERG_CATALOG_POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: {{ .Release.Name }}-postgresql - key: postgres-password + value: rest + - name: STORAGE_ICEBERG_CATALOG_REST_URI + value: http://{{ .Release.Name }}-lakekeeper:{{ .Values.lakekeeper.catalog.service.externalPort }}/catalog + - name: STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME + value: {{ .Values.lakekeeperInit.warehouse.name | quote }} + - name: STORAGE_ICEBERG_CATALOG_REST_REGION + value: {{ .Values.lakekeeperInit.warehouse.region | quote }} + - name: STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET + value: {{ .Values.lakekeeperInit.warehouse.s3Bucket | quote }} {{- range .Values.texeraEnvVars }} - name: {{ .name }} value: "{{ .value }}" diff --git a/bin/k8s/values.yaml b/bin/k8s/values.yaml index 3e612ef40f5..1dea85befe5 100644 --- a/bin/k8s/values.yaml +++ b/bin/k8s/values.yaml @@ -107,6 +107,38 @@ lakefs: access_key_id: texera_minio secret_access_key: password +lakekeeper: + enabled: true + postgresql: + enabled: false + internalOpenFGA: false # may need to update + catalog: + replicas: 1 + image: + repository: vakamo/lakekeeper + tag: v0.11.0 + pullPolicy: IfNotPresent + service: + externalPort: 8181 + externalDatabase: + type: postgres + host_read: texera-postgresql + host_write: texera-postgresql + port: 5432 + database: texera_lakekeeper + user: postgres + password: root_password + +lakekeeperInit: + enabled: true + defaultProject: + id: "00000000-0000-0000-0000-000000000000" + name: default + warehouse: + name: texera-executions + region: us-west-2 + s3Bucket: texera-iceberg + # Part2: configurations of Texera-related micro services texeraImages: pullPolicy: Always From 14e884b879d8d7ba3470a0b491a0aef7ac9f26cd Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 17 Feb 2026 11:23:40 -0800 Subject: [PATCH 14/45] expose lakekeeper to the texera-computing-unit-pool --- bin/k8s/templates/external-names.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/k8s/templates/external-names.yaml b/bin/k8s/templates/external-names.yaml index 69540067b81..4a21846f3c9 100644 --- a/bin/k8s/templates/external-names.yaml +++ b/bin/k8s/templates/external-names.yaml @@ -81,4 +81,10 @@ to access services in the main namespace using the same service names. "externalName" (printf "%s-minio.%s.svc.cluster.local" .Release.Name $namespace) ) | nindent 0 }} +{{/* Lakekeeper ExternalName - Add this block */}} +{{- include "external-name-service" (dict + "name" (printf "%s-lakekeeper" .Release.Name) + "namespace" $workflowComputingUnitPoolNamespace + "externalName" (printf "%s-lakekeeper.%s.svc.cluster.local" .Release.Name $namespace) + ) | nindent 0 }} From 55a99a295da72c52f89c518f1eb7cab836d65657 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 17 Feb 2026 11:35:37 -0800 Subject: [PATCH 15/45] expose lakekeeper to the texera-computing-unit-pool, update --- bin/k8s/templates/external-names.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/k8s/templates/external-names.yaml b/bin/k8s/templates/external-names.yaml index 4a21846f3c9..e6def2d82c9 100644 --- a/bin/k8s/templates/external-names.yaml +++ b/bin/k8s/templates/external-names.yaml @@ -83,8 +83,7 @@ to access services in the main namespace using the same service names. {{/* Lakekeeper ExternalName - Add this block */}} {{- include "external-name-service" (dict - "name" (printf "%s-lakekeeper" .Release.Name) - "namespace" $workflowComputingUnitPoolNamespace - "externalName" (printf "%s-lakekeeper.%s.svc.cluster.local" .Release.Name $namespace) - ) | nindent 0 }} - + "name" (printf "%s-lakekeeper" .Release.Name) + "namespace" $workflowComputingUnitPoolNamespace + "externalName" (printf "%s-lakekeeper.%s.svc.cluster.local" .Release.Name $namespace) +) | nindent 0 }} From 8d2711ed478ba251eb81287584190a3f1d598925 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 17 Feb 2026 11:36:47 -0800 Subject: [PATCH 16/45] expose lakekeeper to the texera-computing-unit-pool, update --- bin/k8s/templates/external-names.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/k8s/templates/external-names.yaml b/bin/k8s/templates/external-names.yaml index e6def2d82c9..691c92e0b19 100644 --- a/bin/k8s/templates/external-names.yaml +++ b/bin/k8s/templates/external-names.yaml @@ -81,6 +81,7 @@ to access services in the main namespace using the same service names. "externalName" (printf "%s-minio.%s.svc.cluster.local" .Release.Name $namespace) ) | nindent 0 }} +--- {{/* Lakekeeper ExternalName - Add this block */}} {{- include "external-name-service" (dict "name" (printf "%s-lakekeeper" .Release.Name) From 34d636359a6d51f009f6603b3871f124c9d4ef74 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 25 Feb 2026 10:07:56 -0800 Subject: [PATCH 17/45] fix python dependency issue --- amber/requirements.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/amber/requirements.txt b/amber/requirements.txt index 803ab682d5e..8cca5d201f8 100644 --- a/amber/requirements.txt +++ b/amber/requirements.txt @@ -43,7 +43,10 @@ bidict==0.22.0 cached_property==1.5.2 psutil==5.9.0 tzlocal==2.1 -pyiceberg==0.8.1 +pyiceberg==0.9.0 +s3fs==2025.9.0 +aiobotocore==2.25.1 +botocore==1.40.53 readerwriterlock==1.0.9 tenacity==8.5.0 SQLAlchemy==2.0.37 From ea2a04c4660845f0b9b63357b8c66534446e3412 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 25 Feb 2026 10:40:03 -0800 Subject: [PATCH 18/45] update --- sql/texera_lakekeeper.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/texera_lakekeeper.sql b/sql/texera_lakekeeper.sql index 6fede36bcc7..afdca6946cc 100644 --- a/sql/texera_lakekeeper.sql +++ b/sql/texera_lakekeeper.sql @@ -9,7 +9,8 @@ -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, --- software distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -- KIND, either express or implied. See the License for the -- specific language governing permissions and limitations -- under the License. From 1f14d746745cbeb81c56973dbf4e7d6a8c47cd4f Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 25 Feb 2026 10:41:19 -0800 Subject: [PATCH 19/45] update --- common/config/src/main/resources/storage.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 3742ca465ce..90e0b601a09 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -142,10 +142,10 @@ storage { url-for-test-cases = "jdbc:postgresql://localhost:5432/texera_db_for_test_cases?currentSchema=texera_db,public" url-for-test-cases = ${?STORAGE_JDBC_URL_FOR_TEST_CASES} - username = "wangmeng" + username = "postgres" username = ${?STORAGE_JDBC_USERNAME} - password = "" + password = "postgres" password = ${?STORAGE_JDBC_PASSWORD} } } From 7ed8b9a9e0616f44fc86b1e39a1aab38fbf78455 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 25 Feb 2026 10:59:28 -0800 Subject: [PATCH 20/45] update --- bin/k8s/files/texera_lakekeeper.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/k8s/files/texera_lakekeeper.sql b/bin/k8s/files/texera_lakekeeper.sql index 6fede36bcc7..afdca6946cc 100644 --- a/bin/k8s/files/texera_lakekeeper.sql +++ b/bin/k8s/files/texera_lakekeeper.sql @@ -9,7 +9,8 @@ -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, --- software distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -- KIND, either express or implied. See the License for the -- specific language governing permissions and limitations -- under the License. From ba038bdd67c5b5756ad660974b02f2c68fba3026 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 25 Feb 2026 11:11:46 -0800 Subject: [PATCH 21/45] update --- bin/k8s/files/texera_lakekeeper.sql | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) mode change 100644 => 120000 bin/k8s/files/texera_lakekeeper.sql diff --git a/bin/k8s/files/texera_lakekeeper.sql b/bin/k8s/files/texera_lakekeeper.sql deleted file mode 100644 index afdca6946cc..00000000000 --- a/bin/k8s/files/texera_lakekeeper.sql +++ /dev/null @@ -1,21 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - -\c postgres - -DROP DATABASE IF EXISTS texera_lakekeeper; -CREATE DATABASE texera_lakekeeper; diff --git a/bin/k8s/files/texera_lakekeeper.sql b/bin/k8s/files/texera_lakekeeper.sql new file mode 120000 index 00000000000..6ddbed93822 --- /dev/null +++ b/bin/k8s/files/texera_lakekeeper.sql @@ -0,0 +1 @@ +../../../sql/texera_lakekeeper.sql \ No newline at end of file From dd3704551f816381d5cee2c0a17e355983c4b741 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 11:56:40 -0800 Subject: [PATCH 22/45] clean --- .../core/storage/iceberg/iceberg_catalog_instance.py | 4 ++-- amber/src/main/python/core/storage/storage_config.py | 8 +------- amber/src/main/python/texera_run_python_worker.py | 4 ---- .../architecture/pythonworker/PythonWorkflowWorker.scala | 2 -- 4 files changed, 3 insertions(+), 15 deletions(-) diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py index e394f773566..3160ffacf4a 100644 --- a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py +++ b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py @@ -59,8 +59,8 @@ def get_instance(cls): StorageConfig.ICEBERG_REST_CATALOG_URI, StorageConfig.S3_ENDPOINT, StorageConfig.S3_REGION, - StorageConfig.S3_USERNAME, - StorageConfig.S3_PASSWORD, + StorageConfig.S3_AUTH_USERNAME, + StorageConfig.S3_AUTH_PASSWORD, ) else: raise ValueError(f"Unsupported catalog type: {catalog_type}") diff --git a/amber/src/main/python/core/storage/storage_config.py b/amber/src/main/python/core/storage/storage_config.py index aeb182f751a..0e47bdb71ae 100644 --- a/amber/src/main/python/core/storage/storage_config.py +++ b/amber/src/main/python/core/storage/storage_config.py @@ -31,13 +31,11 @@ class StorageConfig: ICEBERG_POSTGRES_CATALOG_PASSWORD = None ICEBERG_REST_CATALOG_URI = None ICEBERG_REST_CATALOG_WAREHOUSE_NAME = None - S3_USERNAME = None - S3_PASSWORD = None ICEBERG_TABLE_RESULT_NAMESPACE = None ICEBERG_FILE_STORAGE_DIRECTORY_PATH = None ICEBERG_TABLE_COMMIT_BATCH_SIZE = None - # S3 configs (for large_binary_manager module) + # S3 configs S3_ENDPOINT = None S3_REGION = None S3_AUTH_USERNAME = None @@ -52,8 +50,6 @@ def initialize( postgres_password, rest_catalog_uri, rest_catalog_warehouse_name, - s3_username, - s3_password, table_result_namespace, directory_path, commit_batch_size, @@ -74,8 +70,6 @@ def initialize( cls.ICEBERG_REST_CATALOG_URI = rest_catalog_uri cls.ICEBERG_REST_CATALOG_WAREHOUSE_NAME = rest_catalog_warehouse_name - cls.S3_USERNAME = s3_username - cls.S3_PASSWORD = s3_password cls.ICEBERG_TABLE_RESULT_NAMESPACE = table_result_namespace cls.ICEBERG_FILE_STORAGE_DIRECTORY_PATH = directory_path cls.ICEBERG_TABLE_COMMIT_BATCH_SIZE = int(commit_batch_size) diff --git a/amber/src/main/python/texera_run_python_worker.py b/amber/src/main/python/texera_run_python_worker.py index 4d3a6cf1a5e..8687298f819 100644 --- a/amber/src/main/python/texera_run_python_worker.py +++ b/amber/src/main/python/texera_run_python_worker.py @@ -51,8 +51,6 @@ def init_loguru_logger(stream_log_level) -> None: iceberg_postgres_catalog_password, iceberg_rest_catalog_uri, iceberg_rest_catalog_warehouse_name, - s3_username, - s3_password, iceberg_table_namespace, iceberg_file_storage_directory_path, iceberg_table_commit_batch_size, @@ -69,8 +67,6 @@ def init_loguru_logger(stream_log_level) -> None: iceberg_postgres_catalog_password, iceberg_rest_catalog_uri, iceberg_rest_catalog_warehouse_name, - s3_username, - s3_password, iceberg_table_namespace, iceberg_file_storage_directory_path, iceberg_table_commit_batch_size, diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala index 5badac45e20..d2bc5f50253 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala @@ -184,8 +184,6 @@ class PythonWorkflowWorker( StorageConfig.icebergPostgresCatalogPassword, StorageConfig.icebergRESTCatalogUri, StorageConfig.icebergRESTCatalogWarehouseName, - StorageConfig.s3Username, - StorageConfig.s3Password, StorageConfig.icebergTableResultNamespace, StorageConfig.fileStorageDirectoryPath.toString, StorageConfig.icebergTableCommitBatchSize.toString, From cdc2f01eaa9876dbf39e79117dd5f02fb3ba05f8 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:09:35 -0800 Subject: [PATCH 23/45] clean --- .../main/python/core/storage/iceberg/test_iceberg_document.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index 54f0bb86d17..7da090d9bfa 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -43,8 +43,6 @@ postgres_password="password", rest_catalog_uri="", rest_catalog_warehouse_name="", - s3_username="", - s3_password="", table_result_namespace="operator-port-result", directory_path="../../../../../../amber/user-resources/workflow-results", commit_batch_size=4096, From b96436c83657dd4eaf9d4e0c30a505bfdab23327 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:32:03 -0800 Subject: [PATCH 24/45] clean --- bin/k8s/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/k8s/values.yaml b/bin/k8s/values.yaml index 1dea85befe5..653ab8a9515 100644 --- a/bin/k8s/values.yaml +++ b/bin/k8s/values.yaml @@ -111,7 +111,7 @@ lakekeeper: enabled: true postgresql: enabled: false - internalOpenFGA: false # may need to update + internalOpenFGA: false catalog: replicas: 1 image: From da271dea4e1ae637a1c76ff77a03315094a3e6be Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:35:59 -0800 Subject: [PATCH 25/45] clean --- common/config/src/main/resources/storage.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 90e0b601a09..779a3ed126b 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -32,7 +32,7 @@ storage { rest { uri = "http://localhost:8181/catalog/" uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} - warehouse-name = "texeraExecutionsRR" + warehouse-name = "texera" warehouse-name = ${?STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME} # warehouse name for Lakekeeper, e.g., "taxera-execution" region = "us-west-2" region = ${?STORAGE_ICEBERG_CATALOG_REST_REGION} From 906b3af12b12df527bec1eb849bb7e88b74efbd4 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:38:30 -0800 Subject: [PATCH 26/45] clean --- bin/k8s/values.yaml | 2 +- bin/single-node/.env | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/k8s/values.yaml b/bin/k8s/values.yaml index 653ab8a9515..d4aa357f34f 100644 --- a/bin/k8s/values.yaml +++ b/bin/k8s/values.yaml @@ -135,7 +135,7 @@ lakekeeperInit: id: "00000000-0000-0000-0000-000000000000" name: default warehouse: - name: texera-executions + name: texera region: us-west-2 s3Bucket: texera-iceberg diff --git a/bin/single-node/.env b/bin/single-node/.env index f19cb6ec37d..935fa54d018 100644 --- a/bin/single-node/.env +++ b/bin/single-node/.env @@ -57,7 +57,7 @@ LAKEKEEPER__PG_DATABASE_URL_READ=postgres://texera:password@postgres:5432/texera LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://texera:password@postgres:5432/texera_lakekeeper LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key LAKEKEEPER_BASE_URI=http://lakekeeper:8181 -STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME=texera-executions +STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME=texera STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET=texera-iceberg STORAGE_S3_AUTH_USERNAME=texera_minio STORAGE_S3_AUTH_PASSWORD=password From 3ac34c159d28632438093fd5cf8f87bbab3e49bf Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:48:23 -0800 Subject: [PATCH 27/45] clean --- bin/single-node/.env | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/single-node/.env b/bin/single-node/.env index 935fa54d018..ac3c80e5692 100644 --- a/bin/single-node/.env +++ b/bin/single-node/.env @@ -53,10 +53,10 @@ STORAGE_ICEBERG_CATALOG_POSTGRES_URI_WITHOUT_SCHEME=texera-postgres:5432/texera_ STORAGE_ICEBERG_CATALOG_POSTGRES_USERNAME=texera STORAGE_ICEBERG_CATALOG_POSTGRES_PASSWORD=password -LAKEKEEPER__PG_DATABASE_URL_READ=postgres://texera:password@postgres:5432/texera_lakekeeper -LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://texera:password@postgres:5432/texera_lakekeeper +LAKEKEEPER__PG_DATABASE_URL_READ=postgres://texera:password@texera-postgres:5432/texera_lakekeeper +LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://texera:password@texera-postgres:5432/texera_lakekeeper LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key -LAKEKEEPER_BASE_URI=http://lakekeeper:8181 +LAKEKEEPER_BASE_URI=http://texera-lakekeeper:8181 STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME=texera STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET=texera-iceberg STORAGE_S3_AUTH_USERNAME=texera_minio From 069d3138723067dccb52eb85c46587db4ca7e4cb Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:51:41 -0800 Subject: [PATCH 28/45] clean --- common/config/src/main/resources/storage.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/config/src/main/resources/storage.conf b/common/config/src/main/resources/storage.conf index 779a3ed126b..d9c76646dd8 100644 --- a/common/config/src/main/resources/storage.conf +++ b/common/config/src/main/resources/storage.conf @@ -33,11 +33,11 @@ storage { uri = "http://localhost:8181/catalog/" uri = ${?STORAGE_ICEBERG_CATALOG_REST_URI} warehouse-name = "texera" - warehouse-name = ${?STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME} # warehouse name for Lakekeeper, e.g., "taxera-execution" + warehouse-name = ${?STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME} region = "us-west-2" region = ${?STORAGE_ICEBERG_CATALOG_REST_REGION} s3-bucket = "texera-iceberg" - s3-bucket = ${?STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET} # S3 bucket name for Iceberg storage + s3-bucket = ${?STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET} } postgres { From 5d125fc0d7a977b96fcfc1f37230195b31b7fa2d Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:55:02 -0800 Subject: [PATCH 29/45] clean --- bin/single-node/docker-compose.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bin/single-node/docker-compose.yml b/bin/single-node/docker-compose.yml index efc77fd671a..46e6c2186a7 100644 --- a/bin/single-node/docker-compose.yml +++ b/bin/single-node/docker-compose.yml @@ -89,10 +89,7 @@ services: entrypoint: ["/home/nonroot/lakekeeper"] command: ["migrate"] - # Lakekeeper is the Iceberg REST catalog service for workflow result storage - # NOTE: You need to provide a Lakekeeper docker image or build one from source. - # The image should contain the Lakekeeper binary and run it with 'serve' command. - # Example: If using a custom image, set LAKEKEEPER_IMAGE environment variable in .env + # Lakekeeper is the Iceberg REST catalog service lakekeeper: image: vakamo/lakekeeper:v0.11.0 container_name: texera-lakekeeper From b58e3cbce2336c07298e4716ab7c3dad7331cd9c Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 13:07:52 -0800 Subject: [PATCH 30/45] clean --- .../scala/org/apache/texera/amber/config/StorageConfig.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala index 58f1e88c881..d136142fab6 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala +++ b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala @@ -41,7 +41,6 @@ object StorageConfig { val icebergCatalogType: String = conf.getString("storage.iceberg.catalog.type") val icebergRESTCatalogUri: String = conf.getString("storage.iceberg.catalog.rest.uri") val icebergRESTCatalogWarehouseName: String = conf.getString("storage.iceberg.catalog.rest.warehouse-name") - val icebergRESTCatalogS3Bucket: String = conf.getString("storage.iceberg.catalog.rest.s3-bucket") // Iceberg Postgres specifics val icebergPostgresCatalogUriWithoutScheme: String = From c95541a38e70a70cebfc1c02247b58daa3ec2d4d Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 13:32:03 -0800 Subject: [PATCH 31/45] clean --- .../scala/org/apache/texera/amber/util/IcebergUtil.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala index a779caaf864..45c0cf0999d 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala @@ -92,7 +92,7 @@ object IcebergUtil { * TODO: Add authentication support, such as OAuth2, using `OAuth2Properties`. * * @param catalogName the name of the catalog. - * @param warehouse the warehouse identifier (path for standard REST catalog, name for Lakekeeper). + * @param warehouse the warehouse identifier (for Lakekeeper). * @return the initialized RESTCatalog instance. */ def createRestCatalog( @@ -116,9 +116,6 @@ object IcebergUtil { "s3.region" -> StorageConfig.s3Region, "s3.path-style-access" -> "true", ) - - println(s"[IcebergUtil] effective s3.endpoint = ${properties.get("s3.endpoint")}, io.s3.endpoint = ${properties.get("io.s3.endpoint")}") - println(s"[IcebergUtil] StorageConfig.s3Endpoint = ${StorageConfig.s3Endpoint}") catalog.initialize(catalogName, properties.asJava) catalog From 2658c348ffa2cbe7fd20caad47f932f4db4b8232 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 20:50:54 -0800 Subject: [PATCH 32/45] clean code --- bin/bootstrap-lakekeeper.sh | 523 ++++++++++++++++++++++++++++++++++++ bin/parse-storage-config.py | 60 +++-- 2 files changed, 557 insertions(+), 26 deletions(-) create mode 100755 bin/bootstrap-lakekeeper.sh diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh new file mode 100755 index 00000000000..facbc491612 --- /dev/null +++ b/bin/bootstrap-lakekeeper.sh @@ -0,0 +1,523 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Bootstrap script to start Lakekeeper and create warehouse (idempotent). +# This script does four things: +# 1. Starts Lakekeeper if it's not already running +# 2. Bootstraps the Lakekeeper server (creates default project, idempotent) +# 3. Checks if MinIO bucket exists (and creates it if needed) +# 4. Checks and creates the warehouse if it doesn't exist +# +# +# Usage: +# ./bin/bootstrap-lakekeeper.sh + +set -e + +# ============================================================================== +# User Configuration - Edit the values below before running this script +# ============================================================================== + +# Lakekeeper binary path +LAKEKEEPER_BINARY_PATH="/Users/wangmeng/Desktop/lakekeeper-binary/lakekeeper" + +# Lakekeeper PostgreSQL connection URLs +LAKEKEEPER__PG_DATABASE_URL_READ="postgres://wangmeng:@localhost:5432/texera_lakekeeper" +LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://wangmeng:@localhost:5432/texera_lakekeeper" + +# Lakekeeper encryption key +LAKEKEEPER__PG_ENCRYPTION_KEY="texera_key" + +# Lakekeeper metrics port +LAKEKEEPER__METRICS_PORT="9091" + +# ============================================================================== +# End of User Configuration +# ============================================================================== + +# Read remaining configuration from storage.conf +# Priority: user config above > storage.conf > default value + +# Find storage.conf path +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -n "$TEXERA_HOME" ]; then + STORAGE_CONF_PATH="$TEXERA_HOME/common/config/src/main/resources/storage.conf" +else + STORAGE_CONF_PATH="$SCRIPT_DIR/../common/config/src/main/resources/storage.conf" +fi + +# Extract values from storage.conf using pyhocon for proper HOCON parsing +if [ -f "$STORAGE_CONF_PATH" ]; then + # Check if pyhocon is available + if ! command -v python3 >/dev/null 2>&1; then + echo "✗ Error: python3 is required to parse storage.conf" + echo " Please install Python 3" + exit 1 + fi + + if ! python3 -c "import pyhocon" 2>/dev/null; then + echo "✗ Error: pyhocon is required to parse storage.conf" + echo " Install it with: pip install pyhocon" + exit 1 + fi + + # Use batch mode to parse all config values in a single python invocation + CONF_OUTPUT=$(python3 "$SCRIPT_DIR/parse-storage-config.py" --batch \ + REST_URI_FROM_CONF=storage.iceberg.catalog.rest.uri \ + WAREHOUSE_NAME_FROM_CONF=storage.iceberg.catalog.rest.warehouse-name \ + REST_REGION_FROM_CONF=storage.iceberg.catalog.rest.region \ + S3_BUCKET_FROM_CONF=storage.iceberg.catalog.rest.s3-bucket \ + S3_ENDPOINT_FROM_CONF=storage.s3.endpoint \ + S3_USERNAME_FROM_CONF=storage.s3.auth.username \ + S3_PASSWORD_FROM_CONF=storage.s3.auth.password \ + 2>/dev/null) || true + + # Parse the batch output (each line is VAR_NAME=value) + while IFS='=' read -r var_name var_value; do + [ -z "$var_name" ] && continue + declare "$var_name=$var_value" + done <<< "$CONF_OUTPUT" + + # Strip trailing /catalog/ from REST URI + REST_URI_FROM_CONF=$(echo "${REST_URI_FROM_CONF:-}" | sed 's|/catalog/*$||') + + echo "Configuration read from storage.conf:" + echo " REST_URI=$REST_URI_FROM_CONF" + echo " WAREHOUSE_NAME=$WAREHOUSE_NAME_FROM_CONF" + echo " REGION=$REST_REGION_FROM_CONF" + echo " S3_BUCKET=$S3_BUCKET_FROM_CONF" + echo " S3_ENDPOINT=$S3_ENDPOINT_FROM_CONF" + echo " S3_USERNAME=$S3_USERNAME_FROM_CONF" + echo " S3_PASSWORD=***" + echo "" +else + REST_URI_FROM_CONF="" + WAREHOUSE_NAME_FROM_CONF="" + REST_REGION_FROM_CONF="" + S3_BUCKET_FROM_CONF="" + S3_ENDPOINT_FROM_CONF="" + S3_USERNAME_FROM_CONF="" + S3_PASSWORD_FROM_CONF="" + echo "storage.conf not found, using environment variables or defaults" + echo "" +fi + +# Use values from storage.conf with defaults +LAKEKEEPER_BASE_URI="${REST_URI_FROM_CONF:-http://localhost:8181}" +WAREHOUSE_NAME="${WAREHOUSE_NAME_FROM_CONF:-texera}" +S3_REGION="${REST_REGION_FROM_CONF:-us-west-2}" +S3_BUCKET="${S3_BUCKET_FROM_CONF:-texera-iceberg}" +S3_ENDPOINT="${S3_ENDPOINT_FROM_CONF:-http://localhost:9000}" +S3_USERNAME="${S3_USERNAME_FROM_CONF:-texera_minio}" +S3_PASSWORD="${S3_PASSWORD_FROM_CONF:-password}" +STORAGE_PATH="s3://${S3_BUCKET}/iceberg/${WAREHOUSE_NAME}" + +echo "==========================================" +echo "Lakekeeper Bootstrap and Warehouse Setup" +echo "==========================================" +echo "Lakekeeper Base URI: $LAKEKEEPER_BASE_URI" +echo "Lakekeeper Binary: ${LAKEKEEPER_BINARY_PATH:-lakekeeper}" +echo "Warehouse Name: $WAREHOUSE_NAME" +echo "S3 Endpoint: $S3_ENDPOINT" +echo "S3 Bucket: $S3_BUCKET" +echo "Storage Path: $STORAGE_PATH" +echo "" + +# Function to check if Lakekeeper is running +check_lakekeeper_running() { + local health_url="${LAKEKEEPER_BASE_URI}/health" + if curl -s -f "$health_url" > /dev/null 2>&1; then + return 0 # Running + else + return 1 # Not running + fi +} + +# Function to bootstrap the Lakekeeper server (creates default project). +# This is idempotent - safe to call even if already bootstrapped. +# Returns: 0=success (or already bootstrapped), 1=failure +bootstrap_lakekeeper_server() { + local base_uri="$1" + local bootstrap_url="${base_uri}/management/v1/bootstrap" + + echo "Bootstrapping Lakekeeper server (creating default project)..." + echo " URL: $bootstrap_url" + + local temp_response + temp_response=$(mktemp) || { + echo "✗ Failed to create temporary file" + return 1 + } + + local http_code + http_code=$(curl -s -o "$temp_response" -w "%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"accept-terms-of-use": true}' \ + "$bootstrap_url" 2>/dev/null || echo "000") + + echo " HTTP status: $http_code" + + case "$http_code" in + 000) + echo "✗ Failed to connect to Lakekeeper at $bootstrap_url" + rm -f "$temp_response" || true + return 1 + ;; + 2*) + echo "✓ Lakekeeper server bootstrapped successfully (HTTP $http_code)" + rm -f "$temp_response" || true + return 0 + ;; + *) + if grep -q "CatalogAlreadyBootstrapped" "$temp_response" 2>/dev/null; then + echo "✓ Lakekeeper server already bootstrapped (HTTP $http_code), continuing." + rm -f "$temp_response" || true + return 0 + fi + echo "✗ Failed to bootstrap Lakekeeper server (HTTP $http_code)" + echo " Response body:" + cat "$temp_response" | sed 's/^/ /' || true + rm -f "$temp_response" || true + return 1 + ;; + esac +} + +# Function to check if MinIO bucket exists (requires AWS CLI) +check_minio_bucket() { + local bucket_name="$1" + local endpoint="$2" + local username="$3" + local password="$4" + + if ! command -v aws >/dev/null 2>&1; then + echo "✗ Error: AWS CLI is required for MinIO bucket operations." + echo " Install it with: pip install awscli" + return 1 + fi + + if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ + aws --endpoint-url="$endpoint" s3 ls "s3://${bucket_name}/" >/dev/null 2>&1; then + return 0 # Bucket exists + else + return 1 # Bucket doesn't exist or error + fi +} + +# Function to create MinIO bucket (requires AWS CLI) +create_minio_bucket() { + local bucket_name="$1" + local endpoint="$2" + local username="$3" + local password="$4" + + if ! command -v aws >/dev/null 2>&1; then + echo "✗ Error: AWS CLI is required for MinIO bucket operations." + echo " Install it with: pip install awscli" + return 1 + fi + + if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ + aws --endpoint-url="$endpoint" s3 mb "s3://${bucket_name}" >/dev/null 2>&1; then + return 0 # Success + else + return 1 # Failed + fi +} + +# Function to start Lakekeeper +start_lakekeeper() { + export LAKEKEEPER__METRICS_PORT + export LAKEKEEPER__PG_ENCRYPTION_KEY + + echo "Starting Lakekeeper..." + + # Validate LAKEKEEPER_BINARY_PATH + if [ -z "$LAKEKEEPER_BINARY_PATH" ]; then + echo "✗ Error: LAKEKEEPER_BINARY_PATH is not set." + echo " Please set it in the User Configuration section at the top of this script." + exit 1 + fi + + if [ ! -x "$LAKEKEEPER_BINARY_PATH" ]; then + echo "✗ Error: Lakekeeper binary not found or not executable at '$LAKEKEEPER_BINARY_PATH'" + echo " Please update LAKEKEEPER_BINARY_PATH in the User Configuration section." + exit 1 + fi + + local binary_path="$LAKEKEEPER_BINARY_PATH" + + # Validate required database URLs + if [ -z "$LAKEKEEPER__PG_DATABASE_URL_READ" ] || [ -z "$LAKEKEEPER__PG_DATABASE_URL_WRITE" ]; then + echo "✗ Error: Database URLs not configured." + echo " Please set LAKEKEEPER__PG_DATABASE_URL_READ and LAKEKEEPER__PG_DATABASE_URL_WRITE" + echo " in the User Configuration section at the top of this script." + exit 1 + fi + export LAKEKEEPER__PG_DATABASE_URL_READ + export LAKEKEEPER__PG_DATABASE_URL_WRITE + + # Run migration first + echo "Running Lakekeeper migration..." + if ! "$binary_path" migrate; then + echo "✗ Failed to run Lakekeeper migration" + return 1 + fi + + # Start Lakekeeper in background + echo "Starting Lakekeeper server..." + nohup "$binary_path" serve > /tmp/lakekeeper.log 2>&1 & + local lakekeeper_pid=$! + echo "Lakekeeper started with PID: $lakekeeper_pid" + + # Wait for Lakekeeper to be ready + echo "Waiting for Lakekeeper to be ready..." + local max_attempts=30 + local attempt=1 + while [ $attempt -le $max_attempts ]; do + if check_lakekeeper_running; then + echo "✓ Lakekeeper is ready!" + return 0 + fi + if [ $attempt -eq $max_attempts ]; then + echo "✗ Lakekeeper did not become ready after $max_attempts attempts" + echo " Check logs at /tmp/lakekeeper.log" + return 1 + fi + echo " Waiting for Lakekeeper... ($attempt/$max_attempts)" + sleep 2 + attempt=$((attempt + 1)) + done +} + +# Function to check if warehouse exists +# Returns: 0=exists, 1=not found, 2=connection error +check_warehouse_exists() { + local warehouse_name="$1" + local base_uri="$2" + + local list_url="${base_uri}/management/v1/warehouse" + + echo "Checking if warehouse '$warehouse_name' exists..." + + local temp_response + temp_response=$(mktemp) || { + echo "✗ Failed to create temporary file" + return 2 + } + + local http_code + http_code=$(curl -s -o "$temp_response" -w "%{http_code}" "$list_url" 2>/dev/null || echo "000") + + if [ "$http_code" = "000" ]; then + rm -f "$temp_response" || true + echo "✗ Failed to connect to Lakekeeper at $list_url" + return 2 + fi + + if [ "$http_code" != "200" ]; then + echo "⚠ Warning: Unexpected HTTP status $http_code when listing warehouses" + cat "$temp_response" 2>/dev/null | sed 's/^/ /' || true + rm -f "$temp_response" || true + return 1 + fi + + # Check if warehouse name exists in the response + # Response format: {"warehouses":[{"name":"...",...},...]} + local found=1 + if command -v jq >/dev/null 2>&1; then + if jq -e ".warehouses[] | select(.name == \"$warehouse_name\")" "$temp_response" >/dev/null 2>&1; then + found=0 + fi + else + if grep -q "\"name\"[[:space:]]*:[[:space:]]*\"$warehouse_name\"" "$temp_response" 2>/dev/null; then + found=0 + fi + fi + + rm -f "$temp_response" 2>/dev/null || true + return $found +} + +# Function to create warehouse +# Args: warehouse_name base_uri s3_bucket s3_region s3_endpoint s3_username s3_password +# Returns: 0=success, 1=failure +create_warehouse() { + local warehouse_name="$1" + local base_uri="$2" + local bucket="$3" + local region="$4" + local endpoint="$5" + local username="$6" + local password="$7" + + local create_url="${base_uri}/management/v1/warehouse" + + local create_payload=$(cat </dev/null | sed 's/^/ /' || true + rm -f "$temp_response" || true + return 1 + ;; + esac +} + +# Step 1: Check if Lakekeeper is running, start if not +echo "Step 1: Checking Lakekeeper status..." +if check_lakekeeper_running; then + echo "✓ Lakekeeper is already running" +else + echo "Lakekeeper is not running, attempting to start..." + if start_lakekeeper; then + echo "✓ Lakekeeper started successfully" + else + echo "✗ Failed to start Lakekeeper" + exit 1 + fi +fi +echo "" + +# Step 2: Bootstrap the Lakekeeper server (creates default project) +echo "Step 2: Bootstrapping Lakekeeper server..." +if bootstrap_lakekeeper_server "$LAKEKEEPER_BASE_URI"; then + echo "✓ Lakekeeper server bootstrap completed" +else + echo "✗ Failed to bootstrap Lakekeeper server" + echo " Please check that Lakekeeper is running and accessible at $LAKEKEEPER_BASE_URI" + exit 1 +fi +echo "" + +# Step 3: Check and create MinIO bucket +echo "Step 3: Checking MinIO bucket..." +if check_minio_bucket "$S3_BUCKET" "$S3_ENDPOINT" "$S3_USERNAME" "$S3_PASSWORD"; then + echo "✓ MinIO bucket '$S3_BUCKET' already exists" +else + echo "MinIO bucket '$S3_BUCKET' does not exist, creating..." + if create_minio_bucket "$S3_BUCKET" "$S3_ENDPOINT" "$S3_USERNAME" "$S3_PASSWORD"; then + echo "✓ MinIO bucket '$S3_BUCKET' created successfully" + else + echo "✗ Failed to create MinIO bucket '$S3_BUCKET'" + echo " Please ensure MinIO is running and accessible at $S3_ENDPOINT" + exit 1 + fi +fi +echo "" + +# Step 4: Check and create warehouse +echo "Step 4: Checking and creating warehouse..." + +set +e # Temporarily disable exit on error to capture function return value +check_warehouse_exists "$WAREHOUSE_NAME" "$LAKEKEEPER_BASE_URI" +check_result=$? +set -e # Re-enable exit on error + +case $check_result in + 0) + echo "✓ Warehouse '$WAREHOUSE_NAME' already exists, skipping creation." + echo "" + echo "==========================================" + echo "✓ Bootstrap completed successfully!" + echo "==========================================" + exit 0 + ;; + 1) + echo "Warehouse '$WAREHOUSE_NAME' does not exist, will create..." + ;; + 2) + exit 1 + ;; + *) + echo "✗ Unexpected error (code: $check_result)" + exit 1 + ;; +esac + +# Create warehouse +if create_warehouse "$WAREHOUSE_NAME" "$LAKEKEEPER_BASE_URI" "$S3_BUCKET" "$S3_REGION" "$S3_ENDPOINT" "$S3_USERNAME" "$S3_PASSWORD"; then + echo "" + echo "==========================================" + echo "✓ Bootstrap completed successfully!" + echo "==========================================" + exit 0 +else + echo "" + echo "==========================================" + echo "✗ Bootstrap failed!" + echo "==========================================" + exit 1 +fi diff --git a/bin/parse-storage-config.py b/bin/parse-storage-config.py index 69aa78ca9c3..9a8d5b1a638 100755 --- a/bin/parse-storage-config.py +++ b/bin/parse-storage-config.py @@ -4,11 +4,18 @@ This script properly handles HOCON syntax including environment variable substitution. Usage: - python3 bin/parse-storage-config.py [key_path] - -Examples: + # Single key mode (backward compatible): python3 bin/parse-storage-config.py storage.iceberg.catalog.rest.uri + + # Batch mode (outputs VAR_NAME=value lines): + python3 bin/parse-storage-config.py --batch VAR1=key.path1 VAR2=key.path2 ... + +Examples: python3 bin/parse-storage-config.py storage.s3.endpoint + python3 bin/parse-storage-config.py --batch \ + REST_URI=storage.iceberg.catalog.rest.uri \ + WAREHOUSE_NAME=storage.iceberg.catalog.rest.warehouse-name \ + S3_ENDPOINT=storage.s3.endpoint """ import os @@ -28,44 +35,49 @@ def find_storage_conf(): if texera_home: conf_path = Path(texera_home) / "common" / "config" / "src" / "main" / "resources" / "storage.conf" else: - # Assume we're in the project root script_dir = Path(__file__).parent conf_path = script_dir.parent / "common" / "config" / "src" / "main" / "resources" / "storage.conf" - + if not conf_path.exists(): print(f"Error: storage.conf not found at {conf_path}", file=sys.stderr) sys.exit(1) - + return conf_path def parse_storage_config(): """Parse storage.conf with environment variable resolution.""" conf_path = find_storage_conf() - - # pyhocon automatically resolves environment variables - # Environment variables are available in os.environ config = ConfigFactory.parse_file(str(conf_path)) - return config def get_value(config, key_path): - """Get value from config by key path (e.g., 'storage.iceberg.catalog.rest.uri').""" - keys = key_path.split(".") - value = config - for key in keys: - if hasattr(value, key): - value = getattr(value, key) - elif key in value: - value = value[key] - else: - return None - return value + """Get value from config by dot-separated key path.""" + try: + return config.get_string(key_path) + except Exception: + return None def main(): - if len(sys.argv) > 1: + if len(sys.argv) < 2: + config = parse_storage_config() + print(config.get("storage", {})) + return + + if sys.argv[1] == "--batch": + config = parse_storage_config() + for arg in sys.argv[2:]: + if "=" not in arg: + print(f"Error: batch argument must be VAR_NAME=key.path, got '{arg}'", file=sys.stderr) + sys.exit(1) + var_name, key_path = arg.split("=", 1) + value = get_value(config, key_path) + if value is None: + value = "" + print(f"{var_name}={value}") + else: key_path = sys.argv[1] config = parse_storage_config() value = get_value(config, key_path) @@ -73,10 +85,6 @@ def main(): print(f"Key '{key_path}' not found", file=sys.stderr) sys.exit(1) print(value) - else: - # Print all storage config - config = parse_storage_config() - print(config.get("storage", {})) if __name__ == "__main__": From 38561e6aaea28924e0a3eae563db794c5a58dbbb Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Thu, 26 Feb 2026 20:51:29 -0800 Subject: [PATCH 33/45] clean code --- bin/bootstrap-lakekeeper-warehouse.sh | 499 -------------------------- 1 file changed, 499 deletions(-) delete mode 100755 bin/bootstrap-lakekeeper-warehouse.sh diff --git a/bin/bootstrap-lakekeeper-warehouse.sh b/bin/bootstrap-lakekeeper-warehouse.sh deleted file mode 100755 index ad167bc14b6..00000000000 --- a/bin/bootstrap-lakekeeper-warehouse.sh +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Bootstrap script to start Lakekeeper and create warehouse (idempotent). -# This script does three things: -# 1. Starts Lakekeeper if it's not already running -# 2. Checks if MinIO bucket exists (and creates it if needed) -# 3. Checks and creates the warehouse if it doesn't exist -# -# -# Usage: -# ./bin/bootstrap-lakekeeper-warehouse.sh - -set -e - -# Read configuration from storage.conf or environment variables -# Priority: environment variable > storage.conf > default value - -# Find storage.conf path -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -if [ -n "$TEXERA_HOME" ]; then - STORAGE_CONF_PATH="$TEXERA_HOME/common/config/src/main/resources/storage.conf" -else - STORAGE_CONF_PATH="$SCRIPT_DIR/../common/config/src/main/resources/storage.conf" -fi - -# Extract values from storage.conf using pyhocon for proper HOCON parsing -# pyhocon handles environment variable substitution correctly -if [ -f "$STORAGE_CONF_PATH" ]; then - # Check if pyhocon is available - if ! command -v python3 >/dev/null 2>&1; then - echo "✗ Error: python3 is required to parse storage.conf" - echo " Please install Python 3" - exit 1 - fi - - if ! python3 -c "import pyhocon" 2>/dev/null; then - echo "✗ Error: pyhocon is required to parse storage.conf" - echo " Install it with: pip install pyhocon" - exit 1 - fi - - # Use pyhocon for proper HOCON parsing (handles environment variable substitution) - REST_URI_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.uri" 2>/dev/null | sed 's|/catalog/*$||' || echo "") - WAREHOUSE_NAME_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.warehouse-name" 2>/dev/null || echo "") - REST_REGION_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.region" 2>/dev/null || echo "") - S3_BUCKET_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.iceberg.catalog.rest.s3-bucket" 2>/dev/null || echo "") - S3_ENDPOINT_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.endpoint" 2>/dev/null || echo "") - S3_USERNAME_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.auth.username" 2>/dev/null || echo "") - S3_PASSWORD_FROM_CONF=$(python3 "$SCRIPT_DIR/parse-storage-config.py" "storage.s3.auth.password" 2>/dev/null || echo "") - - - echo "Configuration read from storage.conf:" - echo " REST_URI_FROM_CONF=$REST_URI_FROM_CONF" - echo " WAREHOUSE_NAME_FROM_CONF=$WAREHOUSE_NAME_FROM_CONF" - echo " REST_REGION_FROM_CONF=$REST_REGION_FROM_CONF" - echo " S3_BUCKET_FROM_CONF=$S3_BUCKET_FROM_CONF" - echo " S3_ENDPOINT_FROM_CONF=$S3_ENDPOINT_FROM_CONF" - echo " S3_USERNAME_FROM_CONF=$S3_USERNAME_FROM_CONF" - echo " S3_PASSWORD_FROM_CONF=$S3_PASSWORD_FROM_CONF" - echo "" -else - REST_URI_FROM_CONF="" - WAREHOUSE_NAME_FROM_CONF="" - REST_REGION_FROM_CONF="" - S3_BUCKET_FROM_CONF="" - S3_ENDPOINT_FROM_CONF="" - S3_USERNAME_FROM_CONF="" - S3_PASSWORD_FROM_CONF="" - echo "storage.conf not found, using environment variables or defaults" - echo "" -fi - -# Use values from storage.conf with defaults -LAKEKEEPER_BASE_URI="${REST_URI_FROM_CONF:-http://localhost:8181}" -WAREHOUSE_NAME="${WAREHOUSE_NAME_FROM_CONF:-texera-executions}" -S3_REGION="${REST_REGION_FROM_CONF:-us-west-2}" -S3_BUCKET="${S3_BUCKET_FROM_CONF:-texera-iceberg}" -S3_ENDPOINT="${S3_ENDPOINT_FROM_CONF:-http://localhost:9000}" -S3_USERNAME="${S3_USERNAME_FROM_CONF:-texera_minio}" -S3_PASSWORD="${S3_PASSWORD_FROM_CONF:-password}" -STORAGE_PATH="s3://${S3_BUCKET}/iceberg/${WAREHOUSE_NAME}" - -echo "==========================================" -echo "Lakekeeper Bootstrap and Warehouse Setup" -echo "==========================================" -echo "Lakekeeper Base URI: $LAKEKEEPER_BASE_URI" -echo "Lakekeeper Binary: ${LAKEKEEPER_BINARY_PATH:-lakekeeper}" -echo "Warehouse Name: $WAREHOUSE_NAME" -echo "S3 Endpoint: $S3_ENDPOINT" -echo "S3 Bucket: $S3_BUCKET" -echo "Storage Path: $STORAGE_PATH" -echo "" - -# Function to check if Lakekeeper is running -check_lakekeeper_running() { - local health_url="${LAKEKEEPER_BASE_URI}/health" - if curl -s -f "$health_url" > /dev/null 2>&1; then - return 0 # Running - else - return 1 # Not running - fi -} - -## Function to check if MinIO bucket exists -check_minio_bucket() { - local bucket_name="$1" - local endpoint="$2" - local username="$3" - local password="$4" - - # Use AWS CLI if available (preferred method) - if command -v aws >/dev/null 2>&1; then - # Check if bucket exists using AWS CLI (set env vars inline to avoid polluting global env) - if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ - aws --endpoint-url="$endpoint" s3 ls "s3://${bucket_name}/" >/dev/null 2>&1; then - return 0 # Bucket exists - else - return 1 # Bucket doesn't exist or error - fi - else - # Fallback: Use curl to check bucket via MinIO API - # MinIO ListObjects API: GET /bucket-name?list-type=2 - local check_url="${endpoint}/${bucket_name}?list-type=2" - local http_code=$(curl -s -o /dev/null -w "%{http_code}" \ - -u "${username}:${password}" \ - "$check_url" 2>/dev/null || echo "000") - - if [ "$http_code" = "200" ]; then - return 0 # Bucket exists - else - return 1 # Bucket doesn't exist or error - fi - fi -} - -# Function to create MinIO bucket -create_minio_bucket() { - local bucket_name="$1" - local endpoint="$2" - local username="$3" - local password="$4" - - # Use AWS CLI if available (preferred method) - if command -v aws >/dev/null 2>&1; then - # Create bucket using AWS CLI (set env vars inline to avoid polluting global env) - if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ - aws --endpoint-url="$endpoint" s3 mb "s3://${bucket_name}" >/dev/null 2>&1; then - return 0 # Success - else - return 1 # Failed - fi - else - # Fallback: Use curl to create bucket via MinIO API - # MinIO MakeBucket API: PUT /bucket-name - local create_url="${endpoint}/${bucket_name}" - local http_code=$(curl -s -o /dev/null -w "%{http_code}" \ - -X PUT \ - -u "${username}:${password}" \ - "$create_url" 2>/dev/null || echo "000") - - if [ "$http_code" = "200" ]; then - return 0 # Success - else - return 1 # Failed - fi - fi -} - -# Function to start Lakekeeper -start_lakekeeper() { - export LAKEKEEPER__METRICS_PORT=9091 -# export LAKEKEEPER__PG_DATABASE_URL_READ= -# export LAKEKEEPER__PG_DATABASE_URL_WRITE= -# export LAKEKEEPER__PG_ENCRYPTION_KEY= -# local binary_path="" - - echo "Starting Lakekeeper..." - - # Check if LAKEKEEPER_BINARY_PATH is set - if [ -z "${LAKEKEEPER_BINARY_PATH:-}" ]; then - echo "⚠ Warning: LAKEKEEPER_BINARY_PATH environment variable is not set." - echo " Skipping Lakekeeper startup. Assuming it's already running or will be started separately." - return 1 - fi - - # Check if the binary file exists and is executable - if [ ! -x "$LAKEKEEPER_BINARY_PATH" ]; then - echo "⚠ Warning: Lakekeeper binary not found or not executable at '$LAKEKEEPER_BINARY_PATH'" - echo " Please ensure LAKEKEEPER_BINARY_PATH points to a valid executable file." - echo " Skipping Lakekeeper startup. Assuming it's already running or will be started separately." - return 1 - fi - - local binary_path="$LAKEKEEPER_BINARY_PATH" - - # Check required environment variables - if [ -z "$LAKEKEEPER__PG_DATABASE_URL_READ" ] || [ -z "$LAKEKEEPER__PG_DATABASE_URL_WRITE" ] || [ -z "$LAKEKEEPER__PG_ENCRYPTION_KEY" ]; then - echo "⚠ Warning: Required Lakekeeper database environment variables not set:" - echo " - LAKEKEEPER__PG_DATABASE_URL_READ" - echo " - LAKEKEEPER__PG_DATABASE_URL_WRITE" - echo " - LAKEKEEPER__PG_ENCRYPTION_KEY" - echo " Skipping Lakekeeper startup. Assuming it's already running or will be started separately." - return 1 - fi - - # Run migration first - echo "Running Lakekeeper migration..." - if ! "$binary_path" migrate; then - echo "✗ Failed to run Lakekeeper migration" - return 1 - fi - - # Start Lakekeeper in background - echo "Starting Lakekeeper server..." - nohup "$binary_path" serve > /tmp/lakekeeper.log 2>&1 & - local lakekeeper_pid=$! - echo "Lakekeeper started with PID: $lakekeeper_pid" - - # Wait for Lakekeeper to be ready - echo "Waiting for Lakekeeper to be ready..." - local max_attempts=30 - local attempt=1 - while [ $attempt -le $max_attempts ]; do - if check_lakekeeper_running; then - echo "✓ Lakekeeper is ready!" - return 0 - fi - if [ $attempt -eq $max_attempts ]; then - echo "✗ Lakekeeper did not become ready after $max_attempts attempts" - echo " Check logs at /tmp/lakekeeper.log" - return 1 - fi - echo " Waiting for Lakekeeper... ($attempt/$max_attempts)" - sleep 2 - attempt=$((attempt + 1)) - done -} - -# Function to check if warehouse exists -# Returns: 0=exists, 1=not found, 2=connection error -check_warehouse_exists() { - local warehouse_name="$1" - local base_uri="$2" - - # Get list of all warehouses and check if the name exists - # API: GET /management/v1/warehouse returns list of warehouses - local list_url="${base_uri}/management/v1/warehouse" - - echo "Checking if warehouse '$warehouse_name' exists..." - echo " URL: $list_url" - - # Get warehouse list - local temp_response - temp_response=$(mktemp) || { - echo "✗ Failed to create temporary file" - return 2 - } - - local http_code - http_code=$(curl -s -o "$temp_response" -w "%{http_code}" "$list_url" 2>/dev/null || echo "000") - echo " HTTP status: $http_code" - - if [ "$http_code" = "000" ]; then - rm -f "$temp_response" || true - echo "✗ Failed to connect to Lakekeeper at $list_url" - echo " Please ensure Lakekeeper is running and accessible." - return 2 # Connection error - fi - - if [ "$http_code" != "200" ]; then - echo "⚠ Warning: Unexpected HTTP status $http_code when listing warehouses" - echo " Response body:" - cat "$temp_response" 2>/dev/null | sed 's/^/ /' || true - rm -f "$temp_response" || true - return 1 # Treat as not found, will attempt to create - fi - - echo " Checking response for warehouse name..." - # Check if warehouse name exists in the list using jq or grep - # The response format: {"warehouses":[{"name":"...",...},...]} - if command -v jq >/dev/null 2>&1; then - echo " Using jq to parse response..." - # Use jq if available (more reliable) - if jq -e ".warehouses[] | select(.name == \"$warehouse_name\")" "$temp_response" >/dev/null 2>&1; then - echo " Warehouse found in list" - rm -f "$temp_response" 2>/dev/null || true - return 0 # Exists - else - echo " Warehouse not found in list" - rm -f "$temp_response" 2>/dev/null || true - echo " About to return 1 from check_warehouse_exists (jq path)" - return 1 # Not found - fi - else - echo " Using grep to parse response (jq not available)..." - # Fallback: use grep to check if name exists in JSON - if grep -q "\"name\"[[:space:]]*:[[:space:]]*\"$warehouse_name\"" "$temp_response" 2>/dev/null; then - echo " Warehouse found in list" - rm -f "$temp_response" || true - return 0 # Exists - else - echo " Warehouse not found in list" - rm -f "$temp_response" 2>/dev/null || true - echo " About to return 1 from check_warehouse_exists (grep path)" - return 1 # Not found - fi - fi - echo " Function check_warehouse_exists completed" -} - -# Function to create warehouse -# Returns: 0=success, 1=failure -create_warehouse() { - echo "1123" - local warehouse_name="$1" - local base_uri="$2" - local storage_path="$3" - local temp_response="$4" - - # NOTE: According to Lakekeeper 0.7.x Management API docs: - # https://docs.lakekeeper.io/docs/0.7.x/api/management/#tag/warehouse - # POST /management/v1/warehouse (singular) to create a warehouse - # Request body uses "storage-profile" with "bucket" and "key-prefix" fields - local create_url="${base_uri}/management/v1/warehouse" - - # Parse storage_path: s3://bucket/path -> bucket and key-prefix - # Example: s3://texera-iceberg/iceberg/texera-executions - # -> bucket: texera-iceberg - # -> key-prefix: iceberg/texera-executions - local bucket="${S3_BUCKET}" - local region="${S3_REGION}" - local endpoint="${S3_ENDPOINT}" - - # Request body format according to Lakekeeper API - local create_payload=$(cat < Date: Thu, 26 Feb 2026 20:58:07 -0800 Subject: [PATCH 34/45] clean code --- bin/bootstrap-lakekeeper.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh index facbc491612..1d4adc659c2 100755 --- a/bin/bootstrap-lakekeeper.sh +++ b/bin/bootstrap-lakekeeper.sh @@ -34,11 +34,13 @@ set -e # ============================================================================== # Lakekeeper binary path -LAKEKEEPER_BINARY_PATH="/Users/wangmeng/Desktop/lakekeeper-binary/lakekeeper" +LAKEKEEPER_BINARY_PATH="" # Lakekeeper PostgreSQL connection URLs -LAKEKEEPER__PG_DATABASE_URL_READ="postgres://wangmeng:@localhost:5432/texera_lakekeeper" -LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://wangmeng:@localhost:5432/texera_lakekeeper" +#(LAKEKEEPER__PG_DATABASE_URL_READ="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper" +# LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper") +LAKEKEEPER__PG_DATABASE_URL_READ="" +LAKEKEEPER__PG_DATABASE_URL_WRITE="" # Lakekeeper encryption key LAKEKEEPER__PG_ENCRYPTION_KEY="texera_key" From 4f7b4ad8c7e85bb5e5ff421ca33f789a0a537c19 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Fri, 27 Feb 2026 13:06:10 -0800 Subject: [PATCH 35/45] fmt fix --- .../apache/texera/amber/config/StorageConfig.scala | 3 ++- .../org/apache/texera/amber/util/IcebergUtil.scala | 13 ++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala index 7361ab6fecd..728e3c0c2de 100644 --- a/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala +++ b/common/config/src/main/scala/org/apache/texera/amber/config/StorageConfig.scala @@ -37,7 +37,8 @@ object StorageConfig { // Iceberg specifics val icebergCatalogType: String = conf.getString("storage.iceberg.catalog.type") val icebergRESTCatalogUri: String = conf.getString("storage.iceberg.catalog.rest.uri") - val icebergRESTCatalogWarehouseName: String = conf.getString("storage.iceberg.catalog.rest.warehouse-name") + val icebergRESTCatalogWarehouseName: String = + conf.getString("storage.iceberg.catalog.rest.warehouse-name") // Iceberg Postgres specifics val icebergPostgresCatalogUriWithoutScheme: String = diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala index 45c0cf0999d..39f010ef3fb 100644 --- a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala @@ -33,7 +33,14 @@ import org.apache.iceberg.parquet.{Parquet, ParquetValueReader} import org.apache.iceberg.rest.RESTCatalog import org.apache.iceberg.types.Type.PrimitiveType import org.apache.iceberg.types.Types -import org.apache.iceberg.{CatalogProperties, DataFile, PartitionSpec, Table, TableProperties, Schema => IcebergSchema} +import org.apache.iceberg.{ + CatalogProperties, + DataFile, + PartitionSpec, + Table, + TableProperties, + Schema => IcebergSchema +} import org.apache.iceberg.catalog.Namespace import org.apache.iceberg.exceptions.AlreadyExistsException @@ -114,9 +121,9 @@ object IcebergUtil { "s3.access-key-id" -> StorageConfig.s3Username, "s3.secret-access-key" -> StorageConfig.s3Password, "s3.region" -> StorageConfig.s3Region, - "s3.path-style-access" -> "true", + "s3.path-style-access" -> "true" ) - + catalog.initialize(catalogName, properties.asJava) catalog } From 6bed1177b83b60ca79a3e9d3b14fa16da768e0b0 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Fri, 27 Feb 2026 13:37:18 -0800 Subject: [PATCH 36/45] dependency fix --- common/workflow-core/build.sbt | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index 7920c5a24d1..4f9c37b1719 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -134,10 +134,14 @@ dependencyOverrides ++= Seq( "io.netty" % "netty-codec" % nettyVersion, "io.netty" % "netty-codec-http" % nettyVersion, "io.netty" % "netty-codec-http2" % nettyVersion, + "io.netty" % "netty-codec-socks" % nettyVersion, "io.netty" % "netty-common" % nettyVersion, "io.netty" % "netty-handler" % nettyVersion, + "io.netty" % "netty-handler-proxy" % nettyVersion, "io.netty" % "netty-resolver" % nettyVersion, "io.netty" % "netty-transport" % nettyVersion, + "io.netty" % "netty-transport-classes-epoll" % nettyVersion, + "io.netty" % "netty-transport-native-epoll" % nettyVersion, "io.netty" % "netty-transport-native-unix-common" % nettyVersion ) @@ -212,7 +216,13 @@ libraryDependencies ++= Seq( "software.amazon.awssdk" % "s3" % "2.29.51" excludeAll( ExclusionRule(organization = "io.netty") ), - "software.amazon.awssdk" % "auth" % "2.29.51", - "software.amazon.awssdk" % "regions" % "2.29.51", - "software.amazon.awssdk" % "sts" % "2.29.51", + "software.amazon.awssdk" % "auth" % "2.29.51" excludeAll( + ExclusionRule(organization = "io.netty") + ), + "software.amazon.awssdk" % "regions" % "2.29.51" excludeAll( + ExclusionRule(organization = "io.netty") + ), + "software.amazon.awssdk" % "sts" % "2.29.51" excludeAll( + ExclusionRule(organization = "io.netty") + ), ) \ No newline at end of file From 401973e96cf39427a0512d7a80dca8c340f0c0fc Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Fri, 27 Feb 2026 14:31:01 -0800 Subject: [PATCH 37/45] fmt fix --- bin/parse-storage-config.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/bin/parse-storage-config.py b/bin/parse-storage-config.py index 9a8d5b1a638..262dba45add 100755 --- a/bin/parse-storage-config.py +++ b/bin/parse-storage-config.py @@ -1,4 +1,21 @@ #!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """ Parse storage.conf HOCON file with environment variable resolution. This script properly handles HOCON syntax including environment variable substitution. From bec831c660df6df5ff6e5cd92090c9c24ac6ec47 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Fri, 27 Feb 2026 16:47:52 -0800 Subject: [PATCH 38/45] update --- .../core/storage/iceberg/test_iceberg_document.py | 12 ++++++------ .../pytexera/storage/test_large_binary_manager.py | 9 ++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py index 7da090d9bfa..3cc48da6bdf 100644 --- a/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py +++ b/amber/src/main/python/core/storage/iceberg/test_iceberg_document.py @@ -37,19 +37,19 @@ # Hardcoded storage config only for test purposes. StorageConfig.initialize( - catalog_type="postgres", + catalog_type="rest", postgres_uri_without_scheme="localhost:5432/texera_iceberg_catalog", postgres_username="texera", postgres_password="password", - rest_catalog_uri="", - rest_catalog_warehouse_name="", + rest_catalog_uri="http://localhost:8181/catalog/", + rest_catalog_warehouse_name="texera", table_result_namespace="operator-port-result", directory_path="../../../../../../amber/user-resources/workflow-results", commit_batch_size=4096, s3_endpoint="http://localhost:9000", - s3_region="us-east-1", - s3_auth_username="minioadmin", - s3_auth_password="minioadmin", + s3_region="us-west-2", + s3_auth_username="texera_minio", + s3_auth_password="password", ) diff --git a/amber/src/main/python/pytexera/storage/test_large_binary_manager.py b/amber/src/main/python/pytexera/storage/test_large_binary_manager.py index a657f244f38..82537457e69 100644 --- a/amber/src/main/python/pytexera/storage/test_large_binary_manager.py +++ b/amber/src/main/python/pytexera/storage/test_large_binary_manager.py @@ -27,16 +27,19 @@ def setup_storage_config(self): """Initialize StorageConfig for tests.""" if not StorageConfig._initialized: StorageConfig.initialize( + catalog_type="rest", postgres_uri_without_scheme="localhost:5432/test", postgres_username="test", postgres_password="test", + rest_catalog_uri="http://localhost:8181/catalog/", + rest_catalog_warehouse_name="texera", table_result_namespace="test", directory_path="/tmp/test", commit_batch_size=1000, s3_endpoint="http://localhost:9000", - s3_region="us-east-1", - s3_auth_username="minioadmin", - s3_auth_password="minioadmin", + s3_region="us-west-2", + s3_auth_username="texera_minio", + s3_auth_password="password", ) def test_get_s3_client_initializes_once(self): From d65756b4fce4276f2e84cf3e89b42a52fab0ecc0 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Fri, 27 Feb 2026 17:00:35 -0800 Subject: [PATCH 39/45] fmt fix --- .../python/core/storage/iceberg/iceberg_catalog_instance.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py index 3160ffacf4a..0059808f9f8 100644 --- a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py +++ b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py @@ -18,7 +18,10 @@ from pyiceberg.catalog import Catalog from typing import Optional -from core.storage.iceberg.iceberg_utils import create_postgres_catalog, create_rest_catalog +from core.storage.iceberg.iceberg_utils import ( + create_postgres_catalog, + create_rest_catalog, +) from core.storage.storage_config import StorageConfig From 6d9792d2c70c44ba4af7a147a1a438814645eb92 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:18:10 -0800 Subject: [PATCH 40/45] update github-action-build.yml, testing working locally --- .github/workflows/github-action-build.yml | 147 +++++++++++++++++++++- 1 file changed, 146 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml index af2a60920d5..cb341734ce2 100644 --- a/.github/workflows/github-action-build.yml +++ b/.github/workflows/github-action-build.yml @@ -97,7 +97,6 @@ jobs: POSTGRES_PASSWORD: postgres ports: - 5432:5432 - # Add a health check so steps wait until Postgres is ready options: >- --health-cmd="pg_isready -U postgres" --health-interval=10s @@ -129,20 +128,97 @@ jobs: psql -h localhost -U postgres -f sql/texera_ddl.sql psql -h localhost -U postgres -f sql/iceberg_postgres_catalog.sql psql -h localhost -U postgres -f sql/texera_lakefs.sql + psql -h localhost -U postgres -f sql/texera_lakekeeper.sql env: PGPASSWORD: postgres - name: Create texera_db_for_test_cases run: psql -h localhost -U postgres -v DB_NAME=texera_db_for_test_cases -f sql/texera_ddl.sql env: PGPASSWORD: postgres + - name: Start MinIO + run: | + docker run -d --name minio --network host \ + -e MINIO_ROOT_USER=texera_minio \ + -e MINIO_ROOT_PASSWORD=password \ + minio/minio:RELEASE.2025-02-28T09-55-16Z server /data + + for i in $(seq 1 30); do + curl -sf http://localhost:9000/minio/health/live && break + echo "Waiting for MinIO... (attempt $i)" + sleep 2 + done + - name: Start Lakekeeper + run: | + docker run --rm --network host \ + -e LAKEKEEPER__PG_DATABASE_URL_READ=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key \ + vakamo/lakekeeper:v0.11.0 migrate + + docker run -d --name lakekeeper --network host \ + -e LAKEKEEPER__PG_DATABASE_URL_READ=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key \ + vakamo/lakekeeper:v0.11.0 serve + + for i in $(seq 1 30); do + docker exec lakekeeper /home/nonroot/lakekeeper healthcheck && break + echo "Waiting for Lakekeeper to be ready... (attempt $i)" + sleep 2 + done + - name: Initialize Lakekeeper Warehouse + run: | + docker run --rm --network host --entrypoint sh minio/mc -c \ + "mc alias set minio http://localhost:9000 texera_minio password && \ + mc mb --ignore-existing minio/texera-iceberg" + + curl -sf -X POST -H 'Content-Type: application/json' \ + -d '{"project-id":"00000000-0000-0000-0000-000000000000","project-name":"default"}' \ + http://localhost:8181/management/v1/project || true + + curl -sf -X POST -H 'Content-Type: application/json' -d '{ + "warehouse-name": "texera", + "project-id": "00000000-0000-0000-0000-000000000000", + "storage-profile": { + "type": "s3", + "bucket": "texera-iceberg", + "region": "us-west-2", + "endpoint": "http://localhost:9000", + "flavor": "s3-compat", + "path-style-access": true, + "sts-enabled": false + }, + "storage-credential": { + "type": "s3", + "credential-type": "access-key", + "aws-access-key-id": "texera_minio", + "aws-secret-access-key": "password" + } + }' http://localhost:8181/management/v1/warehouse - name: Compile with sbt run: sbt clean package + env: + STORAGE_ICEBERG_CATALOG_TYPE: rest + STORAGE_ICEBERG_CATALOG_REST_URI: http://localhost:8181/catalog/ + STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME: texera + STORAGE_S3_ENDPOINT: http://localhost:9000 + STORAGE_S3_REGION: us-west-2 + STORAGE_S3_AUTH_USERNAME: texera_minio + STORAGE_S3_AUTH_PASSWORD: password - name: Set docker-java API version run: | echo "api.version=1.52" >> ~/.docker-java.properties cat ~/.docker-java.properties - name: Run backend tests run: sbt test + env: + STORAGE_ICEBERG_CATALOG_TYPE: rest + STORAGE_ICEBERG_CATALOG_REST_URI: http://localhost:8181/catalog/ + STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME: texera + STORAGE_S3_ENDPOINT: http://localhost:9000 + STORAGE_S3_REGION: us-west-2 + STORAGE_S3_AUTH_USERNAME: texera_minio + STORAGE_S3_AUTH_PASSWORD: password python: strategy: @@ -166,9 +242,78 @@ jobs: run: sudo apt-get update && sudo apt-get install -y postgresql - name: Start PostgreSQL Service run: sudo systemctl start postgresql + - name: Configure PostgreSQL for TCP password auth + run: | + sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'postgres';" + PG_HBA=$(sudo -u postgres psql -t -c "SHOW hba_file;" | xargs) + sudo sed -i 's/local\s\+all\s\+all\s\+peer/local all all md5/' "$PG_HBA" + echo "host all all 127.0.0.1/32 md5" | sudo tee -a "$PG_HBA" + echo "host all all ::1/128 md5" | sudo tee -a "$PG_HBA" + sudo systemctl restart postgresql - name: Create Database and User run: | cd sql && sudo -u postgres psql -f iceberg_postgres_catalog.sql + cd sql && sudo -u postgres psql -f texera_lakekeeper.sql + - name: Start MinIO + run: | + docker run -d --name minio --network host \ + -e MINIO_ROOT_USER=texera_minio \ + -e MINIO_ROOT_PASSWORD=password \ + minio/minio:RELEASE.2025-02-28T09-55-16Z server /data + + for i in $(seq 1 30); do + curl -sf http://localhost:9000/minio/health/live && break + echo "Waiting for MinIO... (attempt $i)" + sleep 2 + done + - name: Start Lakekeeper + run: | + docker run --rm --network host \ + -e LAKEKEEPER__PG_DATABASE_URL_READ=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key \ + vakamo/lakekeeper:v0.11.0 migrate + + docker run -d --name lakekeeper --network host \ + -e LAKEKEEPER__PG_DATABASE_URL_READ=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ + -e LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key \ + vakamo/lakekeeper:v0.11.0 serve + + for i in $(seq 1 30); do + docker exec lakekeeper /home/nonroot/lakekeeper healthcheck && break + echo "Waiting for Lakekeeper to be ready... (attempt $i)" + sleep 2 + done + - name: Initialize Lakekeeper Warehouse + run: | + docker run --rm --network host --entrypoint sh minio/mc -c \ + "mc alias set minio http://localhost:9000 texera_minio password && \ + mc mb --ignore-existing minio/texera-iceberg" + + curl -sf -X POST -H 'Content-Type: application/json' \ + -d '{"project-id":"00000000-0000-0000-0000-000000000000","project-name":"default"}' \ + http://localhost:8181/management/v1/project || true + + curl -sf -X POST -H 'Content-Type: application/json' -d '{ + "warehouse-name": "texera", + "project-id": "00000000-0000-0000-0000-000000000000", + "storage-profile": { + "type": "s3", + "bucket": "texera-iceberg", + "region": "us-west-2", + "endpoint": "http://localhost:9000", + "flavor": "s3-compat", + "path-style-access": true, + "sts-enabled": false + }, + "storage-credential": { + "type": "s3", + "credential-type": "access-key", + "aws-access-key-id": "texera_minio", + "aws-secret-access-key": "password" + } + }' http://localhost:8181/management/v1/warehouse - name: Lint with Ruff run: | cd amber/src/main/python && ruff check . && ruff format --check . From 86273e8df82743a2e8b93c26895a37af803958d4 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:29:49 -0800 Subject: [PATCH 41/45] update github-action-build.yml, testing working locally --- .github/workflows/github-action-build.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml index cb341734ce2..725b30f0760 100644 --- a/.github/workflows/github-action-build.yml +++ b/.github/workflows/github-action-build.yml @@ -166,6 +166,13 @@ jobs: echo "Waiting for Lakekeeper to be ready... (attempt $i)" sleep 2 done + + # Final check - fail with logs if Lakekeeper didn't start + docker exec lakekeeper /home/nonroot/lakekeeper healthcheck || { + echo "Lakekeeper failed to start. Container logs:" + docker logs lakekeeper + exit 1 + } - name: Initialize Lakekeeper Warehouse run: | docker run --rm --network host --entrypoint sh minio/mc -c \ @@ -285,6 +292,13 @@ jobs: echo "Waiting for Lakekeeper to be ready... (attempt $i)" sleep 2 done + + # Final check - fail with logs if Lakekeeper didn't start + docker exec lakekeeper /home/nonroot/lakekeeper healthcheck || { + echo "Lakekeeper failed to start. Container logs:" + docker logs lakekeeper + exit 1 + } - name: Initialize Lakekeeper Warehouse run: | docker run --rm --network host --entrypoint sh minio/mc -c \ From b649f1948e0525fcbf36a87b36434d27ef97eec2 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:36:47 -0800 Subject: [PATCH 42/45] update github-action-build.yml, testing working locally --- .github/workflows/github-action-build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml index 725b30f0760..9118f26c10a 100644 --- a/.github/workflows/github-action-build.yml +++ b/.github/workflows/github-action-build.yml @@ -159,6 +159,7 @@ jobs: -e LAKEKEEPER__PG_DATABASE_URL_READ=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ -e LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key \ + -e LAKEKEEPER__METRICS_PORT=9091 \ vakamo/lakekeeper:v0.11.0 serve for i in $(seq 1 30); do @@ -285,6 +286,7 @@ jobs: -e LAKEKEEPER__PG_DATABASE_URL_READ=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ -e LAKEKEEPER__PG_DATABASE_URL_WRITE=postgres://postgres:postgres@localhost:5432/texera_lakekeeper \ -e LAKEKEEPER__PG_ENCRYPTION_KEY=texera_key \ + -e LAKEKEEPER__METRICS_PORT=9091 \ vakamo/lakekeeper:v0.11.0 serve for i in $(seq 1 30); do From db9c8b7721f4238332a16ee7b0dd0b590a4866cb Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Mar 2026 14:12:39 -0800 Subject: [PATCH 43/45] update github-action-build.yml, testing working locally --- .github/workflows/github-action-build.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml index 9118f26c10a..0bbe6fd83d7 100644 --- a/.github/workflows/github-action-build.yml +++ b/.github/workflows/github-action-build.yml @@ -260,8 +260,9 @@ jobs: sudo systemctl restart postgresql - name: Create Database and User run: | - cd sql && sudo -u postgres psql -f iceberg_postgres_catalog.sql - cd sql && sudo -u postgres psql -f texera_lakekeeper.sql + cd sql + sudo -u postgres psql -f iceberg_postgres_catalog.sql + sudo -u postgres psql -f texera_lakekeeper.sql - name: Start MinIO run: | docker run -d --name minio --network host \ From 8be851fb45aa7da32e0ca5ec66073917d7b2175e Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:26:30 -0800 Subject: [PATCH 44/45] update github-action-build.yml, testing working locally --- common/workflow-core/build.sbt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index 4f9c37b1719..4ff5b979aa2 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -37,6 +37,10 @@ ThisBuild / conflictManager := ConflictManager.latestRevision // Restrict parallel execution of tests to avoid conflicts Global / concurrentRestrictions += Tags.limit(Tags.Test, 1) +// Fork a separate JVM for tests to avoid sbt classloader conflicts +// (iceberg-aws S3FileIO hits ClassCastException with layered classloaders) +Test / fork := true + ///////////////////////////////////////////////////////////////////////////// // Compiler Options From 32d575b305f1d8c2f8a43b27dcab0141ddcc0ad5 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 3 Mar 2026 13:15:10 -0800 Subject: [PATCH 45/45] update github-action-build.yml, testing working locally --- common/workflow-core/build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/common/workflow-core/build.sbt b/common/workflow-core/build.sbt index 4ff5b979aa2..71900821e93 100644 --- a/common/workflow-core/build.sbt +++ b/common/workflow-core/build.sbt @@ -40,6 +40,7 @@ Global / concurrentRestrictions += Tags.limit(Tags.Test, 1) // Fork a separate JVM for tests to avoid sbt classloader conflicts // (iceberg-aws S3FileIO hits ClassCastException with layered classloaders) Test / fork := true +Test / baseDirectory := (ThisBuild / baseDirectory).value /////////////////////////////////////////////////////////////////////////////