diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml
index 482468a9bf..03f5684ce4 100644
--- a/dev/docker-compose-integration.yml
+++ b/dev/docker-compose-integration.yml
@@ -17,21 +17,22 @@ services:
   spark-iceberg:
-    container_name: pyiceberg-spark
+    image: pyiceberg-spark:latest
     build: spark/
+    container_name: pyiceberg-spark
     networks:
       iceberg_net:
     depends_on:
       - rest
       - hive
       - minio
+    ports:
+      - 15002:15002 # Spark Connect
+      - 4040:4040 # Spark UI
     environment:
       - AWS_ACCESS_KEY_ID=admin
       - AWS_SECRET_ACCESS_KEY=password
       - AWS_REGION=us-east-1
-    ports:
-      - 15002:15002 # Spark Connect
-      - 4040:4040 # Spark UI
     links:
       - rest:rest
       - hive:hive
@@ -60,10 +61,6 @@ services:
   minio:
     image: minio/minio
     container_name: pyiceberg-minio
-    environment:
-      - MINIO_ROOT_USER=admin
-      - MINIO_ROOT_PASSWORD=password
-      - MINIO_DOMAIN=minio
     networks:
       iceberg_net:
         aliases:
@@ -71,14 +68,18 @@ services:
     ports:
       - 9001:9001
       - 9000:9000
+    environment:
+      - MINIO_ROOT_USER=admin
+      - MINIO_ROOT_PASSWORD=password
+      - MINIO_DOMAIN=minio
     command: ["server", "/data", "--console-address", ":9001"]
   mc:
-    depends_on:
-      - minio
     image: minio/mc
     container_name: pyiceberg-mc
     networks:
       iceberg_net:
+    depends_on:
+      - minio
     environment:
       - AWS_ACCESS_KEY_ID=admin
       - AWS_SECRET_ACCESS_KEY=password
@@ -91,6 +92,7 @@ services:
       tail -f /dev/null
       "
   hive:
+    image: pyiceberg-hive:latest
     build: hive/
     container_name: pyiceberg-hive
     hostname: hive
diff --git a/dev/hive/Dockerfile b/dev/hive/Dockerfile
index 2c87b69ee1..e46a035727 100644
--- a/dev/hive/Dockerfile
+++ b/dev/hive/Dockerfile
@@ -15,17 +15,28 @@
 FROM apache/hive:4.0.0
 
-ENV HADOOP_VERSION=3.3.6
-ENV AWS_SDK_BUNDLE=1.12.753
+# Dependency versions - changing these invalidates the JAR download layer
+ARG HADOOP_VERSION=3.3.6
+ARG AWS_SDK_BUNDLE=1.12.753
+ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
 
 USER root
 
-# Install curl, download JARs, and cleanup in a single layer
-RUN apt-get update -qq && apt-get -qq -y install curl && \
-    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
-    curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+# Install curl (separate layer - rarely changes)
+RUN apt-get update -qq && \
+    apt-get -qq -y install --no-install-recommends curl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
+# Download JARs with retry logic (slow layer - only changes when versions change)
+RUN curl -fsSL --retry 3 --retry-delay 5 \
+      -o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
+      "${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" && \
+    curl -fsSL --retry 3 --retry-delay 5 \
+      -o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
+      "${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar"
+
+# Copy configuration last (changes more frequently than JARs)
 COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
 
 USER hive
diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile
index 4b486c9001..0e1f29d152 100644
--- a/dev/spark/Dockerfile
+++ b/dev/spark/Dockerfile
@@ -18,50 +18,46 @@
 ARG BASE_IMAGE_SPARK_VERSION=4.0.1
 FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
 
 # Dependency versions - keep these compatible
+# Changing these will invalidate the JAR download cache layer
 ARG ICEBERG_VERSION=1.10.1
 ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
 ARG HADOOP_VERSION=3.4.1
-ARG SCALA_VERSION=2.13
 ARG AWS_SDK_VERSION=2.24.6
 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
 
 USER root
 WORKDIR ${SPARK_HOME}
 
-# Install curl for JAR downloads
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends curl && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy configuration (early for better caching)
-COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
-
-# Create event log directory
-RUN mkdir -p /home/iceberg/spark-events && \
+# Install curl and create directories
+RUN apt-get update -qq && \
+    apt-get install -qq -y --no-install-recommends curl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /home/iceberg/spark-events && \
     chown -R spark:spark /home/iceberg
 
-# Required JAR dependencies
-ENV JARS_TO_DOWNLOAD="\
-    org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
-    org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
-    org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
-    software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
-
-# Download JARs with retry logic
+# Download JARs with retry logic (most cacheable - only changes when versions change)
+# This is the slowest step, so we do it before copying config files
 RUN set -e && \
     cd "${SPARK_HOME}/jars" && \
-    for jar_path in ${JARS_TO_DOWNLOAD}; do \
+    for jar_path in \
+      "org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
+      "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
+      "org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
+      "software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
+    do \
       jar_name=$(basename "${jar_path}") && \
-      echo "Downloading ${jar_name}..." && \
       curl -fsSL --retry 3 --retry-delay 5 \
        -o "${jar_name}" \
        "${MAVEN_MIRROR}/${jar_path}" && \
-      echo "✓ Downloaded ${jar_name}"; \
-    done && \
-    chown -R spark:spark "${SPARK_HOME}/jars"
+      chown spark:spark "${jar_name}"; \
+    done
+
+# Copy configuration last (changes more frequently than JARs)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
 
 USER spark
 WORKDIR ${SPARK_HOME}
 
 # Start Spark Connect server
-CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
+CMD ["bash", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
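
A minimal verification sketch (not taken from the diff; it assumes the compose file and Dockerfiles sit under dev/ as shown above and uses standard Docker Compose commands):

    # Rebuild the spark-iceberg and hive images and bring up the integration stack
    docker compose -f dev/docker-compose-integration.yml build
    docker compose -f dev/docker-compose-integration.yml up -d

    # The MAVEN_MIRROR build arg added in both Dockerfiles can be overridden at build
    # time, e.g. to point JAR downloads at a closer mirror (URL here is illustrative)
    docker compose -f dev/docker-compose-integration.yml build \
      --build-arg MAVEN_MIRROR=https://repo.maven.apache.org/maven2

Because the JAR downloads now sit in their own layer ahead of the config COPY steps, editing spark-defaults.conf or core-site.xml and rebuilding should reuse the cached download layers rather than re-fetching the JARs.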