diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
index e7299d3fe8..68e581c3d8 100644
--- a/.github/workflows/python-ci.yml
+++ b/.github/workflows/python-ci.yml
@@ -67,8 +67,43 @@ jobs:
       - name: Generate coverage report (85%)
         # Coverage threshold should only increase over time — never decrease it!
         run: COVERAGE_FAIL_UNDER=85 make coverage-report
+  build-docker-images:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and cache Spark image
+        uses: docker/build-push-action@v6
+        with:
+          context: dev/spark
+          tags: pyiceberg-spark:latest
+          cache-from: type=gha,scope=spark
+          cache-to: type=gha,mode=max,scope=spark
+          outputs: type=docker,dest=/tmp/spark-image.tar
+
+      - name: Build and cache Hive image
+        uses: docker/build-push-action@v6
+        with:
+          context: dev/hive
+          tags: pyiceberg-hive:latest
+          cache-from: type=gha,scope=hive
+          cache-to: type=gha,mode=max,scope=hive
+          outputs: type=docker,dest=/tmp/hive-image.tar
+
+      - name: Upload Docker images
+        uses: actions/upload-artifact@v4
+        with:
+          name: docker-images
+          path: /tmp/*-image.tar
+          retention-days: 1
+          compression-level: 0
+
   integration-test:
     runs-on: ubuntu-latest
+    needs: build-docker-images
     strategy:
       matrix:
         python: ['3.10', '3.11', '3.12', '3.13']
@@ -85,6 +120,17 @@ jobs:
 
       - name: Install
         run: make install
+      - name: Download Docker images
+        uses: actions/download-artifact@v4
+        with:
+          name: docker-images
+          path: /tmp
+
+      - name: Load Docker images
+        run: |
+          docker load -i /tmp/spark-image.tar
+          docker load -i /tmp/hive-image.tar
+
       - name: Run integration tests with coverage
         run: COVERAGE=1 make test-integration
       - name: Show debug logs
diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml
index 482468a9bf..a111e9f77e 100644
--- a/dev/docker-compose-integration.yml
+++ b/dev/docker-compose-integration.yml
@@ -18,7 +18,11 @@ services:
 
   spark-iceberg:
     container_name: pyiceberg-spark
-    build: spark/
+    image: pyiceberg-spark:latest
+    build:
+      context: spark/
+      cache_from:
+        - pyiceberg-spark:latest
     networks:
       iceberg_net:
     depends_on:
@@ -91,7 +95,11 @@
       tail -f /dev/null
       "
   hive:
-    build: hive/
+    image: pyiceberg-hive:latest
+    build:
+      context: hive/
+      cache_from:
+        - pyiceberg-hive:latest
     container_name: pyiceberg-hive
     hostname: hive
     networks:
diff --git a/dev/hive/Dockerfile b/dev/hive/Dockerfile
index 2c87b69ee1..69f9555ab9 100644
--- a/dev/hive/Dockerfile
+++ b/dev/hive/Dockerfile
@@ -15,17 +15,28 @@
 
 FROM apache/hive:4.0.0
 
-ENV HADOOP_VERSION=3.3.6
-ENV AWS_SDK_BUNDLE=1.12.753
+# Dependency versions - changing these invalidates the JAR download layer
+ARG HADOOP_VERSION=3.3.6
+ARG AWS_SDK_BUNDLE=1.12.753
+ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
 
 USER root
 
-# Install curl, download JARs, and cleanup in a single layer
-RUN apt-get update -qq && apt-get -qq -y install curl && \
-    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
-    curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+# Install curl (separate layer - rarely changes)
+RUN apt-get update -qq && \
+    apt-get -qq -y install --no-install-recommends curl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Download JARs with retry logic (slow layer - only changes when versions change)
+RUN curl -fsSL --retry 3 --retry-delay 5 \
+      -o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
+      ${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar && \
+    curl -fsSL --retry 3 --retry-delay 5 \
+      -o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
+      ${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar
+
+# Copy configuration last (changes more frequently than JARs)
 
 COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
 
 USER hive
diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile
index 4b486c9001..98b84cf5e8 100644
--- a/dev/spark/Dockerfile
+++ b/dev/spark/Dockerfile
@@ -18,10 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
 FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
 
 # Dependency versions - keep these compatible
+# Changing these will invalidate the JAR download cache layer
 ARG ICEBERG_VERSION=1.10.1
 ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
 ARG HADOOP_VERSION=3.4.1
-ARG SCALA_VERSION=2.13
 ARG AWS_SDK_VERSION=2.24.6
 ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
 
@@ -31,26 +31,23 @@ WORKDIR ${SPARK_HOME}
 # Install curl for JAR downloads
 RUN apt-get update && \
     apt-get install -y --no-install-recommends curl && \
+    apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Copy configuration (early for better caching)
-COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
-
-# Create event log directory
+# Create directories (separate layer)
 RUN mkdir -p /home/iceberg/spark-events && \
     chown -R spark:spark /home/iceberg
 
-# Required JAR dependencies
-ENV JARS_TO_DOWNLOAD="\
-    org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
-    org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
-    org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
-    software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
-
-# Download JARs with retry logic
+# Download JARs with retry logic (most cacheable - only changes when versions change)
+# This is the slowest step, so we do it before copying config files
 RUN set -e && \
     cd "${SPARK_HOME}/jars" && \
-    for jar_path in ${JARS_TO_DOWNLOAD}; do \
+    for jar_path in \
+      "org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
+      "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
+      "org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
+      "software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
+    do \
       jar_name=$(basename "${jar_path}") && \
       echo "Downloading ${jar_name}..." && \
       curl -fsSL --retry 3 --retry-delay 5 \
@@ -60,6 +57,9 @@ RUN set -e && \
     done && \
     chown -R spark:spark "${SPARK_HOME}/jars"
 
+# Copy configuration last (changes more frequently than JARs)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
+
 USER spark
 
 WORKDIR ${SPARK_HOME}
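
A minimal way to sanity-check the new layer ordering locally, assuming a BuildKit-enabled Docker and the dev/ build contexts introduced above; the bumped HADOOP_VERSION value below is illustrative only, not a real upgrade:

    # Build twice: the second run should report CACHED for both the
    # curl-install layer and the JAR-download layer.
    docker build -t pyiceberg-hive:latest dev/hive
    docker build -t pyiceberg-hive:latest dev/hive

    # Bumping a version ARG re-runs only the JAR-download layer; the
    # curl-install layer stays cached because it does not reference the ARG.
    docker build --build-arg HADOOP_VERSION=3.3.7 -t pyiceberg-hive:latest dev/hive

    # Editing core-site.xml (or spark-defaults.conf for the Spark image)
    # should rebuild only the final COPY layer.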