Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,43 @@ jobs:
- name: Generate coverage report (85%) # Coverage threshold should only increase over time — never decrease it!
run: COVERAGE_FAIL_UNDER=85 make coverage-report

build-docker-images:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Build and cache Spark image
uses: docker/build-push-action@v6
with:
context: dev/spark
tags: pyiceberg-spark:latest
cache-from: type=gha,scope=spark
cache-to: type=gha,mode=max,scope=spark
outputs: type=docker,dest=/tmp/spark-image.tar

- name: Build and cache Hive image
uses: docker/build-push-action@v6
with:
context: dev/hive
tags: pyiceberg-hive:latest
cache-from: type=gha,scope=hive
cache-to: type=gha,mode=max,scope=hive
outputs: type=docker,dest=/tmp/hive-image.tar

- name: Upload Docker images
uses: actions/upload-artifact@v4
with:
name: docker-images
path: /tmp/*-image.tar
retention-days: 1
compression-level: 0

integration-test:
runs-on: ubuntu-latest
needs: build-docker-images
strategy:
matrix:
python: ['3.10', '3.11', '3.12', '3.13']
Expand All @@ -85,6 +120,17 @@ jobs:
- name: Install
run: make install

- name: Download Docker images
uses: actions/download-artifact@v4
with:
name: docker-images
path: /tmp

- name: Load Docker images
run: |
docker load -i /tmp/spark-image.tar
docker load -i /tmp/hive-image.tar

- name: Run integration tests with coverage
run: COVERAGE=1 make test-integration
- name: Show debug logs
Expand Down
12 changes: 10 additions & 2 deletions dev/docker-compose-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
services:
spark-iceberg:
container_name: pyiceberg-spark
build: spark/
image: pyiceberg-spark:latest
build:
context: spark/
cache_from:
- pyiceberg-spark:latest
networks:
iceberg_net:
depends_on:
Expand Down Expand Up @@ -91,7 +95,11 @@ services:
tail -f /dev/null
"
hive:
build: hive/
image: pyiceberg-hive:latest
build:
context: hive/
cache_from:
- pyiceberg-hive:latest
container_name: pyiceberg-hive
hostname: hive
networks:
Expand Down
25 changes: 18 additions & 7 deletions dev/hive/Dockerfile
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same logic as before — the steps are only reordered so the slow JAR-download layer caches better.

Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,28 @@

FROM apache/hive:4.0.0

ENV HADOOP_VERSION=3.3.6
ENV AWS_SDK_BUNDLE=1.12.753
# Dependency versions - changing these invalidates the JAR download layer
ARG HADOOP_VERSION=3.3.6
ARG AWS_SDK_BUNDLE=1.12.753
ARG MAVEN_MIRROR=https://repo1.maven.org/maven2

USER root

# Install curl, download JARs, and cleanup in a single layer
RUN apt-get update -qq && apt-get -qq -y install curl && \
curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Install curl (separate layer - rarely changes)
RUN apt-get update -qq && \
apt-get -qq -y install --no-install-recommends curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Download JARs with retry logic (slow layer - only changes when versions change)
RUN curl -fsSL --retry 3 --retry-delay 5 \
-o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar && \
curl -fsSL --retry 3 --retry-delay 5 \
-o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar

# Copy configuration last (changes more frequently than JARs)
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml

USER hive
28 changes: 14 additions & 14 deletions dev/spark/Dockerfile
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same logic as before — the steps are only reordered so the slow JAR-download layer caches better.

Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}

# Dependency versions - keep these compatible
# Changing these will invalidate the JAR download cache layer
ARG ICEBERG_VERSION=1.10.1
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
ARG HADOOP_VERSION=3.4.1
ARG SCALA_VERSION=2.13
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ARG is not referenced anywhere else in the Dockerfile, so it is removed.

ARG AWS_SDK_VERSION=2.24.6
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2

Expand All @@ -31,26 +31,23 @@ WORKDIR ${SPARK_HOME}
# Install curl for JAR downloads
RUN apt-get update && \
apt-get install -y --no-install-recommends curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Copy configuration (early for better caching)
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

# Create event log directory
# Create directories (separate layer)
RUN mkdir -p /home/iceberg/spark-events && \
chown -R spark:spark /home/iceberg

# Required JAR dependencies
ENV JARS_TO_DOWNLOAD="\
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"

# Download JARs with retry logic
# Download JARs with retry logic (most cacheable - only changes when versions change)
# This is the slowest step, so we do it before copying config files
RUN set -e && \
cd "${SPARK_HOME}/jars" && \
for jar_path in ${JARS_TO_DOWNLOAD}; do \
for jar_path in \
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
do \
jar_name=$(basename "${jar_path}") && \
echo "Downloading ${jar_name}..." && \
curl -fsSL --retry 3 --retry-delay 5 \
Expand All @@ -60,6 +57,9 @@ RUN set -e && \
done && \
chown -R spark:spark "${SPARK_HOME}/jars"

# Copy configuration last (changes more frequently than JARs)
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

USER spark
WORKDIR ${SPARK_HOME}

Expand Down