diff --git a/.github/workflows/build-lib_array_morph-and-pypi-package.yaml b/.github/workflows/build.yaml similarity index 60% rename from .github/workflows/build-lib_array_morph-and-pypi-package.yaml rename to .github/workflows/build.yaml index 2e86554..1faaf57 100644 --- a/.github/workflows/build-lib_array_morph-and-pypi-package.yaml +++ b/.github/workflows/build.yaml @@ -1,6 +1,8 @@ name: Build, Test, and Publish on: + push: + branches: [main] pull_request: branches: [main] release: @@ -16,21 +18,93 @@ jobs: fail-fast: false matrix: include: - # ── Linux x86_64 (manylinux_2_28 container) ── - - { os: linux, arch: x86_64, runner: ubuntu-latest, container: "quay.io/pypa/manylinux_2_28_x86_64", python: "3.9" } - - { os: linux, arch: x86_64, runner: ubuntu-latest, container: "quay.io/pypa/manylinux_2_28_x86_64", python: "3.10" } - - { os: linux, arch: x86_64, runner: ubuntu-latest, container: "quay.io/pypa/manylinux_2_28_x86_64", python: "3.11" } - - { os: linux, arch: x86_64, runner: ubuntu-latest, container: "quay.io/pypa/manylinux_2_28_x86_64", python: "3.12" } - - { os: linux, arch: x86_64, runner: ubuntu-latest, container: "quay.io/pypa/manylinux_2_28_x86_64", python: "3.13" } - - { os: linux, arch: x86_64, runner: ubuntu-latest, container: "quay.io/pypa/manylinux_2_28_x86_64", python: "3.14" } - # ── Linux aarch64 (manylinux_2_28 container) ── - - { os: linux, arch: aarch64, runner: ubuntu-24.04-arm, container: "quay.io/pypa/manylinux_2_28_aarch64", python: "3.9" } - - { os: linux, arch: aarch64, runner: ubuntu-24.04-arm, container: "quay.io/pypa/manylinux_2_28_aarch64", python: "3.10" } - - { os: linux, arch: aarch64, runner: ubuntu-24.04-arm, container: "quay.io/pypa/manylinux_2_28_aarch64", python: "3.11" } - - { os: linux, arch: aarch64, runner: ubuntu-24.04-arm, container: "quay.io/pypa/manylinux_2_28_aarch64", python: "3.12" } - - { os: linux, arch: aarch64, runner: ubuntu-24.04-arm, container: "quay.io/pypa/manylinux_2_28_aarch64", python: "3.13" } - - 
{ os: linux, arch: aarch64, runner: ubuntu-24.04-arm, container: "quay.io/pypa/manylinux_2_28_aarch64", python: "3.14" } - # ── macOS arm64 ── + # Linux x86_64 (manylinux_2_28 container) + - { + os: linux, + arch: x86_64, + runner: ubuntu-latest, + container: "quay.io/pypa/manylinux_2_28_x86_64", + python: "3.9", + } + - { + os: linux, + arch: x86_64, + runner: ubuntu-latest, + container: "quay.io/pypa/manylinux_2_28_x86_64", + python: "3.10", + } + - { + os: linux, + arch: x86_64, + runner: ubuntu-latest, + container: "quay.io/pypa/manylinux_2_28_x86_64", + python: "3.11", + } + - { + os: linux, + arch: x86_64, + runner: ubuntu-latest, + container: "quay.io/pypa/manylinux_2_28_x86_64", + python: "3.12", + } + - { + os: linux, + arch: x86_64, + runner: ubuntu-latest, + container: "quay.io/pypa/manylinux_2_28_x86_64", + python: "3.13", + } + - { + os: linux, + arch: x86_64, + runner: ubuntu-latest, + container: "quay.io/pypa/manylinux_2_28_x86_64", + python: "3.14", + } + # Linux aarch64 (manylinux_2_28 container) + - { + os: linux, + arch: aarch64, + runner: ubuntu-24.04-arm, + container: "quay.io/pypa/manylinux_2_28_aarch64", + python: "3.9", + } + - { + os: linux, + arch: aarch64, + runner: ubuntu-24.04-arm, + container: "quay.io/pypa/manylinux_2_28_aarch64", + python: "3.10", + } + - { + os: linux, + arch: aarch64, + runner: ubuntu-24.04-arm, + container: "quay.io/pypa/manylinux_2_28_aarch64", + python: "3.11", + } + - { + os: linux, + arch: aarch64, + runner: ubuntu-24.04-arm, + container: "quay.io/pypa/manylinux_2_28_aarch64", + python: "3.12", + } + - { + os: linux, + arch: aarch64, + runner: ubuntu-24.04-arm, + container: "quay.io/pypa/manylinux_2_28_aarch64", + python: "3.13", + } + - { + os: linux, + arch: aarch64, + runner: ubuntu-24.04-arm, + container: "quay.io/pypa/manylinux_2_28_aarch64", + python: "3.14", + } + # macOS arm64 - { os: macos, arch: arm64, runner: macos-latest, python: "3.9" } - { os: macos, arch: arm64, runner: macos-latest, python: 
"3.10" } - { os: macos, arch: arm64, runner: macos-latest, python: "3.11" } @@ -42,142 +116,84 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - fetch-depth: 0 # setuptools-scm needs full history + fetch-depth: 0 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + version: "0.10.6" + + - name: Set up Python + run: uv python install ${{ matrix.python }} # ────────────────────────────────────────────── - # 1. Python + system deps + # 1. System deps # ────────────────────────────────────────────── - # manylinux containers have Python pre-installed at /opt/python/ - - name: Select Python (Linux container) + - name: Install system deps (Linux manylinux) if: runner.os == 'Linux' run: | - # Map matrix python version to manylinux cpython path - PY_VER="${{ matrix.python }}" - PY_TAG="cp${PY_VER/./}" # "3.12" → "cp312" - - # Find the matching Python in /opt/python/ - PY_DIR=$(ls -d /opt/python/${PY_TAG}-*/bin | head -1) - if [ -z "$PY_DIR" ]; then - echo "ERROR: Python $PY_VER not found in manylinux container" - ls /opt/python/ - exit 1 - fi - - echo "$PY_DIR" >> $GITHUB_PATH - echo "Using Python from: $PY_DIR" - $PY_DIR/python --version - - - name: Install system deps (Linux container) + yum -y install \ + git curl ca-certificates \ + zip unzip tar \ + cmake ninja-build pkgconfig \ + gcc gcc-c++ make \ + perl perl-IPC-Cmd perl-ExtUtils-MakeMaker \ + kernel-headers + + - name: Sanity check compilers if: runner.os == 'Linux' run: | - # manylinux_2_28 is AlmaLinux 8 — uses yum - yum install -y \ - cmake ninja-build \ - pkgconfig \ - libX11-devel libXext-devel libXrender-devel \ - libXrandr-devel libXinerama-devel libXcursor-devel \ - libXcomposite-devel libXdamage-devel libXfixes-devel \ - libXi-devel libXtst-devel libXScrnSaver-devel \ - libxcb-devel xcb-util-devel \ - libXau-devel libXdmcp-devel \ - mesa-libGL-devel \ - alsa-lib-devel \ - uuid-devel \ - perl-IPC-Cmd # needed by some Conan builds (e.g. 
OpenSSL) - - - name: Install uv (macOS) - if: runner.os == 'macOS' - uses: astral-sh/setup-uv@v7 - with: - version: "0.10.6" - - - name: Install uv (Linux container) - if: runner.os == 'Linux' - run: | - curl -LsSf https://astral.sh/uv/0.10.6/install.sh | sh - echo "$HOME/.local/bin" >> $GITHUB_PATH - - - name: Set up Python (macOS) - if: runner.os == 'macOS' - run: uv python install ${{ matrix.python }} + gcc --version + g++ --version + perl -MIPC::Cmd -e 'print "IPC::Cmd OK\n"' - name: Install system deps (macOS) if: runner.os == 'macOS' - run: brew install ninja cmake + run: brew install ninja cmake curl openssl - - name: Create venv + install Python tools + - name: Install Python tools run: | - if [ "${{ runner.os }}" = "macOS" ]; then - uv venv --python ${{ matrix.python }} - else - # Use the manylinux container's Python - python -m venv ${{ github.workspace }}/.venv - fi - echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH - - # Upgrade pip inside venv for manylinux compatibility - ${{ github.workspace }}/.venv/bin/python -m pip install --upgrade pip - - # Install build tools - ${{ github.workspace }}/.venv/bin/pip install \ + uv venv --python ${{ matrix.python }} + echo "${GITHUB_WORKSPACE}/.venv/bin" >> $GITHUB_PATH + uv pip install \ scikit-build-core setuptools-scm h5py \ - build auditwheel delocate conan + build auditwheel delocate # ────────────────────────────────────────────── - # 2. Conan: install C++ deps (cached per platform) + # 2. vcpkg: install C++ deps (cached per platform) # ────────────────────────────────────────────── - - name: Cache Conan packages - uses: actions/cache@v4 - with: - path: ~/.conan2 - key: conan-${{ matrix.os }}-${{ matrix.arch }}-${{ hashFiles('lib/conanfile.py') }} - restore-keys: conan-${{ matrix.os }}-${{ matrix.arch }}- - - - name: Conan install - working-directory: lib + - name: Bootstrap vcpkg run: | - conan profile detect --force - conan install . 
--build=missing -of build \ - -c tools.system.package_manager:mode=install \ - -c tools.system.package_manager:sudo=True + git clone https://github.com/microsoft/vcpkg.git ${GITHUB_WORKSPACE}/vcpkg + ${GITHUB_WORKSPACE}/vcpkg/bootstrap-vcpkg.sh + echo "VCPKG_ROOT=${GITHUB_WORKSPACE}/vcpkg" >> $GITHUB_ENV + echo "CMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake" >> $GITHUB_ENV - - name: Find Conan toolchain - run: | - TOOLCHAIN=$(find ${{ github.workspace }}/lib/build -name "conan_toolchain.cmake" | head -1) - if [ -z "$TOOLCHAIN" ]; then - echo "ERROR: conan_toolchain.cmake not found" - find ${{ github.workspace }}/lib/build -type f -name "*.cmake" || true - exit 1 - fi - echo "CMAKE_TOOLCHAIN_FILE=$TOOLCHAIN" >> $GITHUB_ENV - echo "Found toolchain at: $TOOLCHAIN" + - name: Cache vcpkg packages + uses: actions/cache@v4 + with: + path: ~/.cache/vcpkg/archives + key: vcpkg-${{ matrix.os }}-${{ matrix.arch }}-${{ hashFiles('lib/vcpkg.json') }} + restore-keys: vcpkg-${{ matrix.os }}-${{ matrix.arch }}- # ────────────────────────────────────────────── # 3. 
Discover h5py HDF5 + build wheel # ────────────────────────────────────────────── - - name: Discover h5py HDF5 location run: | - H5PY_HDF5_DIR=$(python3 -c " + HDF5_DIR=$(${GITHUB_WORKSPACE}/.venv/bin/python -c " import h5py, os d = os.path.dirname(h5py.__file__) dylibs = os.path.join(d, '.dylibs') libs = os.path.join(os.path.dirname(d), 'h5py.libs') print(dylibs if os.path.exists(dylibs) else libs) ") - echo "H5PY_HDF5_DIR=$H5PY_HDF5_DIR" >> $GITHUB_ENV - echo "Discovered h5py HDF5 at: $H5PY_HDF5_DIR" - ls -la "$H5PY_HDF5_DIR" - - - name: Set macOS deployment target - if: runner.os == 'macOS' - run: echo "MACOSX_DEPLOYMENT_TARGET=12.0" >> $GITHUB_ENV + echo "HDF5_DIR=$HDF5_DIR" >> $GITHUB_ENV - name: Build wheel - run: | - python -m build --wheel --no-isolation + run: uv build --wheel --no-build-isolation --python ${GITHUB_WORKSPACE}/.venv/bin/python # ────────────────────────────────────────────── # 4. Repair wheel for PyPI @@ -186,7 +202,7 @@ jobs: - name: Repair wheel (Linux) if: runner.os == 'Linux' run: | - export LD_LIBRARY_PATH="${H5PY_HDF5_DIR}:${LD_LIBRARY_PATH}" + export LD_LIBRARY_PATH="${HDF5_DIR}:${LD_LIBRARY_PATH}" auditwheel show dist/*.whl auditwheel repair dist/*.whl -w wheelhouse/ \ --exclude libhdf5.so \ @@ -197,7 +213,7 @@ jobs: - name: Repair wheel (macOS) if: runner.os == 'macOS' run: | - export DYLD_LIBRARY_PATH="${H5PY_HDF5_DIR}:${DYLD_LIBRARY_PATH}" + export DYLD_LIBRARY_PATH="${HDF5_DIR}:${DYLD_LIBRARY_PATH}" delocate-listdeps dist/*.whl delocate-wheel -w wheelhouse/ dist/*.whl \ --exclude libhdf5 \ @@ -209,7 +225,7 @@ jobs: - name: Smoke test run: | - pip install wheelhouse/*.whl --force-reinstall + uv pip install wheelhouse/*.whl --force-reinstall python3 -c "import arraymorph; print('arraymorph imported successfully')" - name: Upload wheel artifact diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index de374ff..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: 
Build ArrayMorph - -on: - push: - branches: - - main - paths: - - arraymorph/** - - .github/workflows/** - pull_request: - branches: - - main - -jobs: - build: - runs-on: ubuntu-latest - - env: - VCPKG_ROOT: ${{ github.workspace }}/vcpkg - HDF5_INSTALL: ${{ github.workspace }}/HDF5 - - steps: - - name: Checkout source - uses: actions/checkout@v3 - - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y build-essential pkg-config python3 python3-pip libssl-dev cmake - - - name: Install vcpkg - run: | - git clone https://github.com/microsoft/vcpkg.git $VCPKG_ROOT - cd $VCPKG_ROOT - ./bootstrap-vcpkg.sh - ./vcpkg install aws-sdk-cpp[s3]:x64-linux - rm -rf vcpkg/buildtrees vcpkg/downloads vcpkg/packages - ./vcpkg install azure-storage-blobs-cpp:x64-linux - rm -rf vcpkg/buildtrees vcpkg/downloads vcpkg/packages - - - name: Install HDF5 - run: | - git clone https://github.com/HDFGroup/hdf5.git - cd hdf5 - git checkout hdf5-1_14_2 - ./configure --prefix=$HDF5_INSTALL --enable-cxx - make -j$(nproc) - make install - rm -rf hdf5 - - - name: Install h5py - run: | - python3 -m pip install --upgrade pip - HDF5_DIR=$HDF5_INSTALL pip3 install --no-binary=h5py h5py - - - name: Build ArrayMorph - run: | - cd arraymorph - cmake -B ./build -S . 
\ -            -DCMAKE_PREFIX_PATH=$HDF5_INSTALL \ -            -DCMAKE_TOOLCHAIN_FILE=$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake -        cmake --build ./build --parallel \ No newline at end of file diff --git a/.gitignore b/.gitignore index e739efe..2c0814a 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,13 @@ uv.lock *.flv *.mov *.wmv + +# vcpkg +lib/vcpkg_installed/ +vcpkg_installed/ + +# Conan generated (legacy) +lib/activate.sh +lib/deactivate.sh +.conan2/ +conan.conf diff --git a/README.md b/README.md index 5b6e61f..18bc448 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # ArrayMorph -[![Build Status](https://github.com/ICICLE-ai/arraymorph/actions/workflows/build.yml/badge.svg)](https://github.com/ICICLE-ai/arraymorph/actions/workflows/build.yml) +[![Build Status](https://github.com/ICICLE-ai/arraymorph/actions/workflows/build.yaml/badge.svg)](https://github.com/ICICLE-ai/arraymorph/actions/workflows/build.yaml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -ArrayMorph is a software to manage array data stored on cloud object storage efficiently. It supports both HDF5 C++ API and h5py API. The data returned by h5py API is numpy arrays. By using h5py API, users can access array data stored on the cloud and feed the read data into machine learning pipelines seamlessly. +ArrayMorph enables efficient storage and retrieval of array data from cloud object stores, supporting AWS S3 and Azure Blob Storage. It is an HDF5 Virtual Object Layer (VOL) plugin that transparently routes HDF5 file operations to cloud storage — existing h5py or HDF5 C++ code works unchanged once the plugin is loaded. **Tag**: CI4AI @@ -11,119 +11,320 @@ ArrayMorph is a software to manage array data stored on cloud object storage eff # How-To Guides -## Install dependencies +## Install ArrayMorph -It is recommended to use Conda (and conda-forge) for managing dependencies. 
+```bash +pip install arraymorph +``` + +Once installed, jump straight to [Configure credentials for AWS S3](#configure-credentials-for-aws-s3) or [Azure](#configure-credentials-for-azure-blob-storage) below. + +If you need the standalone `lib_arraymorph` binary, you can [download a pre-built release](#download-a-pre-built-lib_arraymorph) or [build from source](#build-from-source). -1. Install [Miniconda](https://docs.anaconda.com/miniconda/) -2. Install [conda-build](https://docs.conda.io/projects/conda-build/en/stable/install-conda-build.html) for installing local conda packages -3. Create and activate environment with dependencies: - ```bash - conda create -n arraymorph conda-forge::gxx=9 - conda activate arraymorph - conda install -n arraymorph cmake conda-forge::hdf5=1.14.2 conda-forge::aws-sdk-cpp conda-forge::azure-storage-blobs-cpp conda-forge::h5py - ``` +## Configure credentials for AWS S3 -## Install ArrayMorph via ArrayMorph local conda package - ```bash - git clone https://github.com/ICICLE-ai/arraymorph.git - cd arraymorph/arraymorph_channel - conda index . - conda install -n arraymorph arraymorph -c file://$(pwd) -c conda-forge - ``` +Use the Python API before opening any HDF5 files: + +```python +import arraymorph + +arraymorph.configure_s3( + bucket="my-bucket", + access_key="MY_ACCESS_KEY", + secret_key="MY_SECRET_KEY", + region="us-east-1", + use_tls=True, +) +arraymorph.enable() +``` -## Install ArryMorph from source code +Or set environment variables directly: -### Build ArrayMorph ```bash -git clone https://github.com/ICICLE-ai/arraymorph.git -cd arraymorph/arraymorph -cmake -B ./build -S . 
-DCMAKE_PREFIX_PATH=$CONDA_PREFIX -cd build -make +export STORAGE_PLATFORM=S3 +export BUCKET_NAME=my-bucket +export AWS_ACCESS_KEY_ID=MY_ACCESS_KEY +export AWS_SECRET_ACCESS_KEY=MY_SECRET_KEY +export AWS_REGION=us-east-1 +export HDF5_PLUGIN_PATH=$(python -c "import arraymorph; print(arraymorph.get_plugin_path())") +export HDF5_VOL_CONNECTOR=arraymorph +``` + +## Configure credentials for Azure Blob Storage + +```python +import arraymorph + +arraymorph.configure_azure( + container="my-container", + connection_string="DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net", +) +arraymorph.enable() ``` -### Enable VOL plugin: +Or set environment variables directly: + ```bash -export HDF5_PLUGIN_PATH=/path/to/arraymorph/arraymorph/build/src +export STORAGE_PLATFORM=Azure +export BUCKET_NAME=my-container +export AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;..." +export HDF5_PLUGIN_PATH=$(python -c "import arraymorph; print(arraymorph.get_plugin_path())") export HDF5_VOL_CONNECTOR=arraymorph ``` -## Configure Environment for Cloud Access +## Use an S3-compatible object store (MinIO, Ceph, Garage) + +Pass `endpoint`, `addressing_style=True`, and `use_signed_payloads=True` to match the requirements of most self-hosted S3-compatible stores: + +```python +import arraymorph + +arraymorph.configure_s3( + bucket="my-bucket", + access_key="MY_ACCESS_KEY", + secret_key="MY_SECRET_KEY", + endpoint="http://localhost:9000", + region="us-east-1", + use_tls=False, + addressing_style=True, + use_signed_payloads=True, +) +arraymorph.enable() +``` + +## Download a pre-built lib_arraymorph + +Each [GitHub release](https://github.com/ICICLE-ai/ArrayMorph/releases) attaches standalone pre-compiled binaries of `lib_arraymorph` for all supported platforms: + +| File | Platform | +| ---------------------------------- | ------------------- | +| `lib_arraymorph-linux-x86_64.so` | Linux x86_64 | +| `lib_arraymorph-linux-aarch64.so` | Linux 
aarch64 | +| `lib_arraymorph-macos-arm64.dylib` | macOS Apple Silicon | + +Download the file for your platform from the release assets and set `HDF5_PLUGIN_PATH` to the directory containing it before calling `arraymorph.enable()` or setting `HDF5_VOL_CONNECTOR` manually. + +## Build from source + +Use this path if you want to compile `lib_arraymorph` yourself — for example to target a specific platform, contribute changes, or build a custom wheel. + +### Prerequisites + +- [vcpkg](https://github.com/microsoft/vcpkg) — installs the AWS and Azure C++ SDKs via CMake +- [CMake](https://cmake.org) and [Ninja](https://ninja-build.org) +- [uv](https://docs.astral.sh/uv/) — Python package manager + +### Step 1 — Clone and create a virtual environment -### AWS Configuration: ```bash -export STORAGE_PLATFORM=S3 -export BUCKET_NAME=XXXXXX -export AWS_ACCESS_KEY_ID=XXXXXX -export AWS_SECRET_ACCESS_KEY=XXXXXX -export AWS_REGION=us-east-2 # or your bucket's region +git clone https://github.com/ICICLE-ai/ArrayMorph.git +cd ArrayMorph +uv venv +source .venv/bin/activate +``` + +### Step 2 — Install h5py + +`lib_arraymorph` links against an HDF5 shared library at build time. Rather than requiring a separate system-wide HDF5 installation, the build system points CMake at the `.so` / `.dylib` that h5py already bundles. Install h5py first so those libraries are present: + +```bash +uv pip install h5py ``` -### Azure Configuration: +On macOS the bundled libraries land in `.venv/lib/python*/site-packages/h5py/.dylibs/`; on Linux in `.venv/lib/python*/site-packages/h5py.libs/`. 
+ +### Step 3 — Configure and build the shared library + ```bash -export STORAGE_PLATFORM=Azure -export BUCKET_NAME=XXXXXX -export AZURE_STORAGE_CONNECTION_STRING=XXXXXX +export HDF5_DIR=$(.venv/bin/python -c "import h5py,os; d=os.path.dirname(h5py.__file__); print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) else os.path.join(os.path.dirname(d),'h5py.libs'))") + +cmake -B lib/build -S lib \ + -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -G Ninja + +cmake --build lib/build +``` + +This produces `lib/build/lib_arraymorph.dylib` on macOS or `lib/build/lib_arraymorph.so` on Linux. + +### Optional — Python package + +If you also want to use the Python API, install the package in editable mode: + +```bash +HDF5_DIR=$HDF5_DIR \ +CMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \ +uv pip install -e . +``` + +Or build a redistributable wheel: + +```bash +HDF5_DIR=$HDF5_DIR \ +CMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \ +uv build --wheel --no-build-isolation +``` + +The wheel is written to `dist/`. Install it in any environment with: + +```bash +pip install dist/arraymorph-*.whl ``` --- # Tutorials -## Run a simple example: Writing and Reading HDF5 files from Cloud - -### Prerequisites: -- AWS or Azure cloud account with credentials -- S3 bucket or Azure container -- ArrayMorph dependencies installed - -### Steps: -1. Activate conda environment - ```bash - conda activate arraymorph - ``` - -2. Write sample HDF5 data to the cloud - ```bash - cd examples/python - python3 write.py - ``` - -3. Read data back from cloud HDF5 file - ```bash - cd examples/python - python3 read.py - ``` +## Write and read a chunked array on AWS S3 + +This tutorial walks through writing a 2-D NumPy array to a cloud HDF5 file and reading a slice of it back. 
+ +### Prerequisites + +- An AWS account with an S3 bucket, or an S3-compatible object store +- ArrayMorph installed (`pip install arraymorph`) + +### Step 1 — Configure and enable ArrayMorph + +```python +import arraymorph + +arraymorph.configure_s3( + bucket="my-bucket", + access_key="MY_ACCESS_KEY", + secret_key="MY_SECRET_KEY", + region="us-east-1", + use_tls=True, +) +arraymorph.enable() +``` + +`arraymorph.enable()` sets `HDF5_PLUGIN_PATH` and `HDF5_VOL_CONNECTOR` in the current process. Any `h5py.File(...)` call made after this point is routed through ArrayMorph. + +### Step 2 — Write array data + +```python +import h5py +import numpy as np + +data = np.fromfunction(lambda i, j: i + j, (100, 100), dtype="i4") + +with h5py.File("demo.h5", "w") as f: + f.create_dataset("values", data=data, chunks=(10, 10)) +``` + +Each 10×10 chunk is stored as a separate object in your S3 bucket. + +### Step 3 — Read a slice back + +```python +import h5py + +with h5py.File("demo.h5", "r") as f: + dset = f["values"] + print(dset.dtype) # int32 + print(dset[5:15, 5:15]) # fetches only the chunks that overlap this slice +``` + +Only the chunks that overlap the requested hyperslab are fetched from cloud storage — no full-file download occurs. + --- # Explanation -### How ArrayMorph Works +## How ArrayMorph works -ArrayMorph plugs into the HDF5 stack using a VOL (Virtual Object Layer) plugin that intercepts file operations and routes them to cloud object storage instead of local files. This allows existing HDF5 APIs (both C++ and h5py in Python) to operate on cloud-based data seamlessly, enabling transparent cloud access for scientific or ML pipelines. +ArrayMorph is implemented as an HDF5 **Virtual Object Layer (VOL)** connector. The VOL is an abstraction layer inside the HDF5 library that separates the public API from the storage implementation. 
By providing a plugin that registers itself as a VOL connector, ArrayMorph intercepts every HDF5 file operation before it reaches the native POSIX layer. -It supports: -- Cloud backends: AWS S3 and Azure Blob -- File formats: Current binary data stream (we plan to extend to other formats like jpg in the future) -- Languages: C++ and Python (via h5py compatibility) +When `arraymorph.enable()` is called: -The system is designed to be efficient in latency-sensitive scenarios and aims to integrate well with large-scale distributed training and inference. +1. `HDF5_PLUGIN_PATH` is set to the directory containing the compiled shared library (`lib_arraymorph.so` / `lib_arraymorph.dylib`). +2. `HDF5_VOL_CONNECTOR=arraymorph` tells HDF5 to load and activate that plugin for all subsequent file operations. + +From this point, a call like `h5py.File("demo.h5", "w")` does not touch the local filesystem. Instead, the VOL connector: + +1. Reads cloud credentials from environment variables and constructs an AWS S3 or Azure Blob client (selected by `STORAGE_PLATFORM`). +2. On dataset read/write, translates the HDF5 hyperslab selection into a list of chunks and dispatches asynchronous get/put requests against the object store — one object per chunk. + +### Chunked storage model + +HDF5 datasets are divided into fixed-size chunks (e.g. `chunks=(64, 64)` for a 2-D dataset). ArrayMorph stores each chunk as an independent object in the bucket. The object key encodes the dataset path and chunk coordinates, so a partial read only fetches the chunks that overlap the requested slice. For large chunks, ArrayMorph can issue byte-range requests to retrieve only the needed bytes within a chunk object. + +### Async I/O + +Both the S3 and Azure backends use asynchronous operations dispatched to a thread pool. This allows ArrayMorph to fetch multiple chunks in parallel, which is important for workloads that access many chunks per read (e.g. 
strided access patterns in machine learning data loaders). + +### Compatibility + +Because the interception happens at the VOL layer, no changes to application code are required. Any program that opens HDF5 files with h5py or the HDF5 C++ API will automatically use ArrayMorph once the plugin is loaded. --- -## References +# References + +## Python API + +### `arraymorph.enable() -> None` + +Sets `HDF5_PLUGIN_PATH` and `HDF5_VOL_CONNECTOR` in the current process environment. Must be called before any `h5py.File(...)` call. + +### `arraymorph.get_plugin_path() -> str` + +Returns the directory containing the compiled VOL plugin. Useful when you need to set `HDF5_PLUGIN_PATH` manually. + +### `arraymorph.configure_s3(bucket, access_key, secret_key, endpoint=None, region="us-east-2", use_tls=False, addressing_style=False, use_signed_payloads=False) -> None` + +Configures the S3 client. All parameters are written to environment variables consumed by the C++ plugin at file-open time. + +| Parameter | Environment variable | Default | Description | +| --------------------- | ------------------------- | ----------- | ---------------------------------------------------- | +| `bucket` | `BUCKET_NAME` | — | S3 bucket name | +| `access_key` | `AWS_ACCESS_KEY_ID` | — | Access key ID | +| `secret_key` | `AWS_SECRET_ACCESS_KEY` | — | Secret access key | +| `endpoint` | `AWS_ENDPOINT_URL_S3` | AWS default | Custom endpoint for S3-compatible stores | +| `region` | `AWS_REGION` | `us-east-2` | SigV4 signing region | +| `use_tls` | `AWS_USE_TLS` | `false` | Use HTTPS when `True` | +| `addressing_style` | `AWS_S3_ADDRESSING_STYLE` | `virtual` | `path` when `True`; required for most non-AWS stores | +| `use_signed_payloads` | `AWS_SIGNED_PAYLOADS` | `false` | Include request body in SigV4 signature | + +### `arraymorph.configure_azure(container, connection_string=None) -> None` + +Configures the Azure Blob client. 
+ +| Parameter | Environment variable | Default | Description | +| ------------------- | --------------------------------- | -------- | ------------------------------- | +| `container` | `BUCKET_NAME` | — | Azure container name | +| `connection_string` | `AZURE_STORAGE_CONNECTION_STRING` | From env | Azure Storage connection string | + +## Environment variables + +All configuration can be applied via environment variables without using the Python API. This is useful when running HDF5 C++ programs directly. + +| Variable | Description | +| --------------------------------- | --------------------------------------------------- | +| `HDF5_PLUGIN_PATH` | Directory containing `lib_arraymorph.so` / `.dylib` | +| `HDF5_VOL_CONNECTOR` | Must be `arraymorph` to activate the plugin | +| `STORAGE_PLATFORM` | `S3` (default) or `Azure` | +| `BUCKET_NAME` | Bucket or container name | +| `AWS_ACCESS_KEY_ID` | S3 access key | +| `AWS_SECRET_ACCESS_KEY` | S3 secret key | +| `AWS_REGION` | SigV4 signing region | +| `AWS_ENDPOINT_URL_S3` | Custom S3-compatible endpoint URL | +| `AWS_USE_TLS` | `true` / `false` | +| `AWS_S3_ADDRESSING_STYLE` | `path` or `virtual` | +| `AWS_SIGNED_PAYLOADS` | `true` / `false` | +| `AZURE_STORAGE_CONNECTION_STRING` | Azure connection string | + +## External references - [HDF5 VOL connectors](https://docs.hdfgroup.org/hdf5/develop/_v_o_l.html) - [AWS SDK for C++](https://github.com/aws/aws-sdk-cpp) - [Azure SDK for C++](https://github.com/Azure/azure-sdk-for-cpp) - [h5py documentation](https://docs.h5py.org/en/stable/) -- [conda-forge](https://conda-forge.org/) --- ## Acknowledgements -This project is supported by: - -*National Science Foundation (NSF) funded AI institute for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE) (OAC 2112606)* +This project is supported by the National Science Foundation (NSF) funded AI institute for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE) 
(OAC 2112606). diff --git a/justfile b/justfile index 1b1889b..7a09ea9 100644 --- a/justfile +++ b/justfile @@ -1,4 +1,3 @@ - # ArrayMorph — Top-Level Build Orchestration # https://just.systems @@ -7,9 +6,8 @@ set dotenv-load := true set export := true # --- Variables --- -CONAN_BUILD_DIR := "lib/build/Release/generators" -CMAKE_TOOLCHAIN_FILE := justfile_directory() / CONAN_BUILD_DIR / "conan_toolchain.cmake" -H5PY_HDF5_DIR := `./.venv/bin/python -c "import h5py,os;d=os.path.dirname(h5py.__file__);print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) else os.path.join(os.path.dirname(d),'h5py.libs'))"` +VCPKG_TOOLCHAIN := env("VCPKG_ROOT", home_directory() / ".vcpkg") / "scripts/buildsystems/vcpkg.cmake" +HDF5_DIR := `./.venv/bin/python3 -c "import h5py,os;d=os.path.dirname(h5py.__file__);print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) else os.path.join(os.path.dirname(d),'h5py.libs'))"` # --- Recipes --- @@ -17,24 +15,20 @@ H5PY_HDF5_DIR := `./.venv/bin/python -c "import h5py,os;d=os.path.dirname(h5py._ default: @just --list -# Install C++ dependencies via Conan -deps: - cd lib && conan install . --build=missing -s build_type=Release - # Build Python wheel (scikit-build-core handles CMake) wheel: - CMAKE_TOOLCHAIN_FILE={{ CMAKE_TOOLCHAIN_FILE }} \ - H5PY_HDF5_DIR={{ H5PY_HDF5_DIR }} \ - uv build --wheel --no-build-isolation + CMAKE_TOOLCHAIN_FILE={{ VCPKG_TOOLCHAIN }} \ + HDF5_DIR={{ HDF5_DIR }} \ + uv build --wheel --no-build-isolation # Install editable into current venv (for development iteration) dev: - CMAKE_TOOLCHAIN_FILE={{ CMAKE_TOOLCHAIN_FILE }} \ - H5PY_HDF5_DIR={{ H5PY_HDF5_DIR }} \ + CMAKE_TOOLCHAIN_FILE={{ VCPKG_TOOLCHAIN }} \ + HDF5_DIR={{ HDF5_DIR }} \ uv pip install -e . 
# Full build from scratch: deps → wheel -build: deps wheel +build: wheel # Test the built wheel in an isolated venv test: @@ -42,7 +36,7 @@ test: uv venv .test-venv source .test-venv/bin/activate.fish uv pip install dist/arraymorph-0.2.0-*.whl - python -c "import arraymorph; print('Plugin:', arraymorph.get_plugin_path()); arraymorph.enable(); print('VOL enabled')" + python3 -c "import arraymorph; print('Plugin:', arraymorph.get_plugin_path()); arraymorph.enable(); print('VOL enabled')" rm -rf .test-venv # Full build + test @@ -50,14 +44,13 @@ all: build test # Clean build artifacts clean: - rm -rf lib/build dist *.egg-info .test-venv + rm -rf lib/build lib/vcpkg_installed dist *.egg-info .test-venv # Full clean rebuild rebuild: clean build # Show current env var values (for debugging) info: - @echo "CMAKE_TOOLCHAIN_FILE: {{ CMAKE_TOOLCHAIN_FILE }}" - @echo "H5PY_HDF5_DIR: {{ H5PY_HDF5_DIR }}" + @echo "CMAKE_TOOLCHAIN_FILE: {{ VCPKG_TOOLCHAIN }}" + @echo "HDF5_DIR: {{ HDF5_DIR }}" @echo "Plugin lib: $(find lib/build -name 'lib_array_morph*' 2>/dev/null || echo 'not built')" - diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 897085b..51630db 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.20) project(ArrayMorph - VERSION 0.1.0 + VERSION 0.2.0 LANGUAGES C CXX ) @@ -28,23 +28,28 @@ include_directories(${CMAKE_SOURCE_DIR}/include) find_package(AWSSDK REQUIRED COMPONENTS core s3) # Azure SDK -find_package(AzureSDK CONFIG REQUIRED) +find_package(azure-storage-blobs-cpp CONFIG REQUIRED) # cURL and OpenSSL find_package(CURL REQUIRED) find_package(OpenSSL REQUIRED) -# --- HDF5: Conan headers + h5py runtime binary --- +# --- HDF5: vcpkg headers + h5py runtime binary --- # # ArrayMorph is a VOL plugin that gets dlopen'd by HDF5 at runtime. # We MUST link against the same HDF5 that h5py ships to avoid -# duplicate symbol conflicts. Conan provides headers, h5py provides +# duplicate symbol conflicts. 
vcpkg provides headers, h5py provides # the shared library. -if(NOT DEFINED ENV{H5PY_HDF5_DIR}) +# Accept HDF5_DIR from either CMake variable (-D) or env var +if(NOT HDF5_DIR AND DEFINED ENV{HDF5_DIR}) + set(HDF5_DIR "$ENV{HDF5_DIR}") +endif() + +if(NOT HDF5_DIR) message(FATAL_ERROR - "H5PY_HDF5_DIR not set. Run:\n" - " export H5PY_HDF5_DIR=$(python -c \"" + "HDF5_DIR not set. Run:\n" + " export HDF5_DIR=$(python3 -c \"" "import h5py, os; " "d=os.path.dirname(h5py.__file__); " "print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) " @@ -52,9 +57,9 @@ if(NOT DEFINED ENV{H5PY_HDF5_DIR}) ) endif() -set(H5PY_LIB_DIR "$ENV{H5PY_HDF5_DIR}") +set(H5PY_LIB_DIR "${HDF5_DIR}") -# Find HDF5 headers (via Conan) +# Find HDF5 headers (via vcpkg) find_package(HDF5 REQUIRED COMPONENTS C) # Locate the actual shared library in h5py's bundled directory @@ -67,7 +72,7 @@ if(NOT _h5py_hdf5_libs) # List what's actually in the directory for debugging file(GLOB _h5py_dir_contents "${H5PY_LIB_DIR}/*") message(FATAL_ERROR - "No HDF5 shared library found in H5PY_HDF5_DIR=${H5PY_LIB_DIR}\n" + "No HDF5 shared library found in HDF5_DIR=${H5PY_LIB_DIR}\n" "Directory contents: ${_h5py_dir_contents}\n" "Expected libhdf5*.dylib (macOS) or libhdf5*.so* (Linux)" ) @@ -76,7 +81,7 @@ endif() # Pick the first match list(GET _h5py_hdf5_libs 0 _h5py_hdf5_lib) -# Create imported target: Conan headers + h5py binary +# Create imported target: vcpkg headers + h5py binary add_library(hdf5_custom SHARED IMPORTED) set_target_properties(hdf5_custom PROPERTIES IMPORTED_LOCATION "${_h5py_hdf5_lib}" @@ -89,8 +94,8 @@ message(STATUS "HDF5 binary: ${_h5py_hdf5_lib}") # Collect all dependencies into a list set(ALL_DEPS - AWS::aws-sdk-cpp-core - AWS::aws-sdk-cpp-s3 + aws-cpp-sdk-core + aws-cpp-sdk-s3 Azure::azure-storage-blobs hdf5_custom OpenSSL::SSL diff --git a/lib/README.md b/lib/README.md index 958f219..d90c9bb 100644 --- a/lib/README.md +++ b/lib/README.md @@ -1,17 +1,64 @@ -# Building 
ArrayMorph into a conda package +# lib — ArrayMorph C++ shared library -This folder contains the ArrayMorph source code (./src/), the CMake file to build ArrayMorph (CMakeLists.txt) and the Conda build recipes (build.sh, meta.yaml). +This directory contains the C++ source code and CMake build system for `lib_arraymorph`, the HDF5 VOL connector plugin. -## Build ArrayMorph conda package +## Directory layout -1. Install [Miniconda](https://docs.anaconda.com/miniconda/) -2. Install [conda-build](https://docs.conda.io/projects/conda-build/en/stable/install-conda-build.html) -3. Update conda and conda-build -4. Under the current folder, build ArrayMorph conda pacakge - ```bash - conda build -c conda-forge . - ``` +``` +lib/ +├── src/ # C++ source files +├── include/ # Public headers +├── CMakeLists.txt # CMake build definition +└── vcpkg.json # vcpkg dependency manifest (AWS SDK, Azure SDK) +``` -## Get ArrayMorph conda package +## Download a pre-built binary -ArrayMorph conda package is stored in /path/to/conda/conda-bld/linux-64/ +Each [GitHub release](https://github.com/ICICLE-ai/ArrayMorph/releases) attaches standalone pre-compiled binaries — no build toolchain required: + +| File | Platform | +|---|---| +| `lib_arraymorph-linux-x86_64.so` | Linux x86_64 | +| `lib_arraymorph-linux-aarch64.so` | Linux aarch64 | +| `lib_arraymorph-macos-arm64.dylib` | macOS Apple Silicon | + +Download the file for your platform from the release assets and point `HDF5_PLUGIN_PATH` at the containing directory. + +The standalone binary still requires an HDF5 shared library at runtime. Set `LD_LIBRARY_PATH` (Linux) or `DYLD_LIBRARY_PATH` (macOS) to the directory containing `libhdf5.so` / `libhdf5.dylib` before loading the plugin. 
+ +## Prerequisites + +- [vcpkg](https://github.com/microsoft/vcpkg) — installs the AWS and Azure C++ SDKs via CMake +- [CMake](https://cmake.org) and [Ninja](https://ninja-build.org) +- HDF5 shared library (`.so` / `.dylib`) — set `HDF5_DIR` to the directory containing it + +## Build + +```bash +cmake -B build -S . \ + -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT:-~/.vcpkg}/scripts/buildsystems/vcpkg.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -G Ninja + +cmake --build build +``` + +This produces `build/lib_arraymorph.dylib` on macOS or `build/lib_arraymorph.so` on Linux. + +### Locating HDF5 + +`lib_arraymorph` must link against an existing HDF5 shared library. Set `HDF5_DIR` to the directory containing the HDF5 `.so` / `.dylib` before running CMake: + +```bash +export HDF5_DIR=/path/to/hdf5/lib +``` + +If you have h5py installed in a Python environment, you can point directly at its bundled libraries: + +```bash +# macOS +export HDF5_DIR=/path/to/.venv/lib/python3.x/site-packages/h5py/.dylibs + +# Linux +export HDF5_DIR=/path/to/.venv/lib/python3.x/site-packages/h5py.libs +``` diff --git a/lib/conanfile.py b/lib/conanfile.py deleted file mode 100644 index be860ad..0000000 --- a/lib/conanfile.py +++ /dev/null @@ -1,41 +0,0 @@ -from conan import ConanFile -from conan.tools.cmake import cmake_layout, CMakeToolchain, CMakeDeps - - -class ArrayMorphRecipe(ConanFile): - name = "ArrayMorph" - version = "0.2.0" - settings = "os", "compiler", "build_type", "arch" - - def requirements(self): - self.requires("aws-sdk-cpp/1.11.692") - self.requires("azure-sdk-for-cpp/1.16.1") - self.requires("hdf5/1.14.6") - self.requires("libcurl/8.17.0") - self.requires("openssl/3.6.1") - - def configure(self): - self.options["*"].shared = False - - # AWS SDK: ONLY S3 — disable everything that pulls in - # audio (libalsa), GUI (xorg), and other unnecessary deps - self.options["aws-sdk-cpp"].s3 = True - self.options["aws-sdk-cpp"].text_to_speech = False - self.options["aws-sdk-cpp"].access_management 
= False - self.options["aws-sdk-cpp"].identity_management = False - self.options["aws-sdk-cpp"].transfer = False - self.options["aws-sdk-cpp"].queues = False - self.options["aws-sdk-cpp"].messaging = False - - # Azure SDK: only blob storage - self.options["azure-sdk-for-cpp"].with_storage_blobs = True - self.options["azure-sdk-for-cpp"].with_storage_datalake = False - - def layout(self): - cmake_layout(self) - - def generate(self): - tc = CMakeToolchain(self, generator="Ninja") - tc.generate() - deps = CMakeDeps(self) - deps.generate() diff --git a/lib/justfile b/lib/justfile index 5349ac8..4a56ffd 100644 --- a/lib/justfile +++ b/lib/justfile @@ -2,11 +2,11 @@ # Variables BUILD_DIR := "build" -COMPILER_STD := "gnu20" +VCPKG_TOOLCHAIN := env("VCPKG_ROOT", home_directory() / ".vcpkg") / "scripts/buildsystems/vcpkg.cmake" # Set this to the path where H5Py installs HDF5 binary. On macOS its .dylib, Linux .so, Windows .dll -H5PY_HDF5_DIR := `python -c "import h5py,os;d=os.path.dirname(h5py.__file__);print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) else os.path.join(os.path.dirname(d),'h5py.libs'))"` +HDF5_DIR := `../.venv/bin/python -c "import h5py,os;d=os.path.dirname(h5py.__file__);print(os.path.join(d,'.dylibs') if os.path.exists(os.path.join(d,'.dylibs')) else os.path.join(os.path.dirname(d),'h5py.libs'))"` # Settings set dotenv-load := true @@ -16,25 +16,22 @@ set export := true default: @just --list -# 1. Install dependencies via Conan - -# We put all .pc and .ini files in a hidden .conan_deps folder -deps: - conan install . --build=missing -s compiler.cppstd={{ COMPILER_STD }} - -# 2. Setup the CMake build environment +# Setup the CMake build environment setup: - export H5PY_HDF5_DIR={{ H5PY_HDF5_DIR }}; \ - cmake --preset conan-release + export HDF5_DIR={{ HDF5_DIR }}; \ + cmake -B {{ BUILD_DIR }} -S . 
\ + -DCMAKE_TOOLCHAIN_FILE={{ VCPKG_TOOLCHAIN }} \ + -DCMAKE_BUILD_TYPE=Release \ + -G Ninja build: - cmake --build --preset conan-release + cmake --build {{ BUILD_DIR }} -# 5. Full build from scratch -full-build: deps setup build +# Full build from scratch +full-build: setup build -# 6. Clean all build and dependency artifacts +# Clean all build and dependency artifacts clean: - rm -rf {{ BUILD_DIR }} + rm -rf {{ BUILD_DIR }} vcpkg_installed full-clean-build: clean full-build diff --git a/lib/vcpkg.json b/lib/vcpkg.json new file mode 100644 index 0000000..abea6d8 --- /dev/null +++ b/lib/vcpkg.json @@ -0,0 +1,12 @@ +{ + "name": "arraymorph", + "version-string": "0.2.0", + "dependencies": [ + { + "name": "aws-sdk-cpp", + "features": ["s3"] + }, + "azure-storage-blobs-cpp", + "hdf5" + ] +} diff --git a/pyproject.toml b/pyproject.toml index 50deb06..929b4c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" [tool.scikit-build.cmake.define] CMAKE_POSITION_INDEPENDENT_CODE = "ON" CMAKE_TOOLCHAIN_FILE = { env = "CMAKE_TOOLCHAIN_FILE", default = "" } -H5PY_HDF5_DIR = { env = "H5PY_HDF5_DIR", default = "" } +HDF5_DIR = { env = "HDF5_DIR", default = "" } [tool.setuptools_scm] local_scheme = "no-local-version"