Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/benchmarks/build_binaries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ configure_backend() {
echo "Running: ./configure.py --backend=CUDA --cuda_compiler=nvcc"
./configure.py --backend=CUDA --cuda_compiler=nvcc || echo "INFO: GPU Configure script failed or is not applicable."
;;
# AMD MI250 runners: configure the ROCm backend with hipcc as the ROCm compiler
# and the clang-18 binary at the fixed path below.
# NOTE(review): the clang path is hard-coded; presumably it matches the layout of
# the ROCm build container - confirm when the image changes.
GPU_MI250)
# Log the exact command before running it so it is visible in the CI output.
echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18 "
# A failing configure.py is only reported as INFO here; the `|| echo` keeps the
# overall exit status zero, mirroring the CUDA arm above.
./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18 || echo "INFO: GPU Configure script failed or is not applicable."
;;
*)
echo "INFO: Unknown hardware category '$hw_category_upper_for_configure'"
;;
Expand Down Expand Up @@ -96,6 +100,12 @@ case "$HARDWARE_CATEGORY" in
stats_binary_path="./$BAZEL_BIN_DIR/xla/tools/compute_xspace_stats_main_gpu"
device_type_flag_value="gpu"
;;
# AMD MI250 benchmarks: select the ROCm benchmark build type and the GPU
# variants of the runner and stats binaries.
GPU_MI250)
# NOTE(review): presumably this must match a BuildType member name in
# build_tools/ci/build.py - confirm.
BUILD_TYPE="XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS"
# Paths are relative to the repository root and rooted at $BAZEL_BIN_DIR.
runner_binary_path="./$BAZEL_BIN_DIR/xla/tools/multihost_hlo_runner/hlo_runner_main_gpu"
stats_binary_path="./$BAZEL_BIN_DIR/xla/tools/compute_xspace_stats_main_gpu"
# Same device type value as the preceding GPU arm.
device_type_flag_value="gpu"
;;
*)
# Fallback for any category without a dedicated arm: emit a GitHub Actions
# error annotation and abort. Keep the list of supported values in the message
# in sync with the arms of this case statement (GPU_MI250 was previously missing).
echo "::error::Unsupported HARDWARE_CATEGORY: '$HARDWARE_CATEGORY'. This script is configured to handle specific values from the HardwareCategory enum (CPU_X86, CPU_ARM64, GPU_L4, GPU_B200, GPU_MI250)."
exit 1
Expand Down
12 changes: 5 additions & 7 deletions .github/workflows/generate_benchmark_matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ on:
jobs:
generate:
name: Generate Matrix (${{ inputs.workflow_type }})
runs-on: linux-x86-n2-64
container: us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest
runs-on: linux-mi250-4
container: ${{ vars.DOCKER_IMAGE }}
outputs:
matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }}
defaults:
Expand All @@ -65,7 +65,7 @@ jobs:
run: |
echo "Configuring OpenXLA for CPU to build the generator tool..."
if [ -f "./configure.py" ]; then
./configure.py --backend=CPU
./configure.py --backend=CPU --clang_path=/lib/llvm-18/bin/clang-18
else
echo "::warning::configure.py not found. Assuming C++ tool build doesn't require it or is pre-configured."
fi
Expand All @@ -78,16 +78,14 @@ jobs:
--test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
--config=warnings \
--config=nonccl \
--config=rbe_linux_cpu \
--color=yes \
--test_output=errors \
--verbose_failures \
--keep_going \
--nobuild_tests_only \
--profile=profile.json.gz \
--flaky_test_attempts=3 \
--jobs=150 \
--bes_upload_mode=fully_async \
--bes_backend="" \
//xla/tools/benchmarks/utils:generate_benchmark_matrices_main
if [ $? -ne 0 ]; then
echo "::error::Failed to build generate_benchmark_matrices_main"
Expand Down Expand Up @@ -138,4 +136,4 @@ jobs:

echo "matrix_json<<EOF_MATRIX_JSON" >> $GITHUB_OUTPUT
echo "$JSON_ARRAY_STRING" >> $GITHUB_OUTPUT
echo "EOF_MATRIX_JSON" >> $GITHUB_OUTPUT
echo "EOF_MATRIX_JSON" >> $GITHUB_OUTPUT
48 changes: 17 additions & 31 deletions .github/workflows/postsubmit_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ on:
- 'no'
push:
branches:
- main
- rocm-jaxlib-v0.9.0

concurrency:
# Run every push to main and do not cancel in-progress jobs; the timeout is 60 minutes.
# Run every push to rocm-jaxlib-v0.9.0 and do not cancel in-progress jobs; the timeout is 60 minutes.
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
cancel-in-progress: false

Expand All @@ -44,15 +44,15 @@ jobs:
# =================================================================
generate_matrix:
name: Generate Postsubmit Matrix
# Condition: Run if manually dispatched OR if it's a push event to the main branch.
# Condition: Run if manually dispatched OR if it's a push event to the rocm-jaxlib-v0.9.0 branch.
if: |
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'push' && github.ref == 'refs/heads/main')
(github.event_name == 'push' && github.ref == 'refs/heads/rocm-jaxlib-v0.9.0')
uses: ./.github/workflows/generate_benchmark_matrix.yml
with:
workflow_type: 'POSTSUBMIT'
registry_file: 'xla/tools/benchmarks/registries/default_registry.yml'
checkout_ref: ${{ github.sha }} # On push/dispatch to main, github.sha is the commit SHA
checkout_ref: ${{ github.sha }} # On push/dispatch to rocm-jaxlib-v0.9.0, github.sha is the commit SHA

run_benchmarks:
name: Run Benchmark (${{ matrix.benchmark_entry.config_id }}) # config_id will indicate the workflow type, e.g., '_postsubmit'
Expand All @@ -66,7 +66,9 @@ jobs:
benchmark_entry: ${{ fromJson(needs.generate_matrix.outputs.matrix_include_json || '[]') }}

runs-on: ${{ matrix.benchmark_entry.runner_label }}
container: ${{ matrix.benchmark_entry.container_image }}
container:
image: ${{ matrix.benchmark_entry.container_image }}
options: --device=/dev/dri --device=/dev/kfd

defaults:
run:
Expand Down Expand Up @@ -101,11 +103,6 @@ jobs:
COMPARISON_SCRIPT_RELATIVE: .github/workflows/benchmarks/compare_with_baseline.py

steps:
- name: "Wait For Connection"
uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
with:
halt-dispatch-input: ${{ inputs.halt-for-connection }}

- name: Print Job Info & Set Full Paths in ENV
run: |
# Resolve full paths based on GITHUB_WORKSPACE and relative paths defined in env
Expand Down Expand Up @@ -149,6 +146,15 @@ jobs:
with:
ref: ${{ env.CHECKOUT_REF }}

- name: Get RBE cluster keys
  env:
    RBE_CI_CERT: ${{ secrets.RBE_CI_CERT }}
    RBE_CI_KEY: ${{ secrets.RBE_CI_KEY }}
  run: |
    # Materialize the RBE client certificate and private key from repository
    # secrets into /tf/certificates for the build step that follows.
    # Secrets are empty when unavailable (e.g. runs triggered from forks);
    # surface that here instead of letting the build fail later with an
    # opaque TLS error. The files are still written, as before.
    if [ -z "$RBE_CI_CERT" ] || [ -z "$RBE_CI_KEY" ]; then
      echo "::warning::RBE_CI_CERT and/or RBE_CI_KEY is empty; remote execution will likely fail to authenticate."
    fi
    mkdir -p /tf/certificates
    # Create the credential files readable by the current user only; the
    # default umask would typically leave the private key world-readable.
    umask 077
    printf '%s\n' "$RBE_CI_CERT" > /tf/certificates/ci-cert.crt
    printf '%s\n' "$RBE_CI_KEY" > /tf/certificates/ci-cert.key

- name: Build Binaries
id: build_binaries
run: |
Expand Down Expand Up @@ -202,26 +208,6 @@ jobs:
echo "Baseline comparison finished."
echo "---------------------------------------------"

- name: Upload results.json directly to GCS
run: |
GCS_BUCKET="gs://openxla-postsubmit-transient"
RESULTS_JSON_FILE_PATH="${{ env.RESOLVED_OUTPUT_DIR }}/results.json"

# Check if the results file exists
if [ ! -f "$RESULTS_JSON_FILE_PATH" ]; then
echo "::error::results.json not found at $RESULTS_JSON_FILE_PATH"
exit 1
fi

# Construct a GCS object name
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DATE_FOLDER=$(date +%Y%m%d)
COMMIT_SHA_SHORT=$(echo "${{ github.sha }}" | cut -c1-8)
GCS_OBJECT_NAME="${BENCHMARK_NAME}/${DATE_FOLDER}/${TIMESTAMP}_run_${WORKFLOW_RUN_ID}_commit_${COMMIT_SHA_SHORT}.json"

echo "Uploading $RESULTS_JSON_FILE_PATH to $GCS_BUCKET/$GCS_OBJECT_NAME"
gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME"

- name: Upload Benchmark Artifacts
if: always()
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
Expand Down
47 changes: 47 additions & 0 deletions build_tools/ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class BuildType(enum.Enum):
XLA_LINUX_X86_GPU_L4_16_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
XLA_LINUX_X86_GPU_L4_48_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()

XLA_MACOS_X86_CPU_KOKORO = enum.auto()
XLA_MACOS_ARM64_CPU_KOKORO = enum.auto()
Expand Down Expand Up @@ -273,6 +274,29 @@ def _tag_filters_for_compute_capability(
tag_filters += ("-requires-gpu-intel",)
return tag_filters

# Negative (leading "-") tag filters for ROCm builds. The ROCm benchmark build
# passes this tuple as both its test_tag_filters and its build_tag_filters.
rocm_tag_filters = (
    # General exclusions, including tags tied to other GPU vendors/toolchains.
    "-no_gpu",
    "-skip_rocprofiler_sdk",
    "-no_oss",
    "-oss_excluded",
    "-oss_serial",
    "-requires-gpu-intel",
    "-requires-gpu-nvidia",
    "-cuda-only",
    "-oneapi-only",
) + (
    # "sm<NN>" requirement tags, each in its plain and "-only" form.
    # NOTE(review): presumably NVIDIA compute-capability tags - confirm.
    "-requires-gpu-sm60",
    "-requires-gpu-sm60-only",
    "-requires-gpu-sm70",
    "-requires-gpu-sm70-only",
    "-requires-gpu-sm80",
    "-requires-gpu-sm80-only",
    "-requires-gpu-sm86",
    "-requires-gpu-sm86-only",
    "-requires-gpu-sm89",
    "-requires-gpu-sm89-only",
    "-requires-gpu-sm90",
    "-requires-gpu-sm90-only",
)

nvidia_gpu_filters = (
"-no_oss",
Expand Down Expand Up @@ -636,6 +660,29 @@ def nvidia_gpu_build_with_compute_capability(
subcommand="build",
)

# Build definition for the ROCm GPU benchmark presubmit job on GitHub Actions.
# It only builds (subcommand="build") the default GPU presubmit benchmark
# targets, using the ROCm-specific bazelrc given in startup_options.
# NOTE(review): the image digest in TF_ROCM_RBE_DOCKER_IMAGE is also hard-coded
# for GPU_MI250 in xla/tools/benchmarks/utils/generate_benchmark_matrices.cc;
# update both together.
Build(
type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS,
repo="openxla/xla",
configs=("rocm_ci", "rocm_rbe"),
target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
# The same ROCm exclusion list is applied to both test and build filtering.
test_tag_filters=rocm_tag_filters,
build_tag_filters=rocm_tag_filters,
# _DEFAULT_BAZEL_OPTIONS is spread last, so on a duplicate key its value
# replaces the explicit entry listed above it.
options={
"run_under": "//build_tools/ci:parallel_gpu_execute",
"//xla/tsl:ci_build": True,
"remote_download_toplevel": True, # Override remote_download_minimal from rocm_rbe
**_DEFAULT_BAZEL_OPTIONS,
},
repo_env={
# NOTE(review): gfx90a is presumably the MI250 GPU architecture - confirm.
"TF_ROCM_AMDGPU_TARGETS": "gfx90a",
# Adjacent string literals concatenate into one digest-pinned image reference.
"TF_ROCM_RBE_DOCKER_IMAGE": "rocm/"
"tensorflow-build@sha256:"
"66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4"
},
startup_options={"bazelrc": "build_tools/rocm/rocm_xla.bazelrc"},
subcommand="build",
)

macos_tag_filter = (
"-no_oss",
"-gpu",
Expand Down
20 changes: 20 additions & 0 deletions xla/tools/benchmarks/baseline/postsubmit_baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@
"threshold": 0.30 # Allow 30% regression max
}
},
"gemma3_1b_flax_call_mi250_1h1d_postsubmit": {
"GPU_DEVICE_TIME": {
"baseline_ms": 5.6,
"threshold": 0.30 # Allow 30% regression max
},
"GPU_DEVICE_MEMCPY_TIME": {
"baseline_ms": 0.05,
"threshold": 0.30 # Allow 30% regression max
}
},
"gemma2_2b_keras_jax_b200_1h1d_postsubmit": { # config_id
"GPU_DEVICE_TIME": {
"baseline_ms": 100,
Expand Down Expand Up @@ -65,6 +75,16 @@
"threshold": 0.30 # Allow 30% regression max
}
},
"gemma2_2b_keras_jax_mi250_1h1d_postsubmit": {
"GPU_DEVICE_TIME": {
"baseline_ms": 205,
"threshold": 0.30 # Allow 30% regression max
},
"GPU_DEVICE_MEMCPY_TIME": {
"baseline_ms": 0.1,
"threshold": 0.30 # Allow 30% regression max
}
},
"nv_maxtext_1n1g_jit_train_step_before_optimization_b200_1h1d_postsubmit": {
"GPU_DEVICE_TIME": {
"baseline_ms": 302.173,
Expand Down
1 change: 1 addition & 0 deletions xla/tools/benchmarks/proto/benchmark_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ enum HardwareCategory {
CPU_ARM64 = 2; // ARM64 CPU
GPU_L4 = 3; // L4 GPU
GPU_B200 = 4; // B200 GPU
GPU_MI250 = 5; // MI250 GPU
}

// Enum defining the workflow type.
Expand Down
48 changes: 4 additions & 44 deletions xla/tools/benchmarks/registries/default_registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,12 @@ benchmarks: [
}
model_source_info: ["Gemma3 1B"]
hardware_execution_configs: [{
hardware_category: GPU_L4
hardware_category: GPU_MI250
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
workflow_type: [PRESUBMIT, POSTSUBMIT, SCHEDULED]
runtime_flags: ["--num_repeats=5"]
},
{
hardware_category: CPU_X86
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [CPU_TIME]
workflow_type: [PRESUBMIT, POSTSUBMIT, SCHEDULED]
workflow_type: [POSTSUBMIT]
runtime_flags: ["--num_repeats=5"]
}
]
}]
update_frequency_policy: QUARTERLY
},
{
Expand All @@ -52,46 +44,14 @@ benchmarks: [
}
model_source_info: ["Gemma2 2B"]
hardware_execution_configs: [{
hardware_category: GPU_L4
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
workflow_type: [PRESUBMIT, POSTSUBMIT]
runtime_flags: ["--num_repeats=5"]
},
{
hardware_category: GPU_B200
hardware_category: GPU_MI250
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
workflow_type: [POSTSUBMIT]
runtime_flags: ["--num_repeats=5"]
},
{
hardware_category: CPU_X86
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [CPU_TIME]
workflow_type: [PRESUBMIT, POSTSUBMIT]
runtime_flags: ["--num_repeats=5"]
}]
update_frequency_policy: QUARTERLY
# TODO(juliagmt): remove this label once the benchmark is stable.
github_labels: ["blocking_presubmit_test"]
},
{
name: "nv_maxtext_1n1g_jit_train_step_before_optimization.hlo"
description: "Nvidia benchmark for Maxtext 1 node 1 gpu config for gpt3-52k model."
owner: "hmonishN@"
input_artifact: {
input_format: HLO_TEXT
artifact_path: "xla/tools/benchmarks/hlo/nv_maxtext_1n1g_jit_train_step_before_optimization.hlo"
}
model_source_info: ["Maxtext Default gpt3-52k"]
hardware_execution_configs: [{
hardware_category: GPU_B200
topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
workflow_type: [POSTSUBMIT]
runtime_flags: ["--num_repeats=5", "--xla_gpu_enable_latency_hiding_scheduler", "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824", "--xla_gpu_all_gather_combine_threshold_bytes=1073741824", "--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728", "--xla_gpu_enable_pipelined_all_gather", "--xla_gpu_enable_pipelined_all_reduce", "--xla_gpu_enable_while_loop_double_buffering", "--xla_gpu_enable_all_gather_combine_by_dim=false", "--xla_gpu_enable_reduce_scatter_combine_by_dim=false", "--xla_disable_hlo_passes=rematerialization"]
}]
update_frequency_policy: QUARTERLY
}
]
9 changes: 9 additions & 0 deletions xla/tools/benchmarks/utils/generate_benchmark_matrices.cc
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,12 @@ GetHardwareToRunnerLabelMap() {
// - CPU_X86: linux-x86-n2-128
// - GPU_L4: linux-x86-g2-16-l4-1gpu
// - GPU_B200: linux-x86-a4-224-b200-1gpu
// - GPU_MI250: linux-mi250-4
static const auto* kMap = new absl::flat_hash_map<std::string, std::string>{
{"CPU_X86", "linux-x86-n2-128"},
{"GPU_L4", "linux-x86-g2-16-l4-1gpu"},
{"GPU_B200", "linux-x86-a4-224-b200-1gpu"},
{"GPU_MI250", "linux-mi250-4"},
// Add more mappings
};
return *kMap;
Expand All @@ -125,6 +127,10 @@ GetHardwareToContainerImage() {
{"GPU_L4_1H_4D",
"us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/"
"ml-build-cuda12.8-cudnn9.8:latest"},
{"GPU_MI250",
"rocm/"
"tensorflow-build@sha256:"
"66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4"},
};
return *kHardwareToContainerImage;
}
Expand Down Expand Up @@ -154,6 +160,9 @@ GetHardwareToDefaultTargetMetrics() {
{"GPU_B200",
{TargetMetric::GPU_DEVICE_TIME,
TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
{"GPU_MI250",
{TargetMetric::GPU_DEVICE_TIME,
TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
};
return *kHardwareToDefaultTargetMetrics;
}
Expand Down
Loading