ROCm · mmakevic-amd · Mar 27, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/.github/workflows/benchmarks/build_binaries.sh b/.github/workflows/benchmarks/build_binaries.sh
@@ -53,6 +53,10 @@ configure_backend() {
       echo "Running: ./configure.py --backend=CUDA --cuda_compiler=nvcc"
       ./configure.py --backend=CUDA --cuda_compiler=nvcc || echo "INFO: GPU Configure script failed or is not applicable."
       ;;
+    GPU_MI250)  # AMD MI250: configure the ROCm backend with hipcc and clang-18
+      echo "Running: ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18"
+      ./configure.py --backend=ROCM --rocm_compiler=hipcc --clang_path=/lib/llvm-18/bin/clang-18 || echo "INFO: GPU Configure script failed or is not applicable."
+      ;;
     *)
       echo "INFO: Unknown hardware category '$hw_category_upper_for_configure'"
       ;;
@@ -96,6 +100,12 @@ case "$HARDWARE_CATEGORY" in
     stats_binary_path="./$BAZEL_BIN_DIR/xla/tools/compute_xspace_stats_main_gpu"
     device_type_flag_value="gpu"
     ;;
+  GPU_MI250)  # AMD MI250: ROCm build type; device type is still reported as "gpu"
+    BUILD_TYPE="XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS"
+    runner_binary_path="./$BAZEL_BIN_DIR/xla/tools/multihost_hlo_runner/hlo_runner_main_gpu"
+    stats_binary_path="./$BAZEL_BIN_DIR/xla/tools/compute_xspace_stats_main_gpu"
+    device_type_flag_value="gpu"
+    ;;
   *)
-    echo "::error::Unsupported HARDWARE_CATEGORY: '$HARDWARE_CATEGORY'. This script is configured to handle specific values from the HardwareCategory enum (CPU_X86, CPU_ARM64, GPU_L4, GPU_B200)."
+    echo "::error::Unsupported HARDWARE_CATEGORY: '$HARDWARE_CATEGORY'. This script is configured to handle specific values from the HardwareCategory enum (CPU_X86, CPU_ARM64, GPU_L4, GPU_B200, GPU_MI250)."
     exit 1

diff --git a/.github/workflows/generate_benchmark_matrix.yml b/.github/workflows/generate_benchmark_matrix.yml
@@ -43,8 +43,8 @@ on:
 jobs:
   generate:
     name: Generate Matrix (${{ inputs.workflow_type }})
-    runs-on: linux-x86-n2-64
-    container: us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest
+    runs-on: linux-mi250-4
+    container: ${{ vars.DOCKER_IMAGE }}
     outputs:
       matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }}
     defaults:
@@ -65,7 +65,7 @@ jobs:
         run: |
           echo "Configuring OpenXLA for CPU to build the generator tool..."
           if [ -f "./configure.py" ]; then
-            ./configure.py --backend=CPU 
+            ./configure.py --backend=CPU --clang_path=/lib/llvm-18/bin/clang-18
           else
             echo "::warning::configure.py not found. Assuming C++ tool build doesn't require it or is pre-configured."
           fi
@@ -78,16 +78,14 @@ jobs:
                 --test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd \
                 --config=warnings \
                 --config=nonccl \
-                --config=rbe_linux_cpu \
                 --color=yes \
                 --test_output=errors \
                 --verbose_failures \
                 --keep_going \
                 --nobuild_tests_only \
                 --profile=profile.json.gz \
                 --flaky_test_attempts=3 \
-                --jobs=150 \
-                --bes_upload_mode=fully_async \
+                --bes_backend="" \
                 //xla/tools/benchmarks/utils:generate_benchmark_matrices_main
           if [ $? -ne 0 ]; then
              echo "::error::Failed to build generate_benchmark_matrices_main"
@@ -138,4 +136,4 @@ jobs:
 
           echo "matrix_json<<EOF_MATRIX_JSON" >> $GITHUB_OUTPUT
           echo "$JSON_ARRAY_STRING" >> $GITHUB_OUTPUT
-          echo "EOF_MATRIX_JSON" >> $GITHUB_OUTPUT
+          echo "EOF_MATRIX_JSON" >> $GITHUB_OUTPUT
diff --git a/.github/workflows/postsubmit_benchmark.yml b/.github/workflows/postsubmit_benchmark.yml
@@ -31,10 +31,10 @@ on:
         - 'no'
   push:
     branches:
-      - main
+      - rocm-jaxlib-v0.9.0
 
 concurrency:
-  # Run every push to main and do not cancel in-progress jobs; the timeout is 60 minutes.
+  # Run every push to rocm-jaxlib-v0.9.0 and do not cancel in-progress jobs; the timeout is 60 minutes.
   group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
   cancel-in-progress: false
 
@@ -44,15 +44,15 @@ jobs:
   # =================================================================
   generate_matrix:
     name: Generate Postsubmit Matrix
-    # Condition: Run if manually dispatched OR if it's a push event to the main branch.
+    # Condition: Run if manually dispatched OR if it's a push event to the rocm-jaxlib-v0.9.0 branch.
     if: |
       github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'push' && github.ref == 'refs/heads/main')
+      (github.event_name == 'push' && github.ref == 'refs/heads/rocm-jaxlib-v0.9.0')
     uses: ./.github/workflows/generate_benchmark_matrix.yml
     with:
       workflow_type: 'POSTSUBMIT'
       registry_file: 'xla/tools/benchmarks/registries/default_registry.yml'
-      checkout_ref: ${{ github.sha }} # On push/dispatch to main, github.sha is the commit SHA
+      checkout_ref: ${{ github.sha }} # On push/dispatch to rocm-jaxlib-v0.9.0, github.sha is the commit SHA
 
   run_benchmarks:
     name: Run Benchmark (${{ matrix.benchmark_entry.config_id }}) # config_id will indicate the workflow type, e.g., '_postsubmit'
@@ -66,7 +66,9 @@ jobs:
          benchmark_entry: ${{ fromJson(needs.generate_matrix.outputs.matrix_include_json || '[]') }}
 
     runs-on: ${{ matrix.benchmark_entry.runner_label }}
-    container: ${{ matrix.benchmark_entry.container_image }}
+    container:
+      image: ${{ matrix.benchmark_entry.container_image }}
+      options: --device=/dev/dri --device=/dev/kfd
 
     defaults:
       run:
@@ -101,11 +103,6 @@ jobs:
       COMPARISON_SCRIPT_RELATIVE: .github/workflows/benchmarks/compare_with_baseline.py
 
     steps:
-      - name: "Wait For Connection"
-        uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
       - name: Print Job Info & Set Full Paths in ENV
         run: |
           # Resolve full paths based on GITHUB_WORKSPACE and relative paths defined in env
@@ -149,6 +146,15 @@ jobs:
         with:
           ref: ${{ env.CHECKOUT_REF }}
 
+      - name: Get RBE cluster keys  # writes the RBE cert/key secrets under /tf/certificates
+        env:
+          RBE_CI_CERT: ${{ secrets.RBE_CI_CERT }}
+          RBE_CI_KEY: ${{ secrets.RBE_CI_KEY }}
+        run: |
+          mkdir -p /tf/certificates
+          echo "$RBE_CI_CERT" > /tf/certificates/ci-cert.crt
+          (umask 077; echo "$RBE_CI_KEY" > /tf/certificates/ci-cert.key)  # private key: owner-only
+
       - name: Build Binaries
         id: build_binaries
         run: |
@@ -202,26 +208,6 @@ jobs:
           echo "Baseline comparison finished."
           echo "---------------------------------------------"
 
-      - name: Upload results.json directly to GCS
-        run: |
-          GCS_BUCKET="gs://openxla-postsubmit-transient"
-          RESULTS_JSON_FILE_PATH="${{ env.RESOLVED_OUTPUT_DIR }}/results.json"
-
-          # Check if the results file exists
-          if [ ! -f "$RESULTS_JSON_FILE_PATH" ]; then
-            echo "::error::results.json not found at $RESULTS_JSON_FILE_PATH"
-            exit 1
-          fi
-
-          # Construct a GCS object name
-          TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-          DATE_FOLDER=$(date +%Y%m%d)
-          COMMIT_SHA_SHORT=$(echo "${{ github.sha }}" | cut -c1-8)
-          GCS_OBJECT_NAME="${BENCHMARK_NAME}/${DATE_FOLDER}/${TIMESTAMP}_run_${WORKFLOW_RUN_ID}_commit_${COMMIT_SHA_SHORT}.json"
-
-          echo "Uploading $RESULTS_JSON_FILE_PATH to $GCS_BUCKET/$GCS_OBJECT_NAME"
-          gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME"
-
       - name: Upload Benchmark Artifacts
         if: always()
         uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0

diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py
@@ -123,6 +123,7 @@ class BuildType(enum.Enum):
   XLA_LINUX_X86_GPU_L4_16_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
   XLA_LINUX_X86_GPU_L4_48_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
   XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
+  XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS = enum.auto()
 
   XLA_MACOS_X86_CPU_KOKORO = enum.auto()
   XLA_MACOS_ARM64_CPU_KOKORO = enum.auto()
@@ -273,6 +274,29 @@ def _tag_filters_for_compute_capability(
   tag_filters += ("-requires-gpu-intel",)
   return tag_filters
 
+# Bazel tag filters for the ROCm benchmark build, which passes this tuple as
+# both test_tag_filters and build_tag_filters. A leading "-" excludes targets
+# carrying that tag, so every entry below removes targets from the build
+# rather than selecting them.
+rocm_tag_filters = (
+    "-no_gpu",
+    "-skip_rocprofiler_sdk",
+    "-no_oss",
+    "-oss_excluded",
+    "-oss_serial",
+    # Tags tied to other vendors' GPUs and toolchains.
+    "-requires-gpu-intel",
+    "-requires-gpu-nvidia",
+    "-cuda-only",
+    "-oneapi-only",
+    # CUDA compute-capability tags, expanded in ascending order: each
+    # "-requires-gpu-smNN" is immediately followed by its "-only" variant.
+    *(
+        f"-requires-gpu-sm{sm}{suffix}"
+        for sm in (60, 70, 80, 86, 89, 90)
+        for suffix in ("", "-only")
+    ),
+)
 
 nvidia_gpu_filters = (
     "-no_oss",
@@ -636,6 +660,29 @@ def nvidia_gpu_build_with_compute_capability(
     subcommand="build",
 )
 
+# ROCm (gfx90a) benchmark build, selected by build_binaries.sh for GPU_MI250.
+Build(
+    type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS,
+    repo="openxla/xla",
+    configs=("rocm_ci", "rocm_rbe"),
+    target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
+    test_tag_filters=rocm_tag_filters,
+    build_tag_filters=rocm_tag_filters,
+    options={
+        "run_under": "//build_tools/ci:parallel_gpu_execute",
+        "//xla/tsl:ci_build": True,
+        "remote_download_toplevel": True,  # Override remote_download_minimal from rocm_rbe
+        **_DEFAULT_BAZEL_OPTIONS,
+    },
+    repo_env={
+        "TF_ROCM_AMDGPU_TARGETS": "gfx90a",
+        "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/tensorflow-build@sha256:"
+        "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4",
+    },
+    startup_options={"bazelrc": "build_tools/rocm/rocm_xla.bazelrc"},
+    subcommand="build",
+)
+
 macos_tag_filter = (
     "-no_oss",
     "-gpu",

diff --git a/xla/tools/benchmarks/baseline/postsubmit_baseline.yml b/xla/tools/benchmarks/baseline/postsubmit_baseline.yml
@@ -35,6 +35,16 @@
       "threshold": 0.30 # Allow 30% regression max
     }
   },
+  "gemma3_1b_flax_call_mi250_1h1d_postsubmit": {  # config_id
+    "GPU_DEVICE_TIME": {
+      "baseline_ms": 5.6,
+      "threshold": 0.30 # Allow 30% regression max
+    },
+    "GPU_DEVICE_MEMCPY_TIME": {
+      "baseline_ms": 0.05,
+      "threshold": 0.30 # Allow 30% regression max
+    }
+  },
   "gemma2_2b_keras_jax_b200_1h1d_postsubmit": {  # config_id
     "GPU_DEVICE_TIME": {
       "baseline_ms": 100,
@@ -65,6 +75,16 @@
       "threshold": 0.30 # Allow 30% regression max
     }
   },
+  "gemma2_2b_keras_jax_mi250_1h1d_postsubmit": {  # config_id
+    "GPU_DEVICE_TIME": {
+      "baseline_ms": 205,
+      "threshold": 0.30 # Allow 30% regression max
+    },
+    "GPU_DEVICE_MEMCPY_TIME": {
+      "baseline_ms": 0.1,
+      "threshold": 0.30 # Allow 30% regression max
+    }
+  },
   "nv_maxtext_1n1g_jit_train_step_before_optimization_b200_1h1d_postsubmit": {
     "GPU_DEVICE_TIME": {
       "baseline_ms": 302.173,

diff --git a/xla/tools/benchmarks/proto/benchmark_config.proto b/xla/tools/benchmarks/proto/benchmark_config.proto
@@ -24,6 +24,7 @@ enum HardwareCategory {
   CPU_ARM64 = 2;  // ARM64 CPU
   GPU_L4 = 3;     // L4 GPU
   GPU_B200 = 4;   // B200 GPU
+  GPU_MI250 = 5;  // MI250 GPU
 }
 
 // Enum defining the workflow type.

diff --git a/xla/tools/benchmarks/registries/default_registry.yml b/xla/tools/benchmarks/registries/default_registry.yml
@@ -26,20 +26,12 @@ benchmarks: [
     }
     model_source_info: ["Gemma3 1B"]
     hardware_execution_configs: [{
-      hardware_category: GPU_L4
+      hardware_category: GPU_MI250
       topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
       target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
-      workflow_type: [PRESUBMIT, POSTSUBMIT, SCHEDULED]
-      runtime_flags: ["--num_repeats=5"]
-    },
-    {
-      hardware_category: CPU_X86
-      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
-      target_metrics: [CPU_TIME]
-      workflow_type: [PRESUBMIT, POSTSUBMIT, SCHEDULED]
+      workflow_type: [POSTSUBMIT]
       runtime_flags: ["--num_repeats=5"]
-    }
-    ]
+    }]
     update_frequency_policy: QUARTERLY
   },
   {
@@ -52,46 +44,14 @@ benchmarks: [
     }
     model_source_info: ["Gemma2 2B"]
     hardware_execution_configs: [{
-      hardware_category: GPU_L4
-      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
-      target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
-      workflow_type: [PRESUBMIT, POSTSUBMIT]
-      runtime_flags: ["--num_repeats=5"]
-    },
-    {
-      hardware_category: GPU_B200
+      hardware_category: GPU_MI250
       topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
       target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
       workflow_type: [POSTSUBMIT]
       runtime_flags: ["--num_repeats=5"]
-    },
-    {
-      hardware_category: CPU_X86
-      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
-      target_metrics: [CPU_TIME]
-      workflow_type: [PRESUBMIT, POSTSUBMIT]
-      runtime_flags: ["--num_repeats=5"]
     }]
     update_frequency_policy: QUARTERLY
     # TODO(juliagmt): remove this label once the benchmark is stable.
     github_labels: ["blocking_presubmit_test"]
-  },
-  {
-    name: "nv_maxtext_1n1g_jit_train_step_before_optimization.hlo"
-    description: "Nvidia benchmark for Maxtext 1 node 1 gpu config for gpt3-52k model."
-    owner: "hmonishN@"
-    input_artifact: {
-      input_format: HLO_TEXT
-      artifact_path: "xla/tools/benchmarks/hlo/nv_maxtext_1n1g_jit_train_step_before_optimization.hlo"
-    }
-    model_source_info: ["Maxtext Default gpt3-52k"]
-    hardware_execution_configs: [{
-      hardware_category: GPU_B200
-      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
-      target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
-      workflow_type: [POSTSUBMIT]
-      runtime_flags: ["--num_repeats=5", "--xla_gpu_enable_latency_hiding_scheduler", "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824", "--xla_gpu_all_gather_combine_threshold_bytes=1073741824", "--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728", "--xla_gpu_enable_pipelined_all_gather", "--xla_gpu_enable_pipelined_all_reduce", "--xla_gpu_enable_while_loop_double_buffering", "--xla_gpu_enable_all_gather_combine_by_dim=false", "--xla_gpu_enable_reduce_scatter_combine_by_dim=false", "--xla_disable_hlo_passes=rematerialization"]
-    }]
-    update_frequency_policy: QUARTERLY
   }
 ]
diff --git a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc
@@ -97,10 +97,12 @@ GetHardwareToRunnerLabelMap() {
   // - CPU_X86: linux-x86-n2-128
   // - GPU_L4: linux-x86-g2-16-l4-1gpu
   // - GPU_B200: linux-x86-a4-224-b200-1gpu
+  // - GPU_MI250: linux-mi250-4
   static const auto* kMap = new absl::flat_hash_map<std::string, std::string>{
       {"CPU_X86", "linux-x86-n2-128"},
       {"GPU_L4", "linux-x86-g2-16-l4-1gpu"},
       {"GPU_B200", "linux-x86-a4-224-b200-1gpu"},
+      {"GPU_MI250", "linux-mi250-4"},
       // Add more mappings
   };
   return *kMap;
@@ -125,6 +127,10 @@ GetHardwareToContainerImage() {
           {"GPU_L4_1H_4D",
            "us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/"
            "ml-build-cuda12.8-cudnn9.8:latest"},
+          {"GPU_MI250",
+           "rocm/"
+           "tensorflow-build@sha256:"
+           "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4"},
       };
   return *kHardwareToContainerImage;
 }
@@ -154,6 +160,9 @@ GetHardwareToDefaultTargetMetrics() {
           {"GPU_B200",
            {TargetMetric::GPU_DEVICE_TIME,
             TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
+          {"GPU_MI250",
+           {TargetMetric::GPU_DEVICE_TIME,
+            TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
       };
   return *kHardwareToDefaultTargetMetrics;
 }