ROCm · i-chaochen · Apr 10, 2026 · Apr 13, 2026 · Apr 15, 2026 · Apr 17, 2026
diff --git a/build_tools/rocm/execute_ci_build_upstream.sh b/build_tools/rocm/execute_ci_build_upstream.sh
@@ -4,38 +4,34 @@ set -ex
 
 SCRIPT_DIR=$(realpath "$(dirname "$0")")
 
-EXCLUDED_TESTS=(
-    "*ParametersUsedByCollectiveMosaicShouldBeCopiedToCollectiveMemory"
-    "SortingTest*"
-    "*IotaR1Test*"
-    "HostMemoryAllocateTest.Numa"
-    "CubSort*"
-    "Fp8s/FloatNormalizationTest.Fp8Normalization/f8e4m3fn_f8e5m2"
-    "Fp8s/FloatNormalizationTest.Fp8Normalization/f8e5m2_f8e5m2"
-    "Fp8s/FloatNormalizationTest.Fp8Normalization/f8e5m2_f8e4m3fn"
-    "Fp8s/FloatNormalizationTest.Fp8Normalization/f8e4m3fn_f8e4m3fn"
-)
+EXCLUDED_TESTS=()
 
 EXCLUDED_TARGETS_SGPU=(
-    "//xla/service/gpu:dot_algorithm_support_test_amdgpu_any"
-    "//xla/service/gpu:float_support_test_amdgpu_any"
-    "//xla/backends/gpu/transforms:scatter_determinism_expander_test_amdgpu_any"
-    "//xla/backends/gpu/transforms:triton_fusion_numerics_verifier_test_amdgpu_any"
+    "//xla/tests:iota_test_amdgpu_any"   # Taking too many CI nodes
     "//xla/backends/gpu/codegen/triton:dot_algorithms_test_amdgpu_any"
 )
 
 TEST_TARGETS_SGPU=(
     "//xla/..."
-    "-//xla/service/gpu:dot_algorithm_support_test_amdgpu_any"
-    "-//xla/service/gpu:float_support_test_amdgpu_any"
-    "-//xla/backends/gpu/transforms:scatter_determinism_expander_test_amdgpu_any"
-    "-//xla/backends/gpu/transforms:triton_fusion_numerics_verifier_test_amdgpu_any"
+    "-//xla/tests:iota_test_amdgpu_any"   # Taking too many CI nodes
     "-//xla/backends/gpu/codegen/triton:dot_algorithms_test_amdgpu_any"
 )
 
 TEST_TARGETS_MGPU=(
-    "//xla/backends/gpu/tests:collective_pipeline_parallelism_test"
+    "//xla/tests:collective_ops_test"
     "//xla/backends/gpu/collectives:gpu_clique_key_test"
+    "//xla/backends/gpu/runtime:all_reduce_test"
+    "//xla/backends/gpu/runtime:collective_kernel_thunk_test"
+    "//xla/backends/gpu/runtime:buffers_checksum_thunk_test"
+    "//xla/backends/gpu/tests:collective_ops_command_buffer_test"
+    "//xla/backends/gpu/tests:collective_pipeline_parallelism_test"
+    "//xla/backends/gpu/tests:nccl_group_execution_test"
+    "//xla/backends/gpu/tests:collective_ops_e2e_test"
+    "//xla/backends/gpu/tests:collective_ops_ffi_test"
+    "//xla/backends/gpu/tests:collective_ops_sharded_unsharded_e2e_test"
+    "//xla/backends/gpu/tests:ragged_all_to_all_e2e_test"
+    "//xla/backends/gpu/tests:replicated_io_feed_test"
+    "//xla/backends/gpu/tests:all_reduce_e2e_test"
     "//xla/service:collective_ops_utils_test"
     "//xla/service:collective_pipeliner_test"
     "//xla/service:collective_permute_cycle_test"
@@ -49,8 +45,12 @@ TEST_TARGETS_MGPU=(
     "//xla/service:sharding_propagation_test"
     "//xla/service:sharding_remover_test"
     "//xla/service:p2p_schedule_preparation_test"
+    "//xla/tools/multihost_hlo_runner:functional_hlo_runner_test"
     "//xla/pjrt/distributed:topology_util_test"
     "//xla/pjrt/distributed:client_server_test"
+    "//xla/pjrt/extensions/cross_host_transfers:pjrt_c_api_cross_host_transfers_extension_gpu_test"
+    "//xla/pjrt/gpu/tfrt:tfrt_gpu_client_test"
+    "//xla/pjrt/gpu:se_gpu_pjrt_client_test"
 )
 
 TAG_FILTERS=$("${SCRIPT_DIR}/rocm_tag_filters.sh")

diff --git a/build_tools/rocm/rocm_tag_filters.sh b/build_tools/rocm/rocm_tag_filters.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+TAG_FILTERS=(
+    -no_gpu
+    -requires-gpu-intel
+    -requires-gpu-nvidia
+    -cuda-only
+    -oneapi-only
+    -requires-gpu-sm60
+    -requires-gpu-sm60-only
+    -requires-gpu-sm70
+    -requires-gpu-sm70-only
+    -requires-gpu-sm80
+    -requires-gpu-sm80-only
+    -requires-gpu-sm86
+    -requires-gpu-sm86-only
+    -requires-gpu-sm89
+    -requires-gpu-sm89-only
+    -requires-gpu-sm90
+    -requires-gpu-sm90-only
+    -skip_rocprofiler_sdk
+    -no_oss
+    -oss_excluded
+    -oss_serial
+)
+# Print the filters as one comma-separated line for callers to capture.
+(IFS=,; printf '%s\n' "${TAG_FILTERS[*]}")
diff --git a/build_tools/rocm/rocm_xla.bazelrc b/build_tools/rocm/rocm_xla.bazelrc
@@ -0,0 +1,113 @@
+# Test-related settings.
+
+build:rocm_dev --remote_upload_local_results=false
+build:rocm_dev --remote_cache="https://wardite.cluster.engflow.com"
+
+build:rocm_rbe --repo_env=REMOTE_GPU_TESTING=1
+build:rocm_rbe --bes_backend="grpcs://wardite.cluster.engflow.com"
+build:rocm_rbe --bes_results_url="https://wardite.cluster.engflow.com/invocation/"
+build:rocm_rbe --host_platform="@local_config_rocm//rocm:linux_x64"
+build:rocm_rbe --extra_execution_platforms="@local_config_rocm//rocm:linux_x64"
+build:rocm_rbe --platforms="@local_config_rocm//rocm:linux_x64"
+build:rocm_rbe --bes_timeout=600s
+build:rocm_rbe --tls_client_certificate="/data/ci-cert.crt"
+build:rocm_rbe --tls_client_key="/data/ci-cert.key"
+build:rocm_rbe --spawn_strategy=remote,local
+build:rocm_rbe --grpc_keepalive_time=30s
+
+test:rocm_rbe --jobs=200
+test:rocm_rbe --remote_executor=grpcs://wardite.cluster.engflow.com
+test:rocm_rbe --remote_timeout=3600
+test:rocm_rbe --strategy=TestRunner=remote,local
+
+# Dynamic execution config for ROCm RBE - builds locally, tests use hybrid mode
+build:rocm_rbe_dynamic --config=rocm_rbe
+build:rocm_rbe_dynamic --spawn_strategy=local
+test:rocm_rbe_dynamic --experimental_spawn_scheduler
+test:rocm_rbe_dynamic --strategy=TestRunner=dynamic
+test:rocm_rbe_dynamic --dynamic_mode=default
+test:rocm_rbe_dynamic --dynamic_local_strategy=worker,standalone,local
+test:rocm_rbe_dynamic --dynamic_remote_strategy=remote
+test:rocm_rbe_dynamic --experimental_local_execution_delay=1000
+test:rocm_rbe_dynamic --local_resources=cpu=HOST_CPUS*0.5
+
+build:tsan --strip=never
+build:tsan --copt -fsanitize=thread
+build:tsan --copt -g
+build:tsan --copt -fno-omit-frame-pointer
+build:tsan --linkopt -fsanitize=thread
+build:tsan --linkopt -g
+build:tsan --//build_tools/rocm:sanitizer=tsan
+build:tsan --test_env=TSAN_OPTIONS=suppressions=build_tools/rocm/tsan_ignore_list.txt:history_size=7:ignore_noninstrumented_modules=1
+build:tsan --run_under=//build_tools/rocm:sanitizer_wrapper
+
+build:asan --test_env=ASAN_OPTIONS=suppressions=build_tools/rocm/asan_ignore_list.txt:use_sigaltstack=0
+build:asan --test_env=LSAN_OPTIONS=suppressions=build_tools/rocm/lsan_ignore_list.txt:use_sigaltstack=0
+build:asan --//build_tools/rocm:sanitizer=asan
+build:asan --run_under=//build_tools/rocm:sanitizer_wrapper
+
+build:ci_single_gpu --run_under=//build_tools/rocm:parallel_gpu_execute
+build:ci_single_gpu --flaky_test_attempts=3
+
+build:ci_multi_gpu --action_env=XLA_FLAGS="--xla_gpu_force_compilation_parallelism=16 --xla_gpu_enable_llvm_module_compilation_parallelism=true"
+build:ci_multi_gpu --action_env=NCCL_MAX_NCHANNELS=1
+build:ci_multi_gpu --run_under=//build_tools/rocm:sanitizer_wrapper
+build:ci_multi_gpu --test_sharding_strategy=disabled
+build:ci_multi_gpu --flaky_test_attempts=3
+build:ci_multi_gpu --experimental_guard_against_concurrent_changes
+build:ci_multi_gpu --test_env=HIP_VISIBLE_DEVICES=0,1,2,3
+
+test:xla_sgpu -- \
+//xla/... \
+-//xla/tests:iota_test_amdgpu_any \
+-//xla/backends/gpu/collectives:gpu_clique_key_test \
+-//xla/service:collective_ops_utils_test \
+-//xla/service:collective_pipeliner_test \
+-//xla/service:collective_permute_cycle_test \
+-//xla/service:batched_gather_scatter_normalizer_test \
+-//xla/service:all_reduce_simplifier_test \
+-//xla/service:all_gather_simplifier_test \
+-//xla/service:reduce_scatter_decomposer_test \
+-//xla/service:reduce_scatter_reassociate_test \
+-//xla/service:reduce_scatter_combiner_test \
+-//xla/service:scatter_simplifier_test \
+-//xla/service:sharding_propagation_test \
+-//xla/service:sharding_remover_test \
+-//xla/service:p2p_schedule_preparation_test \
+-//xla/pjrt/distributed:topology_util_test \
+-//xla/pjrt/distributed:client_server_test
+
+test:xla_mgpu -- \
+//xla/tests:collective_ops_test \
+//xla/backends/gpu/collectives:gpu_clique_key_test \
+//xla/backends/gpu/runtime:all_reduce_test \
+//xla/backends/gpu/runtime:collective_kernel_thunk_test \
+//xla/backends/gpu/runtime:buffers_checksum_thunk_test \
+//xla/backends/gpu/tests:collective_ops_command_buffer_test \
+//xla/backends/gpu/tests:collective_pipeline_parallelism_test \
+//xla/backends/gpu/tests:nccl_group_execution_test \
+//xla/backends/gpu/tests:collective_ops_e2e_test \
+//xla/backends/gpu/tests:collective_ops_ffi_test \
+//xla/backends/gpu/tests:collective_ops_sharded_unsharded_e2e_test \
+//xla/backends/gpu/tests:ragged_all_to_all_e2e_test \
+//xla/backends/gpu/tests:replicated_io_feed_test \
+//xla/backends/gpu/tests:all_reduce_e2e_test \
+//xla/service:collective_ops_utils_test \
+//xla/service:collective_pipeliner_test \
+//xla/service:collective_permute_cycle_test \
+//xla/service:batched_gather_scatter_normalizer_test \
+//xla/service:all_reduce_simplifier_test \
+//xla/service:all_gather_simplifier_test \
+//xla/service:reduce_scatter_decomposer_test \
+//xla/service:reduce_scatter_reassociate_test \
+//xla/service:reduce_scatter_combiner_test \
+//xla/service:scatter_simplifier_test \
+//xla/service:sharding_propagation_test \
+//xla/service:sharding_remover_test \
+//xla/service:p2p_schedule_preparation_test \
+//xla/tools/multihost_hlo_runner:functional_hlo_runner_test \
+//xla/pjrt/distributed:topology_util_test \
+//xla/pjrt/distributed:client_server_test \
+//xla/pjrt/extensions/cross_host_transfers:pjrt_c_api_cross_host_transfers_extension_gpu_test \
+//xla/pjrt/gpu/tfrt:tfrt_gpu_client_test \
+//xla/pjrt/gpu:se_gpu_pjrt_client_test
diff --git a/build_tools/rocm/rocm_xla_ci.bazelrc b/build_tools/rocm/rocm_xla_ci.bazelrc
@@ -0,0 +1,3 @@
+# CI related imports
+try-import /usertools/rocm.bazelrc
+try-import %workspace%/build_tools/rocm/rocm_xla.bazelrc
diff --git a/build_tools/rocm/run_xla_ci_build.sh b/build_tools/rocm/run_xla_ci_build.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -ex
+
+# Absolute script directory; the sibling helper script and bazelrc are resolved from it.
+SCRIPT_DIR=$(realpath "$(dirname "$0")")
+TAG_FILTERS=$("${SCRIPT_DIR}/rocm_tag_filters.sh")
+
+mkdir -p /tf/pkg
+
+# Extend the base tag filters according to the CI config given on the command line.
+for arg in "$@"; do
+    if [[ "$arg" == "--config=ci_multi_gpu" ]]; then
+        TAG_FILTERS="${TAG_FILTERS},multi_gpu"
+    fi
+    if [[ "$arg" == "--config=ci_single_gpu" ]]; then
+        TAG_FILTERS="${TAG_FILTERS},gpu,-multi_gpu"
+    fi
+done
+
+bazel --bazelrc="${SCRIPT_DIR}/rocm_xla_ci.bazelrc" test \
+    --build_tag_filters="${TAG_FILTERS}" \
+    --test_tag_filters="${TAG_FILTERS}" \
+    --profile=/tf/pkg/profile.json.gz \
+    --nokeep_going \
+    --test_env=TF_TESTS_PER_GPU=1 \
+    --action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
+    --test_output=errors \
+    --run_under=//build_tools/rocm:parallel_gpu_execute \
+    "$@"