// SM90 TMA bulk reduce-add for a rank-2 tensor: asynchronously adds a tile held
// in CTA shared memory into the global-memory tensor described by a TMA
// descriptor. Note the ".bulk_group" suffix in the PTX below: completion of
// this op is tracked via the bulk async-group mechanism, not an mbarrier.
struct SM90_TMA_REDUCE_ADD_2D
{
// Issue one cp.reduce.async.bulk.tensor.2d (.add) instruction.
//   desc_ptr   : pointer to the TMA descriptor for the global tensor
//                (presumably a CUtensorMap built for a 2-D tiled layout —
//                NOTE(review): confirm with the code that creates it)
//   smem_ptr   : source tile in shared memory ("shared::cta" in the PTX)
//   crd0, crd1 : destination tile coordinates within the global tensor
CUTE_HOST_DEVICE static void
copy(void const* const desc_ptr,
void const* const smem_ptr,
int32_t const& crd0, int32_t const& crd1)
{
#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
// Descriptor is passed as a 64-bit operand ("l"); the shared-memory address is
// converted to a 32-bit shared-space integer address ("r").
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
// Operand mapping: %0 = TMA descriptor, %1 = smem source, {%2, %3} = coords.
// "memory" clobber: orders this async copy against surrounding memory accesses.
asm volatile (
"cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.bulk_group [%0, {%2, %3}], [%1];"
:
: "l"(gmem_int_desc), "r"(smem_int_ptr),
"r"(crd0), "r"(crd1)
: "memory");
#else
// Compile-time trap: this path is taken when SM90 TMA support is not enabled.
CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
#endif
}
};
I noticed from the PTX that this reduce instruction uses the bulk_group completion mechanism rather than an mbarrier. Doesn't that mean its grammar differs substantially from TMA_STORE — i.e., it is not just a matter of replacing SM90_TMA_STORE with SM90_TMA_REDUCE_ADD?