From 65a9dd79363faeb1056a7c192872de030b951e34 Mon Sep 17 00:00:00 2001 From: leimao Date: Tue, 30 Jul 2024 20:30:51 -0700 Subject: [PATCH] Make Layout Expression Consistent --- tma/scale_tma_kernel.h | 4 ++-- tma/tma_copy.h | 6 +++--- tma/tma_copy_multicast.h | 2 +- transpose-cute/include/copy.h | 6 +++--- transpose-cute/include/transpose_naive.h | 8 ++++---- transpose-cute/include/transpose_smem.h | 12 ++++++------ .../include/transpose_tmastore_vectorized.h | 6 +++--- transpose-cute/include/util.h | 2 +- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tma/scale_tma_kernel.h b/tma/scale_tma_kernel.h index bceba1d..bc2643c 100644 --- a/tma/scale_tma_kernel.h +++ b/tma/scale_tma_kernel.h @@ -178,8 +178,8 @@ int scaleTmaKernelHost(int M, int N, int iterations = 1) { // Make tensors // - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); + auto gmemLayoutD = make_layout(tensor_shape, GenRowMajor{}); Tensor tensor_S = make_tensor( make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), gmemLayoutS); Tensor tensor_D = make_tensor( diff --git a/tma/tma_copy.h b/tma/tma_copy.h index fd1532f..f4facd2 100644 --- a/tma/tma_copy.h +++ b/tma/tma_copy.h @@ -151,8 +151,8 @@ int copy_host_tma_load_and_store_kernel(int M, int N, int iterations = 1) { // Make tensors // - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); + auto gmemLayoutD = make_layout(tensor_shape, GenRowMajor{}); Tensor tensor_S = make_tensor( make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), gmemLayoutS); Tensor tensor_D = make_tensor( @@ -163,7 +163,7 @@ int copy_host_tma_load_and_store_kernel(int M, int N, int iterations = 1) { auto tileShape = make_shape(bM{}, bN{}); // NOTE: same smem layout for TMA load and store - auto smemLayout = make_layout(tileShape, LayoutRight{}); + auto smemLayout = make_layout(tileShape, GenRowMajor{}); auto tma_load = make_tma_copy(SM90_TMA_LOAD{}, tensor_S, smemLayout); // print(tma_load); diff --git a/tma/tma_copy_multicast.h b/tma/tma_copy_multicast.h index 4b1346e..395842e 100644 --- a/tma/tma_copy_multicast.h +++ b/tma/tma_copy_multicast.h @@ -284,7 +284,7 @@ int copy_host_tma_load_and_store_kernel_multicast(int M, int N, // Make tensors // - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); auto gmemLayoutD = make_ordered_layout(tensor_shape_out, Step<_1, _0, _2>{}); // print(gmemLayoutD); diff --git a/transpose-cute/include/copy.h b/transpose-cute/include/copy.h index 828976b..0fdd62b 100644 --- a/transpose-cute/include/copy.h +++ b/transpose-cute/include/copy.h @@ -105,8 +105,8 @@ template void copy_baseline(TransposeParams params) { // Make tensors // auto tensor_shape = make_shape(params.M, params.N); - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); + auto gmemLayoutD = make_layout(tensor_shape, GenRowMajor{}); Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); @@ -124,7 +124,7 @@ template void copy_baseline(TransposeParams params) { tiled_divide(tensor_D, block_shape); // ((bN, bM), n', m') auto threadLayout = - make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{}); + make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{}); auto vec_layout = make_layout(make_shape(Int<4>{}, Int<1>{})); diff --git a/transpose-cute/include/transpose_naive.h b/transpose-cute/include/transpose_naive.h index e7de598..a8137dc 100644 --- a/transpose-cute/include/transpose_naive.h +++ b/transpose-cute/include/transpose_naive.h @@ -52,8 +52,8 @@ template void transpose_naive(TransposeParams params // auto tensor_shape = make_shape(params.M, params.N); auto tensor_shape_trans = make_shape(params.N, params.M); - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{}); Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); @@ -75,9 +75,9 @@ template void transpose_naive(TransposeParams params Tensor tiled_tensor_DT = tiled_divide(tensor_DT, block_shape_trans); // ((bN, bM), n', m') auto threadLayoutS = - make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{}); + make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{}); auto threadLayoutD = - make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{}); + make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{}); dim3 gridDim( size<1>(tiled_tensor_S), diff --git a/transpose-cute/include/transpose_smem.h b/transpose-cute/include/transpose_smem.h index d191efc..1c12d24 100644 --- a/transpose-cute/include/transpose_smem.h +++ b/transpose-cute/include/transpose_smem.h @@ -71,8 +71,8 @@ template void transpose_smem(Transpos // auto tensor_shape = make_shape(params.M, params.N); auto tensor_shape_trans = make_shape(params.N, params.M); - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{}); Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); @@ -91,8 +91,8 @@ template void transpose_smem(Transpos Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m') - auto tileShapeS = make_layout(block_shape, LayoutRight{}); - auto tileShapeD = make_layout(block_shape_trans, LayoutRight{}); + auto tileShapeS = make_layout(block_shape, GenRowMajor{}); + auto tileShapeD = make_layout(block_shape_trans, GenRowMajor{}); auto smemLayoutS = tileShapeS; auto smemLayoutD = composition(smemLayoutS, tileShapeD); @@ -100,9 +100,9 @@ template void transpose_smem(Transpos auto smemLayoutD_swizzle = composition(smemLayoutS_swizzle, tileShapeD); auto threadLayoutS = - make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{}); + make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{}); auto threadLayoutD = - make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{}); + make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{}); size_t smem_size = int( sizeof(SharedStorageTranspose)); diff --git a/transpose-cute/include/transpose_tmastore_vectorized.h b/transpose-cute/include/transpose_tmastore_vectorized.h index 371dfa4..21cf0f5 100644 --- a/transpose-cute/include/transpose_tmastore_vectorized.h +++ b/transpose-cute/include/transpose_tmastore_vectorized.h @@ -98,8 +98,8 @@ template void transpose_tma(TransposeParams params) auto tensor_shape = make_shape(params.M, params.N); auto tensor_shape_trans = make_shape(params.N, params.M); - auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{}); - auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{}); + auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{}); + auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{}); Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS); Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD); @@ -117,7 +117,7 @@ template void transpose_tma(TransposeParams params) Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m') auto threadLayoutS = - make_layout(make_shape(Int<32>{}, Int<8>{}), LayoutRight{}); + make_layout(make_shape(Int<32>{}, Int<8>{}), GenRowMajor{}); auto vecLayoutS = make_layout(make_shape(Int<1>{}, Int<4>{})); using AccessTypeS = cutlass::AlignedArray; using AtomS = Copy_Atom, Element>; diff --git a/transpose-cute/include/util.h b/transpose-cute/include/util.h index b00bb89..6db90ed 100644 --- a/transpose-cute/include/util.h +++ b/transpose-cute/include/util.h @@ -53,7 +53,7 @@ template int benchmark(void (*transpose)(T int bad = 0; if constexpr (isTranspose) { - auto transpose_function = make_layout(tensor_shape_S, LayoutRight{}); + auto transpose_function = make_layout(tensor_shape_S, GenRowMajor{}); for (size_t i = 0; i < h_D.size(); ++i) if (h_D[i] != h_S[transpose_function(i)]) bad++;