Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tma/scale_tma_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,8 @@ int scaleTmaKernelHost(int M, int N, int iterations = 1) {
// Make tensors
//

auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_layout(tensor_shape, GenRowMajor{});
Tensor tensor_S = make_tensor(
make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), gmemLayoutS);
Tensor tensor_D = make_tensor(
Expand Down
6 changes: 3 additions & 3 deletions tma/tma_copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ int copy_host_tma_load_and_store_kernel(int M, int N, int iterations = 1) {
// Make tensors
//

auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_layout(tensor_shape, GenRowMajor{});
Tensor tensor_S = make_tensor(
make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), gmemLayoutS);
Tensor tensor_D = make_tensor(
Expand All @@ -163,7 +163,7 @@ int copy_host_tma_load_and_store_kernel(int M, int N, int iterations = 1) {

auto tileShape = make_shape(bM{}, bN{});
// NOTE: same smem layout for TMA load and store
auto smemLayout = make_layout(tileShape, LayoutRight{});
auto smemLayout = make_layout(tileShape, GenRowMajor{});
auto tma_load =
make_tma_copy(SM90_TMA_LOAD{}, tensor_S, smemLayout);
// print(tma_load);
Expand Down
2 changes: 1 addition & 1 deletion tma/tma_copy_multicast.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ int copy_host_tma_load_and_store_kernel_multicast(int M, int N,
// Make tensors
//

auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_ordered_layout(tensor_shape_out, Step<_1, _0, _2>{});
// print(gmemLayoutD);

Expand Down
6 changes: 3 additions & 3 deletions transpose-cute/include/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ template <typename T> void copy_baseline(TransposeParams<T> params) {
// Make tensors
//
auto tensor_shape = make_shape(params.M, params.N);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_layout(tensor_shape, GenRowMajor{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

Expand All @@ -124,7 +124,7 @@ template <typename T> void copy_baseline(TransposeParams<T> params) {
tiled_divide(tensor_D, block_shape); // ((bN, bM), n', m')

auto threadLayout =
make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{});
make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{});

auto vec_layout = make_layout(make_shape(Int<4>{}, Int<1>{}));

Expand Down
8 changes: 4 additions & 4 deletions transpose-cute/include/transpose_naive.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ template <typename Element> void transpose_naive(TransposeParams<Element> params
//
auto tensor_shape = make_shape(params.M, params.N);
auto tensor_shape_trans = make_shape(params.N, params.M);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

Expand All @@ -75,9 +75,9 @@ template <typename Element> void transpose_naive(TransposeParams<Element> params
Tensor tiled_tensor_DT = tiled_divide(tensor_DT, block_shape_trans); // ((bN, bM), n', m')

auto threadLayoutS =
make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{});
make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{});
auto threadLayoutD =
make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{});
make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{});

dim3 gridDim(
size<1>(tiled_tensor_S),
Expand Down
12 changes: 6 additions & 6 deletions transpose-cute/include/transpose_smem.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ template <typename Element, bool isSwizzled = true> void transpose_smem(Transpos
//
auto tensor_shape = make_shape(params.M, params.N);
auto tensor_shape_trans = make_shape(params.N, params.M);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

Expand All @@ -91,18 +91,18 @@ template <typename Element, bool isSwizzled = true> void transpose_smem(Transpos
Tensor tiled_tensor_D =
tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m')

auto tileShapeS = make_layout(block_shape, LayoutRight{});
auto tileShapeD = make_layout(block_shape_trans, LayoutRight{});
auto tileShapeS = make_layout(block_shape, GenRowMajor{});
auto tileShapeD = make_layout(block_shape_trans, GenRowMajor{});

auto smemLayoutS = tileShapeS;
auto smemLayoutD = composition(smemLayoutS, tileShapeD);
auto smemLayoutS_swizzle = composition(Swizzle<5, 0, 5>{}, tileShapeS);
auto smemLayoutD_swizzle = composition(smemLayoutS_swizzle, tileShapeD);

auto threadLayoutS =
make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{});
make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{});
auto threadLayoutD =
make_layout(make_shape(Int<8>{}, Int<32>{}), LayoutRight{});
make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{});

size_t smem_size = int(
sizeof(SharedStorageTranspose<Element, decltype(smemLayoutS_swizzle)>));
Expand Down
6 changes: 3 additions & 3 deletions transpose-cute/include/transpose_tmastore_vectorized.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ template <typename Element> void transpose_tma(TransposeParams<Element> params)

auto tensor_shape = make_shape(params.M, params.N);
auto tensor_shape_trans = make_shape(params.N, params.M);
auto gmemLayoutS = make_layout(tensor_shape, LayoutRight{});
auto gmemLayoutD = make_layout(tensor_shape_trans, LayoutRight{});
auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{});
Tensor tensor_S = make_tensor(make_gmem_ptr(params.input), gmemLayoutS);
Tensor tensor_D = make_tensor(make_gmem_ptr(params.output), gmemLayoutD);

Expand All @@ -117,7 +117,7 @@ template <typename Element> void transpose_tma(TransposeParams<Element> params)
Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m')

auto threadLayoutS =
make_layout(make_shape(Int<32>{}, Int<8>{}), LayoutRight{});
make_layout(make_shape(Int<32>{}, Int<8>{}), GenRowMajor{});
auto vecLayoutS = make_layout(make_shape(Int<1>{}, Int<4>{}));
using AccessTypeS = cutlass::AlignedArray<Element, size(vecLayoutS)>;
using AtomS = Copy_Atom<UniversalCopy<AccessTypeS>, Element>;
Expand Down
2 changes: 1 addition & 1 deletion transpose-cute/include/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ template <typename T, bool isTranspose = true> int benchmark(void (*transpose)(T

int bad = 0;
if constexpr (isTranspose) {
auto transpose_function = make_layout(tensor_shape_S, LayoutRight{});
auto transpose_function = make_layout(tensor_shape_S, GenRowMajor{});
for (size_t i = 0; i < h_D.size(); ++i)
if (h_D[i] != h_S[transpose_function(i)])
bad++;
Expand Down