67 changes: 53 additions & 14 deletions src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu
@@ -39,6 +39,17 @@ static constexpr size_t buf_len = 1024;

__global__ void test_UBLKPF(int32_t *data, int run_iters)
{
// Shared memory buffer. The destination shared memory buffer of
// a bulk operation should be 16-byte aligned.
__shared__ alignas(16) int32_t smem_data[buf_len];
#pragma nv_diag_suppress static_var_with_dynamic_init
__shared__ barrier bar;
if (threadIdx.x == 0) {
init(&bar, blockDim.x);
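// Make initialized barrier visible in async proxy.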
ptx::fence_proxy_async(ptx::space_shared);
}
__syncthreads();

size_t offset = blockIdx.x * blockDim.x;

// Trigger a bulk prefetch
@@ -53,8 +64,29 @@ __global__ void test_UBLKPF(int32_t *data, int run_iters)
: "l"(prefetch_addr),
"r"(prefetch_count)
: "memory");
// Commit the prefetch into a bulk async-group and wait for the
// group's reads to complete.
ptx::cp_async_bulk_commit_group();
ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>());
}
__syncthreads();

// A short arithmetic loop to give the prefetch time to complete
for (int i = 0; i < 4096; i++) {
if (threadIdx.x == 0) {
smem_data[threadIdx.x] = i;
}
}
__syncthreads();

for (int i = 0; i < run_iters; i++) {
// Initiate TMA transfer to copy global to shared memory.
if (threadIdx.x == 0)
{
cuda::memcpy_async(
smem_data,
data + offset,
cuda::aligned_size_t<16>(sizeof(smem_data)),
bar);
}
barrier::arrival_token token = bar.arrive();
bar.wait(std::move(token));
}
}
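
An aside, not part of this patch: the explicit arrival_token arrive/wait pair in the loop above can be written more compactly with cuda::barrier's combined call. A minimal sketch, assuming the same smem_data, data, offset, bar, and run_iters as in test_UBLKPF:

for (int i = 0; i < run_iters; i++) {
  if (threadIdx.x == 0) {
    cuda::memcpy_async(smem_data, data + offset,
                       cuda::aligned_size_t<16>(sizeof(smem_data)), bar);
  }
  // arrive_and_wait() folds bar.arrive() and bar.wait(token) into one call;
  // it returns once every thread has arrived and the copy bound to this
  // barrier phase has completed.
  bar.arrive_and_wait();
}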

@@ -157,7 +189,8 @@ int main(int argc, char *argv[])
const char* opcode = "UBLKPF";
int opt;
int run_iters = DEFAULT_RUN_ITERS;
while ((opt = getopt(argc, argv, "n:o:i:")) != -1) {
bool dump_data = false;
while ((opt = getopt(argc, argv, "n:o:i:d")) != -1) {
switch (opt) {
case 'n':
n = atoi(optarg);
@@ -168,6 +201,9 @@
case 'i':
run_iters = atoi(optarg);
break;
case 'd':
dump_data = true;
break;
default:
fprintf(stderr, "Usage: %s -n <n> -o <opcode>\n", argv[0]);
fprintf(stderr, " -n <n>: number of elements\n");
@@ -177,6 +213,7 @@
fprintf(stderr, " -o UBLKCP_G_S: bulk copy global to shared\n");
fprintf(stderr, " -o UBLKRED_G_S: bulk reduce global to shared\n");
fprintf(stderr, " -i <run_iters>: number of iterations\n");
fprintf(stderr, " -d: dump data\n");
return 1;
}
}
@@ -250,18 +287,20 @@ int main(int argc, char *argv[])
ptr = h_b;
}

char filename[100];
sprintf(filename, "tma_bulk_test_%s_%d.txt", opcode, n);
FILE *f = fopen(filename, "w");
for (i = 0; i < n; i++)
{
fprintf(f, "0x%x ", ptr[i]);
// Add line break after every 512 values
if ((i + 1) % 512 == 0)
fprintf(f, "\n");
if (dump_data) {
char filename[100];
sprintf(filename, "tma_bulk_test_%s_%d.txt", opcode, n);
FILE *f = fopen(filename, "w");
for (i = 0; i < n; i++)
{
fprintf(f, "0x%x ", ptr[i]);
// Add line break after every 512 values
if ((i + 1) % 512 == 0)
fprintf(f, "\n");
}
fclose(f);
printf("Values dumped to %s\n", filename);
}
fclose(f);
printf("Values dumped to %s\n", filename);

// Release host memory
free(h_a);
114 changes: 90 additions & 24 deletions src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu
@@ -93,6 +93,26 @@ __global__ void test_kernel(const __grid_constant__ CUtensorMap tensor_map, int

__device__ void test_UTMAPF_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters) {
// TensorMap prefetch at tensor_map with tensor coord {x, y}

// The destination shared memory buffer of a bulk tensor operation should be
// 128-byte aligned.
__shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH];

// Initialize shared memory barrier with the number of threads participating in the barrier.
#pragma nv_diag_suppress static_var_with_dynamic_init
__shared__ barrier bar;

if (threadIdx.x == 0 && threadIdx.y == 0)
{
// Initialize barrier. All threads in block participate.
init(&bar, blockDim.x * blockDim.y);
// Make initialized barrier visible in async proxy.
ptx::fence_proxy_async(ptx::space_shared);
}
// Syncthreads so initialized barrier is visible to all threads.
__syncthreads();

// Trigger the prefetch
if (threadIdx.x == 0 && threadIdx.y == 0) {
asm volatile (
"cp.async.bulk.prefetch.tensor.2d.L2.global.tile"
@@ -103,6 +123,40 @@ __device__ void test_UTMAPF_kernel(CUtensorMap const& tensor_map, int x, int y,
"r"(y)
: "memory");
}
__syncthreads();

// Give the prefetch time to complete with a short arithmetic loop
for (int i = 0; i < 4096; i++) {
if (threadIdx.x == 0 && threadIdx.y == 0) {
smem_buffer[threadIdx.y][threadIdx.x] = i;
}
}
__syncthreads();

// Use a subsequent TMA load to test the prefetch
for (int i = 0; i < run_iters; i++) {
barrier::arrival_token token;
if (threadIdx.x == 0 && threadIdx.y == 0) {
// Initiate bulk tensor copy.
ptx::cp_async_bulk_tensor(
ptx::space_cluster,
ptx::space_global,
&smem_buffer,
&tensor_map,
{x, y},
cuda::device::barrier_native_handle(bar)
);
// Arrive on the barrier and tell how many bytes are expected to come in.
token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buffer));
}
else
{
// Other threads just arrive.
token = bar.arrive();
}
// Wait for the data to have arrived.
bar.wait(std::move(token));
}
}

__device__ void test_UTMALDG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters) {
@@ -209,30 +263,35 @@ __device__ void test_UTMAREDG_kernel(CUtensorMap const& tensor_map, int x, int y

__device__ void test_REGULAR_LOAD_kernel(int *mat, int x, int y, int width_stride, int run_iters) {
__shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH];

// Compute a unique value for the thread
int thread_x = threadIdx.x + x;
int thread_y = threadIdx.y + y;
// Mimic a TMA load pattern here
for (int i = 0; i < run_iters; i++) {
if (threadIdx.x == 0 && threadIdx.y == 0) {
for (int row = 0; row < SMEM_HEIGHT; row++) {
for (int col = 0; col < SMEM_WIDTH; col++) {
smem_buffer[row][col] = mat[(y + row) * width_stride + (x + col)] + 1;
int tmp;
// Bypassing L1 cache here
// asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(tmp) : "l"(mat + (y + row) * width_stride + (x + col)));
// Not bypassing the L1 cache here; requests are merged in the L1 MSHRs
tmp = mat[(y + row) * width_stride + (x + col)];
smem_buffer[row][col] = tmp;
}
}
}
__syncthreads();
// Mimic a TMA store pattern here to make compiler happy
if (threadIdx.x == 0 && threadIdx.y == 0) {
for (int row = 0; row < SMEM_HEIGHT; row++) {
for (int col = 0; col < SMEM_WIDTH; col++) {
mat[(y + row) * width_stride + (x + col)] = smem_buffer[row][col];
}
}

// Add a sink operation that writes to global memory so the compiler
// does not optimize the shared-memory loads away
if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
int sum = 0;
for (int i = 0; i < SMEM_HEIGHT; i++) {
for (int j = 0; j < SMEM_WIDTH; j++) {
sum += smem_buffer[i][j];
}
}
__syncthreads();
mat[0] = sum;
}
__syncthreads();
}
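
An aside, not part of this patch: the commented-out ld.global.cg inline assembly above can also be expressed with the __ldcg() intrinsic, which issues the same load cached at the global (L2) level, bypassing L1. A minimal sketch, assuming the same mat, width_stride, x, y, row, and col as in test_REGULAR_LOAD_kernel:

// __ldcg maps to ld.global.cg: the load is cached in L2 and bypasses L1.
int tmp = __ldcg(mat + (y + row) * width_stride + (x + col));
smem_buffer[row][col] = tmp;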

PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
@@ -254,7 +313,8 @@ int main(int argc, char *argv[]) {
TestType test_type = TestType::UTMAPF;
int run_iters = DEFAULT_RUN_ITERS;
int opt;
while ((opt = getopt(argc, argv, "w:h:o:i:")) != -1) {
bool dump_data = false;
while ((opt = getopt(argc, argv, "w:h:o:i:d")) != -1) {
switch (opt) {
case 'w':
width = uint64_t(atoi(optarg));
@@ -268,6 +328,9 @@
case 'i':
run_iters = atoi(optarg);
break;
case 'd':
dump_data = true;
break;
default:
fprintf(stderr, "Usage: %s -w <width> -h <height> -o <opcode>\n", argv[0]);
fprintf(stderr, " Block size: %d x %d\n", SMEM_WIDTH, SMEM_HEIGHT);
@@ -279,6 +342,7 @@
fprintf(stderr, " -o UTMASTG: tensor store async\n");
fprintf(stderr, " -o UTMAREDG: tensor reduce async\n");
fprintf(stderr, " -i <run_iters>: number of iterations\n");
fprintf(stderr, " -d: dump data\n");
return 1;
}
}
@@ -379,18 +443,20 @@ int main(int argc, char *argv[]) {
ptr = out_mat;
}

char filename[100];
sprintf(filename, "tma_tensor_test_%s_%lu_%lu.txt", opcode.c_str(), height, width);
FILE *f = fopen(filename, "w");
for (int i = 0; i < height_stride * width_stride; i++)
{
fprintf(f, "0x%x ", ptr[i]);
// Add line break after every 512 values
if ((i + 1) % 512 == 0)
fprintf(f, "\n");
if (dump_data) {
char filename[100];
sprintf(filename, "tma_tensor_test_%s_%lu_%lu.txt", opcode.c_str(), height, width);
FILE *f = fopen(filename, "w");
for (int i = 0; i < height_stride * width_stride; i++)
{
fprintf(f, "0x%x ", ptr[i]);
// Add line break after every 512 values
if ((i + 1) % 512 == 0)
fprintf(f, "\n");
}
fclose(f);
printf("Values dumped to %s\n", filename);
}
fclose(f);
printf("Values dumped to %s\n", filename);

// Release device memory
cudaFree(d_mat);
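
An aside, not part of this patch: tma_tensor.cu fetches the cuTensorMapEncodeTiled entry point through get_cuTensorMapEncodeTiled(), but the encode call itself is outside the hunks shown above. A hedged host-side sketch of how a 2D int tensor map matching the SMEM_WIDTH x SMEM_HEIGHT shared-memory tile could be built; make_tile_map and its parameter names are illustrative, not the benchmark's actual code:

#include <cuda.h>          // CUtensorMap and the CU_TENSOR_MAP_* enums
#include <cudaTypedefs.h>  // PFN_cuTensorMapEncodeTiled_v12000

// Sketch only: encodes a rank-2 tensor map over an int matrix in global
// memory, with a box (tile) that matches the kernel's shared-memory buffer.
static CUtensorMap make_tile_map(void *d_mat,
                                 cuuint64_t gmem_width, cuuint64_t gmem_height,
                                 cuuint32_t smem_width, cuuint32_t smem_height,
                                 PFN_cuTensorMapEncodeTiled_v12000 encode)
{
    CUtensorMap tensor_map{};
    cuuint64_t global_dim[2]    = {gmem_width, gmem_height};      // fastest-varying dimension first
    cuuint64_t global_stride[1] = {gmem_width * sizeof(int)};     // row pitch in bytes (rank - 1 entries)
    cuuint32_t box_dim[2]       = {smem_width, smem_height};      // tile moved per bulk tensor request
    cuuint32_t elem_stride[2]   = {1, 1};
    encode(&tensor_map, CU_TENSOR_MAP_DATA_TYPE_INT32, /*tensorRank=*/2, d_mat,
           global_dim, global_stride, box_dim, elem_stride,
           CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
           CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    return tensor_map;
}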
26 changes: 13 additions & 13 deletions src/setup_environment
@@ -9,7 +9,7 @@ else
SCRIPT_PATH="$0"
fi
echo SCRIPT_PATH=$SCRIPT_PATH
export GPUAPPS_ROOT="$( cd "$( dirname "$SCRIPT_PATH" )" && pwd )"/../
export GPUAPPS_ROOT="$( cd "$( dirname "$SCRIPT_PATH" )" >/dev/null 2>&1 && pwd )"/../
echo GPUAPPS_ROOT=$GPUAPPS_ROOT
export CUDA_PATH=$CUDA_INSTALL_PATH

@@ -172,23 +172,23 @@ export GENCODE_FLAGS=$CUDA_CPPFLAGS
export CUDA_CPPFLAGS="$CUDA_CPPFLAGS $NVCC_ADDITIONAL_ARGS"
echo MAKE_ARGS=$MAKE_ARGS

if [ ! -d $GPUAPPS_ROOT/4.2/ ]; then
if [ ! -d "$GPUAPPS_ROOT/4.2/" ]; then
echo "SDK 4.2 Not detected - installing and building"
if [ ! -f $GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run ]; then
if [ ! -f "$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run" ]; then
wget http://developer.download.nvidia.com/compute/cuda/4_2/rel/sdk/gpucomputingsdk_4.2.9_linux.run -O \
$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run
"$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run"
fi
chmod u+x $GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run
$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run -- --prefix=$GPUAPPS_ROOT/4.2 --cudaprefix=$CUDA_INSTALL_PATH
export NVIDIA_COMPUTE_SDK_LOCATION=$GPUAPPS_ROOT/4.2
rm $GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run
make -C $NVIDIA_COMPUTE_SDK_LOCATION/shared
make -C $NVIDIA_COMPUTE_SDK_LOCATION/C/common
make -C $NVIDIA_COMPUTE_SDK_LOCATION/C/common -f Makefile_paramgl
make -C $NVIDIA_COMPUTE_SDK_LOCATION/C/common -f Makefile_rendercheckgl
chmod u+x "$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run"
"$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run" -- --prefix="$GPUAPPS_ROOT/4.2" --cudaprefix="$CUDA_INSTALL_PATH"
export NVIDIA_COMPUTE_SDK_LOCATION="$GPUAPPS_ROOT/4.2"
rm "$GPUAPPS_ROOT/gpucomputingsdk_4.2.9_linux.run"
make -C "$NVIDIA_COMPUTE_SDK_LOCATION/shared"
make -C "$NVIDIA_COMPUTE_SDK_LOCATION/C/common"
make -C "$NVIDIA_COMPUTE_SDK_LOCATION/C/common" -f Makefile_paramgl
make -C "$NVIDIA_COMPUTE_SDK_LOCATION/C/common" -f Makefile_rendercheckgl
else
echo "SDK 4.2 detected"
export NVIDIA_COMPUTE_SDK_LOCATION=$GPUAPPS_ROOT/4.2
export NVIDIA_COMPUTE_SDK_LOCATION="$GPUAPPS_ROOT/4.2"
fi
if [ ! -n "$CUDA_INSTALL_PATH" ]; then
echo "ERROR *** CUDA_INSTALL_PATH not set;"