diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/mbarrier.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/mbarrier.cu
index 7f4276783..b2fb086b8 100644
--- a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/mbarrier.cu
+++ b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/mbarrier.cu
@@ -30,7 +30,7 @@ namespace ptx = cuda::ptx;
     } \
   }
 
-__global__ __noinline__ void test_mbarrier_kernel() {
+__global__ __noinline__ void test_mbarrier_kernel(uint64_t *sink) {
   // mbarrier object is 64bit in shared memory
   __shared__ uint64_t mbarrier;
   uint64_t state;
@@ -38,49 +38,104 @@ __global__ __noinline__ void test_mbarrier_kernel() {
   // Initialize the mbarrier with PTX asm
   int block_size = blockDim.x * blockDim.y;
   if (threadIdx.x == 0 && threadIdx.y == 0) {
-    asm("mbarrier.init.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(block_size) : "memory");
+    asm("mbarrier.init.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(0x1234) : "memory");
   }
   __syncthreads();
 
   // Expect on the mbarrier
 #if __CUDA_ARCH__ >= 900
   int bytes_per_thread = 4;
+  asm("barrier.sync 0;");
+  // SYNCS.ARRIVE.TRANS64.RED.A0TR
   asm("mbarrier.expect_tx.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(bytes_per_thread) : "memory");
-  __syncthreads();
+  asm("mbarrier.expect_tx.shared::cluster.b64 [%0], %1;" : : "l"(&mbarrier), "r"(2*bytes_per_thread) : "memory");
 
   // Complete on the mbarrier
-  asm("mbarrier.complete_tx.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(bytes_per_thread) : "memory");
-  __syncthreads();
+  asm("barrier.sync 1;");
+  // SYNCS.ARRIVE.TRANS64.RED.A0TX
+  asm("mbarrier.complete_tx.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(3*bytes_per_thread) : "memory");
 
   // All threads in the block arrive on the mbarrier
-  asm("mbarrier.arrive.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(1) : "memory");
-  __syncthreads();
+  asm("barrier.sync 2;");
+  // SYNCS.ARRIVE.TRANS64.RED.A1T0 (no count specified, return value is not used)
+  asm("mbarrier.arrive.b64 %0, [%1];" : "=l"(state) : "l"(&mbarrier) : "memory");
+  asm("mbarrier.arrive.b64 _, [%0];" :: "l"(&mbarrier) : "memory");
+  // SYNCS.ARRIVE.TRANS64.A1T0 (return value is used)
+  asm("mbarrier.arrive.b64 %0, [%1];" : "=l"(state) : "l"(&mbarrier) : "memory");
+  *sink += state;
+  // SYNCS.ARRIVE.TRANS64.RED.ART0 (return value state is not used)
+  asm("mbarrier.arrive.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(4) : "memory");
+  asm("mbarrier.arrive.b64 _, [%0], %1;" : : "l"(&mbarrier), "n"(4) : "memory");
+  // SYNCS.ARRIVE.TRANS64.ART0 (return value state is used)
+  asm("mbarrier.arrive.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(5) : "memory");
+  *sink += state;
+
+  // ARRIVE with no complete
+  asm("barrier.sync 3;");
+  // SYNCS.ARRIVE.TRANS64.RED.ART0
+  asm("mbarrier.arrive.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(5) : "memory");
+  // SYNCS.ARRIVE.TRANS64.TMASK.ART0 (return value is used)
+  asm("mbarrier.arrive.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(6) : "memory");
+  *sink += state;
 
   // Arrive and expect on the mbarrier
-  asm("mbarrier.arrive.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(2) : "memory");
-  __syncthreads();
+  asm("barrier.sync 4;");
+  // SYNCS.ARRIVE.TRANS64.RED (return value is not used)
+  asm("mbarrier.arrive.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(6) : "memory");
+  asm("mbarrier.arrive.expect_tx.b64 _, [%0], %1;" :: "l"(&mbarrier), "n"(6) : "memory");
+  // SYNCS.ARRIVE.TRANS64 (return value is used)
+  asm("mbarrier.arrive.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(6) : "memory");
+  *sink += state;
 
   // Arrive and drop
+  asm("barrier.sync 5;");
+  // SYNCS.ARRIVE.TRANS64.RED.OPTOUT.A1T0 (not using return value)
+  asm("mbarrier.arrive_drop.b64 %0, [%1];" : "=l"(state) : "l"(&mbarrier) : "memory");
+  asm("mbarrier.arrive_drop.b64 _, [%0];" :: "l"(&mbarrier) : "memory");
+  // SYNCS.ARRIVE.TRANS64.OPTOUT.A1T0 (using return value)
+  asm("mbarrier.arrive_drop.b64 %0, [%1];" : "=l"(state) : "l"(&mbarrier) : "memory");
+  *sink += state;
+
+  // SYNCS.ARRIVE.TRANS64.RED.OPTOUT.ART0 (not using return value)
   asm("mbarrier.arrive_drop.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory");
-  __syncthreads();
+  asm("mbarrier.arrive_drop.b64 _, [%0], %1;" :: "l"(&mbarrier), "n"(3) : "memory");
+  // SYNCS.ARRIVE.TRANS64.OPTOUT.ART0 (using return value)
+  asm("mbarrier.arrive_drop.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory");
+  *sink += state;
+
+  asm("barrier.sync 6;");
+  // SYNCS.ARRIVE.TRANS64.RED.OPTOUT.ART0
+  asm("mbarrier.arrive_drop.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory");
+  asm("mbarrier.arrive_drop.noComplete.b64 _, [%0], %1;" :: "l"(&mbarrier), "n"(3) : "memory");
+  // SYNCS.ARRIVE.TRANS64.TMASK.OPTOUT.ART0
+  asm("mbarrier.arrive_drop.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory");
+  *sink += state;
+
 #else
   // For sm_80
   // All threads in the block arrive on the mbarrier
+  asm("barrier.sync 5;");
   asm("mbarrier.arrive.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(1) : "memory");
-  __syncthreads();
 
-  // Arrive and drop
+  // Arrive and drop with no complete
+  asm("barrier.sync 6;");
   asm("mbarrier.arrive_drop.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory");
-  __syncthreads();
 #endif
 
 #if __CUDA_ARCH__ >= 900
-  // Arrive and drop
+  // Arrive and drop expect tx
+  asm("barrier.sync 7;");
+  // SYNCS.ARRIVE.TRANS64.RED.OPTOUT
   asm("mbarrier.arrive_drop.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(4) : "memory");
-  __syncthreads();
+  asm("mbarrier.arrive_drop.expect_tx.b64 _, [%0], %1;" :: "l"(&mbarrier), "n"(5) : "memory");
+  // SYNCS.ARRIVE.TRANS64.OPTOUT RA, [URB], RC; RC is the transaction count
+  asm("mbarrier.arrive_drop.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(6) : "memory");
+  *sink += state;
 #endif
 
   // Get pending count
+  asm("barrier.sync 8;");
+  // This is just a sequence of arithmetic operations on the opaque state variable
   asm("mbarrier.pending_count.b64 %0, %1;" : "=r"(count) : "l"(state) : "memory");
   // Prevent optimizing away
   if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -89,31 +144,90 @@ __global__ __noinline__ void test_mbarrier_kernel() {
   __syncthreads();
 
   // cp async barrier arrive
+  asm("barrier.sync 9;");
+  // SYNCS.ARRIVE.TRANS64.RED.A0T1 RZ, [URB], RZ
+  // ARRIVES.LDGSTSBAR.64.TRANSCNT [URB]
   asm("cp.async.mbarrier.arrive.shared::cta.b64 [%0];" : : "l"(&mbarrier) : "memory");
-  __syncthreads();
+  asm("barrier.sync 10;");
+  // ARRIVES.LDGSTSBAR.64.ARVCNT [URB]
   asm("cp.async.mbarrier.arrive.noinc.shared::cta.b64 [%0];" : : "l"(&mbarrier) : "memory");
-  __syncthreads();
 
   // Wait on the mbarrier
 #if __CUDA_ARCH__ >= 900
+  asm("barrier.sync 11;");
+  // SYNCS.PHASECHK.TRANS64 PT, [UR4], RZ
   asm ("\n\t"
       ".reg .pred complete;\n\t"
      "mbarrier.test_wait.parity.b64 complete, [%0], %1;"
      : : "l"(&mbarrier), "n"(0) : "memory"
   );
-  __syncthreads();
+  asm("barrier.sync 12;");
+  // SYNCS.PHASECHK.TRANS64 PT, [UR4], R0
+  asm ("\n\t"
+     "mbarrier.test_wait.parity.b64 complete, [%0], %1;"
+     : : "l"(&mbarrier), "n"(1) : "memory"
+  );
+  asm("barrier.sync 0;");
+  // SYNCS.PHASECHK.TRANS64.TRYWAIT PT, [UR4], RZ
   asm ("\n\t"
      "mbarrier.try_wait.parity.b64 complete, [%0], %1;"
      : : "l"(&mbarrier), "n"(0) : "memory"
   );
+  asm("barrier.sync 1;");
+  // SYNCS.PHASECHK.TRANS64.TRYWAIT PT, [UR4], R0
+  asm ("\n\t"
+     "mbarrier.try_wait.parity.b64 complete, [%0], %1;"
+     : : "l"(&mbarrier), "n"(1) : "memory"
+  );
+  asm("barrier.sync 2;");
+  // SYNCS.PHASECHK.TRANS64.TRYWAIT PT, [R4+URZ], RZ
+  // @!PT NANOSLEEP.SYNCS 0x1234
+  asm ("\n\t"
+     "mbarrier.try_wait.parity.b64 complete, [%0], %1, %2;"
+     : : "l"(&mbarrier), "n"(0), "n"(0x1234) : "memory"
+  );
+  asm("barrier.sync 3;");
+  // SYNCS.PHASECHK.TRANS64.TRYWAIT PT, [R5+URZ], R3
+  // @!PT NANOSLEEP.SYNCS 0x4321
+  // @!PT SYNCS.PHASECHK.TRANS64 PT, [R5+URZ], R3 ;
+  asm ("\n\t"
+     "mbarrier.try_wait.parity.b64 complete, [%0], %1, %2;"
+     : : "l"(&mbarrier), "n"(1), "n"(0x4321) : "memory"
+  );
   __syncthreads();
+
+  state = 0;
+  asm("barrier.sync 4;");
+  // SYNCS.PHASECHK.TRANS64 PT, [UR4], RZ
+  asm ("\n\t"
+      ".reg .pred complete2;\n\t"
+     "mbarrier.test_wait.b64 complete2, [%0], %1;"
+     : : "l"(&mbarrier), "l"(state) : "memory"
+  );
+  asm("barrier.sync 5;");
+  // SYNCS.PHASECHK.TRANS64.TRYWAIT PT, [UR4], RZ
+  asm ("\n\t"
+     "mbarrier.try_wait.b64 complete2, [%0], %1;"
+     : : "l"(&mbarrier), "l"(state) : "memory"
+  );
+  asm("barrier.sync 6;");
+  // SYNCS.PHASECHK.TRANS64.TRYWAIT PT, [R6+URZ], RZ
+  // @!PT NANOSLEEP.SYNCS 0x1234
+  // @!PT SYNCS.PHASECHK.TRANS64 PT, [R6+URZ], RZ ;
+  asm ("\n\t"
+     "mbarrier.try_wait.b64 complete2, [%0], %1, %2;"
+     : : "l"(&mbarrier), "l"(state), "n"(0x1234) : "memory"
+  );
 #endif
 }
 
 int main(int argc, char *argv[]) {
 
-  CUDA_SAFECALL((test_mbarrier_kernel<<<1, 1>>>()));
-  CUDA_SAFECALL(cudaDeviceSynchronize());
+  printf("This is a test program meant to compare the mbarrier PTX-to-SASS mapping; it is not tested for functionality and is not expected to run to completion\n");
+  // uint64_t *sink;
+  // cudaMalloc(&sink, sizeof(uint64_t));
+  // CUDA_SAFECALL((test_mbarrier_kernel<<<1, 1>>>(sink)));
+  // CUDA_SAFECALL(cudaDeviceSynchronize());
 
-  printf("Mbarrier test completed\n");
+  // printf("Mbarrier test completed\n");
   return 0;
 }