-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtiled_gemm.cu
More file actions
69 lines (56 loc) · 2.2 KB
/
tiled_gemm.cu
File metadata and controls
69 lines (56 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// gemm_tiled.cu ── shared-memory tiled GEMM, one NVTX range
#include <cuda_runtime.h>
#include <nvToolsExt.h>
#include <iostream>
#include <chrono>
#ifndef BLOCK
#define BLOCK 32
#endif
__global__ void matmul_tiled(const float* __restrict__ A,
const float* __restrict__ B,
float* C, int N)
{
__shared__ float As[BLOCK][BLOCK];
__shared__ float Bs[BLOCK][BLOCK];
int row = blockIdx.y * BLOCK + threadIdx.y;
int col = blockIdx.x * BLOCK + threadIdx.x;
float acc = 0.f;
for (int t = 0; t < N; t += BLOCK) {
As[threadIdx.y][threadIdx.x] = A[row * N + t + threadIdx.x];
Bs[threadIdx.y][threadIdx.x] = B[(t + threadIdx.y) * N + col];
__syncthreads();
for (int k = 0; k < BLOCK; ++k)
acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
__syncthreads();
}
if (row < N && col < N) C[row * N + col] = acc;
}
int main(int argc, char** argv)
{
const int N = (argc > 1) ? std::stoi(argv[1]) : 1024;
const size_t bytes = size_t(N) * N * sizeof(float);
float *hA, *hB, *hC;
cudaHostAlloc(&hA, bytes, cudaHostAllocDefault);
cudaHostAlloc(&hB, bytes, cudaHostAllocDefault);
cudaHostAlloc(&hC, bytes, cudaHostAllocDefault);
for (int i = 0; i < N * N; ++i) hA[i] = hB[i] = 1.f;
float *dA, *dB, *dC;
cudaMalloc(&dA, bytes); cudaMalloc(&dB, bytes); cudaMalloc(&dC, bytes);
dim3 block(BLOCK, BLOCK);
dim3 grid((N + BLOCK - 1) / BLOCK, (N + BLOCK - 1) / BLOCK);
auto start = std::chrono::high_resolution_clock::now();
cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
nvtxRangePushA("gemm tiled kernel");
matmul_tiled<<<grid, block>>>(dA, dB, dC, N);
nvtxRangePop();
cudaDeviceSynchronize();
cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "Elapsed time: " << elapsed.count() << " seconds\n";
std::cout << "C[0] = " << hC[0] << '\n';
cudaFree(dA); cudaFree(dB); cudaFree(dC);
cudaFreeHost(hA); cudaFreeHost(hB); cudaFreeHost(hC);
return 0;
}