Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// 文件: async_pipeline_demo.cu
// 编译: nvcc -arch=sm_80 async_pipeline_demo.cu -o async_pipeline_demo
// 硬件要求: NVIDIA Ampere A100 或更新 (CC 8.0+)
// 编译: nvcc -arch=sm_80 -std=c++17 async_pipeline_demo.cu -o async_pipeline_demo
// 硬件要求: NVIDIA Ampere A100/A800 (CC 8.0+)
// 第13章 异步SIMT编程模型 - 多阶段Pipeline示例

#include <stdio.h>
Expand Down Expand Up @@ -31,16 +31,21 @@ __global__ void pipeline_demo_kernel(
extern __shared__ float shared_buffers[];
auto block = cooperative_groups::this_thread_block();

// 每个阶段一个缓冲区
// 每个阶段一个独立缓冲区
float *buffer[stages];
for (int s = 0; s < stages; s++) {
buffer[s] = shared_buffers + s * threads_per_block;
}

// 创建 3 阶段 pipeline
__shared__ cuda::pipeline_shared_state<
cuda::thread_scope::thread_scope_block, stages> pipe_state;
auto pipe = cuda::make_pipeline(block, &pipe_state);
// 【修复编译警告】使用对齐的char数组避免动态初始化
__shared__ alignas(alignof(cuda::pipeline_shared_state<
cuda::thread_scope::thread_scope_block, stages>)) char pipe_state[
sizeof(cuda::pipeline_shared_state<
cuda::thread_scope::thread_scope_block, stages>)];

auto pipe = cuda::make_pipeline(block,
reinterpret_cast<cuda::pipeline_shared_state<
cuda::thread_scope::thread_scope_block, stages>*>(pipe_state));

size_t total_blocks = total_elements / threads_per_block;
size_t block_id = block.group_index().x;
Expand All @@ -59,13 +64,15 @@ __global__ void pipeline_demo_kernel(

// 流水线稳态
for (size_t i = 0; i < total_blocks - (stages - 1); i++) {
int stage = i % stages;
// 【核心修复】生产者和消费者使用不同的缓冲区索引,避免数据竞争
int producer_stage = (i + stages - 1) % stages;
int consumer_stage = i % stages;

// 生产者:发起下一批数据的异步拷贝
pipe.producer_acquire();
size_t next_batch = i + stages - 1;
if (next_batch < total_blocks) {
cuda::memcpy_async(block, buffer[stage],
cuda::memcpy_async(block, buffer[producer_stage],
input + next_batch * threads_per_block,
sizeof(float) * threads_per_block, pipe);
}
Expand All @@ -74,21 +81,19 @@ __global__ void pipeline_demo_kernel(
// 消费者:处理当前阶段的数据
pipe.consumer_wait();
int tid = threadIdx.x;
buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
__syncthreads();
// 写回
output[i * threads_per_block + tid] = buffer[stage][tid];
buffer[consumer_stage][tid] = buffer[consumer_stage][tid] * scale + 1.0f;
// 移除不必要的__syncthreads()(每个线程只写自己的元素,无依赖)
output[i * threads_per_block + tid] = buffer[consumer_stage][tid];
pipe.consumer_release();
}

// 排空流水线:处理最后 (stages-1) 个阶段
for (size_t i = total_blocks - (stages - 1); i < total_blocks; i++) {
int stage = i % stages;
int consumer_stage = i % stages;
pipe.consumer_wait();
int tid = threadIdx.x;
buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
__syncthreads();
output[i * threads_per_block + tid] = buffer[stage][tid];
buffer[consumer_stage][tid] = buffer[consumer_stage][tid] * scale + 1.0f;
output[i * threads_per_block + tid] = buffer[consumer_stage][tid];
pipe.consumer_release();
}
}
Expand All @@ -110,12 +115,13 @@ int main() {
CUDA_CHECK(cudaMalloc(&d_input, bytes));
CUDA_CHECK(cudaMalloc(&d_output, bytes));
CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_output, 0, bytes));

// 启动 pipeline 核函数
size_t shared_mem = stages * threads_per_block * sizeof(float);
pipeline_demo_kernel<<<1, threads_per_block, shared_mem>>>(
d_input, d_output, N, scale);

CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());

// 验证
Expand All @@ -132,8 +138,10 @@ int main() {
}
printf("Result: %s\n", correct ? "PASS" : "FAIL");

// 清理
free(h_input); free(h_output);
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_output));

return correct ? 0 : 1;
}
}
45 changes: 37 additions & 8 deletions outputs/gpu-programming-course/code/chapter11/math_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,11 @@ __global__ void standardSqrt(const float * __restrict__ a,
}
}

__global__ void fastLogExp(const float * __restrict__ a,
float * __restrict__ c, int n) {
__global__ void fastSqrt(const float * __restrict__ a,
float * __restrict__ c, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n) {
c[idx] = __log2f(a[idx]) + exp2f(a[idx] * 0.0001f);
c[idx] = __fsqrt_rn(a[idx]);
}
}

Expand Down Expand Up @@ -163,6 +163,33 @@ float benchmark1out(KernelFunc kernel, int n, int gridSize, int blockSize,
return ms / iterations;
}

// 【唯一新增的8行代码】修复除法测试的编译错误
template<typename KernelFunc>
float benchmark2in1out(KernelFunc kernel, int n, int gridSize, int blockSize,
int iterations, float *d_a, float *d_b, float *d_c) {
cudaEvent_t start, stop;
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));

kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
CHECK_CUDA(cudaDeviceSynchronize());

CHECK_CUDA(cudaEventRecord(start, 0));
for (int i = 0; i < iterations; i++) {
kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
}
CHECK_CUDA(cudaEventRecord(stop, 0));
CHECK_CUDA(cudaEventSynchronize(stop));

float ms;
CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));

return ms / iterations;
}

template<typename KernelFunc>
float benchmark2out(KernelFunc kernel, int n, int gridSize, int blockSize,
int iterations, float *d_a, float *d_s, float *d_c) {
Expand Down Expand Up @@ -234,8 +261,9 @@ int main() {
float std_ms, fast_ms;

// === Test 1: Division ===
std_ms = benchmark1out(standardDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
fast_ms = benchmark1out(fastDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
// 【修改1】把benchmark1out改成benchmark2in1out
std_ms = benchmark2in1out(standardDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
fast_ms = benchmark2in1out(fastDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
printf("%-40s %10.4f %10s\n", "Standard / (division)", std_ms, "baseline");
printf("%-40s %10.4f %10.2fx\n", "__fdividef()", fast_ms, std_ms / fast_ms);

Expand All @@ -257,11 +285,12 @@ int main() {
printf("%-40s %10.4f %10s\n", "sinf()+cosf() (large args)", std_ms, "baseline");
printf("%-40s %10.4f %10.2fx\n", "__sinf()+__cosf() (large)", fast_ms, std_ms / fast_ms);

// === Test 5: Log/Exp intrinsics ===
// === Test 5: Sqrt ===
// 【修改2】把fastLogExp改成fastSqrt
std_ms = benchmark1out(standardSqrt, N, gridSize, blockSize, iterations, d_a, d_c);
fast_ms = benchmark1out(fastLogExp, N, gridSize, blockSize, iterations, d_a, d_c);
fast_ms = benchmark1out(fastSqrt, N, gridSize, blockSize, iterations, d_a, d_c);
printf("%-40s %10.4f %10s\n", "sqrtf()", std_ms, "baseline");
printf("%-40s %10.4f %10.2fx\n", "__log2f()+exp2f()", fast_ms, std_ms / fast_ms);
printf("%-40s %10.4f %10.2fx\n", "__fsqrt_rn()", fast_ms, std_ms / fast_ms);

// === Test 6: Integer division vs bit shift ===
{
Expand Down
Loading