Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions outputs/gpu-programming-course/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,20 +77,20 @@ nvcc -o vector_add vector_add.cu

课程配套的在线评测系统位于 `outputs/eval_system/`,支持代码提交、自动编译、NCU性能分析和报告下载。详见该目录下的 `design.md`。

## 项目结构
## 项目主要结构

```
gpu-programming-course/
├── README.md # 本文件
├── docs/ # 课程文档
│ ├── _sidebar.md # 侧边栏导航
── 前言.md # 前言
│ ├── images/ # 图片资源
│ ├── chapter1/ ~ chapter11/ # 基础篇章节
── advanced-chapter1/ ~ 6/ # 进阶篇章节
├── code/ # 随章代码
├── Extra-Chapter/ # 补充资料与参考答案
└── outputs/ # 规划文档与评测系统
gpu-programming-guide/
├── cold-start/ # 冷启动知识库与参考资料
├── outputs/ # 课程产出与规划文档
│ ├── eval_system/ # 在线评测系统设计与实现
── gpu-programming-course/ # 课程主体内容
├── code/ # 各章节 CUDA 示例代码
├── docs/ # 各章节课程文档
│ ├── images/ # 文档图片资源
└── README.md # 文档目录说明
└── syllabus/ # 课程大纲与教学计划
└── README.md # 项目说明文档
```

## 贡献指南
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,75 +33,78 @@ __global__ void pipeline_demo_kernel(

// 每个阶段一个缓冲区
float *buffer[stages];
for (int s = 0; s < stages; s++) {
for (int s = 0; s < stages; ++s) {
buffer[s] = shared_buffers + s * threads_per_block;
}

// 创建 3 阶段 pipeline
// 创建 pipeline 状态
__shared__ cuda::pipeline_shared_state<
cuda::thread_scope::thread_scope_block, stages> pipe_state;
auto pipe = cuda::make_pipeline(block, &pipe_state);

size_t total_blocks = total_elements / threads_per_block;
size_t block_id = block.group_index().x;
if (block_id != 0) return; // 只用一个块演示

// 只有第一个块执行(简化演示)
if (block_id != 0) return;

// 预热流水线:填充前 (stages-1) 个阶段
for (int s = 0; s < stages - 1; s++) {
// 预热:填充前 (stages-1) 个批次
for (int s = 0; s < stages - 1; ++s) {
pipe.producer_acquire();
cuda::memcpy_async(block, buffer[s],
input + s * threads_per_block,
sizeof(float) * threads_per_block, pipe);
pipe.producer_commit();
}

// 流水线稳态
for (size_t i = 0; i < total_blocks - (stages - 1); i++) {
int stage = i % stages;
// 流水线状态 :消费 + 生产
for (size_t i = 0; i < total_blocks - (stages - 1); ++i) {
// 1. 等待当前批次就绪(消费者)
pipe.consumer_wait();
int tid = threadIdx.x;
int cons_buf_idx = i % stages; // 当前批次应该所在的缓冲区
float *curr_buf = buffer[cons_buf_idx];

// 计算(每个线程独立处理自己的元素)
float val = curr_buf[tid] * scale + 1.0f;
// 写回全局内存
output[i * threads_per_block + tid] = val;

// 生产者:发起下一批数据的异步拷贝
pipe.consumer_release(); // 释放当前缓冲区,允许生产者复用

// 2. 为未来批次准备数据(生产者)
pipe.producer_acquire();
size_t next_batch = i + stages - 1;
size_t next_batch = i + stages - 1; // 要准备的下一个批次索引
if (next_batch < total_blocks) {
cuda::memcpy_async(block, buffer[stage],
int prod_buf_idx = (i + stages - 1) % stages; // 正确的目标缓冲区索引
cuda::memcpy_async(block, buffer[prod_buf_idx],
input + next_batch * threads_per_block,
sizeof(float) * threads_per_block, pipe);
}
pipe.producer_commit();

// 消费者:处理当前阶段的数据
pipe.consumer_wait();
int tid = threadIdx.x;
buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
__syncthreads();
// 写回
output[i * threads_per_block + tid] = buffer[stage][tid];
pipe.consumer_release();
}

// 排空流水线:处理最后 (stages-1) 个阶段
for (size_t i = total_blocks - (stages - 1); i < total_blocks; i++) {
int stage = i % stages;
// 排空:处理最后 (stages-1) 个批次
for (size_t i = total_blocks - (stages - 1); i < total_blocks; ++i) {
pipe.consumer_wait();
int tid = threadIdx.x;
buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
__syncthreads();
output[i * threads_per_block + tid] = buffer[stage][tid];
int cons_buf_idx = i % stages;
float *curr_buf = buffer[cons_buf_idx];

float val = curr_buf[tid] * scale + 1.0f;
output[i * threads_per_block + tid] = val;

pipe.consumer_release();
}
}

int main() {
const size_t N = threads_per_block * 100; // 100个批次
const size_t N = threads_per_block * 100;
const size_t bytes = N * sizeof(float);
const float scale = 2.0f;

// 主机内存
// 主机数据
float *h_input = (float *)malloc(bytes);
float *h_output = (float *)malloc(bytes);
for (size_t i = 0; i < N; i++) {
for (size_t i = 0; i < N; ++i) {
h_input[i] = (float)(i % 100) / 100.0f;
}

Expand All @@ -111,21 +114,19 @@ int main() {
CUDA_CHECK(cudaMalloc(&d_output, bytes));
CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));

// 启动 pipeline 核函数
// 启动核函数
size_t shared_mem = stages * threads_per_block * sizeof(float);
pipeline_demo_kernel<<<1, threads_per_block, shared_mem>>>(
d_input, d_output, N, scale);

CUDA_CHECK(cudaDeviceSynchronize());

// 验证
// 验证结果
CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
bool correct = true;
for (size_t i = 0; i < N; i++) {
for (size_t i = 0; i < N; ++i) {
float expected = h_input[i] * scale + 1.0f;
if (fabsf(h_output[i] - expected) > 1e-5f) {
printf("Mismatch at %zu: GPU %f vs CPU %f\n",
i, h_output[i], expected);
printf("Mismatch at %zu: GPU %f vs CPU %f\n", i, h_output[i], expected);
correct = false;
break;
}
Expand All @@ -136,4 +137,4 @@ int main() {
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_output));
return correct ? 0 : 1;
}
}
Loading