open-rdma · whx-6 · May 20, 2026 · May 20, 2026 · May 21, 2026 · May 22, 2026
diff --git a/outputs/gpu-programming-course/README.md b/outputs/gpu-programming-course/README.md
@@ -77,20 +77,20 @@ nvcc -o vector_add vector_add.cu
 
 课程配套的在线评测系统位于 `outputs/eval_system/`，支持代码提交、自动编译、NCU性能分析和报告下载。详见该目录下的 `design.md`。
 
-## 项目结构
+## 项目主要结构
 
 ```
-gpu-programming-course/
-├── README.md                    # 本文件
-├── docs/                        # 课程文档
-│   ├── _sidebar.md              # 侧边栏导航
-│   ├── 前言.md                  # 前言
-│   ├── images/                  # 图片资源
-│   ├── chapter1/ ~ chapter11/   # 基础篇章节
-│   └── advanced-chapter1/ ~ 6/  # 进阶篇章节
-├── code/                        # 随章代码
-├── Extra-Chapter/               # 补充资料与参考答案
-└── outputs/                     # 规划文档与评测系统
+gpu-programming-guide/
+├── cold-start/                 # 冷启动知识库与参考资料
+├── outputs/                    # 课程产出与规划文档
+│   ├── eval_system/            # 在线评测系统设计与实现
+│   └── gpu-programming-course/ # 课程主体内容
+│       ├── code/               # 各章节 CUDA 示例代码
+│       ├── docs/               # 各章节课程文档
+│       │   ├── images/         # 文档图片资源
+│       │   └── README.md       # 文档目录说明
+│       └── syllabus/           # 课程大纲与教学计划
+└── README.md                   # 项目说明文档
 ```
 
 ## 贡献指南

diff --git a/...dvanced-chapter1/distributed_histogram.cu → ...vanced-chapter12/distributed_histogram.cu b/...dvanced-chapter1/distributed_histogram.cu → ...vanced-chapter12/distributed_histogram.cu
diff --git a/.../advanced-chapter2/async_pipeline_demo.cu → ...advanced-chapter13/async_pipeline_demo.cu b/.../advanced-chapter2/async_pipeline_demo.cu → ...advanced-chapter13/async_pipeline_demo.cu
@@ -33,75 +33,78 @@ __global__ void pipeline_demo_kernel(
 
     // 每个阶段一个缓冲区
     float *buffer[stages];
-    for (int s = 0; s < stages; s++) {
+    for (int s = 0; s < stages; ++s) {
         buffer[s] = shared_buffers + s * threads_per_block;
     }
 
-    // 创建 3 阶段 pipeline
+    // 创建 pipeline 状态
     __shared__ cuda::pipeline_shared_state<
         cuda::thread_scope::thread_scope_block, stages> pipe_state;
     auto pipe = cuda::make_pipeline(block, &pipe_state);
 
     size_t total_blocks = total_elements / threads_per_block;
     size_t block_id = block.group_index().x;
+    if (block_id != 0) return;   // 只用一个块演示
 
-    // 只有第一个块执行（简化演示）
-    if (block_id != 0) return;
-
-    // 预热流水线：填充前 (stages-1) 个阶段
-    for (int s = 0; s < stages - 1; s++) {
+    // 预热：填充前 (stages-1) 个批次 
+    for (int s = 0; s < stages - 1; ++s) {
         pipe.producer_acquire();
         cuda::memcpy_async(block, buffer[s],
                            input + s * threads_per_block,
                            sizeof(float) * threads_per_block, pipe);
         pipe.producer_commit();
     }
 
-    // 流水线稳态
-    for (size_t i = 0; i < total_blocks - (stages - 1); i++) {
-        int stage = i % stages;
+    // 流水线状态 ：消费 + 生产 
+    for (size_t i = 0; i < total_blocks - (stages - 1); ++i) {
+        // 1. 等待当前批次就绪（消费者）
+        pipe.consumer_wait();
+        int tid = threadIdx.x;
+        int cons_buf_idx = i % stages;                 // 当前批次应该所在的缓冲区
+        float *curr_buf = buffer[cons_buf_idx];
+
+        // 计算（每个线程独立处理自己的元素）
+        float val = curr_buf[tid] * scale + 1.0f;
+        // 写回全局内存
+        output[i * threads_per_block + tid] = val;
 
-        // 生产者：发起下一批数据的异步拷贝
+        pipe.consumer_release();   // 释放当前缓冲区，允许生产者复用
+
+        // 2. 为未来批次准备数据（生产者）
         pipe.producer_acquire();
-        size_t next_batch = i + stages - 1;
+        size_t next_batch = i + stages - 1;            // 要准备的下一个批次索引
         if (next_batch < total_blocks) {
-            cuda::memcpy_async(block, buffer[stage],
+            int prod_buf_idx = (i + stages - 1) % stages;   // 正确的目标缓冲区索引
+            cuda::memcpy_async(block, buffer[prod_buf_idx],
                                input + next_batch * threads_per_block,
                                sizeof(float) * threads_per_block, pipe);
         }
         pipe.producer_commit();
-
-        // 消费者：处理当前阶段的数据
-        pipe.consumer_wait();
-        int tid = threadIdx.x;
-        buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
-        __syncthreads();
-        // 写回
-        output[i * threads_per_block + tid] = buffer[stage][tid];
-        pipe.consumer_release();
     }
 
-    // 排空流水线：处理最后 (stages-1) 个阶段
-    for (size_t i = total_blocks - (stages - 1); i < total_blocks; i++) {
-        int stage = i % stages;
+    //  排空：处理最后 (stages-1) 个批次 
+    for (size_t i = total_blocks - (stages - 1); i < total_blocks; ++i) {
         pipe.consumer_wait();
         int tid = threadIdx.x;
-        buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
-        __syncthreads();
-        output[i * threads_per_block + tid] = buffer[stage][tid];
+        int cons_buf_idx = i % stages;
+        float *curr_buf = buffer[cons_buf_idx];
+
+        float val = curr_buf[tid] * scale + 1.0f;
+        output[i * threads_per_block + tid] = val;
+
         pipe.consumer_release();
     }
 }
 
 int main() {
-    const size_t N = threads_per_block * 100; // 100个批次
+    const size_t N = threads_per_block * 100;
     const size_t bytes = N * sizeof(float);
     const float scale = 2.0f;
 
-    // 主机内存
+    // 主机数据
     float *h_input = (float *)malloc(bytes);
     float *h_output = (float *)malloc(bytes);
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; ++i) {
         h_input[i] = (float)(i % 100) / 100.0f;
     }
 
@@ -111,21 +114,19 @@ int main() {
     CUDA_CHECK(cudaMalloc(&d_output, bytes));
     CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));
 
-    // 启动 pipeline 核函数
+    // 启动核函数
     size_t shared_mem = stages * threads_per_block * sizeof(float);
     pipeline_demo_kernel<<<1, threads_per_block, shared_mem>>>(
         d_input, d_output, N, scale);
-
     CUDA_CHECK(cudaDeviceSynchronize());
 
-    // 验证
+    // 验证结果
     CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
     bool correct = true;
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; ++i) {
         float expected = h_input[i] * scale + 1.0f;
         if (fabsf(h_output[i] - expected) > 1e-5f) {
-            printf("Mismatch at %zu: GPU %f vs CPU %f\n",
-                   i, h_output[i], expected);
+            printf("Mismatch at %zu: GPU %f vs CPU %f\n", i, h_output[i], expected);
             correct = false;
             break;
         }
@@ -136,4 +137,4 @@ int main() {
     CUDA_CHECK(cudaFree(d_input));
     CUDA_CHECK(cudaFree(d_output));
     return correct ? 0 : 1;
-}
+}