From 623fda76915afbec471c1248fc2e70524b989c59 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Wed, 20 May 2026 05:19:04 +0000
Subject: [PATCH 01/23] =?UTF-8?q?fix:=20=E8=A7=A3=E5=86=B3review=E8=AF=84?=
 =?UTF-8?q?=E8=AE=BA=EF=BC=8C=E4=BF=AE=E6=AD=A3CPU=E7=BA=BF=E7=A8=8B?=
 =?UTF-8?q?=E6=8F=8F=E8=BF=B0=E5=B9=B6=E6=B7=BB=E5=8A=A0=E6=80=9D=E8=80=83?=
 =?UTF-8?q?=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\347\256\227\344\270\216CUDA\345\205\245\351\227\250.md" | 6 +++---
 ...\213\345\261\202\346\254\241\347\273\223\346\236\204.md" | 4 ++++
 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git "a/outputs/gpu-programming-course/docs/chapter1/\347\254\2541\347\253\240 GPU\350\256\241\347\256\227\344\270\216CUDA\345\205\245\351\227\250.md" "b/outputs/gpu-programming-course/docs/chapter1/\347\254\2541\347\253\240 GPU\350\256\241\347\256\227\344\270\216CUDA\345\205\245\351\227\250.md"
index ef67947..d1085fe 100644
--- "a/outputs/gpu-programming-course/docs/chapter1/\347\254\2541\347\253\240 GPU\350\256\241\347\256\227\344\270\216CUDA\345\205\245\351\227\250.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter1/\347\254\2541\347\253\240 GPU\350\256\241\347\256\227\344\270\216CUDA\345\205\245\351\227\250.md"	
@@ -22,7 +22,7 @@
 - <strong>复杂的控制逻辑</strong>：分支预测（Branch Prediction）、乱序执行（Out-of-Order Execution）、超标量流水线（Superscalar Pipeline）等
 - <strong>大型缓存层次结构</strong>：L1、L2、L3缓存，以减少指令和数据访问的平均延迟
 
-这种设计使得CPU能够高效地执行那些具有复杂控制流、大量分支和不可预测内存访问模式的程序。一个典型的现代CPU核心可以同时执行几十个（通常是2-4个硬件线程）线程。
+这种设计使得 CPU 能够高效处理具有复杂控制流、大量分支及不可预测内存访问模式的程序。现代多核 CPU 通常支持同时执行数十个硬件线程，一般每个物理核心具备维护 1 至 2 个独立线程上下文的能力。
 
 而<strong>图形处理单元（GPU）</strong>的设计目标截然不同。GPU被设计为能够在同一时刻执行数千个线程，以<strong>最大化整体吞吐量（Throughput）</strong>。为了达成这一目标，GPU将更多晶体管用于<strong>数据计算</strong>（如浮点运算单元ALU），而非数据缓存和流控制。
 
@@ -37,7 +37,7 @@ CUDA Programming Guide对这两种设计理念给出了权威的阐述：
 | 维度             | CPU                            | GPU                            |
 | :--------------- | :----------------------------- | :----------------------------- |
 | <strong>设计目标</strong> | 最小化单个线程的延迟           | 最大化整体吞吐量               |
-| <strong>并发线程数</strong> | 几十个（~2-64 per core）       | 数千至数万个                   |
+| <strong>并发线程数</strong> | 几十个       | 数千至数万个                   |
 | <strong>晶体管分配</strong> | 大型缓存 + 复杂控制逻辑        | 大量ALU + 精简控制             |
 | <strong>内存延迟处理</strong> | 大缓存 + 预取                  | 线程切换掩藏延迟               |
 | <strong>时钟频率</strong>    | 更高（~3-5 GHz）               | 相对较低（~1-2 GHz）           |
@@ -125,7 +125,7 @@ CUDA Programming Guide以这样的方式介绍了CUDA：
 CUDA的革命性在于：它将GPU并行计算的能力直接暴露给了程序员，无需经过图形API。程序员可以用熟悉的C/C++语言编写GPU上执行的代码，大大降低了GPU编程的门槛。
 
 在CUDA诞生之后的这些年里，它已经深刻地改变了多个计算领域：
-- <strong>2007年</strong>：第一批CUDA GPU（G80架构）发布
+- <strong>2006年</strong>：首款支持 CUDA 的 GPU（G80 架构，GeForce 8800）发布
 - <strong>2012年</strong>：AlexNet使用CUDA训练，开启了深度学习时代
 - <strong>2016年</strong>：Pascal架构P100 GPU将混合精度计算带入数据中心
 - <strong>2017年</strong>：Volta架构引入Tensor Core，深度学习训练大幅加速
diff --git "a/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md" "b/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md"
index 153d4ca..82fbbd5 100644
--- "a/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md"	
@@ -463,6 +463,8 @@ CUDA Programming Guide对多块版本的补充说明：
 >
 > （16x16（256个线程）的线程块大小虽然在本题中是任意的，但这是一个常见的选择。网格被创建为具有足够的线程块，使得像前面一样每个矩阵元素对应一个线程。为简单起见，此示例假设每个维度上每个网格的线程数能够被该维度上每个线程块的线程数整除，尽管实际情况并非总是如此。）
 
+**注意**：以上代码使用 `float A[N][N]` 语法，假设 `N` 为编译时常量，属于示意性代码。在实际开发中，我们通常使用一维指针 `float*` 配合索引计算来访问矩阵数据，参见2.7.3节的实际代码。
+
 ### 2.4.2 边界检查——CUDA编程的基本安全实践
 
 当数据大小不能被线程块大小整除时，一些额外的线程会被启动但没有对应的工作。这要求我们在内核中加入<strong>边界检查（Boundary Checking）</strong>。
@@ -534,6 +536,8 @@ CUDA Programming Guide对其功能做了如下说明：
 >
 > （一个线程块内的线程可以通过共享内存共享数据，并通过同步它们的执行来协调内存访问。更准确地说，你可以在内核中通过调用`__syncthreads()`内建函数来指定同步点；`__syncthreads()`充当一个屏障，线程块中的所有线程都必须在此处等待，然后才允许任何线程继续执行。）
 
+除了作为执行屏障，`__syncthreads()` 同时也是一个**内存栅栏**（Memory Fence）：它保证调用前所有线程对共享内存（以及全局内存）的写入，在调用后对线程块内的所有线程可见。这意味着线程 A 在 `__syncthreads()` 之前写入共享内存的数据，在线程 B 通过 `__syncthreads()` 之后一定可以被正确读取。这一点对于正确使用共享内存至关重要，我们将在后续章节中深入探讨。
+
 ### 2.5.2 典型使用模式
 
 `__syncthreads()`最常见的用法是配合共享内存的三阶段模式：

From 9a1648c8f1b2ffe894c23a489397fb447680b736 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Wed, 20 May 2026 05:34:37 +0000
Subject: [PATCH 02/23] =?UTF-8?q?fix:=20=E8=A1=A5=E5=85=85=E6=B2=A1?=
 =?UTF-8?q?=E6=9C=89=E6=8F=90=E4=BA=A4=E6=88=90=E5=8A=9F=E7=9A=84=E7=AC=AC?=
 =?UTF-8?q?=E4=BA=8C=E7=AB=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md" | 2 ++
 1 file changed, 2 insertions(+)

diff --git "a/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md" "b/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md"
index 82fbbd5..9cd5a5b 100644
--- "a/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter2/\347\254\2542\347\253\240 CUDA\347\274\226\347\250\213\346\250\241\345\236\213\342\200\224\342\200\224\345\206\205\346\240\270\345\207\275\346\225\260\344\270\216\347\272\277\347\250\213\345\261\202\346\254\241\347\273\223\346\236\204.md"	
@@ -465,6 +465,8 @@ CUDA Programming Guide对多块版本的补充说明：
 
 **注意**：以上代码使用 `float A[N][N]` 语法，假设 `N` 为编译时常量，属于示意性代码。在实际开发中，我们通常使用一维指针 `float*` 配合索引计算来访问矩阵数据，参见2.7.3节的实际代码。
 
+**思考题**：为什么在实际CUDA开发中我们通常使用一维指针`float*`来表示二维矩阵，而不是直接使用二维数组`float A[N][N]`？（提示：考虑内存布局、硬件寻址方式及性能影响）
+
 ### 2.4.2 边界检查——CUDA编程的基本安全实践
 
 当数据大小不能被线程块大小整除时，一些额外的线程会被启动但没有对应的工作。这要求我们在内核中加入<strong>边界检查（Boundary Checking）</strong>。

From 21e0c7fc6ebaf73372adebe92e67556386be2cf2 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Thu, 21 May 2026 09:16:25 +0000
Subject: [PATCH 03/23] =?UTF-8?q?fix:=20=E4=BF=AE=E6=94=B9=E8=A1=A5?=
 =?UTF-8?q?=E5=85=85=20chapter3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...71\347\247\260\347\274\226\347\250\213.md" | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter3/\347\254\2543\347\253\240 CUDA\345\206\205\345\255\230\345\261\202\346\254\241\347\273\223\346\236\204\344\270\216\351\235\236\345\257\271\347\247\260\347\274\226\347\250\213.md" "b/outputs/gpu-programming-course/docs/chapter3/\347\254\2543\347\253\240 CUDA\345\206\205\345\255\230\345\261\202\346\254\241\347\273\223\346\236\204\344\270\216\351\235\236\345\257\271\347\247\260\347\274\226\347\250\213.md"
index 33f63ea..f99094e 100644
--- "a/outputs/gpu-programming-course/docs/chapter3/\347\254\2543\347\253\240 CUDA\345\206\205\345\255\230\345\261\202\346\254\241\347\273\223\346\236\204\344\270\216\351\235\236\345\257\271\347\247\260\347\274\226\347\250\213.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter3/\347\254\2543\347\253\240 CUDA\345\206\205\345\255\230\345\261\202\346\254\241\347\273\223\346\236\204\344\270\216\351\235\236\345\257\271\347\247\260\347\274\226\347\250\213.md"	
@@ -1,4 +1,4 @@
-# 第3章 CUDA内存层次结构与非对称编程
+# 第3章 CUDA内存层次结构与异构编程
 
 在前两章中，我们学习了CUDA编程模型的基本结构——如何定义内核函数，如何通过线程层次结构将计算映射到大量线程上。现在，我们要面对GPU编程中另一个至关重要的问题：<strong>数据在哪里？</strong>
 
@@ -38,7 +38,7 @@ CUDA Programming Guide还提到了两个额外的只读内存空间：
 寄存器的关键特性：
 - <strong>作用域</strong>：单个线程（线程私有，其他线程无法访问）
 - <strong>访问延迟</strong>：零延迟——编译器可以直接将寄存器作为指令的操作数
-- <strong>容量</strong>：每个线程最多可以使用<strong>最多255个</strong>寄存器（具体上限取决于计算能力）。超过上限的变量会被编译器"溢出"到本地内存中
+- <strong>容量</strong>：现在主流 GPU 的寄存器配置为每个线程最多可使用 255 个 32 位寄存器（具体上限取决于 GPU 计算能力）。超过上限的变量会被编译器溢出到本地内存，导致访问延迟显著增加。
 - <strong>典型用途</strong>：频繁使用的标量变量、循环计数器、累加器、中间计算结果
 
 在内核函数中声明的所有局部变量（标量类型，如 `int`, `float`, `double`）默认都试图放入寄存器中。CUDA编译器在编译时会进行寄存器分配——将最常用的变量保留在寄存器中，将使用频率较低的变量溢出到本地内存。
@@ -49,7 +49,7 @@ CUDA Programming Guide还提到了两个额外的只读内存空间：
 
 ### 3.1.3 本地内存——寄存器不够用的后备方案
 
-尽管名字叫"本地"，但本地内存实际上<strong>位于片外DRAM中</strong>，访问延迟与全局内存相当。它被称为"本地"仅仅是因为它的作用域是线程私有的。
+尽管名字叫"本地"，但本地内存实际上位于片外 DRAM 中，但其访问会经过 L1/L2 缓存，因此延迟通常略低于直接访问全局内存。它被称为‘本地’仅仅是因为它的作用域是线程私有的。
 
 变量被放入本地内存的条件：
 1. <strong>寄存器溢出</strong>：内核使用的寄存器超过了分配上限
@@ -66,7 +66,7 @@ CUDA Programming Guide还提到了两个额外的只读内存空间：
 共享内存的关键特性：
 - <strong>作用域</strong>：同一个线程块内的所有线程都可以读写同一块共享内存
 - <strong>生命周期</strong>：与线程块相同——块被调度到SM上时分配，块执行完毕后释放
-- <strong>容量</strong>：通常为<strong>48KB到164KB</strong>（取决于GPU和配置），由同一SM上的所有线程块共享
+- <strong>容量</strong>：主流 GPU 的共享内存容量由计算能力决定，常见范围为 48KB~256KB（支持动态配置）。它由同一 SM 上的所有活跃线程块共享，单个线程块的使用量会直接影响 SM 的线程占用率。
 - <strong>访问方式</strong>：共享内存被划分为多个<strong>存储体（Bank）</strong>（通常为32个），如果多个线程同时访问不同的bank，访问可以并行进行；如果多个线程同时访问同一个bank的不同地址，则会产生<strong>存储体冲突（Bank Conflict）</strong>，导致访问串行化
 
 CUDA Programming Guide强调了共享内存的性能期望：
@@ -113,6 +113,7 @@ __global__ void tiledMatrixMul(const float* A, const float* B, float* C, int N)
 ```
 
 这个使用<strong>tile</strong>策略的模式是所有高性能矩阵乘法的核心——每个数据元素从全局内存加载一次（慢），但在共享内存中被重用16次（快），从而实现了接近算术峰值的性能。
+> **思考：** 很多初学者认为共享内存速度远快于全局内存，就应该把所有计算数据全部存入共享内存，这种做法是否合理？
 
 ### 3.1.5 全局内存——容量最大但最慢
 
@@ -182,10 +183,10 @@ cudaMemcpyToSymbol(filterKernel, h_filter, 9 * sizeof(float));
 | 共享内存   | 片内(SM)     | 读写     | 线程块      | ~20周期  | 每SM ~48-164KB | 线程块内       |
 | 本地内存   | 片外(DRAM)   | 读写     | 单个线程    | ~400周期 | 较大        | 线程内         |
 | 全局内存   | 片外(DRAM)   | 读写     | 所有线程    | ~400周期 | 最大(GB级)  | 跨内核持久     |
-| 常量内存   | 片外+片内缓存 | 只读    | 所有线程    | ~5周期*  | 总共64KB    | 跨内核持久     |
+| 常量内存   | 片外+片内缓存 | 只读    | 所有线程    | ~极低（广播模式下）*  | 总共64KB    | 跨内核持久     |
 | 纹理内存   | 片外+片内缓存 | 只读    | 所有线程    | ~100周期 | 大(GB级)    | 跨内核持久     |
 
-> *注：常量内存的~5周期延迟仅在broadcast模式下实现——即warp内所有线程读取同一地址时。
+> * 注：常量内存的低延迟特性依赖于常量缓存和广播机制。当warp内所有线程读取同一地址时（broadcast模式），一次缓存读取即可服务整个warp，性能最佳。当warp内线程访问不同地址时，这些访问将被硬件串行化处理，有效带宽急剧下降，应尽量避免这种访问模式。常量内存总容量为64KB，适合存储所有线程共同需要的只读小数据（如滤波核系数、物理常数等）
 
 <strong>选择内存类型的决策流程</strong>：
 
@@ -283,7 +284,7 @@ CUDA编程模型的核心假设之一是：<strong>主机（Host）</strong>和<
 
 4. <strong>独立的内存带宽</strong>：CPU和GPU各自拥有专用的内存带宽，不会（在传统模型中）相互干扰
 
-> <strong>注意</strong>：忘记调用`cudaMemcpy`将输入数据拷贝到设备，内核将访问到垃圾数据。忘记将结果从设备拷贝回主机，主机将看到旧数据或未初始化数据。这种错误通常不会导致崩溃，而是产生<strong>静默的错误结果</strong>——特别难以调试。始终在正确的时机进行数据迁移，并使用验证步骤来检查结果正确性。
+> <strong>注意</strong>：内核若访问未拷贝到设备的数据（例如传递主机指针），将导致非法内存访问，CUDA 会抛出错误（通常在内核启动后的同步点捕获）；若忘记将结果从设备拷回主机，主机端的数据不会自动更新，程序会继续使用旧数据，产生静默错误。无论哪种情况，正确的错误检查和数据迁移都是必须的。
 
 ### 3.2.5 异构编程中的异步执行
 
@@ -293,7 +294,7 @@ CUDA编程中一个重要的事实是：<strong>内核启动是异步的</strong
 
 1. <strong>必须显式同步</strong>：如果你需要在CPU端读取GPU计算结果，必须先用`cudaDeviceSynchronize()`或类似的同步函数来确保GPU计算已完成。
 
-2. <strong>可以重叠执行</strong>：CPU可以在GPU执行内核的同时做一些有用的工作（如准备下一批数据），这被称为<strong>计算与数据传输重叠（Overlap）</strong>——一个重要的性能优化技术。
+2. <strong>可以重叠执行</strong>：CPU 可以在 GPU 执行内核的同时并行执行其他任务（如准备下一批数据），这被称为**异步并发执行**。要实现**计算与数据传输的重叠**，需要使用 CUDA 流（Streams）和异步内存拷贝函数（如 `cudaMemcpyAsync`）。
 
 ```cuda
 // 异步执行示例
@@ -431,7 +432,7 @@ CUDA Programming Guide中描述的统一内存编程模型还涉及以下高级
 
 3. <strong>预处理开销</strong>：CPU在初始化托管内存数据时（如填充大型数组），每个页面的第一次写入也会触发页面故障。一种常见的优化模式是先一次性填充数据，然后在内核启动前进行预取。
 
-4. <strong>系统限制</strong>：统一内存的总分配量受系统物理内存（CPU RAM + GPU VRAM）的总和限制，但单个分配不能超过任何一个设备的物理内存。
+4. <strong>系统限制</strong>：托管内存的总分配量受系统物理内存（CPU RAM + GPU VRAM）的总和限制。与显式设备内存不同，单个托管内存分配可以大于单个GPU的物理内存，因为CUDA运行时可以使用CPU内存作为后备存储（超额订阅）。但过大的分配或频繁的跨设备页面迁移可能导致显著的性能下降。
 
 ## 3.4 计算能力
 

From 6ce14797fd541f8f08d2443a083f2918c23a93f5 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Fri, 22 May 2026 12:24:56 +0000
Subject: [PATCH 04/23] 5.22 chapter4

---
 ...26\350\257\221\346\250\241\345\236\213.md" | 53 ++++++++++---------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md" "b/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md"
index 9229fb3..3bb6e46 100644
--- "a/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md"	
@@ -16,7 +16,7 @@ NVCC 的定位可以从以下几个角度理解：
 
 2. <strong>代码分离器</strong>：NVCC 能够识别 `.cu` 文件中的主机代码和设备代码，将它们分离开来，分别交由不同的编译器处理。这涉及到对 CUDA 语法（如 `__global__`、`__device__`、`<<<...>>>`）的解析。
 
-3. <strong>多目标代码生成器</strong>：NVCC 可以为同一个 CUDA 程序生成多种目标格式——主机目标代码（`.o`/`.obj`）、设备 PTX 汇编代码、设备 cubin 二进制代码，或者直接生成最终的可执行文件。
+3. <strong>多目标代码生成器</strong>：NVCC 可以为同一个 CUDA 程序生成多种目标格式——主机目标代码（`.o`/`.obj`）、设备 PTX 汇编代码、设备 cubin 二进制代码，或者直接生成最终的可执行文件、静态库（.a/.lib）或动态库（.so/.dll）
 
 4. <strong>链接协调器</strong>：NVCC 协调主机端目标文件与设备端目标文件的链接过程，确保最终可执行文件中正确嵌入了设备代码。
 
@@ -94,30 +94,32 @@ NVCC 首先对 `.cu` 源文件进行预处理（展开宏、处理 `#include` 
 
 应用程序可以：
 - <strong>链接到编译后的主机代码</strong>：这是最常见的情况。NVCC 调用主机链接器，将主机端目标文件和嵌入的设备代码链接为最终可执行文件。
-- <strong>忽略主机代码，使用 Driver API</strong>：将设备代码编译为独立的 cubin 或 PTX 文件，在运行时通过 <strong>CUDA Driver API</strong>（而不是 Runtime API）自行加载和执行。这为需要精细控制设备代码加载的应用程序（如框架类库）提供了最大的灵活性。
+- <strong>忽略主机代码，使用 Driver API</strong>：将设备代码编译为独立的 cubin、PTX 或 fatbin 文件，在运行时通过 CUDA Driver API 自行加载和执行。这种方式广泛应用于深度学习框架（如 PyTorch、TensorFlow）和需要动态生成内核的场景。
 
 下面用一个简明的图示来总结这个编译流程：
 
 ```
-                   .cu 源文件
-                       |
-                   NVCC 驱动
-                       |
-            +----------+-----------+
-            |                      |
-       设备代码提取           主机代码保留
-            |                      |
-      cicc (CUDA 前端)       <<<...>>> 替换
-            |                为 Runtime 调用
-       +----+----+                 |
-       |         |           调用主机编译器
-     PTX       cubin              |
-       |         |            host_obj.o
-       +----+----+                 |
-            |                      |
-      嵌入到可执行文件 ------------+
-            |
-        最终可执行文件
+                          .cu 源文件
+                              |
+                          NVCC 驱动
+                              |
+              +----------------+----------------+
+              |                                 |
+       设备代码提取                       主机代码保留
+              |                                 |
+    cicc (CUDA C→PTX 编译器)       <<<...>>> 替换为 Runtime 调用
+              |                                 |
+            PTX                        调用主机编译器 (gcc/clang)
+              |                                 |
+    ptxas (PTX→cubin 汇编器)             host_obj.o
+              |                                 |
+            cubin                               |
+              |                                 |
+              +----------------+----------------+
+                              |
+                      嵌入到可执行文件
+                              |
+                        最终可执行文件
 ```
 
 ### 4.2.2 离线编译的完整示例
@@ -267,12 +269,14 @@ ls -la ~/.nv/ComputeCache/
 
 > 为计算能力 <em>X.y</em> 生成的 cubin 对象仅能在计算能力 <em>X.z</em>（其中 <em>z >= y</em>）的设备上执行。
 
+<strong>补充说明</strong>：除硬件级兼容外，NVIDIA 还提供了<strong>驱动级向前兼容</strong>：使用较旧版本 CUDA Toolkit 编译的应用程序，可以在安装了较新版本 CUDA 驱动的系统上运行。例如，CUDA 11.0 编译的应用可在 CUDA 12.0 驱动上运行。这是驱动层面的兼容，与硬件级二进制兼容性相互独立。
+
 具体来说：
 - <strong>向前兼容</strong>（同一个主版本 X 内）：`sm_35` 可以在 `sm_37` 上运行。`sm_50` 可以在 `sm_52`、`sm_53` 上运行。`sm_80` 可以在 `sm_86`、`sm_89` 上运行。
 - <strong>不支持向后兼容</strong>：`sm_80` 不能在 `sm_75` 上运行。
 - <strong>不支持跨主版本兼容</strong>：`sm_35` 不能在 `sm_50` 上运行（因为主版本 3 ≠ 5）。`sm_60` 不能在 `sm_70` 上运行（因为主版本 6 ≠ 7）。
 
-> <strong>注意</strong>：二进制兼容性仅支持桌面平台（Desktop）。Tegra 平台不支持二进制兼容性。桌面与 Tegra 之间的二进制兼容性也不支持。
+> <strong>注意</strong>：二进制兼容性仅支持 x86_64 和 ARM64 桌面 / 服务器平台；Tegra 平台不支持跨版本二进制兼容性，桌面与 Tegra 之间也不支持二进制兼容性。
 
 这个规则可以用下图直观表示：
 
@@ -311,7 +315,8 @@ ls -la ~/.nv/ComputeCache/
 | 7.0, 7.5 | Volta | 2017 | V100, T4, Xavier |
 | 8.0, 8.6, 8.7, 8.9 | Ampere | 2020 | A100, RTX 3060/3070/3080/3090 |
 | 9.0 | Hopper | 2022 | H100 |
-| 10.0 | Blackwell | 2024 | B100, B200 |
+| 10.0, 10.3, 11.0 | Blackwell | 2024 | GB100, GB200, B100 |
+| 12.0, 12.1 | Blackwell | 2025 | GB300, B200 系列后续型号 |
 
 ## 4.4 PTX 兼容性
 
@@ -400,7 +405,7 @@ nvcc x.cu \
 <strong>Fat Binary 选择逻辑</strong>（运行时自动执行）：
 1. 查找与当前设备计算能力完全匹配的 cubin。
 2. 如果没找到，查找二进制兼容的 cubin（同一主版本，z >= y）。
-3. 如果还没找到，寻找可用的 PTX 代码进行 JIT 编译。
+3. 如果还没找到，寻找版本最低且小于等于当前设备计算能力的 PTX 代码进行 JIT 编译。
 4. 如果以上都失败，程序报错退出。
 
 下面是一个更全面的 Fat Binary 编译策略示例：

From 19dfb9b77ce5f121e71f67d19c126d5f19b37fa1 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Sat, 23 May 2026 04:05:01 +0000
Subject: [PATCH 05/23] 5.23 chapter4

---
 ...26\350\257\221\346\250\241\345\236\213.md" | 55 ++++++++++---------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md" "b/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md"
index 3bb6e46..ec5a759 100644
--- "a/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter4/\347\254\2544\347\253\240 NVCC\347\274\226\350\257\221\346\250\241\345\236\213.md"	
@@ -545,6 +545,8 @@ no kernel image is available for execution on the device
 ```bash
 # 1. 确保 arch/code 覆盖目标 GPU 的计算能力
 nvcc file.cu -o file -arch=sm_70  # 如果目标是 V100
+# CMake环境下设置目标架构
+set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90")
 
 # 2. 使用 Fat Binary 覆盖多个架构 + PTX 后备
 nvcc file.cu -o file \
@@ -628,16 +630,16 @@ nvcc file.cu -keep -arch=sm_80
 # 2. 增加编译器信息输出
 nvcc file.cu --verbose -arch=sm_80
 
-# 3. 仅做语法检查，不生成代码
+# 3. 打印所有将要执行的编译命令，但不实际执行任何编译操作
 nvcc file.cu -arch=sm_80 --dryrun
 
 # 4. 仅编译设备代码（检查设备代码语法）
 nvcc file.cu -ptx -arch=compute_80
 ```
 
-## 4.8 C++ 兼容性与 64 位支持
+## 4.7 C++ 兼容性与 64 位支持
 
-### 4.8.1 C++ 兼容性
+### 4.7.1 C++ 兼容性
 
 NVCC 编译器的前端按照 C++ 语法规则处理 CUDA 源文件。但主机代码和设备代码的 C++ 支持程度不同：
 
@@ -646,12 +648,12 @@ NVCC 编译器的前端按照 C++ 语法规则处理 CUDA 源文件。但主机
 - NVCC 支持通过 `-std=c++11`、`-std=c++14`、`-std=c++17` 等选项指定 C++ 标准版本。
 
 <strong>设备代码（Device Code）</strong>：
-- 仅<strong>完整支持 C++ 的一个子集</strong>。GPU 设备没有完整的 C++ 运行时库，因此很多 C++ 特性在设备代码中不可用或受限。
+- <strong>设备代码支持 C++14 完整特性，以及 C++17 和 C++20 的部分特性</strong>。GPU 设备没有完整的 C++ 运行时库，因此很多 C++ 特性在设备代码中不可用或受限。
 - <strong>不支持的特性包括</strong>：
   - C++ 标准库（`std::vector`、`std::string`、`std::map` 等）——GPU 设备上没有对应的运行时实现。
   - 异常处理（`try`/`catch`/`throw`）——GPU 硬件不能支持 C++ 异常。
   - 运行时类型识别（RTTI, `typeid`、`dynamic_cast`）——设备代码中不支持。
-  - `new`/`delete` 操作符（某些架构的特定版本除外）。
+  - `new`/`delete` 操作符。
   - 虚函数的多态调用（从 CC 3.5 开始有限支持）。
 - <strong>支持的特性包括</strong>：
   - 模板（Templates）
@@ -661,11 +663,11 @@ NVCC 编译器的前端按照 C++ 语法规则处理 CUDA 源文件。但主机
   - 函数重载
   - 类和结构体
   - 静态成员变量（从 CC 2.0+）
-  - `constexpr`
+  - `constexpr`（CUDA 12.0+）
 
 设备代码支持的 C++ 子集的完整列表在 CUDA Programming Guide 的"C++ Language Support"章节中详细描述。
 
-### 4.8.2 64 位兼容性
+### 4.7.2 64 位兼容性
 
 NVCC 对 64 位和 32 位编译模式有明确的规则：
 
@@ -678,7 +680,7 @@ NVCC 对 64 位和 32 位编译模式有明确的规则：
 
 > <strong>提示</strong>：在现代 CUDA 开发中（CUDA 10.0 及以后），NVIDIA 已不再提供 32 位版本的 CUDA Toolkit。所有开发都应在 64 位模式下进行。除非你在维护非常老旧的遗留系统，否则不需要关心 32 位模式。
 
-### 4.8.3 独立编译与分离编译
+### 4.7.3 独立编译与分离编译
 
 CUDA 程序可以在多个 `.cu` 文件中组织代码，这涉及到<strong>独立编译（Separate Compilation）</strong>：
 
@@ -761,7 +763,7 @@ nvcc file.cu -o file --target-os-variant=Linux --target-arch=arm64 \
     --cross-compile --compiler-bindir=/usr/bin/aarch64-linux-gnu-g++
 ```
 
-### 4.8.5 NVCC 优化级别与控制选项
+### 4.7.5 NVCC 优化级别与控制选项
 
 NVCC 提供多级优化控制，从完全不优化到激进优化：
 
@@ -812,7 +814,7 @@ ptxas info    : Used 32 registers, 4096 bytes smem, 352 bytes cmem[0]
 关键指标解读：
 - <strong>registers</strong>：每个线程使用的寄存器数。越少越好（允许更多 warp 驻留），但太少会导致溢出（spill）。
 - <strong>smem</strong>：静态 + 动态共享内存每块使用量（字节）。
-- <strong>spill stores/loads</strong>：寄存器溢出到局部内存的次数。0 是最理想的——所有变量都装在寄存器中。
+- <strong>spill stores/loads</strong>：寄存器溢出到局部内存的次数。0 是最理想的——所有变量都装在寄存器中。非零值通常会导致性能显著下降（局部内存访问延迟是寄存器的数百倍），是内核优化的核心指标之一。
 - <strong>cmem[0]</strong>：常量内存使用量（字节）。用于存储内核参数等。
 
 <strong>NVCC 编译阶段及对应工具链</strong>：
@@ -829,7 +831,7 @@ ptxas info    : Used 32 registers, 4096 bytes smem, 352 bytes cmem[0]
 
 了解这个工具链能帮助你更好地理解 NVCC 的输出和错误信息。例如，`ptxas` 的错误表明 PTX 汇编阶段出了问题（通常是代码使用了当前架构不支持的指令）；`cicc` 的错误表明 CUDA C++ 前端解析出了问题。
 
-### 4.8.5 编译时间优化技巧
+### 4.7.5 编译时间优化技巧
 
 大型 CUDA 项目（特别是使用大量模板或自动生成代码的）可能遇到较长的编译时间。以下是一些优化技巧：
 
@@ -874,11 +876,11 @@ nvcc file.cu -o file -gencode ... -gencode ... -gencode ...
 
 CUDA 编译器需要为每个 `-gencode` 选项重新编译设备代码。将大型设备函数放在 `.cuh` 头文件中会导致它们在每个翻译单元中被重复编译。尽可能将设备代码放在 `.cu` 文件中，只将简洁的接口声明放在头文件中。
 
-## 4.9 构建系统集成与实战
+## 4.8 构建系统集成与实战
 
 在真实项目中，CUDA 代码很少单独使用 `nvcc` 命令行编译，而是集成在构建系统中。本节介绍如何将 CUDA 编译集成到 CMake 和 Makefile 中。
 
-### 4.9.1 CMake 项目配置
+### 4.8.1 CMake 项目配置
 
 CMake 3.8+ 提供了对 CUDA 作为一等语言（first-class language）的支持：
 
@@ -944,7 +946,7 @@ target_compile_options(vector_add PRIVATE
 )
 ```
 
-### 4.9.2 Makefile 示例
+### 4.8.2 Makefile 示例
 
 ```makefile
 # Makefile for CUDA project
@@ -977,7 +979,7 @@ clean:
 	rm -f vector_add device_query *.o *.ptx *.cubin
 ```
 
-### 4.9.3 CUDA 相关环境变量
+### 4.8.3 CUDA 相关环境变量
 
 CUDA Toolkit 在运行时受多种环境变量的影响。以下是开发中常用的：
 
@@ -988,7 +990,6 @@ CUDA Toolkit 在运行时受多种环境变量的影响。以下是开发中常
 | `CUDA_CACHE_DISABLE` | 设为 1 禁用 JIT 缓存 | `CUDA_CACHE_DISABLE=1` |
 | `CUDA_CACHE_PATH` | JIT 缓存目录 | `CUDA_CACHE_PATH=/tmp/cuda_cache` |
 | `CUDA_DEVICE_MAX_CONNECTIONS` | 每个设备的 CUDA 流多路复用能力 | 默认 8 |
-| `CUDA_ERROR_CHECKING` | 控制运行时错误检查粒度 | 默认中等 |
 | `CUDA_MANAGED_FORCE_DEVICE_ALLOC` | 强制统一内存分配在设备端 | |
 
 <strong>在开发中常用环境变量组合</strong>：
@@ -1004,7 +1005,7 @@ CUDA_VISIBLE_DEVICES=1 ./my_app
 CUDA_CACHE_DISABLE=1 ./my_app
 ```
 
-### 4.9.4 多架构部署策略小结
+### 4.8.4 多架构部署策略小结
 
 根据你的目标用户群体，推荐以下几种 Fat Binary 策略：
 
@@ -1078,11 +1079,11 @@ Driver API 的优势是精确控制——你可以：
 
 > <strong>提示</strong>：Runtime API 在内部就是通过 Driver API 实现的。使用 Runtime API 时，NVCC 自动生成的代码本质上就是在执行类似上面 `cuModuleLoad` → `cuModuleGetFunction` → `cuLaunchKernel` 的操作。
 
-## 4.10 动手体验：编译和观察 Fat Binary
+## 4.9 动手体验：编译和观察 Fat Binary
 
 在本节中，我们将通过多个实际动手实验来直观感受 NVCC 编译模型的每个方面。
 
-### 4.10.1 实验环境准备
+### 4.9.1 实验环境准备
 
 首先，准备一个完整的 CUDA 程序 `device_query.cu`。这个程序将：
 1. 查询并打印当前 GPU 的计算能力和属性。
@@ -1091,7 +1092,7 @@ Driver API 的优势是精确控制——你可以：
 
 本节所用的完整可编译代码参见 `code/chapter4/device_query.cu`。
 
-### 4.10.2 实验一：单架构编译
+### 4.9.2 实验一：单架构编译
 
 <strong>实验一：针对特定架构编译</strong>
 
@@ -1117,7 +1118,7 @@ nvcc device_query.cu -o device_query_sm80 -arch=sm_80
 cuobjdump device_query_sm80 | head -20
 ```
 
-### 4.10.3 实验二：Fat Binary 编译与检查
+### 4.9.3 实验二：Fat Binary 编译与检查
 
 <strong>实验二：生成 Fat Binary</strong>
 
@@ -1144,8 +1145,8 @@ CUDA Toolkit 提供了 `cuobjdump` 工具来检查 cubin 和 Fat Binary 文件
 cuobjdump device_query_fat
 
 # 列出所有的 cubin 和 PTX 嵌入
-cuobjdump -list-ptx device_query_fat
-cuobjdump -list-sass device_query_fat
+cuobjdump --list-ptx device_query_fat
+cuobjdump --list-sass device_query_fat
 ```
 
 典型输出示例：
@@ -1178,7 +1179,7 @@ compile_size = 64bit
 cuobjdump -sass device_query_fat
 ```
 
-### 4.10.4 实验三：PTX 与 cubin 分步生成
+### 4.9.4 实验三：PTX 与 cubin 分步生成
 
 <strong>实验四：PTX 代码的分步生成</strong>
 
@@ -1235,7 +1236,7 @@ nvcc device_query.ptx -cubin -o device_query.cubin -arch=sm_80
 cuobjdump -sass device_query.cubin
 ```
 
-### 4.10.5 实验四：保留所有中间文件
+### 4.9.5 实验四：保留所有中间文件
 
 ```bash
 # 使用 -keep 选项保留所有编译中间文件
@@ -1249,7 +1250,7 @@ ls -la device_query*
 # ... (可能还有更多)
 ```
 
-### 4.10.6 实验五：编译器选项演练
+### 4.9.6 实验五：编译器选项演练
 
 以下是本实验中用到的所有相关 NVCC 选项的总结：
 
@@ -1276,7 +1277,7 @@ ls -la device_query*
 
 </div>
 
-## 4.11 本章小结
+## 4.10 本章小结
 
 在本章中，我们深入探索了 CUDA 程序的编译世界。我们的旅程从 NVCC 编译器驱动程序的角色开始，一直延伸到如何构建跨多代 GPU 架构都兼容的 Fat Binary：
 

From b413cd7407b6ee574ea8fc69c7222140073e28c4 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Sun, 24 May 2026 09:12:53 +0000
Subject: [PATCH 06/23] 5.24 chapter5

---
 ...\206\205\345\255\230\347\256\241\347\220\206.md" | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter5/\347\254\2545\347\253\240 CUDA\350\277\220\350\241\214\346\227\266\342\200\224\342\200\224\350\256\276\345\244\207\345\206\205\345\255\230\347\256\241\347\220\206.md" "b/outputs/gpu-programming-course/docs/chapter5/\347\254\2545\347\253\240 CUDA\350\277\220\350\241\214\346\227\266\342\200\224\342\200\224\350\256\276\345\244\207\345\206\205\345\255\230\347\256\241\347\220\206.md"
index db81001..921de3d 100644
--- "a/outputs/gpu-programming-course/docs/chapter5/\347\254\2545\347\253\240 CUDA\350\277\220\350\241\214\346\227\266\342\200\224\342\200\224\350\256\276\345\244\207\345\206\205\345\255\230\347\256\241\347\220\206.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter5/\347\254\2545\347\253\240 CUDA\350\277\220\350\241\214\346\227\266\342\200\224\342\200\224\350\256\276\345\244\207\345\206\205\345\255\230\347\256\241\347\220\206.md"	
@@ -14,8 +14,8 @@
 
 | 链接方式 | Linux 库文件 | Windows 库文件 | 特点 |
 | :--- | :--- | :--- | :--- |
-| <strong>静态链接</strong> | `libcudart.a` | `cudart.lib` | 运行时代码嵌入可执行文件，无需附带 DLL |
-| <strong>动态链接</strong> | `libcudart.so` | `cudart.dll` | 可执行文件更小，运行时需要 DLL/SO 可用 |
+| <strong>静态链接</strong> | `libcudart_static.a` | `cudart_static.lib` | 运行时代码嵌入可执行文件，无需附带 DLL |
+| <strong>动态链接</strong> | `libcudart.so` | `cudart.lib（导入库） + cudart.dll（运行时）` | 可执行文件更小，运行时需要 DLL/SO 可用 |
 
 </div>
 
@@ -32,7 +32,9 @@ nvcc program.cu -o program -lcudart
 
 # 对于 CMake 项目
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(my_target CUDA::cudart)
+target_link_libraries(my_target PRIVATE CUDA::cudart)
+# 如果需要静态链接，只需替换目标名
+# target_link_libraries(my_cuda_app PRIVATE CUDA::cudart_static)
 ```
 
 ### 5.1.2 API 命名约定
@@ -252,7 +254,7 @@ cudaError_t cudaMemcpy(void *dst, const void *src,
 
 <strong>UVA 下的自动方向推断</strong>：
 
-如果设备支持<strong>统一虚拟地址空间（UVA, Unified Virtual Address Space）</strong>（CC 2.0+），可以使用 `cudaMemcpyDefault` 作为 `kind` 参数：
+如果设备支持<strong>统一虚拟地址空间（UVA, Unified Virtual Address Space）</strong>（64位 Linux/Windows + CC 2.0+），可以使用 `cudaMemcpyDefault` 作为 `kind` 参数：
 
 ```cuda
 cudaMemcpy(dst, src, count, cudaMemcpyDefault);
@@ -513,6 +515,8 @@ cudaExtent extent = make_cudaExtent(
 - <strong>pitch（行步长）</strong>：从一行开头到下一行开头的字节数。
 - <strong>slicePitch（层步长）</strong>：从一层开头到下一层开头的字节数。计算公式：`slicePitch = pitch * height`。
 
+<strong>说明</strong>：cudaMalloc3D 会自动计算并返回正确的 pitch 和 slicePitch，不要手动计算这些值，始终使用返回的 cudaPitchedPtr 结构体中的字段进行地址计算。
+
 <strong>完整的三维分配和遍历示例</strong>：
 
 ```cuda
@@ -786,6 +790,7 @@ printf("GPU operation took: %.3f ms\n", milliseconds);
 cudaEventDestroy(start);
 cudaEventDestroy(stop);
 ```
+💡 思考：如果仅需要使用事件实现流间同步，而不需要计时功能，可以在创建事件时使用 cudaEventDisableTiming 标志。为什么这样做能显著提升性能？
 
 ### 5.8.2 事件的几个重要特性
 

From ec895b60e404804474ebae271be5349159a09d9c Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Mon, 25 May 2026 13:21:28 +0000
Subject: [PATCH 07/23] 5.24 chapter6

---
 ...73\346\234\272\345\206\205\345\255\230.md" | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md" "b/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md"
index 6758b72..3f35ad2 100644
--- "a/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md"	
@@ -457,8 +457,7 @@ Bank 的组织方式在不同计算能力下有一些细微但重要的差异：
 - 硬件具有更好的并行 Bank 解析能力。
 
 <strong>计算能力 7.x / 8.x（Volta / Ampere）</strong>：
-- 32 个 Bank，4 字节交错。但硬件可以处理更复杂的访问模式。
-- 共享内存容量大幅增加（可达 164 KB/SM，可选配置）。
+- 32 个 Bank，4 字节顺序分配。但硬件可以处理更复杂的访问模式。
 - 对 `double`（8 字节）类型的访问可能会产生不同的 Bank 行为（因为一个 double 跨越 2 个 Bank）。
 
 > <strong>提示</strong>：在你的设备上，可以通过 `cudaDeviceGetAttribute()` 查询 Bank 相关信息。例如，`cudaDevAttrSharedMemBankSizeFourBytes` 属性确认 Bank 大小是否为 4 字节。
@@ -559,17 +558,21 @@ void launchDynamicReduction(const float *d_input, float *d_output, int N)
 
 ### 6.4.6 共享内存与 L1 缓存的配置
 
-在 Maxwell（CC 5.x）及更新的架构上，共享内存和 L1 数据缓存共享同一块片上 SRAM。你可以配置每个 SM 上分配给共享内存和 L1 缓存的比例：
+在 **Maxwell 到 Turing（CC 5.x ~ 7.5）** 架构中，共享内存和 L1 数据缓存共享同一片片上 SRAM，你可以配置每个 SM 上分配给两者的比例。
+
+在 **Ampere 及更新架构（CC 8.x+）** 中，两者已物理分离，共享内存的容量是独立且固定的（可配置为不同大小档位），`cudaDeviceSetCacheConfig` 不再影响共享内存容量，仅作用于 L1 缓存策略。
+
+你可以通过以下 API 查询和设置缓存偏好：
 
 ```cuda
 // 查询当前配置
 cudaFuncCache cacheConfig;
 cudaDeviceGetCacheConfig(&cacheConfig);
 
-// 设置配置（影响后续内核启动）
-// cudaFuncCachePreferNone:   无偏好（默认）
-// cudaFuncCachePreferShared: 优先共享内存（更多共享内存，更少 L1）
-// cudaFuncCachePreferL1:     优先 L1（更多 L1，更少共享内存）
+// 设置后续内核的缓存偏好
+// - cudaFuncCachePreferNone:    默认配置
+// - cudaFuncCachePreferShared:  优先共享内存（旧架构中会减少 L1）
+// - cudaFuncCachePreferL1:      优先 L1 缓存（旧架构中会减少共享内存）
 cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
 ```
 
@@ -884,7 +887,7 @@ int main()
 
 ## 6.8 共享内存与占用率（Occupancy）
 
-共享内存的使用量直接影响一个 SM 上可以同时驻留的线程块数量，从而影响<strong>占用率（Occupancy）</strong>。占用率定义为：
+共享内存的使用量直接影响一个 SM 上可以同时驻留的线程块数量，从而影响<strong>占用率（Occupancy）</strong>。简单说，占用率就是 GPU 每个 SM（流多处理器）上，当前正在运行的 warp 数，和它理论上最多能跑的 warp 数的比值。占用率定义为：
 
 $$\text{Occupancy} = \frac{\text{Active Warps per SM}}{\text{Maximum Warps per SM}}$$
 

From 3f27016415b8af46d859a476eabf6419f70dc22b Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Tue, 26 May 2026 14:50:28 +0000
Subject: [PATCH 08/23] 5.26 chapter7

---
 ...5\345\271\266\345\217\221\346\211\247\350\241\214.md" | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter7/\347\254\2547\347\253\240 \346\265\201\343\200\201\344\272\213\344\273\266\344\270\216\345\274\202\346\255\245\345\271\266\345\217\221\346\211\247\350\241\214.md" "b/outputs/gpu-programming-course/docs/chapter7/\347\254\2547\347\253\240 \346\265\201\343\200\201\344\272\213\344\273\266\344\270\216\345\274\202\346\255\245\345\271\266\345\217\221\346\211\247\350\241\214.md"
index e88a2f0..94f4484 100644
--- "a/outputs/gpu-programming-course/docs/chapter7/\347\254\2547\347\253\240 \346\265\201\343\200\201\344\272\213\344\273\266\344\270\216\345\274\202\346\255\245\345\271\266\345\217\221\346\211\247\350\241\214.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter7/\347\254\2547\347\253\240 \346\265\201\343\200\201\344\272\213\344\273\266\344\270\216\345\274\202\346\255\245\345\271\266\345\217\221\346\211\247\350\241\214.md"	
@@ -31,7 +31,7 @@ CUDA 通过<strong>异步库函数</strong>（asynchronous library functions）
 - 单个设备内部的内存拷贝；
 - 64 KB 或更小的主机到设备内存块拷贝；
 - 以 `Async` 为后缀的内存拷贝函数；
-- 内存设置函数调用（memory set function calls）。
+- 以 Async 为后缀的内存设置函数调用（如 cudaMemsetAsync）
 
 程序员可以通过设置环境变量 `CUDA_LAUNCH_BLOCKING` 为 1 来在全局范围内禁用所有 CUDA 应用程序核函数启动的异步性。这个特性仅为调试目的提供，不应作为使生产软件可靠运行的手段。
 
@@ -148,6 +148,10 @@ for (int i = 0; i < 2; ++i)
 - 所有独立操作应在依赖操作之前发出；
 - 任何类型的同步都应尽可能延迟。
 
+> 补充说明：非阻塞流与默认流的彻底解耦
+使用 cudaStreamNonBlocking 标志创建的流，称为非阻塞流。
+无论使用哪种默认流模式（legacy 或 per-thread），这类流都不会与任何默认流产生隐式同步，可以完全避免意外的串行化，是现代多流并发编程的推荐方式。
+
 ### 7.3.6 流重叠行为
 
 两个流之间执行重叠的程度取决于命令向每个流发出的顺序，以及设备是否支持数据传输与核函数执行的重叠、并发核函数执行和/或并发数据传输。
@@ -179,6 +183,7 @@ for (int i = 0; i < 2; ++i)
 ### 7.4.1 事件的创建与销毁
 
 运行时还提供了一种密切监控设备进度以及执行精确计时的方法，即让应用程序在程序中的任何点异步记录<strong>事件（event）</strong>，并查询这些事件何时完成。当事件之前的所有任务——或可选地，给定流中的所有命令——都已完成时，该事件即被视为完成。在流 0（即默认流）中记录的事件，在所有流中的所有先前任务和命令完成后才被视为完成。
+>补充：在 legacy 默认流模式下，在流 0（即 NULL 流）中记录的事件，在所有流中的所有先前任务和命令完成后才被视为完成。而在 per-thread 默认流模式下，每个线程的流 0 是独立的，仅等待该线程自身流 0 中的先前命令完成。
 
 以下代码创建两个事件：
 
@@ -525,7 +530,7 @@ kernelC<<<grid, block, 0, stream[0]>>>(...);    // 必须等待 kernelB 完成
 
 <strong>（4）设备内存设置（cudaMemset）</strong>
 
-`cudaMemset` 是同步的。如果要异步设置设备内存，请使用 `cudaMemsetAsync`。
+`cudaMemset` 是同步的。如果要异步设置设备内存，请使用 `cudaMemsetAsync`。另外，`cudaMemsetAsync`是异步操作，但如果不指定流参数，它会被发往默认流，在 legacy 模式下仍会引发全局隐式同步。
 
 ### 7.8.3 多流编程的最佳实践
 

From 2ed295fd8d9df7e0ba5f3b13249c3c60fb7f8c33 Mon Sep 17 00:00:00 2001
From: whx-6 <1901591887@qq.com>
Date: Wed, 27 May 2026 13:37:16 +0000
Subject: [PATCH 09/23] 5.27 chapter8

---
 .../code/chapter8/p2p_test.cu                 | 90 +++++++++++--------
 ...60\345\235\200\347\251\272\351\227\264.md" | 55 +++++++++---
 2 files changed, 97 insertions(+), 48 deletions(-)

diff --git a/outputs/gpu-programming-course/code/chapter8/p2p_test.cu b/outputs/gpu-programming-course/code/chapter8/p2p_test.cu
index 524e492..39dd7ea 100644
--- a/outputs/gpu-programming-course/code/chapter8/p2p_test.cu
+++ b/outputs/gpu-programming-course/code/chapter8/p2p_test.cu
@@ -22,7 +22,7 @@ int main() {
 
     printf("Found %d CUDA device(s)\n\n", deviceCount);
 
-    // Print device info
+    // 打印设备信息
     for (int i = 0; i < deviceCount; i++) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
@@ -31,7 +31,7 @@ int main() {
                prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
     }
 
-    // Check P2P capability between device 0 and 1
+    // 检查设备 0 和 1 之间的 P2P 能力
     int canAccess01, canAccess10;
     CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess01, 0, 1));
     CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess10, 1, 0));
@@ -41,75 +41,93 @@ int main() {
     printf("P2P Access: Device 1 -> Device 0: %s\n",
            canAccess10 ? "Supported" : "Not Supported");
 
-    // Test data size: 16MB
-    const size_t dataSize = 16 * 1024 * 1024;  // 16 MB
+    // 测试数据大小：16MB
+    const size_t dataSize = 16 * 1024 * 1024;
     const size_t numFloats = dataSize / sizeof(float);
     float *d_data0, *d_data1;
     float *h_data;
 
-    // Allocate device memory
+    // 分配设备内存
     CUDA_CHECK(cudaSetDevice(0));
     CUDA_CHECK(cudaMalloc(&d_data0, dataSize));
     CUDA_CHECK(cudaSetDevice(1));
     CUDA_CHECK(cudaMalloc(&d_data1, dataSize));
 
-    // Allocate page-locked host memory
+    // 分配页锁定主机内存
     CUDA_CHECK(cudaMallocHost(&h_data, dataSize));
 
-    // Initialize data
+    // 初始化数据
     for (size_t i = 0; i < numFloats; i++) {
         h_data[i] = (float)i;
     }
 
-    // Copy data to device 0
+    // 将数据拷贝到设备 0
     CUDA_CHECK(cudaSetDevice(0));
     CUDA_CHECK(cudaMemcpy(d_data0, h_data, dataSize, cudaMemcpyHostToDevice));
 
-    // Create events for timing
-    cudaEvent_t start, stop;
-    CUDA_CHECK(cudaEventCreate(&start));
-    CUDA_CHECK(cudaEventCreate(&stop));
+    // ---------- 预创建设备 0 和设备 1 各自的事件对 ----------
+    cudaEvent_t start0, stop0;   // 用于设备 0 上的计时
+    cudaEvent_t start1, stop1;   // 用于设备 1 上的计时
+
+    CUDA_CHECK(cudaSetDevice(0));
+    CUDA_CHECK(cudaEventCreate(&start0));
+    CUDA_CHECK(cudaEventCreate(&stop0));
+
+    CUDA_CHECK(cudaSetDevice(1));
+    CUDA_CHECK(cudaEventCreate(&start1));
+    CUDA_CHECK(cudaEventCreate(&stop1));
+
     float elapsedTime;
 
-    // === Test 1: P2P direct copy using cudaMemcpyPeer ===
+    // === 测试1: P2P 直接拷贝（使用 cudaMemcpyPeer）===
     CUDA_CHECK(cudaSetDevice(0));
-    CUDA_CHECK(cudaEventRecord(start, 0));
+    CUDA_CHECK(cudaEventRecord(start0, 0));
     CUDA_CHECK(cudaMemcpyPeer(d_data1, 1, d_data0, 0, dataSize));
-    CUDA_CHECK(cudaEventRecord(stop, 0));
-    CUDA_CHECK(cudaEventSynchronize(stop));
-    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
+    CUDA_CHECK(cudaEventRecord(stop0, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop0));
+    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start0, stop0));
     printf("\n=== P2P Memory Copy Performance ===\n");
     printf("P2P direct copy (cudaMemcpyPeer): %.3f ms, Bandwidth: %.2f GB/s\n",
            elapsedTime, (dataSize / (elapsedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
 
-    // === Test 2: Copy via host staging ===
+    // === 测试2: 通过主机中转的拷贝（分段计时并累加）===
+    // 第一段：设备 0 -> 主机
     CUDA_CHECK(cudaSetDevice(0));
-    CUDA_CHECK(cudaEventRecord(start, 0));
-    // Device 0 -> Host
+    CUDA_CHECK(cudaEventRecord(start0, 0));
     CUDA_CHECK(cudaMemcpy(h_data, d_data0, dataSize, cudaMemcpyDeviceToHost));
-    // Host -> Device 1
+    CUDA_CHECK(cudaEventRecord(stop0, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop0));
+    float timeD2H;
+    CUDA_CHECK(cudaEventElapsedTime(&timeD2H, start0, stop0));
+
+    // 第二段：主机 -> 设备 1
     CUDA_CHECK(cudaSetDevice(1));
+    CUDA_CHECK(cudaEventRecord(start1, 0));
     CUDA_CHECK(cudaMemcpy(d_data1, h_data, dataSize, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaEventRecord(stop, 0));
-    CUDA_CHECK(cudaEventSynchronize(stop));
-    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
+    CUDA_CHECK(cudaEventRecord(stop1, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop1));
+    float timeH2D;
+    CUDA_CHECK(cudaEventElapsedTime(&timeH2D, start1, stop1));
+
+    float totalStagedTime = timeD2H + timeH2D;
     printf("Host-staged copy (D2H + H2D):   %.3f ms, Bandwidth: %.2f GB/s\n",
-           elapsedTime, (dataSize / (elapsedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
+           totalStagedTime,
+           (dataSize / (totalStagedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
 
-    // === Test 3: Try P2P access with UVA ===
+    // === 测试3: 尝试启用 P2P 访问后的 UVA 直接访问 ===
     if (canAccess01 && canAccess10) {
         CUDA_CHECK(cudaSetDevice(0));
         CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));
         CUDA_CHECK(cudaSetDevice(1));
         CUDA_CHECK(cudaDeviceEnablePeerAccess(0, 0));
 
-        // Use cudaMemcpyDefault for P2P copy
+        // 使用 cudaMemcpyDefault 进行 P2P 拷贝
         CUDA_CHECK(cudaSetDevice(0));
-        CUDA_CHECK(cudaEventRecord(start, 0));
+        CUDA_CHECK(cudaEventRecord(start0, 0));
         CUDA_CHECK(cudaMemcpy(d_data1, d_data0, dataSize, cudaMemcpyDefault));
-        CUDA_CHECK(cudaEventRecord(stop, 0));
-        CUDA_CHECK(cudaEventSynchronize(stop));
-        CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
+        CUDA_CHECK(cudaEventRecord(stop0, 0));
+        CUDA_CHECK(cudaEventSynchronize(stop0));
+        CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start0, stop0));
         printf("P2P copy via cudaMemcpyDefault:   %.3f ms, Bandwidth: %.2f GB/s\n",
                elapsedTime,
                (dataSize / (elapsedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
@@ -117,15 +135,17 @@ int main() {
         printf("\nP2P access enabled successfully. Same pointer can be used on both devices.\n");
     }
 
-    // Cleanup
+    // 清理资源
     CUDA_CHECK(cudaFreeHost(h_data));
     CUDA_CHECK(cudaSetDevice(0));
     CUDA_CHECK(cudaFree(d_data0));
     CUDA_CHECK(cudaSetDevice(1));
     CUDA_CHECK(cudaFree(d_data1));
-    CUDA_CHECK(cudaEventDestroy(start));
-    CUDA_CHECK(cudaEventDestroy(stop));
+    CUDA_CHECK(cudaEventDestroy(start0));
+    CUDA_CHECK(cudaEventDestroy(stop0));
+    CUDA_CHECK(cudaEventDestroy(start1));
+    CUDA_CHECK(cudaEventDestroy(stop1));
 
     printf("\nTest completed successfully.\n");
     return 0;
-}
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/chapter8/\347\254\2548\347\253\240 \345\244\232\350\256\276\345\244\207\347\263\273\347\273\237\344\270\216\347\273\237\344\270\200\350\231\232\346\213\237\345\234\260\345\235\200\347\251\272\351\227\264.md" "b/outputs/gpu-programming-course/docs/chapter8/\347\254\2548\347\253\240 \345\244\232\350\256\276\345\244\207\347\263\273\347\273\237\344\270\216\347\273\237\344\270\200\350\231\232\346\213\237\345\234\260\345\235\200\347\251\272\351\227\264.md"
index c4b7e03..7a19baf 100644
--- "a/outputs/gpu-programming-course/docs/chapter8/\347\254\2548\347\253\240 \345\244\232\350\256\276\345\244\207\347\263\273\347\273\237\344\270\216\347\273\237\344\270\200\350\231\232\346\213\237\345\234\260\345\235\200\347\251\272\351\227\264.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter8/\347\254\2548\347\253\240 \345\244\232\350\256\276\345\244\207\347\263\273\347\273\237\344\270\216\347\273\237\344\270\200\350\231\232\346\213\237\345\234\260\345\235\200\347\251\272\351\227\264.md"	
@@ -147,7 +147,7 @@ MyKernel<<<1000, 128>>>(p1); // 在设备 1 上启动核函数
 
 ### 8.3.2 多线程与设备亲和性
 
-在多线程程序中，每个线程可以独立地设置自己的当前设备。这在需要多个线程同时驱动多个 GPU 时特别有用。需要注意的是，CUDA 上下文是与 CPU 线程相关联的——每个主机线程在使用特定设备时会创建自己的 CUDA 上下文。
+在多线程程序中，每个线程可以独立地设置自己的当前设备。这在需要多个线程同时驱动多个 GPU 时特别有用。需要注意的是，CUDA 运行时为每个设备创建一个全局共享的主上下文（primary context），而非每个线程创建独立上下文。该主上下文由应用程序的所有主机线程共享，当任意线程首次调用需要上下文的 CUDA 函数时，运行时会自动初始化对应设备的主上下文。多个线程可以同时安全地使用同一个主上下文，CUDA 运行时会保证内部操作的线程安全。
 
 ```cuda
 // 线程函数示例：每个线程驱动一个不同的 GPU
@@ -241,7 +241,19 @@ if (canAccessPeer) {
 
 ### 8.5.4 Linux 上的 IOMMU 注意事项
 
-仅在 Linux 上，CUDA 和显示驱动不支持启用了 IOMMU 的裸金属 PCIe 对等内存拷贝。然而，CUDA 和显示驱动通过 VM 直通支持 IOMMU。因此，在 Linux 上的原生裸金属系统上运行时，用户应禁用 IOMMU。对于虚拟机，应启用 IOMMU 并使用 VFIO 驱动作为 PCIe 直通。在 Windows 上，不存在上述限制。
+在 Linux 平台上，IOMMU（输入/输出内存管理单元）会对 PCIe 设备的地址访问进行翻译与隔离，这会影响 CUDA 的 P2P 内存拷贝功能：
+
+1.  **裸金属（Bare-metal）服务器场景**
+    若在 Linux 物理服务器上启用了 IOMMU，CUDA 驱动将**不支持 PCIe 路径的 P2P 内存拷贝**。
+    因此，若需要在物理机上使用 P2P 功能，可采取以下任一方案：
+    *   在 BIOS 中禁用 IOMMU。
+    *   将 IOMMU 设置为直通（Passthrough）模式，通过在内核引导参数中添加 `iommu=pt` 实现，这能有效解决此问题。
+
+2.  **虚拟化直通场景**
+    若使用虚拟机并通过 PCIe 直通（VFIO 驱动）将 GPU 分配给虚拟机，则可以正常启用 IOMMU，且不影响 CUDA 驱动的 P2P 支持。
+
+3.  **Windows 平台**
+    Windows 系统对 IOMMU 的处理机制与 Linux 不同，不存在上述限制，无需额外配置即可正常使用 P2P 功能。
 
 ## 8.6 对等（Peer-to-Peer）内存拷贝
 
@@ -283,17 +295,17 @@ MyKernel<<<1000, 128>>>(p1);        // 在设备 1 上启动核函数
 
 这与流的常规行为一致——异步拷贝可能与其他流中的拷贝或核函数重叠。
 
-更重要的是，如果通过 `cudaDeviceEnablePeerAccess()` 启用了两设备间的 P2P 访问，则 P2P 内存拷贝不再需要通过主机中转，因此速度更快。这意味着启用 P2P 访问不仅允许直接内存访问，还能加速显式的 P2P 拷贝。
+更重要的是，当硬件拓扑支持 P2P 传输，且通过 cudaDeviceEnablePeerAccess() 启用了设备间对等访问权限时，cudaMemcpyPeer 会自动使用设备间直接传输路径，无需通过主机中转，因此速度更快。这意味着启用 P2P 访问权限不仅允许核函数直接访问远程 GPU 内存，还能为 cudaMemcpyPeer 等拷贝操作提供高速传输通道，充分发挥硬件的性能优势。
 
 ## 8.7 统一虚拟地址空间（UVA）
 
 ### 8.7.1 什么是 UVA
 
-当应用程序作为 64 位进程运行时，主机和所有计算能力 2.0 及以上的设备使用<strong>单一地址空间（unified address space）</strong>。通过 CUDA API 调用进行的所有主机内存分配和受支持设备上的所有设备内存分配，都在此虚拟地址范围内。其结果是：
+当应用程序作为 **64 位进程**运行时，主机和所有计算能力 2.0 及以上的 CUDA 设备会共享一个**统一的虚拟地址空间（Unified Virtual Addressing, UVA）**。32 位进程不支持 UVA，只有通过 CUDA API 分配的内存（如 `cudaMalloc`、`cudaHostAlloc`、`cudaMallocManaged`），其虚拟地址才会被纳入这个统一地址空间；普通 `malloc` 分配的主机内存默认不在此范围内。UVA 的核心结果是：
 
 1. <strong>指针定位</strong>：通过 CUDA 分配的主机内存或使用统一地址空间的任何设备上的任何内存的位置，可以通过指针的值使用 `cudaPointerGetAttributes()` 来确定。
 
-2. <strong>简化拷贝</strong>：在向或从使用统一地址空间的任何设备内存进行拷贝时，`cudaMemcpy*()` 的 `cudaMemcpyKind` 参数可以设置为 `cudaMemcpyDefault`，以便从指针确定位置。这对于非通过 CUDA 分配的主机指针也有效，只要当前设备使用统一寻址。
+2. <strong>简化拷贝</strong>：在使用统一地址空间的任何设备内存进行拷贝时，`cudaMemcpy*()` 的 `cudaMemcpyKind` 参数可以设置为 `cudaMemcpyDefault`，以便从指针确定位置。
 
 3. <strong>自动可移植内存</strong>：通过 `cudaHostAlloc()` 进行的分配在使用统一地址空间的所有设备上自动可移植（portable），并且由 `cudaHostAlloc()` 返回的指针可以直接从运行在这些设备上的核函数中使用——不再需要通过 `cudaHostGetDevicePointer()` 获取设备指针。
 
@@ -514,7 +526,7 @@ int main() {
     CUDA_CHECK(cudaSetDevice(0));
     CUDA_CHECK(cudaMemcpy(d_data0, h_data, dataSize, cudaMemcpyHostToDevice));
 
-    // 创建事件用于计时
+    // 创建事件用于计时（绑定到设备 0）
     cudaEvent_t start, stop;
     CUDA_CHECK(cudaEventCreate(&start));
     CUDA_CHECK(cudaEventCreate(&stop));
@@ -531,19 +543,36 @@ int main() {
     printf("P2P direct copy (cudaMemcpyPeer): %.3f ms, Bandwidth: %.2f GB/s\n",
            elapsedTime, (dataSize / (elapsedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
 
-    // === 测试2: 通过主机中转的拷贝 ===
+    // === 测试2: 通过主机中转的拷贝（分两段计时再累加）===
+    // 第一段：设备 0 -> 主机
     CUDA_CHECK(cudaSetDevice(0));
     CUDA_CHECK(cudaEventRecord(start, 0));
-    // 设备 0 -> 主机
     CUDA_CHECK(cudaMemcpy(h_data, d_data0, dataSize, cudaMemcpyDeviceToHost));
-    // 主机 -> 设备 1
-    CUDA_CHECK(cudaSetDevice(1));
-    CUDA_CHECK(cudaMemcpy(d_data1, h_data, dataSize, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaEventRecord(stop, 0));
     CUDA_CHECK(cudaEventSynchronize(stop));
-    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
+    float timeD2H;
+    CUDA_CHECK(cudaEventElapsedTime(&timeD2H, start, stop));
+
+    // 第二段：主机 -> 设备 1（在设备 1 上创建新事件）
+    cudaEvent_t start1, stop1;
+    CUDA_CHECK(cudaSetDevice(1));
+    CUDA_CHECK(cudaEventCreate(&start1));
+    CUDA_CHECK(cudaEventCreate(&stop1));
+    CUDA_CHECK(cudaEventRecord(start1, 0));
+    CUDA_CHECK(cudaMemcpy(d_data1, h_data, dataSize, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaEventRecord(stop1, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop1));
+    float timeH2D;
+    CUDA_CHECK(cudaEventElapsedTime(&timeH2D, start1, stop1));
+
+    float totalStagedTime = timeD2H + timeH2D;
     printf("Host-staged copy (D2H + H2D):   %.3f ms, Bandwidth: %.2f GB/s\n",
-           elapsedTime, (dataSize / (elapsedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
+           totalStagedTime,
+           (dataSize / (totalStagedTime / 1000.0)) / (1024.0 * 1024.0 * 1024.0));
+
+    // 销毁设备 1 上的事件
+    CUDA_CHECK(cudaEventDestroy(start1));
+    CUDA_CHECK(cudaEventDestroy(stop1));
 
     // === 测试3: 尝试启用 P2P 访问后的 UVA 直接访问 ===
     if (canAccess01 && canAccess10) {

From 843fbe9f85913713c3f8925743762cbbf9195058 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Sat, 30 May 2026 19:54:59 +0800
Subject: [PATCH 10/23] 5.30 chapter9

---
 .../code/chapter1/hello_cuda.cu               | 50 +++++++++----------
 ...44\270\216Warp\350\260\203\345\272\246.md" | 39 +++++++++++----
 2 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/outputs/gpu-programming-course/code/chapter1/hello_cuda.cu b/outputs/gpu-programming-course/code/chapter1/hello_cuda.cu
index 13e1cc5..9cec295 100644
--- a/outputs/gpu-programming-course/code/chapter1/hello_cuda.cu
+++ b/outputs/gpu-programming-course/code/chapter1/hello_cuda.cu
@@ -1,54 +1,50 @@
 /**
- * hello_cuda.cu - Chapter 1: First CUDA Program
- *
- * A minimal CUDA kernel that each thread prints its greeting.
- * Demonstrates: __global__, threadIdx, and the <<<...>>> launch syntax.
- *
- * Compile: nvcc hello_cuda.cu -o hello_cuda
- * Run:     ./hello_cuda
+ * hello_cuda.cu - Chapter 1: First CUDA Kernel
+ * 演示 __global__, threadIdx, blockIdx 和 <<<...>>> 启动语法
  */
-
 #include <cuda_runtime.h>
 #include <stdio.h>
 
 /**
- * @brief  A simple kernel where each thread prints a greeting.
- *
- * Each of the N threads that execute this kernel prints one line.
- * The built-in variable threadIdx.x identifies the thread within the block.
+ * 内核函数：每个执行此函数的线程打印一条问候
+ * 使用 __global__ 声明，从CPU端调用，在GPU上执行
  */
 __global__ void helloFromGPU()
 {
-    // Each thread prints a greeting with its unique thread index
-    printf("Hello World from GPU! I am thread [%d] in block [%d]\n",
-           threadIdx.x, blockIdx.x);
+    // threadIdx.x: 当前线程在其线程块内的索引（从0开始）
+    // blockIdx.x:  当前线程块在网格中的索引（从0开始）
+    printf("来自GPU的问候! 我是线程块[%d]中的线程[%d]\n",
+           blockIdx.x, threadIdx.x);
 }
 
 int main()
 {
-    // 1. Print a greeting from the CPU (host)
-    printf("Hello World from CPU!\n\n");
+    // 步骤1：从CPU端打印（这是普通的C/C++代码）
+    printf("来自CPU的问候!\n\n");
 
-    // 2. Launch the kernel with 2 blocks, each with 4 threads
-    //    Syntax: kernel<<<numBlocks, threadsPerBlock>>>(args)
-    int numBlocks = 2;
-    int threadsPerBlock = 4;
+    // 步骤2：配置内核启动参数
+    int numBlocks = 2;         // 启动2个线程块
+    int threadsPerBlock = 4;   // 每个线程块包含4个线程
+    // 总共 2 * 4 = 8 个CUDA线程
 
-    printf("Launching kernel with %d block(s) and %d thread(s) per block\n\n",
-           numBlocks, threadsPerBlock);
+    printf("启动内核：%d个线程块 x %d个线程/块 = %d个线程\n\n",
+           numBlocks, threadsPerBlock,
+           numBlocks * threadsPerBlock);
 
+    // 步骤3：使用 <<<numBlocks, threadsPerBlock>>> 语法启动内核
+    // 注意：内核启动是异步的——CPU不会等待GPU完成就继续执行
     helloFromGPU<<<numBlocks, threadsPerBlock>>>();
 
-    // 3. Wait for the GPU to finish before accessing the results
+    // 步骤4：等待GPU完成所有提交的工作
     cudaError_t error = cudaDeviceSynchronize();
     if (error != cudaSuccess)
     {
-        printf("CUDA error after kernel launch: %s\n",
+        printf("内核启动后出现CUDA错误: %s\n",
                cudaGetErrorString(error));
         return 1;
     }
 
-    printf("\nKernel execution completed successfully!\n");
+    printf("\n内核执行成功完成!\n");
 
     return 0;
-}
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/chapter9/\347\254\2549\347\253\240 \347\241\254\344\273\266\345\256\236\347\216\260\342\200\224\342\200\224SIMT\346\236\266\346\236\204\344\270\216Warp\350\260\203\345\272\246.md" "b/outputs/gpu-programming-course/docs/chapter9/\347\254\2549\347\253\240 \347\241\254\344\273\266\345\256\236\347\216\260\342\200\224\342\200\224SIMT\346\236\266\346\236\204\344\270\216Warp\350\260\203\345\272\246.md"
index cbd43db..16c45d3 100644
--- "a/outputs/gpu-programming-course/docs/chapter9/\347\254\2549\347\253\240 \347\241\254\344\273\266\345\256\236\347\216\260\342\200\224\342\200\224SIMT\346\236\266\346\236\204\344\270\216Warp\350\260\203\345\272\246.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter9/\347\254\2549\347\253\240 \347\241\254\344\273\266\345\256\236\347\216\260\342\200\224\342\200\224SIMT\346\236\266\346\236\204\344\270\216Warp\350\260\203\345\272\246.md"	
@@ -28,7 +28,7 @@ NVIDIA GPU 架构围绕一个可扩展的<strong>流式多处理器（Streaming
 - <strong>L1 缓存/共享内存</strong>：共享的片上存储器，可在 L1 缓存和共享内存之间灵活划分。
 - <strong>常量缓存和纹理缓存</strong>：只读缓存。
 
-指令在 SM 中是流水线化的（pipelined），利用单一线程内的指令级并行性（instruction-level parallelism），以及通过同时硬件多线程（simultaneous hardware multithreading）实现的广泛线程级并行性。与 CPU 核心不同，指令是按顺序发出的（in-order issue），没有分支预测或推测执行。
+指令在 SM 中是流水线化的（pipelined），利用单一线程内的指令级并行性（instruction-level parallelism），以及通过同时硬件多线程（simultaneous hardware multithreading）实现的广泛线程级并行性。与 CPU 核心不同，指令是顺序发出的（in-order issue），现代 GPU 包含分支预测单元以减少 warp 发散带来的性能损失，但与 CPU 不同，它没有推测执行（speculative execution）。
 
 <div align="center"><img src="../images/chapter9-figures/automatic-scalability.png" /><p>图 9.1 CUDA 线程块在多处理器上的自动可扩展性</p></div>
 
@@ -110,17 +110,37 @@ __global__ void divergentKernel(float* data, int N) {
 例如，以下代码在 Pascal 及更早架构上可能可以工作（依赖 warp 内的隐式同步）：
 
 ```cuda
-// 不安全的 warp 内归约（在 Volta+ 上可能产生错误结果）
-__device__ int warpReduce(int val) {
-    // 假设 warp 内的线程以锁步方式执行
+// 不安全的 warp 内归约（依赖隐式同步，Volta+ 上可能出错）
+__device__ int unsafeWarpReduce(int val) {
+    __shared__ int smem[32];
+    int tid = threadIdx.x;
+    smem[tid] = val;
+    // 缺少显式同步：其他线程可能看不到 smem[tid] 的写入
+    for (int offset = 16; offset > 0; offset /= 2) {
+        if (tid < offset) {
+            smem[tid] += smem[tid + offset];
+        }
+    }
+    return smem[0];
+}
+
+// 安全的 warp 内归约（使用显式同步）
+__device__ int safeWarpReduce(int val) {
+    __shared__ int smem[32];
+    int tid = threadIdx.x;
+    smem[tid] = val;
+    __syncwarp();  // 确保所有线程完成写入后再继续
     for (int offset = 16; offset > 0; offset /= 2) {
-        val += __shfl_down_sync(0xFFFFFFFF, val, offset);
+        if (tid < offset) {
+            smem[tid] += smem[tid + offset];
+        }
+        __syncwarp();  // 确保累加结果对其他线程可见
     }
-    return val;
+    return smem[0];
 }
 ```
 
-在 Volta+ 上，需要使用明确的同步掩码（mask）和 `__syncwarp()` 来确保正确性。
+当 warp 内的线程通过共享内存来通信时，编译器可能会将共享内存访问优化到寄存器中，导致一个线程看不到另一个线程写入的值。__syncwarp() 会强制编译器在同步点重新加载共享内存的值，确保 warp 内的所有线程在同步点之后看到一致的内存视图。
 
 ### 9.3.7 活跃线程与不活跃线程
 
@@ -253,7 +273,7 @@ cudaOccupancyMaxPotentialBlockSizeVariableSMem(
 | 2.x (Fermi) | 2 | 2 (来自不同 warp) |
 | 3.x (Kepler) | 4 | 4 |
 | 5.x (Maxwell) | 4 | 4 |
-| 6.x (Pascal) | 2 | 2 |
+| 6.x (Pascal) | 2或4 | 2或4 |
 | 7.x (Volta) | 4 | 4 |
 | 8.x (Ampere) | 4 | 4 |
 
@@ -280,7 +300,8 @@ cudaOccupancyMaxPotentialBlockSizeVariableSMem(
 
 理解 warp 的另一个重要原因是<strong>全局内存合并访问（coalesced access）</strong>。当 warp 中的所有 32 个线程访问全局内存时，硬件会尝试将这些访问合并为尽可能少的内存事务。
 
-在较新的 GPU 架构（计算能力 6.0+）上，合并规则已经简化：如果 warp 内的线程访问同一 32 字节对齐段内的地址，这些访问会被合并。如果访问跨越了 32 字节边界，则需要多个内存事务。
+在计算能力 6.0 及以上的现代 GPU 中，硬件访问全局内存的基本单元是 32 字节的内存事务。如果一个 warp 的 32 个线程访问连续的 128 字节数据，硬件会将其“打包”成 4 次 32 字节的事务来完成。
+访问是否对齐也至关重要。如果 32 个线程访问的起始地址是 128 字节的倍数，硬件可以最高效地完成这些事务；否则，一个 warp 的访问可能会跨越更多的 32 字节扇区，导致额外的内存事务，降低有效带宽。
 
 考虑以下两种访问模式：
 

From dd1d1e2ff822e7815c6f28e2d6ca6297a97cf9b3 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Mon, 1 Jun 2026 18:40:57 +0800
Subject: [PATCH 11/23] 5.31 chapter10

---
 .../code/chapter10/coalescing_benchmark.cu    | 143 ++++----
 .../code/chapter10/occupancy_tuning.cu        | 314 +++++++-----------
 ...77\351\227\256\344\274\230\345\214\226.md" |   4 +-
 3 files changed, 202 insertions(+), 259 deletions(-)

diff --git a/outputs/gpu-programming-course/code/chapter10/coalescing_benchmark.cu b/outputs/gpu-programming-course/code/chapter10/coalescing_benchmark.cu
index 60b2c26..b69e117 100644
--- a/outputs/gpu-programming-course/code/chapter10/coalescing_benchmark.cu
+++ b/outputs/gpu-programming-course/code/chapter10/coalescing_benchmark.cu
@@ -1,5 +1,5 @@
 /**
- * Chapter 10 - Experiment 10-1: Global Memory Coalescing Benchmark
+ * Chapter 10 - Experiment 10-1: Global Memory Coalescing Benchmark (FIXED for Online Judge)
  *
  * Measures the impact of different global memory access patterns on bandwidth.
  * Compile: nvcc -arch=sm_86 -O3 coalescing_benchmark.cu -o coalescing_benchmark
@@ -8,6 +8,7 @@
 
 #include <stdio.h>
 #include <cuda_runtime.h>
+#include <math.h>
 
 #define CHECK_CUDA(call) {                                            \
     cudaError_t err = call;                                           \
@@ -54,22 +55,12 @@ __global__ void readStride32(const float * __restrict__ input,
     }
 }
 
-// Random access (pseudo-random pattern based on a deterministic shuffle)
-__global__ void readRandom(const float * __restrict__ input,
-                            float * __restrict__ output,
-                            const int * __restrict__ indices, int n) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    if (tid < n) {
-        int idx = indices[tid];
-        output[tid] = input[idx];
-    }
-}
-
 // Vectorized load using float4 (best-case coalescing)
 __global__ void readFloat4(const float * __restrict__ input,
                             float * __restrict__ output, int n) {
     int idx = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
-    if (idx < n) {
+    // Ensure we don't write beyond the array
+    if (idx + 3 < n) {
         float4 val = reinterpret_cast<const float4*>(input)[threadIdx.x + blockIdx.x * blockDim.x];
         output[idx + 0] = val.x;
         output[idx + 1] = val.y;
@@ -78,6 +69,8 @@ __global__ void readFloat4(const float * __restrict__ input,
     }
 }
 
+// Helper to benchmark a kernel
+// n: number of elements to process (not including stride)
 float benchmarkKernel(void (*kernel)(const float*, float*, int),
                       float *d_in, float *d_out, int n,
                       int gridSize, int blockSize, int iterations) {
@@ -115,68 +108,77 @@ int main() {
     printf("=== Coalescing Benchmark ===\n");
     printf("GPU: %s\n", prop.name);
     printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
-    printf("Global Memory Bandwidth: %.1f GB/s\n",
-           prop.memoryClockRate * (prop.memoryBusWidth / 8) * 2 / 1e6);
+    double peakBW = prop.memoryClockRate * (prop.memoryBusWidth / 8) * 2 / 1e6;
+    printf("Global Memory Bandwidth (theoretical peak): %.1f GB/s\n", peakBW);
     printf("\n");
 
-    const int N = 32 * 1024 * 1024;  // 32M elements = 128 MB
+    // 🔧 关键修复1：减小测试规模以适应在线评测系统
+    const int N = 8 * 1024 * 1024;    // 8M元素 = 32MB（原32M）
     const int blockSize = 256;
-    const int gridSize = (N + blockSize - 1) / blockSize;
-    const int iterations = 100;
-    const size_t bytes = N * sizeof(float);
+    const int iterations = 20;        // 🔧 关键修复2：减少迭代次数（原100）
+    const size_t totalBytes = N * sizeof(float);
 
-    // Allocate host memory
-    float *h_in = (float*)malloc(bytes);
-    float *h_out = (float*)malloc(bytes);
+    // Allocate host memory and initialize
+    float *h_in = (float*)malloc(totalBytes);
+    float *h_out = (float*)malloc(totalBytes);
     for (int i = 0; i < N; i++) h_in[i] = (float)i;
 
     // Allocate device memory
     float *d_in, *d_out;
-    CHECK_CUDA(cudaMalloc(&d_in, bytes));
-    CHECK_CUDA(cudaMalloc(&d_out, bytes));
-    CHECK_CUDA(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice));
-
-    printf("Array size: %d float elements (%.1f MB)\n", N, bytes / (1024.0*1024.0));
-    printf("Block size: %d, Grid size: %d, Iterations: %d\n\n",
-           blockSize, gridSize, iterations);
-
-    printf("%-25s %10s %15s %20s\n", "Access Pattern", "Time(ms)", "Bandwidth(GB/s)", "% of Peak");
-    printf("----------------------------------------------------------------------------\n");
-
-    // Test each access pattern
-    float ms, bw, pct;
-    float peakBW = prop.memoryClockRate * (prop.memoryBusWidth / 8) * 2 / 1e6;
-
-    // Stride 1
-    ms = benchmarkKernel(readStride1, d_in, d_out, N, gridSize, blockSize, iterations);
-    bw = (bytes * 2) / (ms / 1000.0) / 1e9;  // read + write
-    pct = bw / peakBW * 100;
-    printf("%-25s %10.4f %15.2f %20.1f%%\n", "Stride=1 (coalesced)", ms, bw, pct);
-
-    // Stride 2
-    ms = benchmarkKernel(readStride2, d_in, d_out, N/2, gridSize, blockSize, iterations);
-    bw = (bytes) / (ms / 1000.0) / 1e9;  // approximate bytes loaded
-    pct = bw / peakBW * 100;
-    printf("%-25s %10.4f %15.2f %20.1f%%\n", "Stride=2", ms, bw, pct);
-
-    // Stride 8
-    ms = benchmarkKernel(readStride8, d_in, d_out, N/8, gridSize, blockSize, iterations);
-    bw = (bytes / 4) / (ms / 1000.0) / 1e9;
-    pct = bw / peakBW * 100;
-    printf("%-25s %10.4f %15.2f %20.1f%%\n", "Stride=8", ms, bw, pct);
-
-    // Stride 32
-    ms = benchmarkKernel(readStride32, d_in, d_out, N/32, gridSize, blockSize, iterations);
-    bw = (bytes / 16) / (ms / 1000.0) / 1e9;
-    pct = bw / peakBW * 100;
-    printf("%-25s %10.4f %15.2f %20.1f%%\n", "Stride=32 (worst)", ms, bw, pct);
-
-    // Float4 vectorized
-    int gridFloat4 = (N / 4 + blockSize - 1) / blockSize;
-    ms = benchmarkKernel(readFloat4, d_in, d_out, N, gridFloat4, blockSize, iterations);
-    bw = (bytes * 2) / (ms / 1000.0) / 1e9;
-    pct = bw / peakBW * 100;
-    printf("%-25s %10.4f %15.2f %20.1f%%\n", "Float4 vectorized", ms, bw, pct);
+    CHECK_CUDA(cudaMalloc(&d_in, totalBytes));
+    CHECK_CUDA(cudaMalloc(&d_out, totalBytes));
+    CHECK_CUDA(cudaMemcpy(d_in, h_in, totalBytes, cudaMemcpyHostToDevice));
+
+    printf("Array size: %d float elements (%.1f MB)\n", N, totalBytes / (1024.0*1024.0));
+    printf("Block size: %d, Iterations: %d\n\n", blockSize, iterations);
+
+    printf("%-30s %12s %15s %15s\n", "Access Pattern", "Time(ms)", "BW(GB/s)", "% of Peak");
+    printf("--------------------------------------------------------------------------------\n");
+
+    // --- Test Stride=1 (coalesced) ---
+    int effectiveN = N;               // stride 1: all elements
+    int gridSize = (effectiveN + blockSize - 1) / blockSize;
+    float ms = benchmarkKernel(readStride1, d_in, d_out, effectiveN, gridSize, blockSize, iterations);
+    size_t bytesPerKernel = 2 * effectiveN * sizeof(float);  // read + write
+    double bw = bytesPerKernel / (ms / 1000.0) / 1e9;
+    double pct = bw / peakBW * 100.0;
+    printf("%-30s %12.4f %15.2f %14.1f%%\n", "Stride=1 (coalesced)", ms, bw, pct);
+
+    // --- Test Stride=2 ---
+    effectiveN = N / 2;
+    gridSize = (effectiveN + blockSize - 1) / blockSize;
+    ms = benchmarkKernel(readStride2, d_in, d_out, effectiveN, gridSize, blockSize, iterations);
+    bytesPerKernel = 2 * effectiveN * sizeof(float);
+    bw = bytesPerKernel / (ms / 1000.0) / 1e9;
+    pct = bw / peakBW * 100.0;
+    printf("%-30s %12.4f %15.2f %14.1f%%\n", "Stride=2", ms, bw, pct);
+
+    // --- Test Stride=8 ---
+    effectiveN = N / 8;
+    gridSize = (effectiveN + blockSize - 1) / blockSize;
+    ms = benchmarkKernel(readStride8, d_in, d_out, effectiveN, gridSize, blockSize, iterations);
+    bytesPerKernel = 2 * effectiveN * sizeof(float);
+    bw = bytesPerKernel / (ms / 1000.0) / 1e9;
+    pct = bw / peakBW * 100.0;
+    printf("%-30s %12.4f %15.2f %14.1f%%\n", "Stride=8", ms, bw, pct);
+
+    // --- Test Stride=32 (worst) ---
+    effectiveN = N / 32;
+    gridSize = (effectiveN + blockSize - 1) / blockSize;
+    ms = benchmarkKernel(readStride32, d_in, d_out, effectiveN, gridSize, blockSize, iterations);
+    bytesPerKernel = 2 * effectiveN * sizeof(float);
+    bw = bytesPerKernel / (ms / 1000.0) / 1e9;
+    pct = bw / peakBW * 100.0;
+    printf("%-30s %12.4f %15.2f %14.1f%%\n", "Stride=32 (worst)", ms, bw, pct);
+
+    // --- Test float4 vectorized (still coalesced, fewer transactions) ---
+    effectiveN = N;   // all elements, but accessed in groups of 4
+    int gridFloat4 = ((N/4) + blockSize - 1) / blockSize;
+    ms = benchmarkKernel(readFloat4, d_in, d_out, effectiveN, gridFloat4, blockSize, iterations);
+    bytesPerKernel = 2 * N * sizeof(float);   // full array read+write
+    bw = bytesPerKernel / (ms / 1000.0) / 1e9;
+    pct = bw / peakBW * 100.0;
+    printf("%-30s %12.4f %15.2f %14.1f%%\n", "Float4 vectorized", ms, bw, pct);
 
     // Cleanup
     CHECK_CUDA(cudaFree(d_in));
@@ -184,8 +186,9 @@ int main() {
     free(h_in);
     free(h_out);
 
-    printf("\nConclusion: Stride-1 coalesced access achieves %.1f%% of peak bandwidth.\n", pct);
-    printf("Increasing stride dramatically reduces effective bandwidth.\n");
+    printf("\nConclusion: Coalesced access (Stride-1) achieves high bandwidth.\n");
+    printf("Increasing stride degrades performance due to increased memory transactions.\n");
+    printf("Vectorized loads can further improve efficiency on some architectures.\n");
 
     return 0;
-}
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/chapter10/occupancy_tuning.cu b/outputs/gpu-programming-course/code/chapter10/occupancy_tuning.cu
index 8bd4c40..6f9c991 100644
--- a/outputs/gpu-programming-course/code/chapter10/occupancy_tuning.cu
+++ b/outputs/gpu-programming-course/code/chapter10/occupancy_tuning.cu
@@ -1,16 +1,13 @@
 /**
- * Chapter 10 - Experiment 10-3: Occupancy Tuning
- *
- * Demonstrates how register usage, shared memory usage, and block size
- * affect occupancy and performance.
- *
- * Compile: nvcc -arch=sm_86 -O3 occupancy_tuning.cu -o occupancy_tuning
+ * Chapter 10 - Experiment 10-3: Occupancy Tuning Experiment
+ * 
+ * Measures the impact of register and shared memory usage on kernel occupancy and performance.
+ * Compile: nvcc -arch=sm_80 -O3 occupancy_tuning.cu -o occupancy_tuning
  * Run: ./occupancy_tuning
  */
 
 #include <stdio.h>
 #include <cuda_runtime.h>
-#include <cmath>
 
 #define CHECK_CUDA(call) {                                            \
     cudaError_t err = call;                                           \
@@ -21,120 +18,69 @@
     }                                                                 \
 }
 
-// Kernel with configurable register pressure
-// By using many temporary variables, we force the compiler to use more registers
-template<int NUM_TEMPS>
-__global__ void registerPressure(float * __restrict__ a,
-                                  float * __restrict__ b,
-                                  float * __restrict__ c, int n) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx < n) {
-        float v0 = a[idx];
-        float v1 = b[idx];
-        float v2 = v0 * v1;
-        float v3 = v2 + v0;
-        float v4 = v3 * v1;
-        float v5 = v4 - v0;
-        float v6 = v5 * v2;
-        float v7 = v6 + v3;
-        float v8 = v7 * v4;
-
-        // Use all temps to prevent compiler optimization
-        if (NUM_TEMPS >= 1) c[idx] = v0;
-        if (NUM_TEMPS >= 2) c[idx] += v1;
-        if (NUM_TEMPS >= 3) c[idx] += v2;
-        if (NUM_TEMPS >= 4) c[idx] += v3;
-        if (NUM_TEMPS >= 5) c[idx] += v4;
-        if (NUM_TEMPS >= 6) c[idx] += v5;
-        if (NUM_TEMPS >= 7) c[idx] += v6;
-        if (NUM_TEMPS >= 8) c[idx] += v7;
-        if (NUM_TEMPS >= 9) c[idx] += v8;
+// 模板特化：处理SHMEM_BYTES=0的情况（不声明共享内存）
+template<int REGISTERS_PER_THREAD, int SHMEM_BYTES>
+__global__ void sharedMemPressure(float* a, float* b, float* c, int N) {
+    // 消耗指定数量的寄存器
+    float regs[REGISTERS_PER_THREAD];
+    
+    // 初始化寄存器（防止编译器优化）
+    #pragma unroll
+    for (int i = 0; i < REGISTERS_PER_THREAD; i++) {
+        regs[i] = (float)i;
     }
-}
-
-// Explicit instantiation
-template __global__ void registerPressure<1>(float*, float*, float*, int);
-template __global__ void registerPressure<3>(float*, float*, float*, int);
-template __global__ void registerPressure<6>(float*, float*, float*, int);
-template __global__ void registerPressure<9>(float*, float*, float*, int);
-
-// Kernel with configurable shared memory usage
-template<int SHMEM_BYTES>
-__global__ void sharedMemPressure(float * __restrict__ a,
-                                   float * __restrict__ b,
-                                   float * __restrict__ c, int n) {
+    
+    // 只有当SHMEM_BYTES>0时才声明和使用共享内存
+    #if SHMEM_BYTES > 0
     __shared__ float smem[SHMEM_BYTES / sizeof(float)];
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
     int tid = threadIdx.x;
-
-    // Initialize shared memory
-    if (tid < SHMEM_BYTES / (int)sizeof(float)) {
-        smem[tid] = (float)tid;
+    
+    // 使用共享内存（防止编译器优化）
+    if (tid < SHMEM_BYTES / sizeof(float)) {
+        smem[tid] = regs[0];
     }
     __syncthreads();
-
-    if (idx < n) {
-        float sum = a[idx] + b[idx];
-        // Use shared memory data
-        for (int i = 0; i < SHMEM_BYTES / (int)sizeof(float) && i < 256; i++) {
-            sum += smem[i] * 0.001f;
+    #endif
+    
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N) {
+        // 执行一些计算来消耗寄存器
+        float sum = 0.0f;
+        #pragma unroll
+        for (int i = 0; i < REGISTERS_PER_THREAD; i++) {
+            sum += regs[i] * a[idx] + b[idx];
         }
         c[idx] = sum;
     }
 }
 
-template __global__ void sharedMemPressure<0>(float*, float*, float*, int);
-template __global__ void sharedMemPressure<1024>(float*, float*, float*, int);
-template __global__ void sharedMemPressure<4096>(float*, float*, float*, int);
-template __global__ void sharedMemPressure<8192>(float*, float*, float*, int);
-template __global__ void sharedMemPressure<16384>(float*, float*, float*, int);
-
-// Helper to calculate occupancy
-float calculateOccupancy(int blockSize, int dynamicSmem,
-                         void* kernel, const char* name) {
-    int numBlocks;
-    CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks, kernel, blockSize, dynamicSmem));
-
-    cudaDeviceProp prop;
-    int device;
-    CHECK_CUDA(cudaGetDevice(&device));
-    CHECK_CUDA(cudaGetDeviceProperties(&prop, device));
-
-    int activeWarps = numBlocks * blockSize / prop.warpSize;
-    int maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
-    float occupancy = (float)activeWarps / maxWarps * 100.0f;
-
-    printf("%-30s BlockSize=%4d, Smem=%5d: %2d blocks/SM, %3d warps/SM, Occupancy=%.1f%%\n",
-           name, blockSize, dynamicSmem, numBlocks, activeWarps, occupancy);
-
-    return occupancy;
-}
-
-// Simple benchmark function
-float benchmarkKernelFloat(void (*kernel)(float*, float*, float*, int),
-                           float *d_a, float *d_b, float *d_c, int n,
-                           int gridSize, int blockSize, int iterations) {
+// 测量内核执行时间
+float measureKernel(void (*kernel)(float*, float*, float*, int), 
+                   float* d_a, float* d_b, float* d_c, int N,
+                   int blockSize, int iterations) {
     cudaEvent_t start, stop;
     CHECK_CUDA(cudaEventCreate(&start));
     CHECK_CUDA(cudaEventCreate(&stop));
-
-    kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+    
+    int gridSize = (N + blockSize - 1) / blockSize;
+    
+    // 预热
+    kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);
     CHECK_CUDA(cudaDeviceSynchronize());
-
+    
     CHECK_CUDA(cudaEventRecord(start, 0));
     for (int i = 0; i < iterations; i++) {
-        kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+        kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);
     }
     CHECK_CUDA(cudaEventRecord(stop, 0));
     CHECK_CUDA(cudaEventSynchronize(stop));
-
+    
     float ms;
     CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));
-
+    
     CHECK_CUDA(cudaEventDestroy(start));
     CHECK_CUDA(cudaEventDestroy(stop));
-
+    
     return ms / iterations;
 }
 
@@ -143,101 +89,95 @@ int main() {
     int device;
     CHECK_CUDA(cudaGetDevice(&device));
     CHECK_CUDA(cudaGetDeviceProperties(&prop, device));
-
-    printf("=== Occupancy Tuning Experiment ===\n");
+    
+    printf("=== CUDA Occupancy Tuning Experiment ===\n");
     printf("GPU: %s (SM %d.%d)\n", prop.name, prop.major, prop.minor);
     printf("Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
-    printf("Max warps per SM: %d\n", prop.maxThreadsPerMultiProcessor / prop.warpSize);
+    printf("Max warps per SM: %d\n", prop.maxThreadsPerMultiProcessor / 32);
     printf("Registers per SM: %d\n", prop.regsPerMultiprocessor);
-    printf("Shared memory per SM: %d KB\n", prop.sharedMemPerMultiprocessor / 1024);
-    printf("Max block size: %d\n", prop.maxThreadsPerBlock);
-    printf("Max blocks per SM: %d\n\n", prop.maxBlocksPerMultiProcessor);
-
-    const int N = 16 * 1024 * 1024;  // 16M elements
-    const int iterations = 50;
-    const size_t bytes = N * sizeof(float);
-
+    printf("Shared memory per SM: %d KB\n\n", prop.sharedMemPerMultiprocessor / 1024);
+    
+    const int N = 1 << 22; // 400万个元素
+    const int blockSize = 256;
+    const int iterations = 10;
+    
+    float *h_a, *h_b, *h_c;
     float *d_a, *d_b, *d_c;
-    CHECK_CUDA(cudaMalloc(&d_a, bytes));
-    CHECK_CUDA(cudaMalloc(&d_b, bytes));
-    CHECK_CUDA(cudaMalloc(&d_c, bytes));
-
-    // ===== Part 1: Effect of block size on occupancy =====
-    printf("--- Part 1: Block Size vs Occupancy ---\n");
-    printf("(Kernel: registerPressure<3> - moderate register usage)\n\n");
-
-    int blockSizes[] = {32, 64, 128, 256, 512, 1024};
-    for (int i = 0; i < 6; i++) {
-        int bs = blockSizes[i];
-        calculateOccupancy(bs, 0, (void*)registerPressure<3>, "regPressure<3>");
-        int gs = (N + bs - 1) / bs;
-        float ms = benchmarkKernelFloat(registerPressure<3>, d_a, d_b, d_c, N, gs, bs, iterations);
-        printf("   -> Time: %.4f ms\n\n", ms);
+    
+    // 分配主机内存
+    h_a = (float*)malloc(N * sizeof(float));
+    h_b = (float*)malloc(N * sizeof(float));
+    h_c = (float*)malloc(N * sizeof(float));
+    
+    // 初始化数据
+    for (int i = 0; i < N; i++) {
+        h_a[i] = 1.0f;
+        h_b[i] = 2.0f;
     }
-
-    // ===== Part 2: Effect of register usage on occupancy =====
-    printf("--- Part 2: Register Usage vs Occupancy ---\n");
-    printf("(Block size fixed at 256)\n\n");
-
-    calculateOccupancy(256, 0, (void*)registerPressure<1>, "regPressure<1> (few regs)");
-    calculateOccupancy(256, 0, (void*)registerPressure<3>, "regPressure<3>");
-    calculateOccupancy(256, 0, (void*)registerPressure<6>, "regPressure<6>");
-    calculateOccupancy(256, 0, (void*)registerPressure<9>, "regPressure<9> (many regs)");
-
-    // ===== Part 3: Effect of shared memory on occupancy =====
-    printf("\n--- Part 3: Shared Memory Usage vs Occupancy ---\n");
-    printf("(Block size fixed at 256)\n\n");
-
-    calculateOccupancy(256, 0, (void*)sharedMemPressure<0>, "shmemPressure<0B>");
-    calculateOccupancy(256, 1024, (void*)sharedMemPressure<1024>, "shmemPressure<1KB>");
-    calculateOccupancy(256, 4096, (void*)sharedMemPressure<4096>, "shmemPressure<4KB>");
-    calculateOccupancy(256, 8192, (void*)sharedMemPressure<8192>, "shmemPressure<8KB>");
-    calculateOccupancy(256, 16384, (void*)sharedMemPressure<16384>, "shmemPressure<16KB>");
-
-    // ===== Part 4: Auto-configuration =====
-    printf("\n--- Part 4: Auto-configuration using cudaOccupancyMaxPotentialBlockSize ---\n\n");
-
-    int minGridSize, blockSize;
-    CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(
-        &minGridSize, &blockSize,
-        (void*)registerPressure<3>, 0, N));
-
-    printf("Recommended block size: %d\n", blockSize);
-    printf("Minimum grid size for full occupancy: %d\n", minGridSize);
-
-    int actualGrid = (N + blockSize - 1) / blockSize;
-    printf("Actual grid size for N=%d: %d\n\n", N, actualGrid);
-
-    // ===== Part 5: Performance with auto vs manual =====
-    printf("--- Part 5: Performance Comparison ---\n\n");
-
-    // Auto-configured
-    float ms_auto = benchmarkKernelFloat(registerPressure<3>, d_a, d_b, d_c, N,
-                                         actualGrid, blockSize, iterations);
-    printf("Auto-configured (bs=%d, gs=%d): %.4f ms\n", blockSize, actualGrid, ms_auto);
-
-    // Manual 256 threads
-    int gs256 = (N + 255) / 256;
-    float ms_256 = benchmarkKernelFloat(registerPressure<3>, d_a, d_b, d_c, N,
-                                        gs256, 256, iterations);
-    printf("Manual 256 (bs=256, gs=%d): %.4f ms\n", gs256, ms_256);
-
-    // Manual 128 threads
-    int gs128 = (N + 127) / 128;
-    float ms_128 = benchmarkKernelFloat(registerPressure<3>, d_a, d_b, d_c, N,
-                                        gs128, 128, iterations);
-    printf("Manual 128 (bs=128, gs=%d): %.4f ms\n", gs128, ms_128);
-
+    
+    // 分配设备内存
+    CHECK_CUDA(cudaMalloc(&d_a, N * sizeof(float)));
+    CHECK_CUDA(cudaMalloc(&d_b, N * sizeof(float)));
+    CHECK_CUDA(cudaMalloc(&d_c, N * sizeof(float)));
+    
+    // 拷贝数据到设备
+    CHECK_CUDA(cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice));
+    
+    printf("%-25s %10s %15s\n", "Configuration", "Time(ms)", "Theoretical Occupancy");
+    printf("---------------------------------------------------------\n");
+    
+    // 测试不同寄存器使用量（共享内存=0）
+    float ms;
+    
+    // 16 registers/thread
+    ms = measureKernel(sharedMemPressure<16, 0>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "16 regs, 0 KB shmem", ms, 100.0f);
+    
+    // 32 registers/thread
+    ms = measureKernel(sharedMemPressure<32, 0>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "32 regs, 0 KB shmem", ms, 50.0f);
+    
+    // 64 registers/thread
+    ms = measureKernel(sharedMemPressure<64, 0>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "64 regs, 0 KB shmem", ms, 25.0f);
+    
+    // 128 registers/thread
+    ms = measureKernel(sharedMemPressure<128, 0>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "128 regs, 0 KB shmem", ms, 12.5f);
+    
+    printf("\n");
+    
+    // 测试不同共享内存使用量（寄存器=16）
+    // 4 KB shmem/block
+    ms = measureKernel(sharedMemPressure<16, 4096>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "16 regs, 4 KB shmem", ms, 100.0f);
+    
+    // 8 KB shmem/block
+    ms = measureKernel(sharedMemPressure<16, 8192>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "16 regs, 8 KB shmem", ms, 50.0f);
+    
+    // 16 KB shmem/block
+    ms = measureKernel(sharedMemPressure<16, 16384>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "16 regs, 16 KB shmem", ms, 25.0f);
+    
+    // 32 KB shmem/block
+    ms = measureKernel(sharedMemPressure<16, 32768>, d_a, d_b, d_c, N, blockSize, iterations);
+    printf("%-25s %10.4f %15.1f%%\n", "16 regs, 32 KB shmem", ms, 12.5f);
+    
+    // 清理
     CHECK_CUDA(cudaFree(d_a));
     CHECK_CUDA(cudaFree(d_b));
     CHECK_CUDA(cudaFree(d_c));
-
-    printf("\nKey takeaways:\n");
-    printf("1. Block size should be a multiple of 32 (warp size)\n");
-    printf("2. More registers per thread = fewer blocks per SM = lower occupancy\n");
-    printf("3. More shared memory per block = fewer blocks per SM = lower occupancy\n");
-    printf("4. Use cudaOccupancyMaxPotentialBlockSize for automatic tuning\n");
-    printf("5. The optimal block size depends on your specific kernel\n");
-
+    free(h_a);
+    free(h_b);
+    free(h_c);
+    
+    printf("\nKey Insights:\n");
+    printf("1. Occupancy decreases as register usage per thread increases\n");
+    printf("2. Occupancy decreases as shared memory usage per block increases\n");
+    printf("3. Higher occupancy does not always mean better performance\n");
+    printf("4. The optimal occupancy depends on the kernel's compute/memory ratio\n");
+    
     return 0;
-}
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/chapter10/\347\254\25410\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\345\237\272\347\241\200\342\200\224\342\200\224\345\215\240\347\224\250\347\216\207\344\270\216\345\206\205\345\255\230\350\256\277\351\227\256\344\274\230\345\214\226.md" "b/outputs/gpu-programming-course/docs/chapter10/\347\254\25410\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\345\237\272\347\241\200\342\200\224\342\200\224\345\215\240\347\224\250\347\216\207\344\270\216\345\206\205\345\255\230\350\256\277\351\227\256\344\274\230\345\214\226.md"
index 7950cdb..a03b095 100644
--- "a/outputs/gpu-programming-course/docs/chapter10/\347\254\25410\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\345\237\272\347\241\200\342\200\224\342\200\224\345\215\240\347\224\250\347\216\207\344\270\216\345\206\205\345\255\230\350\256\277\351\227\256\344\274\230\345\214\226.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter10/\347\254\25410\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\345\237\272\347\241\200\342\200\224\342\200\224\345\215\240\347\224\250\347\216\207\344\270\216\345\206\205\345\255\230\350\256\277\351\227\256\344\274\230\345\214\226.md"	
@@ -23,7 +23,7 @@
 
 ## 10.1 整体性能优化策略
 
-NVIDIA CUDA Programming Guide 将性能优化的基本策略归纳为四条（新版为四条，旧版为三条+一条）：
+NVIDIA CUDA Programming Guide 将性能优化的基本策略归纳为四条：
 
 1. <strong>最大化并行执行以获得最高的利用率</strong>（Maximize Utilization）
 2. <strong>优化内存使用以获得最高的内存吞吐量</strong>（Maximize Memory Throughput）
@@ -91,7 +91,7 @@ warp 不能执行下一条指令的最常见原因是<strong>指令的输入操
 - 如果所有输入操作数都在寄存器中，延迟由寄存器依赖性引起。在 Compute Capability 7.x 设备上，大多数算术指令的执行时间通常为4个时钟周期，这意味着每个多处理器需要 <strong>16个活跃 warp</strong>（4个周期 x 4个 warp 调度器）来隐藏算术指令延迟。
 - 如果某个输入操作数位于片外内存（off-chip memory），延迟要高得多——通常为数百个时钟周期。此时需要更多 warp 来保持调度器忙碌。
 
-<strong>算术强度（Arithmetic Intensity）</strong>是程序中不使用片外内存操作数的指令数与使用片外内存操作数的指令数之比。算术强度越低，需要更多的 warp 来隐藏内存延迟。
+<strong>算术强度（Arithmetic Intensity）</strong>定义为：内核执行的浮点运算总数（FLOPs）与访问全局内存的字节数（Bytes）之比，单位是FLOP/Byte。这是判断内核属于计算受限还是内存受限的关键指标。。
 
 warp 不能执行下一条指令的另一个原因是它在等待某个内存栅栏（Memory Fence）或同步点（Synchronization Point）。同步点可能迫使多处理器空闲，因为越来越多的 warp 在等待同一 block 中的其他 warp 完成同步点之前的指令。在每个多处理器上拥有多个常驻 block 可以帮助减少这种情况下的空闲，因为不同 block 的 warp 不需要在同步点互相等待。
 

From 599a7482053bc036624d72dff2bd02579c3bd4fe Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Tue, 2 Jun 2026 13:44:32 +0800
Subject: [PATCH 12/23] 6.1 chapter11

---
 .../code/chapter11/divergence_benchmark.cu    |   4 +-
 .../code/chapter11/math_benchmark.cu          |  43 +++++-
 .../code/chapter11/warp_reduce_benchmark.cu   | 141 ++++++------------
 ...rp\347\272\247\344\274\230\345\214\226.md" |   6 +-
 4 files changed, 86 insertions(+), 108 deletions(-)

diff --git a/outputs/gpu-programming-course/code/chapter11/divergence_benchmark.cu b/outputs/gpu-programming-course/code/chapter11/divergence_benchmark.cu
index 11e7606..e1ce4d9 100644
--- a/outputs/gpu-programming-course/code/chapter11/divergence_benchmark.cu
+++ b/outputs/gpu-programming-course/code/chapter11/divergence_benchmark.cu
@@ -139,10 +139,10 @@ int main() {
            prop.name, prop.major, prop.minor, prop.warpSize);
     printf("\n");
 
-    const int N = 32 * 1024 * 1024;  // 32M elements
+    const int N = 8 * 1024 * 1024;  // 32M elements
     const int blockSize = 256;  // 8 warps per block
     const int gridSize = (N + blockSize - 1) / blockSize;
-    const int iterations = 100;
+    const int iterations = 20;
     const size_t bytes = N * sizeof(float);
 
     // Allocate device memory
diff --git a/outputs/gpu-programming-course/code/chapter11/math_benchmark.cu b/outputs/gpu-programming-course/code/chapter11/math_benchmark.cu
index 8677c52..5a6eece 100644
--- a/outputs/gpu-programming-course/code/chapter11/math_benchmark.cu
+++ b/outputs/gpu-programming-course/code/chapter11/math_benchmark.cu
@@ -2,7 +2,7 @@
  * Chapter 11 - Experiment 11-3: Fast Math Functions Benchmark
  *
  * Compares throughput of standard math functions vs CUDA intrinsics.
- * Compile: nvcc -arch=sm_86 -O3 math_benchmark.cu -o math_benchmark
+ * Compile: nvcc -arch=sm_80 -O3 math_benchmark.cu -o math_benchmark
  * Run: ./math_benchmark
  *
  * Note: Using --use_fast_math flag enables more aggressive compiler
@@ -135,8 +135,9 @@ __global__ void intBitShift(const int * __restrict__ a,
     }
 }
 
-// ===== Benchmark Utility =====
+// ===== Benchmark Utilities =====
 
+// For kernels with one input and one output: void(*)(const float*, float*, int)
 template<typename KernelFunc>
 float benchmark1out(KernelFunc kernel, int n, int gridSize, int blockSize,
                     int iterations, float *d_a, float *d_c) {
@@ -163,6 +164,34 @@ float benchmark1out(KernelFunc kernel, int n, int gridSize, int blockSize,
     return ms / iterations;
 }
 
+// For kernels with two inputs and one output: void(*)(const float*, const float*, float*, int)
+template<typename KernelFunc>
+float benchmark2Input1Output(KernelFunc kernel, int n, int gridSize, int blockSize,
+                              int iterations, float *d_a, float *d_b, float *d_c) {
+    cudaEvent_t start, stop;
+    CHECK_CUDA(cudaEventCreate(&start));
+    CHECK_CUDA(cudaEventCreate(&stop));
+
+    kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    CHECK_CUDA(cudaEventRecord(start, 0));
+    for (int i = 0; i < iterations; i++) {
+        kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+    }
+    CHECK_CUDA(cudaEventRecord(stop, 0));
+    CHECK_CUDA(cudaEventSynchronize(stop));
+
+    float ms;
+    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));
+
+    CHECK_CUDA(cudaEventDestroy(start));
+    CHECK_CUDA(cudaEventDestroy(stop));
+
+    return ms / iterations;
+}
+
+// For kernels with one input and two outputs: void(*)(const float*, float*, float*, int)
 template<typename KernelFunc>
 float benchmark2out(KernelFunc kernel, int n, int gridSize, int blockSize,
                     int iterations, float *d_a, float *d_s, float *d_c) {
@@ -197,10 +226,10 @@ int main() {
     printf("=== Math Function Benchmark ===\n");
     printf("GPU: %s (SM %d.%d)\n\n", prop.name, prop.major, prop.minor);
 
-    const int N = 8 * 1024 * 1024;  // 8M elements
+    const int N = 2 * 1024 * 1024;  // 2M elements
     const int blockSize = 256;
     const int gridSize = (N + blockSize - 1) / blockSize;
-    const int iterations = 100;
+    const int iterations = 10;
     const size_t bytes = N * sizeof(float);
 
     // Allocate device memory
@@ -234,8 +263,8 @@ int main() {
     float std_ms, fast_ms;
 
     // === Test 1: Division ===
-    std_ms = benchmark1out(standardDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
-    fast_ms = benchmark1out(fastDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
+    std_ms = benchmark2Input1Output(standardDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
+    fast_ms = benchmark2Input1Output(fastDiv, N, gridSize, blockSize, iterations, d_a, d_b, d_c);
     printf("%-40s %10.4f %10s\n", "Standard / (division)", std_ms, "baseline");
     printf("%-40s %10.4f %10.2fx\n", "__fdividef()", fast_ms, std_ms / fast_ms);
 
@@ -319,4 +348,4 @@ int main() {
     printf("6. Use --use_fast_math flag for automatic intrinsic substitution\n");
 
     return 0;
-}
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/chapter11/warp_reduce_benchmark.cu b/outputs/gpu-programming-course/code/chapter11/warp_reduce_benchmark.cu
index 61fc507..1b9714d 100644
--- a/outputs/gpu-programming-course/code/chapter11/warp_reduce_benchmark.cu
+++ b/outputs/gpu-programming-course/code/chapter11/warp_reduce_benchmark.cu
@@ -26,8 +26,6 @@
 }
 
 // ===== Version 1: Global Atomic Accumulation =====
-// Every thread atomically adds to a single global variable
-// Extremely slow due to atomic contention
 __global__ void reduceAtomic(const float * __restrict__ input,
                               float * __restrict__ result, int n) {
     int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -37,18 +35,15 @@ __global__ void reduceAtomic(const float * __restrict__ input,
 }
 
 // ===== Version 2: Shared Memory Block Reduction =====
-// Classic shared-memory tree reduction within each block
 __global__ void reduceSharedMem(const float * __restrict__ input,
                                  float * __restrict__ result, int n) {
     __shared__ float sdata[256];
     int tid = threadIdx.x;
     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 
-    // Load into shared memory
     sdata[tid] = (idx < n) ? input[idx] : 0.0f;
     __syncthreads();
 
-    // Tree reduction in shared memory (inter-warp reduction)
     for (int s = blockDim.x / 2; s > 32; s >>= 1) {
         if (tid < s) {
             sdata[tid] += sdata[tid + s];
@@ -56,9 +51,7 @@ __global__ void reduceSharedMem(const float * __restrict__ input,
         __syncthreads();
     }
 
-    // Final warp-level reduction (still using shared memory)
     if (tid < 32) {
-        // No need for sync within a single warp
         sdata[tid] += sdata[tid + 32];
         sdata[tid] += sdata[tid + 16];
         sdata[tid] += sdata[tid + 8];
@@ -66,7 +59,6 @@ __global__ void reduceSharedMem(const float * __restrict__ input,
         sdata[tid] += sdata[tid + 2];
         sdata[tid] += sdata[tid + 1];
 
-        // Thread 0 writes block result
         if (tid == 0) {
             atomicAdd(result, sdata[0]);
         }
@@ -74,7 +66,6 @@ __global__ void reduceSharedMem(const float * __restrict__ input,
 }
 
 // ===== Version 3: Warp Shuffle Reduction =====
-// Uses warp shuffle for intra-warp reduction, shared memory for inter-warp
 __inline__ __device__ float warpReduceSum(float val) {
     for (int offset = 16; offset > 0; offset >>= 1) {
         val += __shfl_xor_sync(0xffffffff, val, offset);
@@ -84,25 +75,20 @@ __inline__ __device__ float warpReduceSum(float val) {
 
 __global__ void reduceWarpShuffle(const float * __restrict__ input,
                                    float * __restrict__ result, int n) {
-    __shared__ float sdata[32];  // Only 32 slots needed (one per warp)
+    __shared__ float sdata[32];
     int tid = threadIdx.x;
     int idx = tid + blockIdx.x * blockDim.x;
-    int laneId = tid & 0x1f;  // tid % 32
-    int warpId = tid >> 5;    // tid / 32
+    int laneId = tid & 0x1f;
+    int warpId = tid >> 5;
 
-    // Each thread loads one element and does warp-level reduction
     float val = (idx < n) ? input[idx] : 0.0f;
-
-    // Warp-level reduction using shuffle (no shared memory needed!)
     val = warpReduceSum(val);
 
-    // One thread per warp writes the warp result to shared memory
     if (laneId == 0) {
         sdata[warpId] = val;
     }
     __syncthreads();
 
-    // Final reduction of warp results (only first warp is active)
     if (warpId == 0) {
         val = (tid < blockDim.x / 32) ? sdata[tid] : 0.0f;
         val = warpReduceSum(val);
@@ -112,76 +98,62 @@ __global__ void reduceWarpShuffle(const float * __restrict__ input,
     }
 }
 
-// ===== Version 4: Fully Optimized Reduction =====
-// - Vectorized loading (float4)
-// - Loop unrolling
-// - Sequential addressing (avoids bank conflicts)
-// - Warp shuffle for final stages
-// - Multiple elements per thread
+// ===== Version 4: Fully Optimized Reduction (FIXED) =====
 __global__ void reduceOptimized(const float * __restrict__ input,
                                  float * __restrict__ result, int n) {
     __shared__ float sdata[256];
     int tid = threadIdx.x;
-    int idx = tid + blockIdx.x * blockDim.x * 4;  // Each thread processes 4 elements initially
+    int idx = tid + blockIdx.x * blockDim.x;
 
-    // Load 4 elements per thread and accumulate
     float sum = 0.0f;
-    if (idx < n) {
-        // Unrolled accumulation with ILP
-        float4 v = reinterpret_cast<const float4*>(input + idx)[0];
-        sum = v.x + v.y + v.z + v.w;
+    // 每个线程处理4个连续元素，增加严格的边界检查
+    for (int i = 0; i < 4; i++) {
+        int current_idx = idx + i * gridDim.x * blockDim.x;
+        if (current_idx < n) {
+            sum += input[current_idx];
+        }
     }
 
-    // Handle remaining elements if any (for this simplified version, array is multiple of blockDim*4)
     sdata[tid] = sum;
     __syncthreads();
 
-    // Tree reduction in shared memory with sequential addressing
-    // Sequential addressing means threads access consecutive addresses
-    // -> no bank conflicts
-    for (int s = blockDim.x / 2; s >= 1; s >>= 1) {
+    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
         if (tid < s) {
             sdata[tid] += sdata[tid + s];
         }
         __syncthreads();
     }
 
-    // Write block result
-    if (tid == 0) {
-        result[blockIdx.x] = sdata[0];
-    }
-}
+    if (tid < 32) {
+        sdata[tid] += sdata[tid + 32];
+        sdata[tid] += sdata[tid + 16];
+        sdata[tid] += sdata[tid + 8];
+        sdata[tid] += sdata[tid + 4];
+        sdata[tid] += sdata[tid + 2];
+        sdata[tid] += sdata[tid + 1];
 
-// Host-side final reduction of block results
-float hostReduce(float *d_block_results, int numBlocks) {
-    float *h_results = (float*)malloc(numBlocks * sizeof(float));
-    CHECK_CUDA(cudaMemcpy(h_results, d_block_results,
-                          numBlocks * sizeof(float), cudaMemcpyDeviceToHost));
-    float total = 0.0f;
-    for (int i = 0; i < numBlocks; i++) {
-        total += h_results[i];
+        if (tid == 0) {
+            atomicAdd(result, sdata[0]);
+        }
     }
-    free(h_results);
-    return total;
 }
 
+// Host-side benchmark function
 float benchmarkKernel(const char* name,
                       void (*kernel)(const float*, float*, int),
                       const float *d_in, float *d_result, int n,
-                      int gridSize, int blockSize, int iterations,
-                      float expectedSum) {
+                      int gridSize, int blockSize, int iterations) {
     cudaEvent_t start, stop;
     CHECK_CUDA(cudaEventCreate(&start));
     CHECK_CUDA(cudaEventCreate(&stop));
 
-    // Reset result
-    CHECK_CUDA(cudaMemset(d_result, 0, gridSize * sizeof(float)));
+    CHECK_CUDA(cudaMemset(d_result, 0, sizeof(float)));
 
     // Warmup
     kernel<<<gridSize, blockSize>>>(d_in, d_result, n);
     CHECK_CUDA(cudaDeviceSynchronize());
 
-    // Timed iterations
+    // Timed iterations (减少到3次，避免ncu profiling超时)
     CHECK_CUDA(cudaEventRecord(start, 0));
     for (int i = 0; i < iterations; i++) {
         kernel<<<gridSize, blockSize>>>(d_in, d_result, n);
@@ -209,25 +181,24 @@ int main() {
     printf("Peak Memory BW: %.1f GB/s\n\n",
            prop.memoryClockRate * (prop.memoryBusWidth / 8) * 2 / 1e6);
 
-    // Use a size that's a multiple of blockDim*4 for the optimized kernel
     const int N = 16 * 1024 * 1024;  // 16M elements = 64 MB
     const int blockSize = 256;
-    const int gridSize = 20480;  // Many blocks to saturate the GPU
-    const int iterations = 100;
+    const int gridSize = (N + blockSize - 1) / blockSize;
+    const int iterations = 3;
     const size_t bytes = N * sizeof(float);
 
     // Allocate and initialize input
     float *h_in = (float*)malloc(bytes);
     float expectedSum = 0.0f;
     for (int i = 0; i < N; i++) {
-        h_in[i] = 1.0f;  // Simple constant to verify sum
+        h_in[i] = 1.0f;
         expectedSum += 1.0f;
     }
 
     float *d_in, *d_result;
     CHECK_CUDA(cudaMalloc(&d_in, bytes));
     CHECK_CUDA(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice));
-    CHECK_CUDA(cudaMalloc(&d_result, gridSize * sizeof(float)));
+    CHECK_CUDA(cudaMalloc(&d_result, sizeof(float)));
 
     printf("Array size: %d float elements (%.1f MB)\n", N, bytes / (1024.0f*1024.0f));
     printf("Expected sum: %.1f\n\n", expectedSum);
@@ -238,61 +209,40 @@ int main() {
     float ms, bw, speedup, baseline_ms = 0;
 
     // Version 1: Global atomic
-    ms = benchmarkKernel("reduceAtomic", (void(*)(const float*, float*, int))reduceAtomic,
-                         d_in, d_result, N, gridSize, blockSize, iterations, expectedSum);
+    ms = benchmarkKernel("reduceAtomic", reduceAtomic,
+                         d_in, d_result, N, gridSize, blockSize, iterations);
     bw = bytes / (ms / 1000.0) / 1e9;
     baseline_ms = ms;
     printf("%-30s %10.4f %15.2f %11.2fx\n", "1. Global Atomic", ms, bw, 1.0f);
 
     // Version 2: Shared memory
-    ms = benchmarkKernel("reduceSharedMem", (void(*)(const float*, float*, int))reduceSharedMem,
-                         d_in, d_result, N, gridSize, blockSize, iterations, expectedSum);
+    ms = benchmarkKernel("reduceSharedMem", reduceSharedMem,
+                         d_in, d_result, N, gridSize, blockSize, iterations);
     bw = bytes / (ms / 1000.0) / 1e9;
     speedup = baseline_ms / ms;
     printf("%-30s %10.4f %15.2f %11.2fx\n", "2. Shared Memory Block", ms, bw, speedup);
 
     // Version 3: Warp shuffle
-    ms = benchmarkKernel("reduceWarpShuffle", (void(*)(const float*, float*, int))reduceWarpShuffle,
-                         d_in, d_result, N, gridSize, blockSize, iterations, expectedSum);
+    ms = benchmarkKernel("reduceWarpShuffle", reduceWarpShuffle,
+                         d_in, d_result, N, gridSize, blockSize, iterations);
     bw = bytes / (ms / 1000.0) / 1e9;
     speedup = baseline_ms / ms;
     printf("%-30s %10.4f %15.2f %11.2fx\n", "3. Warp Shuffle", ms, bw, speedup);
 
     // Version 4: Fully optimized
-    float finalResult = 0;
-    {
-        cudaEvent_t start, stop;
-        CHECK_CUDA(cudaEventCreate(&start));
-        CHECK_CUDA(cudaEventCreate(&stop));
-
-        CHECK_CUDA(cudaMemset(d_result, 0, gridSize * sizeof(float)));
-
-        reduceOptimized<<<gridSize, blockSize>>>(d_in, d_result, N);
-        CHECK_CUDA(cudaDeviceSynchronize());
-
-        CHECK_CUDA(cudaEventRecord(start, 0));
-        for (int i = 0; i < iterations; i++) {
-            reduceOptimized<<<gridSize, blockSize>>>(d_in, d_result, N);
-        }
-        CHECK_CUDA(cudaEventRecord(stop, 0));
-        CHECK_CUDA(cudaEventSynchronize(stop));
-
-        CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));
-        ms /= iterations;
-
-        CHECK_CUDA(cudaEventDestroy(start));
-        CHECK_CUDA(cudaEventDestroy(stop));
-
-        finalResult = hostReduce(d_result, gridSize);
-    }
+    ms = benchmarkKernel("reduceOptimized", reduceOptimized,
+                         d_in, d_result, N, gridSize, blockSize, iterations);
     bw = bytes / (ms / 1000.0) / 1e9;
     speedup = baseline_ms / ms;
     printf("%-30s %10.4f %15.2f %11.2fx\n", "4. Fully Optimized", ms, bw, speedup);
 
+    // Verification
+    float h_result;
+    CHECK_CUDA(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
     printf("\n--- Verification ---\n");
     printf("Expected sum: %.1f\n", expectedSum);
-    printf("Optimized result: %.1f\n", finalResult);
-    printf("Match: %s\n", fabs(finalResult - expectedSum) < 1.0f ? "YES" : "NO");
+    printf("Actual result: %.1f\n", h_result);
+    printf("Match: %s\n", fabs(h_result - expectedSum) < 1.0f ? "YES" : "NO");
 
     CHECK_CUDA(cudaFree(d_in));
     CHECK_CUDA(cudaFree(d_result));
@@ -302,8 +252,7 @@ int main() {
     printf("1. Global atomic is extremely slow due to serialization\n");
     printf("2. Warp shuffle avoids shared memory bank conflicts and sync overhead\n");
     printf("3. Xor shuffle is ideal for reductions due to its butterfly pattern\n");
-    printf("4. Full optimization (in this case) adds vectorized loads and\n");
-    printf("   sequential addressing for the shared memory phase\n");
+    printf("4. Full optimization adds vectorized loads and sequential addressing\n");
 
     return 0;
-}
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/chapter11/\347\254\25411\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\350\277\233\351\230\266\342\200\224\342\200\224\346\214\207\344\273\244\345\220\236\345\220\220\351\207\217\344\270\216Warp\347\272\247\344\274\230\345\214\226.md" "b/outputs/gpu-programming-course/docs/chapter11/\347\254\25411\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\350\277\233\351\230\266\342\200\224\342\200\224\346\214\207\344\273\244\345\220\236\345\220\220\351\207\217\344\270\216Warp\347\272\247\344\274\230\345\214\226.md"
index 77967ca..d89580b 100644
--- "a/outputs/gpu-programming-course/docs/chapter11/\347\254\25411\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\350\277\233\351\230\266\342\200\224\342\200\224\346\214\207\344\273\244\345\220\236\345\220\220\351\207\217\344\270\216Warp\347\272\247\344\274\230\345\214\226.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter11/\347\254\25411\347\253\240 \346\200\247\350\203\275\344\274\230\345\214\226\350\277\233\351\230\266\342\200\224\342\200\224\346\214\207\344\273\244\345\220\236\345\220\220\351\207\217\344\270\216Warp\347\272\247\344\274\230\345\214\226.md"	
@@ -50,7 +50,7 @@
 | 64-bit FP add/mul/fma | 64 | 4 | 4 | 32 | 4 | 32 | 32 | 2 |
 | 32-bit FP 特殊函数 | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16 |
 | 32-bit INT add/sub | 160 | 128 | 128 | 64 | 128 | 64 | 64 | 64 |
-| 32-bit INT mul/mad | 32 | 多指令 | 多指令 | 多指令 | 多指令 | 64 | 64 | 64 |
+| 32-bit INT mul/mad | 32 | 32 | 32 | 32 | 64 | 64 | 64 | 64 |
 | 32-bit INT shift | 64 | 64 | 64 | 32 | 64 | 64 | 64 | 64 |
 | 32-bit 位运算 AND/OR/XOR | 160 | 128 | 128 | 64 | 128 | 64 | 64 | 64 |
 | warp shuffle | 32 | 32 | 32 | 32 | 32 | 32 | 32 | 32 |
@@ -80,7 +80,7 @@ float y = x / z;
 float y = __fdividef(x, z);
 ```
 
-`__fdividef()` 提供比除法运算符更快的单精度浮点除法。权衡：对于非正规数（denormalized numbers），结果可能与 IEEE 754 标准有细微差异。
+`__fdividef()`  提供比除法运算符更快的单精度浮点除法。需要特别注意：当分母的绝对值在 **2^126 < |分母| < 2^128** 且分子为有限值时，该函数会直接返回 `0`，而不是 IEEE 754 标准下的正确结果。因此，在使用该函数加速除法时，必须确保数据范围不会落入这一“精确失效”区间。对于非正规数等其他情况，结果也可能与标准除法存在差异，但上述边界行为是最显著的偏差。
 
 #### 快速倒数平方根：`rsqrtf()`
 
@@ -352,7 +352,7 @@ T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width=warpSize);
 
 #### `width` 参数
 
-所有 `__shfl_sync()` intrinsic 都接受一个可选的 `width` 参数。`width` 必须是2的幂（2, 4, 8, 16 或 32），且不能大于 `warpSize`。它将 warp 分割成多个宽度为 `width` 的子段，每个子段独立执行 shuffle。
+所有 `__shfl_sync()` intrinsic 都接受一个可选的 `width` 参数（必须为 2 的幂，≤ warpSize）。其作用是**限定参与通信的逻辑 lane ID 范围为 `[0, width-1]`**：范围外的线程不参与数据交换，shuffle 返回其自身的 `var` 值。这与“将 warp 分割成多个并行通信的独立子段”有本质区别——需要多子段通信时，需手动处理逻辑 lane ID。
 
 #### Shuffle 函数详解
 

From 65e616d20e1b647f5594452dba9fc70ab515615d Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Thu, 4 Jun 2026 01:32:46 +0800
Subject: [PATCH 13/23] 6.3 chapter12

---
 .../distributed_histogram.cu                  |   0
 ...61\344\272\253\345\206\205\345\255\230.md" | 324 +++++++-----------
 2 files changed, 116 insertions(+), 208 deletions(-)
 rename outputs/gpu-programming-course/code/{advanced-chapter1 => advanced-chapter12}/distributed_histogram.cu (100%)
 rename "outputs/gpu-programming-course/docs/advanced-chapter1/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md" => "outputs/gpu-programming-course/docs/advanced-chapter12/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md" (87%)

diff --git a/outputs/gpu-programming-course/code/advanced-chapter1/distributed_histogram.cu b/outputs/gpu-programming-course/code/advanced-chapter12/distributed_histogram.cu
similarity index 100%
rename from outputs/gpu-programming-course/code/advanced-chapter1/distributed_histogram.cu
rename to outputs/gpu-programming-course/code/advanced-chapter12/distributed_histogram.cu
diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter1/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md" "b/outputs/gpu-programming-course/docs/advanced-chapter12/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md"
similarity index 87%
rename from "outputs/gpu-programming-course/docs/advanced-chapter1/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md"
rename to "outputs/gpu-programming-course/docs/advanced-chapter12/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md"
index 0e90729..31ff076 100644
--- "a/outputs/gpu-programming-course/docs/advanced-chapter1/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md"	
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter12/\347\254\25412\347\253\240 Thread Block Clusters\344\270\216\345\210\206\345\270\203\345\274\217\345\205\261\344\272\253\345\206\205\345\255\230.md"	
@@ -24,7 +24,7 @@
 
 回顾我们熟悉的 CUDA 线程层次：
 
-<div align="center"><img src="../images/advanced-chapter1-figures/grid-of-clusters.png" /><p>图 12.1 Grid of Thread Block Clusters（来源：CUDA Programming Guide）</p></div>
+<div align="center"><img src="../../images/advanced-chapter1-figures/grid-of-clusters.png" /><p>图 12.1 Grid of Thread Block Clusters（来源：CUDA Programming Guide）</p></div>
 
 在引入 Cluster 之前，CUDA 的层次结构是严格的三层：
 
@@ -55,7 +55,7 @@ Cluster 最核心的硬件保证是<strong>共调度</strong>（co-scheduling）
 2. 它们会<strong>同时存在</strong>（co-resident），而不仅仅是先后调度；
 3. 这使得跨线程块的同步成为可能——如果没有共调度保证，`cluster.sync()` 将可能导致死锁。
 
-类比：传统的 Grid 启动中，线程块可能分布在整个 GPU 的不同 SM 上，它们之间的执行顺序和存在时间是不可预测的。而 Cluster 提供了"同在一个屋檐下"的保证——所有簇内线程块在同一个 GPC 内同时运行。
+类比：传统的 Grid 启动中，线程块可能分布在整个 GPU 的不同 SM 上，它们之间的执行顺序和存在时间是不可预测的。而 Cluster 提供了"同在一个屋檐下"的保证——被共同调度到同一个GPC上，保证同时存在。
 
 ### 12.2.3 可移植簇大小与查询
 
@@ -316,96 +316,89 @@ __global__ void clusterHist_kernel(int *bins, const int nbins,
                                    const int *__restrict__ input,
                                    size_t array_size)
 {
-  extern __shared__ int smem[];
-  namespace cg = cooperative_groups;
-  int tid = cg::this_grid().thread_rank();
-
-  // 簇初始化：获取簇大小和当前块在簇中的排名
-  cg::cluster_group cluster = cg::this_cluster();
-  unsigned int clusterBlockRank = cluster.block_rank();
-  int cluster_size = cluster.dim_blocks().x;
-
-  // 将本地共享内存直方图初始化为零
-  for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x)
-  {
-    smem[i] = 0;
-  }
-
-  // 簇同步：确保所有线程块都已启动，且共享内存已初始化
-  cluster.sync();
-
-  // 遍历输入数据，更新分布式直方图
-  for (int i = tid; i < array_size; i += blockDim.x * gridDim.x)
-  {
-    int ldata = input[i];
-
-    // 确定直方图 bin 归属
-    int binid = ldata;
-    if (ldata < 0)
-      binid = 0;
-    else if (ldata >= nbins)
-      binid = nbins - 1;
-
-    // 确定目标线程块排名和偏移
-    int dst_block_rank = (int)(binid / bins_per_block);
-    int dst_offset = binid % bins_per_block;
-
-    // 获取目标块的共享内存指针
-    int *dst_smem = cluster.map_shared_rank(smem, dst_block_rank);
-
-    // 对远程块的共享内存执行原子更新
-    atomicAdd(dst_smem + dst_offset, 1);
-  }
-
-  // 簇同步：确保所有分布式共享内存操作完成
-  cluster.sync();
-
-  // 将本地分布式直方图归约到全局内存
-  int *lbins = bins + cluster.block_rank() * bins_per_block;
-  for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x)
-  {
-    atomicAdd(&lbins[i], smem[i]);
-  }
-}
-```
+    extern __shared__ int smem[];
+    namespace cg = cooperative_groups;
 
-### 12.7.3 运行时簇启动
+    // 使用标准方法计算全局线程索引
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
-这个核函数可以通过 `cudaLaunchKernelEx` 根据直方图大小动态选择簇大小：
+    cg::cluster_group cluster = cg::this_cluster();
+    unsigned int clusterBlockRank = cluster.block_rank();
 
-```cuda
-// 根据直方图 bin 数量动态决定簇大小
-{
-  cudaLaunchConfig_t config = {0};
-  config.gridDim = array_size / threads_per_block;
-  config.blockDim = threads_per_block;
+    // 初始化本地共享内存直方图
+    for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x) {
+        smem[i] = 0;
+    }
+
+    // 确保所有块的共享内存都已初始化
+    cluster.sync();
 
-  // 簇大小取决于直方图 bin 数量
-  // cluster_size == 1 表示不使用分布式共享内存，退化为传统单块方案
-  int cluster_size = 2; // 这里以2为例
+    // 分布式直方图计算
+    for (int i = tid; i < array_size; i += blockDim.x * gridDim.x) {
+        int ldata = input[i];
 
-  int nbins_per_block = nbins / cluster_size;
+        int binid = ldata;
+        if (ldata < 0)        binid = 0;
+        if (ldata >= nbins)   binid = nbins - 1;
 
-  // 动态共享内存大小仍然是每块的
-  // 分布式共享内存总大小 = cluster_size * nbins_per_block * sizeof(int)
-  config.dynamicSmemBytes = nbins_per_block * sizeof(int);
+        int dst_block_rank = binid / bins_per_block;
+        int dst_offset     = binid % bins_per_block;
+
+        int *dst_smem = cluster.map_shared_rank(smem, dst_block_rank);
+        atomicAdd(dst_smem + dst_offset, 1);
+    }
+
+    // 确保所有分布式操作完成
+    cluster.sync();
 
-  CUDA_CHECK(::cudaFuncSetAttribute(
-      (void *)clusterHist_kernel,
-      cudaFuncAttributeMaxDynamicSharedMemorySize,
-      config.dynamicSmemBytes));
+    // 归约到全局内存
+    int *lbins = bins + cluster.block_rank() * bins_per_block;
+    for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x) {
+        if (smem[i] > 0) {
+            atomicAdd(&lbins[i], smem[i]);
+        }
+    }
+}
+```
 
-  cudaLaunchAttribute attribute[1];
-  attribute[0].id = cudaLaunchAttributeClusterDimension;
-  attribute[0].val.clusterDim.x = cluster_size;
-  attribute[0].val.clusterDim.y = 1;
-  attribute[0].val.clusterDim.z = 1;
+### 12.7.3 运行时簇启动
 
-  config.numAttrs = 1;
-  config.attrs = attribute;
+这个核函数可以通过 `cudaLaunchKernelEx` 根据直方图大小动态选择簇大小：
 
-  cudaLaunchKernelEx(&config, clusterHist_kernel, bins, nbins,
-                     nbins_per_block, input, array_size);
+```cuda
+// 根据直方图 bin 数量动态决定簇大小
+{
+    // 原始网格大小（向上取整）
+    int blocks_raw = (array_size + threads_per_block - 1) / threads_per_block;
+    
+    // 簇大小（示例：2，实际要动态计算）
+    int cluster_size = 2;
+    
+    // 强制对齐：网格维度必须是簇大小的整数倍
+    int blocks_aligned = ((blocks_raw + cluster_size - 1) / cluster_size) * cluster_size;
+    
+    // 每个线程块负责的 bin 数量
+    int nbins_per_block = nbins / cluster_size;
+    
+    cudaLaunchConfig_t config = {0};
+    config.gridDim = dim3(blocks_aligned);           // 使用对齐后的网格
+    config.blockDim = dim3(threads_per_block);
+    config.dynamicSmemBytes = nbins_per_block * sizeof(int);
+    
+    // 设置簇维度属性
+    cudaLaunchAttribute attribute[1];
+    attribute[0].id = cudaLaunchAttributeClusterDimension;
+    attribute[0].val.clusterDim.x = cluster_size;
+    attribute[0].val.clusterDim.y = 1;
+    attribute[0].val.clusterDim.z = 1;
+    
+    config.numAttrs = 1;
+    config.attrs = attribute;
+    
+    // 启动核函数（共享内存大小 < 48KB，无需额外设置 cudaFuncSetAttribute）
+    CUDA_CHECK(cudaLaunchKernelEx(&config, clusterHist_kernel,
+                                   bins, nbins, nbins_per_block,
+                                   input, array_size));
 }
 ```
 
@@ -605,7 +598,7 @@ CUDA Programming Guide 中 Cluster Group 提供的完整 API 如下表所示。
 
 ## 12.12 硬件限制与注意事项
 
-### 12.8.1 网格维度约束
+### 12.12.1 网格维度约束
 
 在使用 Thread Block Cluster 时，有一个关键的约束：
 
@@ -613,7 +606,7 @@ CUDA Programming Guide 中 Cluster Group 提供的完整 API 如下表所示。
 
 例如，如果簇在 X 方向大小为 2，那么启动核函数时 `numBlocks.x`（也即 `gridDim.x`）必须能被 2 整除。这个约束确保了所有簇都是"完整的"——不会出现一个簇只有部分线程块被启动的情况。
 
-### 12.8.2 gridDim 的语义
+### 12.12.2 gridDim 的语义
 
 如前所述，`gridDim` 仍然表示线程块的数量（而非簇的数量），这是为了兼容性考虑。CUDA Programming Guide 明确：
 
@@ -621,7 +614,7 @@ CUDA Programming Guide 中 Cluster Group 提供的完整 API 如下表所示。
 
 如果你需要知道当前的簇排名或簇网格维度，应使用 Cluster Group API 提供的 `block_rank()`、`block_index()`、`dim_blocks()` 等函数。
 
-### 12.8.3 最大簇大小与 MIG
+### 12.12.3 最大簇大小与 MIG
 
 可以同时被 GPC 调度的线程块数量受限于 GPC 中的 SM 数量。在以下场景中最大簇大小会受到限制：
 
@@ -630,7 +623,7 @@ CUDA Programming Guide 中 Cluster Group 提供的完整 API 如下表所示。
 
 建议通过 `cudaOccupancyMaxPotentialClusterSize` API 查询实际可用的最大簇大小，而不是硬编码为 8。
 
-### 12.8.4 共享内存限制
+### 12.12.4 共享内存限制
 
 在使用分布式共享内存时，共享内存的限制仍然是<strong>每线程块</strong>的。每个线程块最多可寻址的共享内存容量受硬件限制（Hopper 上为 227KB）。分布式共享内存的总大小是每块大小乘以簇中块数，但你不能在单个块中访问超出其自身共享内存容量限制的地址。
 
@@ -730,7 +723,7 @@ cluster_warp_specialized(float *input, float *output, int N)
 
 Thread Block Cluster 的实现建立在 Cooperative Groups 框架之上。理解两者的关系有助于更深入地理解这一特性。
 
-### 12.13.1 cooperative_groups 中的 Cluster Group 实现
+### 12.14.1 cooperative_groups 中的 Cluster Group 实现
 
 当你在核函数中调用 `cooperative_groups::this_cluster()` 时：
 
@@ -760,7 +753,7 @@ namespace cooperative_groups {
 }
 ```
 
-### 12.13.2 cluster.sync() 的内部实现
+### 12.14.2 cluster.sync() 的内部实现
 
 `cluster.sync()` 是硬件支持的同步操作。在 PTX 层面，它使用 `barrier.cluster` 指令。整个流程：
 
@@ -772,7 +765,7 @@ namespace cooperative_groups {
 - 最低开销：每个块内的一次 `__syncthreads()` + 一次硬件 barrier 操作；
 - 实际的延迟取决于簇的大小和 GPC 上块的分布。
 
-### 12.13.3 不能用 cluster.sync() 做什么
+### 12.14.3 不能用 cluster.sync() 做什么
 
 虽然 `cluster.sync()` 提供了簇级别的同步，但它<strong>不能</strong>用于：
 
@@ -780,9 +773,9 @@ namespace cooperative_groups {
 2. <strong>网格级同步</strong>：如果需要整个网格的同步，需要使用 Cooperative Groups 的 `grid.sync()`（有更多限制）；
 3. <strong>细粒度 Warp 同步</strong>：`cluster.sync()` 是块级别的，不适用于 warp 级同步。
 
-## 12.14 实践建议与最佳实践
+## 12.15 实践建议与最佳实践
 
-### 12.14.1 何时使用 Thread Block Cluster
+### 12.15.1 何时使用 Thread Block Cluster
 
 Thread Block Cluster 和分布式共享内存不是银弹。只有在以下场景中才值得使用：
 
@@ -790,13 +783,13 @@ Thread Block Cluster 和分布式共享内存不是银弹。只有在以下场
 2. <strong>需要跨块细粒度协作</strong>：算法天然需要块之间交换数据；
 3. <strong>可以减少全局内存原子操作</strong>：通过 DSM 将跨块操作保留在共享内存域内。
 
-### 12.14.2 何时不应使用
+### 12.15.2 何时不应使用
 
 1. <strong>数据完全独立</strong>：每个块独立处理自己的数据，不需要块间通信；
 2. <strong>单块共享内存已足够</strong>：如果数据大小适合单块共享内存，使用传统方法更简单；
 3. <strong>CC < 9.0 的硬件</strong>：Cluster 是 Hopper+ 独占特性。
 
-### 12.14.3 调试分布式共享内存的常见问题
+### 12.15.3 调试分布式共享内存的常见问题
 
 1. <strong>忘记 cluster.sync()</strong>：这是最常见的错误。在访问 DSM 之前和退出之前都必须同步。
 
@@ -808,7 +801,7 @@ Thread Block Cluster 和分布式共享内存不是银弹。只有在以下场
 
 5. <strong>共享内存分配不足</strong>：动态共享内存大小仍然是每块的。如果每块需要 X 字节共享内存，而簇大小是 4，总分布式共享内存大小是 4X——但启动时只需指定 X。
 
-### 12.14.4 迁移现有代码到 Cluster 的步骤
+### 12.15.4 迁移现有代码到 Cluster 的步骤
 
 1. <strong>评估收益</strong>：确定算法是否真正能从跨块共享内存中受益；
 2. <strong>添加条件编译</strong>：使用 `#if __CUDA_ARCH__ >= 900` 保护 Cluster 代码；
@@ -817,7 +810,7 @@ Thread Block Cluster 和分布式共享内存不是银弹。只有在以下场
 5. <strong>测试回退路径</strong>：确保在旧硬件上回退路径正确工作；
 6. <strong>性能对比</strong>：使用 Nsight Compute 对比新旧方案的性能。
 
-## 12.15 动手体验：扩展练习——分布式矩阵乘法预处理
+## 12.16 动手体验：扩展练习——分布式矩阵乘法预处理
 
 以下是一个扩展练习，展示如何使用 DSM 进行矩阵乘法的预处理步骤（数据重组/格式转换）。这个例子展示了一个实际场景：在 GEMM 的分块预处理中，使用 DSM 在簇内协同重组数据布局。
 
@@ -866,7 +859,19 @@ distributed_matrix_reorder(
 
     cluster.sync();
 
-    // 写回全局内存（省略）
+    // 写回全局内存
+    int out_base_row = blockIdx.x * tile_size;
+    int col_split = N / cluster_size;
+    int total_item = tile_size * col_split;
+    int tid = threadIdx.x;
+    for (; tid < total_item; tid += blockDim.x)
+    {
+        int r = tid / col_split;
+        int inner_c = tid % col_split;
+        int real_c = block_rank + inner_c * cluster_size;
+        int pos = (out_base_row + r) * N + real_c;
+        global_B[pos] = smem[tid];
+    }
 }
 
 // 主机端动态启动
@@ -896,7 +901,7 @@ void launch_reorder(float *d_A, float *d_B, int M, int N, int tile_size)
 }
 ```
 
-## 12.9 动手体验：完整的分布式直方图程序
+## 12.17 动手体验：完整的分布式直方图程序
 
 下面是一个完整的、可编译的分布式直方图示例程序。它包含了主机端准备数据、动态选择簇大小、核函数执行和结果验证的全流程。
 
@@ -1051,9 +1056,9 @@ int main()
 }
 ```
 
-## 12.16 常见问题与故障排除
+## 12.18 常见问题与故障排除
 
-### 12.16.1 启动失败：网格维度不是簇大小的整数倍
+### 12.18.1 启动失败：网格维度不是簇大小的整数倍
 
 <strong>症状</strong>：`cudaLaunchKernelEx` 返回 `cudaErrorInvalidConfiguration`。
 
@@ -1067,7 +1072,7 @@ int cluster_x = 4;
 blocks_x = ((blocks_x + cluster_x - 1) / cluster_x) * cluster_x; // 向上取整
 ```
 
-### 12.16.2 DSM 访问返回垃圾值
+### 12.18.2 DSM 访问返回垃圾值
 
 <strong>症状</strong>：从远程块的共享内存读取到未初始化或错误的数据。
 
@@ -1078,7 +1083,7 @@ blocks_x = ((blocks_x + cluster_x - 1) / cluster_x) * cluster_x; // 向上取整
 
 <strong>解决方法</strong>：仔细检查 `cluster.sync()` 的位置和 `map_shared_rank` 的参数。
 
-### 12.16.3 簇同步死锁
+### 12.18.3 簇同步死锁
 
 <strong>症状</strong>：kernel 无限期挂起，不返回。
 
@@ -1101,7 +1106,7 @@ if (block_rank == 0) {
 }
 ```
 
-### 12.16.4 性能不如预期
+### 12.18.4 性能不如预期
 
 <strong>可能原因和解决方案</strong>：
 1. <strong>簇大小太大</strong>：减少簇大小，测试不同配置；
@@ -1109,13 +1114,13 @@ if (block_rank == 0) {
 3. <strong>同步开销</strong>：评估是否真的需要 DSM——如果单块共享内存够用，去掉 Cluster；
 4. <strong>MIG 限制</strong>：检查是否在 MIG 实例上运行，MIG 减少可用 SM 数量。
 
-### 12.16.5 编译错误：__cluster_dims__ 与 __block_size__ 冲突
+### 12.18.5 编译错误：__cluster_dims__ 与 __block_size__ 冲突
 
 <strong>症状</strong>：`error: "__block_size__" and "__cluster_dims__" cannot be used together`
 
 <strong>解决</strong>：选择其中一个。通常使用 `__cluster_dims__` 更简洁，除非需要以簇数为单位启动。
 
-## 12.17 性能基准参考
+## 12.19 性能基准参考
 
 以下是在 NVIDIA H100 上使用不同方案进行直方图计算的性能对比（512 bins, 1M elements）：
 
@@ -1134,108 +1139,11 @@ if (block_rank == 0) {
 2. DSM 在 cluster_size=2 时比单块再加速 36%；
 3. DSM 从 4 到 8 的收益递减（2.7x → 3.0x），这是因为较大的簇带来更多的同步开销。
 
-## 12.18 动手体验2：扩展练习——分布式矩阵乘法预处理
-
-除了直方图和归约，DSM 还可以用于矩阵乘法分块中的数据重组。以下是一个扩展练习，展示如何使用 DSM 在 Cluster 内协作将 row-major 数据重组为 block-optimized 布局：
-
-```cuda
-// distributed_matrix_reorder.cu
-// 使用 DSM 进行矩阵分块数据重组
-__global__ void __cluster_dims__(2, 2, 1)
-distributed_matrix_reorder(
-    const float *__restrict__ global_A,
-    float *__restrict__ global_B,
-    int M, int N, int tile_size)
-{
-    extern __shared__ float smem[];
-    namespace cg = cooperative_groups;
-    cg::cluster_group cluster = cg::this_cluster();
-
-    int block_rank = cluster.block_rank();
-    int cluster_size = cluster.dim_blocks().x * cluster.dim_blocks().y;
-    int rows_per_block = tile_size / cluster_size;
-    int my_start_row = block_rank * rows_per_block;
-
-    // Step 1: 加载数据到本地共享内存
-    int global_row_offset = blockIdx.x * tile_size + my_start_row;
-    for (int i = threadIdx.x; i < rows_per_block * N; i += blockDim.x) {
-        int r = i / N;
-        int c = i % N;
-        smem[r * N + c] = global_A[(global_row_offset + r) * N + c];
-    }
-    __syncthreads();
-
-    // Step 2: DSM交叉拷贝——每个块将自己的数据分布到所有块
-    cluster.sync();
-
-    for (int r = 0; r < rows_per_block; r++) {
-        for (int c = 0; c < N; c++) {
-            int dst_block = c % cluster_size;
-            int dst_offset = r * (N / cluster_size) + (c / cluster_size);
-            float *dst_smem = cluster.map_shared_rank(smem, dst_block);
-            dst_smem[dst_offset] = smem[r * N + c];
-        }
-    }
-
-    cluster.sync();
-
-    // Step 3: 写回全局内存
-    for (int i = threadIdx.x; i < rows_per_block * (N / cluster_size); i += blockDim.x) {
-        int r = i / (N / cluster_size);
-        int c = i % (N / cluster_size);
-        int global_idx = (global_row_offset + r) * N + c;
-        global_B[global_idx] = smem[r * (N / cluster_size) + c];
-    }
-}
-
-// 主机端动态启动封装
-void launch_matrix_reorder(float *d_A, float *d_B, int M, int N, int tile_size)
-{
-    dim3 threads(256);
-    int tiles = M / tile_size;
-    int cluster_x = 2, cluster_y = 2;
-
-    cudaLaunchConfig_t config = {0};
-    config.gridDim = dim3(tiles);
-    config.blockDim = threads;
-    config.dynamicSmemBytes = tile_size * N * sizeof(float);
-
-    CUDA_CHECK(cudaFuncSetAttribute(
-        (void *)distributed_matrix_reorder,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        config.dynamicSmemBytes));
-
-    cudaLaunchAttribute attrs[1];
-    attrs[0].id = cudaLaunchAttributeClusterDimension;
-    attrs[0].val.clusterDim.x = cluster_x;
-    attrs[0].val.clusterDim.y = cluster_y;
-    attrs[0].val.clusterDim.z = 1;
-
-    config.numAttrs = 1;
-    config.attrs = attrs;
-
-    CUDA_CHECK(cudaLaunchKernelEx(&config, distributed_matrix_reorder,
-                                   d_A, d_B, M, N, tile_size));
-}
-```
-
-### 设计要点分析
-
-这个分布式矩阵重组核函数的设计体现了几个关键决策：
-
-1. <strong>分块策略</strong>：将 `tile_size` 行数据分配给 `cluster_size` 个块，每个块处理 `rows_per_block = tile_size / cluster_size` 行。这里要求 `tile_size` 能被 `cluster_size` 整除。
-
-2. <strong>DSM 交叉拷贝</strong>：每个块将自己的行数据按列重新分布——将属于其他块的列数据通过 `map_shared_rank` 直接写入目标块的共享内存。这一步避免了通过全局内存中转。
-
-3. <strong>同步点</strong>：两个 `cluster.sync()` 确保：(a) 所有块的本地数据加载完成；(b) 所有交叉拷贝完成。
-
-4. <strong>复杂度</strong>：这个示例展示了 DSM 的强大之处——如果没有 DSM，这种跨块数据重组需要通过全局内存原子操作或额外的 kernel launch 来完成。
-
-## 12.19 GPU 架构演进中的 Cluster 设计哲学
+## 12.20 GPU 架构演进中的 Cluster 设计哲学
 
 Thread Block Cluster 的引入不仅仅是增加了一个编程层次——它反映了 GPU 硬件架构的深层演进趋势。
 
-### 12.19.1 从 SM 到 GPC 的演进
+### 12.20.1 从 SM 到 GPC 的演进
 
 回顾 GPU 架构的发展：
 
@@ -1247,7 +1155,7 @@ Thread Block Cluster 的引入不仅仅是增加了一个编程层次——它
 
 - <strong>Hopper (2022)</strong>：Thread Block Cluster + DSM 首次将 GPC 概念暴露给程序员，使跨块协作成为一等公民。
 
-### 12.19.2 为什么是现在？
+### 12.20.2 为什么是现在？
 
 为什么 Cluster 在 Hopper 架构上才被引入？有几个技术原因：
 
@@ -1259,13 +1167,13 @@ Thread Block Cluster 的引入不仅仅是增加了一个编程层次——它
 
 4. <strong>工作负载需求</strong>：深度学习和大规模模拟对跨块协作的需求日益增长（如分布式 softmax、batch normalization 等）。
 
-### 12.19.3 展望：Blackwell 及以后
+### 12.20.3 展望：Blackwell 及以后
 
 NVIDIA Blackwell 架构（CC 10.0/12.0）在 Cluster 的基础上进一步引入了 <strong>Cluster Launch Control</strong>（集群启动控制），支持线程块之间的工作窃取（work stealing）。这表明 Cluster 将成为未来 GPU 编程模型的核心组成部分，而不仅仅是 Hopper 的特色功能。
 
 理解 Thread Block Cluster 和分布式共享内存，不仅让你能在 H100 上写出更好的程序，更让你为未来的 GPU 架构做好准备。
 
-## 12.20 本章小结
+## 12.21 本章小结
 
 本章我们深入探讨了 CUDA Hopper 架构引入的两个紧密相关的特性：Thread Block Clusters 和 Distributed Shared Memory。让我们回顾关键要点：
 
@@ -1283,7 +1191,7 @@ NVIDIA Blackwell 架构（CC 10.0/12.0）在 Cluster 的基础上进一步引入
 
 Thread Block Clusters 和分布式共享内存在 NVIDIA Hopper 架构上开启了 GPU 编程的新维度。它们使得跨线程块的细粒度协作成为可能，为许多以前需要全局内存回退的算法提供了更高效的选择。
 
-## 12.11 习题
+## 12.23 习题
 
 1. 解释 Thread Block Cluster 的共调度保证为什么是实现 `cluster.sync()` 的前提条件。如果没有共调度保证会发生什么？
 
@@ -1297,7 +1205,7 @@ Thread Block Clusters 和分布式共享内存在 NVIDIA Hopper 架构上开启
 
 6. 为什么在使用分布式共享内存时，需要在退出前再次调用 `cluster.sync()`？如果不调用会发生什么情况？
 
-## 12.12 参考文献
+## 12.24 参考文献
 
 1. CUDA C++ Programming Guide 13.0, Section 5.2.1 "Thread Block Clusters"
 2. CUDA C++ Programming Guide 13.0, Section 5.2.2 "Blocks as Clusters"

From 533393789a0044cf16e54a7bf44d026b3ae56a2b Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Thu, 4 Jun 2026 19:19:30 +0800
Subject: [PATCH 14/23] 6.4 chapter13

---
 .../async_pipeline_demo.cu                    |  77 ++---
 ...26\347\250\213\346\250\241\345\236\213.md" | 286 ++++++++----------
 2 files changed, 168 insertions(+), 195 deletions(-)
 rename outputs/gpu-programming-course/code/{advanced-chapter2 => advanced-chapter13}/async_pipeline_demo.cu (68%)
 rename "outputs/gpu-programming-course/docs/advanced-chapter2/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md" => "outputs/gpu-programming-course/docs/advanced-chapter13/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md" (89%)

diff --git a/outputs/gpu-programming-course/code/advanced-chapter2/async_pipeline_demo.cu b/outputs/gpu-programming-course/code/advanced-chapter13/async_pipeline_demo.cu
similarity index 68%
rename from outputs/gpu-programming-course/code/advanced-chapter2/async_pipeline_demo.cu
rename to outputs/gpu-programming-course/code/advanced-chapter13/async_pipeline_demo.cu
index d1e4ca9..b4ffa6c 100644
--- a/outputs/gpu-programming-course/code/advanced-chapter2/async_pipeline_demo.cu
+++ b/outputs/gpu-programming-course/code/advanced-chapter13/async_pipeline_demo.cu
@@ -33,23 +33,21 @@ __global__ void pipeline_demo_kernel(
 
     // 每个阶段一个缓冲区
     float *buffer[stages];
-    for (int s = 0; s < stages; s++) {
+    for (int s = 0; s < stages; ++s) {
         buffer[s] = shared_buffers + s * threads_per_block;
     }
 
-    // 创建 3 阶段 pipeline
+    // 创建 pipeline 状态
     __shared__ cuda::pipeline_shared_state<
         cuda::thread_scope::thread_scope_block, stages> pipe_state;
     auto pipe = cuda::make_pipeline(block, &pipe_state);
 
     size_t total_blocks = total_elements / threads_per_block;
     size_t block_id = block.group_index().x;
+    if (block_id != 0) return;   // 只用一个块演示
 
-    // 只有第一个块执行（简化演示）
-    if (block_id != 0) return;
-
-    // 预热流水线：填充前 (stages-1) 个阶段
-    for (int s = 0; s < stages - 1; s++) {
+    // 预热：填充前 (stages-1) 个批次 
+    for (int s = 0; s < stages - 1; ++s) {
         pipe.producer_acquire();
         cuda::memcpy_async(block, buffer[s],
                            input + s * threads_per_block,
@@ -57,51 +55,56 @@ __global__ void pipeline_demo_kernel(
         pipe.producer_commit();
     }
 
-    // 流水线稳态
-    for (size_t i = 0; i < total_blocks - (stages - 1); i++) {
-        int stage = i % stages;
+    // 流水线状态 ：消费 + 生产 
+    for (size_t i = 0; i < total_blocks - (stages - 1); ++i) {
+        // 1. 等待当前批次就绪（消费者）
+        pipe.consumer_wait();
+        int tid = threadIdx.x;
+        int cons_buf_idx = i % stages;                 // 当前批次应该所在的缓冲区
+        float *curr_buf = buffer[cons_buf_idx];
+
+        // 计算（每个线程独立处理自己的元素）
+        float val = curr_buf[tid] * scale + 1.0f;
+        // 写回全局内存
+        output[i * threads_per_block + tid] = val;
 
-        // 生产者：发起下一批数据的异步拷贝
+        pipe.consumer_release();   // 释放当前缓冲区，允许生产者复用
+
+        // 2. 为未来批次准备数据（生产者）
         pipe.producer_acquire();
-        size_t next_batch = i + stages - 1;
+        size_t next_batch = i + stages - 1;            // 要准备的下一个批次索引
         if (next_batch < total_blocks) {
-            cuda::memcpy_async(block, buffer[stage],
+            int prod_buf_idx = (i + stages - 1) % stages;   // 正确的目标缓冲区索引
+            cuda::memcpy_async(block, buffer[prod_buf_idx],
                                input + next_batch * threads_per_block,
                                sizeof(float) * threads_per_block, pipe);
         }
         pipe.producer_commit();
-
-        // 消费者：处理当前阶段的数据
-        pipe.consumer_wait();
-        int tid = threadIdx.x;
-        buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
-        __syncthreads();
-        // 写回
-        output[i * threads_per_block + tid] = buffer[stage][tid];
-        pipe.consumer_release();
     }
 
-    // 排空流水线：处理最后 (stages-1) 个阶段
-    for (size_t i = total_blocks - (stages - 1); i < total_blocks; i++) {
-        int stage = i % stages;
+    //  排空：处理最后 (stages-1) 个批次 
+    for (size_t i = total_blocks - (stages - 1); i < total_blocks; ++i) {
         pipe.consumer_wait();
         int tid = threadIdx.x;
-        buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
-        __syncthreads();
-        output[i * threads_per_block + tid] = buffer[stage][tid];
+        int cons_buf_idx = i % stages;
+        float *curr_buf = buffer[cons_buf_idx];
+
+        float val = curr_buf[tid] * scale + 1.0f;
+        output[i * threads_per_block + tid] = val;
+
         pipe.consumer_release();
     }
 }
 
 int main() {
-    const size_t N = threads_per_block * 100; // 100个批次
+    const size_t N = threads_per_block * 100;
     const size_t bytes = N * sizeof(float);
     const float scale = 2.0f;
 
-    // 主机内存
+    // 主机数据
     float *h_input = (float *)malloc(bytes);
     float *h_output = (float *)malloc(bytes);
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; ++i) {
         h_input[i] = (float)(i % 100) / 100.0f;
     }
 
@@ -111,21 +114,19 @@ int main() {
     CUDA_CHECK(cudaMalloc(&d_output, bytes));
     CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));
 
-    // 启动 pipeline 核函数
+    // 启动核函数
     size_t shared_mem = stages * threads_per_block * sizeof(float);
     pipeline_demo_kernel<<<1, threads_per_block, shared_mem>>>(
         d_input, d_output, N, scale);
-
     CUDA_CHECK(cudaDeviceSynchronize());
 
-    // 验证
+    // 验证结果
     CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
     bool correct = true;
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; ++i) {
         float expected = h_input[i] * scale + 1.0f;
         if (fabsf(h_output[i] - expected) > 1e-5f) {
-            printf("Mismatch at %zu: GPU %f vs CPU %f\n",
-                   i, h_output[i], expected);
+            printf("Mismatch at %zu: GPU %f vs CPU %f\n", i, h_output[i], expected);
             correct = false;
             break;
         }
@@ -136,4 +137,4 @@ int main() {
     CUDA_CHECK(cudaFree(d_input));
     CUDA_CHECK(cudaFree(d_output));
     return correct ? 0 : 1;
-}
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter2/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md" "b/outputs/gpu-programming-course/docs/advanced-chapter13/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md"
similarity index 89%
rename from "outputs/gpu-programming-course/docs/advanced-chapter2/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md"
rename to "outputs/gpu-programming-course/docs/advanced-chapter13/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md"
index cfd944d..4323ab7 100644
--- "a/outputs/gpu-programming-course/docs/advanced-chapter2/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md"	
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter13/\347\254\25413\347\253\240 \345\274\202\346\255\245SIMT\347\274\226\347\250\213\346\250\241\345\236\213.md"	
@@ -194,16 +194,16 @@ CUDA Programming Guide 还介绍了 <strong>Warp Specialization</strong>（空
 - 两者之间通过两个 `cuda::barrier` 实现"缓冲区就绪"和"缓冲区已填充"的信号传递。
 
 ```cuda
-#include &lt;cuda/barrier&gt;
-#include &lt;cooperative_groups.h&gt;
+#include <cuda/barrier>
+#include <cooperative_groups.h>
 
-using barrier = cuda::barrier&lt;cuda::thread_scope::thread_scope_block&gt;;
+using barrier = cuda::barrier<cuda::thread_scope::thread_scope_block>;
 
 __device__ void producer(barrier ready[], barrier filled[],
                          float* buffer, float* in, int N, int buffer_len)
 {
     for (int i = 0; i < (N/buffer_len); ++i) {
-        ready[i%2].arrive_and_wait();  /* 等待缓冲区就绪 */
+        ready[i%2].wait();  // 只等待，不 arrive（arrive 由消费者完成）
         /* 生产数据，填充 buffer_(i%2) */
         barrier::arrival_token token = filled[i%2].arrive();
         /* buffer_(i%2) 已填满——不等待，继续下一个迭代 */
@@ -216,7 +216,7 @@ __device__ void consumer(barrier ready[], barrier filled[],
     barrier::arrival_token token1 = ready[0].arrive(); /* buffer_0 就绪 */
     barrier::arrival_token token2 = ready[1].arrive(); /* buffer_1 就绪 */
     for (int i = 0; i < (N/buffer_len); ++i) {
-        filled[i%2].arrive_and_wait(); /* 等待缓冲区被填满 */
+        filled[i%2].wait(); // 只等待，不 arrive（arrive 由生产者完成）
         /* 消费 buffer_(i%2) */
         barrier::arrival_token token = ready[i%2].arrive();
         /* buffer_(i%2) 已消费完，可以重新填充 */
@@ -225,19 +225,25 @@ __device__ void consumer(barrier ready[], barrier filled[],
 
 __global__ void producer_consumer_pattern(int N, int buffer_len,
                                            float* in, float* out) {
-    // 双缓冲: buffer_0 = buffer, buffer_1 = buffer + buffer_len
     __shared__ extern float buffer[];
-
-    // bar[0]/bar[1] 跟踪 buffer_0/buffer_1 是否就绪
-    // bar[2]/bar[3] 跟踪 buffer_0/buffer_1 是否已填满
-    __shared__ barrier bar[4];
+    __shared__ barrier bar[4];  // bar[0]=ready0, bar[1]=ready1, bar[2]=filled0, bar[3]=filled1
 
     auto block = cooperative_groups::this_thread_block();
-    if (block.thread_rank() < 4)
-        init(bar + block.thread_rank(), block.size());
+    int tid = block.thread_rank();
+    int num_threads = block.size();
+    const int prod_count = warpSize;            // 生产者线程数 = 32
+    const int cons_count = num_threads - warpSize;  // 消费者线程数
+
+    // 正确初始化：ready 屏障期望消费者线程数，filled 屏障期望生产者线程数
+    if (tid == 0) {
+        init(&bar[0], cons_count);  // ready0
+        init(&bar[1], cons_count);  // ready1
+        init(&bar[2], prod_count);  // filled0
+        init(&bar[3], prod_count);  // filled1
+    }
     block.sync();
 
-    if (block.thread_rank() < warpSize)
+    if (tid < warpSize)
         producer(bar, bar+2, buffer, in, N, buffer_len);
     else
         consumer(bar, bar+2, buffer, out, N, buffer_len);
@@ -450,7 +456,7 @@ __global__ void single_stage_kernel(int *data, int *result, size_t size) {
 多阶段 Pipeline 才是真正展现实力的地方。通过使用多个共享内存缓冲区（"阶段"），你可以让数据拷贝和计算<strong>完全重叠</strong>：
 
 ```cuda
-#include &lt;cuda/pipeline&gt;
+#include <cuda/pipeline>
 
 constexpr int stages = 2; // 双缓冲
 
@@ -465,13 +471,13 @@ __global__ void pipeline_kernel(int *data, int *result, size_t size) {
     }
 
     // 创建多阶段 pipeline
-    __shared__ cuda::pipeline_shared_state&lt;
-        cuda::thread_scope::thread_scope_block, stages&gt; shared_state;
+    __shared__ cuda::pipeline_shared_state<
+        cuda::thread_scope::thread_scope_block, stages> shared_state;
     auto pipeline = cuda::make_pipeline(block, &shared_state);
 
     size_t num_blocks = size / block.size();
 
-    // 预热：先发起前 stages 个拷贝
+    // 预热：先发起前 stages-1 个拷贝
     for (int stage = 0; stage < stages - 1; stage++) {
         pipeline.producer_acquire();
         cuda::memcpy_async(block, buffer[stage], data + stage * block.size(),
@@ -479,11 +485,18 @@ __global__ void pipeline_kernel(int *data, int *result, size_t size) {
         pipeline.producer_commit();
     }
 
-    // 主循环：消费和生产的流水线重叠
+    // 主循环：先消费，再生产，避免 pipeline 队列满导致死锁
     for (size_t i = 0; i < num_blocks - (stages - 1); i++) {
         int stage = i % stages;
 
-        // 生产下一批数据（异步拷贝）
+        // 先等待并消费当前阶段的数据（由之前的生产提供）
+        pipeline.consumer_wait();
+        for (int j = threadIdx.x; j < block.size(); j += block.size()) {
+            result[i * block.size() + j] = buffer[stage][j] * 2;
+        }
+        pipeline.consumer_release();
+
+        // 再生产下一批数据（异步拷贝）
         pipeline.producer_acquire();
         size_t producer_offset = (i + stages - 1) * block.size();
         if (producer_offset < size) {
@@ -492,13 +505,6 @@ __global__ void pipeline_kernel(int *data, int *result, size_t size) {
                                sizeof(int) * block.size(), pipeline);
         }
         pipeline.producer_commit();
-
-        // 消费当前阶段的数据
-        pipeline.consumer_wait();
-        for (int j = threadIdx.x; j < block.size(); j += block.size()) {
-            result[i * block.size() + j] = buffer[stage][j] * 2;
-        }
-        pipeline.consumer_release();
     }
 
     // 排空：处理剩余的 stages-1 个阶段
@@ -671,111 +677,75 @@ NVIDIA Nsight Compute 提供了针对异步操作的专门指标：
 #include <cuda/barrier>
 #include <cooperative_groups.h>
 
+using namespace cooperative_groups;
 using barrier = cuda::barrier<cuda::thread_scope::thread_scope_block>;
 
-// 生产者：负责加载数据
-__device__ void producer_work(barrier &ready, barrier &filled,
-                              float *buf, const float *__restrict__ global_in,
-                              int block_start, int buffer_len)
-{
-    // 等待缓冲区就绪
-    ready.arrive_and_wait();
-
-    // 加载数据到共享内存缓冲区（异步）
-    auto block = cooperative_groups::this_thread_block();
-    // 使用 memcpy_async 进行异步加载
-    cuda::memcpy_async(block, buf, global_in + block_start,
-                       sizeof(float) * buffer_len, filled);
-
-    // 生产者不需要等待加载完成——它在 filled barrier 上 arrive
-    // filled.consumer 会等到数据加载完成
-}
-
-// 消费者：负责计算
-__device__ void consumer_work(barrier &ready, barrier &filled,
-                              float *buf, float *global_out,
-                              int block_start, int buffer_len)
-{
-    // 等待数据被填满
-    filled.arrive_and_wait();
-
-    // 数据就绪，进行计算
-    for (int i = threadIdx.x; i < buffer_len; i += blockDim.x) {
-        global_out[block_start + i] = buf[i] * 2.0f + 1.0f;
-    }
-
-    // 通知生产者缓冲区可以被重新填充
-    ready.arrive();
-}
-
-__global__ void full_producer_consumer(
+__global__ void producer_consumer_corrected(
     const float *__restrict__ global_in,
     float *__restrict__ global_out,
     int N, int buffer_len)
 {
-    // 双缓冲区
     __shared__ extern float buffer[];
-    float *buf_a = buffer;                     // buffer_0
-    float *buf_b = buffer + buffer_len;        // buffer_1
+    float *buf_a = buffer;
+    float *buf_b = buffer + buffer_len;
 
-    // 四个 barrier：
-    // ready[0/1]：buf_0/1 就绪（可以被填充）
-    // filled[0/1]：buf_0/1 已填满（可以被消费）
-    // ready 由消费者释放，filled 由生产者填充
     __shared__ barrier ready[2];
     __shared__ barrier filled[2];
 
-    auto block = cooperative_groups::this_thread_block();
+    auto block = this_thread_block();
     int tid = block.thread_rank();
 
-    // 初始化 barrier
-    if (tid < 2) {
-        init(&ready[tid], block.size());
-        init(&filled[tid], block.size());
+    const int prod_count = warpSize;                // 生产者线程数（正好1个warp）
+    const int cons_count = block.size() - prod_count; // 消费者线程数
+
+    if (tid == 0) {
+        init(&ready[0], cons_count);  // ready：消费者arrive，生产者wait
+        init(&ready[1], cons_count);
+        init(&filled[0], prod_count); // filled：生产者arrive，消费者wait
+        init(&filled[1], prod_count);
     }
-    block.sync();
+    block.sync(); // 确保所有线程看到初始化完成的屏障
 
-    // 空间分割：warp 0 是生产者，其余 warp 是消费者
-    bool is_producer = (tid / warpSize) == 0;
+    // tiled_partition<32>将block分成多个32线程的tile，第一个tile正好是生产者线程（tid 0-31）
+    auto producer_group = tiled_partition<warpSize>(block);
 
-    if (is_producer) {
-        // 预热：消费者需要 ready 信号才知道可以开始填充
-        // 但生产者首先需要等待消费者释放 ready
-        filled[0].arrive_and_wait();  // 标记 buf_0 已"填充"（初始为空需要此操作来完成初始化）
+    bool is_producer = (tid < prod_count);
+    int total_blocks = N / buffer_len;
 
-        int total_blocks = N / buffer_len;
+    if (is_producer) {
         for (int b = 0; b < total_blocks; b++) {
             int buf_idx = b % 2;
             int block_start = b * buffer_len;
 
-            ready[buf_idx].arrive_and_wait();  // 等待缓冲区就绪
+            ready[buf_idx].wait(); // 等待消费者释放缓冲区
 
-            // 异步加载
-            cuda::memcpy_async(block, (buf_idx == 0 ? buf_a : buf_b),
+            // 1. 只有生产者线程调用，group中所有线程都参与，符合CUDA语义
+            // 2. 整个生产者group合作拷贝整个缓冲区，只发起1次拷贝，无重复
+            // 3. 拷贝完成后，每个生产者线程自动在filled上arrive一次（共32次，正好匹配期望计数）
+            cuda::memcpy_async(producer_group,
+                               (buf_idx == 0 ? buf_a : buf_b),
                                global_in + block_start,
                                sizeof(float) * buffer_len,
                                filled[buf_idx]);
         }
     } else {
-        // 消费者：先通知生产者所有缓冲区初始可用
+        // 所有消费者发出初始信号：两个缓冲区初始都可用
         ready[0].arrive();
         ready[1].arrive();
 
-        int total_blocks = N / buffer_len;
         for (int b = 0; b < total_blocks; b++) {
             int buf_idx = b % 2;
             int block_start = b * buffer_len;
 
-            filled[buf_idx].arrive_and_wait();  // 等待缓冲区被填满
+            filled[buf_idx].wait(); // 等待生产者填充数据完成
 
-            // 消费数据
+            // 消费数据（索引完全正确）
             float *curr_buf = (buf_idx == 0 ? buf_a : buf_b);
-            for (int i = tid; i < buffer_len; i += block.size()) {
+            for (int i = tid - prod_count; i < buffer_len; i += cons_count) {
                 global_out[block_start + i] = curr_buf[i] * 2.0f + 1.0f;
             }
 
-            // 通知生产者缓冲区就绪
-            ready[buf_idx].arrive();
+            ready[buf_idx].arrive(); // 通知生产者缓冲区已释放
         }
     }
 }
@@ -790,10 +760,11 @@ __global__ void full_producer_consumer(
 // 编译: nvcc -arch=sm_80 async_pipeline_demo.cu -o async_pipeline_demo
 // 硬件要求: NVIDIA Ampere A100 或更新 (CC 8.0+)
 
-#include &lt;stdio.h&gt;
-#include &lt;stdlib.h&gt;
-#include &lt;cuda/pipeline&gt;
-#include &lt;cooperative_groups.h&gt;
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <cuda/pipeline>
+#include <cooperative_groups.h>
 
 #define CUDA_CHECK(call)                                             \
     do {                                                             \
@@ -819,23 +790,21 @@ __global__ void pipeline_demo_kernel(
 
     // 每个阶段一个缓冲区
     float *buffer[stages];
-    for (int s = 0; s < stages; s++) {
+    for (int s = 0; s < stages; ++s) {
         buffer[s] = shared_buffers + s * threads_per_block;
     }
 
-    // 创建 3 阶段 pipeline
-    __shared__ cuda::pipeline_shared_state&lt;
-        cuda::thread_scope::thread_scope_block, stages&gt; pipe_state;
+    // 创建 pipeline 状态
+    __shared__ cuda::pipeline_shared_state<
+        cuda::thread_scope::thread_scope_block, stages> pipe_state;
     auto pipe = cuda::make_pipeline(block, &pipe_state);
 
     size_t total_blocks = total_elements / threads_per_block;
     size_t block_id = block.group_index().x;
+    if (block_id != 0) return;   // 只用一个块演示
 
-    // 只有第一个块执行（简化演示）
-    if (block_id != 0) return;
-
-    // 预热流水线：填充前 (stages-1) 个阶段
-    for (int s = 0; s < stages - 1; s++) {
+    // 预热：填充前 (stages-1) 个批次 
+    for (int s = 0; s < stages - 1; ++s) {
         pipe.producer_acquire();
         cuda::memcpy_async(block, buffer[s],
                            input + s * threads_per_block,
@@ -843,51 +812,56 @@ __global__ void pipeline_demo_kernel(
         pipe.producer_commit();
     }
 
-    // 流水线稳态
-    for (size_t i = 0; i < total_blocks - (stages - 1); i++) {
-        int stage = i % stages;
+    // 流水线状态 ：消费 + 生产 
+    for (size_t i = 0; i < total_blocks - (stages - 1); ++i) {
+        // 1. 等待当前批次就绪（消费者）
+        pipe.consumer_wait();
+        int tid = threadIdx.x;
+        int cons_buf_idx = i % stages;                 // 当前批次应该所在的缓冲区
+        float *curr_buf = buffer[cons_buf_idx];
+
+        // 计算（每个线程独立处理自己的元素）
+        float val = curr_buf[tid] * scale + 1.0f;
+        // 写回全局内存
+        output[i * threads_per_block + tid] = val;
 
-        // 生产者：发起下一批数据的异步拷贝
+        pipe.consumer_release();   // 释放当前缓冲区，允许生产者复用
+
+        // 2. 为未来批次准备数据（生产者）
         pipe.producer_acquire();
-        size_t next_batch = i + stages - 1;
+        size_t next_batch = i + stages - 1;            // 要准备的下一个批次索引
         if (next_batch < total_blocks) {
-            cuda::memcpy_async(block, buffer[stage],
+            int prod_buf_idx = (i + stages - 1) % stages;   // 正确的目标缓冲区索引
+            cuda::memcpy_async(block, buffer[prod_buf_idx],
                                input + next_batch * threads_per_block,
                                sizeof(float) * threads_per_block, pipe);
         }
         pipe.producer_commit();
-
-        // 消费者：处理当前阶段的数据
-        pipe.consumer_wait();
-        int tid = threadIdx.x;
-        buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
-        __syncthreads();
-        // 写回
-        output[i * threads_per_block + tid] = buffer[stage][tid];
-        pipe.consumer_release();
     }
 
-    // 排空流水线：处理最后 (stages-1) 个阶段
-    for (size_t i = total_blocks - (stages - 1); i < total_blocks; i++) {
-        int stage = i % stages;
+    //  排空：处理最后 (stages-1) 个批次 
+    for (size_t i = total_blocks - (stages - 1); i < total_blocks; ++i) {
         pipe.consumer_wait();
         int tid = threadIdx.x;
-        buffer[stage][tid] = buffer[stage][tid] * scale + 1.0f;
-        __syncthreads();
-        output[i * threads_per_block + tid] = buffer[stage][tid];
+        int cons_buf_idx = i % stages;
+        float *curr_buf = buffer[cons_buf_idx];
+
+        float val = curr_buf[tid] * scale + 1.0f;
+        output[i * threads_per_block + tid] = val;
+
         pipe.consumer_release();
     }
 }
 
 int main() {
-    const size_t N = threads_per_block * 100; // 100个批次
+    const size_t N = threads_per_block * 100;
     const size_t bytes = N * sizeof(float);
     const float scale = 2.0f;
 
-    // 主机内存
+    // 主机数据
     float *h_input = (float *)malloc(bytes);
     float *h_output = (float *)malloc(bytes);
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; ++i) {
         h_input[i] = (float)(i % 100) / 100.0f;
     }
 
@@ -897,21 +871,19 @@ int main() {
     CUDA_CHECK(cudaMalloc(&d_output, bytes));
     CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));
 
-    // 启动 pipeline 核函数
+    // 启动核函数
     size_t shared_mem = stages * threads_per_block * sizeof(float);
-    pipeline_demo_kernel&lt;&lt;&lt;1, threads_per_block, shared_mem&gt;&gt;&gt;(
+    pipeline_demo_kernel<<<1, threads_per_block, shared_mem>>>(
         d_input, d_output, N, scale);
-
     CUDA_CHECK(cudaDeviceSynchronize());
 
-    // 验证
+    // 验证结果
     CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
     bool correct = true;
-    for (size_t i = 0; i < N; i++) {
+    for (size_t i = 0; i < N; ++i) {
         float expected = h_input[i] * scale + 1.0f;
         if (fabsf(h_output[i] - expected) > 1e-5f) {
-            printf("Mismatch at %zu: GPU %f vs CPU %f\n",
-                   i, h_output[i], expected);
+            printf("Mismatch at %zu: GPU %f vs CPU %f\n", i, h_output[i], expected);
             correct = false;
             break;
         }
@@ -925,7 +897,7 @@ int main() {
 }
 ```
 
-### 流水线效率分析
+### 13.10.1 流水线效率分析
 
 多阶段 Pipeline 的关键优势在于：当生产者（数据拷贝）在处理阶段 N 时，消费者（计算）同时处理阶段 N-1。通过 `stages` 个缓冲区，实现了以下重叠：
 
@@ -939,7 +911,7 @@ int main() {
 
 在稳态下，Load 和 Compute 完全重叠。阶段数越多，越容易在拷贝延迟波动时维持重叠，但也会消耗更多共享内存。
 
-## 13.8 同步机制的对比
+## 13.11 同步机制的对比
 
 | 特性 | `__syncthreads()` | `cuda::barrier` | `cuda::pipeline` |
 |------|------------------|-----------------|-----------------|
@@ -957,9 +929,9 @@ CUDA Programming Guide 也给出了建议：
 
 即：对于不需要异步重叠的简单同步场景，使用传统的 `__syncthreads()` 仍然是最优选择。
 
-## 13.9 异步操作的调试与错误处理
+## 13.12 异步操作的调试与错误处理
 
-### 13.9.1 常见问题诊断
+### 13.12.1 常见问题诊断
 
 使用异步操作时，常见的问题包括：
 
@@ -990,15 +962,15 @@ bar.wait(std::move(token));  // 使用 phase N 的过期 token
 
 <strong>4. Pipeline 阶段溢出</strong>：`producer_acquire()` 超过 pipeline 的阶段数导致死锁。
 
-### 13.9.2 使用环境变量和工具调试
+### 13.12.2 使用环境变量和工具调试
 
 - `CUDA_LAUNCH_BLOCKING=1`：使所有 kernel 启动变为同步，便于隔离问题；
 - `cuda-memcheck` 工具检测共享内存的非法访问；
 - NVIDIA Compute Sanitizer 可以检测异步操作中的数据竞争。
 
-## 13.10 高级 Pipeline 模式
+## 13.13 高级 Pipeline 模式
 
-### 13.10.1 Pipeline 与 GEMM 的软件流水线
+### 13.13.1 Pipeline 与 GEMM 的软件流水线
 
 在矩阵乘法（GEMM）中，多阶段 Pipeline 用于实现 Global-to-Shared 内存拷贝与 Tensor Core 计算的重叠：
 
@@ -1066,7 +1038,7 @@ __global__ void gemm_pipeline(
 }
 ```
 
-### 13.10.2 动态阶段数选择
+### 13.13.2 动态阶段数选择
 
 Pipeline 的阶段数涉及权衡：
 
@@ -1084,7 +1056,7 @@ int select_pipeline_stages(size_t tile_bytes, size_t smem_per_block) {
 }
 ```
 
-### 13.10.3 Pipeline 交错模式
+### 13.13.3 Pipeline 交错模式
 
 ```cuda
 // 双 Pipeline 交错
@@ -1101,7 +1073,7 @@ pa.consumer_wait(); compute_a(); pa.consumer_release();
 pb.consumer_wait(); compute_b(); pb.consumer_release();
 ```
 
-## 13.11 与主机端异步操作的对比
+## 13.14 与主机端异步操作的对比
 
 | 特性 | 主机端异步 (cudaMemcpyAsync) | 设备端异步 (memcpy_async) |
 |------|---------------------------|-------------------------|
@@ -1114,9 +1086,9 @@ pb.consumer_wait(); compute_b(); pb.consumer_release();
 
 两种异步方式可以<strong>同时使用</strong>：设备端 pipeline 负责细粒度的 Global→Shared 重叠，主机端 Stream 负责不同 kernel 间的粗粒度重叠。
 
-## 13.13 常见问题与故障排除
+## 13.15 常见问题与故障排除
 
-### 13.13.1 `cuda::barrier` 死锁
+### 13.15.1 `cuda::barrier` 死锁
 
 <strong>症状</strong>：所有线程卡在 `bar.wait()` 调用上。
 
@@ -1130,7 +1102,7 @@ pb.consumer_wait(); compute_b(); pb.consumer_release();
 - 确保所有参与线程都在同一个代码路径中调用 `arrive()`；
 - 每次迭代使用新的 token。
 
-### 13.13.2 Pipeline 死锁
+### 13.15.2 Pipeline 死锁
 
 <strong>症状</strong>：`producer_acquire()` 永不返回。
 
@@ -1146,7 +1118,7 @@ for (int i = 0; i < num_iter; i++) {
 // 几个迭代后死锁
 ```
 
-### 13.13.3 `memcpy_async` 性能差
+### 13.15.3 `memcpy_async` 性能差
 
 <strong>常见原因</strong>：
 1. 拷贝大小不是 16 字节的倍数（回退到逐字节拷贝）；
@@ -1160,7 +1132,7 @@ for (int i = 0; i < num_iter; i++) {
 - 确保全局内存基地址 128 字节对齐；
 - 在 commit/wait 前使用 `__syncwarp()` 恢复 warp 收敛。
 
-### 13.13.4 数据完整性错误
+### 13.15.4 数据完整性错误
 
 <strong>症状</strong>：某些输出值不正确或为零。
 
@@ -1170,7 +1142,7 @@ for (int i = 0; i < num_iter; i++) {
 3. 确认 `producer_commit()` 在所有异步拷贝之后调用；
 4. 检查 `fence_proxy_async_shared_cta()` 是否在写回前调用（如果使用 TMA）。
 
-## 13.14 性能基准参考
+## 13.16 性能基准参考
 
 以下是在 A100 (CC 8.0) 上进行批量数据处理（100 批次，256 个 float 每批次）的性能参考：
 
@@ -1189,7 +1161,7 @@ for (int i = 0; i < num_iter; i++) {
 2. 多阶段 pipeline 带来显著提升（2-stage 比单阶段快 36%）；
 3. 从 3-stage 到 4-stage 的增量很小，因为瓶颈从拷贝转向了计算。
 
-## 13.15 迁移指南：从 __syncthreads 到异步模型
+## 13.17 迁移指南：从 __syncthreads 到异步模型
 
 如果你的现有代码使用传统的 `__syncthreads()` 模式，迁移到异步模型应该循序渐进：
 
@@ -1248,7 +1220,7 @@ for (int i = 0; i < n; i++) {
 - 验证计算和数据拷贝是否有实际重叠；
 - 调整 pipeline 阶段数以平衡共享内存和吞吐量。
 
-## 13.16 动手体验2：异步归约
+## 13.18 动手体验2：异步归约
 
 在第 12 章我们使用 Thread Block Cluster + DSM 进行分布式归约。这里展示一个使用 `cuda::barrier` 进行分阶段异步归约的替代方案：
 
@@ -1330,7 +1302,7 @@ for (int i = 0; i < n; i++) {
 }
 ```
 
-## 13.17 本章小结
+## 13.19 本章小结
 
 本章全面介绍了 CUDA 异步 SIMT 编程模型，涵盖以下要点：
 
@@ -1348,7 +1320,7 @@ for (int i = 0; i < n; i++) {
 
 异步 SIMT 编程模型是现代 GPU 编程中不可或缺的工具。随着 GPU 内存带宽与计算能力之间的差距不断拉大，将数据搬运隐藏在计算之后变得愈发重要。掌握这些 API 将显著提升你的 CUDA 程序性能。
 
-## 13.10 习题
+## 13.20 习题
 
 1. 解释 `cuda::barrier` 的"时间分割"（Temporal Splitting）五阶段模型。为什么 arrive 和 wait 之间的阶段是实现重叠的关键？
 
@@ -1362,7 +1334,7 @@ for (int i = 0; i < n; i++) {
 
 6. 说明 Warp Specialization 模式中为什么需要 4 个 barrier（2×2双缓冲），而不是 2 个。
 
-## 13.11 参考文献
+## 13.21 参考文献
 
 1. CUDA C++ Programming Guide 13.0, Section 5.5 "Asynchronous SIMT Programming Model"
 2. CUDA C++ Programming Guide 13.0, Section 10.26 "Asynchronous Barrier"

From a4cb6d0362cf6c694760abfba841a72e342ebb6e Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Sat, 6 Jun 2026 01:23:05 +0800
Subject: [PATCH 15/23] 6.5 chapter14

---
 .../code/advanced-chapter14/tma_2d_demo.cu    |  277 ++++
 .../code/chapter6/pinned_bandwidth.cu         |    2 -
 ...\347\253\240 Tensor Memory Accelerator.md" | 1326 +++++++++++++++++
 3 files changed, 1603 insertions(+), 2 deletions(-)
 create mode 100644 outputs/gpu-programming-course/code/advanced-chapter14/tma_2d_demo.cu
 create mode 100644 "outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md"

diff --git a/outputs/gpu-programming-course/code/advanced-chapter14/tma_2d_demo.cu b/outputs/gpu-programming-course/code/advanced-chapter14/tma_2d_demo.cu
new file mode 100644
index 0000000..5f89fb9
--- /dev/null
+++ b/outputs/gpu-programming-course/code/advanced-chapter14/tma_2d_demo.cu
@@ -0,0 +1,277 @@
+// 文件: tma_2d_demo.cu
+// 演示：2D TMA 分块加载、处理和写回
+// 编译: nvcc -arch=sm_90a -lcuda tma_2d_demo.cu -o tma_2d_demo
+// 硬件要求: NVIDIA Hopper H100 (CC 9.0+)，推荐 sm_90a 以启用完整 TMA 特性
+//
+// 重要概念说明：
+// 1. Global → Shared 使用 mbarrier 完成机制，TMA 指令**自动**更新 barrier 的事务计数，
+//    所有线程调用普通 bar.arrive()
+// 2. Shared → Global 使用 bulk async-group 完成机制，只有发起线程需要 commit 和 wait
+// 3. __grid_constant__ 提示编译器该参数在整个 grid 执行期间不变，放入常量缓存加速访问
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cassert>
+#include <cuda.h>
+#include <cudaTypedefs.h>
+#include <cuda/barrier>
+#include <cuda/experimental/__pipeline>
+
+// ========== CUDA 错误检查宏 ==========
+#define CUDA_CHECK(call)                                             \
+    do {                                                             \
+        cudaError_t err = call;                                      \
+        if (err != cudaSuccess) {                                    \
+            fprintf(stderr, "CUDA Error at %s:%d - %s\n",            \
+                    __FILE__, __LINE__, cudaGetErrorString(err));    \
+            exit(EXIT_FAILURE);                                      \
+        }                                                            \
+    } while (0)
+
+using barrier = cuda::barrier<cuda::thread_scope_block>;
+namespace cde = cuda::device::experimental;
+
+// ========== 常量定义 ==========
+constexpr int GMEM_WIDTH  = 256;              // 全局内存宽度（元素数）
+constexpr int GMEM_HEIGHT = 256;              // 全局内存高度（元素数）
+constexpr int SMEM_WIDTH  = 16;               // 共享内存 tile 宽度（元素数）
+constexpr int SMEM_HEIGHT = 16;               // 共享内存 tile 高度（元素数）
+constexpr int TILE_SIZE   = SMEM_WIDTH * SMEM_HEIGHT;  // 每个 tile 的元素总数
+
+// 编译时检查对齐要求
+static_assert(GMEM_WIDTH % SMEM_WIDTH == 0, "GMEM_WIDTH must be multiple of SMEM_WIDTH");
+static_assert(GMEM_HEIGHT % SMEM_HEIGHT == 0, "GMEM_HEIGHT must be multiple of SMEM_HEIGHT");
+
+// ========== 主机端：获取 cuTensorMapEncodeTiled 函数指针 ==========
+// 注意：cuTensorMapEncodeTiled 是 Driver API 函数，需要通过 cudaGetDriverEntryPointByVersion 获取
+
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() {
+    void* ptr = nullptr;
+    cudaDriverEntryPointQueryResult status;
+    cudaError_t err = cudaGetDriverEntryPointByVersion(
+        "cuTensorMapEncodeTiled", &ptr, 12000, cudaEnableDefault, &status);
+    if (err != cudaSuccess || status != cudaDriverEntryPointSuccess) {
+        fprintf(stderr, "Failed to get cuTensorMapEncodeTiled: %s\n",
+                cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(ptr);
+}
+
+// ========== 主机端：创建 2D Tensor Map ==========
+// Tensor Map 描述了全局内存中张量的布局，供 TMA 硬件使用
+// 注意：维度顺序是 fastest-changing dimension 在索引 0
+
+CUtensorMap create_2d_tensor_map(int* d_data) {
+    CUtensorMap tmap{};
+    constexpr uint32_t rank = 2;
+
+    // globalDim: 全局张量各维度尺寸（元素数）
+    uint64_t size[rank] = {GMEM_WIDTH, GMEM_HEIGHT};
+
+    // globalStrides: 全局张量各维度步长（字节）
+    // rank-1 个步长：最快维度（dim0）无 stride，dim1 的 stride = width * sizeof(int)
+    uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
+
+    // boxDim: 每次 TMA 拷贝的 tile 大小（元素数）
+    uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT};
+
+    // elementStride: 元素步长（以 sizeof(datatype) 为单位），通常设为 1
+    uint32_t elem_stride[rank] = {1, 1};
+
+    auto encode = get_cuTensorMapEncodeTiled();
+    CUresult res = encode(
+        &tmap,
+        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,  // 数据类型
+        rank,                                                 // 张量维度数
+        d_data,                                               // 全局内存基地址
+        size,                                                 // 各维度尺寸
+        stride,                                               // 各维度步长
+        box_size,                                             // tile 尺寸
+        elem_stride,                                          // 元素步长
+        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
+        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,       // 本章先不开启 Swizzle
+        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
+        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+
+    if (res != CUDA_SUCCESS) {
+        fprintf(stderr, "cuTensorMapEncodeTiled failed with code %d\n", res);
+        exit(EXIT_FAILURE);
+    }
+    return tmap;
+}
+
+// ========== 设备端：核函数 ==========
+// 使用 __grid_constant__ 提示编译器将 tensor_map 放入常量缓存以提高访问效率
+// 每个线程块处理一个 (x, y) 坐标对应的 tile
+
+__global__ void tma_copy_kernel(const __grid_constant__ CUtensorMap tensor_map,
+                                int* __restrict__ output,
+                                int tiles_per_row) {
+    // 多维 TMA 操作的共享内存目标缓冲区需要 128 字节对齐
+    __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH];
+
+    // ---------- 1. 初始化 mbarrier ----------
+    #pragma nv_diag_suppress static_var_with_dynamic_init
+    __shared__ barrier bar;
+
+    // 只有线程 0 初始化 barrier
+    // barrier 以 blockDim.x（块内所有线程）为参与线程数
+    if (threadIdx.x == 0) {
+        init(&bar, blockDim.x);
+        // fence_proxy_async_shared_cta 使 barrier 对 async proxy 可见
+        cde::fence_proxy_async_shared_cta();
+    }
+    __syncthreads();
+
+    // ---------- 2. TMA 读取：Global → Shared ----------
+    // 计算当前 block 要处理的 tile 在全局数组中的坐标
+    int global_x = (blockIdx.x % tiles_per_row) * SMEM_WIDTH;   // 最内层维度（column）
+    int global_y = (blockIdx.x / tiles_per_row) * SMEM_HEIGHT;  // 外层维度（row）
+
+    barrier::arrival_token token;
+    if (threadIdx.x == 0) {
+        // 发起 2D TMA 拷贝
+        // 重要：硬件指令 cp_async_bulk_tensor 会**自动**向 barrier 提交预期的事务计数
+        // 因此这里不需要（也不应该）调用 barrier_arrive_tx
+        cde::cp_async_bulk_tensor_2d_global_to_shared(
+            &smem_buffer, &tensor_map, global_x, global_y, bar);
+    }
+    // 所有线程到达 barrier（包括发起线程）
+    // barrier 会自动等待 TMA 传输完成后才让 wait 返回
+    token = bar.arrive();
+    bar.wait(std::move(token));
+    // 此时数据已在共享内存中，所有线程均可安全访问
+
+    // ---------- 3. 计算：对数据加 1 ----------
+    // 使用块内所有线程并行处理 tile 数据
+    for (int idx = threadIdx.x; idx < TILE_SIZE; idx += blockDim.x) {
+        int row = idx / SMEM_WIDTH;
+        int col = idx % SMEM_WIDTH;
+        smem_buffer[row][col] += 1;
+    }
+
+    // ---------- 4. TMA 写入：Shared → Global ----------
+    // 写回前需要确保共享内存写入对 TMA 引擎可见
+    cde::fence_proxy_async_shared_cta();
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+        // 发起 2D TMA 写回
+        cde::cp_async_bulk_tensor_2d_shared_to_global(
+            &tensor_map, global_x, global_y, &smem_buffer);
+
+        // Shared → Global 方向使用 bulk async-group 完成机制
+        // 将当前操作提交到 group
+        cde::cp_async_bulk_commit_group();
+        // 等待 group 完成（0 表示等待所有之前的操作）
+        cde::cp_async_bulk_wait_group_read<0>();
+    }
+
+    // ---------- 5. 清理 ----------
+    if (threadIdx.x == 0) {
+        (&bar)->~barrier();   // 手动销毁 barrier，释放共享内存
+    }
+}
+
+// ========== 主机端：验证函数 ==========
+bool verify_result(const int* h_output, size_t size) {
+    for (size_t i = 0; i < size; i++) {
+        // 输入是 1，经过加 1 后应为 2
+        if (h_output[i] != 2) {
+            printf("Verification failed at index %zu: expected 2, got %d\n",
+                   i, h_output[i]);
+            return false;
+        }
+    }
+    return true;
+}
+
+// ========== 主函数 ==========
+int main() {
+    // 1. 检查设备计算能力
+    int device;
+    CUDA_CHECK(cudaGetDevice(&device));
+    cudaDeviceProp props;
+    CUDA_CHECK(cudaGetDeviceProperties(&props, device));
+
+    if (props.major < 9) {
+        fprintf(stderr, "Error: TMA requires Compute Capability 9.0+ (NVIDIA Hopper). "
+                        "Current device: sm_%d%d\n", props.major, props.minor);
+        return 1;
+    }
+    printf("Device: %s (Compute Capability %d.%d)\n", props.name, props.major, props.minor);
+    if (props.major == 9 && props.minor == 0) {
+        printf("Note: For best TMA performance, compile with -arch=sm_90a\n");
+    }
+
+    // 2. 分配全局内存
+    size_t bytes = static_cast<size_t>(GMEM_WIDTH) * GMEM_HEIGHT * sizeof(int);
+    int* d_input = nullptr;
+    int* d_output = nullptr;
+    CUDA_CHECK(cudaMalloc(&d_input, bytes));
+    CUDA_CHECK(cudaMalloc(&d_output, bytes));
+
+    // 3. 初始化输入数据为全 1
+    CUDA_CHECK(cudaMemset(d_input, 1, bytes));
+    CUDA_CHECK(cudaMemset(d_output, 0, bytes));
+
+    // 4. 创建 Tensor Map（TMA 硬件描述符）
+    CUtensorMap tensor_map = create_2d_tensor_map(d_input);
+
+    // 5. 计算 launch 配置
+    int tiles_per_row = GMEM_WIDTH / SMEM_WIDTH;
+    int tiles_per_col = GMEM_HEIGHT / SMEM_HEIGHT;
+    int total_tiles = tiles_per_row * tiles_per_col;
+    int threads_per_block = 256;
+    // 共享内存大小：使用常量计算，不能用 sizeof(smem_buffer)（核函数内局部变量）
+    size_t smem_size = SMEM_HEIGHT * SMEM_WIDTH * sizeof(int);
+
+    printf("Global matrix: %d x %d (%zu bytes)\n", GMEM_WIDTH, GMEM_HEIGHT, bytes);
+    printf("Tile: %d x %d (%zu bytes)\n", SMEM_WIDTH, SMEM_HEIGHT, smem_size);
+    printf("Blocks: %d (%d x %d), Threads per block: %d\n",
+           total_tiles, tiles_per_row, tiles_per_col, threads_per_block);
+
+    // 6. 启动核函数
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+
+    CUDA_CHECK(cudaEventRecord(start));
+    tma_copy_kernel<<<total_tiles, threads_per_block, smem_size>>>(tensor_map, d_output, tiles_per_row);
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float elapsed_ms;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
+    printf("Kernel execution time: %.3f ms\n", elapsed_ms);
+
+    // 检查 kernel 执行错误
+    CUDA_CHECK(cudaGetLastError());
+
+    // 7. 验证结果
+    int* h_output = static_cast<int*>(malloc(bytes));
+    CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
+
+    bool correct = verify_result(h_output, static_cast<size_t>(GMEM_WIDTH) * GMEM_HEIGHT);
+    printf("\n========== Result ==========\n");
+    printf("TMA 2D Demo: %s\n", correct ? "PASS ✓" : "FAIL ✗");
+
+    if (correct) {
+        // 打印前几个元素作为示例
+        printf("First 16 elements of output:\n");
+        for (int i = 0; i < 16 && i < GMEM_WIDTH * GMEM_HEIGHT; i++) {
+            printf("%d ", h_output[i]);
+        }
+        printf("\n");
+    }
+
+    // 8. 清理资源
+    free(h_output);
+    CUDA_CHECK(cudaFree(d_input));
+    CUDA_CHECK(cudaFree(d_output));
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+
+    return correct ? 0 : 1;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/chapter6/pinned_bandwidth.cu b/outputs/gpu-programming-course/code/chapter6/pinned_bandwidth.cu
index 8cf3ab2..5800ac7 100644
--- a/outputs/gpu-programming-course/code/chapter6/pinned_bandwidth.cu
+++ b/outputs/gpu-programming-course/code/chapter6/pinned_bandwidth.cu
@@ -144,8 +144,6 @@ int main()
     CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
     printf("Device: %s (CC %d.%d)\n", prop.name, prop.major,
            prop.minor);
-    printf("PCIe Generation: %d\n",
-           prop.pcieGUIDisplayDevice ? 3 : 3); // 简化显示
     printf("\n");
 
     // =========================================================================
diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md" "b/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md"
new file mode 100644
index 0000000..bf8cbab
--- /dev/null
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md"	
@@ -0,0 +1,1326 @@
+# 第14章 Tensor Memory Accelerator (TMA/张量内存加速器)
+
+<strong>硬件要求</strong>：Compute Capability 9.0+（NVIDIA Hopper H100+）；多播特性建议 `sm_90a`
+
+## 14.1 引言
+
+在上一章中，我们学习了 `cuda::memcpy_async` 和 `cuda::pipeline`——它们将数据从全局内存异步拷贝到共享内存，绕过了寄存器中转。这些 API 本质上是对硬件"cp.async"指令的抽象，能够高效地完成连续一维数据块的拷贝。
+
+然而，许多高性能计算和深度学习工作负载涉及<strong>多维数组</strong>的<strong>不规则访问模式</strong>——例如矩阵乘法中的分块（tiling）、卷积中的滑动窗口、以及张量转置等。在这些场景中，地址计算本身就成为了一项昂贵的开销：你需要计算源地址、目标地址、步长、边界检查等。
+
+这就是 <strong>Tensor Memory Accelerator (TMA)</strong> 大显身手的地方。CUDA Programming Guide 对 TMA 的定位非常清晰：
+
+> "The primary goal of TMA is to provide an efficient data transfer mechanism from global memory to shared memory for multi-dimensional arrays."
+
+TMA 是 NVIDIA Hopper 架构引入的一个<strong>硬件加速数据拷贝单元</strong>。它不仅仅在做数据搬运，更关键的是：<strong>它将地址计算从 CUDA Core 卸载到了专用硬件</strong>。这意味着：
+
+1. <strong>减少寄存器压力</strong>：不再需要寄存器来存储地址计算中的中间值；
+2. <strong>零地址计算开销</strong>：硬件根据"张量映射"（Tensor Map）自动完成多维地址生成；
+3. <strong>硬件管理的 Swizzle</strong>：TMA 可以自动重排共享内存中的数据布局，消除 bank conflict；
+4. <strong>异步执行</strong>：与 `memcpy_async` 一样，TMA 拷贝是异步的；
+5. <strong>Cluster 多播</strong>：一次拷贝可以同时将数据广播到簇中多个块的共享内存。
+
+本章将带你深入理解 TMA 的工作机制、Tensor Map 的创建与使用、TMA 的完成机制，以及 Swizzle 模式如何优化数据访问。最后我们将给出 TMA 与手动 `memcpy_async` 的性能对比分析。
+
+## 14.2 TMA 概述
+
+### 14.2.1 TMA 的命名约定
+
+CUDA Programming Guide 在命名上做了明确的区分：
+
+> "Naming. Tensor memory accelerator (TMA) is a broad term used to refer to the features described in this section. For the purpose of forward-compatibility and to reduce discrepancies with the PTX ISA, the text in this section refers to TMA operations as either bulk-asynchronous copies or bulk tensor asynchronous copies, depending on the specific type of copy used. The term 'bulk' is used to contrast these operations with the asynchronous memory operations described in the previous sections."
+
+简单来说：
+
+| 术语 | 含义 | 使用场景 |
+|------|------|---------|
+| <strong>bulk-asynchronous copy</strong> | 一维连续数据的批量异步拷贝 | 不需要 Tensor Map，直接使用指针+大小 |
+| <strong>bulk tensor asynchronous copy</strong> | 多维张量的批量异步拷贝 | 需要 Tensor Map 描述数据布局 |
+
+### 14.2.2 TMA 的关键特性
+
+CUDA Programming Guide 归纳了 TMA 的以下关键特性：
+
+<strong>维度支持</strong>：
+
+> "Dimensions. TMA supports copying both one-dimensional and multi-dimensional arrays (up to 5-dimensional)."
+
+TMA 拷贝操作底层都依赖 Tensor Map 硬件机制。区别在于：一维拷贝由硬件自动创建隐式映射，程序员无需管理 CUtensorMap 变量；多维拷贝必须由程序员显式构造 CUtensorMap 并传递给 TMA 指令。
+
+<strong>源和目标</strong>：
+
+TMA 支持的拷贝方向非常灵活：
+
+| 源 | 目标 | 完成机制 |
+|-----|------|---------|
+| Global | Shared::cta | Shared Memory Barrier (mbarrier) |
+| Shared::cta | Global | Bulk async-group |
+| Global | Shared::cluster（多播） | Shared Memory Barrier (mbarrier) |
+| Shared::cta | Shared::cluster | Shared Memory Barrier (mbarrier) |
+
+CUDA Programming Guide 原表（Table 8）：
+
+> "Asynchronous copies with possible source and destinations memory spaces and completion mechanisms."
+
+<strong>异步性</strong>：
+
+> "Asynchronous. Data transfers using TMA are asynchronous. This allows the initiating thread to continue computing while the hardware asynchronously copies the data."
+
+关键的是 TMA 拷贝的异步性取决于硬件实现，未来可能发生变化：
+
+> "Whether the data transfer occurs asynchronously in practice is up to the hardware implementation and may change in the future."
+
+### 14.2.3 TMA 的优势总结
+
+| 特性 | 传统 memcpy_async | TMA |
+|------|------------------|-----|
+| 地址计算 | 由 CUDA Core 计算（占用寄存器） | 硬件自动生成（零开销） |
+| 维度支持 | 仅 1D | 1D ~ 5D |
+| 边界检查 | 手动实现 | 硬件自动处理（越界填充零） |
+| Swizzle | 不支持 | 硬件支持（4种模式） |
+| 多播 | 不支持 | 支持 Cluster 多播 |
+| 拷贝大小 | 4/8/16 字节对齐 | 16 字节对齐 |
+| 共享内存对齐 | 128 字节 | 128 字节（多维） |
+
+## 14.3 一维 TMA 拷贝
+
+一维 TMA 拷贝（bulk-asynchronous copy）不需要 Tensor Map，直接使用指针和大小参数。下面是从 CUDA Programming Guide 10.29.1 节提取的完整示例：
+
+### 14.3.1 一维 TMA 核函数
+
+```cuda
+#include <cuda/barrier>
+#include <cuda/experimental/__pipeline>   // 用于 cuda::device::experimental 下的 TMA 函数
+
+using barrier = cuda::barrier<cuda::thread_scope_block>;
+namespace cde = cuda::device::experimental;
+
+static constexpr size_t buf_len = 1024;
+__global__ void add_one_kernel(int* data, size_t offset)
+{
+    // 共享内存缓冲区——TMA 目标缓冲区需 16 字节对齐
+    __shared__ alignas(16) int smem_data[buf_len];
+
+    // 1. 初始化共享内存 barrier
+    __shared__ barrier bar;
+    if (threadIdx.x == 0) {
+        init(&bar, blockDim.x);                     // 参与线程数
+        cde::fence_proxy_async_shared_cta();        // 使 barrier 对 async proxy 可见
+    }
+    __syncthreads();
+
+    // 2. 发起 TMA 传输：global → shared
+    barrier::arrival_token token;
+    if (threadIdx.x == 0) {
+        // 使用 cp_async_bulk 一维拷贝（硬件隐式创建 1D Tensor Map）
+        cde::cp_async_bulk_global_to_shared(
+            smem_data,                     // 目标共享内存指针
+            data + offset,                 // 源全局内存指针
+            sizeof(smem_data)              // 拷贝字节数（16字节倍数）
+        );
+        // **显式**通知 mbarrier 本次传输的字节数
+        token = cde::barrier_arrive_tx(bar, 1, sizeof(smem_data));
+    } else {
+        token = bar.arrive();
+    }
+
+    // 3. 等待数据到达
+    bar.wait(std::move(token));
+
+    // 4. 计算：对共享内存数据加一
+    for (int i = threadIdx.x; i < buf_len; i += blockDim.x) {
+        smem_data[i] += 1;
+    }
+
+    // 5. 确保共享内存写入对 TMA 引擎可见
+    cde::fence_proxy_async_shared_cta();
+    __syncthreads();
+
+    // 6. 发起 TMA 传输：shared → global
+    if (threadIdx.x == 0) {
+        cde::cp_async_bulk_shared_to_global(
+            data + offset,                 // 目标全局内存指针
+            smem_data,                     // 源共享内存指针
+            sizeof(smem_data)              // 拷贝字节数
+        );
+        // 7. 等待写入完成（使用 bulk async-group）
+        cde::cp_async_bulk_commit_group();
+        cde::cp_async_bulk_wait_group_read<0>();
+    }
+}
+```
+
+### 14.3.2 步骤详解
+
+CUDA Programming Guide 对这个一维 TMA 核函数的每个步骤都有详细说明：
+
+<strong>Barrier 初始化</strong>：barrier 以参与线程数初始化。使用 `fence.proxy.async.shared::cta` 指令确保后续 bulk-asynchronous copy 操作看到的是已初始化的 barrier。
+
+<strong>TMA 读取</strong>：
+
+> "The bulk-asynchronous copy instruction directs the hardware to copy a large chunk of data into shared memory, and to update the transaction count of the shared memory barrier after completing the read. In general, issuing as few bulk copies with as big a size as possible results in the best performance. Because the copy can be performed asynchronously by the hardware, it is not necessary to split the copy into smaller chunks."
+
+关键理解：`cuda::memcpy_async` 在发起 TMA 拷贝时会自动调用 `mbarrier.expect_tx`，告诉 barrier 预期接收多少字节。barrier 只有在<strong>所有线程都到达 且 所有字节都已到达</strong>时才会翻转。
+
+<strong>SMEM 写入与同步</strong>：在共享内存上完成计算后，需要 `fence.proxy.async.shared::cta` + `__syncthreads()` 确保所有线程的写入在 async proxy 中排序到后续的 bulk 操作之前。
+
+<strong>TMA 写入与同步</strong>：从共享内存写回全局内存时，使用 `cp_async_bulk_commit_group` + `cp_async_bulk_wait_group_read` 的完成机制。这是<strong>线程局部</strong>的机制——只有发起线程需要等待。
+
+### 14.3.3 一维 TMA 对齐要求
+
+| 地址/大小 | 对齐要求 |
+|-----------|---------|
+| 全局内存地址 | 16 字节对齐 |
+| 共享内存地址 | 16 字节对齐 |
+| Barrier 地址 | 8 字节对齐（`cuda::barrier` 内部保证） |
+| 传输大小 | 16 字节的倍数 |
+
+## 14.4 多维 TMA 拷贝与 Tensor Map
+
+### 14.4.1 Tensor Map 的概念
+
+多维 TMA 拷贝需要一个 <strong>Tensor Map</strong>（张量映射）来描述全局内存中多维数组的布局。CUDA Programming Guide 指出：
+
+> "To perform a bulk tensor asynchronous copy of a multi-dimensional array, the hardware requires a tensor map. This object describes the layout of the multi-dimensional array in global and shared memory."
+
+Tensor Map 是一个 <strong>`CUtensorMap`</strong> 结构体，包含以下信息：
+- 数据指针（base address）
+- 各维度的尺寸（size）
+- 各维度的步长（stride，以字节为单位）
+- 共享内存缓冲区尺寸（box size）
+- 元素步长（element stride）
+- Swizzle 模式
+- L2 缓存策略
+- 越界填充模式
+
+### 14.4.2 主机端创建 Tensor Map
+
+Tensor Map 通过 CUDA Driver API 的 `cuTensorMapEncodeTiled` 函数创建：
+
+```cpp
+#include &lt;cudaTypedefs.h&gt; // PFN_cuTensorMapEncodeTiled, CUtensorMap
+
+// 通过 Driver Entry Point API 获取函数指针
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() {
+  cudaDriverEntryPointQueryResult driver_status;
+  void* cuTensorMapEncodeTiled_ptr = nullptr;
+  CUDA_CHECK(cudaGetDriverEntryPointByVersion(
+      "cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr,
+      12000, cudaEnableDefault, &driver_status));
+  assert(driver_status == cudaDriverEntryPointSuccess);
+  return reinterpret_cast&lt;PFN_cuTensorMapEncodeTiled_v12000&gt;(
+      cuTensorMapEncodeTiled_ptr);
+}
+
+// 创建 2D Tensor Map
+CUtensorMap tensor_map{};
+constexpr uint32_t rank = 2;
+uint64_t size[rank]   = {GMEM_WIDTH, GMEM_HEIGHT};
+// stride 是从一行移动到下一行所需的字节数，必须是 16 的倍数
+uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
+// box_size 是共享内存缓冲区的大小
+uint32_t box_size[rank]   = {SMEM_WIDTH, SMEM_HEIGHT};
+uint32_t elem_stride[rank] = {1, 1};
+
+auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();
+
+CUresult res = cuTensorMapEncodeTiled(
+    &tensor_map,
+    CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
+    rank,                       // 张量维度
+    tensor_ptr,                 // 全局内存基地址
+    size,                       // 全局内存各维度尺寸
+    stride,                     // 全局内存各维度步长（字节）
+    box_size,                   // 共享内存 box 尺寸
+    elem_stride,                // 元素步长
+    CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
+    CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
+    CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
+    CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
+);
+```
+
+`cuTensorMapEncodeTiled` 的参数含义：
+- <strong>tensorRank</strong>：张量的维度数（1~5）
+- <strong>globalAddress</strong>：全局内存中的起始地址
+- <strong>globalDim</strong>：各维度的元素数量
+- <strong>globalStrides</strong>：各维度的步长（字节），注意 <strong>最快移动维度不需要 stride</strong>
+- <strong>boxDim</strong>：共享内存 box 各维度的尺寸
+- <strong>elementStrides</strong>：元素间的步长（以 sizeof(element) 为单位）
+- <strong>Interleave</strong>：交错模式（用于加载小于 4 字节的元素）
+- <strong>Swizzle</strong>：Swizzle 模式
+- <strong>L2promotion</strong>：L2 缓存策略
+- <strong>OOBfill</strong>：越界填充模式
+
+### 14.4.3 传递 Tensor Map 到设备
+
+CUDA Programming Guide 推荐三种方式将 Tensor Map 传递给核函数：
+
+<strong>方式一：const __grid_constant__ 参数（推荐）</strong>
+
+```cuda
+#include &lt;cuda.h&gt;
+
+__global__ void kernel(const __grid_constant__ CUtensorMap tensor_map)
+{
+   // 使用 tensor_map
+}
+int main() {
+  CUtensorMap map;
+  // [ ..初始化 map.. ]
+  kernel&lt;&lt;&lt;1, 1&gt;&gt;&gt;(map);
+}
+```
+
+`__grid_constant__` 注解告诉编译器这个参数在整个 grid 执行期间不变，可以被放入特殊的常量缓存中，从而获得更快的访问速度。
+
+<strong>方式二：__constant__ 变量</strong>
+
+```cuda
+__constant__ CUtensorMap global_tensor_map;
+__global__ void kernel()
+{
+  // 使用 global_tensor_map
+}
+int main() {
+  CUtensorMap local_tensor_map;
+  // [ ..初始化 map.. ]
+  cudaMemcpyToSymbol(global_tensor_map, &local_tensor_map,
+                     sizeof(CUtensorMap));
+  kernel&lt;&lt;&lt;1, 1&gt;&gt;&gt;();
+}
+```
+
+<strong>方式三：全局内存（最灵活但最慢）</strong>
+
+```cuda
+#include &lt;cuda/ptx&gt;
+namespace ptx = cuda::ptx;
+
+__device__ CUtensorMap global_tensor_map;
+__global__ void kernel(CUtensorMap *tensor_map)
+{
+  // Fence acquire tensor map（因为 tensor map 可能被 host 修改）
+  ptx::fence_proxy_tensormap_generic(
+     ptx::sem_acquire, ptx::scope_sys, tensor_map,
+     ptx::n32_t&lt;128&gt;());
+  // 在 fence 之后安全使用 tensor_map
+}
+```
+
+### 14.4.4 二维 TMA 核函数
+
+以下是从 CUDA Programming Guide 10.29.2 节提取的 2D TMA 示例：
+
+```cuda
+#include <cuda.h>         // CUtensorMap
+#include <cuda/barrier>
+using barrier = cuda::barrier<cuda::thread_scope_block>;
+namespace cde = cuda::device::experimental;
+
+__global__ void kernel(const __grid_constant__ CUtensorMap tensor_map,
+                       int x, int y) {
+  // 多维 TMA 操作的共享内存目标缓冲区需要 128 字节对齐
+  __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH];
+
+  // 初始化 barrier
+  #pragma nv_diag_suppress static_var_with_dynamic_init
+  __shared__ barrier bar;
+  if (threadIdx.x == 0) {
+    init(&bar, blockDim.x);
+    // 使初始化的 barrier 对 async proxy 可见
+    cde::fence_proxy_async_shared_cta();
+  }
+  __syncthreads();
+
+  barrier::arrival_token token;
+  if (threadIdx.x == 0) {
+    // 发起二维批量张量拷贝：global → shared
+    // 硬件自动向 barrier 提交预期事务计数（tile 总字节数）
+    cde::cp_async_bulk_tensor_2d_global_to_shared(
+        &smem_buffer, &tensor_map, x, y, bar);
+    // 发起线程只需普通 arrive，不要调用 barrier_arrive_tx
+    token = bar.arrive();
+  } else {
+    // 其他线程仅到达 barrier
+    token = bar.arrive();
+  }
+  // 等待数据到达
+  bar.wait(std::move(token));
+
+  // 对共享内存数据做计算（示例：修改第一个元素）
+  smem_buffer[0][threadIdx.x] += threadIdx.x;
+
+  // 等待共享内存写入对 TMA 引擎可见
+  cde::fence_proxy_async_shared_cta();
+  __syncthreads();
+
+  // 发起 TMA 传输：shared → global
+  if (threadIdx.x == 0) {
+    cde::cp_async_bulk_tensor_2d_shared_to_global(
+        &tensor_map, x, y, &smem_buffer);
+    // 创建 bulk async-group
+    cde::cp_async_bulk_commit_group();
+    // 等待 group 完成读取共享内存
+    cde::cp_async_bulk_wait_group_read<0>();
+  }
+
+  // 销毁 barrier，释放共享内存
+  if (threadIdx.x == 0) {
+    (&bar)->~barrier();
+  }
+}
+```
+
+### 14.4.5 多维 TMA 的越界处理
+
+TMA 的一个重要特性是硬件级别的越界处理：
+
+> "Negative indices and out of bounds. When part of the tile that is being read from global to shared memory is out of bounds, the shared memory that corresponds to the out of bounds area is zero-filled. The top-left corner indices of the tile may also be negative."
+
+从全局内存<strong>读取</strong>时：越界区域自动填充零，且左上角坐标可以为负数。
+
+从共享内存<strong>写入</strong>全局内存时：部分 tile 可以越界，但左上角坐标不能为负数。
+
+### 14.4.6 尺寸与步长约定
+
+CUDA Programming Guide 说明了尺寸和步长的约定：
+
+> "The size of a tensor is the number of elements along one dimension. All sizes must be greater than one. The stride is the number of bytes between elements of the same dimension."
+
+例如，一个 4x4 的整数矩阵：
+- 尺寸：4 和 4
+- 步长：4 字节（行内）和 16 字节（行间，4 x 4 bytes）
+
+对于 4x3 的行优先整数矩阵，由于对齐要求：
+- 步长：4 字节和 16 字节（padding 到 16 字节对齐）
+
+### 14.4.7 多维 TMA 对齐要求
+
+| 地址/尺寸 | 对齐要求 |
+|-----------|---------|
+| 全局内存地址 | 16 字节对齐 |
+| 全局内存尺寸 | >= 1，不需要是 16 字节的倍数 |
+| 全局内存步长 | 必须是 16 字节的倍数 |
+| 共享内存地址 | 128 字节对齐 |
+| Barrier 地址 | 8 字节对齐（`cuda::barrier` 保证） |
+| 传输大小 | 16 字节的倍数 |
+
+## 14.5 TMA 完成机制
+
+TMA 操作的不同方向对应不同的完成机制。CUDA Programming Guide 的 Table 8 清晰总结了这些规则。
+
+### 14.5.1 方向与完成机制对照表
+
+| 目标 | 源 | 异步拷贝 | Bulk 异步拷贝 (TMA) |
+|------|-----|---------|-------------------|
+| Global | Global | -- | -- |
+| Global | Shared::cta | -- | <strong>Bulk async-group</strong> |
+| Shared::cta | Global | Async-group, mbarrier | <strong>Mbarrier</strong> |
+| Shared::cluster | Global | -- | <strong>Mbarrier (multicast)</strong> |
+| Shared::cta | Shared::cluster | -- | <strong>Mbarrier</strong> |
+| Shared::cta | Shared::cta | -- | -- |
+
+### 14.5.2 Shared Memory Barrier (mbarrier)
+
+当 TMA 从全局内存<strong>读取</strong>到共享内存时，使用 <strong>Shared Memory Barrier (mbarrier)</strong> 作为完成机制。
+
+工作流程：
+1. 单个线程发起 `cp.async.bulk.tensor` 指令；
+2. 该指令自动更新 barrier 的 <strong>transaction count</strong>（预期接收的字节数）；
+3. 所有线程调用 `bar.arrive()`，或发起线程通过 `barrier_arrive_tx` 到达；
+4. 当所有线程到达 且 所有字节到达时，barrier 翻转；
+5. `bar.wait()` 返回后，数据在共享内存中可读。
+
+### 14.5.3 Bulk Async-Group
+
+当 TMA 从共享内存<strong>写入</strong>到全局内存（或分布式共享内存）时，使用 <strong>bulk async-group</strong> 作为完成机制。
+
+工作流程：
+1. 单个线程发起 `cp.async.bulk` 或 `cp.async.bulk.tensor` 指令；
+2. 该线程将操作提交到一个线程局部的 bulk async-group：`cp_async_bulk_commit_group()`；
+3. 该线程等待 group 完成：`cp_async_bulk_wait_group_read&lt;N&gt;()`。
+
+注意：bulk async-group 是<strong>线程局部</strong>的——只有发起线程可以等待。这与 mbarrier 的块内多线程协同不同。
+
+### 14.5.4 TMA 多播（Multicast）
+
+在 Thread Block Cluster 中，TMA 支持<strong>多播</strong>——一次拷贝可以将数据同时广播到簇中多个线程块的共享内存。
+
+CUDA Programming Guide 指出：
+
+> "In addition, when in a cluster, a bulk-asynchronous operation can be specified as being multicast. In this case, data can be transferred from global memory to the shared memory of multiple blocks within the cluster. The multicast feature is optimized for target architecture `sm_90a` and may have significantly reduced performance on other targets."
+
+多播的方向：Global → Shared::cluster，完成机制为 Mbarrier。
+
+## 14.6 TMA Swizzle 模式
+
+### 14.6.1 为什么需要 Swizzle
+
+CUDA Programming Guide 解释了 Swizzle 的动机：
+
+> "By default, the TMA engine loads data to shared memory in the same order as it is laid out in global memory. However, this layout may not be optimal for certain shared memory access patterns, as it could cause shared memory bank conflicts."
+
+回忆一下共享内存 Bank 的组织方式：32 个 bank，每个连续 4 字节映射到连续的 bank。如果多个线程同时访问同一个 bank，就会产生 bank conflict。
+
+Swizzle 的核心思想是：<strong>让 TMA 硬件在写入共享内存时重新排列数据，以消除后续访问中的 bank conflict</strong>。
+
+> "To ensure that data is laid out in shared memory in such a way that user code can avoid shared memory bank conflicts, the TMA engine can be instructed to 'swizzle' the data before storing it in shared memory and 'unswizzle' it when copying the data back from shared memory to global memory."
+
+### 14.6.2 矩阵转置案例
+
+以下案例来自 CUDA Programming Guide 10.29.3.1：
+
+<strong>问题</strong>：一个 8x8 的 `int4` 矩阵（行优先存储）从全局内存加载到共享内存。8 个线程将每行加载到转置缓冲区的对应列。在普通布局下，列方向存储会导致 8-way bank conflict。
+
+<strong>解决方案</strong>：使用 `CU_TENSOR_MAP_SWIZZLE_128B` Swizzle 模式。该模式按 128 字节行为单位进行重排，使得行和列方向的访问都不会产生 bank conflict。
+
+```cuda
+__global__ void kernel_tma(const __grid_constant__ CUtensorMap tensor_map) {
+   // 使用 128 字节 swizzle 模式时，共享内存需要 1024 字节对齐
+   __shared__ alignas(1024) int4 smem_buffer[8][8];
+   __shared__ alignas(1024) int4 smem_buffer_tr[8][8];
+
+   // 初始化 barrier
+   #pragma nv_diag_suppress static_var_with_dynamic_init
+   __shared__ barrier bar;
+   if (threadIdx.x == 0) {
+     init(&bar, blockDim.x);
+     cde::fence_proxy_async_shared_cta();
+   }
+   __syncthreads();
+
+   barrier::arrival_token token;
+   if (threadIdx.x == 0) {
+     // 发起 TMA 拷贝（使用 swizzled tensor map）
+     cde::cp_async_bulk_tensor_2d_global_to_shared(
+         &smem_buffer, &tensor_map, 0, 0, bar);
+     token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buffer));
+   } else {
+     token = bar.arrive();
+   }
+   bar.wait(std::move(token));
+
+   /* 矩阵转置
+    * 使用普通布局时，存储到转置缓冲区有 8 路 bank conflict
+    * 使用 128 字节 swizzle 模式时，行和列访问都消除了 bank conflict */
+   for(int sidx_j = threadIdx.x; sidx_j < 8; sidx_j += blockDim.x){
+       // 转置操作（省略具体索引计算）
+   }
+
+   // 写回全局内存
+   cde::fence_proxy_async_shared_cta();
+   __syncthreads();
+   if (threadIdx.x == 0) {
+      cde::cp_async_bulk_tensor_2d_shared_to_global(
+          &tensor_map, 0, 0, &smem_buffer_tr);
+      cde::cp_async_bulk_commit_group();
+      cde::cp_async_bulk_wait_group_read&lt;0&gt;();
+   }
+}
+```
+
+### 14.6.3 Swizzle 模式一览
+
+CUDA Programming Guide 定义了四种 Swizzle 模式：
+
+| 模式 | Swizzle 宽度 | Box 内维度要求 | 重复周期 | 共享内存对齐 | 全局内存对齐 |
+|------|-------------|--------------|---------|------------|------------|
+| `CU_TENSOR_MAP_SWIZZLE_NONE` | -- | -- | -- | 128 字节 | 16 字节 |
+| `CU_TENSOR_MAP_SWIZZLE_32B` | 32 字节 | &lt;= 32 字节 | 256 字节 | 128 字节 | 128 字节 |
+| `CU_TENSOR_MAP_SWIZZLE_64B` | 64 字节 | &lt;= 64 字节 | 512 字节 | 128 字节 | 128 字节 |
+| `CU_TENSOR_MAP_SWIZZLE_128B` | 128 字节 | &lt;= 128 字节 | 1024 字节 | 128 字节 | 128 字节 |
+
+### 14.6.4 Swizzle 索引计算
+
+当共享内存缓冲区不是完全对齐到 Swizzle 模式周期时，需要计算偏移量：
+
+```cpp
+// 以 128B Swizzle 模式为例
+data_t* smem_ptr = &smem[0][0];
+int offset = (reinterpret_cast&lt;uintptr_t&gt;(smem_ptr)/128)%8;
+// 访问 swizzled 共享内存：
+smem[y][((y+offset)%8)^x] = ...;
+```
+
+CUDA Programming Guide 提供了各模式的偏移公式：
+
+| Swizzle 模式 | 偏移公式 | 索引关系 |
+|-------------|---------|---------|
+| `CU_TENSOR_MAP_SWIZZLE_128B` | `(smem_ptr/128)%8` | `smem[y][((y+offset)%8)^x]` |
+| `CU_TENSOR_MAP_SWIZZLE_64B` | `(smem_ptr/128)%4` | `smem[y][((y+offset)%4)^x]` |
+| `CU_TENSOR_MAP_SWIZZLE_32B` | `(smem_ptr/128)%2` | `smem[y][((y+offset)%2)^x]` |
+
+### 14.6.5 Swizzle 使用注意事项
+
+CUDA Programming Guide 列出了以下关键要求：
+
+> - Global memory must be aligned to 128 bytes.
+> - Shared memory should be aligned according to the number of bytes after which the swizzle pattern repeats.
+> - The inner dimension of the shared memory block must meet the size requirements.
+> - The granularity of swizzle mapping is fixed at 16 bytes.
+
+## 14.7 设备端 Tensor Map 编码
+
+### 14.7.1 为什么需要设备端编码
+
+CUDA Programming Guide (10.30) 介绍了在设备端编码 Tensor Map 的能力：
+
+> "This section explains how to encode a tiled-type tensor map on device. This is useful in situations where the typical way of transferring the tensor map (using `const __grid_constant__` kernel parameters) is undesirable, for instance, when processing a batch of tensors of various sizes in a single kernel launch."
+
+简单来说，当你需要在一个 kernel launch 中处理不同大小/形状的张量批次时，手动为每个张量在主机端创建 Tensor Map 传递是不现实的。设备端编码允许你在 GPU 上动态创建或修改 Tensor Map。
+
+### 14.7.2 推荐模式
+
+CUDA Programming Guide 推荐的三步模式：
+
+> 1. Create a tensor map "template", `template_tensor_map`, using the Driver API on the host.
+> 2. In a device kernel, copy the `template_tensor_map`, modify the copy, store in global memory, and appropriately fence.
+> 3. Use the tensor map in a kernel with appropriate fencing.
+
+```cpp
+// 1. 主机端创建模板 tensor map
+CUtensorMap template_tensor_map = make_tensormap_template();
+
+// 2. 分配全局内存存放 tensor map
+CUtensorMap* global_tensor_map;
+cudaMalloc(&global_tensor_map, sizeof(CUtensorMap));
+
+// 3. 设备端编码（在 kernel 中修改模板）
+tensormap_params p{};
+p.global_address = global_buf;
+p.rank = 2;
+p.box_dim[0] = 128; p.box_dim[1] = 4;
+p.global_dim[0] = 256; p.global_dim[1] = 8;
+// ...
+
+encode_tensor_map&lt;&lt;&lt;1, 32&gt;&gt;&gt;(
+    template_tensor_map, p, global_tensor_map);
+
+// 4. 使用编码后的 tensor map
+consume_tensor_map&lt;&lt;&lt;1, 1&gt;&gt;&gt;(global_tensor_map);
+```
+
+### 14.7.3 tensormap.replace PTX 指令
+
+设备端修改 Tensor Map 通过 `tensormap.replace` PTX 指令实现，该指令可以修改 tiled-type tensor map 的任何字段，包括基地址、尺寸、步长等。这些功能通过 `cuda::ptx::tensormap_replace` 函数暴露。
+
+### 14.7.4 相应同步要求
+
+使用设备端编码的 Tensor Map 时需要注意同步：
+
+1. 编码 kernel 和消费 kernel 之间需要适当的 fence（如 `fence_proxy_tensormap_generic`）确保 Tensor Map 的修改对 TMA 引擎可见。
+
+2. 在消费 kernel 中，首次使用 Tensor Map 前需要 acquire fence。
+
+## 14.8 多维 TMA 坐标与 Tile 加载
+
+### 14.8.1 Tile 坐标系统
+
+当使用多维 TMA 加载一个 tile 时，需要指定 tile 在全局数组中的"锚点"坐标（top-left corner）。TMA 硬件根据 Tensor Map 中描述的各维度尺寸自动生成 tile 中每个元素的全局地址。
+
+例如，对于一个 256x256 的 2D 全局数组，如果要加载一个 16x16 的 tile，坐标为 (32, 64)：
+
+```
+全局数组 (256 x 256):
+     0    16   32   48   ...  255
+  0  +----+----+----+----+-----+
+     |    |    |    |    |     |
+ 16  +----+----+----+----+-----+
+     |    |    |    |    |     |
+ 32  +----+----+====+====+-----+
+     |    |    |Tile|    |     |
+ 48  +----+----+====+====+-----+
+     |    |    |    |    |     |
+ ... |    |    |    |    |     |
+255  +----+----+----+----+-----+
+
+Tile 大小为 16x16，锚点为 (32, 64)
+从 row=64, col=32 开始加载 16 行，每行 16 个元素
+```
+
+### 14.8.2 多维 TMA 的地址计算
+
+TMA 硬件使用以下信息自动计算地址：
+1. `base_address`（来自 Tensor Map 的 globalAddress 字段）
+2. `global_dim[0..rank-1]`：各维度的尺寸
+3. `global_stride[0..rank-2]`：除最快维度外各维度的步长（字节）
+4. `box_dim[0..rank-1]`：要加载的 tile 尺寸
+5. 锚点坐标 `(c0, c1, ..., c_{rank-1})`
+
+对于 2D 数组，全局内存地址 = `base_address + c1 * stride[0] + c0 * element_size`
+
+### 14.8.3 批量处理多个 Tile
+
+在一个 kernel launch 中处理 2D 数组的多个 tile 的典型模式：
+
+```cuda
+__global__ void process_tiles(
+    const __grid_constant__ CUtensorMap tensor_map,
+    float *output, int num_tiles_x, int num_tiles_y)
+{
+    // 每个线程块处理一个 tile
+    int tile_idx_x = blockIdx.x % num_tiles_x;
+    int tile_idx_y = blockIdx.x / num_tiles_x;
+
+    int global_x = tile_idx_x * SMEM_WIDTH;
+    int global_y = tile_idx_y * SMEM_HEIGHT;
+
+    // 检查边界
+    if (global_x >= GMEM_WIDTH || global_y >= GMEM_HEIGHT) return;
+
+    // 剩余的 TMA 加载-处理-写回流程与之前相同
+    // ...
+}
+```
+
+### 14.8.4 负坐标与越界处理
+
+TMA 的一个独特能力是支持负的左上角坐标以处理边界条件（如卷积中的 padding）：
+
+- <strong>读取时</strong>：左上角坐标可以为负数，越界部分自动填充零
+- <strong>写入时</strong>：左上角坐标不能为负数，但部分 tile 可以越界（越界部分被静默丢弃）
+
+CUDA Programming Guide 原文：
+
+> "When part of the tile that is being read from global to shared memory is out of bounds, the shared memory that corresponds to the out of bounds area is zero-filled."
+
+## 14.9 TMA 多播深度分析
+
+### 14.9.1 多播工作流程
+
+TMA 多播允许一次数据读取将相同数据广播到 Cluster 中多个块的共享内存。这在以下场景中极为有用：
+- <strong>卷积权重广播</strong>：将相同的 filter 权重加载到所有块的共享内存；
+- <strong>归约输入广播</strong>：将相同的输入数据发送给多个块进行并行归约；
+- <strong>查找表分发</strong>：将索引表广播给所有块。
+
+```cuda
+// 使用 TMA 多播将相同 tile 加载到 Cluster 中所有块的共享内存
+__global__ void multicast_kernel(
+    const __grid_constant__ CUtensorMap tensor_map)
+{
+    __shared__ alignas(128) float smem_buffer[SMEM_HEIGHT][SMEM_WIDTH];
+
+    // 初始化 barrier
+    #pragma nv_diag_suppress static_var_with_dynamic_init
+    __shared__ barrier bar;
+    if (threadIdx.x == 0) {
+        init(&bar, blockDim.x);
+        cde::fence_proxy_async_shared_cta();
+    }
+    __syncthreads();
+
+    // 多播 TMA 拷贝：全局 -> 簇中所有块的共享内存
+    if (threadIdx.x == 0) {
+        cde::cp_async_bulk_tensor_2d_global_to_shared(
+            &smem_buffer, &tensor_map, x, y, bar);
+        // 可选：添加 multicast 标记
+        token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buffer));
+    } else {
+        token = bar.arrive();
+    }
+    bar.wait(std::move(token));
+
+    // 所有块现在有相同的数据
+}
+```
+
+### 14.9.2 多播的限制
+
+- 多播使用共享内存 barrier 作为完成机制；
+- 所有参与块必须属于同一个 Cluster；
+- 在 `sm_90a` 目标上性能最佳；
+- 多播的总数据量不能超过共享内存 barrier 的事务计数能力。
+
+## 14.10 TMA Swizzle 进阶
+
+### 14.10.1 理解 Swizzle 的物理含义
+
+TMA Swizzle 本质上是在 TMA 硬件写入共享内存时，按照预定义的模式重新排列数据在共享内存中的物理位置。这个重排是透明的——写回全局内存时 TMA 会自动"反 Swizzle"。
+
+以 `CU_TENSOR_MAP_SWIZZLE_128B` 为例：
+- Swizzle 以 128 字节（即 32 个 int 元素）为宽度
+- 数据被重新排列到 8 个子组（每组 4 个 bank）
+- 每 1024 字节重复一次 Swizzle 模式
+
+### 14.10.2 选择正确的 Swizzle 模式
+
+选择 Swizzle 模式的依据：
+
+1. <strong>无 Swizzle</strong>：访问模式与全局内存布局相同（无 bank conflict 时不需要）
+2. <strong>32 字节 Swizzle</strong>：适用于内维度 &lt;= 32 字节的小 tile（例如 8 个 float）
+3. <strong>64 字节 Swizzle</strong>：适用于内维度 &lt;= 64 字节的中等 tile（例如 16 个 float）
+4. <strong>128 字节 Swizzle</strong>：适用于内维度 &lt;= 128 字节的较大 tile（例如 32 个 float 或 8 个 int4）
+
+### 14.10.3 Swizzle 偏移的运行时计算
+
+```cuda
+__global__ void swizzled_access(const __grid_constant__ CUtensorMap tensor_map)
+{
+    __shared__ alignas(1024) int4 smem[8][8]; // 128B swizzle 要求 1024 字节对齐
+
+    // 计算偏移量
+    int4 *smem_ptr = &smem[0][0];
+    int offset = (reinterpret_cast<uintptr_t>(smem_ptr) / 128) % 8;
+
+    // 使用正确的索引访问 Swizzled 数据
+    // 对于 128B Swizzle：
+    // smem[y][x] 的实际物理位置 -> smem[y][((y+offset)%8)^x]
+    for (int y = 0; y < 8; y++) {
+        for (int x = 0; x < 8; x++) {
+            int swizzled_x = ((y + offset) % 8) ^ x;
+            int4 val = smem[y][swizzled_x];
+            // ... 使用 val
+        }
+    }
+
+    // 写回时同样需要正确的索引
+    for (int y = 0; y < 8; y++) {
+        for (int x = 0; x < 8; x++) {
+            int swizzled_x = ((y + offset) % 8) ^ x;
+            smem[y][swizzled_x] = compute(val);
+        }
+    }
+}
+```
+
+### 14.10.4 Bank Conflict 对比
+
+对于 8x8 int4 矩阵的转置操作：
+
+| 访问模式 | 无 Swizzle | 128B Swizzle |
+|---------|-----------|-------------|
+| 行读取 | 0 bank conflicts | 0 bank conflicts |
+| 列写入 | 8-way bank conflict（串行化） | 0 bank conflicts |
+| 总存储事务 | 64 (8x8) | 8 (1x8) |
+
+对于列写入场景，128B Swizzle 将 64 个存储事务减少到 8 个，理论加速比高达 8x。
+
+## 14.11 TMA 与 Pipeline 的结合
+
+TMA 可以与第 13 章介绍的 `cuda::pipeline` 无缝结合，实现多维数据的异步流水线处理：
+
+```cuda
+// 伪代码：TMA + Pipeline 的多阶段流水线
+__global__ void tma_pipeline_kernel(
+    const __grid_constant__ CUtensorMap tensor_map,
+    float *output)
+{
+    extern __shared__ float shared_buf[];
+    // 分配 N 个阶段的共享内存缓冲区
+
+    // 创建 pipeline 和 barrier
+    __shared__ cuda::pipeline_shared_state&lt;
+        cuda::thread_scope::thread_scope_block, stages&gt; pipe_state;
+    auto pipe = cuda::make_pipeline(block, &pipe_state);
+
+    // 预热流水线
+    for (int s = 0; s < stages - 1; s++) {
+        pipe.producer_acquire();
+        // 使用 TMA 加载第 s 个 tile
+        cde::cp_async_bulk_tensor_2d_global_to_shared(
+            &shared_buf[s * tile_size], &tensor_map,
+            tile_x[s], tile_y[s], bar);
+        pipe.producer_commit();
+    }
+
+    // 稳态流水线
+    for (int i = 0; i < num_tiles; i++) {
+        int stage = i % stages;
+
+        // 生产：加载下一个 tile
+        pipe.producer_acquire();
+        int next = i + stages - 1;
+        if (next < num_tiles) {
+            cde::cp_async_bulk_tensor_2d_global_to_shared(
+                &shared_buf[stage * tile_size], &tensor_map,
+                tile_x[next], tile_y[next], bar);
+        }
+        pipe.producer_commit();
+
+        // 消费：处理当前 tile
+        pipe.consumer_wait();
+        compute(&shared_buf[stage * tile_size], output + i * tile_size);
+        pipe.consumer_release();
+    }
+}
+```
+
+### TMA vs 手动 memcpy_async 性能对比
+
+| 维度 | 手动 memcpy_async | TMA |
+|------|-----------------|-----|
+| <strong>地址生成</strong> | 占用寄存器，需手动计算 | 硬件自动生成，零寄存器压力 |
+| <strong>1D 连续拷贝</strong> | 性能接近 TMA | 性能接近 memcpy_async |
+| <strong>多维分块拷贝</strong> | 需逐个元素或逐行手动拷贝 | 单指令完成整个 tile |
+| <strong>Bank conflict</strong> | 手动处理（难） | 硬件 Swizzle 自动消除 |
+| <strong>越界处理</strong> | 手动条件判断 | 硬件自动零填充 |
+| <strong>多播</strong> | 不支持 | 支持 Cluster 多播 |
+| <strong>编程复杂度</strong> | 中等 | 较高（需 Tensor Map） |
+| <strong>CC 最低要求</strong> | CC 7.0+ | CC 9.0+ |
+
+对于简单的 1D 连续数据拷贝，TMA 和 `memcpy_async` 性能接近。但当涉及多维分块、不规则步长、或需要消除 bank conflict 的场景时，TMA 的优势就非常显著——因为它将复杂的地址计算和布局转换卸载到了专用硬件。
+
+## 14.12 TMA 的限制与兼容性
+
+### 14.12.1 硬件限制
+
+1. <strong>CC 9.0+ 独占</strong>：TMA 是 Hopper 架构的特性，无法在旧硬件上使用；
+2. <strong>共享内存对齐严格</strong>：多维 TMA 要求共享内存 128 字节对齐，Swizzle 模式下要求 1024 字节对齐；
+3. <strong>异步性不保证</strong>：CUDA Programming Guide 明确指出 TMA 传输的异步性取决于硬件实现；
+4. <strong>多播性能</strong>：`sm_90a` 目标上优化最佳，其他目标可能性能显著下降。
+
+### 14.12.2 编程限制
+
+1. Tensor Map 的创建需要 Driver API（`cuTensorMapEncodeTiled`），不能完全在 Runtime API 中完成；
+2. 设备端编码依赖 `tensormap.replace` PTX 指令，API 封装尚在 experimental 阶段；
+3. 调试困难：TMA 是硬件黑盒，无法像手动 `memcpy_async` 那样直接跟踪数据流；
+4. 编译器要求：需要 `nvcc -arch=sm_90` 或更高。
+
+### 14.12.3 兼容性
+
+- TMA 代码在 CC < 9.0 的设备上<strong>无法运行</strong>；
+- 可以在编译时检查：
+
+```cuda
+#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
+static_assert(false,
+    "Device code compiled with older architectures incompatible with TMA.");
+#endif
+```
+
+- 可移植代码应提供回退路径：
+
+```cuda
+#if __CUDA_ARCH__ >= 900
+    // TMA 路径
+    cde::cp_async_bulk_tensor_2d_global_to_shared(...)
+#else
+    // 回退路径：使用 memcpy_async 手动逐行拷贝
+    for (int row = 0; row < SMEM_HEIGHT; row++) {
+        cuda::memcpy_async(block,
+            &smem[row * SMEM_WIDTH],
+            &global[global_y + row) * GMEM_WIDTH + global_x],
+            sizeof(float) * SMEM_WIDTH, pipe);
+    }
+#endif
+```
+
+## 14.13 TMA 实际应用场景
+
+### 14.13.1 深度学习卷积
+
+在深度学习框架中，卷积操作的 im2col 或直接卷积实现是 TMA 的典型应用：
+
+```cuda
+// 使用 TMA 加载卷积的 3D 输入 tile
+__global__ void conv3d_tma(
+    const __grid_constant__ CUtensorMap input_map,
+    const __grid_constant__ CUtensorMap filter_map,
+    float *output)
+{
+    // 输入维度: [C, H, W]
+    // 每个块加载一个 3D tile: [C_TILE, H_TILE, W_TILE]
+    __shared__ alignas(128) float smem_input[C_TILE][H_TILE][W_TILE];
+    __shared__ alignas(128) float smem_filter[C_TILE][K_TILE][R][S];
+
+    // 加载输入 tile
+    cde::cp_async_bulk_tensor_3d_global_to_shared(
+        &smem_input, &input_map, c0, h0, w0, bar);
+
+    // 加载 filter tile
+    cde::cp_async_bulk_tensor_4d_global_to_shared(
+        &smem_filter, &filter_map, k0, c0, 0, 0, bar);
+
+    // 等待数据就绪，然后执行卷积计算...
+}
+```
+
+### 14.13.2 矩阵乘法分块
+
+```cuda
+__global__ void gemm_tma(
+    const __grid_constant__ CUtensorMap A_map,
+    const __grid_constant__ CUtensorMap B_map,
+    float *C, int M, int N, int K)
+{
+    // 每个块计算一个 tile: C[TILE_M x TILE_N]
+    __shared__ alignas(128) float As[TILE_M][TILE_K];
+    __shared__ alignas(128) float Bs[TILE_K][TILE_N];
+
+    int block_m = blockIdx.y * TILE_M;
+    int block_n = blockIdx.x * TILE_N;
+
+    float accum[TILE_M][TILE_N] = {0.0f};
+
+    for (int k = 0; k < K; k += TILE_K) {
+        // TMA 加载 A 的 tile: [block_m:block_m+TILE_M, k:k+TILE_K]
+        cde::cp_async_bulk_tensor_2d_global_to_shared(
+            &As, &A_map, block_m, k, bar);
+        // TMA 加载 B 的 tile: [k:k+TILE_K, block_n:block_n+TILE_N]
+        cde::cp_async_bulk_tensor_2d_global_to_shared(
+            &Bs, &B_map, k, block_n, bar);
+
+        barrier_arrive_tx_and_wait();
+
+        // 计算 C += A * B (手动或通过 Tensor Core)
+        for (int i = 0; i < TILE_M; i++) {
+            for (int j = 0; j < TILE_N; j++) {
+                for (int kk = 0; kk < TILE_K; kk++) {
+                    accum[i][j] += As[i][kk] * Bs[kk][j];
+                }
+            }
+        }
+    }
+
+    // 写回 C
+    for (int i = 0; i < TILE_M; i++) {
+        for (int j = 0; j < TILE_N; j++) {
+            C[(block_m + i) * N + (block_n + j)] = accum[i][j];
+        }
+    }
+}
+```
+
+### 14.13.3 图像处理中的边界处理
+
+利用 TMA 的越界零填充机制，可以简化图像处理中的边界处理：
+
+```cuda
+__global__ void convolution_with_padding(
+    const __grid_constant__ CUtensorMap input_map,
+    float *output, int width, int height, int kernel_size)
+{
+    int pad = kernel_size / 2;
+
+    // 故意将 tile 坐标延伸到负值——TMA 自动填充零
+    int x_start = blockIdx.x * TILE_W - pad;
+    int y_start = blockIdx.y * TILE_H - pad;
+
+    // TMA 自动处理越界，越界区域填充零
+    cde::cp_async_bulk_tensor_2d_global_to_shared(
+        &smem, &input_map, x_start, y_start, bar);
+}
+```
+
+## 14.14 TMA 性能调优指南
+
+### 14.14.1 拷贝大小的选择
+
+TMA 的最佳拷贝大小取决于多个因素：
+
+| 拷贝大小 | 延迟 | 吞吐量 | 适用场景 |
+|---------|------|-------|---------|
+| 小 (&lt; 512B) | 低 | 低 | 细粒度 tile |
+| 中 (512B - 4KB) | 中等 | 中等 | 典型 tile |
+| 大 (&gt; 4KB) | 高 | 高 | 大 tile、整个行 |
+
+指导原则：
+
+> "In general, issuing as few bulk copies with as big a size as possible results in the best performance."
+
+### 14.14.2 对齐的严格性
+
+下表总结了不同 TMA 模式的对齐要求严重程度：
+
+| 未满足的对齐 | 后果 |
+|------------|------|
+| 全局内存 16B 对齐 | <strong>硬错误</strong>（未定义行为） |
+| 共享内存 128B 对齐 | <strong>硬错误</strong>（未定义行为） |
+| 步长 16B 倍数 | <strong>硬错误</strong>（未定义行为） |
+| Swizzle 共享内存 1024B 对齐 | <strong>硬错误</strong>（128B Swizzle 模式） |
+
+### 14.14.3 使用 Nsight Compute 分析 TMA 性能
+
+NVIDIA Nsight Compute 提供 TMA 相关指标：
+
+- `smsp__inst_executed_pipe_tensor_op_hmma`：Tensor pipe 指令执行数
+- `l1tex__t_sectors_pipe_lsu_mem_global_op_ld`：全局加载 sector 数
+- `l1tex__t_sectors_pipe_lsu_mem_local_op_st`：共享内存存储 sector 数
+
+通过这些指标可以量化 TMA 实际节省的指令数和带宽。
+
+## 14.15 TMA 与手动 memcpy_async 的迁移指南
+
+如果你的代码目前使用 `cuda::memcpy_async` 进行多维分块拷贝，迁移到 TMA 的步骤如下：
+
+1. <strong>识别多维分块模式</strong>：找出代码中使用嵌套循环逐行拷贝的 `memcpy_async` 调用；
+
+2. <strong>确保硬件支持</strong>：添加 CC 9.0+ 的条件编译；
+
+3. <strong>创建 Tensor Map</strong>：在主机端使用 `cuTensorMapEncodeTiled` 描述数组布局；
+
+4. <strong>替换拷贝调用</strong>：将多个逐行 `memcpy_async` 调用替换为单个 `cp_async_bulk_tensor` 调用；
+
+5. <strong>调整同步</strong>：确保 barrier 的事务计数正确（TMA 拷贝的总字节数）；
+
+6. <strong>验证正确性</strong>：在 H100 上测试，同时确保回退路径在旧硬件上继续工作。
+
+
+
+## 14.16 动手体验：完整的 2D TMA 分块处理程序
+
+下面是一个完整的、概念展示性的 2D TMA 程序（注意：实际运行需要 H100 硬件和 Driver API 支持）：
+
+```cuda
+// 文件: tma_2d_demo.cu
+// 概念演示：2D TMA 分块加载、处理和写回
+// 编译: nvcc -arch=sm_90 tma_2d_demo.cu -lcuda -o tma_2d_demo
+// 硬件要求: NVIDIA Hopper H100 (CC 9.0+)
+
+#include &lt;stdio.h&gt;
+#include &lt;stdlib.h&gt;
+#include &lt;cuda.h&gt;
+#include &lt;cudaTypedefs.h&gt;
+#include &lt;cuda/barrier&gt;
+
+#define CUDA_CHECK(call)                                             \
+    do {                                                             \
+        cudaError_t err = call;                                      \
+        if (err != cudaSuccess) {                                    \
+            fprintf(stderr, "CUDA Error at %s:%d - %s\n",            \
+                    __FILE__, __LINE__, cudaGetErrorString(err));    \
+            exit(EXIT_FAILURE);                                      \
+        }                                                            \
+    } while (0)
+
+constexpr int GMEM_WIDTH  = 256;
+constexpr int GMEM_HEIGHT = 256;
+constexpr int SMEM_WIDTH  = 16;
+constexpr int SMEM_HEIGHT = 16;
+
+// ========== 主机端：创建 Tensor Map ==========
+
+PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() {
+    void* ptr = nullptr;
+    cudaDriverEntryPointQueryResult status;
+    CUDA_CHECK(cudaGetDriverEntryPointByVersion(
+        "cuTensorMapEncodeTiled", &ptr, 12000,
+        cudaEnableDefault, &status));
+    return reinterpret_cast&lt;PFN_cuTensorMapEncodeTiled_v12000&gt;(ptr);
+}
+
+CUtensorMap create_2d_tensor_map(int *d_data) {
+    CUtensorMap tmap{};
+    constexpr uint32_t rank = 2;
+    uint64_t size[rank]   = {GMEM_WIDTH, GMEM_HEIGHT};
+    uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
+    uint32_t box_size[rank]   = {SMEM_WIDTH, SMEM_HEIGHT};
+    uint32_t elem_stride[rank] = {1, 1};
+
+    auto encode = get_cuTensorMapEncodeTiled();
+    encode(&tmap,
+           CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
+           rank,
+           d_data,
+           size,
+           stride,
+           box_size,
+           elem_stride,
+           CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
+           CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
+           CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
+           CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+    return tmap;
+}
+
+int main() {
+    // 分配全局内存
+    int *d_input, *d_output;
+    size_t bytes = GMEM_WIDTH * GMEM_HEIGHT * sizeof(int);
+    CUDA_CHECK(cudaMalloc(&d_input, bytes));
+    CUDA_CHECK(cudaMalloc(&d_output, bytes));
+
+    // 初始化输入数据（这里省略，实际需要主机端填充并拷贝到设备）
+    CUDA_CHECK(cudaMemset(d_input, 1, bytes));
+
+    // 创建 Tensor Map
+    CUtensorMap tmap = create_2d_tensor_map(d_input);
+
+    // 启动 kernel（概念演示——实际需要带有 TMA 操作的 kernel）
+    // kernel&lt;&lt;&lt;grid, block, smem&gt;&gt;&gt;(tmap, d_output);
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    printf("TMA 2D demo setup complete.\n");
+    printf("Run on H100 hardware with full kernel for actual execution.\n");
+
+    CUDA_CHECK(cudaFree(d_input));
+    CUDA_CHECK(cudaFree(d_output));
+    return 0;
+}
+```
+
+## 14.17 常见问题与故障排除
+
+### 14.17.1 TMA 传输返回错误或崩溃
+
+<strong>常见原因</strong>：
+1. 全局内存地址未 16 字节对齐；
+2. 共享内存地址未 128 字节对齐（多维 TMA）；
+3. 全局内存步长不是 16 字节的倍数；
+4. 在 CC < 9.0 的硬件上运行。
+
+<strong>检查清单</strong>：
+- 全局内存使用 `cudaMalloc`（默认 256 字节对齐）或 `alignas(16)`；
+- 共享内存使用 `__shared__ alignas(128)` 或 `alignas(1024)`（Swizzle）；
+- 使用 `static_assert` 在编译时检查计算能力。
+
+### 14.17.2 Tensor Map 创建失败
+
+<strong>cuTensorMapEncodeTiled 返回错误</strong>：
+
+常见原因包括：
+1. `box_dim` 大于 `global_dim`（共享内存 tile 大于全局数组）；
+2. `stride` 不是 16 字节的倍数；
+3. Driver API 版本不匹配（需要 CUDA 12.0+ 驱动）。
+
+### 14.17.3 Swizzle 索引计算错误
+
+<strong>症状</strong>：使用 Swizzle 后数据正确但位置错误。
+
+<strong>诊断</strong>：
+1. 检查共享内存对齐是否满足 Swizzle 模式要求；
+2. 正确计算偏移量：`offset = (reinterpret_cast<uintptr_t>(smem_ptr)/128)%8`；
+3. 正确使用索引关系：`smem[y][((y+offset)%8)^x]`。
+
+### 14.17.4 设备端编码的 fence 问题
+
+<strong>症状</strong>：修改后的 Tensor Map 不可见或使用了旧值。
+
+<strong>解决方案</strong>：
+- 编码 kernel 结束后添加 `__threadfence_system()` 或使用适当的 scope fence；
+- 消费 kernel 中使用 `fence_proxy_tensormap_generic` 的 acquire fence；
+- 确保编码和消费在不同的 kernel launch 中（或在同一 launch 中使用适当的内存顺序）。
+
+### 14.17.5 TMA 多播性能不佳
+
+<strong>可能原因</strong>：
+1. 未使用 `sm_90a` 目标编译；
+2. Cluster 大小超过实际的 GPC 容量；
+3. 多播范围太大（多播所有块 vs 多播部分块）。
+
+<strong>优化建议</strong>：
+- 使用 `-arch=sm_90a` 编译以启用多播硬件优化；
+- 将多播限制在较小的 cluster（2-4 个块）；
+- 评估是否真的需要多播——有时单块加载 + DSM 分发更高效。
+
+## 14.18 性能基准参考
+
+以下是在 H100 (CC 9.0) 上对不同数据搬运方案的性能对比（2D 256x256 数组分块，tile 16x16）：
+
+| 方案 | Tensor Map | Swizzle | 时间 (us) | 相对性能 |
+|------|-----------|---------|----------|---------|
+| 手动逐行 memcpy_async | N/A | N/A | 48 | 1.0x |
+| TMA 1D bulk copy | 否 | 否 | 32 | 1.5x |
+| TMA 2D tensor copy | 是 | 否 | 24 | 2.0x |
+| TMA 2D + 128B Swizzle (转置) | 是 | 是 | 18 | 2.7x |
+
+（注：实际性能因数据规模、访问模式、硬件配置而异。上述为示意性参考。）
+
+从参考数据可以看出：
+1. 一维 TMA 比手动 memcpy_async 快约 50%，主要来自减少了地址计算开销；
+2. 二维 TMA tensor copy 比一维快约 33%，因为一次指令处理了整个 tile；
+3. Swizzle 模式在转置场景提供了额外 33% 的加速，来自消除 bank conflict。
+
+## 14.19 如果你的代码目前使用 `cuda::memcpy_async` 进行多维分块拷贝，以下是迁移到 TMA 的步骤：
+
+### 步骤 1：识别候选模式
+
+适合 TMA 迁移的代码模式：
+- 对二维或更高维度数组的分块访问；
+- 每个块逐行或逐元素通过 `memcpy_async` 进行多次拷贝；
+- 存在手工的地址计算和边界检查；
+- 存在共享内存 bank conflict。
+
+### 步骤 2：创建 Tensor Map
+
+```cuda
+// 在主机端创建 Tensor Map 替代手工地址计算
+CUtensorMap tmap{};
+// 填充尺寸、步长、box 大小等信息
+cuTensorMapEncodeTiled(&tmap, ...);
+```
+
+### 步骤 3：替换拷贝调用
+
+```cuda
+// 之前：逐行 memcpy_async
+for (int row = 0; row < TILE_H; row++) {
+    cuda::memcpy_async(block,
+        &smem[row * TILE_W],
+        &global[(start_y + row) * GMEM_W + start_x],
+        sizeof(float) * TILE_W, pipe);
+}
+
+// 之后：单个 TMA 指令
+cde::cp_async_bulk_tensor_2d_global_to_shared(
+    &smem, &tmap, start_x, start_y, bar);
+```
+
+### 步骤 4：调整同步
+
+```cuda
+// 添加 fence 和 barrier 初始化
+if (threadIdx.x == 0) {
+    init(&bar, blockDim.x);
+    cde::fence_proxy_async_shared_cta();
+}
+__syncthreads();
+```
+
+### 步骤 5：验证与回退
+
+```cuda
+#if __CUDA_ARCH__ >= 900
+    cde::cp_async_bulk_tensor_2d_global_to_shared(...);
+#else
+    // 回退：逐行 memcpy_async
+    for (int row = 0; row < TILE_H; row++) { ... }
+#endif
+```
+
+## 14.20 本章小结
+
+本章深入介绍了 NVIDIA Hopper 架构的 Tensor Memory Accelerator (TMA)，要点总结如下：
+
+1. <strong>TMA 是什么</strong>：一个硬件加速的多维数据拷贝单元，将地址计算从 CUDA Core 卸载到专用硬件，减少寄存器压力并消除手动地址计算开销。
+
+2. <strong>Tensor Map</strong>：描述多维数组在全局和共享内存中布局的数据结构。通过 Driver API `cuTensorMapEncodeTiled` 在主机端创建，通过 `const __grid_constant__` 参数或 `__constant__` 变量传递给设备。
+
+3. <strong>1D TMA 拷贝</strong>：不需要 Tensor Map，使用指针和大小参数。Global→Shared 方向使用 mbarrier 完成机制，Shared→Global 使用 bulk async-group。
+
+4. <strong>多维 TMA 拷贝</strong>：需要 Tensor Map，支持最多 5 维。硬件自动处理越界（零填充）和边界条件。支持 Cluster 多播。
+
+5. <strong>Swizzle</strong>：TMA 硬件可以在写入共享内存时自动重排数据布局，消除后续访问中的 bank conflict。支持 32B/64B/128B 三种 Swizzle 模式。
+
+6. <strong>设备端编码</strong>：通过 `tensormap.replace` PTX 指令，可以在设备端动态修改 Tensor Map，适用于处理不同形状的张量批次。
+
+7. <strong>与 Pipeline 结合</strong>：TMA 可以与 `cuda::pipeline` 结合实现多维数据的多阶段异步流水线，将数据搬运与计算完全重叠。
+
+8. <strong>限制</strong>：仅限 CC 9.0+，硬件依赖性强，Tensor Map 创建需要 Driver API。
+
+TMA 代表了 GPU 编程从"软件管理数据搬运"到"硬件加速数据搬运"的重要演进。对于矩阵乘法、卷积、转置等核心计算模式，TMA 能够显著简化代码并提升性能。
+
+## 14.21 习题
+
+1. 对比 TMA 的 bulk-asynchronous copy 和上一章的 `cuda::memcpy_async`，说明在什么场景下 TMA 有显著优势，什么场景下两者性能接近。
+
+2. 解释为什么 Global→Shared::cta 的 TMA 拷贝使用 mbarrier 完成机制，而 Shared::cta→Global 使用 bulk async-group。这两种完成机制的设计原理是什么？
+
+3. 创建一个 3D Tensor Map（例如尺寸为 `[D, H, W]` 的体积数据），写出 `cuTensorMapEncodeTiled` 的调用代码。
+
+4. 对于 4x4 的整数矩阵，计算 `CU_TENSOR_MAP_SWIZZLE_128B` 模式下的 Swizzle 偏移量（假设共享内存地址为 0x100）。
+
+5. TMA 多播（Multicast）是如何与 Thread Block Cluster 配合使用的？设计一个场景，利用 Multicast 一次性将数据广播到簇中所有块的共享内存。
+
+6. 讨论设备端 Tensor Map 编码相比主机端创建的优势和局限性。什么场景下必须使用设备端编码？
+
+## 14.22 参考文献
+
+1. CUDA C++ Programming Guide 13.0, Section 10.29 "Asynchronous Data Copies using the Tensor Memory Accelerator (TMA)"
+2. CUDA C++ Programming Guide 13.0, Section 10.30 "Encoding a Tensor Map on Device"
+3. CUDA Driver API Documentation — cuTensorMapEncodeTiled
+4. PTX ISA Documentation — cp.async.bulk and cp.async.bulk.tensor instructions
+5. PTX ISA Documentation — tensormap.replace instruction
+6. NVIDIA H100 Tensor Core GPU Architecture Whitepaper
+7. libcu++ API Documentation — cuda::ptx namespace

From 60a1915586cf288f67d3821ffd76c2ef6977d8ea Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Sat, 6 Jun 2026 14:45:14 +0800
Subject: [PATCH 16/23] 6.6 chapter15

---
 .../code/advanced-chapter15/pdl_example.cu    |  61 ++--
 ...\347\253\240 Tensor Memory Accelerator.md" | 288 +++++++++++++++---
 ...30\347\272\247\347\211\271\346\200\247.md" |  10 +-
 3 files changed, 275 insertions(+), 84 deletions(-)

diff --git a/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu b/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu
index 8f4f17d..8d033f9 100644
--- a/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu
+++ b/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu
@@ -1,11 +1,13 @@
 /*
- * 第15章 代码示例：编程式依赖启动（PDL）
+ * 第15章 代码示例：编程式依赖启动（PDL）【修正版】
  * 硬件要求：CC 9.0+ (Hopper H100+)
  * 编译：nvcc -arch=sm_90 pdl_example.cu -o pdl_example
  */
 
 #include <cuda_runtime.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
 
 #define N 1024
 #define BLOCK_SIZE 256
@@ -13,49 +15,43 @@
 __global__ void primary_kernel(float *data, int n) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < n) {
-        // 初始工作：初始化数据
         data[idx] = (float)idx;
-    }
-
-    // 触发secondary kernel的启动
-    // 所有线程块都需要调用此函数
-    cudaTriggerProgrammaticLaunchCompletion();
-
-    // 与secondary kernel并发执行的工作
-    // （此处为示例，实际中可能是更复杂的计算）
-    if (idx < n) {
         data[idx] *= 2.0f;
     }
+    // 【修正1】确保所有线程完成计算，且每个线程块仅调用一次
+    __syncthreads();
+    if (threadIdx.x == 0) {
+        cudaTriggerProgrammaticLaunchCompletion();
+    }
 }
 
 __global__ void secondary_kernel(float *data, float *result, int n) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    // 独立工作——不依赖primary kernel的结果
     if (idx < n) {
+        // 不依赖primary的工作可提前完成
         result[idx] = 0.0f;
     }
-
-    // 等到primary kernel的结果对当前kernel可见
+    // 等待primary grid完全结束，数据可见
     cudaGridDependencySynchronize();
-
-    // 依赖的工作——使用primary kernel产生的结果
     if (idx < n) {
+        // 依赖primary数据的操作
         result[idx] = data[idx] + 1.0f;
     }
 }
 
+#define CHECK_CUDA(err) do{auto e=err;if(e!=cudaSuccess){printf("CUDA ERR:%s at line %d\n",cudaGetErrorString(e),__LINE__);exit(1);}}while(0)
+
 int main() {
     float *d_data, *d_result;
-    cudaMalloc(&d_data, N * sizeof(float));
-    cudaMalloc(&d_result, N * sizeof(float));
+    CHECK_CUDA(cudaMalloc(&d_data, N * sizeof(float)));
+    CHECK_CUDA(cudaMalloc(&d_result, N * sizeof(float)));
 
     cudaStream_t stream;
-    cudaStreamCreate(&stream);
+    CHECK_CUDA(cudaStreamCreate(&stream));
 
     int gridDim = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    // 配置secondary kernel的启动属性
+    // 开启PDL属性
     cudaLaunchAttribute attribute[1];
     attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
     attribute[0].val.programmaticStreamSerializationAllowed = 1;
@@ -68,19 +64,19 @@ int main() {
     configSecondary.attrs = attribute;
     configSecondary.numAttrs = 1;
 
-    // 在同一stream中启动两个kernel
+    // 提交primary kernel
     primary_kernel<<<gridDim, BLOCK_SIZE, 0, stream>>>(d_data, N);
+    CHECK_CUDA(cudaGetLastError());
 
-    // secondary kernel通过extensible launch API启动
-    // 参数需要通过指针数组传递
+    // 【修正2】参数数组必须传递参数变量的地址，即指针的地址
     void *args[] = {&d_data, &d_result, &N};
-    cudaLaunchKernelEx(&configSecondary, secondary_kernel);
+    CHECK_CUDA(cudaLaunchKernelEx(&configSecondary, secondary_kernel, args));
 
-    cudaStreamSynchronize(stream);
+    CHECK_CUDA(cudaStreamSynchronize(stream));
 
-    // 验证结果
+    // 结果验证
     float *h_result = (float*)malloc(N * sizeof(float));
-    cudaMemcpy(h_result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost);
+    CHECK_CUDA(cudaMemcpy(h_result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost));
 
     int errors = 0;
     for (int i = 0; i < N; i++) {
@@ -97,9 +93,8 @@ int main() {
     }
 
     free(h_result);
-    cudaFree(d_data);
-    cudaFree(d_result);
-    cudaStreamDestroy(stream);
-
+    CHECK_CUDA(cudaFree(d_data));
+    CHECK_CUDA(cudaFree(d_result));
+    CHECK_CUDA(cudaStreamDestroy(stream));
     return errors;
-}
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md" "b/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md"
index bf8cbab..4d2bb9a 100644
--- "a/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md"	
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter14/\347\254\25414\347\253\240 Tensor Memory Accelerator.md"	
@@ -1064,16 +1064,25 @@ NVIDIA Nsight Compute 提供 TMA 相关指标：
 
 ```cuda
 // 文件: tma_2d_demo.cu
-// 概念演示：2D TMA 分块加载、处理和写回
-// 编译: nvcc -arch=sm_90 tma_2d_demo.cu -lcuda -o tma_2d_demo
-// 硬件要求: NVIDIA Hopper H100 (CC 9.0+)
-
-#include &lt;stdio.h&gt;
-#include &lt;stdlib.h&gt;
-#include &lt;cuda.h&gt;
-#include &lt;cudaTypedefs.h&gt;
-#include &lt;cuda/barrier&gt;
+// 演示：2D TMA 分块加载、处理和写回
+// 编译: nvcc -arch=sm_90a -lcuda tma_2d_demo.cu -o tma_2d_demo
+// 硬件要求: NVIDIA Hopper H100 (CC 9.0+)，推荐 sm_90a 以启用完整 TMA 特性
+//
+// 重要概念说明：
+// 1. Global → Shared 使用 mbarrier 完成机制，TMA 指令**自动**更新 barrier 的事务计数，
+//    所有线程调用普通 bar.arrive()
+// 2. Shared → Global 使用 bulk async-group 完成机制，只有发起线程需要 commit 和 wait
+// 3. __grid_constant__ 提示编译器该参数在整个 grid 执行期间不变，放入常量缓存加速访问
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cassert>
+#include <cuda.h>
+#include <cudaTypedefs.h>
+#include <cuda/barrier>
+#include <cuda/experimental/__pipeline>
 
+// ========== CUDA 错误检查宏 ==========
 #define CUDA_CHECK(call)                                             \
     do {                                                             \
         cudaError_t err = call;                                      \
@@ -1084,69 +1093,252 @@ NVIDIA Nsight Compute 提供 TMA 相关指标：
         }                                                            \
     } while (0)
 
-constexpr int GMEM_WIDTH  = 256;
-constexpr int GMEM_HEIGHT = 256;
-constexpr int SMEM_WIDTH  = 16;
-constexpr int SMEM_HEIGHT = 16;
+using barrier = cuda::barrier<cuda::thread_scope_block>;
+namespace cde = cuda::device::experimental;
+
+// ========== 常量定义 ==========
+constexpr int GMEM_WIDTH  = 256;              // 全局内存宽度（元素数）
+constexpr int GMEM_HEIGHT = 256;              // 全局内存高度（元素数）
+constexpr int SMEM_WIDTH  = 16;               // 共享内存 tile 宽度（元素数）
+constexpr int SMEM_HEIGHT = 16;               // 共享内存 tile 高度（元素数）
+constexpr int TILE_SIZE   = SMEM_WIDTH * SMEM_HEIGHT;  // 每个 tile 的元素总数
 
-// ========== 主机端：创建 Tensor Map ==========
+// 编译时检查对齐要求
+static_assert(GMEM_WIDTH % SMEM_WIDTH == 0, "GMEM_WIDTH must be multiple of SMEM_WIDTH");
+static_assert(GMEM_HEIGHT % SMEM_HEIGHT == 0, "GMEM_HEIGHT must be multiple of SMEM_HEIGHT");
+
+// ========== 主机端：获取 cuTensorMapEncodeTiled 函数指针 ==========
+// 注意：cuTensorMapEncodeTiled 是 Driver API 函数，需要通过 cudaGetDriverEntryPointByVersion 获取
 
 PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() {
     void* ptr = nullptr;
     cudaDriverEntryPointQueryResult status;
-    CUDA_CHECK(cudaGetDriverEntryPointByVersion(
-        "cuTensorMapEncodeTiled", &ptr, 12000,
-        cudaEnableDefault, &status));
-    return reinterpret_cast&lt;PFN_cuTensorMapEncodeTiled_v12000&gt;(ptr);
+    cudaError_t err = cudaGetDriverEntryPointByVersion(
+        "cuTensorMapEncodeTiled", &ptr, 12000, cudaEnableDefault, &status);
+    if (err != cudaSuccess || status != cudaDriverEntryPointSuccess) {
+        fprintf(stderr, "Failed to get cuTensorMapEncodeTiled: %s\n",
+                cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(ptr);
 }
 
-CUtensorMap create_2d_tensor_map(int *d_data) {
+// ========== 主机端：创建 2D Tensor Map ==========
+// Tensor Map 描述了全局内存中张量的布局，供 TMA 硬件使用
+// 注意：维度顺序是 fastest-changing dimension 在索引 0
+
+CUtensorMap create_2d_tensor_map(int* d_data) {
     CUtensorMap tmap{};
     constexpr uint32_t rank = 2;
-    uint64_t size[rank]   = {GMEM_WIDTH, GMEM_HEIGHT};
+
+    // globalDim: 全局张量各维度尺寸（元素数）
+    uint64_t size[rank] = {GMEM_WIDTH, GMEM_HEIGHT};
+
+    // globalStrides: 全局张量各维度步长（字节）
+    // rank-1 个步长：最快维度（dim0）无 stride，dim1 的 stride = width * sizeof(int)
     uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
-    uint32_t box_size[rank]   = {SMEM_WIDTH, SMEM_HEIGHT};
+
+    // boxDim: 每次 TMA 拷贝的 tile 大小（元素数）
+    uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT};
+
+    // elementStride: 元素步长（以 sizeof(datatype) 为单位），通常设为 1
     uint32_t elem_stride[rank] = {1, 1};
 
     auto encode = get_cuTensorMapEncodeTiled();
-    encode(&tmap,
-           CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
-           rank,
-           d_data,
-           size,
-           stride,
-           box_size,
-           elem_stride,
-           CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
-           CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
-           CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
-           CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+    CUresult res = encode(
+        &tmap,
+        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,  // 数据类型
+        rank,                                                 // 张量维度数
+        d_data,                                               // 全局内存基地址
+        size,                                                 // 各维度尺寸
+        stride,                                               // 各维度步长
+        box_size,                                             // tile 尺寸
+        elem_stride,                                          // 元素步长
+        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
+        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,       // 本章先不开启 Swizzle
+        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
+        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+
+    if (res != CUDA_SUCCESS) {
+        fprintf(stderr, "cuTensorMapEncodeTiled failed with code %d\n", res);
+        exit(EXIT_FAILURE);
+    }
     return tmap;
 }
 
+// ========== 设备端：核函数 ==========
+// 使用 __grid_constant__ 提示编译器将 tensor_map 放入常量缓存以提高访问效率
+// 每个线程块处理一个 (x, y) 坐标对应的 tile
+
+__global__ void tma_copy_kernel(const __grid_constant__ CUtensorMap tensor_map,
+                                int* __restrict__ output,
+                                int tiles_per_row) {
+    // 多维 TMA 操作的共享内存目标缓冲区需要 128 字节对齐
+    __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH];
+
+    // ---------- 1. 初始化 mbarrier ----------
+    #pragma nv_diag_suppress static_var_with_dynamic_init
+    __shared__ barrier bar;
+
+    // 只有线程 0 初始化 barrier
+    // barrier 以 blockDim.x（块内所有线程）为参与线程数
+    if (threadIdx.x == 0) {
+        init(&bar, blockDim.x);
+        // fence_proxy_async_shared_cta 使 barrier 对 async proxy 可见
+        cde::fence_proxy_async_shared_cta();
+    }
+    __syncthreads();
+
+    // ---------- 2. TMA 读取：Global → Shared ----------
+    // 计算当前 block 要处理的 tile 在全局数组中的坐标
+    int global_x = (blockIdx.x % tiles_per_row) * SMEM_WIDTH;   // 最内层维度（column）
+    int global_y = (blockIdx.x / tiles_per_row) * SMEM_HEIGHT;  // 外层维度（row）
+
+    barrier::arrival_token token;
+    if (threadIdx.x == 0) {
+        // 发起 2D TMA 拷贝
+        // 重要：硬件指令 cp_async_bulk_tensor 会**自动**向 barrier 提交预期的事务计数
+        // 因此这里不需要（也不应该）调用 barrier_arrive_tx
+        cde::cp_async_bulk_tensor_2d_global_to_shared(
+            &smem_buffer, &tensor_map, global_x, global_y, bar);
+    }
+    // 所有线程到达 barrier（包括发起线程）
+    // barrier 会自动等待 TMA 传输完成后才让 wait 返回
+    token = bar.arrive();
+    bar.wait(std::move(token));
+    // 此时数据已在共享内存中，所有线程均可安全访问
+
+    // ---------- 3. 计算：对数据加 1 ----------
+    // 使用块内所有线程并行处理 tile 数据
+    for (int idx = threadIdx.x; idx < TILE_SIZE; idx += blockDim.x) {
+        int row = idx / SMEM_WIDTH;
+        int col = idx % SMEM_WIDTH;
+        smem_buffer[row][col] += 1;
+    }
+
+    // ---------- 4. TMA 写入：Shared → Global ----------
+    // 写回前需要确保共享内存写入对 TMA 引擎可见
+    cde::fence_proxy_async_shared_cta();
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+        // 发起 2D TMA 写回
+        cde::cp_async_bulk_tensor_2d_shared_to_global(
+            &tensor_map, global_x, global_y, &smem_buffer);
+
+        // Shared → Global 方向使用 bulk async-group 完成机制
+        // 将当前操作提交到 group
+        cde::cp_async_bulk_commit_group();
+        // 等待 group 完成（0 表示等待所有之前的操作）
+        cde::cp_async_bulk_wait_group_read<0>();
+    }
+
+    // ---------- 5. 清理 ----------
+    if (threadIdx.x == 0) {
+        (&bar)->~barrier();   // 手动销毁 barrier，释放共享内存
+    }
+}
+
+// ========== 主机端：验证函数 ==========
+bool verify_result(const int* h_output, size_t size) {
+    for (size_t i = 0; i < size; i++) {
+        // 输入是 1，经过加 1 后应为 2
+        if (h_output[i] != 2) {
+            printf("Verification failed at index %zu: expected 2, got %d\n",
+                   i, h_output[i]);
+            return false;
+        }
+    }
+    return true;
+}
+
+// ========== 主函数 ==========
 int main() {
-    // 分配全局内存
-    int *d_input, *d_output;
-    size_t bytes = GMEM_WIDTH * GMEM_HEIGHT * sizeof(int);
+    // 1. 检查设备计算能力
+    int device;
+    CUDA_CHECK(cudaGetDevice(&device));
+    cudaDeviceProp props;
+    CUDA_CHECK(cudaGetDeviceProperties(&props, device));
+
+    if (props.major < 9) {
+        fprintf(stderr, "Error: TMA requires Compute Capability 9.0+ (NVIDIA Hopper). "
+                        "Current device: sm_%d%d\n", props.major, props.minor);
+        return 1;
+    }
+    printf("Device: %s (Compute Capability %d.%d)\n", props.name, props.major, props.minor);
+    if (props.major == 9 && props.minor == 0) {
+        printf("Note: For best TMA performance, compile with -arch=sm_90a\n");
+    }
+
+    // 2. 分配全局内存
+    size_t bytes = static_cast<size_t>(GMEM_WIDTH) * GMEM_HEIGHT * sizeof(int);
+    int* d_input = nullptr;
+    int* d_output = nullptr;
     CUDA_CHECK(cudaMalloc(&d_input, bytes));
     CUDA_CHECK(cudaMalloc(&d_output, bytes));
 
-    // 初始化输入数据（这里省略，实际需要主机端填充并拷贝到设备）
+    // 3. 初始化输入数据为全 1
     CUDA_CHECK(cudaMemset(d_input, 1, bytes));
+    CUDA_CHECK(cudaMemset(d_output, 0, bytes));
+
+    // 4. 创建 Tensor Map（TMA 硬件描述符）
+    CUtensorMap tensor_map = create_2d_tensor_map(d_input);
+
+    // 5. 计算 launch 配置
+    int tiles_per_row = GMEM_WIDTH / SMEM_WIDTH;
+    int tiles_per_col = GMEM_HEIGHT / SMEM_HEIGHT;
+    int total_tiles = tiles_per_row * tiles_per_col;
+    int threads_per_block = 256;
+    // 共享内存大小：使用常量计算，不能用 sizeof(smem_buffer)（核函数内局部变量）
+    size_t smem_size = SMEM_HEIGHT * SMEM_WIDTH * sizeof(int);
+
+    printf("Global matrix: %d x %d (%zu bytes)\n", GMEM_WIDTH, GMEM_HEIGHT, bytes);
+    printf("Tile: %d x %d (%zu bytes)\n", SMEM_WIDTH, SMEM_HEIGHT, smem_size);
+    printf("Blocks: %d (%d x %d), Threads per block: %d\n",
+           total_tiles, tiles_per_row, tiles_per_col, threads_per_block);
+
+    // 6. 启动核函数
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+
+    CUDA_CHECK(cudaEventRecord(start));
+    tma_copy_kernel<<<total_tiles, threads_per_block, smem_size>>>(tensor_map, d_output, tiles_per_row);
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float elapsed_ms;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
+    printf("Kernel execution time: %.3f ms\n", elapsed_ms);
+
+    // 检查 kernel 执行错误
+    CUDA_CHECK(cudaGetLastError());
+
+    // 7. 验证结果
+    int* h_output = static_cast<int*>(malloc(bytes));
+    CUDA_CHECK(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost));
+
+    bool correct = verify_result(h_output, static_cast<size_t>(GMEM_WIDTH) * GMEM_HEIGHT);
+    printf("\n========== Result ==========\n");
+    printf("TMA 2D Demo: %s\n", correct ? "PASS ✓" : "FAIL ✗");
+
+    if (correct) {
+        // 打印前几个元素作为示例
+        printf("First 16 elements of output:\n");
+        for (int i = 0; i < 16 && i < GMEM_WIDTH * GMEM_HEIGHT; i++) {
+            printf("%d ", h_output[i]);
+        }
+        printf("\n");
+    }
 
-    // 创建 Tensor Map
-    CUtensorMap tmap = create_2d_tensor_map(d_input);
-
-    // 启动 kernel（概念演示——实际需要带有 TMA 操作的 kernel）
-    // kernel&lt;&lt;&lt;grid, block, smem&gt;&gt;&gt;(tmap, d_output);
-    CUDA_CHECK(cudaDeviceSynchronize());
-
-    printf("TMA 2D demo setup complete.\n");
-    printf("Run on H100 hardware with full kernel for actual execution.\n");
-
+    // 8. 清理资源
+    free(h_output);
     CUDA_CHECK(cudaFree(d_input));
     CUDA_CHECK(cudaFree(d_output));
-    return 0;
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+
+    return correct ? 0 : 1;
 }
 ```
 
diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter15/\347\254\25415\347\253\240 CUDA Graphs\351\253\230\347\272\247\347\211\271\346\200\247.md" "b/outputs/gpu-programming-course/docs/advanced-chapter15/\347\254\25415\347\253\240 CUDA Graphs\351\253\230\347\272\247\347\211\271\346\200\247.md"
index 6f25983..302456f 100644
--- "a/outputs/gpu-programming-course/docs/advanced-chapter15/\347\254\25415\347\253\240 CUDA Graphs\351\253\230\347\272\247\347\211\271\346\200\247.md"	
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter15/\347\254\25415\347\253\240 CUDA Graphs\351\253\230\347\272\247\347\211\271\346\200\247.md"	
@@ -161,7 +161,7 @@ cudaLaunchKernelEx(&configSecondary, secondary_kernel);
 <strong>关键语义</strong>：
 
 - 当 `secondary_kernel` 使用 `cudaLaunchAttributeProgrammaticStreamSerialization` 属性启动时，CUDA驱动可以安全地提前启动该kernel，不需要等待主kernel的完成和内存刷新。
-- 当所有主kernel的线程块都已启动并执行了 `cudaTriggerProgrammaticLaunchCompletion` 时，驱动就可以启动次kernel。如果主kernel没有显式调用该触发函数，它会隐式地在主kernel的所有线程块退出后发生。
+- 当主 kernel 的所有线程块都调用了 cudaTriggerProgrammaticLaunchCompletion() 或主 kernel 完全执行完毕（隐式触发）时，CUDA 驱动才可以启动次 kernel。每个线程块必须且只能调用该函数一次。
 - <strong>重要提示</strong>：PDL只是提供了主kernel和次kernel<strong>可能</strong>并发执行的机会，这种并发行为是<strong>投机性的</strong>（opportunistic），并不保证一定会并发。依赖这种并发执行是不安全的，可能导致死锁。
 
 ### 15.3.3 PDL在CUDA Graph中的应用
@@ -492,10 +492,14 @@ WHILE节点的body图会在条件为<strong>非零</strong>时持续执行。条
 <div align="center"><img src="../images/advanced-chapter15-figures/conditional-while-node.png" /><p>图 15.13 条件WHILE节点</p></div>
 
 ```cuda
+__device__ int loopCount = 10;  // 在文件作用域定义
+
 __global__ void loopKernel(cudaGraphConditionalHandle handle)
 {
-    static int count = 10;
-    cudaGraphSetConditional(handle, --count ? 1 : 0);
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        int count = atomicSub(&loopCount, 1) - 1;
+        cudaGraphSetConditional(handle, count > 0 ? 1 : 0);
+    }
 }
 
 void graphSetup() {

From 77b48142b602e45203f31336c8334bbd9cbc420f Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Sat, 6 Jun 2026 19:12:48 +0800
Subject: [PATCH 17/23] 6.6 chapter16

---
 ...erative Groups\346\211\251\345\261\225.md" | 81 +++++++++----------
 1 file changed, 37 insertions(+), 44 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter16/\347\254\25416\347\253\240 Cooperative Groups\346\211\251\345\261\225.md" "b/outputs/gpu-programming-course/docs/advanced-chapter16/\347\254\25416\347\253\240 Cooperative Groups\346\211\251\345\261\225.md"
index 07efe3b..ed0e7f6 100644
--- "a/outputs/gpu-programming-course/docs/advanced-chapter16/\347\254\25416\347\253\240 Cooperative Groups\346\211\251\345\261\225.md"	
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter16/\347\254\25416\347\253\240 Cooperative Groups\346\211\251\345\261\225.md"	
@@ -1,6 +1,6 @@
 # 第16章 Cooperative Groups扩展：Cluster Group与高级集合操作
 
-<strong>硬件要求</strong>：CC 7.0+（Basic Cooperative Groups），CC 9.0+（Cluster Group），CC 8.0+（异步Reduce/Scan硬件加速）
+<strong>硬件要求</strong>：CC 3.5+（Basic Cooperative Groups），CC 9.0+（Cluster Group），CC 8.0+（异步Reduce/Scan硬件加速）
 
 > "Cooperative Groups is an extension to the CUDA programming model, introduced in CUDA 9, for organizing groups of communicating threads. Cooperative Groups allows developers to express the granularity at which threads are communicating, helping them to express richer, more efficient parallel decompositions."
 > -- CUDA C++ Programming Guide 13.0
@@ -70,7 +70,7 @@ CG自CUDA 11.5以来经历了显著扩展。以下是各版本的关键新增功
 
 ### 16.2.3 CUDA 12.2
 
-- 为 `grid_group` 和 `thread_block` 新增 `barrier_arrive()` / `barrier_wait()` 成员函数
+- 为 `grid_group` 新增 `barrier_arrive()` / `barrier_wait()` 成员函数
 
 ### 16.2.4 CUDA 13.0
 
@@ -81,12 +81,14 @@ CG自CUDA 11.5以来经历了显著扩展。以下是各版本的关键新增功
 CG的组类型形成了一个层次结构：
 
 ```
-coalesced_group (warp中活跃线程)
-    └── thread_block_tile<N> (编译期大小的tile)
-          └── 派生自 thread_block
-thread_block (线程块中所有线程)
-    └── cluster_group (集群中所有线程/块) [CC 9.0+]
-          └── grid_group (网格中所有线程) [需要cooperative launch]
+隐式组（由启动配置确定）
+├── grid_group       整个网格的所有线程 [需要协作启动]
+├── cluster_group    单个集群的所有线程 [CC 9.0+]
+└── thread_block     单个线程块的所有线程
+
+显式组（通过划分操作创建）
+├── thread_block_tile<N>  从thread_block划分的编译期固定大小tile
+└── coalesced_group       从任意组划分的活跃线程集合
 ```
 
 ---
@@ -123,18 +125,18 @@ __global__ void clusterKernel() {
 
 | 成员函数 | 返回类型 | 说明 |
 |---------|---------|------|
-| `sync()` | `static void` | 集群级别同步，等价于 `barrier_wait(barrier_arrive())` |
+| `sync()` | ` void` | 集群级别同步，等价于 `barrier_wait(barrier_arrive())` |
 | `barrier_arrive()` | `cluster_group::arrival_token` | 到达集群屏障，返回token |
-| `barrier_wait(token&&)` | `static void` | 等待集群屏障，接收arrive返回的token |
-| `thread_rank()` | `static unsigned int` | 调用线程在集群中的排名 [0, num_threads) |
-| `block_rank()` | `static unsigned int` | 调用线程所在块在集群中的排名 [0, num_blocks) |
-| `num_threads()` | `static unsigned int` | 集群中的总线程数 |
-| `num_blocks()` | `static unsigned int` | 集群中的总线程块数 |
-| `dim_threads()` | `static dim3` | 集群的线程维度 |
-| `dim_blocks()` | `static dim3` | 集群的线程块维度 |
-| `block_index()` | `static dim3` | 调用块在集群中的3D索引 |
-| `query_shared_rank(const void *addr)` | `static unsigned int` | 查询共享内存地址属于哪个块 |
-| `map_shared_rank(T *addr, int rank)` | `static T*` | 获取集群中另一个块的共享内存地址映射 |
+| `barrier_wait(token&&)` | ` void` | 等待集群屏障，接收arrive返回的token |
+| `thread_rank()` | ` unsigned int` | 调用线程在集群中的排名 [0, num_threads) |
+| `block_rank()` | ` unsigned int` | 调用线程所在块在集群中的排名 [0, num_blocks) |
+| `num_threads()` | ` unsigned int` | 集群中的总线程数 |
+| `num_blocks()` | ` unsigned int` | 集群中的总线程块数 |
+| `dim_threads()` | ` dim3` | 集群的线程维度 |
+| `dim_blocks()` | ` dim3` | 集群的线程块维度 |
+| `block_index()` | ` dim3` | 调用块在集群中的3D索引 |
+| `query_shared_rank(const void *addr)` | ` unsigned int` | 查询共享内存地址属于哪个块 |
+| `map_shared_rank(T *addr, int rank)` | ` T*` | 获取集群中另一个块的共享内存地址映射 |
 
 ### 16.3.3 分布式共享内存操作
 
@@ -158,7 +160,7 @@ __global__ void dsmKernel() {
 
 #### map_shared_rank
 
-将当前块中的共享内存地址映射为集群中另一个块的对应地址，使得可以直接读写远程块的共享内存：
+将当前块中的共享内存地址映射为集群中另一个块的对应地址，使得可以直接读写远程块的共享内存，但只能在同一个集群内有效，且只能用于访问目标块的共享内存，目标块必须与当前块属于同一集群：
 
 ```cuda
 __global__ void dsmAccessKernel() {
@@ -328,11 +330,7 @@ __global__ void asyncReduceKernel(float *input, float *output, int n) {
     s_data[threadIdx.x] = threadVal;
 
     // 发起异步reduce——不阻塞
-    cg::async_reduce(block, barrier,
-                     s_data,         // 目标（共享内存）
-                     s_data,         // 源（共享内存）
-                     block.size(),   // 元素数量
-                     cg::plus<float>());
+    cg::async_reduce(block, barrier, s_data, s_data, block.size(), cg::plus<float>());
 
     // 在等待reduce完成的同时，执行其他独立计算
     float otherWork = doSomeIndependentCalculation(threadIdx.x);
@@ -364,11 +362,7 @@ __global__ void asyncScanKernel(int *input, int *output, int n) {
     s_data[threadIdx.x] = input[threadIdx.x];
 
     // 异步exclusive_scan
-    cg::async_exclusive_scan(block, barrier,
-                             s_data,
-                             s_data,
-                             block.size(),
-                             cg::plus<int>());
+    async_exclusive_scan(group, dst, src, N, op, barrier)
 
     // 在scan完成前执行其他工作
     preprocessData(s_data, threadIdx.x);
@@ -612,9 +606,8 @@ __global__ void distributedHistogram(
     cg::cluster_group cluster = cg::this_cluster();
     unsigned int numBlocks = cluster.num_blocks();
 
-    // 每个块分配一部分共享内存用于直方图
+    // 每个块分配共享内存用于本地直方图
     __shared__ unsigned int localBins[NUM_BINS];
-    extern __shared__ unsigned int sharedBins[];
 
     // 初始化本地共享内存
     for (int i = threadIdx.x; i < NUM_BINS; i += blockDim.x) {
@@ -622,8 +615,7 @@ __global__ void distributedHistogram(
     }
     cluster.sync();
 
-    // 分布式全局索引分配
-    // 每个块处理一部分数据
+    // 分布式数据分配：每个块处理一部分数据
     unsigned int itemsPerBlock = N / numBlocks;
     unsigned int blockStart = cluster.block_rank() * itemsPerBlock;
     unsigned int blockEnd = (cluster.block_rank() == numBlocks - 1) ?
@@ -636,8 +628,15 @@ __global__ void distributedHistogram(
     }
     cluster.sync();
 
+    // 提前获取集群中所有块的共享内存地址映射（基于相同偏移量）
+    // 注意：Hopper 架构集群最大支持 16 个块，这里用 32 留余量
+    unsigned int* remoteBinsPtrs[32];
+    for (unsigned int b = 0; b < numBlocks; b++) {
+        remoteBinsPtrs[b] = cluster.map_shared_rank(localBins, b);
+    }
+
     // 阶段2：使用分布式共享内存进行全局合并
-    // 每个块负责合并一定数量的bin
+    // 每个块负责合并一定数量的 bin
     unsigned int binsPerBlock = NUM_BINS / numBlocks;
     unsigned int myBinStart = cluster.block_rank() * binsPerBlock;
     unsigned int myBinEnd = myBinStart + binsPerBlock;
@@ -645,18 +644,12 @@ __global__ void distributedHistogram(
     for (unsigned int bin = myBinStart + threadIdx.x;
          bin < myBinEnd; bin += blockDim.x) {
         unsigned int total = 0;
-
-        // 从集群中所有块收集该bin的计数
+        // 从集群中所有块收集该 bin 的计数
         for (unsigned int b = 0; b < numBlocks; b++) {
-            unsigned int *remoteBins =
-                cluster.map_shared_rank(localBins, b);
-            total += remoteBins[bin];
+            total += remoteBinsPtrs[b][bin];
         }
-
         // 写入全局内存
-        if (threadIdx.x < binsPerBlock) {
-            globalHistogram[bin] = total;
-        }
+        globalHistogram[bin] = total;
     }
 }
 ```

From 53969aa14e16dd336357ff9823b586c936e96aa2 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Sat, 6 Jun 2026 20:05:48 +0800
Subject: [PATCH 18/23] 6.6 chapter17

---
 .../code/advanced-chapter15/pdl_example.cu                    | 2 +-
 ...2\350\247\210\344\270\216C++20\346\224\257\346\214\201.md" | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu b/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu
index 8d033f9..ecf1bcc 100644
--- a/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu
+++ b/outputs/gpu-programming-course/code/advanced-chapter15/pdl_example.cu
@@ -1,5 +1,5 @@
 /*
- * 第15章 代码示例：编程式依赖启动（PDL）【修正版】
+ * 第15章 代码示例：编程式依赖启动（PDL）
  * 硬件要求：CC 9.0+ (Hopper H100+)
  * 编译：nvcc -arch=sm_90 pdl_example.cu -o pdl_example
  */
diff --git "a/outputs/gpu-programming-course/docs/advanced-chapter17/\347\254\25417\347\253\240 \347\216\260\344\273\243GPU\346\236\266\346\236\204\346\246\202\350\247\210\344\270\216C++20\346\224\257\346\214\201.md" "b/outputs/gpu-programming-course/docs/advanced-chapter17/\347\254\25417\347\253\240 \347\216\260\344\273\243GPU\346\236\266\346\236\204\346\246\202\350\247\210\344\270\216C++20\346\224\257\346\214\201.md"
index 9103e3f..5d727fe 100644
--- "a/outputs/gpu-programming-course/docs/advanced-chapter17/\347\254\25417\347\253\240 \347\216\260\344\273\243GPU\346\236\266\346\236\204\346\246\202\350\247\210\344\270\216C++20\346\224\257\346\214\201.md"	
+++ "b/outputs/gpu-programming-course/docs/advanced-chapter17/\347\254\25417\347\253\240 \347\216\260\344\273\243GPU\346\236\266\346\236\204\346\246\202\350\247\210\344\270\216C++20\346\224\257\346\214\201.md"	
@@ -60,7 +60,7 @@ Tensor Core（张量核心）是专门用于矩阵乘加运算的硬件单元，
 - <strong>第1代（Volta, CC 7.0）</strong>：支持fp16输入，fp32累加。基础的 `mma_sync` 操作。
 - <strong>第2代（Turing, CC 7.5）</strong>：新增INT8、INT4精度支持，用于推理加速。
 - <strong>第3代（Ampere, CC 8.0+）</strong>：新增bf16（Google Brain Float 16）和tf32（TensorFloat-32）支持。tf32使用与fp32相同的范围但只有10位精度，特别适合训练场景。同时引入了稀疏性（sparsity）支持——利用结构化稀疏（2:4模式）使有效吞吐量翻倍。
-- <strong>第4代（Hopper, CC 9.0）</strong>：新增FP8支持（E4M3和E5M2两种格式），引入异步MMA指令、warp-group级别的MMA操作、直接从共享内存访问操作数矩阵等高级特性。Hopper还引入了<strong>动态寄存器重分配</strong>能力。
+- <strong>第4代（Hopper, CC 9.0）</strong>：新增FP8支持（E4M3和E5M2两种格式），引入异步MMA指令、warp-group级别的MMA操作、直接从共享内存访问操作数矩阵等高级特性。
 - <strong>第5代（Blackwell, CC 10.0/12.0）</strong>：在Hopper基础上进一步扩展MMA能力。
 
 > NVIDIA官方强烈建议开发者通过CUDA-X库（cuBLAS、cuDNN、cuFFT）或CUTLASS模板库来使用Tensor Core的复杂特性，而非直接编写PTX内联汇编。
@@ -81,7 +81,7 @@ Tensor Core（张量核心）是专门用于矩阵乘加运算的硬件单元，
 <strong>关键变化</strong>：
 
 - 从Volta开始，L1缓存和共享内存共享同一物理存储（统一数据缓存），可以根据应用需求灵活配比。
-- Ampere及之后架构支持单线程块最多使用超过48 KB的共享内存（称为"大共享内存"），但需要使用<strong>动态共享内存</strong>并显式opt-in。
+- 从 Volta 架构开始，线程块即可通过动态共享内存使用超过 48 KB 的共享内存，Ampere 及后续架构进一步提高了可配置上限，但需要使用<strong>动态共享内存</strong>并显式opt-in。
 
 ```cuda
 // 在Ampere+上使用大于48KB的共享内存

From 29ec761a400820e856d209ee75424c3b8649063b Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Mon, 8 Jun 2026 14:58:36 +0800
Subject: [PATCH 19/23] 6.7 code/chapter17

---
 .../advanced-chapter16/cluster_dsm_example.cu | 45 +++++++++++--------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/outputs/gpu-programming-course/code/advanced-chapter16/cluster_dsm_example.cu b/outputs/gpu-programming-course/code/advanced-chapter16/cluster_dsm_example.cu
index a9061a1..e773b88 100644
--- a/outputs/gpu-programming-course/code/advanced-chapter16/cluster_dsm_example.cu
+++ b/outputs/gpu-programming-course/code/advanced-chapter16/cluster_dsm_example.cu
@@ -1,5 +1,5 @@
 /*
- * 第16章 代码示例：Cluster Group + 分布式共享内存
+ * 第16章 代码示例：Cluster Group + 分布式共享内存 
  * 硬件要求：CC 9.0+ (Hopper H100+)
  * 编译：nvcc -arch=sm_90 -rdc=true cluster_dsm_example.cu -o cluster_dsm_example
  *
@@ -9,6 +9,7 @@
 #include <cuda_runtime.h>
 #include <cooperative_groups.h>
 #include <stdio.h>
+#include <cmath>
 
 namespace cg = cooperative_groups;
 
@@ -60,31 +61,32 @@ clusterReduceKernel(const float *input, float *output, int n) {
     }
     __syncthreads();
 
-    // 阶段2：使用DSM进行跨块归约
+    // 阶段2：使用DSM进行跨块归约（修正竞态条件）
     cluster.sync();
 
     // 使用二叉树归约合并所有块的部分和
     for (int stride = 1; stride < numBlocks; stride <<= 1) {
-        float otherBlockSum = 0.0f;
+        float remoteSum = 0.0f;
+
+        // 每个块的tid==0读取远程伙伴块的s_blockSum
         if (tid == 0) {
-            int otherBlock = blockRank ^ stride;
-            if (otherBlock < numBlocks) {
+            int remoteBlock = blockRank ^ stride;
+            if (remoteBlock < numBlocks) {
                 // 使用map_shared_rank访问远程块的共享内存
-                float *remoteBlockSum =
-                    cluster.map_shared_rank(&s_blockSum, otherBlock);
-                otherBlockSum = *remoteBlockSum;
+                float *remotePtr = cluster.map_shared_rank(&s_blockSum, remoteBlock);
+                remoteSum = *remotePtr;
             }
         }
-        // 广播otherBlockSum到块内所有线程
-        __syncthreads();
-        s_partialSum[tid] = otherBlockSum;
-        __syncthreads();
 
+        // 【关键修正1】确保所有块都已经完成对远程值的读取
+        cluster.sync();
+
+        // 将远程值累加到本地s_blockSum（仅tid==0执行）
         if (tid == 0) {
-            s_blockSum += s_partialSum[0];
+            s_blockSum += remoteSum;
         }
-        __syncthreads();
 
+        // 【关键修正2】确保所有块都已经完成累加，再进入下一轮
         cluster.sync();
     }
 
@@ -99,7 +101,7 @@ int main() {
     float *h_input = (float*)malloc(N * sizeof(float));
     float h_expected = 0.0f;
 
-    // 初始化数据
+    // 初始化数据（全1，方便验证）
     for (int i = 0; i < N; i++) {
         h_input[i] = 1.0f;
         h_expected += h_input[i];
@@ -133,18 +135,25 @@ int main() {
     void *args[] = {&d_input, &d_output, &N};
     cudaLaunchKernelEx(&config, clusterReduceKernel);
 
+    // 检查内核启动错误
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+
     float h_output;
     cudaMemcpy(&h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost);
 
-    printf("Cluster DSM Reduce:\n");
+    printf("Cluster DSM Reduce (Corrected):\n");
     printf("  Expected sum: %.1f\n", h_expected);
     printf("  Computed sum: %.1f\n", h_output);
     printf("  Match: %s\n",
-           fabsf(h_output - h_expected) < 1e-3f ? "YES" : "NO");
+           std::fabs(h_output - h_expected) < 1e-3f ? "YES" : "NO");
 
     free(h_input);
     cudaFree(d_input);
     cudaFree(d_output);
 
     return 0;
-}
+}
\ No newline at end of file

From 631507413cb7146eb11f899e12ac15510da92790 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Wed, 10 Jun 2026 17:06:05 +0800
Subject: [PATCH 20/23] =?UTF-8?q?6.9=20=E5=9C=A8=E7=AC=AC6=E7=AB=A0?=
 =?UTF-8?q?=E8=A1=A5=E5=85=85ncu=E6=95=99=E7=A8=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...73\346\234\272\345\206\205\345\255\230.md" | 148 ++++++++++++++++--
 ...10\345\271\266\350\256\277\351\227\256.md" |   4 +-
 2 files changed, 140 insertions(+), 12 deletions(-)

diff --git "a/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md" "b/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md"
index 3f35ad2..317a12a 100644
--- "a/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md"	
+++ "b/outputs/gpu-programming-course/docs/chapter6/\347\254\2546\347\253\240 \345\205\261\344\272\253\345\206\205\345\255\230\344\270\216\351\241\265\351\224\201\345\256\232\344\270\273\346\234\272\345\206\205\345\255\230.md"	
@@ -1,3 +1,7 @@
+以下是按照你的要求，仅将 Nsight Compute (ncu) 教程替换为你提供的新版内容，其他部分完全保留原样的完整第 6 章。
+
+---
+
 # 第6章 共享内存与页锁定主机内存
 
 恭喜你来到了 CUDA 性能优化的入门篇章！在第 5 章中，我们掌握了设备内存管理的基本操作——分配、拷贝、释放。现在，是时候深入两种对 CUDA 程序性能至关重要的内存技术了：<strong>共享内存（Shared Memory）</strong>和<strong>页锁定主机内存（Page-Locked Host Memory）</strong>。前者是 GPU 芯片上的高速暂存器，可以大幅减少对全局内存的访问次数；后者则通过锁定物理内存页来显著提升主机与设备间的数据传输带宽。这两种技术看似独立，却都围绕着同一个目标——让数据以最快的速度到达计算单元。让我们开始探索吧！
@@ -1077,8 +1081,8 @@ __global__ void HybridReduction(const float *input, float *output, int N)
 | `shared_load_throughput` | 共享内存读取吞吐量 | 接近理论峰值 |
 | `shared_store_throughput` | 共享内存写入吞吐量 | 接近理论峰值 |
 | `shared_efficiency` | 共享内存访问效率 | 接近 100% |
-| `shared_load_bank_conflict` | 共享内存读取 Bank Conflict 次数 | 越少越好（0 最佳） |
-| `shared_store_bank_conflict` | 共享内存写入 Bank Conflict 次数 | 越少越好（0 最佳） |
+| `shared_ld_bank_conflict` | 共享内存读取 Bank Conflict 次数 | 越少越好（0 最佳） |
+| `shared_st_bank_conflict` | 共享内存写入 Bank Conflict 次数 | 越少越好（0 最佳） |
 | `l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum` | 读取 Bank Conflict 详细计数 | 0 最佳 |
 | `sm__warps_active.avg.pct_of_peak_sustained_elapsed` | 平均活跃 warp 比例 (%) | 越高越好（>50%） |
 
@@ -1123,9 +1127,135 @@ Nsight Compute 的 GUI 版本提供了更丰富的可视化界面，可以帮助
 
 > <strong>提示</strong>：`ncu --set full` 虽然收集最全面的数据，但显著减慢内核执行速度。在开发过程中，建议先用 `--set basic` 快速扫描，再针对可疑的内核使用 `--set full` 深入分析。
 
-## 6.9 动手体验：性能对比实验
+## 6.9 Nsight Compute (ncu) 实战教程：共享内存性能分析
+
+Nsight Compute（`ncu`）是 NVIDIA 官方**专业级 CUDA 内核性能分析工具**，专门用于定位**共享内存冲突、带宽瓶颈、占用率**等底层性能问题，是本章优化共享内存、排查 Bank Conflict 的**必备工具**。
+
+### 6.9.1 ncu 基础：安装与环境准备
+1. **安装**：随 CUDA Toolkit 一起安装，无需额外下载
+2. **验证**：终端输入以下命令，输出版本信息即正常
+   ```bash
+   ncu --version
+   ```
+3. **权限**（Linux）：若提示权限不足，执行
+   ```bash
+   sudo setcap cap_sys_rawio+ep $(which ncu)
+   ```
+
+### 6.9.2 核心使用场景：针对本章共享内存优化
+本章重点关注 **共享内存访问效率、Bank Conflict、内存带宽**，以下是**直接可用、贴合本章案例**的命令。
+
+---
+
+### 6.9.3 场景 1：快速检测共享内存 Bank Conflict（最常用）
+直接测量**矩阵转置/矩阵乘法**内核的共享内存冲突，对比 `padding` 优化前后效果。
+```bash
+#生成 matrix_transpose 可执行文件
+nvcc matrix_transpose.cu -o matrix_transpose -arch=sm_86
+# 基础命令：只采集共享内存核心指标
+ncu --metrics shared_efficiency,shared_ld_bank_conflict,shared_st_bank_conflict ./matrix_transpose
+```
+
+**输出含义**：
+- `shared_efficiency`：共享内存访问效率（**100% 为最优**）
+- `shared_ld_bank_conflict`：读取冲突次数（**0 为最优**）
+- `shared_store_bank_conflict`：写入冲突次数（**0 为最优**）
+
+**本章实战示例**（矩阵转置）：
+```bash
+# 测试无 padding 的转置内核
+ncu --metrics shared_efficiency,shared_ld_bank_conflict ./matrix_transpose
+
+# 测试带 padding 的优化内核
+ncu --metrics shared_efficiency,shared_ld_bank_conflict ./matrix_transpose
+```
+
+**典型结果**：
+- 无 padding：`shared_efficiency=78%`，冲突数高
+- 带 padding：`shared_efficiency=100%`，冲突数=0
+
+---
+
+### 6.9.4 场景 2：完整性能报告（包含占用率、带宽）
+用于分析**矩阵乘法**的全局内存带宽、占用率、共享内存综合性能：
+```bash
+# 采集完整基础报告
+ncu --set basic ./matrix_transpose
+```
+
+重点查看指标：
+- `sm__warps_active.avg.pct`：活跃 warp 比例（占用率）
+- `dram__throughput`：全局内存有效带宽
+- `shared_load_throughput`：共享内存读取带宽
+
+---
+
+### 6.9.5 场景 3：精准定位内核（过滤多个内核）
+如果程序中有多个内核（如朴素矩阵乘法 + 共享内存优化版），指定内核名称分析：
+```bash
+# 只分析 MatMulKernel 内核
+ncu --kernel-name MatMulKernel --metrics shared_load_bank_conflict ./matrix_transpose
+```
+
+---
+
+### 6.9.6 场景 4：导出报告 + 可视化（GUI 分析）
+1. **导出报告文件**
+   ```bash
+   ncu -o report.matmul ./matrix_transpose
+   ```
+   生成 `report.matmul.ncu-rep`
+
+2. **用 GUI 打开**
+   - Windows/Linux 直接双击文件
+   - 或命令行：`ncu-ui report.matmul.ncu-rep`
 
-### 6.9.1 实验一：矩阵转置——共享内存优化
+3. **GUI 重点查看面板**
+   - **Shared Memory**：共享内存效率、Bank 冲突
+   - **Memory**：全局内存带宽、合并访问情况
+   - **Warp State**：占用率、延迟隐藏效果
+
+---
+
+### 6.9.7 场景 5：本章专用一键命令（直接复制使用）
+为**矩阵乘法 / 矩阵转置**准备的**最优命令**，一次性获取所有关键数据：
+```bash
+# 共享内存 + 带宽 + 占用率 一键分析
+ncu --metrics \
+shared_load_throughput,\
+shared_store_throughput,\
+shared_efficiency,\
+shared_ld_bank_conflict,\
+dram__throughput,\
+sm__warps_active.avg.pct_of_peak_sustained_elapsed \
+./你的程序
+```
+
+---
+
+### 6.9.8 ncu 结果解读（对照本章知识点）
+| 指标 | 含义 | 优化目标 | 对应本章知识点 |
+| :--- | :--- | :--- | :--- |
+| `shared_efficiency` | 共享内存访问效率 | ≥95% | 6.4 共享内存 Bank Conflict |
+| `shared_ld_bank_conflict` | 读取冲突数 | 0 | 6.4.2 Bank Conflict 定义 |
+| `dram__throughput` | 全局内存带宽 | 接近硬件峰值 | 6.3 共享内存减少全局访问 |
+| `sm__warps_active` | 占用率 | ≥50% | 6.8 共享内存与占用率 |
+
+---
+
+### 6.9.9 常见问题与解决方案
+1. **ncu 运行极慢**
+   改用轻量模式：`--set basic` 或只采集共享内存指标
+2. **无权限/无法采集数据**
+   Linux 执行：`sudo setcap cap_sys_rawio+ep $(which ncu)`
+3. **看不到共享内存指标**
+   确保架构 ≥ sm_70（Volta/Turing/Ampere）
+
+---
+
+## 6.10 动手体验：性能对比实验
+
+### 6.10.1 实验一：矩阵转置——共享内存优化
 
 矩阵转置是一个经典的共享内存优化案例。朴素转置会遇到严重的非合并访问问题——线程按行读取但按列写入（或相反），导致写入的 stride 为矩阵高度，完全是非合并的。使用共享内存分块可以完美解决这个问题。
 
@@ -1210,7 +1340,7 @@ Speedup:                 6.29x
 ========================================
 ```
 
-### 6.9.2 实验二：页锁定内存 vs 可分页内存带宽对比
+### 6.10.2 实验二：页锁定内存 vs 可分页内存带宽对比
 
 完整可编译代码参见 `code/chapter6/pinned_bandwidth.cu`。
 
@@ -1274,7 +1404,7 @@ Done!
 
 页锁定内存的带宽优势通常可以达到 <strong>2-5 倍</strong>，具体数值取决于硬件配置（PCIe 代数、CPU 架构等）。写结合内存可能在此基础上再提供 5-40% 的额外提升。
 
-## 6.10 本章小结
+## 6.11 本章小结
 
 在本章中，我们深入学习了两种对 CUDA 程序性能至关重要的内存技术。我们的旅程从共享内存的基础概念开始，到页锁定内存的性能优势结束：
 
@@ -1340,7 +1470,7 @@ Done!
    > <strong>提示</strong>：这是一道动手实践题，建议实际编写代码。
 
    a. 在本章的矩阵转置示例中，尝试不同的 TILE_DIM（8, 16, 32）并测量执行时间。解释观察到的性能差异。结合占用率计算（使用 occupancy calculator）验证你的分析。
-   b. 在共享内存数组声明中，去掉 padding（`tile[TILE_DIM][TILE_DIM + 1]` 改为 `tile[TILE_DIM][TILE_DIM]`），重新运行并观察性能变化。使用 `nvprof --metrics shared_load_bank_conflict` 或 `nsys nvprof` 来分析 bank conflict 事件的具体数据。
+   b. 在共享内存数组声明中，去掉 padding（`tile[TILE_DIM][TILE_DIM + 1]` 改为 `tile[TILE_DIM][TILE_DIM]`），重新运行并观察性能变化。使用 `ncu` 工具来分析 bank conflict 事件的具体数据。
    c. 修改分块乘法内核，使用动态共享内存（extern __shared__）替代静态共享内存，并使分块大小可以通过内核参数动态调整。
 
 7. <strong>动手实践题——页锁定内存</strong>：
@@ -1376,6 +1506,4 @@ Done!
   - 与其他学习者交流经验
   - 获得社区的帮助和反馈
 
-**提示:** 每个页面底部也有评论区,可以直接在页面内讨论!
-
----
+**提示:** 每个页面底部也有评论区,可以直接在页面内讨论!
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2142 \345\205\250\345\261\200\345\206\205\345\255\230\345\220\210\345\271\266\350\256\277\351\227\256.md" "b/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2142 \345\205\250\345\261\200\345\206\205\345\255\230\345\220\210\345\271\266\350\256\277\351\227\256.md"
index a4048f3..dab55cb 100644
--- "a/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2142 \345\205\250\345\261\200\345\206\205\345\255\230\345\220\210\345\271\266\350\256\277\351\227\256.md"	
+++ "b/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2142 \345\205\250\345\261\200\345\206\205\345\255\230\345\220\210\345\271\266\350\256\277\351\227\256.md"	
@@ -40,7 +40,7 @@ const uint y = blockIdx.y * blockDim.y + threadIdx.y;  // 列索引
 
 在读取 A[x * K + i] 时，同一 Warp 内连续 threadIdx.x 的线程对应的 x 不同，因此访问 A 的地址跨度为 K 个元素（约 K * 4 字节），是<strong>非合并的</strong>。
 
-在读取 B[i * N + y] 时，y 来自 threadIdx.y，同一 Warp 内 threadIdx.y 相同，y 也相同，因此同一 Warp 内的线程在读取同一次内积循环迭代中的 B 元素时地址完全分散——也是<strong>非合并的</strong>。
+在读取 B [i * N + y] 时，y 来自 threadIdx.y，同一 Warp 内 threadIdx.y 保持不变，因此同一个 Warp 的所有线程都会访问同一个 B 元素。这是高效的广播访问，不是非合并访问。
 
 ## 实验步骤
 
@@ -48,7 +48,7 @@ const uint y = blockIdx.y * blockDim.y + threadIdx.y;  // 列索引
 
 画出 2D block (32x32) 中线程的内存访问图：
 - 同一 Warp（threadIdx.x 连续的 32 个线程）访问 A 的模式：地址跨度为 K 个元素，非连续
-- 同一 Warp 访问 B 的模式：同一 Warp 的所有线程访问的 B 地址完全分散
+- 同一 Warp 访问 B 的模式：所有线程访问同一个 B 元素，属于高效广播访问，不会造成带宽浪费
 
 ### 步骤2：修改线程到 C 的映射
 

From 5d162decd08053f94f6a13a2de273ec0959f2ae2 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Sun, 14 Jun 2026 19:21:07 +0800
Subject: [PATCH 21/23] =?UTF-8?q?6.14=20lab.1=E3=80=81=E6=A0=B9=E7=9B=AE?=
 =?UTF-8?q?=E5=BD=95readme=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 outputs/gpu-programming-course/README.md      |  24 +-
 .../code/labs/lab1_solution.cu                | 248 ++++++++----------
 ...30\346\263\225\345\256\236\347\216\260.md" |  11 +-
 3 files changed, 126 insertions(+), 157 deletions(-)

diff --git a/outputs/gpu-programming-course/README.md b/outputs/gpu-programming-course/README.md
index b303b93..7cce82a 100644
--- a/outputs/gpu-programming-course/README.md
+++ b/outputs/gpu-programming-course/README.md
@@ -77,20 +77,20 @@ nvcc -o vector_add vector_add.cu
 
 课程配套的在线评测系统位于 `outputs/eval_system/`，支持代码提交、自动编译、NCU性能分析和报告下载。详见该目录下的 `design.md`。
 
-## 项目结构
+## 项目主要结构
 
 ```
-gpu-programming-course/
-├── README.md                    # 本文件
-├── docs/                        # 课程文档
-│   ├── _sidebar.md              # 侧边栏导航
-│   ├── 前言.md                  # 前言
-│   ├── images/                  # 图片资源
-│   ├── chapter1/ ~ chapter11/   # 基础篇章节
-│   └── advanced-chapter1/ ~ 6/  # 进阶篇章节
-├── code/                        # 随章代码
-├── Extra-Chapter/               # 补充资料与参考答案
-└── outputs/                     # 规划文档与评测系统
+gpu-programming-guide/
+├── cold-start/                 # 冷启动知识库与参考资料
+├── outputs/                    # 课程产出与规划文档
+│   ├── eval_system/            # 在线评测系统设计与实现
+│   └── gpu-programming-course/ # 课程主体内容
+│       ├── code/               # 各章节 CUDA 示例代码
+│       ├── docs/               # 各章节课程文档
+│       │   ├── images/         # 文档图片资源
+│       │   └── README.md       # 文档目录说明
+│       └── syllabus/           # 课程大纲与教学计划
+└── README.md                   # 项目说明文档
 ```
 
 ## 贡献指南
diff --git a/outputs/gpu-programming-course/code/labs/lab1_solution.cu b/outputs/gpu-programming-course/code/labs/lab1_solution.cu
index 7c304d7..add7d76 100644
--- a/outputs/gpu-programming-course/code/labs/lab1_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab1_solution.cu
@@ -9,154 +9,116 @@
  * 编译: nvcc lab1_solution.cu -o lab1_solution -lcublas
  * 运行: ./lab1_solution
  */
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(1); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// 随机初始化矩阵
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
-
-// ============================================================================
-// Kernel: 朴素矩阵乘法
-// 每个线程计算 C 的一个元素
-// ============================================================================
+// 朴素 SGEMM kernel
 __global__ void sgemm_naive(int M, int N, int K, float alpha,
                              const float *A, const float *B,
                              float beta, float *C) {
-  // 计算当前线程在 C 矩阵中的全局位置
-  // A 是 MxK, B 是 KxN, C 是 MxN
-  const uint x = blockIdx.x * blockDim.x + threadIdx.x;  // C 的行索引
-  const uint y = blockIdx.y * blockDim.y + threadIdx.y;  // C 的列索引
-
-  // 边界检查：处理 tile quantization（当 M 或 N 不是 block 大小的整数倍）
-  if (x < M && y < N) {
-    float tmp = 0.0;
-    // 内积：A 的第 x 行 与 B 的第 y 列点积
-    for (int i = 0; i < K; ++i) {
-      tmp += A[x * K + i] * B[i * N + y];
+    const uint x = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x < M && y < N) {
+        float tmp = 0.0f;
+        for (int i = 0; i < K; ++i) {
+            tmp += A[x * K + i] * B[i * N + y];
+        }
+        C[x * N + y] = alpha * tmp + beta * C[x * N + y];
     }
-    // GEMM 更新：C = alpha * A*B + beta * C
-    C[x * N + y] = alpha * tmp + beta * C[x * N + y];
-  }
 }
 
 int main() {
-  // 显示 GPU 信息
-  CudaDeviceInfo();
-
-  // 矩阵参数
-  const int M = 4096;
-  const int N = 4096;
-  const int K = 4096;
-  const float alpha = 1.0f;
-  const float beta = 0.0f;
-  const int num_warmup = 5;
-  const int num_iter = 10;
-
-  printf("\n========================================\n");
-  printf("实验1：朴素矩阵乘法实现\n");
-  printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
-  printf("========================================\n");
-
-  // 在 CPU 端分配矩阵
-  float *A = (float *)malloc(M * K * sizeof(float));
-  float *B = (float *)malloc(K * N * sizeof(float));
-  float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
-
-  // 初始化矩阵
-  randomize_matrix(A, M * K);
-  randomize_matrix(B, K * N);
-  zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
-
-  // 在 GPU 端分配矩阵
-  float *d_A, *d_B, *d_C, *d_C_ref;
-  CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
-
-  // 将数据拷贝到 GPU
-  CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  // ========================================================================
-  // 使用 cuBLAS 获取参考结果
-  // ========================================================================
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
-
-  // ========================================================================
-  // 运行自己的 kernel
-  // ========================================================================
-  const int BLOCK_SIZE = 32;
-  dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
-  dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
-
-  printf("\nKernel 配置:\n");
-  printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
-  printf("  Block: (%d, %d) = %d threads\n", blockDim.x, blockDim.y,
-         blockDim.x * blockDim.y);
-
-  // Warmup
-  printf("\n预热中...\n");
-  for (int i = 0; i < num_warmup; ++i) {
-    sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
-  }
-  CUDA_CHECK(cudaDeviceSynchronize());
-
-  // Benchmark
-  printf("性能测试中...\n");
-  cudaEvent_t start, stop;
-  CUDA_CHECK(cudaEventCreate(&start));
-  CUDA_CHECK(cudaEventCreate(&stop));
-
-  CUDA_CHECK(cudaEventRecord(start));
-  for (int i = 0; i < num_iter; ++i) {
-    sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
-  }
-  CUDA_CHECK(cudaEventRecord(stop));
-  CUDA_CHECK(cudaEventSynchronize(stop));
-
-  float elapsed_ms = 0.0f;
-  CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
-  float avg_ms = elapsed_ms / num_iter;
-
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  // ========================================================================
-  // 验证正确性
-  // ========================================================================
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-
-  bool correct = verify_matrix(C_ref, C, M * N);
-
-  // ========================================================================
-  // 性能报告
-  // ========================================================================
-  double gflops = calculate_gflops(M, N, K, avg_ms);
-
-  printf("\n========================================\n");
-  printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
-  printf("  平均耗时: %.4f ms\n", avg_ms);
-  printf("  计算性能: %.1f GFLOPS/s\n", gflops);
-  printf("  预期性能: ~309 GFLOPS/s\n");
-  printf("========================================\n");
-
-  // 清理
-  cublasDestroy(handle);
-  cudaFree(d_A);
-  cudaFree(d_B);
-  cudaFree(d_C);
-  cudaFree(d_C_ref);
-  free(A);
-  free(B);
-  free(C);
-  free(C_ref);
-
-  return correct ? 0 : 1;
-}
+    const int M = 4096;
+    const int N = 4096;
+    const int K = 4096;
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int BLOCK_SIZE = 32;
+    const int num_warmup = 5;
+    const int num_iter = 10;
+
+    printf("Matrix: M=%d, N=%d, K=%d\n", M, N, K);
+
+    // 分配主机内存
+    float *A = (float*)malloc(M * K * sizeof(float));
+    float *B = (float*)malloc(K * N * sizeof(float));
+    float *C = (float*)malloc(M * N * sizeof(float));
+    if (!A || !B || !C) {
+        printf("Host memory allocation failed\n");
+        return 1;
+    }
+
+    // 初始化
+    randomize_matrix(A, M * K);
+    randomize_matrix(B, K * N);
+    for (int i = 0; i < M * N; ++i) C[i] = 0.0f;
+
+    // 分配设备内存
+    float *d_A, *d_B, *d_C;
+    CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
+
+    // 拷贝到设备
+    CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
+
+    dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
+    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
+    printf("Grid: (%d,%d), Block: (%d,%d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);
+
+    // 预热
+    for (int i = 0; i < num_warmup; ++i) {
+        sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // 计时
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    CUDA_CHECK(cudaEventRecord(start));
+    for (int i = 0; i < num_iter; ++i) {
+        sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
+    }
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    float elapsed_ms = 0;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
+    float avg_ms = elapsed_ms / num_iter;
+
+    double gflops = (2.0 * M * N * K) / (avg_ms * 1e6);
+    printf("Average time: %.4f ms\n", avg_ms);
+    printf("Performance: %.1f GFLOPS\n", gflops);
+
+    // 清理
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    cudaFree(d_A);
+    cudaFree(d_B);
+    cudaFree(d_C);
+    free(A);
+    free(B);
+    free(C);
+
+    return 0;
+}
\ No newline at end of file
diff --git "a/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md" "b/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md"
index 81e28fb..c6fe026 100644
--- "a/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md"	
+++ "b/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md"	
@@ -85,7 +85,14 @@ dim3 blockDim(32, 32);
 
 ### 步骤5：验证正确性
 
-将你的结果与 cuBLAS 的输出对比。cuBLAS 是 NVIDIA 官方优化的矩阵乘法库，可以作为参考标准。验证时需要允许一定的浮点误差（比如使用 `abs(ref - out) < 0.01` 作为判断标准）。
+本实验可使用 cuBLAS 作为参考实现正确性验证。cuBLAS 是 NVIDIA 官方优化的矩阵乘法库，精度可靠，可在 GPU 上快速出结果。
+
+具体步骤：
+1. 调用 cuBLAS 的 SGEMM 函数计算参考结果（注意行/列主序转换，可参考实验框架中的 `run_cublas` 函数）；
+2. 将你的 kernel 计算结果与 cuBLAS 参考结果进行逐元素对比；
+3. 允许一定的浮点误差（如绝对误差 < 0.01，适用于元素值域 [0,1] 的矩阵）。
+
+（注：若你不想要使用 cuBLAS，可选用更小的矩阵，并用 CPU 端朴素乘法验证；但对于 4096×4096，CPU 朴素实现会严重拖慢实验节奏。）
 
 ### 步骤6：性能分析
 
@@ -141,7 +148,7 @@ sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
 
 ## 预期结果
 
-- <strong>预期性能</strong>：矩阵大小 4096x4096 时，约 <strong>309 GFLOPS/s</strong>
+- <strong>预期性能</strong>：矩阵大小 4096x4096 时，约 <strong>260 GFLOPS/s</strong>
 - <strong>相对 cuBLAS</strong>：约为 cuBLAS 的 <strong>1.3%</strong>
 - <strong>相对 A6000 峰值性能</strong>：约为峰值的 <strong>1.0%</strong>（A6000 峰值约 30 TFLOPS FP32）
 

From b550929a2d18fbadb8ebc452ab4a183d309b3917 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Mon, 15 Jun 2026 17:00:58 +0800
Subject: [PATCH 22/23] =?UTF-8?q?6.15=20lab.1=E3=80=81lab2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../code/labs/lab1_solution.cu                | 106 ++++---
 .../code/labs/lab1_start.cu                   |  46 ++-
 .../code/labs/lab2_solution.cu                | 264 ++++++++++--------
 .../code/labs/lab2_start.cu                   |  17 +-
 4 files changed, 252 insertions(+), 181 deletions(-)

diff --git a/outputs/gpu-programming-course/code/labs/lab1_solution.cu b/outputs/gpu-programming-course/code/labs/lab1_solution.cu
index add7d76..e39e2b3 100644
--- a/outputs/gpu-programming-course/code/labs/lab1_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab1_solution.cu
@@ -11,114 +11,132 @@
  */
 #include <stdio.h>
 #include <stdlib.h>
+#include <time.h>
 #include <cuda_runtime.h>
 
+// ===================== 宏定义 =====================
+#define BLOCK_SIZE      32
+#define CEIL_DIV(a, b)  (((a) + (b) - 1) / (b))
+
 #define CUDA_CHECK(call) do { \
     cudaError_t err = call; \
     if (err != cudaSuccess) { \
-        printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-        exit(1); \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
     } \
 } while(0)
 
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-
-// 随机初始化矩阵
+// ===================== 工具函数 =====================
+// 随机初始化矩阵，值域 [0, 1]
 void randomize_matrix(float *mat, int size) {
     for (int i = 0; i < size; ++i) {
         mat[i] = (float)rand() / RAND_MAX;
     }
 }
 
-// 朴素 SGEMM kernel
+// ===================== 朴素 SGEMM Kernel =====================
 __global__ void sgemm_naive(int M, int N, int K, float alpha,
                              const float *A, const float *B,
                              float beta, float *C) {
-    const uint x = blockIdx.x * blockDim.x + threadIdx.x;
-    const uint y = blockIdx.y * blockDim.y + threadIdx.y;
-    if (x < M && y < N) {
+    // 计算当前线程在 C 矩阵中的全局坐标
+    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int col = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // 边界检查：处理 tile quantization 越界问题
+    if (row < M && col < N) {
         float tmp = 0.0f;
+        // 内积循环：A 的第 row 行 × B 的第 col 列
         for (int i = 0; i < K; ++i) {
-            tmp += A[x * K + i] * B[i * N + y];
+            tmp += A[row * K + i] * B[i * N + col];
         }
-        C[x * N + y] = alpha * tmp + beta * C[x * N + y];
+        // 标准 GEMM 公式: C = alpha * A*B + beta * C
+        C[row * N + col] = alpha * tmp + beta * C[row * N + col];
     }
 }
 
+// ===================== 主函数 =====================
 int main() {
+    // 矩阵参数
     const int M = 4096;
     const int N = 4096;
     const int K = 4096;
     const float alpha = 1.0f;
-    const float beta = 0.0f;
-    const int BLOCK_SIZE = 32;
+    const float beta  = 0.0f;
     const int num_warmup = 5;
-    const int num_iter = 10;
+    const int num_iter   = 10;
 
-    printf("Matrix: M=%d, N=%d, K=%d\n", M, N, K);
+    printf("===== CUDA Naive SGEMM =====\n");
+    printf("Matrix: A(%dx%d), B(%dx%d), C(%dx%d)\n", M, K, K, N, M, N);
 
-    // 分配主机内存
-    float *A = (float*)malloc(M * K * sizeof(float));
-    float *B = (float*)malloc(K * N * sizeof(float));
-    float *C = (float*)malloc(M * N * sizeof(float));
-    if (!A || !B || !C) {
-        printf("Host memory allocation failed\n");
-        return 1;
-    }
+    // 初始化随机种子
+    srand((unsigned int)time(NULL));
 
-    // 初始化
-    randomize_matrix(A, M * K);
-    randomize_matrix(B, K * N);
-    for (int i = 0; i < M * N; ++i) C[i] = 0.0f;
+    // 1. 分配主机内存
+    float *h_A = (float*)malloc(M * K * sizeof(float));
+    float *h_B = (float*)malloc(K * N * sizeof(float));
+    float *h_C = (float*)malloc(M * N * sizeof(float));
 
-    // 分配设备内存
+    randomize_matrix(h_A, M * K);
+    randomize_matrix(h_B, K * N);
+    for (int i = 0; i < M * N; ++i) {
+        h_C[i] = 0.0f;
+    }
+
+    // 2. 分配设备显存
     float *d_A, *d_B, *d_C;
     CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
     CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
     CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
 
-    // 拷贝到设备
-    CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
+    // 3. 数据拷贝 Host -> Device
+    CUDA_CHECK(cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_C, h_C, M * N * sizeof(float), cudaMemcpyHostToDevice));
 
+    // 4. 配置 Kernel 启动参数
     dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
     dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
-    printf("Grid: (%d,%d), Block: (%d,%d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);
+    printf("Grid: (%d, %d), Block: (%d, %d)\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);
 
-    // 预热
+    // 5. Kernel 预热（消除冷启动开销）
     for (int i = 0; i < num_warmup; ++i) {
         sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
     }
+    CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 
-    // 计时
+    // 6. 正式计时
     cudaEvent_t start, stop;
     CUDA_CHECK(cudaEventCreate(&start));
     CUDA_CHECK(cudaEventCreate(&stop));
+
     CUDA_CHECK(cudaEventRecord(start));
     for (int i = 0; i < num_iter; ++i) {
         sgemm_naive<<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
     }
+    CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaEventRecord(stop));
     CUDA_CHECK(cudaEventSynchronize(stop));
-    float elapsed_ms = 0;
-    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
-    float avg_ms = elapsed_ms / num_iter;
 
+    // 7. 计算性能
+    float total_ms = 0.0f;
+    CUDA_CHECK(cudaEventElapsedTime(&total_ms, start, stop));
+    float avg_ms = total_ms / num_iter;
     double gflops = (2.0 * M * N * K) / (avg_ms * 1e6);
-    printf("Average time: %.4f ms\n", avg_ms);
-    printf("Performance: %.1f GFLOPS\n", gflops);
 
-    // 清理
+    printf("\nAverage Time: %.4f ms\n", avg_ms);
+    printf("Performance:  %.1f GFLOPS\n", gflops);
+
+    // 8. 资源释放
     CUDA_CHECK(cudaEventDestroy(start));
     CUDA_CHECK(cudaEventDestroy(stop));
     cudaFree(d_A);
     cudaFree(d_B);
     cudaFree(d_C);
-    free(A);
-    free(B);
-    free(C);
+    free(h_A);
+    free(h_B);
+    free(h_C);
 
+    printf("Program finished.\n");
     return 0;
 }
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab1_start.cu b/outputs/gpu-programming-course/code/labs/lab1_start.cu
index a4fd559..6187a43 100644
--- a/outputs/gpu-programming-course/code/labs/lab1_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab1_start.cu
@@ -11,8 +11,34 @@
  * 运行: ./lab1_start
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+// ========== 宏定义 ==========
+#define CEIL_DIV(a, b)  (((a) + (b) - 1) / (b))
+
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
+
+void zero_init_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = 0.0f;
+    }
+}
+// ==============================================================
 
 // ============================================================================
 // TODO: 在此处编写你的 sgemm_naive kernel
@@ -27,8 +53,6 @@
 // __global__ void sgemm_naive(...) { ... }
 
 int main() {
-  // 显示 GPU 信息
-  CudaDeviceInfo();
 
   // 矩阵参数
   const int M = 4096;
@@ -67,11 +91,11 @@ int main() {
   CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
 
   // ========================================================================
-  // 使用 cuBLAS 获取参考结果
+  // 使用 cuBLAS 获取参考结果（··可选··）
   // ========================================================================
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
+  // cublasHandle_t handle;
+  // cublasCreate(&handle);
+  // runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   // ========================================================================
   // TODO: 配置 kernel 启动参数并运行你的 kernel
@@ -88,8 +112,8 @@ int main() {
 
   printf("\n请在代码中完成 TODO 部分的实现，然后重新编译运行。\n");
 
-  // 清理
-  cublasDestroy(handle);
+  // 清理：删掉未定义的 cublasDestroy(handle)
+  // cublasDestroy(handle);
   cudaFree(d_A);
   cudaFree(d_B);
   cudaFree(d_C);
@@ -100,4 +124,4 @@ int main() {
   free(C_ref);
 
   return 0;
-}
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab2_solution.cu b/outputs/gpu-programming-course/code/labs/lab2_solution.cu
index bc999a0..5a0f95d 100644
--- a/outputs/gpu-programming-course/code/labs/lab2_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab2_solution.cu
@@ -1,130 +1,158 @@
 /**
- * 实验2：全局内存合并访问 - 参考解答
- *
- * 将 2D block 改为 1D block，重新映射线程到 C 矩阵的位置关系，
- * 使连续的 threadIdx.x 对应连续的 C 列坐标，实现全局内存合并访问。
- *
- * 预期性能：矩阵 4096x4096 时约 1986.5 GFLOPS/s（比实验1提升约 6.4x）
+ * 实验2：全局内存合并访问 - 在线测评提交版
+ * 无 cuBLAS 依赖，可直接在 CMake 环境编译运行
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+// ===================== 宏定义 =====================
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// ===================== 辅助函数 =====================
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
+
+void zero_init_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = 0.0f;
+    }
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    // 2 * M * N * K 浮点操作（乘加算两个 FLOP）
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+
+void print_device_info() {
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        printf("No CUDA devices found.\n");
+        return;
+    }
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    printf("Device: %s\n", prop.name);
+    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
+    printf("SMs: %d\n", prop.multiProcessorCount);
+    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
+}
 
+// ===================== Kernel: 合并访问 =====================
 template <const uint BLOCKSIZE>
 __global__ void sgemm_global_mem_coalesce(int M, int N, int K, float alpha,
                                            const float *A, const float *B,
                                            float beta, float *C) {
-  // 重映射：threadIdx.x 连续 => cCol 连续
-  // 因为 cCol 在 B 的访问中是连续的维度：B[i * N + cCol]
-  const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
-  const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);
-
-  if (cRow < M && cCol < N) {
-    float tmp = 0.0;
-    for (int i = 0; i < K; ++i) {
-      // 同一 Warp 内：cCol 连续 => B[i * N + cCol] 地址连续 => 合并访问
-      tmp += A[cRow * K + i] * B[i * N + cCol];
+    // 重映射：threadIdx.x 连续 => cCol 连续
+    const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
+    const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);
+
+    if (cRow < M && cCol < N) {
+        float tmp = 0.0f;
+        for (int i = 0; i < K; ++i) {
+            // 合并访问：B[i * N + cCol] 地址连续
+            tmp += A[cRow * K + i] * B[i * N + cCol];
+        }
+        C[cRow * N + cCol] = alpha * tmp + beta * C[cRow * N + cCol];
     }
-    C[cRow * N + cCol] = alpha * tmp + beta * C[cRow * N + cCol];
-  }
 }
 
+// ===================== 主函数 =====================
 int main() {
-  CudaDeviceInfo();
-
-  const int M = 4096;
-  const int N = 4096;
-  const int K = 4096;
-  const float alpha = 1.0f;
-  const float beta = 0.0f;
-  const int num_warmup = 5;
-  const int num_iter = 10;
-
-  printf("\n========================================\n");
-  printf("实验2：全局内存合并访问\n");
-  printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
-  printf("========================================\n");
-
-  float *A = (float *)malloc(M * K * sizeof(float));
-  float *B = (float *)malloc(K * N * sizeof(float));
-  float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
-
-  randomize_matrix(A, M * K);
-  randomize_matrix(B, K * N);
-  zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
-
-  float *d_A, *d_B, *d_C, *d_C_ref;
-  CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
-
-  CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  // cuBLAS reference
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
-
-  // 1D block 启动配置
-  const uint BLOCKSIZE = 32;
-  dim3 gridDim(CEIL_DIV(M, BLOCKSIZE), CEIL_DIV(N, BLOCKSIZE));
-  dim3 blockDim(BLOCKSIZE * BLOCKSIZE);  // 1024 threads, 1D
-
-  printf("\nKernel 配置:\n");
-  printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
-  printf("  Block: (%d) = 1D, %d threads\n", blockDim.x, blockDim.x);
-
-  // Warmup
-  for (int i = 0; i < num_warmup; ++i) {
-    sgemm_global_mem_coalesce<BLOCKSIZE>
-        <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
-  }
-  CUDA_CHECK(cudaDeviceSynchronize());
-
-  // Benchmark
-  cudaEvent_t start, stop;
-  CUDA_CHECK(cudaEventCreate(&start));
-  CUDA_CHECK(cudaEventCreate(&stop));
-
-  CUDA_CHECK(cudaEventRecord(start));
-  for (int i = 0; i < num_iter; ++i) {
-    sgemm_global_mem_coalesce<BLOCKSIZE>
-        <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
-  }
-  CUDA_CHECK(cudaEventRecord(stop));
-  CUDA_CHECK(cudaEventSynchronize(stop));
-
-  float elapsed_ms;
-  CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
-  float avg_ms = elapsed_ms / num_iter;
-
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  // Verification
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
-  double gflops = calculate_gflops(M, N, K, avg_ms);
-
-  printf("\n========================================\n");
-  printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
-  printf("  平均耗时: %.4f ms\n", avg_ms);
-  printf("  计算性能: %.1f GFLOPS/s\n", gflops);
-  printf("  预期性能: ~1986.5 GFLOPS/s\n");
-  printf("  相比实验1 (309 GFLOPS): %.1fx 提升\n", gflops / 309.0);
-  printf("========================================\n");
-
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+    srand((unsigned int)time(NULL));
+    print_device_info();
+
+    const int M = 4096;
+    const int N = 4096;
+    const int K = 4096;
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int num_warmup = 5;
+    const int num_iter = 10;
+
+    printf("\n========================================\n");
+    printf("实验2：全局内存合并访问\n");
+    printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
+    printf("========================================\n");
+
+    // 分配主机内存
+    float *A = (float*)malloc(M * K * sizeof(float));
+    float *B = (float*)malloc(K * N * sizeof(float));
+    float *C = (float*)malloc(M * N * sizeof(float));
+
+    // 初始化
+    randomize_matrix(A, M * K);
+    randomize_matrix(B, K * N);
+    zero_init_matrix(C, M * N);
+
+    // 分配设备内存
+    float *d_A, *d_B, *d_C;
+    CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
+
+    // 拷贝到设备
+    CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Kernel 配置
+    const uint BLOCKSIZE = 32;
+    dim3 gridDim(CEIL_DIV(M, BLOCKSIZE), CEIL_DIV(N, BLOCKSIZE));
+    dim3 blockDim(BLOCKSIZE * BLOCKSIZE);   // 1024 线程，1D
+    printf("Kernel 配置: Grid(%d, %d), Block(%d) 1D\n", gridDim.x, gridDim.y, blockDim.x);
+
+    // 预热
+    for (int i = 0; i < num_warmup; ++i) {
+        sgemm_global_mem_coalesce<BLOCKSIZE><<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // 计时
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    CUDA_CHECK(cudaEventRecord(start));
+    for (int i = 0; i < num_iter; ++i) {
+        sgemm_global_mem_coalesce<BLOCKSIZE><<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
+    }
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float elapsed_ms;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
+    float avg_ms = elapsed_ms / num_iter;
+
+    double gflops = calculate_gflops(M, N, K, avg_ms);
+
+    printf("\n========================================\n");
+    printf("实验结果:\n");
+    printf("  平均耗时: %.4f ms\n", avg_ms);
+    printf("  计算性能: %.1f GFLOPS/s\n", gflops);
+    printf("========================================\n");
+
+    // 清理
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+    free(A); free(B); free(C);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab2_start.cu b/outputs/gpu-programming-course/code/labs/lab2_start.cu
index 36ae0f1..63af064 100644
--- a/outputs/gpu-programming-course/code/labs/lab2_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab2_start.cu
@@ -13,9 +13,10 @@
  * 编译: nvcc lab2_start.cu -o lab2_start -lcublas
  * 运行: ./lab2_start
  */
-
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
 // ============================================================================
 // TODO: 将实验1的 2D kernel 改写为 1D kernel，实现合并访问
@@ -72,10 +73,10 @@ int main() {
   CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
 
-  // cuBLAS reference
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
+  // // cuBLAS reference（可选）
+  // cublasHandle_t handle;
+  // cublasCreate(&handle);
+  // runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   // TODO: 配置 1D block 启动参数
   // dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32));
@@ -84,7 +85,7 @@ int main() {
 
   printf("\n请在代码中完成 TODO 部分的实现。\n");
 
-  cublasDestroy(handle);
+  // cublasDestroy(handle);
   cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
   free(A); free(B); free(C); free(C_ref);
   return 0;

From 0235ad2d838e49f41daf6622bcda657e1b065e65 Mon Sep 17 00:00:00 2001
From: wangdaye wang <1901591887@qq.com>
Date: Wed, 17 Jun 2026 19:24:28 +0800
Subject: [PATCH 23/23] =?UTF-8?q?6.16=E3=80=8117=20=20lab.45678?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../code/labs/lab2_solution.cu                |   9 +-
 .../code/labs/lab3_solution.cu                | 342 ++++++++++--------
 .../code/labs/lab3_start.cu                   |   7 +-
 .../code/labs/lab4_solution.cu                | 108 +++---
 .../code/labs/lab4_start.cu                   |   6 +-
 .../code/labs/lab5_solution.cu                |  87 +++--
 .../code/labs/lab5_start.cu                   |   6 +-
 .../code/labs/lab6_solution.cu                |  96 +++--
 .../code/labs/lab6_start.cu                   |   6 +-
 .../code/labs/lab7_solution.cu                |  93 +++--
 .../code/labs/lab7_start.cu                   |   7 +-
 .../code/labs/lab8_solution.cu                |  99 ++---
 .../code/labs/lab8_start.cu                   |   8 +-
 ...30\346\263\225\345\256\236\347\216\260.md" |   1 -
 14 files changed, 543 insertions(+), 332 deletions(-)

diff --git a/outputs/gpu-programming-course/code/labs/lab2_solution.cu b/outputs/gpu-programming-course/code/labs/lab2_solution.cu
index 5a0f95d..fa6e469 100644
--- a/outputs/gpu-programming-course/code/labs/lab2_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab2_solution.cu
@@ -1,8 +1,11 @@
 /**
- * 实验2：全局内存合并访问 - 在线测评提交版
- * 无 cuBLAS 依赖，可直接在 CMake 环境编译运行
+ * 实验2：全局内存合并访问 - 参考解答
+ *
+ * 将 2D block 改为 1D block，重新映射线程到 C 矩阵的位置关系，
+ * 使连续的 threadIdx.x 对应连续的 C 列坐标，实现全局内存合并访问。
+ *
+ * 预期性能：矩阵 4096x4096 时约 1986.5 GFLOPS/s（比实验1提升约 6.4x）
  */
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
diff --git a/outputs/gpu-programming-course/code/labs/lab3_solution.cu b/outputs/gpu-programming-course/code/labs/lab3_solution.cu
index 7441aaa..95a7dd9 100644
--- a/outputs/gpu-programming-course/code/labs/lab3_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab3_solution.cu
@@ -7,156 +7,206 @@
  * 预期性能：矩阵 4096x4096 时约 2980.3 GFLOPS/s（比实验2提升约 1.5x）
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+/**
+ * 实验3：共享内存缓存分块 - 在线测评提交版
+ * 无 cuBLAS 依赖，完全对齐 lab2 代码结构，CMake/ncu 环境直接编译运行
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+// ===================== 宏定义 =====================
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// ===================== 辅助函数 =====================
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
 
-template <const int BLOCKSIZE>
+void zero_init_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = 0.0f;
+    }
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    // 2 * M * N * K 浮点操作（乘+加各算1个FLOP）
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+
+void print_device_info() {
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        printf("No CUDA devices found.\n");
+        return;
+    }
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    printf("Device: %s\n", prop.name);
+    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
+    printf("SMs: %d\n", prop.multiProcessorCount);
+    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
+}
+
+// ===================== Kernel: 共享内存分块SGEMM =====================
+template <const uint BLOCKSIZE>
 __global__ void sgemm_shared_mem_block(int M, int N, int K, float alpha,
-                                        const float *A, const float *B,
-                                        float beta, float *C) {
-  // 当前 Block 要计算的 C 子块位置
-  const uint cRow = blockIdx.x;
-  const uint cCol = blockIdx.y;
-
-  // 在共享内存中分配当前 Block 的缓存
-  __shared__ float As[BLOCKSIZE * BLOCKSIZE];
-  __shared__ float Bs[BLOCKSIZE * BLOCKSIZE];
-
-  // 线程在 Block 内的行列位置
-  const uint threadCol = threadIdx.x % BLOCKSIZE;
-  const uint threadRow = threadIdx.x / BLOCKSIZE;
-
-  // 指针移动到当前 Block 负责的区域
-  A += cRow * BLOCKSIZE * K;                    // row=cRow, col=0
-  B += cCol * BLOCKSIZE;                        // row=0, col=cCol
-  C += cRow * BLOCKSIZE * N + cCol * BLOCKSIZE; // row=cRow, col=cCol
-
-  float tmp = 0.0;
-  for (int bkIdx = 0; bkIdx < K; bkIdx += BLOCKSIZE) {
-    // 协作加载数据到共享内存（保持合并访问）
-    // threadCol 是连续的 threadIdx 维度 => 合并访问
-    As[threadRow * BLOCKSIZE + threadCol] = A[threadRow * K + threadCol];
-    Bs[threadRow * BLOCKSIZE + threadCol] = B[threadRow * N + threadCol];
-
-    // 同步：确保所有线程都加载完成后再开始计算
-    __syncthreads();
-
-    // 前进到下一个 K 维度的块
-    A += BLOCKSIZE;
-    B += BLOCKSIZE * N;
-
-    // 在共享内存上执行分块内积
-    for (int dotIdx = 0; dotIdx < BLOCKSIZE; ++dotIdx) {
-      tmp += As[threadRow * BLOCKSIZE + dotIdx] *
-             Bs[dotIdx * BLOCKSIZE + threadCol];
+                                       const float *A, const float *B,
+                                       float beta, float *C) {
+    // 当前Block对应的C矩阵起始行列
+    const int blockRow = blockIdx.x * BLOCKSIZE;
+    const int blockCol = blockIdx.y * BLOCKSIZE;
+
+    // Block内线程索引 1D 展开
+    const int tid = threadIdx.x;
+    const int tidRow = tid / BLOCKSIZE;
+    const int tidCol = tid % BLOCKSIZE;
+
+    // 线程最终输出C的全局坐标
+    const int cRow = blockRow + tidRow;
+    const int cCol = blockCol + tidCol;
+
+    // 共享内存缓存A、B子块
+    __shared__ float As[BLOCKSIZE * BLOCKSIZE];
+    __shared__ float Bs[BLOCKSIZE * BLOCKSIZE];
+
+    float accum = 0.0f;
+
+    // K维度分块循环，逐块加载到共享内存计算
+    for (int bk = 0; bk < K; bk += BLOCKSIZE) {
+        // 1. 协作加载A子块到共享内存
+        int aRow = cRow;
+        int aCol = bk + tidCol;
+        if (aRow < M && aCol < K) {
+            As[tidRow * BLOCKSIZE + tidCol] = A[aRow * K + aCol];
+        } else {
+            As[tidRow * BLOCKSIZE + tidCol] = 0.0f;
+        }
+
+        // 2. 协作加载B子块到共享内存
+        int bRow = bk + tidRow;
+        int bCol = cCol;
+        if (bRow < K && bCol < N) {
+            Bs[tidRow * BLOCKSIZE + tidCol] = B[bRow * N + bCol];
+        } else {
+            Bs[tidRow * BLOCKSIZE + tidCol] = 0.0f;
+        }
+
+        // 同步：确保整块A、B加载完成再计算
+        __syncthreads();
+
+        // 分块内积计算，复用共享内存减少全局内存访问
+        for (int k = 0; k < BLOCKSIZE; k++) {
+            accum += As[tidRow * BLOCKSIZE + k] * Bs[k * BLOCKSIZE + tidCol];
+        }
+
+        // 同步：防止下一轮加载覆盖未计算完成的数据
+        __syncthreads();
     }
-    // 同步：防止快线程提前加载下一个块，覆盖仍在被慢线程读取的 SMEM
-    __syncthreads();
-  }
 
-  C[threadRow * N + threadCol] =
-      alpha * tmp + beta * C[threadRow * N + threadCol];
+    // 写回最终结果到全局内存C
+    if (cRow < M && cCol < N) {
+        int cIdx = cRow * N + cCol;
+        C[cIdx] = alpha * accum + beta * C[cIdx];
+    }
 }
 
+// ===================== 主函数（和lab2逻辑完全对齐） =====================
 int main() {
-  CudaDeviceInfo();
-
-  const int M = 4096;
-  const int N = 4096;
-  const int K = 4096;
-  const float alpha = 1.0f;
-  const float beta = 0.0f;
-  const int num_warmup = 5;
-  const int num_iter = 10;
-  const int BLOCKSIZE = 32;
-
-  printf("\n========================================\n");
-  printf("实验3：共享内存缓存分块\n");
-  printf("矩阵大小: M=%d, N=%d, K=%d, BLOCKSIZE=%d\n", M, N, K, BLOCKSIZE);
-  printf("========================================\n");
-
-  float *A = (float *)malloc(M * K * sizeof(float));
-  float *B = (float *)malloc(K * N * sizeof(float));
-  float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
-
-  randomize_matrix(A, M * K);
-  randomize_matrix(B, K * N);
-  zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
-
-  float *d_A, *d_B, *d_C, *d_C_ref;
-  CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
-
-  CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
-
-  // 配置 kernel
-  dim3 gridDim(CEIL_DIV(M, BLOCKSIZE), CEIL_DIV(N, BLOCKSIZE));
-  dim3 blockDim(BLOCKSIZE * BLOCKSIZE);
-
-  // 本 kernel 不会使用 L1 缓存，将所有 L1 让渡给 SMEM
-  cudaFuncSetAttribute(sgemm_shared_mem_block<BLOCKSIZE>,
-                       cudaFuncAttributePreferredSharedMemoryCarveout,
-                       cudaSharedmemCarveoutMaxShared);
-
-  printf("\nKernel 配置:\n");
-  printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
-  printf("  Block: (%d) = 1D, %d threads\n", blockDim.x, blockDim.x);
-  printf("  SMEM/Block: %zu bytes\n", 2 * BLOCKSIZE * BLOCKSIZE * sizeof(float));
-
-  // Warmup
-  for (int i = 0; i < num_warmup; ++i) {
-    sgemm_shared_mem_block<BLOCKSIZE>
-        <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
-  }
-  CUDA_CHECK(cudaDeviceSynchronize());
-
-  // Benchmark
-  cudaEvent_t start, stop;
-  CUDA_CHECK(cudaEventCreate(&start));
-  CUDA_CHECK(cudaEventCreate(&stop));
-
-  CUDA_CHECK(cudaEventRecord(start));
-  for (int i = 0; i < num_iter; ++i) {
-    sgemm_shared_mem_block<BLOCKSIZE>
-        <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
-  }
-  CUDA_CHECK(cudaEventRecord(stop));
-  CUDA_CHECK(cudaEventSynchronize(stop));
-
-  float elapsed_ms;
-  CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
-  float avg_ms = elapsed_ms / num_iter;
-
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
-  double gflops = calculate_gflops(M, N, K, avg_ms);
-
-  printf("\n========================================\n");
-  printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
-  printf("  平均耗时: %.4f ms\n", avg_ms);
-  printf("  计算性能: %.1f GFLOPS/s\n", gflops);
-  printf("  预期性能: ~2980.3 GFLOPS/s\n");
-  printf("========================================\n");
-
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+    srand((unsigned int)time(NULL));
+    print_device_info();
+
+    const int M = 4096;
+    const int N = 4096;
+    const int K = 4096;
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int num_warmup = 5;
+    const int num_iter = 10;
+
+    printf("\n========================================\n");
+    printf("实验3：共享内存缓存分块 SGEMM\n");
+    printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
+    printf("========================================\n");
+
+    // 分配主机内存
+    float *A = (float*)malloc(M * K * sizeof(float));
+    float *B = (float*)malloc(K * N * sizeof(float));
+    float *C = (float*)malloc(M * N * sizeof(float));
+
+    // 矩阵初始化
+    randomize_matrix(A, M * K);
+    randomize_matrix(B, K * N);
+    zero_init_matrix(C, M * N);
+
+    // 分配设备显存
+    float *d_A, *d_B, *d_C;
+    CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
+
+    // 主机数据拷贝到GPU
+    CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Kernel 启动参数配置
+    const uint BLOCKSIZE = 32;
+    dim3 gridDim(CEIL_DIV(M, BLOCKSIZE), CEIL_DIV(N, BLOCKSIZE));
+    dim3 blockDim(BLOCKSIZE * BLOCKSIZE);   // 单Block 1024线程，一维展开
+    printf("Kernel 配置: Grid(%d, %d), Block(%d) 1D\n", gridDim.x, gridDim.y, blockDim.x);
+
+    // 预热迭代，消除启动开销
+    for (int i = 0; i < num_warmup; ++i) {
+        sgemm_shared_mem_block<BLOCKSIZE><<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // 性能计时
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+    CUDA_CHECK(cudaEventRecord(start));
+    for (int i = 0; i < num_iter; ++i) {
+        sgemm_shared_mem_block<BLOCKSIZE><<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
+    }
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float elapsed_ms;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
+    float avg_ms = elapsed_ms / num_iter;
+
+    double gflops = calculate_gflops(M, N, K, avg_ms);
+
+    // 输出性能结果
+    printf("\n========================================\n");
+    printf("实验结果:\n");
+    printf("  平均耗时: %.4f ms\n", avg_ms);
+    printf("  计算性能: %.1f GFLOPS/s\n", gflops);
+    printf("========================================\n");
+
+    // 资源释放
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+    free(A); free(B); free(C);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab3_start.cu b/outputs/gpu-programming-course/code/labs/lab3_start.cu
index 7765281..f42ceb4 100644
--- a/outputs/gpu-programming-course/code/labs/lab3_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab3_start.cu
@@ -13,9 +13,10 @@
  * 编译: nvcc lab3_start.cu -o lab3_start -lcublas
  * 运行: ./lab3_start
  */
-
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
 // TODO: 实现使用共享内存缓存分块的 kernel
 // 参考结构：
diff --git a/outputs/gpu-programming-course/code/labs/lab4_solution.cu b/outputs/gpu-programming-course/code/labs/lab4_solution.cu
index f1acfc5..65275e4 100644
--- a/outputs/gpu-programming-course/code/labs/lab4_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab4_solution.cu
@@ -7,55 +7,93 @@
  * 预期性能：矩阵 4096x4096 时约 8474.7 GFLOPS/s（比实验3提升约 2.8x）
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
 
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+// ===================== 宏定义 =====================
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// ===================== 辅助函数 =====================
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
+
+void zero_init_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = 0.0f;
+    }
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+
+void print_device_info() {
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        printf("No CUDA devices found.\n");
+        return;
+    }
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    printf("Device: %s\n", prop.name);
+    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
+    printf("SMs: %d\n", prop.multiProcessorCount);
+    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
+}
+
+// ===================== Kernel: 一维 Block Tile SGEMM =====================
 template <const int BM, const int BN, const int BK, const int TM>
 __global__ void sgemm1DBlocktiling(int M, int N, int K, float alpha,
                                     const float *A, const float *B,
                                     float beta, float *C) {
-  // Block 在 C 矩阵中的位置
-  // 交换 x 和 y 以获得更好的 L2 缓存命中率
   const uint cRow = blockIdx.y;
   const uint cCol = blockIdx.x;
 
-  // 当前线程在 Block Tile 中的行列位置
   const int threadCol = threadIdx.x % BN;
   const int threadRow = threadIdx.x / BN;
 
-  // 在 SMEM 中分配当前 Block Tile 的缓存
   __shared__ float As[BM * BK];
   __shared__ float Bs[BK * BN];
 
-  // 指针移动到当前 Block Tile 的起始位置
   A += cRow * BM * K;
   B += cCol * BN;
   C += cRow * BM * N + cCol * BN;
 
-  // 每个线程负责从 GMEM 加载到 SMEM 时的索引
-  // 保持合并访问：innerCol 维度对应连续的 threadIdx
   const uint innerColA = threadIdx.x % BK;
   const uint innerRowA = threadIdx.x / BK;
   const uint innerColB = threadIdx.x % BN;
   const uint innerRowB = threadIdx.x / BN;
 
-  // 寄存器中的线程结果缓存（TM 个 float）
   float threadResults[TM] = {0.0};
 
-  // 外层循环：沿 K 维度分块
   for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
-    // 协作加载当前子块到 SMEM
     As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
     Bs[innerRowB * BN + innerColB] = B[innerRowB * N + innerColB];
     __syncthreads();
 
-    // 前进到下一个 K 维度的块
     A += BK;
     B += BK * N;
 
-    // 内积计算：dotIdx 在外层，缓存 Bs 元素
     for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
-      float Btmp = Bs[dotIdx * BN + threadCol];  // 缓存到寄存器
+      float Btmp = Bs[dotIdx * BN + threadCol];
       for (uint resIdx = 0; resIdx < TM; ++resIdx) {
         threadResults[resIdx] +=
             As[(threadRow * TM + resIdx) * BK + dotIdx] * Btmp;
@@ -64,7 +102,6 @@ __global__ void sgemm1DBlocktiling(int M, int N, int K, float alpha,
     __syncthreads();
   }
 
-  // 写回 TM 个结果
   for (uint resIdx = 0; resIdx < TM; ++resIdx) {
     C[(threadRow * TM + resIdx) * N + threadCol] =
         alpha * threadResults[resIdx] +
@@ -72,59 +109,55 @@ __global__ void sgemm1DBlocktiling(int M, int N, int K, float alpha,
   }
 }
 
+// ===================== 主函数 =====================
 int main() {
-  CudaDeviceInfo();
+  srand((unsigned int)time(NULL));
+  print_device_info();
 
   const int M = 4096, N = 4096, K = 4096;
   const float alpha = 1.0f, beta = 0.0f;
   const int num_warmup = 5, num_iter = 10;
 
   printf("\n========================================\n");
-  printf("实验4：一维 Block Tile\n");
+  printf("实验4：一维 Block Tile SGEMM\n");
   printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
   printf("========================================\n");
 
   float *A = (float *)malloc(M * K * sizeof(float));
   float *B = (float *)malloc(K * N * sizeof(float));
   float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
 
   randomize_matrix(A, M * K);
   randomize_matrix(B, K * N);
   zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
 
-  float *d_A, *d_B, *d_C, *d_C_ref;
+  float *d_A, *d_B, *d_C;
   CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
 
   CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   const uint BM = 64, BN = 64, BK = 8, TM = 8;
   dim3 gridDim(CEIL_DIV(N, BN), CEIL_DIV(M, BM));
   dim3 blockDim((BM * BN) / TM);
 
-  printf("\nKernel 配置:\n");
+  printf("Kernel 配置:\n");
   printf("  BM=%d, BN=%d, BK=%d, TM=%d\n", BM, BN, BK, TM);
   printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
   printf("  Block: (%d) threads\n", blockDim.x);
   printf("  SMEM:  %zu bytes\n", (BM * BK + BK * BN) * sizeof(float));
 
+  // 预热
   for (int i = 0; i < num_warmup; ++i) {
     sgemm1DBlocktiling<BM, BN, BK, TM>
         <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
   }
   CUDA_CHECK(cudaDeviceSynchronize());
 
+  // 计时
   cudaEvent_t start, stop;
   CUDA_CHECK(cudaEventCreate(&start));
   CUDA_CHECK(cudaEventCreate(&stop));
@@ -139,25 +172,18 @@ int main() {
   float elapsed_ms;
   CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
   float avg_ms = elapsed_ms / num_iter;
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
   double gflops = calculate_gflops(M, N, K, avg_ms);
 
   printf("\n========================================\n");
   printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
   printf("  平均耗时: %.4f ms\n", avg_ms);
   printf("  计算性能: %.1f GFLOPS/s\n", gflops);
   printf("  预期性能: ~8474.7 GFLOPS/s\n");
   printf("========================================\n");
 
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+  free(A); free(B); free(C);
+  return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab4_start.cu b/outputs/gpu-programming-course/code/labs/lab4_start.cu
index 062e9ab..85cfc6c 100644
--- a/outputs/gpu-programming-course/code/labs/lab4_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab4_start.cu
@@ -15,8 +15,10 @@
  * 运行: ./lab4_start
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
 // TODO: 实现带 1D Block Tile 的 kernel
 // template <const int BM, const int BN, const int BK, const int TM>
diff --git a/outputs/gpu-programming-course/code/labs/lab5_solution.cu b/outputs/gpu-programming-course/code/labs/lab5_solution.cu
index 37dbfe1..e9148a0 100644
--- a/outputs/gpu-programming-course/code/labs/lab5_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab5_solution.cu
@@ -7,8 +7,52 @@
  * 预期性能：矩阵 4096x4096 时约 15971.7 GFLOPS/s（比实验4提升约 1.88x）
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cstring>
+#include <cmath>
+#include <cuda_runtime.h>
+
+// ========== 宏 ==========
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+void CudaDeviceInfo() {
+    int dev_cnt;
+    CUDA_CHECK(cudaGetDeviceCount(&dev_cnt));
+    for(int i=0; i<dev_cnt; i++) {
+        cudaDeviceProp prop;
+        CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+        printf("==== Device %d: %s | SM: %d | Mem: %.2fGB\n",
+               i, prop.name, prop.multiProcessorCount,
+               prop.totalGlobalMem / 1024.0 / 1024 / 1024);
+    }
+}
+
+void randomize_matrix(float* mat, int len) {
+    srand((unsigned)time(NULL));
+    for(int i=0; i<len; i++) {
+        mat[i] = (rand() % 1000) / 1000.0f;
+    }
+}
+
+void zero_init_matrix(float* mat, int len) {
+    memset(mat, 0, len * sizeof(float));
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+// ====================================================
 
 template <const int BM, const int BN, const int BK, const int TM, const int TN>
 __global__ void __launch_bounds__((BM * BN) / (TM * TN), 1)
@@ -65,7 +109,7 @@ __global__ void __launch_bounds__((BM * BN) / (TM * TN), 1)
       for (uint i = 0; i < TN; ++i) {
         regN[i] = Bs[dotIdx * BN + threadCol * TN + i];
       }
-      // 外积累加
+      // 二维外积累加
       for (uint resIdxM = 0; resIdxM < TM; ++resIdxM) {
         for (uint resIdxN = 0; resIdxN < TN; ++resIdxN) {
           threadResults[resIdxM * TN + resIdxN] +=
@@ -76,7 +120,7 @@ __global__ void __launch_bounds__((BM * BN) / (TM * TN), 1)
     __syncthreads();
   }
 
-  // 写回结果
+  // 写回 TM×TN 块结果
   for (uint resIdxM = 0; resIdxM < TM; ++resIdxM) {
     for (uint resIdxN = 0; resIdxN < TN; ++resIdxN) {
       C[(threadRow * TM + resIdxM) * N + threadCol * TN + resIdxN] =
@@ -94,34 +138,27 @@ int main() {
   const int num_warmup = 5, num_iter = 10;
 
   printf("\n========================================\n");
-  printf("实验5：二维 Block Tile\n");
+  printf("实验5：二维 Block Tile（纯测速无cuBLAS版）\n");
   printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
   printf("========================================\n");
 
+  // 移除 C_ref 参考矩阵，无需cuBLAS校验
   float *A = (float *)malloc(M * K * sizeof(float));
   float *B = (float *)malloc(K * N * sizeof(float));
   float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
 
   randomize_matrix(A, M * K);
   randomize_matrix(B, K * N);
   zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
 
-  float *d_A, *d_B, *d_C, *d_C_ref;
+  float *d_A, *d_B, *d_C;
   CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
 
   CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   const uint BK = 8, TM = 8, TN = 8;
   const uint BM = (M >= 128 && N >= 128) ? 128u : 64u;
@@ -134,13 +171,16 @@ int main() {
   printf("  BM=%d, BN=%d, BK=%d, TM=%d, TN=%d\n", BM, BN, BK, TM, TN);
   printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
   printf("  Block: (%d) threads\n", blockDim.x);
+  printf("  SMEM:  %zu bytes\n", (BM * BK + BK * BN) * sizeof(float));
 
+  // 预热
   for (int i = 0; i < num_warmup; ++i) {
     sgemm2DBlocktiling<BM, BN, BK, TM, TN>
         <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
   }
   CUDA_CHECK(cudaDeviceSynchronize());
 
+  // 性能计时
   cudaEvent_t start, stop;
   CUDA_CHECK(cudaEventCreate(&start));
   CUDA_CHECK(cudaEventCreate(&stop));
@@ -155,25 +195,18 @@ int main() {
   float elapsed_ms;
   CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
   float avg_ms = elapsed_ms / num_iter;
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
   double gflops = calculate_gflops(M, N, K, avg_ms);
 
   printf("\n========================================\n");
   printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
   printf("  平均耗时: %.4f ms\n", avg_ms);
   printf("  计算性能: %.1f GFLOPS/s\n", gflops);
   printf("  预期性能: ~15971.7 GFLOPS/s\n");
   printf("========================================\n");
 
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+  free(A); free(B); free(C);
+  return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab5_start.cu b/outputs/gpu-programming-course/code/labs/lab5_start.cu
index a9b0bc9..59d6780 100644
--- a/outputs/gpu-programming-course/code/labs/lab5_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab5_start.cu
@@ -14,8 +14,10 @@
  * 编译: nvcc lab5_start.cu -o lab5_start -lcublas
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
 // TODO: 实现带 2D Block Tile 的 kernel
 // template <const int BM, const int BN, const int BK, const int TM, const int TN>
diff --git a/outputs/gpu-programming-course/code/labs/lab6_solution.cu b/outputs/gpu-programming-course/code/labs/lab6_solution.cu
index f4b85d9..2ba0fc4 100644
--- a/outputs/gpu-programming-course/code/labs/lab6_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab6_solution.cu
@@ -8,9 +8,58 @@
  * 预期性能：矩阵 4096x4096 时约 18237.3 GFLOPS/s（比实验5提升约 1.14x）
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+// ===================== 宏定义 =====================
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// ===================== 辅助函数 =====================
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
+
+void zero_init_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = 0.0f;
+    }
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+
+void print_device_info() {
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        printf("No CUDA devices found.\n");
+        return;
+    }
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    printf("Device: %s\n", prop.name);
+    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
+    printf("SMs: %d\n", prop.multiProcessorCount);
+    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
+}
 
+// ===================== Kernel: float4向量化SGEMM 实验6核心 =====================
 template <const int BM, const int BN, const int BK, const int TM, const int TN>
 __global__ void sgemmVectorize(int M, int N, int K, float alpha, float *A,
                                 float *B, float beta, float *C) {
@@ -55,7 +104,7 @@ __global__ void sgemmVectorize(int M, int N, int K, float alpha, float *A,
     B += BK * N;
 
     for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
-      // As 已转置为列主序：连续读取
+      // As 已转置为列主序：连续读取，生成LDS.128
       for (uint i = 0; i < TM; ++i) {
         regM[i] = As[dotIdx * BM + threadRow * TM + i];
       }
@@ -72,7 +121,7 @@ __global__ void sgemmVectorize(int M, int N, int K, float alpha, float *A,
     __syncthreads();
   }
 
-  // 向量化写回 C
+  // 向量化写回 C，STG.E.128
   for (uint resIdxM = 0; resIdxM < TM; resIdxM += 1) {
     for (uint resIdxN = 0; resIdxN < TN; resIdxN += 4) {
       float4 tmp = reinterpret_cast<float4 *>(
@@ -88,42 +137,36 @@ __global__ void sgemmVectorize(int M, int N, int K, float alpha, float *A,
   }
 }
 
+// ===================== 主函数（无cuBLAS） =====================
 int main() {
-  CudaDeviceInfo();
+  srand((unsigned int)time(NULL));
+  print_device_info();
 
   const int M = 4096, N = 4096, K = 4096;
   const float alpha = 1.0f, beta = 0.0f;
   const int num_warmup = 5, num_iter = 10;
 
   printf("\n========================================\n");
-  printf("实验6：向量化内存访问\n");
+  printf("实验6：向量化内存访问 SGEMM\n");
   printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
   printf("========================================\n");
 
   float *A = (float *)malloc(M * K * sizeof(float));
   float *B = (float *)malloc(K * N * sizeof(float));
   float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
 
   randomize_matrix(A, M * K);
   randomize_matrix(B, K * N);
   zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
 
-  float *d_A, *d_B, *d_C, *d_C_ref;
+  float *d_A, *d_B, *d_C;
   CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
 
   CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   const uint BK = 8, TM = 8, TN = 8;
   const uint BM = (M >= 128 && N >= 128) ? 128u : 64u;
@@ -132,17 +175,19 @@ int main() {
   dim3 gridDim(CEIL_DIV(N, BN), CEIL_DIV(M, BM));
   dim3 blockDim((BM * BN) / (TM * TN));
 
-  printf("\nKernel 配置:\n");
+  printf("Kernel 配置:\n");
   printf("  BM=%d, BN=%d, BK=%d, TM=%d, TN=%d\n", BM, BN, BK, TM, TN);
   printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
   printf("  Block: (%d) threads\n", blockDim.x);
 
+  // 预热迭代
   for (int i = 0; i < num_warmup; ++i) {
     sgemmVectorize<BM, BN, BK, TM, TN>
         <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
   }
   CUDA_CHECK(cudaDeviceSynchronize());
 
+  // 性能计时
   cudaEvent_t start, stop;
   CUDA_CHECK(cudaEventCreate(&start));
   CUDA_CHECK(cudaEventCreate(&stop));
@@ -157,25 +202,18 @@ int main() {
   float elapsed_ms;
   CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
   float avg_ms = elapsed_ms / num_iter;
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
   double gflops = calculate_gflops(M, N, K, avg_ms);
 
   printf("\n========================================\n");
   printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
   printf("  平均耗时: %.4f ms\n", avg_ms);
   printf("  计算性能: %.1f GFLOPS/s\n", gflops);
   printf("  预期性能: ~18237.3 GFLOPS/s\n");
   printf("========================================\n");
 
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+  free(A); free(B); free(C);
+  return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab6_start.cu b/outputs/gpu-programming-course/code/labs/lab6_start.cu
index 1400308..2b6bc4e 100644
--- a/outputs/gpu-programming-course/code/labs/lab6_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab6_start.cu
@@ -14,8 +14,10 @@
  * 编译: nvcc lab6_start.cu -o lab6_start -lcublas
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
 // TODO: 实现向量化内存访问的 kernel
 // 提示：
diff --git a/outputs/gpu-programming-course/code/labs/lab7_solution.cu b/outputs/gpu-programming-course/code/labs/lab7_solution.cu
index cd3f5b2..b9b37f7 100644
--- a/outputs/gpu-programming-course/code/labs/lab7_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab7_solution.cu
@@ -8,9 +8,58 @@
  * 预期性能（A6000, BK=16）：约 19721.0 GFLOPS/s（达 cuBLAS 的 84.8%）
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
+// ===================== 宏定义 =====================
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// ===================== 辅助函数 =====================
+void randomize_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = (float)rand() / RAND_MAX;
+    }
+}
+
+void zero_init_matrix(float *mat, int size) {
+    for (int i = 0; i < size; ++i) {
+        mat[i] = 0.0f;
+    }
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+
+void print_device_info() {
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        printf("No CUDA devices found.\n");
+        return;
+    }
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    printf("Device: %s\n", prop.name);
+    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
+    printf("SMs: %d\n", prop.multiProcessorCount);
+    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
+}
+
+// ===================== Kernel: 实验7 带Warp Tile、参数调优向量化SGEMM =====================
 const int NUM_THREADS = 256;
 
 template <const int BM, const int BN, const int BK, const int TM, const int TN>
@@ -119,42 +168,37 @@ __global__ void __launch_bounds__(NUM_THREADS)
   }
 }
 
+// ===================== 主函数（无cuBLAS） =====================
 int main() {
-  CudaDeviceInfo();
+  srand((unsigned int)time(NULL));
+  print_device_info();
 
   const int M = 4096, N = 4096, K = 4096;
   const float alpha = 1.0f, beta = 0.0f;
   const int num_warmup = 5, num_iter = 10;
+  const int NUM_THREADS = 256;
 
   printf("\n========================================\n");
-  printf("实验7：参数自动调优\n");
+  printf("实验7：参数自动调优 SGEMM\n");
   printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
   printf("========================================\n");
 
   float *A = (float *)malloc(M * K * sizeof(float));
   float *B = (float *)malloc(K * N * sizeof(float));
   float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
 
   randomize_matrix(A, M * K);
   randomize_matrix(B, K * N);
   zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
 
-  float *d_A, *d_B, *d_C, *d_C_ref;
+  float *d_A, *d_B, *d_C;
   CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
 
   CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   // A6000 最优参数
   const uint BM = 128, BN = 128, BK_PARAM = 16, TM = 8, TN = 8;
@@ -176,18 +220,20 @@ int main() {
   dim3 gridDim(CEIL_DIV(N, BN), CEIL_DIV(M, BM));
   dim3 blockDim(NUM_THREADS);
 
-  printf("\nKernel 配置:\n");
+  printf("Kernel 配置:\n");
   printf("  BM=%d, BN=%d, BK=%d, TM=%d, TN=%d\n", BM, BN, BK_PARAM, TM, TN);
   printf("  WarpTile: WM=%d, WN=%d\n", TM*16, TN*16);
   printf("  Grid:  (%d, %d)\n", gridDim.x, gridDim.y);
   printf("  Block: (%d) threads\n", blockDim.x);
 
+  // 预热迭代
   for (int i = 0; i < num_warmup; ++i) {
     sgemmAutotuned<BM, BN, BK_PARAM, TM, TN>
         <<<gridDim, blockDim>>>(M, N, K, alpha, d_A, d_B, beta, d_C);
   }
   CUDA_CHECK(cudaDeviceSynchronize());
 
+  // 性能计时
   cudaEvent_t start, stop;
   CUDA_CHECK(cudaEventCreate(&start));
   CUDA_CHECK(cudaEventCreate(&stop));
@@ -202,25 +248,18 @@ int main() {
   float elapsed_ms;
   CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
   float avg_ms = elapsed_ms / num_iter;
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
   double gflops = calculate_gflops(M, N, K, avg_ms);
 
   printf("\n========================================\n");
   printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
   printf("  平均耗时: %.4f ms\n", avg_ms);
   printf("  计算性能: %.1f GFLOPS/s\n", gflops);
   printf("  预期性能 (BK=16): ~19721.0 GFLOPS/s\n");
   printf("========================================\n");
 
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+  free(A); free(B); free(C);
+  return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab7_start.cu b/outputs/gpu-programming-course/code/labs/lab7_start.cu
index 1d3e068..bd74e34 100644
--- a/outputs/gpu-programming-course/code/labs/lab7_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab7_start.cu
@@ -14,8 +14,11 @@
  * 编译: nvcc lab7_start.cu -o lab7_start -lcublas
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
 
 const int NUM_THREADS = 256;
 
diff --git a/outputs/gpu-programming-course/code/labs/lab8_solution.cu b/outputs/gpu-programming-course/code/labs/lab8_solution.cu
index c1f0aa6..2384095 100644
--- a/outputs/gpu-programming-course/code/labs/lab8_solution.cu
+++ b/outputs/gpu-programming-course/code/labs/lab8_solution.cu
@@ -11,24 +11,64 @@
  * 预期性能：约 21779.3 GFLOPS/s（达 cuBLAS 的 93.7%）
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
-
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cstring>
+#include <cmath>
+#include <cuda_runtime.h>
+
+#define CUDA_CHECK(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA Error [%s:%d]: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
 const int WARPSIZE = 32;
 
+// ===================== 内置公共工具函数 =====================
+void CudaDeviceInfo() {
+    int dev_cnt;
+    CUDA_CHECK(cudaGetDeviceCount(&dev_cnt));
+    for(int i=0; i<dev_cnt; i++) {
+        cudaDeviceProp prop;
+        CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+        printf("==== Device %d: %s | SM: %d | Mem: %.2fGB\n",
+               i, prop.name, prop.multiProcessorCount,
+               prop.totalGlobalMem / 1024.0 / 1024 / 1024);
+    }
+}
+
+void randomize_matrix(float* mat, int len) {
+    srand((unsigned)time(NULL));
+    for(int i=0; i<len; i++) {
+        mat[i] = (rand() % 1000) / 1000.0f;
+    }
+}
+
+void zero_init_matrix(float* mat, int len) {
+    memset(mat, 0, len * sizeof(float));
+}
+
+double calculate_gflops(int M, int N, int K, double time_ms) {
+    double flops = 2.0 * M * N * K;
+    return flops / (time_ms * 1e6);
+}
+
 // ============================================================================
 // Warp Tiling helper functions
 // ============================================================================
 namespace wt {
 
-// 从 GMEM 加载数据到 SMEM（向量化 + A 转置）
 template <const int BM, const int BN, const int BK,
           const int rowStrideA, const int rowStrideB>
 __device__ void loadFromGmem(int N, int K, const float *A, const float *B,
                               float *As, float *Bs,
                               int innerRowA, int innerColA,
                               int innerRowB, int innerColB) {
-  // 加载 A（向量化 + 转置）
   for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
     const float4 tmp = reinterpret_cast<const float4 *>(
         &A[(innerRowA + offset) * K + innerColA * 4])[0];
@@ -38,7 +78,6 @@ __device__ void loadFromGmem(int N, int K, const float *A, const float *B,
     As[(innerColA * 4 + 3) * BM + innerRowA + offset] = tmp.w;
   }
 
-  // 加载 B（向量化，无需转置）
   for (uint offset = 0; offset + rowStrideB <= BK; offset += rowStrideB) {
     reinterpret_cast<float4 *>(
         &Bs[(innerRowB + offset) * BN + innerColB * 4])[0] =
@@ -47,7 +86,6 @@ __device__ void loadFromGmem(int N, int K, const float *A, const float *B,
   }
 }
 
-// 从 SMEM 处理 Warp Tile：批量加载到寄存器，执行外积
 template <const int BM, const int BN, const int BK,
           const int WM, const int WN,
           const int WMITER, const int WNITER,
@@ -60,7 +98,6 @@ __device__ void processFromSmem(float *regM, float *regN,
                                  const uint threadRowInWarp,
                                  const uint threadColInWarp) {
   for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
-    // 批量加载整个 Warp Subtile 到寄存器
     for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
       for (uint i = 0; i < TM; ++i) {
         regM[wSubRowIdx * TM + i] =
@@ -76,7 +113,6 @@ __device__ void processFromSmem(float *regM, float *regN,
       }
     }
 
-    // 在寄存器上执行密集外积
     for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
       for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
         for (uint resIdxM = 0; resIdxM < TM; ++resIdxM) {
@@ -106,17 +142,14 @@ __global__ void __launch_bounds__(NUM_THREADS)
   const uint cRow = blockIdx.y;
   const uint cCol = blockIdx.x;
 
-  // Warp 在 Block Tile 中的位置
   const uint warpIdx = threadIdx.x / WARPSIZE;
   const uint warpCol = warpIdx % (BN / WN);
   const uint warpRow = warpIdx / (BN / WN);
 
-  // Warp Subtile 的大小
   constexpr uint WMITER = (WM * WN) / (WARPSIZE * TM * TN * WNITER);
   constexpr uint WSUBM = WM / WMITER;
   constexpr uint WSUBN = WN / WNITER;
 
-  // 线程在 Warp Subtile 中的位置
   const uint threadIdxInWarp = threadIdx.x % WARPSIZE;
   const uint threadColInWarp = threadIdxInWarp % (WSUBN / TN);
   const uint threadRowInWarp = threadIdxInWarp / (WSUBN / TN);
@@ -124,12 +157,10 @@ __global__ void __launch_bounds__(NUM_THREADS)
   __shared__ float As[BM * BK];
   __shared__ float Bs[BK * BN];
 
-  // 指针移动
   A += cRow * BM * K;
   B += cCol * BN;
   C += (cRow * BM + warpRow * WM) * N + cCol * BN + warpCol * WN;
 
-  // 向量化加载索引
   const uint innerRowA = threadIdx.x / (BK / 4);
   const uint innerColA = threadIdx.x % (BK / 4);
   constexpr uint rowStrideA = (NUM_THREADS * 4) / BK;
@@ -137,7 +168,6 @@ __global__ void __launch_bounds__(NUM_THREADS)
   const uint innerColB = threadIdx.x % (BN / 4);
   constexpr uint rowStrideB = NUM_THREADS / (BN / 4);
 
-  // 寄存器缓存
   float threadResults[WMITER * TM * WNITER * TN] = {0.0};
   float regM[WMITER * TM] = {0.0};
   float regN[WNITER * TN] = {0.0};
@@ -157,7 +187,6 @@ __global__ void __launch_bounds__(NUM_THREADS)
     __syncthreads();
   }
 
-  // 写回结果（向量化）
   for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
     for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
       float *C_interim = C + (wSubRowIdx * WSUBM) * N + wSubColIdx * WSUBN;
@@ -189,34 +218,26 @@ int main() {
   const int num_warmup = 5, num_iter = 10;
 
   printf("\n========================================\n");
-  printf("实验8：Warp Tile + 对比 cuBLAS\n");
+  printf("实验8：Warp Tile SGEMM 性能测评\n");
   printf("矩阵大小: M=%d, N=%d, K=%d\n", M, N, K);
   printf("========================================\n");
 
   float *A = (float *)malloc(M * K * sizeof(float));
   float *B = (float *)malloc(K * N * sizeof(float));
   float *C = (float *)malloc(M * N * sizeof(float));
-  float *C_ref = (float *)malloc(M * N * sizeof(float));
 
   randomize_matrix(A, M * K);
   randomize_matrix(B, K * N);
   zero_init_matrix(C, M * N);
-  zero_init_matrix(C_ref, M * N);
 
-  float *d_A, *d_B, *d_C, *d_C_ref;
+  float *d_A, *d_B, *d_C;
   CUDA_CHECK(cudaMalloc(&d_A, M * K * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
   CUDA_CHECK(cudaMalloc(&d_C, M * N * sizeof(float)));
-  CUDA_CHECK(cudaMalloc(&d_C_ref, M * N * sizeof(float)));
 
   CUDA_CHECK(cudaMemcpy(d_A, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(d_C, C, M * N * sizeof(float), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_C_ref, C_ref, M * N * sizeof(float), cudaMemcpyHostToDevice));
-
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  runCublasSgemm(handle, M, N, K, alpha, d_A, d_B, beta, d_C_ref);
 
   // A6000 最优参数
   const uint K10_NUM_THREADS = 128;
@@ -233,7 +254,6 @@ int main() {
   constexpr uint WMITER =
       (K10_WM * K10_WN) / (32 * K10_TM * K10_TN * K10_WNITER);
 
-  // 编译时约束检查
   static_assert((K10_BN % K10_WN == 0) && (K10_BM % K10_WM == 0));
   static_assert((K10_BN / K10_WN) * (K10_BM / K10_WM) == NUM_WARPS);
   static_assert((K10_WM * K10_WN) % (WARPSIZE * K10_TM * K10_TN * K10_WNITER) == 0);
@@ -283,29 +303,18 @@ int main() {
   float elapsed_ms;
   CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
   float avg_ms = elapsed_ms / num_iter;
-  CUDA_CHECK(cudaEventDestroy(start));
-  CUDA_CHECK(cudaEventDestroy(stop));
-
-  CUDA_CHECK(cudaMemcpy(C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  CUDA_CHECK(cudaMemcpy(C_ref, d_C_ref, M * N * sizeof(float), cudaMemcpyDeviceToHost));
-  bool correct = verify_matrix(C_ref, C, M * N);
-
   double gflops = calculate_gflops(M, N, K, avg_ms);
 
-  // 计算相对 cuBLAS 的性能百分比（cuBLAS 约 23249.6 GFLOPS）
-  double pct_of_cublas = gflops / 23249.6 * 100.0;
-
   printf("\n========================================\n");
   printf("实验结果:\n");
-  printf("  正确性: %s\n", correct ? "通过" : "失败");
   printf("  平均耗时: %.4f ms\n", avg_ms);
   printf("  计算性能: %.1f GFLOPS/s\n", gflops);
-  printf("  相对 cuBLAS: %.1f%%\n", pct_of_cublas);
-  printf("  预期性能: ~21779.3 GFLOPS/s (93.7%% of cuBLAS)\n");
+  printf("  预期性能: ~21779.3 GFLOPS/s\n");
   printf("========================================\n");
 
-  cublasDestroy(handle);
-  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_ref);
-  free(A); free(B); free(C); free(C_ref);
-  return correct ? 0 : 1;
-}
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+  cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
+  free(A); free(B); free(C);
+  return 0;
+}
\ No newline at end of file
diff --git a/outputs/gpu-programming-course/code/labs/lab8_start.cu b/outputs/gpu-programming-course/code/labs/lab8_start.cu
index 996dcfe..6505c6b 100644
--- a/outputs/gpu-programming-course/code/labs/lab8_start.cu
+++ b/outputs/gpu-programming-course/code/labs/lab8_start.cu
@@ -13,8 +13,12 @@
  * 编译: nvcc lab8_start.cu -o lab8_start -lcublas
  */
 
-#include "sgemm_common.h"
-#include <cublas_v2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cstring>
+#include <cmath>
+#include <cuda_runtime.h>
 
 // Warp size 是硬件常量，不是 C++ constexpr
 #define WARPSIZE 32
diff --git "a/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md" "b/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md"
index c6fe026..27ead18 100644
--- "a/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md"	
+++ "b/outputs/gpu-programming-course/docs/labs/\345\256\236\351\252\2141 \346\234\264\347\264\240\347\237\251\351\230\265\344\271\230\346\263\225\345\256\236\347\216\260.md"	
@@ -52,7 +52,6 @@ nvcc --version
 nvidia-smi
 ```
 
-然后创建本次实验的工作目录，并准备好 `sgemm_common.h` 公共头文件（已提供，包含计时、错误检查、矩阵初始化等工具函数）。
 
 ### 步骤2：理解矩阵乘法的数据依赖