intro-programming-gpu/software2.tex at main · chpc-uofu/intro-programming-gpu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
% Matrix Multiplication
\subsection{Case study: matrix multiplication}

\begin{frame}
  \frametitle{Preliminary notion: matrix storage as a $1$D vector}
  \centering
  \includegraphics[width=0.65\textwidth,
    alt={Diagram of an 8x8 matrix P stored as a 1D vector}]{./img/matrix8x8.png}
  \par{\small P: $8\times8$ Matrix ($d=8$)\par}
  \begin{itemize}
    \item $P[i,j]$ is stored as $1$D vector: $P[i*d+j]$ where $i,j\,\in [0,8)$.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Matrix Multiplication $P=M\,N$}
  Matrix multiplication: $P=M \; N$ where $M,N \, \in \, \mathbb{R}^{d\times d}$
  \begin{itemize}
    \item $P[i,j] = \displaystyle \sum_{k=0}^{d-1} M[i,k]\;N[k,j]$
    \item $M[i,k]$ is stored as: $M[i*d+k]$
    \item $N[k,j]$ is stored as: $N[k*d+j]$
    \item Therefore,
          \begin{equation}
            P[i*d+j] = \sum_{k=0}^{d-1} M[i*d+k]\;N[k*d+j] \label{Eq:MatMul}
          \end{equation}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Matrix Mul.: kernel (v.\,$1$)}
  \begin{itemize}
    \item Each Thread (\texttt{threadIdx}) is a $2$D object:
          (\texttt{threadIdx.x}, \texttt{threadIdx.y})
          (cfr.\ a point in plane geometry).
    \item Each Thread calculates only $1$ element of $P$.
  \end{itemize}
  Implementation of Eq.~\eqref{Eq:MatMul} using $1$ Block of Threads.\\
  \textbf{\textcolor{orange}{Note}}: see \texttt{./latexinc/matmul/1/mul.cu}
\end{frame}

\begin{frame}
  \frametitle{Invoking kernel (v.\,$1$)}
  \begin{itemize}
    \item Invoking $1$ Block of Threads.
    \item \textbf{\textcolor{orange}{Note}}: see \texttt{./latexinc/matmul/1/main-sel.cu}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Mat.\ Mul.\ (v.\,$2$): Grid of $2$D Blocks}
  \begin{itemize}
    \item \texttt{int tx = blockIdx.x*blockDim.x + threadIdx.x;}
    \item \texttt{int ty = blockIdx.y*blockDim.y + threadIdx.y;}
  \end{itemize}
  \centering
  \includegraphics[width=0.50\textwidth,
    alt={Diagram of a 2D grid of 2D blocks of threads}]{./img/BlockGrid.eps}
  \par\small{$2$D-Grid of $2$D-Blocks of Threads}
\end{frame}

\begin{frame}
  \frametitle{Matrix Mul.: visualization (v.\,$2$)}
  \centering
  \includegraphics[width=0.85\textwidth,
    alt={Visualization of matrix multiplication using a 2D grid}]{./img/mulB.jpg}
  \par\small{Matrix Mul.\ ($2$D Grid)}
\end{frame}

\begin{frame}
  \frametitle{Matrix Mul.: kernel (v.\,$2$)}
  \textbf{\textcolor{orange}{Note}}: see \texttt{./latexinc/matmul/2/mul.cu}
\end{frame}

\begin{frame}
  \frametitle{Invoking kernel (v.\,$2$)}
  \begin{itemize}
    \item Invoking a grid of blocks of threads.
    \item \textbf{\textcolor{orange}{Note}}: see \texttt{./latexinc/matmul/2/main-sel.cu}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Types of GPU memory}
  \begin{itemize}
    \item \textbf{\textcolor{blue}{global memory}}: largest, slowest, often the bottleneck.
    \item \textbf{\textcolor{blue}{constant memory}}: cached, read-only.
          \texttt{\_\_constant\_\_}: constant memory space specifier.
    \item \textbf{\textcolor{blue}{registers}}: fast, on-chip memory (exclusive to each thread).
    \item \textbf{\textcolor{blue}{shared memory}}: allocated per thread block \& low latency.
          \texttt{\_\_shared\_\_}: shared memory space specifier.
          \texttt{\_\_syncthreads()}: barrier function which forces all threads in a
          block to wait until all threads have arrived before proceeding
          (block level synchronization).
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Matrix Mul.: use of shared memory (v.\,$3$)}
  \centering
  \includegraphics[width=0.85\textwidth,
    alt={Diagram showing matrix multiplication using shared memory}]{./img/mulC.jpg}
  \par\small{Matrix Mul.: use of shared memory}
\end{frame}

\begin{frame}
  \frametitle{Matrix Mul.: kernel (v.\,$3$) --- use of shared memory}
  \textbf{\textcolor{orange}{Note}}: see \texttt{./latexinc/matmul/3/mul.cu}
\end{frame}

\subsection{Building CUDA applications \& useful env.\ variables}

\begin{frame}
  \frametitle{Building/Compiling CUDA applications}
  General scheme:
  \begin{itemize}
    \item Source code for CUDA applications:
          \texttt{C}/\CC\ host code with extensions to deal with the device(s).
          Other programming languages are allowed e.g.\ \texttt{Fortran}.
    \item Primo: \textbf{\textcolor{green}{separate}} device functions from host code.
    \item \textbf{\textcolor{green}{Device code}}: preprocessing, compilation
          with the NVIDIA compiler (\texttt{nvcc}).
    \item \textbf{\textcolor{green}{Host code}}: preprocessing, compilation with a host
          (\texttt{C}/\CC) compiler (e.\,g.\ \texttt{gcc, g++, icc, icpc, \ldots}).
    \item Compiled device functions are \textbf{\textcolor{green}{embedded}} as
          \texttt{fatbinary} images in the host object file.
    \item \textbf{\textcolor{green}{Linking stage}}: adding CUDA runtime libraries
          to the host object file to create an executable.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Further concepts}
  \begin{itemize}
    \item \texttt{.cu}: Suffix for CUDA source file (host code (\texttt{C},\CC) \& device code).
    \item \texttt{.cuf}: Suffix for CUDA source file (host code (\texttt{Fortran}) \& device code).
    \item \texttt{.ptx}: Suffix for \textbf{P}arallel \textbf{T}hread E\textbf{x}ecution
          (\texttt{PTX}) files --- an intermediate representation similar to assembly
          for a \textbf{\textcolor{blue}{virtual GPU architecture}}
          (prefix: \texttt{compute\_}, e.\,g.\ \texttt{compute\_70}).
    \item \texttt{.cubin}: Suffix for the \textbf{CU}DA device \textbf{bin}ary file
          pertaining to a \textbf{\textcolor{blue}{real GPU architecture}}
          (prefix: \texttt{sm\_}, e.\,g.\ \texttt{sm\_70}).
          \textcolor{orange}{Memento}: \texttt{sm} stands for the physical streaming multiprocessor.
    \item \texttt{fatbin}: Multiple \texttt{PTX} [\& \texttt{cubin}] files are merged
          into a \texttt{fatbin} file.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Compilation trajectory}
  \centering
  \includegraphics[width=0.65\textwidth,
    alt={Diagram showing the CUDA compilation trajectory}]{./img/compileTrajectory.png}
  \par\small{Compilation trajectory}
\end{frame}

\begin{frame}
  \frametitle{In praxi,}
  We will now address the following questions:
  \begin{itemize}
    \item What are the recent CUDA architectures?
    \item How to find the compute capability (CC) of a device?
    \item How to build an executable for a particular device?
    \item How to build an executable for multiple architectures?
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Recent CUDA Architectures/Generations}
  \begin{itemize}
    \item NVIDIA GPU: $xy$ where $x$: generation (major), $y$: minor.
    \item New generation: major improvement in functionality/chip.
    \item \textbf{Binary} compatibility is \textbf{NOT} guaranteed among generations.
  \end{itemize}
  \begin{center}
    \begin{tabular}{cccc}
      \toprule
      Architecture      & Year & \texttt{compute\_} & \texttt{sm\_} \\
                        &      & (\textit{virtual})  & (\textit{real}) \\
      \midrule
      \texttt{Maxwell}      & 2014 & 50, 52, 53    & 50, 52, 53   \\
      \texttt{Pascal}       & 2016 & 60, 61, 62    & 60, 61, 62   \\
      \texttt{Volta}        & 2017 & 70, 72        & 70, 72       \\
      \texttt{Turing}       & 2018 & 75            & 75           \\
      \texttt{Ampere}       & 2020 & 80, 86, 87    & 80, 86, 87   \\
      \texttt{Ada Lovelace} & 2022 & 89            & 89           \\
      \texttt{Hopper}       & 2022 & 90, 90a       & 90, 90a      \\
      \texttt{Blackwell}    & 2024 & 100, 100a     & 100, 100a    \\
      \bottomrule
    \end{tabular}
  \end{center}
  \par\small{Some of the recent CUDA architectures (10/31/2025)}
\end{frame}

\begin{frame}
  \frametitle{Retrieval of the Compute Capability (CC)}
  You can:
  \begin{enumerate}
    \item Use \texttt{nvidia-smi} to display the architecture.
          \texttt{nvidia-smi} is an \textbf{\textcolor{green}{extremely powerful}} tool
          to query the state \& specs of the devices attached to a node.
    \item Use \texttt{nvaccelinfo}
          (part of \href{https://developer.nvidia.com/hpc-sdk}{NVIDIA HPC SDK}).
    \item Write basic \texttt{C}/\CC\ code relying on CUDA APIs
          (available in \texttt{src/devicequery}, executable: \texttt{devinfo}):
          \texttt{cudaGetDeviceCount(int *tot)}: returns number of devices.
          \texttt{cudaGetDeviceProperties(cudaDeviceProp *p, int idev)}:
          returns information about device \texttt{idev}.
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{Use of \texttt{nvidia-smi}}
  \centering
  \includegraphics[width=0.85\textwidth,
    alt={Screenshot of nvidia-smi output}]{./img/nvidia-smi2.png}
  \par\small{\texttt{nvidia-smi}}
\end{frame}

\begin{frame}
  \frametitle{Retrieval of CC using \texttt{nvaccelinfo}}
  \centering
  \includegraphics[width=0.75\textwidth,
    alt={Screenshot of nvaccelinfo output showing compute capability}]{./img/nvaccelinfo.png}
  \par\small{Use of \texttt{nvaccelinfo} to retrieve \texttt{compute\_}}
\end{frame}

\begin{frame}
  \frametitle{Retrieval of CC through some simple CUDA APIs}
  \centering
  \includegraphics[width=0.70\textwidth,
    alt={Screenshot of devinfo output based on CUDA APIs}]{./img/devinfo.png}
  \par\small{\texttt{devinfo} based on a few CUDA APIs}
\end{frame}

\begin{frame}
  \frametitle{Compiling your code for a particular device}
  \begin{itemize}
    \item Compilation goes in $2$ steps:
    \item Step 1 --- \texttt{PTX} representation: generic assembly for a
          \textit{virtual} (\texttt{compute\_}) GPU architecture.
          The \texttt{.ptx} file is human readable (text file).
    \item Step 2 --- \texttt{Binary generation}: object file for the
          \textit{real} (\texttt{sm\_}) GPU architecture (based on the \texttt{PTX} file).
    \item \texttt{-arch}/\texttt{-code} flags:
    \item \texttt{--gpu-architecture|-arch <arch>}: specifies $1$ \textit{virtual}
          GPU architecture, e.\,g.\ \texttt{-arch=compute\_50}.
    \item \texttt{--gpu-code|-code <arch>}: specifies the \textit{real}
          GPU architecture(s), e.\,g.\ \texttt{-code=sm\_52}.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Compiling your code (cont.)}
  \begin{itemize}
    \item Choose $1$ virtual architecture and the accompanying real architectures,
          e.\,g.\ \texttt{-arch=compute\_50 -code=sm\_50,sm\_52,sm\_53}:
          \texttt{PTX} file generated for \texttt{compute\_50} (\textit{virtual}).
          \textbf{fatbinary} object created for \texttt{sm\_50,sm\_52,sm\_53} (\textit{real}).
    \item \texttt{--generate-code|-gencode arch=<arch>,code=<code>}:
          \textbf{\textcolor{green}{Generalization}} of
          \texttt{--gpu-architecture=<arch> --gpu-code=<code>}.
          Allows creation of binaries for different architectures.
    \item Example: see \texttt{./latexinc/cuflags.txt}
  \end{itemize}
\end{frame}

\subsection{Profiling \& debugging}

\begin{frame}
  \frametitle{Profiling \& debugging}
  CUDA SDK comes with:
  \begin{itemize}
    \item Its own profiler:
          \href{https://docs.nvidia.com/cuda/profiler-users-guide/}{\texttt{nvprof}}.
    \item Its own debugger:
          \href{https://docs.nvidia.com/nsight-visual-studio-edition/cuda-debugger/}{\texttt{Nsight}}.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Profiling \texttt{mul3} using nvprof}
  \centering
  \includegraphics[width=0.90\textwidth,
    alt={Screenshot of nvprof profiling output for mul3}]{./img/nvprof-mul3.png}
  \par\small{Profiling \texttt{mul3} on \texttt{notch001}}
\end{frame}

\subsection{Important CUDA libraries}

\begin{frame}
  \frametitle{Important CUDA libraries}
  To increase performance, use highly-optimized libraries:
  \begin{itemize}
    \item \href{https://developer.nvidia.com/cublas}{\textbf{\textcolor{green}{cuBLAS}}}:
          \textbf{B}asic \textbf{L}inear \textbf{A}lgebra \textbf{S}ubroutines on NVIDIA GPUs.
    \item \href{https://icl.utk.edu/magma/}{\textbf{\textcolor{green}{MAGMA}}}:
          \textbf{M}atrix \textbf{A}lgebra on \textbf{G}PU and \textbf{M}ulti-core \textbf{A}rchitectures.
    \item \href{https://docs.nvidia.com/cuda/pdf/CURAND_Library.pdf}{\textbf{\textcolor{green}{cuRAND}}}:
          Random Number Generation library.
    \item \href{https://docs.nvidia.com/cuda/pdf/CUFFT_Library.pdf}{\textbf{\textcolor{green}{cuFFT}}}:
          CUDA \textbf{F}ast \textbf{F}ourier \textbf{T}ransform library.
    \item \href{https://developer.nvidia.com/nccl}{\textbf{\textcolor{green}{NCCL}}}:
          \textbf{N}VIDIA \textbf{C}ollective \textbf{C}ommunications \textbf{L}ibrary.
    \item \href{https://developer.nvidia.com/cudnn}{\textbf{\textcolor{green}{cuDNN}}}:
          CUDA \textbf{D}eep \textbf{N}eural \textbf{N}etwork library.
    \item \href{https://developer.nvidia.com/cutensor}{\textbf{\textcolor{green}{cuTENSOR}}}:
          GPU-accelerated Tensor Linear Algebra.
    \item \href{https://developer.nvidia.com/DALI}{\textbf{\textcolor{green}{DALI}}}:
          NVIDIA \textbf{Da}ta Loading \textbf{Li}brary.
    \item $\ldots$
  \end{itemize}
\end{frame}

\subsection{Alternatives to CUDA}

\begin{frame}
  \frametitle{Alternatives to CUDA}
  \begin{itemize}
    \item Similar to CUDA:
          \href{https://www.amd.com/en/products/software/rocm.html}{\textbf{\textcolor{green}{ROCm}}} (AMD).
    \item \href{https://www.openacc.org/}{\textbf{\textcolor{green}{OpenACC}}}
          (use of directives, cfr.\ \texttt{OpenMP}):
          supported by GCC (NVIDIA \& AMD GPUs),
          NVIDIA HPC SDK (formerly PGI),
          and Sourcery CodeBench (AMD GPU).
    \item Higher-level abstractions:
          \href{https://www.kokkos.org/about/core/}{\textbf{\textcolor{green}{Kokkos}}}
          (programming model for parallel algorithms on many-core chips).
  \end{itemize}
\end{frame}

\subsection{Links}

\begin{frame}
  \frametitle{Links}
  \begin{itemize}
    \item \href{https://docs.nvidia.com/cuda/index.html}{CUDA Toolkit Documentation}
    \item \href{https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html}{CUDA \CC\ Programming Guide, Release $13.0$ (10/31/25)}
    \item \href{https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html}{CUDA \CC\ Best Practices Guide, Release $13.0$ (10/31/25)}
    \item \href{https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/}{NVIDIA CUDA Compiler Driver NVCC, Release $13.0$ (10/31/25)}
    \item \href{https://docs.nvidia.com/cuda/parallel-thread-execution/}{PTX \& ISA, Release $9.0$ (10/31/25)}
  \end{itemize}
\end{frame}