% software.tex --- Beamer slides: CUDA software stack and programming model
\section{Software}
\subsection{GPGPU \& CUDA}
\begin{frame}
\frametitle{GPGPU \& CUDA}
\begin{itemize}
\item GPU (Graphics Processing Unit):
originally developed for graphical applications.
\item GP-GPU: General-Purpose GPU, i.e.\
the use of GPUs beyond graphical applications.
\textbf{\textcolor{red}{CAVEAT}}: problem must be reformulated
in terms of the graphics API.
\item \textbf{\textcolor{green}{2007}}: NVIDIA introduces the
\textbf{\textcolor{blue}{CUDA}} framework
(\textbf{\textcolor{blue}{C}}ompute
\textbf{\textcolor{blue}{U}}nified
\textbf{\textcolor{blue}{D}}evice
\textbf{\textcolor{blue}{A}}rchitecture).
The CUDA software stack consists of the CUDA Driver and the
\href{https://developer.nvidia.com/cuda-downloads}{CUDA Toolkit}
(\texttt{nvcc}, \texttt{nvprof}, \ldots, libraries, header files).
\item CUDA API: extension of the \texttt{C} language.
\item Handles GPU thread-level parallelism.
\item Deals with moving data between CPU and GPU.
\item Also supports \CC\,, \texttt{Fortran} and \texttt{Python}.
\end{itemize}
\end{frame}
% The following image was retrieved from:
% https://docs.nvidia.com/deploy/cuda-compatibility/index.html
\begin{frame}
\frametitle{Schema of CUDA Components}
\centering
\includegraphics[width=0.75\textwidth,
alt={Diagram showing the schema of CUDA software components}]{./img/CUDAcomponents.png}
\par{\small Schema of the CUDA Components\par}
\end{frame}
\subsection{Structure of a GPU computation}
\begin{frame}
\frametitle{Structure of a GPU computation}
\begin{enumerate}
\item \textbf{\textcolor{blue}{Allocate}} memory space on the GPU device.
\item \textbf{\textcolor{blue}{Transfer}} the data from the CPU to the GPU device.
\item Perform the \textbf{\textcolor{blue}{calculation}} on the GPU device:
\textbf{\textcolor{blue}{kernel}} (function executed on the GPU).
To enhance performance: keep data as long as possible on the GPU device.
\item \textbf{\textcolor{blue}{Transfer}} the result back from the GPU device to the CPU.
\item \textbf{\textcolor{blue}{Deallocate}} memory space on the GPU device.
\end{enumerate}
\textbf{\textcolor{orange}{Note}}: source code \& makefile available in \texttt{./src}
\end{frame}
% Allocate & deallocate of GLOBAL memory
\begin{frame}
\frametitle{Alloc.\ \& free of global memory on the GPU}
\begin{itemize}
\item \texttt{cudaError\_t} --- error-code type returned by the CUDA runtime API calls
(\texttt{cudaSuccess} if the call succeeded).
\item \texttt{cudaError\_t cudaMalloc(void **devPtr, size\_t size)}\\
Allocates memory on the device.
\item \texttt{cudaError\_t cudaFree(void *devPtr)}\\
Frees memory on the device.
\end{itemize}
\textbf{\textcolor{orange}{Note}}: full example in \texttt{./latexinc/ex1.cu}
\end{frame}
% Copy data between host and device
\begin{frame}
\frametitle{Copy data between host (CPU) and device (GPU)}
\texttt{cudaMemcpy} signature:
\begin{itemize}
\item \texttt{cudaError\_t cudaMemcpy(}\\
\hspace{4ex}\texttt{void *dst, const void *src,}\\
\hspace{4ex}\texttt{size\_t count, cudaMemcpyKind kind)}
\end{itemize}
\textbf{\textcolor{orange}{Note}}: full example in \texttt{./latexinc/memcpy.cu}
\end{frame}
\begin{frame}
\frametitle{Copy data --- direction (\texttt{kind})}
\begin{itemize}
\item \texttt{cudaMemcpyHostToHost}
\item \texttt{cudaMemcpyHostToDevice}
\item \texttt{cudaMemcpyDeviceToHost}
\item \texttt{cudaMemcpyDeviceToDevice}
\end{itemize}
\textbf{\textcolor{orange}{Note}}: full example in \texttt{./latexinc/ex2.cu}
\end{frame}
% Kernel - split into two frames
\begin{frame}
\frametitle{CUDA Kernel --- Declaration}
\begin{itemize}
\item \textbf{\textcolor{blue}{CUDA kernel}}: function which may run on a GPU device.
\item \textbf{\textcolor{blue}{Kernel declaration}} syntax:\\
\hspace{6ex}\textit{funcspec} \texttt{void}
\textit{kernelName}(\textit{args})\{ \textit{body} \}
\item \textit{funcspec}: \texttt{\_\_global\_\_} (kernel; must return \texttt{void}),
\texttt{\_\_host\_\_}, or \texttt{\_\_device\_\_}
\item \textit{kernelName}: name of the kernel/CUDA function.
\item \textit{args}: argument list of the kernel/CUDA function.
\item \textit{body}: body of the kernel/CUDA function (your code).
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{CUDA Kernel --- Call syntax}
\begin{itemize}
\item \textbf{\textcolor{blue}{Kernel call}} syntax:\\
\hspace{6ex}\textit{kernelName}\texttt{<{}<{}<}\textit{gridSize,blockSize}\texttt{>{}>{}>}(\textit{args})
\item \textit{gridSize}: size of the grid of thread blocks.
\item \textit{blockSize}: size of a thread block.
\end{itemize}
\end{frame}
% Kernel (Part 2)
\begin{frame}
\frametitle{Function type qualifiers}
\begin{center}
\begin{tabular}{l|l|l}
Qualifier & Called from & Executed on \\
\hline
\texttt{\_\_global\_\_} & host & device \\
\texttt{\_\_host\_\_} & host & host \\
\texttt{\_\_device\_\_} & device & device \\
\hline
\end{tabular}
\end{center}
\textbf{\textcolor{orange}{Note:}}
A function can carry both \texttt{\_\_host\_\_} \& \texttt{\_\_device\_\_}:
it is then compiled twice, once for the host \& once for the device.
\end{frame}
% Grid, Blocks and Threads
\begin{frame}
\frametitle{Grids, Blocks and Threads}
CUDA threads are organized in a (software) hierarchy: Grid, Blocks, Threads.
\begin{itemize}
\item \texttt{uint3}, \texttt{dim3}:
CUDA structures of unsigned integers \texttt{x}, \texttt{y}, \texttt{z}.
\texttt{dim3} is based on \texttt{uint3} but initializes unspecified
components to $1$.
\item \textbf{\textcolor{blue}{Grid}}: each Grid consists of Blocks.
\texttt{dim3 gridDim}: dimensions of the Grid.
\texttt{uint3 blockIdx}: block index within the Grid.
\item \textbf{\textcolor{blue}{Block}}: each Block consists of Threads.
\texttt{dim3 blockDim}: dimensions of the Block.
\texttt{uint3 threadIdx}: thread index within the Block.
\end{itemize}
\end{frame}