intro-programming-gpu/chpc.tex at main · chpc-uofu/intro-programming-gpu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
\section{Use of GPUs at the CHPC}

\subsection{GPUs available at CHPC}

\subsubsection{Regular env.: lp/kp/np/grn clusters}

% Slide 1 of 2 for the regular environment (lp/kp/np/grn clusters):
% GPU device types listed in ascending order of compute capability (5.2 to 8.0).
% Every device name is a hyperlink to an external spec page or datasheet.
\begin{frame}
  \frametitle{GPU devices on lp/kp/np/grn}
  \begin{center}
    % \small is scoped by the center environment and only shrinks the table.
    \small
    \begin{tabular}{l|c}
      \texttt{GPU device type} & \texttt{CC} \\
      \hline
      \href{https://www.nvidia.com/en-us/geforce/graphics-cards/geforce-gtx-titan-x/specifications/}{\texttt{NVIDIA GeForce GTX TITAN X}} & 5.2 \\
      \href{https://images.nvidia.com/content/tesla/pdf/nvidia-tesla-p100-PCIe-datasheet.pdf}{\texttt{Tesla P100-PCIE-16GB}} & 6.0 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/documents/nvidia-p40-datasheet.pdf}{\texttt{Tesla P40}} & 6.1 \\
      \href{https://www.nvidia.com/en-us/geforce/10-series/}{\texttt{NVIDIA GeForce GTX 1080 Ti}} & 6.1 \\
      \href{https://www.gpuzoo.com/GPU-NVIDIA/Titan_V.html}{\texttt{NVIDIA Titan V}} & 7.0 \\
      \href{https://images.nvidia.com/content/technologies/volta/pdf/tesla-volta-v100-datasheet-letter-fnl-web.pdf}{\texttt{NVIDIA Tesla V100-PCIE-16GB}} & 7.0 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-product-brief.pdf}{\texttt{Tesla T4}} & 7.5 \\
      \href{https://www.techpowerup.com/gpu-specs/geforce-rtx-2080-ti.c3305}{\texttt{NVIDIA GeForce RTX 2080 Ti}} & 7.5 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf}{\texttt{NVIDIA A100-PCIe-40GB}} & 8.0 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf}{\texttt{NVIDIA A100-SXM4-80GB}} & 8.0 \\
      \href{https://www.nvidia.com/en-us/design-visualization/a800/}{\texttt{NVIDIA A800 40GB Active}} & 8.0 \\
      \hline
    \end{tabular}
  \end{center}
  % \small is a declaration, not a one-argument command, so it is scoped with
  % a group here. The \par inside the group ends the paragraph while the small
  % size (and its line spacing) is still in effect.
  \par{\small GPU devices on lp/kp/np/grn (10/31/2025). CC = compute capability.\par}
\end{frame}

% Slide 2 of 2 for the regular environment (lp/kp/np/grn clusters):
% continues the device list in ascending compute capability (8.6 to 12.0).
% Every device name is a hyperlink to an external spec page or datasheet.
\begin{frame}
  \frametitle{GPU devices on lp/kp/np/grn (cont.)}
  \begin{center}
    % \small is scoped by the center environment and only shrinks the table.
    \small
    \begin{tabular}{l|c}
      \texttt{GPU device type} & \texttt{CC} \\
      \hline
      \href{https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3090-3090ti/}{\texttt{NVIDIA GeForce RTX 3090}} & 8.6 \\
      \href{https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf}{\texttt{NVIDIA A40}} & 8.6 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/gtcs22/design-visualization/quadro-product-literature/proviz-nvidia-rtx-a5500-datasheet-2130578-r3-us-web.pdf}{\texttt{NVIDIA RTX A5500}} & 8.6 \\
      \href{https://www.nvidia.com/en-us/design-visualization/rtx-a6000/}{\texttt{NVIDIA RTX A6000}} & 8.6 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/rtx-6000/proviz-print-rtx6000-datasheet-web-2504660.pdf}{\texttt{NVIDIA RTX 6000 Ada Generation}} & 8.9 \\
      \href{https://www.nvidia.com/en-us/data-center/l40/}{\texttt{NVIDIA L40}} & 8.9 \\
      \href{https://resources.nvidia.com/en-us-l40s/l40s-datasheet-28413}{\texttt{NVIDIA L40S}} & 8.9 \\
      \href{https://www.nvidia.com/en-us/data-center/h100/}{\texttt{NVIDIA H100 [NVL]}} & 9.0 \\
      \href{https://www.nvidia.com/en-us/data-center/h200/}{\texttt{NVIDIA H200 [NVL]}} & 9.0 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/products/workstations/professional-desktop-gpus/rtx-pro-6000-max-q/workstation-datasheet-blackwell-rtx-pro-6000-max-q-nvidia-3519233.pdf}{\texttt{NVIDIA RTX PRO 6000 Blackwell Max-Q}} & 12.0 \\
      \hline
    \end{tabular}
  \end{center}
  % \small is a declaration, not a one-argument command, so it is scoped with
  % a group here. The \par inside the group ends the paragraph while the small
  % size (and its line spacing) is still in effect.
  \par{\small GPU devices on lp/kp/np/grn (10/31/2025). CC = compute capability.\par}
\end{frame}

\subsubsection{Protected env.: redwood cluster}

% Slide for the protected environment (redwood cluster):
% GPU device types listed in ascending order of compute capability (6.1 to 9.0).
% Every device name is a hyperlink to an external spec page or datasheet.
\begin{frame}
  \frametitle{GPU devices on redwood}
  \begin{center}
    % \small is scoped by the center environment and only shrinks the table.
    \small
    \begin{tabular}{l|c}
      \texttt{GPU device type} & \texttt{CC} \\
      \hline
      \href{https://www.nvidia.com/en-us/geforce/10-series/}{\texttt{NVIDIA GeForce GTX 1080 Ti}} & 6.1 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf}{\texttt{NVIDIA A100-SXM4-40GB}} & 8.0 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf}{\texttt{NVIDIA A100 80GB PCIe}} & 8.0 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/data-center/products/a30-gpu/pdf/a30-datasheet.pdf}{\texttt{NVIDIA A30}} & 8.0 \\
      \href{https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf}{\texttt{NVIDIA A40}} & 8.6 \\
      \href{https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/rtx-6000/proviz-print-rtx6000-datasheet-web-2504660.pdf}{\texttt{NVIDIA RTX 6000 Ada Generation}} & 8.9 \\
      % NOTE(review): H100 is written "NVL" here while the H200 row below uses
      % "[NVL]" -- confirm whether the missing brackets are intentional.
      \href{https://www.nvidia.com/en-us/data-center/h100/}{\texttt{NVIDIA H100 NVL}} & 9.0 \\
      \href{https://www.nvidia.com/en-us/data-center/h200/}{\texttt{NVIDIA H200 [NVL]}} & 9.0 \\
      \hline
    \end{tabular}
  \end{center}
  % \small is a declaration, not a one-argument command, so it is scoped with
  % a group here. The \par inside the group ends the paragraph while the small
  % size (and its line spacing) is still in effect.
  \par{\small GPU devices on redwood (10/31/2025). CC = compute capability.\par}
\end{frame}

\subsection{How to access the GPUs at CHPC}

% Slide: pointer to the CHPC GPU presentation, plus a caveat about checking
% on several GPU jobs that share one node (each job has its own cgroup).
\begin{frame}
  \frametitle{Accessing GPUs at CHPC}
  \begin{itemize}
    % Link to the presentation slides.
    \item \href{https://www.chpc.utah.edu/presentations/images-and-pdfs/usinggpuss24f.pdf}{Using GPUs at the CHPC (Presentation by Martin \v{C}uma)}
    % One cgroup per GPU job.
    \item \textbf{\textcolor{orange}{Note:}}
      When a GPU job is launched it runs with its own \href{https://en.wikipedia.org/wiki/Cgroups}{cgroup} (limits/accounts for its own resources).
    % Consequence when ssh-ing into a node that hosts several of the user's jobs.
    \item When a \$USER has several GPU jobs running on the \textbf{same} node, the \$USER will land in \textbf{one} cgroup belonging to one of their jobs when \texttt{ssh}ing into that node.
      Therefore, the \$USER \textbf{cannot} verify the status of other jobs using \texttt{nvidia-smi} directly.
    % Workaround: run nvidia-smi as an overlapping step of the chosen job.
    \item \(\Rightarrow\) Use instead:\\
      \texttt{srun --pty --overlap --jobid \$JOBID /usr/bin/nvidia-smi}\\
      where \texttt{JOBID} is the job identifier.
  \end{itemize}
\end{frame}