From 23e866c31996d4857cb412d499feb3ed440d72ca Mon Sep 17 00:00:00 2001 From: Swapnil Date: Fri, 15 May 2026 23:32:11 +0530 Subject: [PATCH 1/3] CONSOLE-GPU: Display GPU metrics on the Node Details page Adds a new GPU metrics section to the Node Details page that surfaces DCGM exporter metrics (utilization, temperature, power usage, framebuffer memory) per GPU device, along with summary information (GPU count, model, capacity, allocatable) from the Kubernetes Node resource. The section is only rendered for nodes that report GPU capacity (nvidia.com/gpu or amd.com/gpu) or have active DCGM metrics. PromQL queries use both Hostname and node label selectors joined with `or` to support common DCGM exporter labeling conventions. Includes unit tests for query generation helpers and component rendering. Co-authored-by: Cursor --- .../console-app/locales/en/console-app.json | 11 + .../src/components/nodes/NodeDetails.tsx | 3 + .../nodes/NodeDetailsGpuMetrics.tsx | 272 ++++++++++++++++++ .../__tests__/NodeDetailsGpuMetrics.spec.tsx | 156 ++++++++++ .../__tests__/nodeGpuMetricsQueries.spec.ts | 86 ++++++ .../components/nodes/nodeGpuMetricsQueries.ts | 74 +++++ 6 files changed, 602 insertions(+) create mode 100644 frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx create mode 100644 frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx create mode 100644 frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts create mode 100644 frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts diff --git a/frontend/packages/console-app/locales/en/console-app.json b/frontend/packages/console-app/locales/en/console-app.json index ce594b3c6a8..e745a58ca75 100644 --- a/frontend/packages/console-app/locales/en/console-app.json +++ b/frontend/packages/console-app/locales/en/console-app.json @@ -437,6 +437,17 @@ "Container runtime": "Container runtime", "Kubelet version": "Kubelet version", "Kube-Proxy version": "Kube-Proxy version", + "GPU metrics": "GPU metrics", + "GPU count": "GPU count", + "GPU model": "GPU model", + "GPU capacity": "GPU capacity", + "GPU allocatable": "GPU allocatable", + "GPU device": "GPU device", + "Temperature": "Temperature", + "Power usage": "Power usage", + "FB memory used": "FB memory used", + "FB memory free": "FB memory free", + "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.": "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.", "Machine set": "Machine set", "This count is based on your access permissions and might not include all virtual machines.": "This count is based on your access permissions and might not include all virtual machines.", "MachineConfigPool": "MachineConfigPool", diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx index 7d9f97596c2..a3edbb2223c 100644 --- a/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx +++ b/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx @@ -1,6 +1,8 @@ import type { FC } from 'react'; +import { PROMETHEUS_BASE_PATH } from '@console/internal/components/graphs/consts'; import type { NodeKind } from '@console/internal/module/k8s'; import NodeDetailsConditions from './NodeDetailsConditions'; +import NodeDetailsGpuMetrics from './NodeDetailsGpuMetrics'; import NodeDetailsImages from './NodeDetailsImages'; import NodeDetailsOverview from './NodeDetailsOverview'; @@ -11,6 +13,7 @@ type NodeDetailsProps = { const NodeDetails: FC = ({ obj: node }) => ( <> + {PROMETHEUS_BASE_PATH && } diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx new file mode 100644 index 00000000000..945fae2d320 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx @@ -0,0 +1,272 @@ +import type { FC } from 'react'; +import { useMemo } from 'react'; +import { + Bullseye, + DescriptionList, + DescriptionListDescription, + DescriptionListGroup, + DescriptionListTerm, + Spinner, +} from '@patternfly/react-core'; +import { useTranslation } from 'react-i18next'; +import type { PrometheusResponse, PrometheusResult } from '@console/internal/components/graphs'; +import { PrometheusEndpoint } from '@console/internal/components/graphs/helpers'; +import { usePrometheusPoll } from '@console/internal/components/graphs/prometheus-poll-hook'; +import { SectionHeading } from '@console/internal/components/utils/headings'; +import type { NodeKind } from '@console/internal/module/k8s'; +import PaneBody from '@console/shared/src/components/layout/PaneBody'; +import { + GpuMetricQuery, + getGpuMetricQueries, + nodeHasGpuCapacity, + GPU_RESOURCE_KEYS, +} from './nodeGpuMetricsQueries'; + +type GpuMetricResult = { + value: string; + modelName?: string; + device?: string; +}; + +type GpuDeviceRow = { + id: string; + label: string; + utilization: string; + temperature: string; + power: string; + fbUsed: string; + fbFree: string; +}; + +const resultsByGpu = ( + response: PrometheusResponse | undefined, +): Record => { + if (!response?.data?.result?.length) { + return {}; + } + return response.data.result.reduce>( + (acc, r: PrometheusResult) => { + const gpu = r.metric?.gpu ?? r.metric?.GPU_I_ID ?? r.metric?.UUID ?? r.metric?.device ?? ''; + acc[gpu] = { + value: r.value?.[1] ?? '', + modelName: r.metric?.modelName, + device: r.metric?.device, + }; + return acc; + }, + {}, + ); +}; + +const collectGpuIds = (...maps: Record[]): string[] => { + const ids = new Set(); + maps.forEach((m) => Object.keys(m).forEach((k) => ids.add(k))); + return [...ids].sort(); +}; + +const gpuDeviceLabel = (gpuId: string, meta: GpuMetricResult | undefined): string => { + const index = `GPU ${gpuId}`; + const model = meta?.modelName; + if (model) { + return `${index} \u2014 ${model}`; + } + const dev = meta?.device; + if (dev) { + return `${index} (${dev})`; + } + return index; +}; + +const findFirstMeta = (...maps: Record[]): GpuMetricResult | undefined => { + for (const m of maps) { + for (const entry of Object.values(m)) { + if (entry.modelName) return entry; + } + } + return Object.values(maps[0] ?? {})[0]; +}; + +const formatValue = (val: string | undefined, suffix: string): string => { + if (val === undefined || val === '') return '-'; + const num = parseFloat(val); + if (Number.isNaN(num)) return '-'; + return `${Math.round(num * 10) / 10} ${suffix}`; +}; + +const formatMemMiB = (val: string | undefined): string => { + if (val === undefined || val === '') return '-'; + const mib = parseFloat(val); + if (Number.isNaN(mib)) return '-'; + if (mib >= 1024) return `${(mib / 1024).toFixed(1)} GiB`; + return `${Math.round(mib)} MiB`; +}; + +type NodeDetailsGpuMetricsProps = { + node: NodeKind; +}; + +const NodeDetailsGpuMetrics: FC = ({ node }) => { + const { t } = useTranslation(); + const nodeName = node.metadata.name; + + const hasCapacity = nodeHasGpuCapacity(node.status?.capacity); + + const queries = useMemo(() => getGpuMetricQueries(nodeName), [nodeName]); + + const [countResponse, , countLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_COUNT], + }); + const [utilResponse, , utilLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_UTILIZATION], + }); + const [tempResponse, , tempLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_TEMPERATURE], + }); + const [powerResponse, , powerLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_POWER_USAGE], + }); + const [fbUsedResponse, , fbUsedLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_FB_USED], + }); + const [fbFreeResponse, , fbFreeLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_FB_FREE], + }); + + const isLoading = + countLoading || utilLoading || tempLoading || powerLoading || fbUsedLoading || fbFreeLoading; + + const utilMap = useMemo(() => resultsByGpu(utilResponse), [utilResponse]); + const tempMap = useMemo(() => resultsByGpu(tempResponse), [tempResponse]); + const powerMap = useMemo(() => resultsByGpu(powerResponse), [powerResponse]); + const fbUsedMap = useMemo(() => resultsByGpu(fbUsedResponse), [fbUsedResponse]); + const fbFreeMap = useMemo(() => resultsByGpu(fbFreeResponse), [fbFreeResponse]); + + const gpuIds = useMemo(() => collectGpuIds(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap), [ + utilMap, + tempMap, + powerMap, + fbUsedMap, + fbFreeMap, + ]); + + const hasMetrics = gpuIds.length > 0; + + if (!hasCapacity && !isLoading && !hasMetrics) { + return null; + } + + const gpuCountValue = countResponse?.data?.result?.[0]?.value?.[1]; + const gpuCountStr = + gpuCountValue !== undefined && gpuCountValue !== '' + ? String(Math.round(parseFloat(gpuCountValue))) + : undefined; + + const gpuCapacityStr = GPU_RESOURCE_KEYS.map((key) => node.status?.capacity?.[key]) + .filter(Boolean) + .join(', '); + const gpuAllocatableStr = GPU_RESOURCE_KEYS.map((key) => node.status?.allocatable?.[key]) + .filter(Boolean) + .join(', '); + + const firstMeta = findFirstMeta(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap); + const gpuModelStr = firstMeta?.modelName; + + const rows: GpuDeviceRow[] = gpuIds.map((id) => { + const meta = utilMap[id] ?? tempMap[id] ?? powerMap[id] ?? fbUsedMap[id] ?? fbFreeMap[id]; + return { + id, + label: gpuDeviceLabel(id, meta), + utilization: formatValue(utilMap[id]?.value, '%'), + temperature: formatValue(tempMap[id]?.value, '°C'), + power: formatValue(powerMap[id]?.value, 'W'), + fbUsed: formatMemMiB(fbUsedMap[id]?.value), + fbFree: formatMemMiB(fbFreeMap[id]?.value), + }; + }); + + return ( + + + + {(gpuCountStr || gpuCapacityStr || gpuAllocatableStr || gpuModelStr) && ( + + {gpuCountStr && ( + + {t('console-app~GPU count')} + {gpuCountStr} + + )} + {gpuModelStr && ( + + {t('console-app~GPU model')} + {gpuModelStr} + + )} + {gpuCapacityStr && ( + + {t('console-app~GPU capacity')} + {gpuCapacityStr} + + )} + {gpuAllocatableStr && ( + + {t('console-app~GPU allocatable')} + {gpuAllocatableStr} + + )} + + )} + + {isLoading && ( + + + + )} + + {!isLoading && hasMetrics && ( +
+ + + + + + + + + + + + + {rows.map((row) => ( + + + + + + + + + ))} + +
{t('console-app~GPU device')}{t('console-app~Utilization')}{t('console-app~Temperature')}{t('console-app~Power usage')}{t('console-app~FB memory used')}{t('console-app~FB memory free')}
{row.label}{row.utilization}{row.temperature}{row.power}{row.fbUsed}{row.fbFree}
+
+ )} + + {!isLoading && !hasMetrics && hasCapacity && ( +

+ {t( + 'console-app~GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.', + )} +

+ )} +
+ ); +}; + +export default NodeDetailsGpuMetrics; diff --git a/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx b/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx new file mode 100644 index 00000000000..40b6aa3f570 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx @@ -0,0 +1,156 @@ +import { render, screen } from '@testing-library/react'; +import { usePrometheusPoll } from '@console/internal/components/graphs/prometheus-poll-hook'; +import type { NodeKind } from '@console/internal/module/k8s'; +import NodeDetailsGpuMetrics from '../NodeDetailsGpuMetrics'; + +jest.mock('@console/internal/components/graphs/prometheus-poll-hook', () => ({ + usePrometheusPoll: jest.fn(), +})); + +const mockUsePrometheusPoll = usePrometheusPoll as jest.Mock; + +const baseNode: NodeKind = { + apiVersion: 'v1', + kind: 'Node', + metadata: { name: 'gpu-node-1', uid: 'uid-1' }, + spec: {}, + status: { + capacity: { 'nvidia.com/gpu': '2', cpu: '8', memory: '32Gi' }, + allocatable: { 'nvidia.com/gpu': '2', cpu: '7500m', memory: '30Gi' }, + conditions: [], + images: [], + }, +}; + +const nonGpuNode: NodeKind = { + apiVersion: 'v1', + kind: 'Node', + metadata: { name: 'cpu-node-1', uid: 'uid-2' }, + spec: {}, + status: { + capacity: { cpu: '8', memory: '32Gi' }, + allocatable: { cpu: '7500m', memory: '30Gi' }, + conditions: [], + images: [], + }, +}; + +const makeResponse = ( + results: { gpu: string; value: string; modelName?: string; device?: string }[], +) => ({ + status: 'success', + data: { + resultType: 'vector' as const, + result: results.map((r) => ({ + metric: { + gpu: r.gpu, + ...(r.modelName && { modelName: r.modelName }), + ...(r.device && { device: r.device }), + }, + value: [Date.now() / 1000, r.value], + })), + }, +}); + +const makeScalarResponse = (value: string) => ({ + status: 'success', + data: { + resultType: 'vector' as const, + result: [{ metric: {}, value: [Date.now() / 1000, value] }], + }, +}); + +const emptyResponse = { status: 'success', data: { resultType: 'vector' as const, result: [] } }; + +describe('NodeDetailsGpuMetrics', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('renders nothing for a non-GPU node when no metrics are returned', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + const { container } = render(); + expect(container).toBeEmptyDOMElement(); + }); + + it('shows the GPU metrics heading when the node has GPU capacity', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText('GPU metrics')).toBeInTheDocument(); + }); + + it('shows capacity and allocatable counts from node status', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText('GPU capacity')).toBeInTheDocument(); + expect(screen.getByText('GPU allocatable')).toBeInTheDocument(); + expect(screen.getAllByText('2').length).toBeGreaterThanOrEqual(1); + }); + + it('shows a spinner while loading', () => { + mockUsePrometheusPoll.mockReturnValue([undefined, null, true]); + render(); + expect(screen.getByRole('progressbar')).toBeInTheDocument(); + }); + + it('renders GPU count, model, and a table with device labels when GPU metrics are returned', () => { + const countResp = makeScalarResponse('2'); + const utilResp = makeResponse([ + { gpu: '0', value: '45', modelName: 'Tesla T4', device: 'nvidia0' }, + { gpu: '1', value: '78', modelName: 'Tesla T4', device: 'nvidia1' }, + ]); + const tempResp = makeResponse([ + { gpu: '0', value: '62', modelName: 'Tesla T4' }, + { gpu: '1', value: '71', modelName: 'Tesla T4' }, + ]); + const powerResp = makeResponse([ + { gpu: '0', value: '120.5' }, + { gpu: '1', value: '185.3' }, + ]); + const fbUsedResp = makeResponse([ + { gpu: '0', value: '4096' }, + { gpu: '1', value: '8192' }, + ]); + const fbFreeResp = makeResponse([ + { gpu: '0', value: '12288' }, + { gpu: '1', value: '8192' }, + ]); + + mockUsePrometheusPoll + .mockReturnValueOnce([countResp, null, false]) + .mockReturnValueOnce([utilResp, null, false]) + .mockReturnValueOnce([tempResp, null, false]) + .mockReturnValueOnce([powerResp, null, false]) + .mockReturnValueOnce([fbUsedResp, null, false]) + .mockReturnValueOnce([fbFreeResp, null, false]); + + render(); + + expect(screen.getByText('GPU count')).toBeInTheDocument(); + expect(screen.getByText('GPU model')).toBeInTheDocument(); + expect(screen.getByText('Tesla T4')).toBeInTheDocument(); + expect(screen.getByText('GPU device')).toBeInTheDocument(); + + expect(screen.getByText('GPU 0 \u2014 Tesla T4')).toBeInTheDocument(); + expect(screen.getByText('GPU 1 \u2014 Tesla T4')).toBeInTheDocument(); + + expect(screen.getByText('Utilization')).toBeInTheDocument(); + expect(screen.getByText('Temperature')).toBeInTheDocument(); + expect(screen.getByText('Power usage')).toBeInTheDocument(); + + expect(screen.getByText('45 %')).toBeInTheDocument(); + expect(screen.getByText('78 %')).toBeInTheDocument(); + expect(screen.getByText('62 °C')).toBeInTheDocument(); + expect(screen.getByText('71 °C')).toBeInTheDocument(); + expect(screen.getByText('120.5 W')).toBeInTheDocument(); + expect(screen.getByText('185.3 W')).toBeInTheDocument(); + expect(screen.getByText('4.0 GiB')).toBeInTheDocument(); + expect(screen.getAllByText('8.0 GiB')).toHaveLength(2); + }); + + it('shows the not-available message when node has capacity but no metric data', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText(/GPU metrics are not available/)).toBeInTheDocument(); + }); +}); diff --git a/frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts b/frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts new file mode 100644 index 00000000000..1cf377f9997 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts @@ -0,0 +1,86 @@ +import { + escapePromQLLabel, + getGpuMetricQueries, + GpuMetricQuery, + nodeHasGpuCapacity, + GPU_RESOURCE_KEYS, +} from '../nodeGpuMetricsQueries'; + +describe('escapePromQLLabel', () => { + it('returns a plain name unchanged', () => { + expect(escapePromQLLabel('worker-gpu-01')).toBe('worker-gpu-01'); + }); + + it('escapes single quotes', () => { + expect(escapePromQLLabel("node's-name")).toBe("node\\'s-name"); + }); + + it('escapes backslashes', () => { + expect(escapePromQLLabel('path\\node')).toBe('path\\\\node'); + }); + + it('escapes both backslash and single quote together', () => { + expect(escapePromQLLabel("a\\'b")).toBe("a\\\\\\'b"); + }); +}); + +describe('getGpuMetricQueries', () => { + it('returns queries keyed by GpuMetricQuery', () => { + const queries = getGpuMetricQueries('gpu-node-1'); + expect(Object.keys(queries)).toHaveLength(Object.keys(GpuMetricQuery).length); + }); + + it('uses PromQL or between two instant vectors for each label convention', () => { + const queries = getGpuMetricQueries('worker-gpu-01'); + const utilQuery = queries[GpuMetricQuery.GPU_UTILIZATION]; + expect(utilQuery).toBe( + "DCGM_FI_DEV_GPU_UTIL{Hostname='worker-gpu-01'} or DCGM_FI_DEV_GPU_UTIL{node='worker-gpu-01'}", + ); + }); + + it('uses the correct DCGM metric name for each query', () => { + const queries = getGpuMetricQueries('n1'); + expect(queries[GpuMetricQuery.GPU_COUNT]).toBe( + "count(DCGM_FI_DEV_GPU_UTIL{Hostname='n1'} or DCGM_FI_DEV_GPU_UTIL{node='n1'})", + ); + expect(queries[GpuMetricQuery.GPU_UTILIZATION]).toContain('DCGM_FI_DEV_GPU_UTIL'); + expect(queries[GpuMetricQuery.GPU_TEMPERATURE]).toContain('DCGM_FI_DEV_GPU_TEMP'); + expect(queries[GpuMetricQuery.GPU_POWER_USAGE]).toContain('DCGM_FI_DEV_POWER_USAGE'); + expect(queries[GpuMetricQuery.GPU_FB_USED]).toContain('DCGM_FI_DEV_FB_USED'); + expect(queries[GpuMetricQuery.GPU_FB_FREE]).toContain('DCGM_FI_DEV_FB_FREE'); + }); + + it('escapes special characters in node names', () => { + const queries = getGpuMetricQueries("node'special"); + expect(queries[GpuMetricQuery.GPU_UTILIZATION]).toContain("Hostname='node\\'special'"); + }); +}); + +describe('nodeHasGpuCapacity', () => { + it('returns false for undefined capacity', () => { + expect(nodeHasGpuCapacity(undefined)).toBe(false); + }); + + it('returns false when no GPU keys are present', () => { + expect(nodeHasGpuCapacity({ cpu: '8', memory: '32Gi' })).toBe(false); + }); + + it('returns false when GPU capacity is 0', () => { + expect(nodeHasGpuCapacity({ 'nvidia.com/gpu': '0' })).toBe(false); + }); + + it('returns true when nvidia.com/gpu > 0', () => { + expect(nodeHasGpuCapacity({ 'nvidia.com/gpu': '2' })).toBe(true); + }); + + it('returns true when amd.com/gpu > 0', () => { + expect(nodeHasGpuCapacity({ 'amd.com/gpu': '1' })).toBe(true); + }); +}); + +describe('GPU_RESOURCE_KEYS', () => { + it('includes nvidia and amd', () => { + expect(GPU_RESOURCE_KEYS).toContain('nvidia.com/gpu'); + expect(GPU_RESOURCE_KEYS).toContain('amd.com/gpu'); + }); +}); diff --git a/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts new file mode 100644 index 00000000000..85e616db3b2 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts @@ -0,0 +1,74 @@ +import * as _ from 'lodash'; + +export enum GpuMetricQuery { + GPU_COUNT = 'GPU_COUNT', + GPU_UTILIZATION = 'GPU_UTILIZATION', + GPU_TEMPERATURE = 'GPU_TEMPERATURE', + GPU_POWER_USAGE = 'GPU_POWER_USAGE', + GPU_FB_USED = 'GPU_FB_USED', + GPU_FB_FREE = 'GPU_FB_FREE', +} + +/** + * Escapes a node name for safe inclusion in PromQL label matchers. + * Backslash and single-quote are the only characters that need escaping + * inside a PromQL single-quoted string literal. + */ +export const escapePromQLLabel = (value: string): string => + value.replace(/\\/g, '\\\\').replace(/'/g, "\\'"); + +/** + * Builds two separate label selectors for matching the node across common DCGM + * label conventions. PromQL does not support `or` inside `{}` label matchers, + * so each query must join two full instant vectors with the `or` operator: + * metric{Hostname='name'} or metric{node='name'} + */ +const buildNodeSelectors = (nodeName: string): { hn: string; nd: string } => { + const escaped = escapePromQLLabel(nodeName); + return { + hn: `Hostname='${escaped}'`, + nd: `node='${escaped}'`, + }; +}; + +const gpuQueries = { + [GpuMetricQuery.GPU_COUNT]: _.template( + `count(DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>})`, + ), + [GpuMetricQuery.GPU_UTILIZATION]: _.template( + `DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_TEMPERATURE]: _.template( + `DCGM_FI_DEV_GPU_TEMP{<%= hn %>} or DCGM_FI_DEV_GPU_TEMP{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_POWER_USAGE]: _.template( + `DCGM_FI_DEV_POWER_USAGE{<%= hn %>} or DCGM_FI_DEV_POWER_USAGE{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_FB_USED]: _.template( + `DCGM_FI_DEV_FB_USED{<%= hn %>} or DCGM_FI_DEV_FB_USED{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_FB_FREE]: _.template( + `DCGM_FI_DEV_FB_FREE{<%= hn %>} or DCGM_FI_DEV_FB_FREE{<%= nd %>}`, + ), +}; + +export const getGpuMetricQueries = (nodeName: string): Record => { + const selectors = buildNodeSelectors(nodeName); + return { + [GpuMetricQuery.GPU_COUNT]: gpuQueries[GpuMetricQuery.GPU_COUNT](selectors), + [GpuMetricQuery.GPU_UTILIZATION]: gpuQueries[GpuMetricQuery.GPU_UTILIZATION](selectors), + [GpuMetricQuery.GPU_TEMPERATURE]: gpuQueries[GpuMetricQuery.GPU_TEMPERATURE](selectors), + [GpuMetricQuery.GPU_POWER_USAGE]: gpuQueries[GpuMetricQuery.GPU_POWER_USAGE](selectors), + [GpuMetricQuery.GPU_FB_USED]: gpuQueries[GpuMetricQuery.GPU_FB_USED](selectors), + [GpuMetricQuery.GPU_FB_FREE]: gpuQueries[GpuMetricQuery.GPU_FB_FREE](selectors), + }; +}; + +/** Resource keys that indicate GPU presence in node.status.capacity / allocatable. */ +export const GPU_RESOURCE_KEYS = ['nvidia.com/gpu', 'amd.com/gpu'] as const; + +export const nodeHasGpuCapacity = (capacity?: { [key: string]: string }): boolean => + GPU_RESOURCE_KEYS.some((key) => { + const val = capacity?.[key]; + return val !== undefined && parseInt(val, 10) > 0; + }); From 18e9f90b1f6e8f3d0c6bf72f75b114889f208bea Mon Sep 17 00:00:00 2001 From: Swapnil Date: Fri, 15 May 2026 23:50:24 +0530 Subject: [PATCH 2/3] fixup: Address CodeRabbit review feedback - Skip Prometheus results without a valid GPU identifier to prevent silent data loss when multiple results lack label keys. - Guard GPU count display against NaN from non-numeric Prometheus values. Co-authored-by: Cursor --- .../src/components/nodes/NodeDetailsGpuMetrics.tsx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx index 945fae2d320..1443c7e44fa 100644 --- a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx +++ b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx @@ -47,6 +47,9 @@ const resultsByGpu = ( return response.data.result.reduce>( (acc, r: PrometheusResult) => { const gpu = r.metric?.gpu ?? r.metric?.GPU_I_ID ?? r.metric?.UUID ?? r.metric?.device ?? ''; + if (!gpu) { + return acc; + } acc[gpu] = { value: r.value?.[1] ?? '', modelName: r.metric?.modelName, @@ -162,10 +165,11 @@ const NodeDetailsGpuMetrics: FC = ({ node }) => { } const gpuCountValue = countResponse?.data?.result?.[0]?.value?.[1]; - const gpuCountStr = - gpuCountValue !== undefined && gpuCountValue !== '' - ? String(Math.round(parseFloat(gpuCountValue))) - : undefined; + const gpuCountStr = (() => { + if (gpuCountValue === undefined || gpuCountValue === '') return undefined; + const parsed = parseFloat(gpuCountValue); + return Number.isNaN(parsed) ? undefined : String(Math.round(parsed)); + })(); const gpuCapacityStr = GPU_RESOURCE_KEYS.map((key) => node.status?.capacity?.[key]) .filter(Boolean) From 8bc4ea8e0f3a253af0760f31c4ed5e919ec58229 Mon Sep 17 00:00:00 2001 From: Swapnil Date: Sat, 16 May 2026 00:05:17 +0530 Subject: [PATCH 3/3] fixup: Replace lodash templates with native template literals Remove the lodash dependency from nodeGpuMetricsQueries.ts and use native template literals for PromQL query construction, reducing bundle weight with zero functional change. Co-authored-by: Cursor --- .../components/nodes/nodeGpuMetricsQueries.ts | 38 +++++-------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts index 85e616db3b2..401d7dad848 100644 --- a/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts +++ b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts @@ -1,5 +1,3 @@ -import * as _ from 'lodash'; - export enum GpuMetricQuery { GPU_COUNT = 'GPU_COUNT', GPU_UTILIZATION = 'GPU_UTILIZATION', @@ -31,36 +29,18 @@ const buildNodeSelectors = (nodeName: string): { hn: string; nd: string } => { }; }; -const gpuQueries = { - [GpuMetricQuery.GPU_COUNT]: _.template( - `count(DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>})`, - ), - [GpuMetricQuery.GPU_UTILIZATION]: _.template( - `DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_TEMPERATURE]: _.template( - `DCGM_FI_DEV_GPU_TEMP{<%= hn %>} or DCGM_FI_DEV_GPU_TEMP{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_POWER_USAGE]: _.template( - `DCGM_FI_DEV_POWER_USAGE{<%= hn %>} or DCGM_FI_DEV_POWER_USAGE{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_FB_USED]: _.template( - `DCGM_FI_DEV_FB_USED{<%= hn %>} or DCGM_FI_DEV_FB_USED{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_FB_FREE]: _.template( - `DCGM_FI_DEV_FB_FREE{<%= hn %>} or DCGM_FI_DEV_FB_FREE{<%= nd %>}`, - ), -}; +const buildQuery = (metric: string, hn: string, nd: string): string => + `${metric}{${hn}} or ${metric}{${nd}}`; export const getGpuMetricQueries = (nodeName: string): Record => { - const selectors = buildNodeSelectors(nodeName); + const { hn, nd } = buildNodeSelectors(nodeName); return { - [GpuMetricQuery.GPU_COUNT]: gpuQueries[GpuMetricQuery.GPU_COUNT](selectors), - [GpuMetricQuery.GPU_UTILIZATION]: gpuQueries[GpuMetricQuery.GPU_UTILIZATION](selectors), - [GpuMetricQuery.GPU_TEMPERATURE]: gpuQueries[GpuMetricQuery.GPU_TEMPERATURE](selectors), - [GpuMetricQuery.GPU_POWER_USAGE]: gpuQueries[GpuMetricQuery.GPU_POWER_USAGE](selectors), - [GpuMetricQuery.GPU_FB_USED]: gpuQueries[GpuMetricQuery.GPU_FB_USED](selectors), - [GpuMetricQuery.GPU_FB_FREE]: gpuQueries[GpuMetricQuery.GPU_FB_FREE](selectors), + [GpuMetricQuery.GPU_COUNT]: `count(${buildQuery('DCGM_FI_DEV_GPU_UTIL', hn, nd)})`, + [GpuMetricQuery.GPU_UTILIZATION]: buildQuery('DCGM_FI_DEV_GPU_UTIL', hn, nd), + [GpuMetricQuery.GPU_TEMPERATURE]: buildQuery('DCGM_FI_DEV_GPU_TEMP', hn, nd), + [GpuMetricQuery.GPU_POWER_USAGE]: buildQuery('DCGM_FI_DEV_POWER_USAGE', hn, nd), + [GpuMetricQuery.GPU_FB_USED]: buildQuery('DCGM_FI_DEV_FB_USED', hn, nd), + [GpuMetricQuery.GPU_FB_FREE]: buildQuery('DCGM_FI_DEV_FB_FREE', hn, nd), }; };