diff --git a/docs/docs/guides/metrics.md b/docs/docs/guides/metrics.md index 5db9c252cc..4eb5d60b79 100644 --- a/docs/docs/guides/metrics.md +++ b/docs/docs/guides/metrics.md @@ -43,9 +43,9 @@ To enable exporting metrics to Prometheus, set the In addition to the essential metrics available via the CLI and UI, `dstack` exports additional metrics to Prometheus, including data on fleets, runs, jobs, and DCGM metrics. ??? info "NVIDIA DCGM" - NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, + NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, as well as for [SSH fleets](../concepts/fleets.md#ssh). - + To ensure NVIDIA DCGM metrics are collected from SSH fleets, ensure the `datacenter-gpu-manager-4-core`, `datacenter-gpu-manager-4-proprietary`, and `datacenter-gpu-manager-exporter` packages are installed on the hosts. @@ -112,6 +112,9 @@ telemetry, and more. | `dstack_job_memory_total_bytes` | *gauge* | Total memory allocated for the job, bytes | `4009754624.0` | | `dstack_job_memory_usage_bytes` | *gauge* | Memory used by the job (including cache), bytes | `339017728.0` | | `dstack_job_memory_working_set_bytes` | *gauge* | Memory used by the job (not including cache), bytes | `147251200.0` | + | `dstack_job_gpu_usage_ratio` | *gauge* | Job GPU usage, percent (as 0.0-1.0) | `0.93` | + | `dstack_job_gpu_memory_total_bytes` | *gauge* | Total GPU memory allocated for the job, bytes | `8589934592.0` | + | `dstack_job_gpu_memory_usage_bytes` | *gauge* | GPU memory used by the job, bytes | `1048576.0` | | `DCGM_FI_DEV_GPU_UTIL` | *gauge* | GPU utilization (in %) | | | `DCGM_FI_DEV_MEM_COPY_UTIL` | *gauge* | Memory utilization (in %) | | | `DCGM_FI_DEV_ENC_UTIL` | *gauge* | Encoder utilization (in %) | | @@ -176,6 +179,9 @@ telemetry, and more. 
| `dstack_run_type` | *string* | Run configuration type | `task`, `dev-environment` | | `dstack_backend` | *string* | Backend | `aws`, `runpod` | | `dstack_gpu` | *string?* | GPU name | `H100` | + | `dstack_gpu_num`[^1] | *integer* | GPU number (0-based) | `0` | + + [^1]: For `dstack_job_gpu_*` metrics only. ### Server health metrics diff --git a/src/dstack/_internal/server/services/prometheus/custom_metrics.py b/src/dstack/_internal/server/services/prometheus/custom_metrics.py index 9f9c64ed15..7a7636c061 100644 --- a/src/dstack/_internal/server/services/prometheus/custom_metrics.py +++ b/src/dstack/_internal/server/services/prometheus/custom_metrics.py @@ -1,4 +1,5 @@ import itertools +import json from collections import defaultdict from collections.abc import Generator, Iterable from datetime import timezone @@ -177,6 +178,19 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]: metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000) metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes) metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes) + if gpus: + gpu_memory_total = gpus[0].memory_mib * 1024 * 1024 + for gpu_num, (gpu_util, gpu_memory_usage) in enumerate( + zip( + json.loads(jmp.gpus_util_percent), + json.loads(jmp.gpus_memory_usage_bytes), + ) + ): + gpu_labels = labels.copy() + gpu_labels["dstack_gpu_num"] = gpu_num + metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100) + metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total) + metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage) jpm = job_prometheus_metrics.get(job.id) if jpm is not None: for metric in text_string_to_metric_families(jpm.text): @@ -202,6 +216,9 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]: _JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes" _JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes" _JOB_MEMORY_WORKING_SET = 
"dstack_job_memory_working_set_bytes" +_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio" +_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes" +_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes" class _Metrics(dict[str, Metric]): @@ -259,6 +276,9 @@ class _JobMetrics(_Metrics): (_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"), (_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"), (_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"), + (_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"), + (_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"), + (_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"), ] diff --git a/src/tests/_internal/server/routers/test_prometheus.py b/src/tests/_internal/server/routers/test_prometheus.py index 1f7e1e274b..0678d0197e 100644 --- a/src/tests/_internal/server/routers/test_prometheus.py +++ b/src/tests/_internal/server/routers/test_prometheus.py @@ -109,6 +109,7 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient) memory_gib=128, gpu_count=2, gpu_name="V4", + gpu_memory_gib=16, price=12, ) project_2 = await _create_project(session, "project-2", user) @@ -140,6 +141,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient) """), ) project_1 = await _create_project(session, "project-1", user) + # jrd.offer.instance.resources has higher priority than jpd.instance_type.resources, + # so jpd.instance_type.resources should be ignored jpd_1_1 = get_job_provisioning_data(backend=BackendType.AWS, gpu_count=4, gpu_name="T4") jrd_1_1 = get_job_runtime_data(offer=offer) job_1_1 = await _create_job( @@ -176,6 +179,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient) cpu_usage_micro=3_500_000, memory_working_set_bytes=3_221_225_472, memory_usage_bytes=4_294_967_296, + gpus_util_percent=[80, 90], + 
gpus_memory_usage_bytes=[1_073_741_824, 2_147_483_648], ) # Older, ignored await create_job_metrics_point( @@ -316,6 +321,18 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient) # HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes # TYPE dstack_job_memory_working_set_bytes gauge dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0 + # HELP dstack_job_gpu_usage_ratio Job GPU usage, percent (as 0.0-1.0) + # TYPE dstack_job_gpu_usage_ratio gauge + dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 0.8 + dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 0.9 + # HELP dstack_job_gpu_memory_total_bytes Total GPU memory allocated for the job, bytes + # TYPE dstack_job_gpu_memory_total_bytes gauge + dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 17179869184.0 + 
dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 17179869184.0 + # HELP dstack_job_gpu_memory_usage_bytes GPU memory used by the job, bytes + # TYPE dstack_job_gpu_memory_usage_bytes gauge + dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 1073741824.0 + dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 2147483648.0 # HELP FIELD_1 Test field 1 # TYPE FIELD_1 gauge FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0