Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions docs/docs/guides/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ To enable exporting metrics to Prometheus, set the
In addition to the essential metrics available via the CLI and UI, `dstack` exports additional metrics to Prometheus, including data on fleets, runs, jobs, and DCGM metrics.

??? info "NVIDIA DCGM"
NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends,
NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends,
as well as for [SSH fleets](../concepts/fleets.md#ssh).

To collect NVIDIA DCGM metrics from SSH fleets, make sure the `datacenter-gpu-manager-4-core`,
`datacenter-gpu-manager-4-proprietary`, and `datacenter-gpu-manager-exporter` packages are installed on the hosts.

Expand Down Expand Up @@ -112,6 +112,9 @@ telemetry, and more.
| `dstack_job_memory_total_bytes` | *gauge* | Total memory allocated for the job, bytes | `4009754624.0` |
| `dstack_job_memory_usage_bytes` | *gauge* | Memory used by the job (including cache), bytes | `339017728.0` |
| `dstack_job_memory_working_set_bytes` | *gauge* | Memory used by the job (not including cache), bytes | `147251200.0` |
| `dstack_job_gpu_usage_ratio` | *gauge* | Job GPU usage, percent (as 0.0-1.0) | `0.93` |
| `dstack_job_gpu_memory_total_bytes` | *gauge* | Total GPU memory allocated for the job, bytes | `8589934592.0` |
| `dstack_job_gpu_memory_usage_bytes` | *gauge* | GPU memory used by the job, bytes | `1048576.0` |
| `DCGM_FI_DEV_GPU_UTIL` | *gauge* | GPU utilization (in %) | |
| `DCGM_FI_DEV_MEM_COPY_UTIL` | *gauge* | Memory utilization (in %) | |
| `DCGM_FI_DEV_ENC_UTIL` | *gauge* | Encoder utilization (in %) | |
Expand Down Expand Up @@ -176,6 +179,9 @@ telemetry, and more.
| `dstack_run_type` | *string* | Run configuration type | `task`, `dev-environment` |
| `dstack_backend` | *string* | Backend | `aws`, `runpod` |
| `dstack_gpu` | *string?* | GPU name | `H100` |
| `dstack_gpu_num`[^1] | *integer* | GPU number (0-based) | `0` |

[^1]: For `dstack_gpu_*` metrics only.

### Server health metrics

Expand Down
20 changes: 20 additions & 0 deletions src/dstack/_internal/server/services/prometheus/custom_metrics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import json
from collections import defaultdict
from collections.abc import Generator, Iterable
from datetime import timezone
Expand Down Expand Up @@ -177,6 +178,19 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
if gpus:
gpu_memory_total = gpus[0].memory_mib * 1024 * 1024
for gpu_num, (gpu_util, gpu_memory_usage) in enumerate(
zip(
json.loads(jmp.gpus_util_percent),
json.loads(jmp.gpus_memory_usage_bytes),
)
):
gpu_labels = labels.copy()
gpu_labels["dstack_gpu_num"] = gpu_num
metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100)
metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total)
metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage)
jpm = job_prometheus_metrics.get(job.id)
if jpm is not None:
for metric in text_string_to_metric_families(jpm.text):
Expand All @@ -202,6 +216,9 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
_JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
_JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
_JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio"
_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes"
_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes"


class _Metrics(dict[str, Metric]):
Expand Down Expand Up @@ -259,6 +276,9 @@ class _JobMetrics(_Metrics):
(_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
(_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
(_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
(_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"),
(_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"),
(_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"),
]


Expand Down
17 changes: 17 additions & 0 deletions src/tests/_internal/server/routers/test_prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
memory_gib=128,
gpu_count=2,
gpu_name="V4",
gpu_memory_gib=16,
price=12,
)
project_2 = await _create_project(session, "project-2", user)
Expand Down Expand Up @@ -140,6 +141,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
"""),
)
project_1 = await _create_project(session, "project-1", user)
# jrd.offer.instance.resources takes priority over jpd.instance_type.resources,
# so the latter should be ignored
jpd_1_1 = get_job_provisioning_data(backend=BackendType.AWS, gpu_count=4, gpu_name="T4")
jrd_1_1 = get_job_runtime_data(offer=offer)
job_1_1 = await _create_job(
Expand Down Expand Up @@ -176,6 +179,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
cpu_usage_micro=3_500_000,
memory_working_set_bytes=3_221_225_472,
memory_usage_bytes=4_294_967_296,
gpus_util_percent=[80, 90],
gpus_memory_usage_bytes=[1_073_741_824, 2_147_483_648],
)
# Older, ignored
await create_job_metrics_point(
Expand Down Expand Up @@ -316,6 +321,18 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
# HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
# TYPE dstack_job_memory_working_set_bytes gauge
dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
# HELP dstack_job_gpu_usage_ratio Job GPU usage, percent (as 0.0-1.0)
# TYPE dstack_job_gpu_usage_ratio gauge
dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 0.8
dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 0.9
# HELP dstack_job_gpu_memory_total_bytes Total GPU memory allocated for the job, bytes
# TYPE dstack_job_gpu_memory_total_bytes gauge
dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 17179869184.0
dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 17179869184.0
# HELP dstack_job_gpu_memory_usage_bytes GPU memory used by the job, bytes
# TYPE dstack_job_gpu_memory_usage_bytes gauge
dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 1073741824.0
dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 2147483648.0
# HELP FIELD_1 Test field 1
# TYPE FIELD_1 gauge
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0
Expand Down
Loading