Skip to content

Commit 59b646d

Browse files
authored
Support getting job metrics by run_id (#3201)
1 parent 1c2a7bd, commit 59b646d

File tree

4 files changed: +26 -11 lines changed

4 files changed: +26 -11 lines changed

frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ export const JobMetrics: React.FC = () => {
3535
const { cpuChartProps, memoryChartProps, eachGPUChartProps, eachGPUMemoryChartProps, isLoading } = useMetricsData({
3636
project_name: paramProjectName,
3737
run_name: runData?.run_spec.run_name ?? '',
38+
run_id: runData?.id ?? '',
3839
job_num: jobData?.job_spec.job_num ?? 0,
3940
limit: 1000,
4041
});

frontend/src/types/run.d.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,7 @@ declare type TStopRunsRequestParams = {
136136
declare type TJobMetricsRequestParams = {
137137
project_name: IProject['project_name'];
138138
run_name: string;
139+
run_id: string;
139140
replica_num?: number;
140141
job_num: number;
141142
limit?: number;

src/dstack/_internal/server/routers/metrics.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from datetime import datetime
22
from typing import Optional, Tuple
3+
from uuid import UUID
34

45
from fastapi import APIRouter, Depends
56
from sqlalchemy.ext.asyncio import AsyncSession
@@ -29,6 +30,7 @@
2930
)
3031
async def get_job_metrics(
3132
run_name: str,
33+
run_id: Optional[UUID] = None,
3234
replica_num: int = 0,
3335
job_num: int = 0,
3436
limit: int = 1,
@@ -39,8 +41,9 @@ async def get_job_metrics(
3941
):
4042
"""
4143
Returns job-level metrics such as hardware utilization
42-
given `run_name`, `replica_num`, and `job_num`.
43-
If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)`.
44+
given `run_name`, `run_id`, `replica_num`, and `job_num`.
45+
If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)`
46+
of the latest run with the given name.
4447
By default, returns one latest sample. To control time window/number of samples, use
4548
`limit`, `after`, `before`.
4649
@@ -61,6 +64,7 @@ async def get_job_metrics(
6164
session=session,
6265
project=project,
6366
run_name=run_name,
67+
run_id=run_id,
6468
replica_num=replica_num,
6569
job_num=job_num,
6670
)

src/dstack/_internal/server/services/jobs/__init__.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -97,19 +97,28 @@ def find_job(jobs: List[Job], replica_num: int, job_num: int) -> Job:
9797

9898

9999
async def get_run_job_model(
100-
session: AsyncSession, project: ProjectModel, run_name: str, replica_num: int, job_num: int
100+
session: AsyncSession,
101+
project: ProjectModel,
102+
run_name: str,
103+
run_id: Optional[UUID],
104+
replica_num: int,
105+
job_num: int,
101106
) -> Optional[JobModel]:
107+
filters = [
108+
RunModel.project_id == project.id,
109+
RunModel.run_name == run_name,
110+
JobModel.replica_num == replica_num,
111+
JobModel.job_num == job_num,
112+
]
113+
if run_id is not None:
114+
filters.append(RunModel.id == run_id)
115+
else:
116+
# Assuming run_name is unique for non-deleted runs
117+
filters.append(RunModel.deleted == False)
102118
res = await session.execute(
103119
select(JobModel)
104120
.join(JobModel.run)
105-
.where(
106-
RunModel.project_id == project.id,
107-
# assuming run_name is unique for non-deleted runs
108-
RunModel.run_name == run_name,
109-
RunModel.deleted == False,
110-
JobModel.replica_num == replica_num,
111-
JobModel.job_num == job_num,
112-
)
121+
.where(*filters)
113122
.order_by(JobModel.submission_num.desc())
114123
.limit(1)
115124
)

0 commit comments

Comments
 (0)