diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx index d487428fc1..dd088c418b 100644 --- a/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx @@ -35,6 +35,7 @@ export const JobMetrics: React.FC = () => { const { cpuChartProps, memoryChartProps, eachGPUChartProps, eachGPUMemoryChartProps, isLoading } = useMetricsData({ project_name: paramProjectName, run_name: runData?.run_spec.run_name ?? '', + run_id: runData?.id ?? '', job_num: jobData?.job_spec.job_num ?? 0, limit: 1000, }); diff --git a/frontend/src/types/run.d.ts b/frontend/src/types/run.d.ts index f43c5e1562..79b20f14d0 100644 --- a/frontend/src/types/run.d.ts +++ b/frontend/src/types/run.d.ts @@ -136,6 +136,7 @@ declare type TStopRunsRequestParams = { declare type TJobMetricsRequestParams = { project_name: IProject['project_name']; run_name: string; + run_id: string; replica_num?: number; job_num: number; limit?: number; diff --git a/src/dstack/_internal/server/routers/metrics.py b/src/dstack/_internal/server/routers/metrics.py index e61a0d9bfa..a73a243431 100644 --- a/src/dstack/_internal/server/routers/metrics.py +++ b/src/dstack/_internal/server/routers/metrics.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import Optional, Tuple +from uuid import UUID from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession @@ -29,6 +30,7 @@ ) async def get_job_metrics( run_name: str, + run_id: Optional[UUID] = None, replica_num: int = 0, job_num: int = 0, limit: int = 1, @@ -39,8 +41,9 @@ async def get_job_metrics( ): """ Returns job-level metrics such as hardware utilization - given `run_name`, `replica_num`, and `job_num`. - If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)`. + given `run_name`, `run_id`, `replica_num`, and `job_num`. + If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)` + of the latest run with the given name. By default, returns one latest sample. To control time window/number of samples, use `limit`, `after`, `before`. @@ -61,6 +64,7 @@ async def get_job_metrics( session=session, project=project, run_name=run_name, + run_id=run_id, replica_num=replica_num, job_num=job_num, ) diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index ffea0c72ea..cbb089b2c5 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -97,19 +97,28 @@ def find_job(jobs: List[Job], replica_num: int, job_num: int) -> Job: async def get_run_job_model( - session: AsyncSession, project: ProjectModel, run_name: str, replica_num: int, job_num: int + session: AsyncSession, + project: ProjectModel, + run_name: str, + run_id: Optional[UUID], + replica_num: int, + job_num: int, ) -> Optional[JobModel]: + filters = [ + RunModel.project_id == project.id, + RunModel.run_name == run_name, + JobModel.replica_num == replica_num, + JobModel.job_num == job_num, + ] + if run_id is not None: + filters.append(RunModel.id == run_id) + else: + # Assuming run_name is unique for non-deleted runs + filters.append(RunModel.deleted == False) res = await session.execute( select(JobModel) .join(JobModel.run) - .where( - RunModel.project_id == project.id, - # assuming run_name is unique for non-deleted runs - RunModel.run_name == run_name, - RunModel.deleted == False, - JobModel.replica_num == replica_num, - JobModel.job_num == job_num, - ) + .where(*filters) .order_by(JobModel.submission_num.desc()) .limit(1) )