Skip to content

Commit d9ed1b4

Browse files
[CLI] Show replica=... and job=... only when needed.
1 parent 442a743 commit d9ed1b4

2 files changed

Lines changed: 256 additions & 7 deletions

File tree

src/dstack/_internal/cli/utils/run.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,23 @@ def get_runs_table(
287287
if not merge_job_rows:
288288
add_row_from_dict(table, run_row)
289289

290+
# Determine if we need to show replica= and job= fields
291+
replica_nums = {job.job_spec.replica_num for job in run.jobs}
292+
show_replica = len(replica_nums) > 1 # Show if there are multiple different replicas
293+
294+
# Group jobs by replica to check if each replica has multiple jobs
295+
jobs_by_replica: Dict[int, List[Any]] = {}
296+
for job in run.jobs:
297+
replica_num = job.job_spec.replica_num
298+
if replica_num not in jobs_by_replica:
299+
jobs_by_replica[replica_num] = []
300+
jobs_by_replica[replica_num].append(job)
301+
302+
# Check if any replica has multiple different job_nums
303+
show_job = any(
304+
len({j.job_spec.job_num for j in jobs}) > 1 for jobs in jobs_by_replica.values()
305+
)
306+
290307
for job in run.jobs:
291308
latest_job_submission = job.job_submissions[-1]
292309
status = latest_job_submission.status.value
@@ -296,13 +313,23 @@ def get_runs_table(
296313
status_text = latest_job_submission.status_message
297314
status_style = _get_job_status_style(status_text, latest_job_submission.status)
298315

316+
# Build NAME field conditionally showing replica= and job=
317+
name_parts = []
318+
if show_replica:
319+
name_parts.append(f"replica={job.job_spec.replica_num}")
320+
if show_job:
321+
name_parts.append(f"job={job.job_spec.job_num}")
322+
name_suffix = (
323+
f" deployment={latest_job_submission.deployment_num}"
324+
if show_deployment_num
325+
else ""
326+
)
327+
# Always include indentation for job rows, even if there are no parts
328+
name_value = " " + (" ".join(name_parts) if name_parts else "")
329+
name_value += name_suffix
330+
299331
job_row: Dict[Union[str, int], Any] = {
300-
"NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}"
301-
+ (
302-
f" deployment={latest_job_submission.deployment_num}"
303-
if show_deployment_num
304-
else ""
305-
),
332+
"NAME": name_value,
306333
"STATUS": f"[{status_style}]{status_text}[/]",
307334
"PROBES": _format_job_probes(
308335
job.job_spec.probes, latest_job_submission.probes, latest_job_submission.status

src/tests/_internal/cli/utils/test_run.py

Lines changed: 223 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,14 @@
1212

1313
from dstack._internal.cli.utils.run import get_runs_table
1414
from dstack._internal.core.models.backends.base import BackendType
15-
from dstack._internal.core.models.configurations import AnyRunConfiguration, TaskConfiguration
15+
from dstack._internal.core.models.configurations import (
16+
AnyRunConfiguration,
17+
ServiceConfiguration,
18+
TaskConfiguration,
19+
)
1620
from dstack._internal.core.models.instances import Disk, InstanceType, Resources
1721
from dstack._internal.core.models.profiles import Profile
22+
from dstack._internal.core.models.resources import Range
1823
from dstack._internal.core.models.runs import (
1924
JobProvisioningData,
2025
JobStatus,
@@ -297,3 +302,220 @@ async def test_status_messages(
297302

298303
status_style = get_table_cell_style(table, "STATUS", 0)
299304
assert status_style == expected_style
305+
306+
async def test_multi_node_task_with_multiple_jobs(self, session: AsyncSession):
307+
"""Test that a multi-node task with multiple jobs shows replica= and job= in table rows."""
308+
submitted_at = datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc)
309+
310+
project = await create_project(session=session)
311+
user = await create_user(session=session)
312+
repo = await create_repo(session=session, project_id=project.id)
313+
314+
# Create a multi-node task configuration (3 nodes)
315+
configuration = TaskConfiguration(
316+
type="task",
317+
image="ubuntu:latest",
318+
commands=["echo hello"],
319+
nodes=3,
320+
)
321+
322+
run_spec = get_run_spec(
323+
run_name="multi-node-run",
324+
repo_id=repo.name,
325+
profile=Profile(name="default"),
326+
configuration=configuration,
327+
)
328+
329+
run_model_db = await create_run(
330+
session=session,
331+
project=project,
332+
repo=repo,
333+
user=user,
334+
run_name="multi-node-run",
335+
run_spec=run_spec,
336+
status=RunStatus.RUNNING,
337+
submitted_at=submitted_at,
338+
)
339+
340+
# Create 3 jobs, all with replica_num=0 but different job_num values
341+
resources = Resources(
342+
cpus=2,
343+
memory_mib=4096,
344+
gpus=[],
345+
spot=False,
346+
disk=Disk(size_mib=102400),
347+
)
348+
instance_type = InstanceType(name="t2.medium", resources=resources)
349+
job_provisioning_data = get_job_provisioning_data(
350+
backend=BackendType.AWS,
351+
region="us-east-1",
352+
cpu_count=2,
353+
memory_gib=4,
354+
spot=False,
355+
hostname="1.2.3.4",
356+
price=0.0464,
357+
instance_type=instance_type,
358+
)
359+
360+
# Create 3 jobs: all replica_num=0, job_num=0,1,2
361+
for job_num in range(3):
362+
await create_job(
363+
session=session,
364+
run=run_model_db,
365+
status=JobStatus.RUNNING,
366+
submitted_at=submitted_at,
367+
last_processed_at=submitted_at,
368+
job_provisioning_data=job_provisioning_data,
369+
replica_num=0,
370+
job_num=job_num,
371+
)
372+
373+
await session.refresh(run_model_db)
374+
375+
res = await session.execute(
376+
select(RunModel)
377+
.where(RunModel.id == run_model_db.id)
378+
.options(selectinload(RunModel.jobs))
379+
)
380+
run_model_db = res.scalar_one()
381+
382+
run_model = run_model_to_run(run_model_db)
383+
384+
api_run = Run(
385+
api_client=Mock(spec=APIClient),
386+
project=project.name,
387+
run=run_model,
388+
)
389+
390+
table = get_runs_table([api_run], verbose=False)
391+
cells = get_table_cells(table)
392+
393+
# Should have 4 rows: 1 run header + 3 job rows
394+
assert len(cells) == 4
395+
396+
# First row should be the run header
397+
assert cells[0]["NAME"] == "multi-node-run"
398+
399+
# Next 3 rows should be job rows with job=0,1,2 (replica= should NOT be shown since all jobs have same replica)
400+
for i in range(1, 4):
401+
job_row = cells[i]
402+
assert (
403+
"replica=" not in job_row["NAME"]
404+
) # Should not show replica since all are replica=0
405+
assert f"job={i - 1}" in job_row["NAME"]
406+
assert job_row["STATUS"] == "running"
407+
408+
async def test_service_with_multiple_replicas_and_jobs(self, session: AsyncSession):
409+
"""Test that a service with multiple replicas and jobs shows different replica= and same job= in table rows."""
410+
submitted_at = datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc)
411+
412+
project = await create_project(session=session)
413+
user = await create_user(session=session)
414+
repo = await create_repo(session=session, project_id=project.id)
415+
416+
# Create a service configuration with 3 replicas
417+
configuration = ServiceConfiguration(
418+
type="service",
419+
image="ubuntu:latest",
420+
commands=["echo hello"],
421+
port=8000,
422+
replicas=Range[int](min=3, max=3),
423+
)
424+
425+
run_spec = get_run_spec(
426+
run_name="service-run",
427+
repo_id=repo.name,
428+
profile=Profile(name="default"),
429+
configuration=configuration,
430+
)
431+
432+
run_model_db = await create_run(
433+
session=session,
434+
project=project,
435+
repo=repo,
436+
user=user,
437+
run_name="service-run",
438+
run_spec=run_spec,
439+
status=RunStatus.RUNNING,
440+
submitted_at=submitted_at,
441+
)
442+
443+
# Create jobs: 3 replicas, 2 jobs per replica
444+
# replica=0: job=0, job=1
445+
# replica=1: job=0, job=1
446+
# replica=2: job=0, job=1
447+
resources = Resources(
448+
cpus=2,
449+
memory_mib=4096,
450+
gpus=[],
451+
spot=False,
452+
disk=Disk(size_mib=102400),
453+
)
454+
instance_type = InstanceType(name="t2.medium", resources=resources)
455+
job_provisioning_data = get_job_provisioning_data(
456+
backend=BackendType.AWS,
457+
region="us-east-1",
458+
cpu_count=2,
459+
memory_gib=4,
460+
spot=False,
461+
hostname="1.2.3.4",
462+
price=0.0464,
463+
instance_type=instance_type,
464+
)
465+
466+
# Create 3 replicas, each with 2 jobs
467+
for replica_num in range(3):
468+
for job_num in range(2):
469+
await create_job(
470+
session=session,
471+
run=run_model_db,
472+
status=JobStatus.RUNNING,
473+
submitted_at=submitted_at,
474+
last_processed_at=submitted_at,
475+
job_provisioning_data=job_provisioning_data,
476+
replica_num=replica_num,
477+
job_num=job_num,
478+
)
479+
480+
await session.refresh(run_model_db)
481+
482+
res = await session.execute(
483+
select(RunModel)
484+
.where(RunModel.id == run_model_db.id)
485+
.options(selectinload(RunModel.jobs))
486+
)
487+
run_model_db = res.scalar_one()
488+
489+
run_model = run_model_to_run(run_model_db)
490+
491+
api_run = Run(
492+
api_client=Mock(spec=APIClient),
493+
project=project.name,
494+
run=run_model,
495+
)
496+
497+
table = get_runs_table([api_run], verbose=False)
498+
cells = get_table_cells(table)
499+
500+
# Should have 7 rows: 1 run header + 6 job rows (3 replicas × 2 jobs)
501+
assert len(cells) == 7
502+
503+
# First row should be the run header
504+
assert cells[0]["NAME"] == "service-run"
505+
506+
# Next 6 rows should be job rows with both replica= and job= (since there are multiple replicas and multiple jobs per replica)
507+
# Expected order: replica=0 job=0, replica=0 job=1, replica=1 job=0, replica=1 job=1, replica=2 job=0, replica=2 job=1
508+
expected_jobs = [
509+
(0, 0),
510+
(0, 1),
511+
(1, 0),
512+
(1, 1),
513+
(2, 0),
514+
(2, 1),
515+
]
516+
517+
for i, (expected_replica, expected_job) in enumerate(expected_jobs, start=1):
518+
job_row = cells[i]
519+
assert f"replica={expected_replica}" in job_row["NAME"]
520+
assert f"job={expected_job}" in job_row["NAME"]
521+
assert job_row["STATUS"] == "running"

0 commit comments

Comments
 (0)