Skip to content

Commit f3d5978

Browse files
rsareddy0329, zhaoqizqwang, pintaoz-aws, pintaoz
authored
Update documentation-with-new-changes branch with latest changes from main (#190)
* Fix training test (#184)
* Fix SDK training test: Add wait time before refresh
* Fix training tests in canaries
* Update logging information for submitting and deleting training job (#189)

Co-authored-by: pintaoz <pintaoz@amazon.com>

---------

Co-authored-by: Zhaoqi <zhaoqiwang.baruch@gmail.com>
Co-authored-by: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com>
Co-authored-by: pintaoz <pintaoz@amazon.com>
1 parent 0de2138 commit f3d5978

4 files changed

Lines changed: 5 additions & 25 deletions

File tree

src/sagemaker/hyperpod/cli/commands/training.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,7 @@
11
import click
2-
import logging
3-
import os
4-
import yaml
5-
import shutil
6-
import subprocess
7-
from pathlib import Path
82
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
93
from sagemaker.hyperpod.common.config import Metadata
10-
import tempfile
11-
from typing import List, Dict, Any, Optional, Callable, get_args, get_origin, Literal
124
from sagemaker.hyperpod.cli.training_utils import generate_click_command
13-
from importlib.metadata import entry_points
145
from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY
156
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
167
_hyperpod_telemetry_emitter,

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
_HyperPodPytorchJob, HyperPodPytorchJobStatus
44
)
55
from sagemaker.hyperpod.common.config.metadata import Metadata
6-
from kubernetes import client, config, __version__ as kubernetes_client_version
7-
from typing import List, Optional, ClassVar, Tuple
6+
from kubernetes import client, config
7+
from typing import List, Optional, ClassVar
88
from sagemaker.hyperpod.common.utils import (
99
handle_exception,
1010
get_default_namespace,
@@ -84,7 +84,7 @@ def create(self, debug=False):
8484
plural=PLURAL,
8585
body=config,
8686
)
87-
logger.info("Successfully submitted HyperPodPytorchJob!")
87+
logger.info(f"Successfully submitted HyperPodPytorchJob '{self.metadata.name}'!")
8888
except Exception as e:
8989
logger.error(f"Failed to create HyperPodPytorchJob {self.metadata.name}!")
9090
handle_exception(e, self.metadata.name, self.metadata.namespace)
@@ -131,7 +131,7 @@ def delete(self):
131131
plural=PLURAL,
132132
name=self.metadata.name,
133133
)
134-
logger.info(f"Successful deleted HyperPodPytorchJob!")
134+
logger.info(f"Successful deleted HyperPodPytorchJob '{self.metadata.name}'!")
135135
except Exception as e:
136136
logger.error(f"Failed to delete HyperPodPytorchJob {self.metadata.name}!")
137137
handle_exception(e, self.metadata.name, self.metadata.namespace)

test/integration_tests/training/cli/test_cli_training.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,6 @@ def test_list_clusters(self, cluster_name):
2727
"""Test listing clusters """
2828
assert cluster_name
2929

30-
def test_set_cluster_context(self, cluster_name):
31-
"""Test setting cluster context."""
32-
result = execute_command([
33-
"hyp", "set-cluster-context",
34-
"--cluster-name", cluster_name
35-
])
36-
assert result.returncode == 0
37-
context_line = result.stdout.strip().splitlines()[-1]
38-
assert any(text in context_line for text in ["Updated context", "Added new context"])
39-
4030
def test_get_cluster_context(self):
4131
"""Test getting current cluster context."""
4232
result = execute_command(["hyp", "get-cluster-context"])

test/integration_tests/training/sdk/test_sdk_training.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,9 @@ def test_list_jobs(self, pytorch_job):
7070
job_names = [job.metadata.name for job in jobs]
7171
assert pytorch_job.metadata.name in job_names
7272

73-
#
7473
def test_refresh_job(self, pytorch_job):
7574
pytorch_job.refresh()
76-
time.sleep(15)
75+
time.sleep(30)
7776
assert pytorch_job.status is not None, "Job status should not be None"
7877
logger.info(f"Refreshed job status:\n{yaml.dump(pytorch_job.status)}")
7978

0 commit comments

Comments
 (0)