Skip to content

Commit bba82d7

Browse files
aws-brianxia authored and Mohamed Zeidan committed
Add additional Space parameters for resources including the fractional GPU (#287)
1 parent c008e92 commit bba82d7

4 files changed

Lines changed: 96 additions & 17 deletions

File tree

src/sagemaker/hyperpod/cli/space_utils.py

Lines changed: 69 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -42,22 +42,40 @@ def generate_click_command(
4242

4343
def decorator(func: Callable) -> Callable:
4444
# build resources from CPU/memory options
45-
def _build_resources(cpu, memory, gpu):
46-
if cpu is None and memory is None and gpu is None:
45+
def _build_resources(cpu, cpu_limit, memory, memory_limit, gpu, gpu_limit,
46+
accelerator_partition_type, accelerator_partition_count):
47+
if not any([cpu, cpu_limit, memory, memory_limit, gpu, gpu_limit,
48+
accelerator_partition_type, accelerator_partition_count]):
4749
return None
50+
51+
if (accelerator_partition_type is None) ^ (accelerator_partition_count is None):
52+
raise click.UsageError(
53+
"Both accelerator-partition-type and accelerator-partition-count must be specified together"
54+
)
4855

4956
# Build requests dictionary
5057
requests = {}
58+
limits = {}
5159
if cpu is not None:
5260
requests["cpu"] = cpu
61+
if cpu_limit is not None:
62+
limits["cpu"] = cpu_limit
5363
if memory is not None:
5464
requests["memory"] = memory
65+
if memory_limit is not None:
66+
limits["memory"] = memory_limit
5567
if gpu is not None:
5668
requests["nvidia.com/gpu"] = gpu
69+
if gpu_limit is not None:
70+
limits["nvidia.com/gpu"] = gpu_limit
71+
if accelerator_partition_type is not None and accelerator_partition_count is not None:
72+
requests[f"nvidia.com/{accelerator_partition_type}"] = accelerator_partition_count
73+
limits[f"nvidia.com/{accelerator_partition_type}"] = accelerator_partition_count
5774

5875
# Return ResourceRequirements structure
5976
return {
60-
"requests": requests
77+
"requests": requests,
78+
"limits": limits,
6179
}
6280

6381
def _parse_volume_param(ctx, param, value):
@@ -140,7 +158,16 @@ def wrapped_func(*args, **kwargs):
140158
if Model is None:
141159
raise click.ClickException(f"Unsupported schema version: {version}")
142160

143-
resources = _build_resources(kwargs.pop("cpu", None), kwargs.pop("memory", None), kwargs.pop("gpu", None))
161+
resources = _build_resources(
162+
kwargs.pop("cpu", None),
163+
kwargs.pop("cpu_limit", None),
164+
kwargs.pop("memory", None),
165+
kwargs.pop("memory_limit", None),
166+
kwargs.pop("gpu", None),
167+
kwargs.pop("gpu_limit", None),
168+
kwargs.pop("accelerator_partition_type", None),
169+
kwargs.pop("accelerator_partition_count", None),
170+
)
144171
if resources is not None:
145172
kwargs["resources"] = resources
146173

@@ -210,21 +237,56 @@ def wrapped_func(*args, **kwargs):
210237
"--cpu",
211238
type=str,
212239
default=None,
213-
help="CPU resource, e.g. '250m'",
240+
help="CPU resource request, e.g. '250m'",
241+
)(wrapped_func)
242+
243+
wrapped_func = click.option(
244+
"--cpu-limit",
245+
type=str,
246+
default=None,
247+
help="CPU resource limit, e.g. '250m'",
214248
)(wrapped_func)
215249

216250
wrapped_func = click.option(
217251
"--memory",
218252
type=str,
219253
default=None,
220-
help="Memory resource, e.g. '256Mi'",
254+
help="Memory resource request, e.g. '256Mi'",
255+
)(wrapped_func)
256+
257+
wrapped_func = click.option(
258+
"--memory-limit",
259+
type=str,
260+
default=None,
261+
help="Memory resource limit, e.g. '256Mi'",
221262
)(wrapped_func)
222263

223264
wrapped_func = click.option(
224265
"--gpu",
225266
type=str,
226267
default=None,
227-
help="Gpu resource, e.g. '1'",
268+
help="Gpu resource request, e.g. '1'",
269+
)(wrapped_func)
270+
271+
wrapped_func = click.option(
272+
"--gpu-limit",
273+
type=str,
274+
default=None,
275+
help="Gpu resource limit, e.g. '1'",
276+
)(wrapped_func)
277+
278+
wrapped_func = click.option(
279+
"--accelerator-partition-type",
280+
type=str,
281+
default=None,
282+
help="Fractional GPU parition type",
283+
)(wrapped_func)
284+
285+
wrapped_func = click.option(
286+
"--accelerator-partition-count",
287+
type=str,
288+
default=None,
289+
help="Fractional GPU parition count",
228290
)(wrapped_func)
229291

230292
wrapped_func = click.option(

src/sagemaker/hyperpod/space/hyperpod_space.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -314,7 +314,7 @@ def list_pods(self) -> List[str]:
314314
try:
315315
pods = v1.list_namespaced_pod(
316316
namespace=self.config.namespace,
317-
label_selector=f"{SPACE_GROUP}/workspaceName={self.config.name}"
317+
label_selector=f"{SPACE_GROUP}/workspace-name={self.config.name}"
318318
)
319319
return [pod.metadata.name for pod in pods.items]
320320
except Exception as e:

test/unit_tests/cli/test_space_utils.py

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@ def cmd(version, domain_config):
9191

9292
@patch('sagemaker.hyperpod.cli.space_utils.load_schema_for_version')
9393
def test_resources_building(self, mock_load_schema):
94-
"""Test CPU and memory resource building"""
94+
"""Test CPU, memory, GPU and fractional GPU resource building"""
9595
schema = {
9696
'properties': {
9797
'resources': {
@@ -117,26 +117,43 @@ class Config:
117117
def cmd(version, domain_config):
118118
click.echo(json.dumps(domain_config.get('resources')))
119119

120-
# Test with custom CPU and memory
121-
result = self.runner.invoke(cmd, ['--cpu', '1000m', '--memory', '1Gi'])
120+
# Test with CPU and memory requests and limits
121+
result = self.runner.invoke(cmd, ['--cpu', '1000m', '--cpu-limit', '2000m', '--memory', '1Gi', '--memory-limit', '2Gi'])
122122
assert result.exit_code == 0
123123
output = json.loads(result.output)
124124
assert output['requests']['cpu'] == '1000m'
125125
assert output['requests']['memory'] == '1Gi'
126-
assert 'nvidia.com/gpu' not in output['requests']
126+
assert output['limits']['cpu'] == '2000m'
127+
assert output['limits']['memory'] == '2Gi'
127128

128-
# Test with only CPU
129-
result = self.runner.invoke(cmd, ['--cpu', '750m'])
129+
# Test with GPU requests and limits
130+
result = self.runner.invoke(cmd, ['--gpu', '1', '--gpu-limit', '2'])
130131
assert result.exit_code == 0
131132
output = json.loads(result.output)
132-
assert output['requests']['cpu'] == '750m'
133-
assert 'memory' not in output['requests']
133+
assert output['requests']['nvidia.com/gpu'] == '1'
134+
assert output['limits']['nvidia.com/gpu'] == '2'
135+
136+
# Test with fractional GPU partitioning
137+
result = self.runner.invoke(cmd, ['--accelerator-partition-type', 'mig-1g.5gb', '--accelerator-partition-count', '2'])
138+
assert result.exit_code == 0
139+
output = json.loads(result.output)
140+
assert output['requests']['nvidia.com/mig-1g.5gb'] == '2'
141+
assert output['limits']['nvidia.com/mig-1g.5gb'] == '2'
134142

135143
# Test with no resources specified
136144
result = self.runner.invoke(cmd, [])
137145
assert result.exit_code == 0
138146
assert result.output.strip() == 'null'
139147

148+
# Test error when only one accelerator partition parameter is provided
149+
result = self.runner.invoke(cmd, ['--accelerator-partition-type', 'mig-1g.5gb'])
150+
assert result.exit_code == 2
151+
assert 'Both accelerator-partition-type and accelerator-partition-count must be specified together' in result.output
152+
153+
result = self.runner.invoke(cmd, ['--accelerator-partition-count', '2'])
154+
assert result.exit_code == 2
155+
assert 'Both accelerator-partition-type and accelerator-partition-count must be specified together' in result.output
156+
140157
@patch('sagemaker.hyperpod.cli.space_utils.load_schema_for_version')
141158
def test_type_conversion(self, mock_load_schema):
142159
"""Test type conversion for different parameter types"""

test/unit_tests/test_hyperpod_space.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -504,7 +504,7 @@ def test_list_pods_success(self, mock_verify_config, mock_core_api_class):
504504
self.assertEqual(result, ["pod1", "pod2"])
505505
mock_core_api.list_namespaced_pod.assert_called_once_with(
506506
namespace="test-namespace",
507-
label_selector="workspace.jupyter.org/workspaceName=test-space"
507+
label_selector="workspace.jupyter.org/workspace-name=test-space"
508508
)
509509

510510
@patch('sagemaker.hyperpod.space.hyperpod_space.client.CoreV1Api')

0 commit comments

Comments
 (0)