Skip to content

Commit 24745cb

Browse files
authored
Set up instance-level firewall on all backends (#3058)
1 parent bcc2fa1 commit 24745cb

File tree

8 files changed

+76
-38
lines changed

8 files changed

+76
-38
lines changed

docs/docs/concepts/backends.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ There are two ways to configure AWS: using an access key or using the default cr
243243
* `user` with passwordless sudo access
244244
* Docker is installed
245245
* (For NVIDIA instances) NVIDIA/CUDA drivers and NVIDIA Container Toolkit are installed
246+
* The firewall (`iptables`, `ufw`, etc.) must allow incoming external traffic to port 22 (SSH) and all traffic within the private subnet, and should block any other incoming external traffic.
246247

247248
## Azure
248249

src/dstack/_internal/core/backends/aws/compute.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,12 @@ def create_instance(
292292
image_id=image_id,
293293
instance_type=instance_offer.instance.name,
294294
iam_instance_profile=self.config.iam_instance_profile,
295-
user_data=get_user_data(authorized_keys=instance_config.get_public_keys()),
295+
user_data=get_user_data(
296+
authorized_keys=instance_config.get_public_keys(),
297+
# Custom OS images may lack ufw, so don't attempt to set up the firewall.
298+
# Rely on security groups and the image's built-in firewall rules instead.
299+
skip_firewall_setup=self.config.os_images is not None,
300+
),
296301
tags=aws_resources.make_tags(tags),
297302
security_group_id=security_group_id,
298303
spot=instance_offer.instance.resources.spot,

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import string
55
import threading
66
from abc import ABC, abstractmethod
7+
from collections.abc import Iterable
78
from functools import lru_cache
89
from pathlib import Path
910
from typing import Dict, List, Literal, Optional
@@ -45,6 +46,7 @@
4546

4647
DSTACK_SHIM_BINARY_NAME = "dstack-shim"
4748
DSTACK_RUNNER_BINARY_NAME = "dstack-runner"
49+
DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
4850

4951
GoArchType = Literal["amd64", "arm64"]
5052

@@ -507,12 +509,16 @@ def get_user_data(
507509
base_path: Optional[PathLike] = None,
508510
bin_path: Optional[PathLike] = None,
509511
backend_shim_env: Optional[Dict[str, str]] = None,
512+
skip_firewall_setup: bool = False,
513+
firewall_allow_from_subnets: Iterable[str] = DEFAULT_PRIVATE_SUBNETS,
510514
) -> str:
511515
shim_commands = get_shim_commands(
512516
authorized_keys=authorized_keys,
513517
base_path=base_path,
514518
bin_path=bin_path,
515519
backend_shim_env=backend_shim_env,
520+
skip_firewall_setup=skip_firewall_setup,
521+
firewall_allow_from_subnets=firewall_allow_from_subnets,
516522
)
517523
commands = (backend_specific_commands or []) + shim_commands
518524
return get_cloud_config(
@@ -554,8 +560,13 @@ def get_shim_commands(
554560
bin_path: Optional[PathLike] = None,
555561
backend_shim_env: Optional[Dict[str, str]] = None,
556562
arch: Optional[str] = None,
563+
skip_firewall_setup: bool = False,
564+
firewall_allow_from_subnets: Iterable[str] = DEFAULT_PRIVATE_SUBNETS,
557565
) -> List[str]:
558-
commands = get_setup_cloud_instance_commands()
566+
commands = get_setup_cloud_instance_commands(
567+
skip_firewall_setup=skip_firewall_setup,
568+
firewall_allow_from_subnets=firewall_allow_from_subnets,
569+
)
559570
commands += get_shim_pre_start_commands(
560571
base_path=base_path,
561572
bin_path=bin_path,
@@ -638,8 +649,11 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
638649
return url_template.format(version=version, arch=arch)
639650

640651

641-
def get_setup_cloud_instance_commands() -> list[str]:
642-
return [
652+
def get_setup_cloud_instance_commands(
653+
skip_firewall_setup: bool,
654+
firewall_allow_from_subnets: Iterable[str],
655+
) -> list[str]:
656+
commands = [
643657
# Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
644658
# Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have.
645659
(
@@ -653,6 +667,19 @@ def get_setup_cloud_instance_commands() -> list[str]:
653667
"'"
654668
),
655669
]
670+
if not skip_firewall_setup:
671+
commands += [
672+
"ufw --force reset", # Some OS images have default rules like `allow 80`. Delete them
673+
"ufw default deny incoming",
674+
"ufw default allow outgoing",
675+
"ufw allow ssh",
676+
]
677+
for subnet in firewall_allow_from_subnets:
678+
commands.append(f"ufw allow from {subnet}")
679+
commands += [
680+
"ufw --force enable",
681+
]
682+
return commands
656683

657684

658685
def get_shim_pre_start_commands(

src/dstack/_internal/core/backends/digitalocean_base/compute.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,6 @@
2626
logger = get_logger(__name__)
2727

2828
MAX_INSTANCE_NAME_LEN = 60
29-
30-
# Setup commands for DigitalOcean instances
31-
SETUP_COMMANDS = [
32-
"sudo ufw delete limit ssh",
33-
"sudo ufw allow ssh",
34-
]
35-
3629
DOCKER_INSTALL_COMMANDS = [
3730
"export DEBIAN_FRONTEND=noninteractive",
3831
"mkdir -p /etc/apt/keyrings",
@@ -92,9 +85,9 @@ def create_instance(
9285
size_slug = instance_offer.instance.name
9386

9487
if not instance_offer.instance.resources.gpus:
95-
backend_specific_commands = SETUP_COMMANDS + DOCKER_INSTALL_COMMANDS
88+
backend_specific_commands = DOCKER_INSTALL_COMMANDS
9689
else:
97-
backend_specific_commands = SETUP_COMMANDS
90+
backend_specific_commands = None
9891

9992
project_id = None
10093
if self.config.project_name:

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import threading
44
from collections import defaultdict
5+
from dataclasses import dataclass
56
from typing import Callable, Dict, List, Literal, Optional, Tuple
67

78
import google.api_core.exceptions
@@ -285,16 +286,18 @@ def create_instance(
285286
)
286287
raise NoCapacityError()
287288

289+
image = _get_image(
290+
instance_type_name=instance_offer.instance.name,
291+
cuda=len(instance_offer.instance.resources.gpus) > 0,
292+
)
293+
288294
for zone in zones:
289295
request = compute_v1.InsertInstanceRequest()
290296
request.zone = zone
291297
request.project = self.config.project_id
292298
request.instance_resource = gcp_resources.create_instance_struct(
293299
disk_size=disk_size,
294-
image_id=_get_image_id(
295-
instance_type_name=instance_offer.instance.name,
296-
cuda=len(instance_offer.instance.resources.gpus) > 0,
297-
),
300+
image_id=image.id,
298301
machine_type=instance_offer.instance.name,
299302
accelerators=gcp_resources.get_accelerators(
300303
project_id=self.config.project_id,
@@ -305,6 +308,7 @@ def create_instance(
305308
user_data=_get_user_data(
306309
authorized_keys=authorized_keys,
307310
instance_type_name=instance_offer.instance.name,
311+
is_ufw_installed=image.is_ufw_installed,
308312
),
309313
authorized_keys=authorized_keys,
310314
labels=labels,
@@ -889,24 +893,41 @@ def _get_vpc_subnet(
889893
)
890894

891895

892-
def _get_image_id(instance_type_name: str, cuda: bool) -> str:
896+
@dataclass
897+
class GCPImage:
898+
id: str
899+
is_ufw_installed: bool
900+
901+
902+
def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
893903
if instance_type_name == "a3-megagpu-8g":
894904
image_name = "dstack-a3mega-5"
905+
is_ufw_installed = False
895906
elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
896-
return "projects/cos-cloud/global/images/cos-105-17412-535-78"
907+
return GCPImage(
908+
id="projects/cos-cloud/global/images/cos-105-17412-535-78",
909+
is_ufw_installed=False,
910+
)
897911
elif cuda:
898912
image_name = f"dstack-cuda-{version.base_image}"
913+
is_ufw_installed = True
899914
else:
900915
image_name = f"dstack-{version.base_image}"
916+
is_ufw_installed = True
901917
image_name = image_name.replace(".", "-")
902-
return f"projects/dstack/global/images/{image_name}"
918+
return GCPImage(
919+
id=f"projects/dstack/global/images/{image_name}",
920+
is_ufw_installed=is_ufw_installed,
921+
)
903922

904923

905924
def _get_gateway_image_id() -> str:
906925
return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
907926

908927

909-
def _get_user_data(authorized_keys: List[str], instance_type_name: str) -> str:
928+
def _get_user_data(
929+
authorized_keys: List[str], instance_type_name: str, is_ufw_installed: bool
930+
) -> str:
910931
base_path = None
911932
bin_path = None
912933
backend_shim_env = None
@@ -929,6 +950,9 @@ def _get_user_data(authorized_keys: List[str], instance_type_name: str) -> str:
929950
base_path=base_path,
930951
bin_path=bin_path,
931952
backend_shim_env=backend_shim_env,
953+
# The instance-level firewall is optional on GCP: the main protection comes from GCP firewalls.
954+
# So only set up the instance-level firewall as an additional measure, and only if ufw is available.
955+
skip_firewall_setup=not is_ufw_installed,
932956
)
933957

934958

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,6 @@
5959
"exec-opts": ["native.cgroupdriver=cgroupfs"],
6060
}
6161
SETUP_COMMANDS = [
62-
"ufw allow ssh",
63-
"ufw allow from 10.0.0.0/8",
64-
"ufw allow from 172.16.0.0/12",
65-
"ufw allow from 192.168.0.0/16",
66-
"ufw default deny incoming",
67-
"ufw default allow outgoing",
68-
"ufw enable",
6962
'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config',
7063
"service ssh restart",
7164
f"echo {shlex.quote(json.dumps(DOCKER_DAEMON_CONFIG))} > /etc/docker/daemon.json",

src/dstack/_internal/core/backends/oci/compute.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,10 @@ def create_instance(
135135
security_group.id, region.virtual_network_client
136136
)
137137

138-
setup_commands = [
139-
f"sudo iptables -I INPUT -s {resources.VCN_CIDR} -j ACCEPT",
140-
"sudo netfilter-persistent save",
141-
]
142-
cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
138+
cloud_init_user_data = get_user_data(
139+
authorized_keys=instance_config.get_public_keys(),
140+
firewall_allow_from_subnets=[resources.VCN_CIDR],
141+
)
143142

144143
display_name = generate_unique_instance_name(instance_config)
145144
try:

src/dstack/_internal/core/backends/vultr/compute.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,17 +75,13 @@ def create_instance(
7575
subnet = vpc["v4_subnet"]
7676
subnet_mask = vpc["v4_subnet_mask"]
7777

78-
setup_commands = [
79-
f"sudo ufw allow from {subnet}/{subnet_mask}",
80-
"sudo ufw reload",
81-
]
8278
instance_id = self.api_client.launch_instance(
8379
region=instance_offer.region,
8480
label=instance_name,
8581
plan=instance_offer.instance.name,
8682
user_data=get_user_data(
8783
authorized_keys=instance_config.get_public_keys(),
88-
backend_specific_commands=setup_commands,
84+
firewall_allow_from_subnets=[f"{subnet}/{subnet_mask}"],
8985
),
9086
vpc_id=vpc["id"],
9187
)

0 commit comments

Comments
 (0)