# Hercules/variables.tf — input variables (ETHRoboticsClub/Hercules).

# AWS region setting; "us-east-1" is used when no value is supplied.
variable "region" {
  type        = string
  default     = "us-east-1"
  description = "AWS region"
}

# Optional named AWS CLI profile; null unless the caller sets one.
variable "aws_profile" {
  type        = string
  default     = null
  description = "AWS CLI named profile to use for authentication. Defaults to the AWS_PROFILE environment variable or the 'default' profile when null."
}

# EKS cluster name; falls back to "ethrc-prod-1".
variable "cluster_name" {
  type        = string
  default     = "ethrc-prod-1"
  description = "Name of the EKS cluster"
}

# Kubernetes version for the cluster, declared as a string rather than a number.
variable "cluster_version" {
  type        = string
  default     = "1.35"
  description = "Kubernetes version for the EKS cluster"
}

# CIDR block for the VPC; defaults to 10.0.0.0/16.
variable "vpc_cidr" {
  type        = string
  default     = "10.0.0.0/16"
  description = "CIDR block for VPC"
}

# Availability zones in which subnets are placed.
# Defaults to null; how a null value is resolved is decided elsewhere in the
# configuration (not visible in this file).
variable "availability_zones" {
  description = "Availability zones for subnets. EKS requires at least two."
  type        = list(string)
  default     = null

  # Enforce the "at least two" requirement stated in the description at plan
  # time. The conditional form keeps null (the default) valid and avoids calling
  # length() on a null value.
  validation {
    condition     = var.availability_zones == null ? true : length(distinct(var.availability_zones)) >= 2
    error_message = "availability_zones must list at least two distinct availability zones, or be left null."
  }
}

# Optional private subnet CIDRs; null unless the caller provides a list.
variable "private_subnet_cidrs" {
  type        = list(string)
  default     = null
  description = "CIDR blocks for private subnets (one per AZ)"
}

# Optional public subnet CIDRs; null unless the caller provides a list.
variable "public_subnet_cidrs" {
  type        = list(string)
  default     = null
  description = "CIDR blocks for public subnets (one per AZ)"
}

# Compute tier selector for the node group. Only the five tier names listed in
# the validation rule are accepted; "cpu" is the default.
variable "node_tier" {
  type        = string
  default     = "cpu"
  description = "Compute tier for the node group: 'cpu', 'gpus', 'gpum', 'gpul', or 'h100'. GPU tiers auto-install the NVIDIA GPU Operator."

  validation {
    error_message = "node_tier must be one of: cpu, gpus, gpum, gpul, h100."
    condition = contains(
      ["cpu", "gpus", "gpum", "gpul", "h100"],
      var.node_tier
    )
  }
}

# Toggle for installing the NVIDIA GPU Operator.
# NOTE(review): this defaults to true while node_tier defaults to "cpu" —
# confirm the operator is meant to be installed on CPU-only clusters.
variable "gpu_operator_enabled" {
  type        = bool
  default     = true
  description = "Install the NVIDIA GPU Operator. Set to true whenever GPU node pools are active."
}

# Instance types offered to the 'gpum' NodePool.
variable "gpum_instance_types" {
  type        = list(string)
  description = "List of EC2 instance types to use for the 'gpum' NodePool (1× L40S). g6e.2xlarge is a fallback for regions with limited g6e.xlarge availability."

  default = [
    "g6e.xlarge",
    "g6e.2xlarge",
  ]
}

# Worker node disk size in GB; 200 by default.
variable "node_disk_size" {
  type        = number
  default     = 200
  description = "Disk size in GB for worker nodes"
}

# Switch between BYOIP and AWS-provided IPv6 addressing; off by default.
variable "use_byoip_ipv6" {
  type        = bool
  default     = false
  description = "Use Bring Your Own IP (BYOIP) for IPv6 instead of AWS-provided addresses"
}

# BYOIP IPv6 pool identifier; null by default.
# The "required if use_byoip_ipv6 is true" rule from the description is not
# enforced by this declaration.
variable "byoip_ipv6_pool_id" {
  type        = string
  default     = null
  description = "AWS IPv6 BYOIP pool ID (required if use_byoip_ipv6 is true). Format: ipv6pool-ec2-xxxxxxxxxxxxxxxxx"
}

# Explicit BYOIP IPv6 CIDR; null by default.
variable "byoip_ipv6_cidr" {
  type        = string
  default     = null
  description = "BYOIP IPv6 CIDR block to use (e.g., 2001:db8:1234::/56). If not specified with use_byoip_ipv6, will use netmask length"
}

# Netmask length for the BYOIP IPv6 CIDR; 56 by default.
variable "byoip_ipv6_netmask_length" {
  type        = number
  default     = 56
  description = "Netmask length for BYOIP IPv6 CIDR (typically 56 for VPC from a /48 allocation)"
}

# Tag map shared by all resources.
# NOTE(review): Environment defaults to "development" although Project defaults
# to "ethrc-prod-1" — confirm this combination is intended.
variable "tags" {
  type        = map(string)
  description = "Common tags to apply to all resources"

  default = {
    Environment = "development"
    ManagedBy   = "OpenTofu"
    Project     = "ethrc-prod-1"
  }
}

# IAM principals granted cluster access, keyed by a friendly name.
# An empty map (the default) declares no access entries.
variable "cluster_access" {
  default     = {}
  description = "Map of IAM principals to grant cluster access. Keys are friendly names; values specify the principal ARN, EKS access policy, and an optional list of namespaces. When namespaces is non-empty the access entry is scoped to those namespaces only; omit (or leave empty) for cluster-wide access."

  type = map(object({
    # ARN of the IAM principal receiving access.
    principal_arn = string

    # AmazonEKSClusterAdminPolicy | AmazonEKSAdminPolicy | AmazonEKSEditPolicy | AmazonEKSViewPolicy
    policy = string

    # [] = cluster-wide; non-empty = namespace-scoped
    namespaces = optional(list(string), [])
  }))
}

# Extra S3 bucket ARNs; empty by default.
variable "s3_bucket_arns" {
  type        = list(string)
  default     = []
  description = "Additional S3 bucket ARNs to expose via the CSI driver alongside the ML data bucket."
}

# Maximum lifetime for GPU nodes, passed as a duration string; "48h" by default.
# The format is not validated by this declaration.
variable "gpu_node_max_lifetime" {
  type        = string
  default     = "48h"
  description = "Hard TTL for gpus/gpum/gpul nodes. Karpenter drains and terminates any node running longer than this duration, regardless of workload state. Go duration syntax (e.g. \"24h\", \"72h\"). Set to \"Never\" to disable."
}

# Name of the S3 bucket for ML training data, checkpoints, and model artefacts.
# No default is declared, so callers must always supply a value.
variable "ml_data_bucket_name" {
  description = "Name of the S3 bucket for ML training data, checkpoints, and model artefacts. Must be globally unique."
  type        = string

  # Catch names that break the S3 general-purpose bucket naming rules at plan
  # time instead of during apply: 3-63 characters, lowercase letters, digits,
  # dots and hyphens only, starting and ending with a letter or digit, and no
  # two adjacent dots. Global uniqueness cannot be checked here.
  validation {
    condition     = can(regex("^[a-z0-9][a-z0-9.-]{1,61}[a-z0-9]$", var.ml_data_bucket_name)) && !can(regex("[.][.]", var.ml_data_bucket_name))
    error_message = "ml_data_bucket_name must be 3-63 characters of lowercase letters, digits, dots, or hyphens, must start and end with a letter or digit, and must not contain adjacent dots."
  }
}


# Toggle for the ArgoCD installation; on by default.
variable "argocd_enabled" {
  type        = bool
  default     = true
  description = "Install ArgoCD for GitOps-driven ML workload management."
}

# Pinned argo-cd Helm chart version.
variable "argocd_chart_version" {
  type        = string
  default     = "9.4.5"
  description = "Version of the argo-cd Helm chart to install."
}

# Git repositories the AppProject may sync from.
# NOTE(review): the default "*" allows any repository, which the description
# itself advises against for production — confirm callers override it.
variable "argocd_source_repos" {
  type        = list(string)
  default     = ["*"]
  description = "List of git repository URLs the ml-workloads AppProject is allowed to sync from. Restrict to specific repo URLs in production to prevent syncing from untrusted sources."
}

# Workload namespaces to create; four team namespaces by default.
variable "workload_namespaces" {
  type        = list(string)
  description = "Namespaces to create. One ArgoCD AppProject is created per namespace and each project is restricted to its own namespace as the only destination."

  default = [
    "robot-learning",
    "humanoid",
    "aeronautics",
    "cybersecurity",
  ]
}

# Namespace name -> SSO/OIDC group names; empty by default.
variable "argocd_team_groups" {
  type        = map(list(string))
  default     = {}
  description = "Map of workload namespace names to lists of SSO/OIDC group names. Members of these groups receive edit access to the corresponding team's ArgoCD AppProject and namespace only. Leave empty to configure SSO group bindings outside of Terraform."
}

# ── ECR variables ─────────────────────────────────────────────────────────
# ECR repositories to create; two image repositories by default.
variable "ecr_repository_names" {
  type        = list(string)
  description = "ECR repository names to create for container images."

  default = [
    "ethroboticsclub/pytorch",
    "ethroboticsclub/jax",
  ]
}

# GitHub repositories (org/repo) permitted to push images to ECR.
variable "ecr_github_repositories" {
  type        = list(string)
  default     = ["ethroboticsclub/docker-images"]
  description = "GitHub repositories allowed to push to ECR (format: org/repo)."
}


# ── Access restriction ─────────────────────────────────────────────────────────

# IPv4 allow-list for the public EKS API endpoint; empty by default.
variable "api_server_allowed_cidrs" {
  type        = list(string)
  default     = []
  description = "IPv4 CIDRs permitted to reach the public EKS API server (kubectl). Defaults to unrestricted when empty. AWS does not accept IPv6 here — use waf_as214770_cidrs for IPv6 coverage on application endpoints."
}

# AS214770 prefixes allowed through the WAF; empty by default.
variable "waf_as214770_cidrs" {
  type        = list(string)
  default     = []
  description = "IP prefixes (IPv4 and/or IPv6) announced by AS214770. These are allowed through the WAF alongside the Switzerland geo-match rule. Fetch current prefixes from https://bgp.he.net/AS214770."
}

# Public hostname for the ArgoCD UI; null by default.
# The description previously pointed at argocd_certificate_arn, which is not
# declared in this file. route53_zone_name is documented as required whenever
# this hostname is set and as the zone used to DNS-validate auto-created ACM
# certificates, so the description now refers to that variable instead.
# NOTE(review): confirm no argocd_certificate_arn input exists in another file.
variable "argocd_hostname" {
  description = "Public hostname for the ArgoCD UI (e.g. argocd.example.com). Set alongside route53_zone_name (used for DNS validation of the auto-created ACM certificate) to create an internet-facing ALB with WAF."
  type        = string
  default     = null
}

# Route 53 hosted zone name; null by default.
# The "required when argocd_hostname is set" rule from the description is not
# enforced by this declaration.
variable "route53_zone_name" {
  type        = string
  default     = null
  description = "Route 53 hosted zone name used to write DNS validation records for auto-created ACM certificates (e.g. \"ethrc.rgn.dev\"). Required when argocd_hostname is set."
}

# Toggle for Kubeflow Trainer v2; on by default.
variable "kubeflow_training_operator_enabled" {
  type        = bool
  default     = true
  description = "Install Kubeflow Trainer v2 for distributed ML training with the TrainJob API. Includes JobSet and default ClusterTrainingRuntimes (torch, deepspeed, mlx, jax, torchtune)."
}

# Toggle for the Kubeflow Central Dashboard; on by default.
# The dependency on kubeflow_training_operator_enabled named in the description
# is not enforced by this declaration.
variable "kubeflow_dashboard_enabled" {
  type        = bool
  default     = true
  description = "Deploy the Kubeflow Central Dashboard. Requires kubeflow_training_operator_enabled = true."
}

# Public hostname for the Kubeflow Dashboard; null by default.
variable "kubeflow_dashboard_hostname" {
  type        = string
  default     = null
  description = "Public hostname for the Kubeflow Dashboard (e.g. kubeflow.example.com). Set alongside kubeflow_dashboard_certificate_arn to create an internet-facing ALB with WAF."
}

# ACM certificate ARN for the Kubeflow Dashboard listener; null by default.
variable "kubeflow_dashboard_certificate_arn" {
  type        = string
  default     = null
  description = "ACM certificate ARN for the Kubeflow Dashboard HTTPS listener. Must cover kubeflow_dashboard_hostname."
}

# Node placement toggle; public subnets are used by default.
variable "use_public_subnets_for_nodes" {
  type        = bool
  default     = true
  description = "Place nodes in public subnets with auto-assigned public IPs."
}

# ── Hybrid EKS Nodes Configuration ─────────────────────────────────────────────

# Master switch for hybrid (on-premises) EKS nodes; off by default.
variable "enable_hybrid_nodes" {
  type        = bool
  default     = false
  description = "Enable support for hybrid EKS nodes (on-premises infrastructure connected to the cluster)."
}

# IPv4 CIDRs of the on-premises hybrid nodes; empty by default.
variable "hybrid_node_cidrs" {
  type        = list(string)
  default     = []
  description = "IPv4 CIDR blocks for on-premises hybrid nodes that need access to the EKS cluster. Only used when enable_hybrid_nodes is true."
}

# IPv4 CIDRs of the remote pod networks on hybrid nodes; empty by default.
# The overlap and RFC1918 constraints in the description are not validated here.
variable "hybrid_node_pod_cidrs" {
  type        = list(string)
  default     = []
  description = "IPv4 CIDR blocks for hybrid node pods (remote pod networks). AWS requires RFC1918 ranges (10/8, 172.16/12, or 192.168/16). Must not overlap with VPC CIDR or hybrid_node_cidrs. Only used when enable_hybrid_nodes is true."
}

# Upper bound on hybrid nodes that may register through the SSM activation;
# 10 by default.
variable "hybrid_node_registration_limit" {
  description = "Maximum number of hybrid nodes that can be registered via SSM activation."
  type        = number
  default     = 10

  # SSM activations accept a registration limit between 1 and 1000, so reject
  # fractional or out-of-range values at plan time. The conditional keeps an
  # explicit null valid, as it was before this rule existed.
  validation {
    condition = (
      var.hybrid_node_registration_limit == null ? true : (
        var.hybrid_node_registration_limit >= 1 &&
        var.hybrid_node_registration_limit <= 1000 &&
        floor(var.hybrid_node_registration_limit) == var.hybrid_node_registration_limit
      )
    )
    error_message = "hybrid_node_registration_limit must be a whole number between 1 and 1000."
  }
}

# ── Tailscale Configuration ───────────────────────────────────────────────────

# Toggle for the Tailscale operator; off by default.
variable "tailscale_enabled" {
  type        = bool
  default     = false
  description = "Install the Tailscale operator for mesh networking between hybrid nodes and the EKS cluster."
}

# Tailscale OAuth client ID; marked sensitive and null by default.
# The "required when tailscale_enabled is true" rule is not enforced here.
variable "tailscale_oauth_client_id" {
  type        = string
  sensitive   = true
  default     = null
  description = "Tailscale OAuth client ID for the operator. Required when tailscale_enabled is true."
}

# Tailscale OAuth client secret; marked sensitive and null by default.
# The "required when tailscale_enabled is true" rule is not enforced here.
variable "tailscale_oauth_client_secret" {
  type        = string
  sensitive   = true
  default     = null
  description = "Tailscale OAuth client secret for the operator. Required when tailscale_enabled is true."
}

# Pinned Tailscale Helm chart version.
variable "tailscale_chart_version" {
  type        = string
  default     = "1.78.3"
  description = "Version of the Tailscale Helm chart to install."
}

# ── SkyPilot API Server Configuration ─────────────────────────────────────────

# Toggle for the SkyPilot API server deployment; off by default.
variable "skypilot_api_server_enabled" {
  type        = bool
  default     = false
  description = "Deploy SkyPilot API server for remote job submission. Requires EBS CSI driver for persistent storage."
}

# Basic-auth password for the SkyPilot API server; sensitive, null by default.
# The "required when skypilot_api_server_enabled is true" rule is not enforced here.
variable "skypilot_api_server_web_password" {
  type        = string
  sensitive   = true
  default     = null
  description = "Basic auth password for SkyPilot API server. Required when skypilot_api_server_enabled is true."
}

# Public hostname for the SkyPilot API server; null by default.
variable "skypilot_api_server_hostname" {
  type        = string
  default     = null
  description = "Public hostname for SkyPilot API server (e.g. skypilot.example.com). When set alongside route53_zone_name, an ACM certificate is auto-created and DNS-validated."
}

# SkyPilot Helm chart version; defaults to the "0.1.0-devel" chart.
variable "skypilot_api_server_chart_version" {
  type        = string
  default     = "0.1.0-devel"
  description = "Version of the SkyPilot Helm chart to install."
}

# CPU and memory requests/limits for the SkyPilot API server.
# Requests and limits default to the same values (4 CPU, 8Gi memory).
variable "skypilot_api_server_resources" {
  description = "Resource requests and limits for SkyPilot API server. Defaults per SkyPilot docs: 4 CPU / 8Gi memory."

  type = object({
    requests = object({ cpu = string, memory = string })
    limits   = object({ cpu = string, memory = string })
  })

  default = {
    requests = { cpu = "4", memory = "8Gi" }
    limits   = { cpu = "4", memory = "8Gi" }
  }
}