Skip to content

Commit d4d925c

Browse files
author
weiliu2
committed
Add private-join and private-leave commands for Private AKS cluster
Add current node into the private cluster. - Add private-join command to join Private AKS cluster via Gateway - Add private-leave command with --mode=local|full cleanup options - Add shell scripts for installation and uninstallation - Add documentation for creating Private AKS cluster - Readme.md and create_private_cluster.md show how to use it
1 parent 9c281de commit d4d925c

12 files changed

Lines changed: 1809 additions & 12 deletions

File tree

commands.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"go.goms.io/aks/AKSFlexNode/pkg/bootstrapper"
1515
"go.goms.io/aks/AKSFlexNode/pkg/config"
1616
"go.goms.io/aks/AKSFlexNode/pkg/logger"
17+
"go.goms.io/aks/AKSFlexNode/pkg/privatecluster"
1718
"go.goms.io/aks/AKSFlexNode/pkg/status"
1819
)
1920

@@ -118,6 +119,107 @@ func runVersion() {
118119
fmt.Printf("Build Time: %s\n", BuildTime)
119120
}
120121

122+
// Flag values for the private-cluster commands (private-join / private-leave).
123+
var (
124+
// aksResourceID receives the --aks-resource-id flag value. Both private-join
// and private-leave bind their flag to this single variable.
aksResourceID string
125+
// cleanupModeFlag receives the raw --mode flag value of private-leave
// (flag default: "local"); it is validated in runPrivateLeave.
cleanupModeFlag string
126+
)
127+
128+
// NewPrivateJoinCommand creates a new private-join command
129+
func NewPrivateJoinCommand() *cobra.Command {
130+
cmd := &cobra.Command{
131+
Use: "private-join",
132+
Short: "Join a Private AKS cluster (requires sudo)",
133+
Long: `Join a Private AKS cluster.
134+
135+
Prerequisites:
136+
1. A Private AKS cluster must exist with AAD and Azure RBAC enabled
137+
See: pkg/privatecluster/create_private_cluster.md
138+
139+
2. Current user must have the following roles on the cluster:
140+
- Azure Kubernetes Service Cluster Admin Role
141+
- Azure Kubernetes Service RBAC Cluster Admin
142+
143+
3. Current user must be logged in via 'sudo az login'
144+
145+
The full resource ID of the Private AKS cluster is required as the --aks-resource-id parameter.
146+
This same resource ID can be used later with the private-leave command.`,
147+
RunE: func(cmd *cobra.Command, args []string) error {
148+
return runPrivateJoin(cmd.Context())
149+
},
150+
}
151+
152+
cmd.Flags().StringVar(&aksResourceID, "aks-resource-id", "", "AKS cluster resource ID (required)")
153+
cmd.MarkFlagRequired("aks-resource-id")
154+
155+
return cmd
156+
}
157+
158+
// NewPrivateLeaveCommand creates a new private-leave command
159+
func NewPrivateLeaveCommand() *cobra.Command {
160+
cmd := &cobra.Command{
161+
Use: "private-leave",
162+
Short: "Leave a Private AKS cluster (--mode=local|full, requires sudo)",
163+
Long: `Remove this edge node from a Private AKS cluster.
164+
165+
Cleanup modes:
166+
--local Local cleanup only (default):
167+
- Remove node from AKS cluster
168+
- Run aks-flex-node unbootstrap
169+
- Remove Arc Agent
170+
- Stop VPN and remove client config
171+
- Keep Gateway for other nodes
172+
173+
--full Full cleanup (requires --aks-resource-id):
174+
- All local cleanup steps
175+
- Delete Gateway VM
176+
- Delete Gateway subnet, NSG, Public IP
177+
- Delete SSH keys
178+
179+
This command requires the current user to be logged in via 'sudo az login'.`,
180+
RunE: func(cmd *cobra.Command, args []string) error {
181+
return runPrivateLeave(cmd.Context())
182+
},
183+
}
184+
185+
cmd.Flags().StringVar(&cleanupModeFlag, "mode", "local", "Cleanup mode: 'local' (keep Gateway) or 'full' (remove all Azure resources)")
186+
cmd.Flags().StringVar(&aksResourceID, "aks-resource-id", "", "AKS cluster resource ID (required for --mode=full)")
187+
188+
return cmd
189+
}
190+
191+
// runPrivateJoin executes the private cluster join process
192+
func runPrivateJoin(ctx context.Context) error {
193+
if os.Getuid() != 0 {
194+
return fmt.Errorf("this command requires root privileges, please run with 'sudo'")
195+
}
196+
runner := privatecluster.NewScriptRunner("")
197+
return runner.RunPrivateInstall(ctx, aksResourceID)
198+
}
199+
200+
// runPrivateLeave executes the private cluster leave process
201+
func runPrivateLeave(ctx context.Context) error {
202+
if os.Getuid() != 0 {
203+
return fmt.Errorf("this command requires root privileges, please run with 'sudo'")
204+
}
205+
// Validate cleanup mode
206+
var mode privatecluster.CleanupMode
207+
switch cleanupModeFlag {
208+
case "local":
209+
mode = privatecluster.CleanupModeLocal
210+
case "full":
211+
mode = privatecluster.CleanupModeFull
212+
if aksResourceID == "" {
213+
return fmt.Errorf("--aks-resource-id is required for full cleanup mode")
214+
}
215+
default:
216+
return fmt.Errorf("invalid cleanup mode: %s (use 'local' or 'full')", cleanupModeFlag)
217+
}
218+
219+
runner := privatecluster.NewScriptRunner("")
220+
return runner.RunPrivateUninstall(ctx, mode, aksResourceID)
221+
}
222+
121223
// runDaemonLoop runs the periodic status collection and bootstrap monitoring daemon
122224
func runDaemonLoop(ctx context.Context, cfg *config.Config) error {
123225
logger := logger.GetLoggerFromContext(ctx)

go.mod

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ require (
1515
github.com/sirupsen/logrus v1.9.3
1616
github.com/spf13/cobra v1.8.0
1717
github.com/spf13/viper v1.18.2
18+
github.com/stretchr/testify v1.11.1
19+
golang.org/x/crypto v0.45.0
1820
k8s.io/client-go v0.26.0
1921
)
2022

@@ -40,6 +42,7 @@ require (
4042
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
4143
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
4244
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
45+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
4346
github.com/sagikazarmark/locafero v0.4.0 // indirect
4447
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
4548
github.com/sourcegraph/conc v0.3.0 // indirect
@@ -50,7 +53,6 @@ require (
5053
go.uber.org/atomic v1.9.0 // indirect
5154
go.uber.org/multierr v1.9.0 // indirect
5255
go.yaml.in/yaml/v2 v2.4.3 // indirect
53-
golang.org/x/crypto v0.45.0 // indirect
5456
golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect
5557
golang.org/x/net v0.47.0 // indirect
5658
golang.org/x/oauth2 v0.30.0 // indirect

main.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,16 @@ func main() {
2525
}
2626

2727
// Add global flags for configuration
28-
rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required)")
28+
rootCmd.PersistentFlags().StringVar(&configPath, "config", "", "Path to configuration JSON file (required for agent/unbootstrap)")
29+
rootCmd.PersistentFlags().MarkHidden("config") // Hide from global help, shown in agent/unbootstrap help
2930
// Don't mark as required globally - we'll check in PersistentPreRunE for commands that need it
3031

3132
// Add commands
3233
rootCmd.AddCommand(NewAgentCommand())
3334
rootCmd.AddCommand(NewUnbootstrapCommand())
3435
rootCmd.AddCommand(NewVersionCommand())
36+
rootCmd.AddCommand(NewPrivateJoinCommand())
37+
rootCmd.AddCommand(NewPrivateLeaveCommand())
3538

3639
// Set up context with signal handling
3740
ctx, cancel := context.WithCancel(context.Background())
@@ -49,8 +52,9 @@ func main() {
4952

5053
// Set up persistent pre-run to initialize config and logger
5154
rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error {
52-
// Skip config loading for version command
53-
if cmd.Name() == "version" {
55+
// Skip config loading for commands that don't need it
56+
switch cmd.Name() {
57+
case "version", "private-join", "private-leave":
5458
return nil
5559
}
5660

pkg/bootstrapper/bootstrapper.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ func New(cfg *config.Config, logger *logrus.Logger) *Bootstrapper {
3232
// Bootstrap executes all bootstrap steps sequentially
3333
func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error) {
3434
// Define the bootstrap steps in order - using modules directly
35+
// Note: Network (WireGuard VPN) setup is handled by "aks-flex-node private-join" before this runs
3536
steps := []Executor{
3637
arc.NewInstaller(b.logger), // Setup Arc
3738
services.NewUnInstaller(b.logger), // Stop kubelet before setup
@@ -60,6 +61,7 @@ func (b *Bootstrapper) Unbootstrap(ctx context.Context) (*ExecutionResult, error
6061
runc.NewUnInstaller(b.logger), // Uninstall runc binary
6162
system_configuration.NewUnInstaller(b.logger), // Clean system settings
6263
arc.NewUnInstaller(b.logger), // Uninstall Arc (after cleanup)
64+
// Note: Network (WireGuard VPN) cleanup is handled by "aks-flex-node private-leave"
6365
}
6466

6567
return b.ExecuteSteps(ctx, steps, "unbootstrap")

pkg/components/arc/arc_installer.go

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ func (i *Installer) Execute(ctx context.Context) error {
9191

9292
// IsCompleted checks if Arc setup has been completed
9393
// This can be used by bootstrap steps to verify completion status
94-
// Uses the same reliable logic as status collector for consistency
94+
// Checks both Arc agent connection status AND RBAC role assignments
9595
func (i *Installer) IsCompleted(ctx context.Context) bool {
9696
i.logger.Debug("Checking Arc setup completion status")
9797

@@ -113,26 +113,60 @@ func (i *Installer) IsCompleted(ctx context.Context) bool {
113113
}
114114

115115
// Parse output to check if agent is connected (same logic as status collector)
116+
isConnected := false
116117
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
117118
for _, line := range lines {
118119
line = strings.TrimSpace(line)
119120
if strings.Contains(line, "Agent Status") && strings.Contains(line, ":") {
120121
parts := strings.SplitN(line, ":", 2)
121122
if len(parts) == 2 {
122123
status := strings.TrimSpace(parts[1])
123-
isConnected := strings.ToLower(status) == "connected"
124-
if isConnected {
125-
i.logger.Debug("Arc setup appears to be completed - agent is connected")
126-
} else {
124+
isConnected = strings.ToLower(status) == "connected"
125+
if !isConnected {
127126
i.logger.Debugf("Arc agent status is '%s' - not ready", status)
127+
return false
128128
}
129-
return isConnected
130129
}
131130
}
132131
}
133132

134-
i.logger.Debug("Could not find Agent Status in azcmagent show output - Arc not ready")
135-
return false
133+
if !isConnected {
134+
i.logger.Debug("Could not find Agent Status in azcmagent show output - Arc not ready")
135+
return false
136+
}
137+
138+
// Arc is connected, now check if RBAC roles are assigned
139+
i.logger.Debug("Arc agent is connected, checking RBAC role assignments...")
140+
if err := i.setUpClients(ctx); err != nil {
141+
i.logger.Debugf("Failed to set up clients for RBAC check: %v", err)
142+
return false
143+
}
144+
145+
arcMachine, err := i.getArcMachine(ctx)
146+
if err != nil || arcMachine == nil {
147+
i.logger.Debugf("Failed to get Arc machine: %v", err)
148+
return false
149+
}
150+
151+
managedIdentityID := getArcMachineIdentityID(arcMachine)
152+
if managedIdentityID == "" {
153+
i.logger.Debug("Arc machine has no managed identity ID")
154+
return false
155+
}
156+
157+
hasPermissions, err := i.checkRequiredPermissions(ctx, managedIdentityID)
158+
if err != nil {
159+
i.logger.Debugf("Failed to check RBAC permissions: %v", err)
160+
return false
161+
}
162+
163+
if !hasPermissions {
164+
i.logger.Debug("Arc is connected but RBAC roles are not assigned")
165+
return false
166+
}
167+
168+
i.logger.Debug("Arc setup is complete - agent connected and RBAC roles assigned")
169+
return true
136170
}
137171

138172
// registerArcMachine registers the machine with Azure Arc using the Arc agent

pkg/config/config.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,70 @@ func (c *Config) Validate() error {
240240
return fmt.Errorf("invalid agent.logLevel: %s. Valid values are: debug, info, warning, error", c.Agent.LogLevel)
241241
}
242242

243+
// Validate network configuration if present
244+
if c.Network != nil {
245+
if err := c.Network.Validate(); err != nil {
246+
return fmt.Errorf("invalid network config: %w", err)
247+
}
248+
}
249+
250+
return nil
251+
}
252+
253+
// Validate validates the network configuration
254+
func (n *NetworkConfig) Validate() error {
255+
switch n.Mode {
256+
case "", "direct":
257+
// No additional validation needed for direct mode
258+
return nil
259+
case "wireguard":
260+
if n.Gateway == nil {
261+
return fmt.Errorf("gateway configuration is required when mode is 'wireguard'")
262+
}
263+
return n.Gateway.Validate()
264+
default:
265+
return fmt.Errorf("unsupported network mode: %s. Valid values are: direct, wireguard", n.Mode)
266+
}
267+
}
268+
269+
// Validate validates the Gateway configuration
270+
func (w *GatewayConfig) Validate() error {
271+
if w.ServerEndpoint == "" {
272+
return fmt.Errorf("serverEndpoint is required")
273+
}
274+
if w.ServerPublicKey == "" {
275+
return fmt.Errorf("serverPublicKey is required")
276+
}
277+
if w.ClientAddress == "" {
278+
return fmt.Errorf("clientAddress is required")
279+
}
280+
if len(w.AllowedIPs) == 0 {
281+
return fmt.Errorf("allowedIPs is required and must not be empty")
282+
}
243283
return nil
244284
}
245285

286+
// GetInterfaceName returns the Gateway interface name, defaulting to "wg-aks"
287+
func (w *GatewayConfig) GetInterfaceName() string {
288+
if w.InterfaceName != "" {
289+
return w.InterfaceName
290+
}
291+
return "wg-aks"
292+
}
293+
294+
// GetPersistentKeepalive returns the keepalive interval, defaulting to 25 seconds
295+
func (w *GatewayConfig) GetPersistentKeepalive() int {
296+
if w.PersistentKeepalive > 0 {
297+
return w.PersistentKeepalive
298+
}
299+
return 25
300+
}
301+
302+
// IsGatewayEnabled returns true if Gateway VPN is configured
303+
func (c *Config) IsGatewayEnabled() bool {
304+
return c.Network != nil && c.Network.Mode == "wireguard" && c.Network.Gateway != nil
305+
}
306+
246307
// populateTargetClusterInfoFromConfig extracts cluster information from the resource ID
247308
// This function should only be called after validateAzureResourceID confirms the format is correct
248309
func populateTargetClusterInfoFromConfig(cfg *Config) {

pkg/config/structs.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,48 @@ type Config struct {
1414
Node NodeConfig `json:"node"`
1515
Paths PathsConfig `json:"paths"`
1616
Npd NPDConfig `json:"npd"`
17+
Network *NetworkConfig `json:"network,omitempty"` // Network configuration for Private Cluster access
18+
}
19+
20+
// NetworkConfig holds network configuration for accessing Private AKS clusters.
21+
type NetworkConfig struct {
22+
// Mode specifies the network mode: "direct" (default) or "wireguard".
// An empty value is accepted by validation the same way as "direct".
23+
Mode string `json:"mode,omitempty"`
24+
25+
// Gateway configuration (required when mode is "wireguard").
26+
// Note: the JSON/mapstructure field name is kept as "wireguard" for backward compatibility.
27+
Gateway *GatewayConfig `json:"wireguard,omitempty" mapstructure:"wireguard"`
28+
}
29+
30+
// GatewayConfig holds Gateway VPN configuration for Private Cluster access.
// Validation requires ServerEndpoint, ServerPublicKey, ClientAddress and a
// non-empty AllowedIPs; the remaining fields are optional.
31+
type GatewayConfig struct {
32+
// ServerEndpoint is the Gateway server endpoint (host:port). Required.
33+
ServerEndpoint string `json:"serverEndpoint"`
34+
35+
// ServerPublicKey is the server's Gateway public key (base64 encoded). Required.
36+
ServerPublicKey string `json:"serverPublicKey"`
37+
38+
// ClientPrivateKey is the client's Gateway private key (optional, auto-generated if empty)
39+
ClientPrivateKey string `json:"clientPrivateKey,omitempty"`
40+
41+
// ClientAddress is the client's VPN address with CIDR (e.g., "172.16.0.2/24"). Required.
42+
ClientAddress string `json:"clientAddress"`
43+
44+
// AllowedIPs are the IP ranges to route through the VPN tunnel. At least one entry is required.
45+
AllowedIPs []string `json:"allowedIPs"`
46+
47+
// DNS servers to use through the VPN (optional)
48+
DNS []string `json:"dns,omitempty"`
49+
50+
// PersistentKeepalive interval in seconds (default: 25, applied when the value is not positive)
51+
PersistentKeepalive int `json:"persistentKeepalive,omitempty"`
52+
53+
// InterfaceName is the Gateway interface name (default: wg-aks, applied when empty)
54+
InterfaceName string `json:"interfaceName,omitempty"`
55+
56+
// TestEndpoint is an endpoint to test connectivity after VPN is established (optional)
57+
// If not specified, will try to test connectivity to the AKS API Server
58+
TestEndpoint string `json:"testEndpoint,omitempty"`
1759
}
1860

1961
// AzureConfig holds Azure-specific configuration required for connecting to Azure services.

0 commit comments

Comments
 (0)