diff --git a/STORAGE_DESIGN.md b/STORAGE_DESIGN.md new file mode 100644 index 0000000000..a7e9666931 --- /dev/null +++ b/STORAGE_DESIGN.md @@ -0,0 +1,1328 @@ +# Storage Configuration Design + +**Version:** 1.1 +**Date:** 2026-03-27 +**Status:** Design Active + +--- + +## Table of Contents + +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Configuration Schema](#configuration-schema) +4. [Mount Params (Profiles)](#mount-params-profiles) +5. [Storage Technologies](#storage-technologies) +6. [Usage Examples](#usage-examples) +7. [Priority Resolution](#priority-resolution) +8. [Best Practices](#best-practices) +9. [Multi-Storage System Support](#multi-storage-system-support) +10. [Validation Rules](#validation-rules) +11. [Design Decision: storage_config.yml vs storage_profile.yml](#design-decision-storage_configyml-vs-storage_profileyml) + +--- + +## Overview + +This design provides a flexible, profile-based mount configuration system for Dell storage solutions in HPC environments. 
It supports: + +- **VAST NFS Storage** - High-performance NFS storage for shared filesystems +- **PowerVault iSCSI Storage** - Block storage for persistent data and databases +- **Generic Network Storage** - NFS, CIFS, and other network filesystems +- **Local Storage** - Direct-attached storage and local disks + +### Key Features + +- ✅ **Profile-based configuration** - Reusable templates for common mount patterns (`mount_params`) +- ✅ **Priority-based resolution** - Explicit values override profile defaults +- ✅ **Vendor-specific optimizations** - VAST and PowerVault tuned profiles +- ✅ **Functional-group targeting** - Mount configurations applied per node functional group prefix +- ✅ **Multi-volume PowerVault** - List-based `powervault_config` supports multiple iSCSI volumes +- ✅ **Dual PowerVault mount modes** - Reference an existing `mounts[]` entry or define inline +- ✅ **Validation enforcement** - Schema validation ensures correct configuration + +--- + +## Architecture + +### Configuration Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ storage_config.yml │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ mount_params (Profiles) │ │ +│ │ ├─ vast_nfs │ │ +│ │ ├─ vast_nfs_performance │ │ +│ │ ├─ powervault_iscsi │ │ +│ │ ├─ network_storage │ │ +│ │ ├─ bind_mounts │ │ +│ │ ├─ local_storage │ │ +│ │ ├─ scratch_storage │ │ +│ │ └─ global │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ mounts (Mount Entries) │ │ +│ │ ├─ vast_home (uses vast_nfs profile) │ │ +│ │ ├─ powervault_slurm_persist (uses powervault_iscsi) │ │ +│ │ └─ powervault_mysql_bind (uses bind_mounts) │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ powervault_config (Optional, list) │ 
│ +│ │ ├─ name: powervault1 │ │ +│ │ │ ├─ ip / port / iscsi_initiator / volume_id │ │ +│ │ │ └─ mount: (Mode A) │ │ +│ │ └─ name: powervault2 │ │ +│ │ ├─ ip / port / iscsi_initiator / volume_id │ │ +│ │ └─ source / mount_point / mount_params / ... │ │ +│ │ (Mode B) │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ swap (Optional, list) │ │ +│ │ └─ name / filename / size / maxsize / │ │ +│ │ functional_group_prefix │ │ +│ └────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + ▼ + ┌──────────────────────────────────────┐ + │ Schema Validation │ + │ (storage_config.json) │ + └──────────────────────────────────────┘ + ▼ + ┌──────────────────────────────────────┐ + │ Cloud-Init Generation │ + │ (Jinja2 Templates) │ + └──────────────────────────────────────┘ + ▼ + ┌──────────────────────────────────────┐ + │ Node Provisioning │ + │ (Per Functional Group) │ + └──────────────────────────────────────┘ +``` + +--- + +## Configuration Schema + +### File Structure + +```yaml +# storage_config.yml + +# PowerVault iSCSI configuration (optional, list) +powervault_config: + - name: "powervault1" + ip: + - 172.1.2.3 + port: 3260 + iscsi_initiator: "iqn.2025-01.com.dell:hostname" + volume_id: "00c0ff4343f1f1f1001c8c4e6901000000" + # Mode A: reference an existing mounts[] entry by name + mount: "powervault_slurm_persist" + + - name: "powervault2" + ip: + - 172.1.2.4 + port: 3260 + iscsi_initiator: "iqn.2025-01.com.dell:slurmd-node" + volume_id: "00c0ff4343f1f1f1001c8c4e6901000001" + # Mode B: define all mount parameters inline + source: "/dev/mapper/360002ac0000000000000000000000000" + mount_point: "/scratch" + mount_params: "powervault_iscsi" + fs_type: "xfs" + mnt_opts: "defaults,nofail" + functional_group_prefix: ["k8s_node"] + +# Mount parameter profiles (templates) +mount_params: + profile_name: + fs_type: 
"filesystem_type" + mnt_opts: "mount_options" + dump_freq: "0" + fsck_pass: "0" + # Optional custom fields (e.g., vast_nfs_ip, perf_ip) — used in mount templates + +# Mount entries +mounts: + - name: "unique_mount_name" # alphanumeric, underscore, hyphen only + source: "device_or_network_path" + mount_point: "/mount/path" + mount_params: "profile_name" # Optional — references mount_params profile + fs_type: "filesystem_type" # Optional — overrides profile + mnt_opts: "mount_options" # Optional — overrides profile + dump_freq: "0" # Optional — overrides profile + fsck_pass: "0" # Optional — overrides profile + functional_group_prefix: ["prefix1", "prefix2"] # Required + # All nodes whose functional group name starts with any listed prefix get this mount. + # e.g., ["slurm"] matches slurm_control_node, slurm_node, slurm_login, etc. + # Omit functional_group_prefix to apply the mount to ALL nodes (use with care). + +# Swap configuration (optional) +swap: + - name: "swap_name" + filename: "/swapfile" + size: "4G" + maxsize: "8G" # Optional — used when size is "auto" + functional_group_prefix: ["prefix1"] +``` + +--- + +## Mount Params (Profiles) + +`mount_params` are reusable templates that define the **how to mount** (technical settings). Mount entries define the **what, where, and who** (source, path, targeting). + +### Profile Structure + +```yaml +mount_params: + profile_name: + fs_type: "filesystem_type" # Required + mnt_opts: "mount_options" # Required + dump_freq: "0" # Required + fsck_pass: "0" # Required + # Additional custom fields are allowed and are passed through to mount templates. + # Standard fstab fields (fs_type, mnt_opts, dump_freq, fsck_pass) are used directly + # by the cloud-init mounts module. Custom fields are available to Jinja2 templates. +``` + +### Standard Profiles + +#### 1. 
`default` — Standard NFS Defaults + +```yaml +default: + fs_type: "nfs" + mnt_opts: "defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "0" +``` + +**Use Case:** Generic NFS mounts with standard options. + +--- + +#### 2. `vast_nfs` — VAST NFS Standard Configuration + +```yaml +vast_nfs: + fs_type: "nfs4" + mnt_opts: "defaults,nofail,_netdev,noatime,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "0" + vast_nfs_ip: "192.168.1.100" # Custom field — used in mount source templates +``` + +**Use Case:** VAST Data NFS exports with standard performance. +**Features:** +- NFSv4 protocol +- `noatime` for improved performance +- Network dependency handling +- `vast_nfs_ip` available as a template variable for resolving mount source + +--- + +#### 3. `vast_nfs_performance` — VAST NFS High-Performance + +```yaml +vast_nfs_performance: + fs_type: "nfs4" + mnt_opts: "defaults,nofail,_netdev,noatime,nodiratime,rsize=1048576,wsize=1048576" + dump_freq: "0" + fsck_pass: "0" + perf_ip: "192.168.1.101" # Custom field — used in mount source templates +``` + +**Use Case:** VAST Data NFS for high-throughput workloads (scratch, datasets). +**Features:** +- NFSv4 protocol +- **1MB read/write buffers** (`rsize=1048576,wsize=1048576`) +- `noatime` and `nodiratime` for maximum performance +- Optimized for large sequential I/O + +--- + +#### 4. `powervault_iscsi` — PowerVault iSCSI Block Storage + +```yaml +powervault_iscsi: + fs_type: "xfs" + mnt_opts: "defaults,_netdev,noatime,x-systemd.requires=iscsi.service" + dump_freq: "0" + fsck_pass: "0" +``` + +**Use Case:** PowerVault iSCSI persistent storage. +**Features:** +- XFS filesystem (high performance, scalability) +- Requires iSCSI service to be running +- `noatime` for improved performance +- Network device handling + +**Note:** Matches `setup_iscsi_storage.sh` default configuration. + +--- + +#### 5. 
`network_storage` — Generic Network Storage + +```yaml +network_storage: + fs_type: "auto" + mnt_opts: "defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "0" +``` + +**Use Case:** Generic network filesystems (NFS, CIFS, etc.). + +--- + +#### 6. `local_storage` — Local Disk Storage + +```yaml +local_storage: + fs_type: "auto" + mnt_opts: "defaults,nofail,noatime" + dump_freq: "0" + fsck_pass: "2" +``` + +**Use Case:** Local disks (ext4, xfs, etc.). +**Note:** `fsck_pass: "2"` enables filesystem check on boot. + +--- + +#### 7. `bind_mounts` — Bind Mounts + +```yaml +bind_mounts: + fs_type: "none" + mnt_opts: "bind" + dump_freq: "0" + fsck_pass: "0" +``` + +**Use Case:** Bind mounts (e.g., `/mnt/slurm-persist/mysql` → `/var/lib/mysql`). + +--- + +#### 8. `scratch_storage` — High-Performance Scratch + +```yaml +scratch_storage: + fs_type: "xfs" + mnt_opts: "defaults,nofail,noatime,nodiratime,largeio,inode64" + dump_freq: "0" + fsck_pass: "2" +``` + +**Use Case:** Local high-performance scratch storage. + +--- + +#### 9. `global` — Global Fallback + +```yaml +global: + fs_type: "auto" + mnt_opts: "defaults,nofail,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "2" +``` + +**Use Case:** Fallback when no specific profile matches and no explicit options are provided. + +--- + +## Storage Technologies + +### VAST NFS Storage + +**Overview:** VAST Data provides high-performance, scale-out NFS storage optimized for HPC workloads. 
+ +#### Configuration Format + +```yaml +mounts: + - name: "vast_home" + source: "192.168.1.100:/home" # VAST NFS export + mount_point: "/home" + mount_params: "vast_nfs" + functional_group_prefix: ["slurm"] +``` + +#### Source Format + +- **Pattern:** `<vast_ip>:<export_path>` +- **Example:** `192.168.1.100:/home` +- **Template variable:** `{{ vast_nfs_ip }}:/home` (using `vast_nfs_ip` from `mount_params`) + +#### Recommended Profiles + +| Use Case | Profile | Reason | +|----------|---------|--------| +| Home directories | `vast_nfs` | Standard performance, shared access | +| Shared applications | `vast_nfs` | Standard performance, read-heavy | +| Datasets (read-only) | `vast_nfs` | Standard performance | +| Scratch space | `vast_nfs_performance` | High throughput, large I/O | +| Checkpoints | `vast_nfs_performance` | High throughput, write-heavy | + +#### Performance Tuning + +**Standard Configuration (`vast_nfs`):** +- Default NFS buffer sizes +- Suitable for most workloads +- Lower memory overhead + +**High-Performance Configuration (`vast_nfs_performance`):** +- 1MB read/write buffers +- Optimized for large sequential I/O +- Higher memory usage +- Best for scratch, checkpoints, large datasets + +--- + +### PowerVault iSCSI Storage + +**Overview:** Dell PowerVault provides block-level iSCSI storage for persistent data and databases. + +`powervault_config` is a **list**, allowing multiple volumes to be configured independently. Each volume entry supports two mount modes. + +#### Mode A: Reference an Existing `mounts[]` Entry + +Use `mount: <mount_name>` to link the PowerVault volume to a fully-defined entry in the `mounts[]` list. The iSCSI setup provisions the device; the mount entry handles fstab/cloud-init configuration. 
+ +```yaml +powervault_config: + - name: "powervault1" + ip: + - 172.1.2.3 + port: 3260 + iscsi_initiator: "iqn.2025-01.com.dell:scontrol-node" + volume_id: "00c0ff4343f1f1f1001c8c4e6901000000" + mount: "powervault_slurm_persist" # references mounts[] entry by name + +mounts: + - name: "powervault_slurm_persist" + source: "UUID=<filesystem-uuid>" + mount_point: "/mnt/slurm-persist" + mount_params: "powervault_iscsi" + functional_group_prefix: ["slurm_control_node"] +``` + +#### Mode B: Inline Mount Definition + +Define all mount parameters directly on the `powervault_config` entry. Use this when the volume does not need a separate `mounts[]` entry. + +```yaml +powervault_config: + - name: "powervault2" + ip: + - 172.1.2.4 + port: 3260 + iscsi_initiator: "iqn.2025-01.com.dell:slurmd-node" + volume_id: "00c0ff4343f1f1f1001c8c4e6901000001" + source: "/dev/mapper/360002ac0000000000000000000000000" + mount_point: "/scratch" + mount_params: "powervault_iscsi" + fs_type: "xfs" + mnt_opts: "defaults,nofail" + functional_group_prefix: ["k8s_node"] +``` + +#### Source Format + +- **Pattern:** `UUID=<filesystem-uuid>` or `/dev/mapper/<multipath-wwid>` +- **Example:** `UUID=12345678-1234-1234-1234-123456789abc` +- **Note:** UUID is preferred over device paths for persistence across reboots. + +#### Setup Script Integration + +The `setup_iscsi_storage.sh` script automatically: + +1. Discovers iSCSI targets from controller IPs +2. Logs in to all discovered targets +3. Configures multipath for redundancy +4. Selects the correct volume using `volume_id` +5. Creates GPT partition table +6. Formats partition with XFS +7. Mounts to `/mnt/slurm-persist` using UUID +8. Creates subdirectories: `mysql/`, `spool/` +9. 
Sets up bind mounts in `/etc/fstab` + +#### Recommended Workflow + +``` +PowerVault Setup (setup_iscsi_storage.sh) + ↓ +/mnt/slurm-persist (XFS on /dev/mapper/mpatha1) + ↓ + ├─ mysql/ → bind mount to /var/lib/mysql + └─ spool/ → bind mount to /var/spool +``` + +#### Bind Mount Ordering + +Bind mounts that depend on a parent PowerVault mount **must appear after** the parent in the `mounts[]` list. The list is processed in order — if the parent is not yet mounted, the bind mount will fail. + +```yaml +mounts: + # 1. Parent mount first + - name: "powervault_slurm_persist" + source: "UUID=<filesystem-uuid>" + mount_point: "/mnt/slurm-persist" + mount_params: "powervault_iscsi" + functional_group_prefix: ["slurm_control_node"] + + # 2. Bind mounts after parent + - name: "powervault_mysql_bind" + source: "/mnt/slurm-persist/mysql" + mount_point: "/var/lib/mysql" + mount_params: "bind_mounts" + functional_group_prefix: ["slurm_control_node"] +``` + +--- + +## Usage Examples + +### Example 0: Atomic Mount (All Fields Explicit, No Profile) + +```yaml +mounts: + - name: "atomic_nfs_data" + source: "192.168.1.100:/data" + mount_point: "/mnt/atomic" + fs_type: "nfs" + mnt_opts: "defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "0" + functional_group_prefix: ["slurm_node"] +``` + +**Result:** All fields explicit; no profile lookup. Mount applies to all nodes whose functional group starts with `slurm_node`. 
+ +--- + +### Example 1: PowerVault iSCSI — Mode A (Profile Reference via `mounts[]`) + +```yaml +powervault_config: + - name: "powervault1" + ip: + - 172.1.2.3 + port: 3260 + iscsi_initiator: "iqn.2025-01.com.dell:scontrol-node" + volume_id: "00c0ff4343f1f1f1001c8c4e6901000000" + mount: "powervault_slurm_persist" + +mounts: + - name: "powervault_slurm_persist" + source: "UUID=<filesystem-uuid>" + mount_point: "/mnt/slurm-persist" + mount_params: "powervault_iscsi" + functional_group_prefix: ["slurm_control_node_x86_64"] +``` + +**Result:** +- Filesystem: XFS +- Mount options: `defaults,_netdev,noatime,x-systemd.requires=iscsi.service` +- Applied to: Slurm control nodes on x86_64 architecture only + +--- + +### Example 2: PowerVault Bind Mount — MySQL Data Directory + +```yaml +mounts: + - name: "powervault_mysql_bind" + source: "/mnt/slurm-persist/mysql" + mount_point: "/var/lib/mysql" + mount_params: "bind_mounts" + functional_group_prefix: ["slurm_control_node"] +``` + +**Result:** +- Filesystem: none (bind) +- Mount options: `bind` +- Applied to: All Slurm control nodes (all architectures) + +--- + +### Example 3: VAST NFS — Home Directories + +```yaml +mounts: + - name: "vast_home" + source: "{{ vast_nfs_ip }}:/home" # vast_nfs_ip resolved from vast_nfs profile + mount_point: "/home" + mount_params: "vast_nfs" + functional_group_prefix: ["slurm"] +``` + +**Result:** +- Filesystem: NFSv4 +- Mount options: `defaults,nofail,_netdev,noatime,x-systemd.after=cloud-init-network.service` +- Applied to: All nodes whose functional group starts with `slurm` (control, compute, login, etc.) 
+ +--- + +### Example 4: VAST NFS — High-Performance Scratch + +```yaml +mounts: + - name: "vast_scratch" + source: "192.168.1.100:/scratch" + mount_point: "/scratch" + mount_params: "vast_nfs_performance" + functional_group_prefix: ["k8s_node"] +``` + +**Result:** +- Filesystem: NFSv4 +- Mount options: `defaults,nofail,_netdev,noatime,nodiratime,rsize=1048576,wsize=1048576` +- Applied to: All nodes whose functional group starts with `k8s_node` +- Performance: 1MB read/write buffers + +--- + +### Example 5: Explicit Override of Profile Fields + +```yaml +mounts: + - name: "vast_apps_readonly" + source: "192.168.1.100:/apps" + mount_point: "/opt/apps" + fs_type: "nfs4" # EXPLICIT — overrides profile + mnt_opts: "defaults,nofail,_netdev,ro" # EXPLICIT — overrides profile + mount_params: "network_storage" + # dump_freq and fsck_pass still come from network_storage profile + functional_group_prefix: ["slurm"] +``` + +**Result:** +- Filesystem: NFSv4 (explicit, profile value ignored) +- Mount options: `defaults,nofail,_netdev,ro` (explicit, read-only) +- Dump frequency: `0` (from `network_storage` profile) +- Fsck pass: `0` (from `network_storage` profile) + +--- + +### Example 6: Swap Configuration + +```yaml +swap: + - name: "compute_swap" + filename: "/swapfile" + size: "2G" + maxsize: "4G" + functional_group_prefix: ["slurm_node"] +``` + +**Result:** A fixed 2G swap file created at `/swapfile` on all nodes whose functional group starts with `slurm_node`. Because `size` is set explicitly, `maxsize` has no effect here; it only caps the swap file when `size` is `auto`. + +--- + +## Priority Resolution + +When a mount entry references a profile via `mount_params`, field values are resolved using this priority order: + +### Priority Order (Highest to Lowest) + +``` +1. Explicit value in mount entry ← HIGHEST PRIORITY +2. Value from mount_params profile (if specified) +3. Auto-selected profile based on fs_type +4. Global fallback profile (global) +5. 
Hardcoded system defaults ← LOWEST PRIORITY +``` + +### Resolution Examples + +#### Example 1: Full Profile Usage + +```yaml +mount_params: + vast_nfs: + fs_type: "nfs4" + mnt_opts: "defaults,nofail,_netdev,noatime" + dump_freq: "0" + fsck_pass: "0" + +mounts: + - name: "vast_home" + source: "192.168.1.100:/home" + mount_point: "/home" + mount_params: "vast_nfs" + functional_group_prefix: ["slurm_node"] +``` + +**Resolution:** +- `fs_type`: `"nfs4"` ← from `vast_nfs` profile +- `mnt_opts`: `"defaults,nofail,_netdev,noatime"` ← from `vast_nfs` profile +- `dump_freq`: `"0"` ← from `vast_nfs` profile +- `fsck_pass`: `"0"` ← from `vast_nfs` profile + +--- + +#### Example 2: Partial Override + +```yaml +mount_params: + vast_nfs: + fs_type: "nfs4" + mnt_opts: "defaults,nofail,_netdev,noatime" + dump_freq: "0" + fsck_pass: "0" + +mounts: + - name: "vast_apps" + source: "192.168.1.100:/apps" + mount_point: "/opt/apps" + fs_type: "nfs4" # ← EXPLICIT + mnt_opts: "defaults,nofail,_netdev,ro" # ← EXPLICIT + mount_params: "vast_nfs" + functional_group_prefix: ["slurm_node"] +``` + +**Resolution:** +- `fs_type`: `"nfs4"` ← **EXPLICIT (priority 1)** — profile value ignored +- `mnt_opts`: `"defaults,nofail,_netdev,ro"` ← **EXPLICIT (priority 1)** — profile value ignored +- `dump_freq`: `"0"` ← from `vast_nfs` profile (priority 2) +- `fsck_pass`: `"0"` ← from `vast_nfs` profile (priority 2) + +--- + +#### Example 3: No Profile — All Fields Explicit + +```yaml +mounts: + - name: "atomic_data" + source: "192.168.1.100:/data" + mount_point: "/data" + fs_type: "nfs4" + mnt_opts: "defaults,nofail,_netdev" + dump_freq: "0" + fsck_pass: "0" + functional_group_prefix: ["slurm_node"] +``` + +**Resolution:** All four fstab fields are explicit (priority 1). No profile lookup is performed. 
+ +--- + +## Best Practices + +### Profile Design + +✅ **DO:** +- Create profiles for common storage patterns +- Use descriptive profile names (`vast_nfs`, `powervault_iscsi`) +- Add custom fields (e.g., `vast_nfs_ip`) for template variables +- Document profile purpose and use cases +- Keep profiles simple and focused on a single storage technology + +❌ **DON'T:** +- Include `functional_group_prefix` in profiles — it belongs in mount entries +- Create too many near-identical profiles +- Use generic names like `profile1`, `profile2` + +--- + +### Mount Configuration + +✅ **DO:** +- Use profiles (`mount_params`) for standard configurations +- Override specific fields when needed (e.g., force `ro`) +- Use UUID for PowerVault sources — more reliable than device paths +- Specify `functional_group_prefix` on every mount entry +- Use descriptive, alphanumeric mount names (no spaces) +- Order bind mounts **after** their parent mount in the list + +❌ **DON'T:** +- Duplicate mount options across many entries — use profiles +- Use device paths (`/dev/sda1`) for PowerVault — use UUID +- Omit `functional_group_prefix` unless you intentionally want all-nodes application +- Place bind mounts before their parent in the `mounts[]` list + +--- + +### functional_group_prefix Targeting + +The `functional_group_prefix` field uses prefix matching against node functional group names: + +| Prefix | Matches | +|--------|---------| +| `["slurm"]` | `slurm_control_node`, `slurm_node`, `slurm_login`, `slurm_control_node_x86_64`, ... | +| `["slurm_control_node"]` | `slurm_control_node`, `slurm_control_node_x86_64`, `slurm_control_node_aarch64` | +| `["slurm_control_node_x86_64"]` | `slurm_control_node_x86_64` only | +| `["k8s"]` | `k8s_node`, `k8s_control_node`, `k8s_worker`, ... | +| `["slurm", "k8s"]` | All Slurm nodes AND all K8s nodes | + +Use the most specific prefix needed. Broader prefixes like `["slurm"]` apply to all Slurm node types across all architectures. 
+ +--- + +### Storage Selection + +| Requirement | Recommended Storage | Profile | +|-------------|---------------------|---------| +| Shared home directories | VAST NFS | `vast_nfs` | +| Shared applications | VAST NFS | `vast_nfs` | +| High-throughput scratch | VAST NFS | `vast_nfs_performance` | +| Large datasets | VAST NFS | `vast_nfs_performance` | +| Persistent databases | PowerVault iSCSI | `powervault_iscsi` | +| Slurm state files | PowerVault iSCSI | `powervault_iscsi` | +| Service bind mounts | PowerVault subdirectory | `bind_mounts` | +| Local scratch | Local disk | `scratch_storage` | +| Generic network FS | NFS/CIFS | `network_storage` | + +--- + +### Performance Optimization + +#### VAST NFS + +**Standard Workloads (`vast_nfs`):** +- Home directories +- Shared applications +- Small file I/O +- Metadata-heavy operations + +**High-Performance Workloads (`vast_nfs_performance`):** +- Scratch space +- Checkpointing +- Large sequential I/O +- Streaming data + +**Buffer Size Tuning:** +```yaml +# Standard: default buffers (typically 32KB–128KB) +mnt_opts: "defaults,nofail,_netdev,noatime" + +# High-performance: 1MB buffers +mnt_opts: "defaults,nofail,_netdev,noatime,nodiratime,rsize=1048576,wsize=1048576" +``` + +#### PowerVault iSCSI + +**Filesystem Choice:** +- ✅ **XFS** (recommended) — High performance, scalability, large files +- ⚠️ **ext4** — Good compatibility, lower performance at scale + +**Mount Options:** +```yaml +# Recommended +mnt_opts: "defaults,_netdev,noatime,x-systemd.requires=iscsi.service" + +# Additional options for databases (ext4 only): XFS rejects `nobarrier` on Linux 4.19+, +# and disabling barriers is only safe with a battery-backed write cache +mnt_opts: "defaults,_netdev,noatime,nobarrier,x-systemd.requires=iscsi.service" +``` + +--- + +## Multi-Storage System Support + +The design is **storage-system and access-method agnostic** by construction. It is a mount model, not a storage-vendor model — it doesn't care what the storage system is, only whether the source is known at config time (`mounts:`) or discovered at runtime (`powervault_config:` / runcmd). 
+ +### Coverage Matrix + +| Storage System | Access Method | Mechanism | Profile | +|---|---|---|---| +| **PowerVault** | iSCSI/multipath | `powervault_config:` → runcmd (runtime discovery) | `powervault_iscsi` | +| **VAST** | NFS | `mounts:` + static source | `vast_nfs` / `vast_nfs_performance` | +| **VAST** | RDMA | `mounts:` + `mount_params` profile with RDMA opts | custom profile | +| **PowerScale** | NFS | `mounts:` entry — just a different source IP | `powerscale_nfs` (user-defined) | +| **PowerScale** | SMB/CIFS | `mounts:` with `fs_type: cifs` + cred opts | custom profile | +| **BeeGFS** | FUSE/RDMA | `mounts:` with BeeGFS client mount | `beegfs` | +| **Any NFS server** | NFS3/NFS4 | `mounts:` + profile or explicit `fs_type`/`mnt_opts` | `default` / `network_storage` | +| **S3** | s3fs-fuse | `mounts:` with `fs_type: fuse.s3fs` + cred opts | custom profile | +| **Local disk** | block device | `mounts:` with UUID/device path | `local_storage` / `scratch_storage` | + +### Why This Works + +1. **`mounts:` is protocol-agnostic.** `source` is just a string (IP:/path, UUID, device path, s3 bucket). `fs_type` and `mnt_opts` handle the protocol specifics. + +2. **`mount_params:` profiles absorb vendor differences.** Each storage+protocol combo is a profile. Users reference the profile name, don't think about options. + +3. **`powervault_config:` exists only because iSCSI needs runtime discovery.** Any storage needing runtime device resolution before mount follows the same pattern (runcmd-based). Everything with a known source at config time fits in `mounts:`. + +4. **Custom fields in profiles carry vendor-specific variables.** `vast_nfs_ip` in the `vast_nfs` profile demonstrates this. Same pattern works for PowerScale VIPs, BeeGFS mgmtd hosts, etc. + +### Adding a New Storage System (Zero Schema Changes) + +Example: PowerScale NFS — define a profile and reference it. 
+ +```yaml +mount_params: + powerscale_nfs: + fs_type: "nfs4" + mnt_opts: "nfsvers=4.1,hard,intr,noatime,nconnect=16,rsize=1048576,wsize=1048576" + dump_freq: "0" + fsck_pass: "0" + powerscale_ip: "10.0.1.50" # Custom field — available to Jinja2 templates + +mounts: + - name: "powerscale_home" + source: "{{ powerscale_ip }}:/ifs/home" + mount_point: "/home" + mount_params: "powerscale_nfs" + functional_group_prefix: ["slurm"] +``` + +### When a New Top-Level Section Is Needed + +Only when storage requires **runtime device discovery before mount** — where the source path can't be known at config time. Currently that's just iSCSI/multipath (`powervault_config:`). Everything else (NFS, CIFS, s3fs, BeeGFS, VAST RDMA client) has a known source at config time and fits in `mounts:`. + +--- + +## Validation Rules + +### Required Fields + +#### Mount Entry (`mounts[]`) + +- ✅ `name` — Unique identifier (alphanumeric, `_`, `-`; no spaces) +- ✅ `source` — Device or network path +- ✅ `mount_point` — Absolute path starting with `/` +- ✅ `functional_group_prefix` — List of functional group prefixes +- ✅ **At least one of:** + - `mount_params` (references a `mount_params` profile), **OR** + - `mnt_opts` (explicit mount options) + +#### PowerVault Entry (`powervault_config[]`) + +- ✅ `name` — Unique identifier for this volume +- ✅ `ip` — List of controller IPv4 addresses (min 1) +- ✅ `iscsi_initiator` — IQN string +- ✅ `volume_id` — Hex WWN string +- ⬜ `port` — Optional; defaults to `3260` +- ✅ **Exactly one of:** + - `mount: <mount_name>` (Mode A — references a `mounts[]` entry), **OR** + - Inline fields: `source`, `mount_point`, `mount_params` / `mnt_opts`, `functional_group_prefix` (Mode B) + +#### Swap Entry (`swap[]`) + +- ✅ `name` — Unique identifier +- ✅ `filename` — Absolute path for the swap file +- ✅ `size` — Human-readable size (`2G`, `512M`, `auto`) +- ⬜ `maxsize` — Optional; used only when `size: auto` +- ✅ `functional_group_prefix` — List of functional group prefixes + +#### 
Profile (`mount_params`) + +- ✅ `fs_type` — Filesystem type +- ✅ `mnt_opts` — Mount options +- ✅ `dump_freq` — Dump frequency +- ✅ `fsck_pass` — Fsck pass number +- ⬜ Custom fields (e.g., `vast_nfs_ip`, `perf_ip`) — Optional; passed to Jinja2 templates + +--- + +### Field Validation + +#### `name` (mount or swap) +- Pattern: `^[a-zA-Z0-9_-]+$` +- Length: 1–64 characters +- Must be unique across all entries in the same list + +#### `source` +- Minimum length: 1 character +- Examples: + - `/dev/sda1` + - `UUID=12345678-1234-1234-1234-123456789abc` + - `192.168.1.100:/export/share` + - `{{ vast_nfs_ip }}:/home` (Jinja2 template resolved at generation time) + +#### `mount_point` +- Pattern: `^/[a-zA-Z0-9/_.-]*$` +- Must start with `/` + +#### `fs_type` +- Allowed values: `auto`, `ext2`, `ext3`, `ext4`, `xfs`, `btrfs`, `nfs`, `nfs4`, `cifs`, `tmpfs`, `cephfs`, `vfat`, `ntfs`, `none` + +#### `mnt_opts` +- Pattern: `^[a-zA-Z0-9,=._-]+$` +- Examples: `defaults,nofail,_netdev`, `bind`, `defaults,nofail,noatime` + +#### `dump_freq` +- Pattern: `^[0-2]$` +- Usually `0` (no dump) + +#### `fsck_pass` +- Pattern: `^[0-9]$` +- Common values: + - `0` — No fsck (network filesystems, bind mounts) + - `1` — Root filesystem + - `2` — Other local filesystems + +#### `functional_group_prefix` +- Array of strings +- Pattern per element: `^[a-zA-Z0-9_-]+$` +- Must be unique within array +- Prefix-matched against node functional group names at provisioning time + +#### `volume_id` (PowerVault) +- Pattern: `^[a-fA-F0-9]+$` + +#### `iscsi_initiator` (PowerVault) +- Pattern: `^iqn\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$` + +#### `size` (swap) +- Pattern: `^(auto|[0-9]+[BKMGT]?)$` +- Examples: `2G`, `512M`, `auto` + +--- + +### Conditional Validation + +#### Mount Entry Must Have Profile OR Explicit Options + +```json +{ + "anyOf": [ + { "required": ["mount_params"] }, + { "required": ["mnt_opts"] } + ] +} +``` + +**Valid:** +```yaml +# Has mount_params profile +- name: "mount1" + source: "..." 
+ mount_point: "..." + mount_params: "vast_nfs" + functional_group_prefix: ["slurm"] + +# Has mnt_opts explicit +- name: "mount2" + source: "..." + mount_point: "..." + mnt_opts: "defaults,nofail" + functional_group_prefix: ["slurm"] + +# Has both — explicit wins for mnt_opts, profile fills the rest +- name: "mount3" + source: "..." + mount_point: "..." + mount_params: "vast_nfs" + mnt_opts: "defaults,nofail,ro" # overrides profile's mnt_opts + functional_group_prefix: ["slurm"] +``` + +**Invalid:** +```yaml +# Missing both mount_params and mnt_opts +- name: "mount_invalid" + source: "..." + mount_point: "..." + functional_group_prefix: ["slurm"] +``` + +--- + +## Quick Reference + +### Profile Selection Guide + +| Storage Type | Use Case | Profile | Key Features | +|--------------|----------|---------|--------------| +| **VAST** | Home directories | `vast_nfs` | NFSv4, standard perf | +| **VAST** | Shared apps | `vast_nfs` | NFSv4, standard perf | +| **VAST** | Scratch space | `vast_nfs_performance` | NFSv4, 1MB buffers | +| **VAST** | Large datasets | `vast_nfs_performance` | NFSv4, 1MB buffers | +| **PowerVault** | Persistent storage | `powervault_iscsi` | XFS, iSCSI service dep | +| **PowerVault** | Database bind | `bind_mounts` | Bind from persistent mount | +| **Generic** | Network FS | `network_storage` | Auto-detect FS | +| **Local** | Local disk | `local_storage` | Auto-detect FS | +| **Local** | Scratch | `scratch_storage` | XFS, optimized | + +--- + +### Common Mount Options + +| Option | Description | Use Case | +|--------|-------------|----------| +| `defaults` | Use default options | All mounts | +| `nofail` | Don't fail boot if mount fails | Network mounts | +| `_netdev` | Network device (wait for network) | Network mounts | +| `noatime` | Don't update access time | Performance | +| `nodiratime` | Don't update directory access time | Performance | +| `ro` | Read-only | Shared apps, datasets | +| `rw` | Read-write | Default | +| `bind` | Bind mount | 
Subdirectory mounts | +| `rsize=1048576` | 1MB read buffer | High-perf NFS | +| `wsize=1048576` | 1MB write buffer | High-perf NFS | +| `x-systemd.requires=iscsi.service` | Require iSCSI service | PowerVault | +| `x-systemd.after=cloud-init-network.service` | Wait for cloud-init network | Network mounts | + +--- + +### Filesystem Types + +| Type | Description | Use Case | +|------|-------------|----------| +| `nfs` | NFS version 3 | Legacy NFS | +| `nfs4` | NFS version 4 | Modern NFS (VAST) | +| `xfs` | XFS filesystem | PowerVault, local disks | +| `ext4` | ext4 filesystem | Local disks | +| `cifs` | SMB/CIFS | Windows shares | +| `none` | No filesystem | Bind mounts | +| `auto` | Auto-detect | Generic mounts | + +--- + +## Troubleshooting + +### Common Issues + +#### Issue: Mount fails with "mount.nfs: Connection timed out" + +**Cause:** Network not ready or VAST server unreachable. + +**Solution:** +- Ensure `_netdev` and `x-systemd.after=cloud-init-network.service` in mount options +- Verify VAST server IP is correct and reachable +- Check firewall rules (NFS ports: 2049, 111) + +--- + +#### Issue: PowerVault mount fails with "No such device" + +**Cause:** iSCSI service not running or multipath device not ready. + +**Solution:** +- Ensure `x-systemd.requires=iscsi.service` in mount options +- Verify `powervault_config` is correctly configured +- Check iSCSI discovery: `iscsiadm -m discovery -t sendtargets -p <controller_ip>` +- Check multipath devices: `multipath -ll` + +--- + +#### Issue: Bind mount fails with "mount point does not exist" + +**Cause:** Source directory does not exist (parent mount not yet mounted, or list ordering issue). 
+ +**Solution:** +- Ensure the parent mount entry appears **before** the bind mount in `mounts[]` +- Create the source directory: `mkdir -p /mnt/slurm-persist/mysql` +- Check that the parent PowerVault mount is healthy + +--- + +#### Issue: Validation error "must have either mount_params or mnt_opts" + +**Cause:** Mount entry is missing both a profile reference and explicit mount options. + +**Solution:** +- Add `mount_params: "profile_name"`, **OR** +- Add `mnt_opts: "mount_options"` + +--- + +#### Issue: Mount name fails validation + +**Cause:** Mount `name` contains spaces or special characters (e.g., `"Atomic mount"`). + +**Solution:** +- Use only alphanumeric characters, underscores, and hyphens: `"atomic_mount"` + +--- + +## References + +### Related Files + +- **Configuration:** `input/storage_config.yml` +- **Schema:** `common/library/module_utils/input_validation/schema/storage_config.json` +- **Cloud-Init Template:** `discovery/roles/configure_ochami/templates/cloud_init/ci-group-*.yaml.j2` +- **PowerVault Setup Script:** Embedded in cloud-init template (`setup_iscsi_storage.sh`) +- **Cluster Configuration:** `input/omnia_config.yml` (references mount names via `mounts:` list) + +--- + +## Design Decision: storage_config.yml vs storage_profile.yml + +This section documents the evaluation of two proposed input designs for filling the cloud-init `mounts:` module in oChaMI per-group cloud-init data, and explains the rationale for choosing `storage_config.yml`. + +--- + +### Goal + +oChaMI provisions nodes by pushing per-group cloud-init payloads. 
Each payload's `mounts:` module requires a flat list of fstab tuples: + +```yaml +mounts: + - [source, mount_point, fs_type, mnt_opts, dump_freq, fsck_pass] + - [source, mount_point, fs_type, mnt_opts, dump_freq, fsck_pass] +``` + +The design must be **simple and minimal** — the template engine should be able to produce this list for any given functional group without complex logic, missing fields, or ambiguous conventions. + +--- + +### Resolution Path Comparison + +#### `storage_config.yml` — 1-hop resolution + +``` +functional_group name (e.g. slurm_node_x86_64) + │ + ▼ + scan mounts[] where functional_group_prefix prefix-matches the group name + │ + ▼ (one direct pass) + for each matched mount entry: + resolve [source, mount_point, fs_type, mnt_opts, dump_freq, fsck_pass] + via: explicit field → mount_params profile → global default + │ + ▼ + flat list of tuples ──► cloud-init mounts: +``` + +One loop. One dict merge per entry. No construction, no inference. + +#### `storage_profile.yml` — 4-hop resolution + +``` +functional_group name (e.g. slurm_compute_x86_64) + │ + ▼ + mounts[cluster][functional_group] → profile name(s) + │ + ▼ + storage_profiles[profile][backend_ref] → path map + │ + ▼ + storage_config[section][backend_ref] → ip(s), options, protocol + │ + ▼ + construct source = ip + server_path + mount_point = client_path ← direction undocumented + fs_type = inferred from protocol ← not explicit + mnt_opts = options from backend ← no per-path override possible + dump_freq = ??? ← missing entirely + fsck_pass = ??? ← missing entirely + │ + ▼ + flat list of tuples ──► cloud-init mounts: +``` + +Four hops, path direction ambiguity, two required fstab fields absent. 
+ +--- + +### Field-by-Field Comparison + +| cloud-init field | `storage_config.yml` | `storage_profile.yml` | +|---|---|---| +| `source` | Explicit on mount entry | Constructed: `ip + server_path` from `storage_config` | +| `mount_point` | Explicit on mount entry | Ambiguous: `"/home": "/home"` — which is server, which is client? | +| `fs_type` | Explicit or from `mount_params` profile | Must be inferred from `protocol` field | +| `mnt_opts` | Explicit or from `mount_params` profile | `options` from backend config; no per-path overrides | +| `dump_freq` | Explicit or from profile | **Not present anywhere in the file** | +| `fsck_pass` | Explicit or from profile | **Not present anywhere in the file** | +| Node targeting | `functional_group_prefix` per mount | `mounts[cluster][fg_name]` → profile → backend (3 levels) | +| Resolution hops | **1 (direct match + profile merge)** | **4 (cluster → fg → profile → backend)** | +| Template complexity | Low — filter loop + dict merge | High — nested loops, type dispatch, fallback inference | + +--- + +### Issues Found in Each Approach + +#### `storage_config.yml` — Fixable Issues + +| Issue | Severity | Fix | +|---|---|---| +| `mount_params` key diverges from JSON schema (`mount_default_fields`) | High | Update schema to match file | +| `omnia_config.yml` references `nfs_slurm`, `nfs_home` — neither exists as a mount `name` | High | Add matching named entries or align names | +| `"Atomic mount"` name contains a space — fails schema pattern | Medium | Rename to `atomic_mount` | +| `functional_group_prefix` omitted on one entry — undefined all-nodes behavior | Medium | Document or enforce requirement | +| Bind mount ordering not enforced — depends on list position | Low | Document ordering requirement | +| `vast_nfs_ip`, `perf_ip` custom fields in profiles — schema `additionalProperties: false` rejects them | Medium | Relax schema for custom fields | + +All issues are mechanical — wrong key names, missing entries, schema not 
updated after design evolved. + +#### `storage_profile.yml` — Structural Issues + +| Issue | Severity | Fix | +|---|---|---| +| Path map direction `"/home": "/home"` undocumented — server:client or client:server? | **Critical** | No fix without redesign | +| PowerVault entries are scalars (`pv1_volume1: "/var/lib/mysql"`) vs NFS entries are maps — structurally inconsistent | **Critical** | No fix without redesign | +| `dump_freq` and `fsck_pass` absent — cannot generate valid fstab tuple | **Critical** | Requires new fields added throughout | +| `slurm_common_profile` is null — silently applies nothing | High | Validate against null profiles | +| `slurm_login_x86_64` is null — login nodes silently get no mounts | High | Validate against null node assignments | +| `nfs1` and `nfs2` are identical copies — `slurm_compute_profile_aarch64` resolves to same paths as base | Medium | Copy-paste error | +| `ps1` uses `ips:` (list), `ps2` uses `ip:` (scalar) — inconsistent field names | Medium | Standardize field name | +| Multi-profile list (`- slurm_compute_profile\n- slurm_compute_profile_aarch64`) has no merge strategy — `/home` collision | High | Define merge semantics | +| `/tmp` mounted from VAST NFS in compute and compiler profiles — network `/tmp` is a reliability risk in HPC | Medium | Design smell | + +The structural issues (path direction, missing fstab fields, inconsistent value types) cannot be fixed by adding fields — they require reconceiving the profile format. + +--- + +### Why `storage_config.yml` Wins for This Use Case + +**1. Shape matches the output.** Each `mounts[]` entry is already one fstab tuple. The Jinja2 template is a filter — no construction or inference. `storage_profile.yml` requires building the tuple from scattered pieces across three separate data structures. + +**2. All six fstab fields are present.** `source`, `mount_point`, `fs_type`, `mnt_opts`, `dump_freq`, `fsck_pass` are either explicit on the entry or resolved from `mount_params`. 
`storage_profile.yml` is missing `dump_freq` and `fsck_pass` entirely. + +**3. `functional_group_prefix` is the right targeting primitive.** oChaMI groups nodes by functional group name. Prefix matching (`["slurm"]` matches `slurm_node_x86_64`, `slurm_control_node_aarch64`, etc.) handles architecture variants without enumerating every combination. `storage_profile.yml`'s cluster → role mapping adds indirection that only matters if clusters are topologically distinct — which is already handled by `omnia_config.yml`. + +**4. `mount_params` profiles provide reuse without indirection.** The VAST NFS tuning options (`rsize`, `wsize`, `noatime`) live in one profile and are inherited by all referencing mounts. This is the same reuse benefit `storage_profile.yml` claims for its role profiles, but without the extra lookup hop. + +**5. `storage_profile.yml`'s multi-cluster advantage is already covered.** `omnia_config.yml` manages cluster topology. Each cluster entry references mount names from `storage_config.yml`. Cluster-level scoping belongs in the cluster config, not the storage config. + +--- + +### Conclusion + +> **`storage_config.yml` is the chosen design.** It directly satisfies the requirement of filling the cloud-init `mounts:` module per functional group with minimal template complexity and zero ambiguity. The issues found are all fixable with targeted corrections to key names and schema alignment. `storage_profile.yml`'s issues are structural and cannot produce valid fstab tuples without a fundamental redesign. 
+ +--- + +## Changelog + +### Version 1.1 (2026-03-27) + +- Renamed `mount_default_fields` → `mount_params` throughout (matches `storage_config.yml`) +- Renamed `mount_default_field` → `mount_params` on mount entries +- Renamed `roles` → `functional_group_prefix` on mount and swap entries +- Updated `powervault_config` from single object to **list** (supports multiple volumes) +- Documented dual PowerVault mount modes: Mode A (`mount:` reference) and Mode B (inline fields) +- Added custom fields (`vast_nfs_ip`, `perf_ip`) to `vast_nfs` and `vast_nfs_performance` profiles +- Added `nfs_bind_mounts` profile reference in examples +- Added Example 0 (atomic mount with all explicit fields) +- Added Example 6 (swap configuration) +- Added `functional_group_prefix` targeting reference table +- Added bind mount ordering requirement and warning +- Added troubleshooting entry for invalid mount names (spaces) +- Updated Architecture diagram to reflect list-based `powervault_config` and `swap` +- Updated all validation rules to match current field names and structures + +### Version 1.0 (2026-03-20) + +- Initial design document +- Added VAST NFS profiles (`vast_nfs`, `vast_nfs_performance`) +- Added PowerVault iSCSI profile (`powervault_iscsi`) +- Removed `roles` field from profiles (roles only in mount entries) +- Changed `mount_default_fields` from array to mapping structure +- Added conditional validation (`mount_default_field` OR `mnt_opts` required) +- Updated examples to reflect VAST and PowerVault storage +- Aligned PowerVault examples with `setup_iscsi_storage.sh` implementation + +--- + +## Contact + +For questions or issues, please contact the Omnia development team. 
+ +--- diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json index e300410346..b86b843f7e 100644 --- a/common/library/module_utils/input_validation/schema/storage_config.json +++ b/common/library/module_utils/input_validation/schema/storage_config.json @@ -3,88 +3,293 @@ "title": "Configuration Schema", "type": "object", "properties": { - "nfs_client_params": { + "powervault_config": { "type": "array", + "description": "List of PowerVault iSCSI volume connection definitions. Processed via runcmd script because device path is only known after iSCSI login + multipath scan.", "items": { "type": "object", "properties": { - "nfs_name": { + "name": { + "type": "string", + "description": "Unique identifier for this PowerVault volume", + "pattern": "^[a-zA-Z0-9_-]+$", + "minLength": 1, + "maxLength": 64 + }, + "ip": { + "description": "List of target controller IP addresses for iSCSI discovery", + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "format": "ipv4" + }, + "uniqueItems": true + }, + "port": { + "description": "TCP port for iSCSI target (default 3260)", + "type": "integer", + "minimum": 1, + "maximum": 65535 + }, + "iscsi_initiator": { + "description": "iSCSI initiator IQN", "type": "string", - "description": "The unique NFS server name" + "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$" }, - "server_ip": { + "volume_id": { + "description": "Volume identifier (hex string / WWN) for multipath device matching", "type": "string", - "anyOf": [ - { - "allOf": [ - { "pattern": ".*[A-Za-z].*" }, - { "format": "idn-hostname" } - ] - }, - { - "allOf": [ - { "pattern": "^[0-9.]+$" }, - { "format": "ipv4" } - ] - } - ] + "pattern": "^[a-fA-F0-9]+$" }, - "server_share_path": { + "mount_point": { "type": "string", - "pattern": "^/(?:[^/]+(?:/[^/]+)*)?/?$" + "description": "Where the discovered device gets mounted", + "pattern": 
"^/[a-zA-Z0-9/_.-]*$" }, - "client_share_path": { + "mount_params": { "type": "string", - "pattern": "^/(?:[^/]+(?:/[^/]+)*)?/?$" + "description": "Named profile for fs_type/mnt_opts (read by the runcmd script)", + "pattern": "^[a-zA-Z0-9_-]+$" }, - "client_mount_options": { - "type": "string" + "node_key": { + "type": "string", + "description": "ds.meta_data key for per-node bind mounts (e.g., local_hostname). When present, generates bind mounts under mount_point//", + "enum": ["local_hostname", "local_ipv4", "instance_id"] + }, + "node_mount_point": { + "type": "array", + "description": "List of bind mount target paths. Required when node_key is set. Each gets: mount_point// -> ", + "items": { + "type": "string", + "pattern": "^/[a-zA-Z0-9/_.-]*$" + }, + "minItems": 1, + "uniqueItems": true + }, + + "functional_group_prefix": { + "type": "array", + "description": "List of functional group prefixes for node targeting", + "items": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "uniqueItems": true } }, - "required": [ - "server_ip", - "server_share_path", - "client_share_path", - "client_mount_options" - ] - }, - "minItems": 1 + "required": ["name", "ip", "iscsi_initiator", "volume_id", "mount_point", "functional_group_prefix"], + "additionalProperties": false + } }, - "powervault_config": { - "type": "object", - "required": ["ip", "iscsi_initiator", "volume_id"], - "properties": { - "ip": { - "description": "List of target controller IP addresses", - "type": "array", - "minItems": 1, - "items": { + "beegfs_config": { + "type": "array", + "description": "List of BeeGFS parallel filesystem mount definitions. BeeGFS uses beegfs-mounts.conf and beegfs-client service, not fstab. 
Each entry represents one BeeGFS cluster connection.", + "items": { + "type": "object", + "properties": { + "name": { "type": "string", + "description": "Unique identifier for this BeeGFS mount", + "pattern": "^[a-zA-Z0-9_-]+$", + "minLength": 1, + "maxLength": 64 + }, + "mgmtd_host": { + "type": "string", + "description": "IP address of the BeeGFS management server", "format": "ipv4" }, - "uniqueItems": true - }, - - "port": { - "description": "TCP port for iSCSI (default 3260)", - "type": "integer" + "mount_point": { + "type": "string", + "description": "Client mount location for this BeeGFS filesystem", + "pattern": "^/[a-zA-Z0-9/_.-]*$" + }, + "conn_auth_file": { + "type": "string", + "description": "Path to the BeeGFS connauth shared secret file on the omnia_core. Copied to /etc/beegfs/ on target nodes.", + "pattern": "^/[a-zA-Z0-9/_.-]*$" + }, + "client_conf_overrides": { + "type": "object", + "description": "Optional key-value overrides for /etc/beegfs/beegfs-client.conf (e.g., tuneNumWorkers, connMaxInternodeNum)", + "additionalProperties": { + "type": ["string", "integer", "boolean"] + } + }, + "functional_group_prefix": { + "type": "array", + "description": "List of functional group prefixes for node targeting", + "items": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "uniqueItems": true + } }, - - "iscsi_initiator": { - "description": "iSCSI initiator IQN", - "type": "string", - "pattern": "^iqn\\.[a-zA-Z0-9.-]+(?::[a-zA-Z0-9._:-]+)?$" + "required": ["name", "mgmtd_host", "mount_point", "conn_auth_file", "functional_group_prefix"], + "additionalProperties": false + } + }, + "mounts": { + "type": "array", + "description": "Cloud-init compatible mount configurations. 
Source must be known at boot time.", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Unique identifier for this mount entry", + "pattern": "^[a-zA-Z0-9_-]+$", + "minLength": 1, + "maxLength": 64 + }, + "source": { + "type": "string", + "description": "Device name or network path (e.g., /dev/sdc, UUID=xxx, 192.168.1.100:/export/share, powervault:)", + "minLength": 1 + }, + "mount_point": { + "type": "string", + "description": "Mount point path", + "pattern": "^/[a-zA-Z0-9/_.-]*$" + }, + "fs_type": { + "type": "string", + "description": "Filesystem type. Overrides mount_params profile when specified.", + "enum": ["auto", "ext2", "ext3", "ext4", "xfs", "btrfs", "nfs", "nfs4", "cifs", "tmpfs", "cephfs", "vfat", "ntfs", "none", "beegfs", "fuse.s3fs"] + }, + "mnt_opts": { + "type": "string", + "description": "Mount options. Overrides mount_params profile when specified.", + "pattern": "^[a-zA-Z0-9,=._@/-]+$" + }, + "dump_freq": { + "type": "string", + "description": "Dump frequency (usually 0). Overrides mount_params profile when specified.", + "pattern": "^[0-2]$" + }, + "fsck_pass": { + "type": "string", + "description": "Fsck pass number (usually 0 or 2). Overrides mount_params profile when specified.", + "pattern": "^[0-9]$" + }, + "mount_params": { + "type": "string", + "description": "Name of the mount_params profile to use for unspecified fields", + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "node_key": { + "type": "string", + "description": "ds.meta_data key for per-node bind mounts. When present, fs_type is forced to none and mnt_opts to bind.", + "enum": ["local_hostname", "local_ipv4", "instance_id"] + }, + "node_mount_point": { + "type": "array", + "description": "List of bind mount target paths. 
Required when node_key is set.", + "items": { + "type": "string", + "pattern": "^/[a-zA-Z0-9/_.-]*$" + }, + "minItems": 1, + "uniqueItems": true + }, + "functional_group_prefix": { + "type": "array", + "description": "List of oChaMI functional group prefixes to apply this mount to. Omit to apply to all groups.", + "items": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "uniqueItems": true + }, + "role": { + "type": "string", + "description": "Omnia-reserved role for this mount. Used for internal infrastructure mounts.", + "enum": ["omnia_slurm_share", "omnia_k8s_share"] + } }, - - "volume_id": { - "description": "Volume identifier (hex string)", - "type": "string", - "pattern": "^[a-fA-F0-9]+$" + "required": ["name", "source", "mount_point"], + "additionalProperties": false + } + }, + "mount_params": { + "type": "object", + "description": "Named mount parameter profiles. Each profile provides defaults for fs_type, mnt_opts, dump_freq, fsck_pass. Custom fields are allowed for backend-specific metadata.", + "patternProperties": { + "^[a-zA-Z0-9_-]+$": { + "type": "object", + "properties": { + "fs_type": { + "type": "string", + "description": "Default filesystem type", + "enum": ["auto", "ext2", "ext3", "ext4", "xfs", "btrfs", "nfs", "nfs4", "cifs", "tmpfs", "cephfs", "vfat", "ntfs", "none", "beegfs", "fuse.s3fs"] + }, + "mnt_opts": { + "type": "string", + "description": "Default mount options", + "pattern": "^[a-zA-Z0-9,=._@/-]+$" + }, + "dump_freq": { + "type": "string", + "description": "Default dump frequency", + "pattern": "^[0-2]$" + }, + "fsck_pass": { + "type": "string", + "description": "Default fsck pass number", + "pattern": "^[0-9]$" + } + }, + "required": ["fs_type", "mnt_opts"], + "additionalProperties": true } + }, + "additionalProperties": false + }, + "swap": { + "type": "array", + "description": "Swap file configurations", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Unique 
identifier for this swap entry", + "pattern": "^[a-zA-Z0-9_-]+$", + "minLength": 1, + "maxLength": 64 + }, + "filename": { + "type": "string", + "description": "Path to the swap file to create", + "pattern": "^/[a-zA-Z0-9/_.-]+$" + }, + "size": { + "type": "string", + "description": "Size in bytes, 'auto', or human-readable format (e.g., 2G, 512M)", + "pattern": "^(auto|[0-9]+[BKMGT]?)$" + }, + "maxsize": { + "type": "string", + "description": "Maximum size (used with size: auto)", + "pattern": "^[0-9]+[BKMGT]?$" + }, + "functional_group_prefix": { + "type": "array", + "description": "List of oChaMI functional group prefixes to apply this swap to. Omit to apply to all groups.", + "items": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "uniqueItems": true + } + }, + "required": ["name", "filename", "size"], + "additionalProperties": false } } }, - "required": [ - "nfs_client_params" - ] + "required": [], + "additionalProperties": false } diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index f2be7fa3c8..1f3e02429a 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -321,6 +321,12 @@ echo "[INFO] ===== Completed slurmd setup (aarch64) =====" + - path: /etc/munge/munge.key + owner: {{ munge_user }}:{{ munge_group }} + permissions: '{{ file_mode_400 }}' + encoding: b64 + content: {{ slurp_munge_key.content }} + - path: /usr/local/bin/configure_munge_and_pam.sh permissions: '{{ file_mode_755 }}' content: | @@ -442,7 +448,7 @@ content: | {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} -{% endfor %} +{%- endfor %} - path: /etc/sysconfig/slurmd owner: root:root diff --git a/docs/cloud_init_mounts_ansible_hld.md 
b/docs/cloud_init_mounts_ansible_hld.md new file mode 100644 index 0000000000..340e67442b --- /dev/null +++ b/docs/cloud_init_mounts_ansible_hld.md @@ -0,0 +1,383 @@ +# High-Level Design: Cloud-Init Mounts Configuration + +## Overview + +This document describes a flexible, cloud-init compatible mount and swap configuration system that manages storage mounts and swap files with fine-grained control over which nodes receive each configuration. + +## Design Goals + +1. **Cloud-Init Compatibility**: Generate configurations compatible with cloud-init's mounts module +2. **Granular Control**: Target specific nodes by roles, hostnames, or groups from `pxe_mapping_file.csv` +3. **Idempotency**: Support repeated executions without side effects +4. **Flexibility**: Support multiple mounts and swap configurations per host +5. **Automatic Directory Creation**: Mount-point directories are created automatically and do not need to be specified in the configuration; only permission changes are handled in `runcmd` + +## Architecture + +### 1. Configuration Input Structure + +The system accepts configuration from `storage_config.yml`: + +#### 1.1 Mounts Configuration + +TODO: fs_ prefix to be retained? 
its cloud-init specific +```yaml +mounts: + - name: "nfs_slurm_home" # Unique identifier + fs_spec: "172.16.107.168:/mnt/share/omnia" + fs_file: "/home" + fs_vfstype: "nfs" + fs_mntops: "defaults,nofail,_netdev" + fs_freq: "0" + fs_passno: "0" + roles: ["slurm_control_node", "slurm_node"] + hostnames: [] + groups: [] +``` + +**Supported Filesystem Types (`fs_vfstype`):** + +- **Network Filesystems:** + - `nfs` - Network File System (NFS v3/v4) + - `nfs4` - NFS version 4 explicitly + - `cifs` - Common Internet File System (SMB/Windows shares) + - `beegfs` - BeeGFS parallel filesystem + - `glusterfs` - GlusterFS distributed filesystem + - `lustre` - Lustre parallel distributed filesystem + +- **Local Filesystems:** + - `ext4` - Fourth Extended Filesystem (recommended for Linux) + - `ext3` - Third Extended Filesystem + - `xfs` - XFS filesystem + - `btrfs` - B-tree filesystem + - `vfat` - FAT32 filesystem + - `ntfs` - NTFS filesystem (requires ntfs-3g) + +- **Special Filesystems:** + - `tmpfs` - Temporary filesystem in RAM + - `auto` - Auto-detect filesystem type (default) + +#### 1.2 Swap Configuration + +```yaml +swap: + - name: "compute_swap" + filename: "/swapfile" + size: "4G" + maxsize: "8G" + roles: ["slurm_node"] + hostnames: [] + groups: [] +``` + +#### 1.3 Mount Default Fields + +```yaml +mount_default_fields: + fields: ["auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] + roles: [] + hostnames: [] + groups: [] +``` + +### 2. Resolution Process + +``` +┌─────────────────────────────────────────────────────────────┐ +│ 1. Parse PXE Mapping │ +│ - Build hostname → (role, group, ip, mac) mapping │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 2. 
Resolve Mount Defaults for Each Host │ +│ - Check if hostname matches mount_default_fields │ +│ roles/hostnames/groups │ +│ - Return applicable defaults or global defaults │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 3. Filter Mounts for Each Host │ +│ - For each mount: │ +│ - If roles/hostnames/groups empty → apply to all hosts │ +│ - Else check if hostname matches criteria │ +│ - Return list of applicable mounts per host │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 4. Filter Swap for Each Host │ +│ - Apply same filtering logic as mounts │ +│ - Validate: max 1 swap for cloud-init mode │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 5. Generate Cloud-Init Configuration │ +│ - Generate per-host cloud-init YAML │ +│ - Append to the cloud init yml instance wise per host │ +│ TODO: Verify whether instance-wise append is possible │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 6. Apply Configuration │ +│ - This will be applied in the order cloud-init executes │ +│ its modules when PXE booted │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 3. 
PXE Mapping File + +The `pxe_mapping_file.csv` is the source of truth for node attributes: + +```csv +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node,grp0,SVC001,PARENT001,manager01,aa:bb:cc:dd:ee:01,192.168.1.10,aa:bb:cc:dd:ee:f1,192.168.2.10 +slurm_node,grp1,SVC002,PARENT001,compute01,aa:bb:cc:dd:ee:02,192.168.1.11,aa:bb:cc:dd:ee:f2,192.168.2.11 +slurm_node,grp1,SVC003,PARENT001,compute02,aa:bb:cc:dd:ee:03,192.168.1.12,aa:bb:cc:dd:ee:f3,192.168.2.12 +slurm_node,grp2,SVC004,PARENT001,compute03,aa:bb:cc:dd:ee:04,192.168.1.13,aa:bb:cc:dd:ee:f4,192.168.2.13 +``` + +**Key Fields:** +- `FUNCTIONAL_GROUP_NAME`: Node role (e.g., slurm_control_node, slurm_node) +- `GROUP_NAME`: Node group for targeting +- `HOSTNAME`: Unique hostname for the node +- `ADMIN_MAC` / `ADMIN_IP`: Admin network interface details + +### 4. Data Flow Example + +#### Input Configuration + +```yaml +# storage_config.yml +mounts: + - name: "nfs_slurm_home" + fs_spec: "172.16.107.168:/mnt/share/omnia" + fs_file: "/home" + fs_vfstype: "nfs" + fs_mntops: "defaults,nofail,_netdev" + roles: ["slurm_control_node", "slurm_node"] + +swap: + - name: "compute_swap" + filename: "/swapfile" + size: "4G" + roles: ["slurm_node"] + +mount_default_fields: + fields: ["auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] + roles: [] +``` + +#### Resolution for compute01 + +Given PXE mapping shows: `compute01` has `role=slurm_node`, `group=grp1` + +**Applicable Configuration:** +- Mount: `nfs_slurm_home` (matches role `slurm_node`) +- Swap: `compute_swap` (matches role `slurm_node`) + +#### Generated Cloud-Init for compute01 + +```yaml +#cloud-config +# Hostname: compute01 + +mounts: + - ["172.16.107.168:/mnt/share/omnia", "/home", "nfs", "defaults,nofail,_netdev", "0", "0"] + +mount_default_fields: ["auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] + +swap: + filename: /swapfile + 
size: 4G +``` + +### 5. Validation Requirements + +#### Level 1: Schema Validation (JSON Schema) +- ✅ Data types and formats +- ✅ Required fields (name, fs_spec, fs_file for mounts; name, filename, size for swap) +- ✅ Path patterns (absolute paths starting with `/`) +- ✅ Size format (auto, 4G, 512M, etc.) +- ✅ Filesystem types (enum of supported types) +- ✅ Unique items in arrays + +#### Level 2: Business Logic Validation +- **Unique mount names** across all mounts +- **Unique swap names** across all swaps +- **Hostname existence** in PXE mapping +- **Swap filename uniqueness** per host +- **Max 1 swap** per host for cloud-init mode +- **Mount point uniqueness** per host +- **No circular dependencies** (e.g., swap on tmpfs) + +#### JSON Schema + +The configuration is validated using JSON Schema located at: +`/new_omnia/omnia/ansible_collections/dell/storage_generic/common/library/module_utils/input_validation/schema/storage_config.json` + +**Key Schema Validations for Mounts:** +```json +{ + "mounts": { + "type": "array", + "items": { + "properties": { + "name": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+$", + "minLength": 1, + "maxLength": 64 + }, + "fs_spec": { + "type": "string", + "minLength": 1 + }, + "fs_file": { + "type": "string", + "pattern": "^/[a-zA-Z0-9/_.-]*$" + }, + "fs_vfstype": { + "type": "string", + "enum": ["auto", "ext2", "ext3", "ext4", "xfs", "btrfs", + "nfs", "nfs4", "cifs", "tmpfs", "cephfs", "vfat", "ntfs"] + }, + "fs_mntops": { + "type": "string", + "pattern": "^[a-zA-Z0-9,=._-]+$" + }, + "fs_freq": { + "type": "string", + "pattern": "^[0-2]$" + }, + "fs_passno": { + "type": "string", + "pattern": "^[0-9]$" + }, + "roles": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9_-]+$"}, + "uniqueItems": true + }, + "hostnames": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9.-]+$"}, + "uniqueItems": true + }, + "groups": { + "type": "array", + "items": {"type": "string", "pattern": 
"^[a-zA-Z0-9_-]+$"}, + "uniqueItems": true + } + }, + "required": ["name", "fs_spec", "fs_file"] + } + } +} +``` + +**Key Schema Validations for Swap:** +```json +{ + "swap": { + "type": "array", + "items": { + "properties": { + "name": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+$", + "minLength": 1, + "maxLength": 64 + }, + "filename": { + "type": "string", + "pattern": "^/[a-zA-Z0-9/_.-]+$" + }, + "size": { + "type": "string", + "pattern": "^(auto|[0-9]+[BKMGT]?)$" + }, + "maxsize": { + "type": "string", + "pattern": "^[0-9]+[BKMGT]?$" + }, + "roles": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9_-]+$"}, + "uniqueItems": true + }, + "hostnames": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9.-]+$"}, + "uniqueItems": true + }, + "groups": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9_-]+$"}, + "uniqueItems": true + } + }, + "required": ["name", "filename", "size"] + } + } +} +``` + +**Key Schema Validations for Mount Default Fields:** +```json +{ + "mount_default_fields": { + "type": "object", + "properties": { + "fields": { + "type": "array", + "items": {"type": "string"}, + "minItems": 6, + "maxItems": 6 + }, + "roles": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9_-]+$"}, + "uniqueItems": true + }, + "hostnames": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9.-]+$"}, + "uniqueItems": true + }, + "groups": { + "type": "array", + "items": {"type": "string", "pattern": "^[a-zA-Z0-9_-]+$"}, + "uniqueItems": true + } + }, + "required": ["fields"] + } +} +``` + +### 6. Key Design Decisions + +1. **PXE Mapping as Source of Truth** + - All hostname, role, and group data from `pxe_mapping_file.csv` + - Single source for node attributes + +2. **Mount Default Fields with Targeting** + - Supports roles, hostnames, groups + - Allows different defaults for different node types + - Falls back to global defaults + +3. 
**Cloud-Init Constraint** + - Max 1 swap file per host (cloud-init limitation) + - Multiple mounts supported per host + +4. **Targeting Logic** + - Empty roles/hostnames/groups = apply to all hosts + - Union of all specified criteria + - Unique hostname list per mount/swap + +5. **Idempotency** + - Cloud-init configurations are idempotent by design + - Repeated executions produce same result + +## Conclusion + +This design provides a flexible, cloud-init compatible mount and swap configuration system with granular per-host control through roles, hostnames, and groups targeting. The system resolves configurations from `storage_config.yml` and `pxe_mapping_file.csv` to generate instance-specific cloud-init configurations that are applied on each target node. diff --git a/docs/cloud_init_mounts_python_hld.md b/docs/cloud_init_mounts_python_hld.md new file mode 100644 index 0000000000..40b4025993 --- /dev/null +++ b/docs/cloud_init_mounts_python_hld.md @@ -0,0 +1,509 @@ +# High-Level Design: Cloud-Init Mounts Configuration + +## Overview + +This document describes the design for a flexible, cloud-init compatible mount and swap configuration system that allows administrators to define storage mounts and swap files with fine-grained control over which nodes receive each configuration. + +## Design Goals + +1. **Flexibility**: Support multiple mount and swap configurations with different targets +2. **Cloud-Init Compatibility**: Generate configurations compatible with cloud-init's mounts module +3. **Granular Control**: Allow targeting specific nodes by roles, hostnames, or groups +4. **Simplicity**: Provide an intuitive YAML-based configuration interface +5. **Uniqueness**: Ensure each mount/swap configuration is uniquely identifiable + +## Architecture + +### 1. 
Configuration Input Structure + +The system accepts two main configuration lists in `storage_config.yml`: + +#### 1.1 Mounts Configuration + +```yaml +mounts: + - name: "nfs_slurm_home" # Unique identifier + fs_spec: "192.168.1.100:/export" # Device/source + fs_file: "/home" # Mount point + fs_vfstype: "nfs" # Filesystem type + fs_mntops: "defaults,nofail,_netdev" # Mount options + fs_freq: "0" # Dump frequency + fs_passno: "0" # Fsck pass number + roles: ["slurm_control_node", "slurm_node"] # Target roles + hostnames: ["node01", "node02"] # Target specific hosts + groups: ["grp1", "grp2"] # Target node groups +``` + +#### 1.2 Swap Configuration + +```yaml +swap: + - name: "compute_swap" # Unique identifier + filename: "/swapfile" # Swap file path + size: "4G" # Swap size + maxsize: "8G" # Maximum size (for auto) + roles: ["slurm_node"] # Target roles + hostnames: [] # Target specific hosts + groups: ["grp1"] # Target node groups +``` + +### 2. Target Resolution Process + +The system resolves mount/swap targets through a multi-stage process: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Stage 1: Parse Configuration │ +│ - Read mounts[] and swap[] from storage_config.yml │ +│ - Read pxe_mapping_file.csv for hostname mappings │ +│ - Validate required fields (name, fs_spec, fs_file, etc.) 
│ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Stage 2: Resolve Targeting Criteria │ +│ For each mount/swap entry: │ +│ - Extract roles[], hostnames[], groups[] │ +│ - If all empty → target = ALL nodes from pxe_mapping.csv │ +│ - If any specified → resolve to hostname list │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Stage 3: Convert to Unique Hostname List │ +│ - roles[] → query pxe_mapping.csv for role-matched hosts │ +│ - hostnames[] → validate against pxe_mapping.csv │ +│ - groups[] → query pxe_mapping.csv for group members │ +│ - Combine all sources into unique hostname set │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Stage 4: Build Hostname-to-Mounts Mapping │ +│ For each unique hostname: │ +│ hostname_mounts[hostname] = [mount_name1, mount_name2] │ +│ hostname_swaps[hostname] = [swap_name1, swap_name2] │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Stage 5: Generate Cloud-Init Configuration │ +│ For each hostname: │ +│ - Retrieve mount configurations by mount_name │ +│ - Retrieve swap configurations by swap_name │ +│ - Generate cloud-init YAML for that specific host │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 3. Hostname Resolution Algorithm + +#### 3.1 PXE Mapping File Structure + +The `pxe_mapping_file.csv` is the source of truth for all hostname mappings. 
It contains: +- **HOSTNAME**: Unique hostname for each node +- **MAC**: MAC address of the node +- **IP**: IP address of the node +- **SERVICE_TAG**: Service tag of the node +- **ADMIN_MAC**: Admin network MAC address +- **ADMIN_IP**: Admin network IP address +- **BMC_IP**: BMC/iDRAC IP address +- **GROUP_NAME**: Group identifier (e.g., grp0, grp1, grp2) +- **NODE_ROLE**: Role assignment (e.g., slurm_control_node, slurm_node, kube_node) + +#### 3.2 Resolution Logic + +For each mount/swap configuration entry: + +```python +def resolve_hostnames(mount_config, pxe_mapping): + """ + Resolve roles, hostnames, and groups to a unique list of hostnames. + All hostname data is sourced from pxe_mapping_file.csv. + + Args: + mount_config: Mount/swap configuration dict + pxe_mapping: Parsed pxe_mapping_file.csv data (DataFrame) + + Returns: + Set of unique hostnames + """ + target_hostnames = set() + + # If all targeting fields are empty, return ALL nodes from pxe_mapping + if not (mount_config.get('roles') or + mount_config.get('hostnames') or + mount_config.get('groups')): + return set(pxe_mapping['HOSTNAME'].tolist()) + + # Resolve roles to hostnames from pxe_mapping + for role in mount_config.get('roles', []): + matching_hosts = pxe_mapping[ + pxe_mapping['NODE_ROLE'] == role + ]['HOSTNAME'].tolist() + target_hostnames.update(matching_hosts) + + # Validate and add explicit hostnames from pxe_mapping + for hostname in mount_config.get('hostnames', []): + if hostname in pxe_mapping['HOSTNAME'].values: + target_hostnames.add(hostname) + else: + log_warning(f"Hostname '{hostname}' not found in pxe_mapping.csv") + + # Resolve groups to hostnames from pxe_mapping + for group in mount_config.get('groups', []): + matching_hosts = pxe_mapping[ + pxe_mapping['GROUP_NAME'] == group + ]['HOSTNAME'].tolist() + target_hostnames.update(matching_hosts) + + return target_hostnames +``` + +#### 3.3 Example Resolution + +**PXE Mapping File (pxe_mapping_file.csv):** +```csv 
+HOSTNAME,MAC,IP,SERVICE_TAG,GROUP_NAME,NODE_ROLE +manager01,aa:bb:cc:dd:ee:01,192.168.1.10,SVC001,grp0,slurm_control_node +compute01,aa:bb:cc:dd:ee:02,192.168.1.11,SVC002,grp1,slurm_node +compute02,aa:bb:cc:dd:ee:03,192.168.1.12,SVC003,grp1,slurm_node +compute03,aa:bb:cc:dd:ee:04,192.168.1.13,SVC004,grp2,slurm_node +node01,aa:bb:cc:dd:ee:05,192.168.1.14,SVC005,grp1,kube_node +node02,aa:bb:cc:dd:ee:06,192.168.1.15,SVC006,grp1,kube_node +``` + +**Configuration:** +```yaml +mounts: + - name: "nfs_home" + fs_spec: "192.168.1.100:/home" + fs_file: "/home" + roles: ["slurm_node"] + hostnames: ["manager01"] + groups: ["grp1"] +``` + +**Resolution Steps:** + +1. **Roles Resolution**: `["slurm_node"]` → Query pxe_mapping.csv where NODE_ROLE='slurm_node' → `["compute01", "compute02", "compute03"]` +2. **Hostnames**: `["manager01"]` → Validate in pxe_mapping.csv → `["manager01"]` +3. **Groups Resolution**: `["grp1"]` → Query pxe_mapping.csv where GROUP_NAME='grp1' → `["compute01", "compute02", "node01", "node02"]` +4. **Unique Set**: `{"compute01", "compute02", "compute03", "manager01", "node01", "node02"}` + +### 4. Hostname-to-Mounts Mapping + +After resolving all mount and swap configurations, the system builds a reverse mapping: + +```python +hostname_to_mounts = { + "compute01": ["nfs_home", "data_mount", "compute_swap"], + "compute02": ["nfs_home", "data_mount", "compute_swap"], + "manager01": ["nfs_home", "nfs_slurm"], + "node01": ["nfs_home", "ephemeral_mount"], + # ... etc +} +``` + +**Data Structure:** +```python +{ + "hostname": { + "mounts": [ + { + "name": "mount_name", + "fs_spec": "...", + "fs_file": "...", + "fs_vfstype": "...", + "fs_mntops": "...", + "fs_freq": "...", + "fs_passno": "..." + } + ], + "swap": [ + { + "name": "swap_name", + "filename": "...", + "size": "...", + "maxsize": "..." + } + ] + } +} +``` + +### 5. 
Cloud-Init Configuration Generation + +For each hostname, generate a cloud-init compatible configuration: + +#### 5.1 Cloud-Init Format + +```yaml +#cloud-config +mounts: + - ["192.168.1.100:/home", "/home", "nfs", "defaults,nofail,_netdev", "0", "0"] + - ["/dev/sdc", "/opt/data", "ext4", "defaults,nofail", "0", "2"] + +mount_default_fields: [null, null, "auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] + +swap: + filename: /swapfile + size: 4G + maxsize: 8G +``` + +#### 5.2 Generation Process + +```python +def generate_cloud_init_for_host(hostname, hostname_config): + """ + Generate cloud-init configuration for a specific host. + + Args: + hostname: Target hostname + hostname_config: Dict containing mounts and swap configs + + Returns: + Cloud-init YAML string + """ + cloud_init = { + 'mounts': [], + 'mount_default_fields': mount_default_fields, + 'swap': {} + } + + # Convert mounts to cloud-init format + for mount in hostname_config['mounts']: + cloud_init['mounts'].append([ + mount['fs_spec'], + mount['fs_file'], + mount.get('fs_vfstype', 'auto'), + mount.get('fs_mntops', 'defaults,nofail'), + mount.get('fs_freq', '0'), + mount.get('fs_passno', '2') + ]) + + # Add swap configuration (use first swap if multiple) + if hostname_config['swap']: + swap = hostname_config['swap'][0] + cloud_init['swap'] = { + 'filename': swap['filename'], + 'size': swap['size'], + 'maxsize': swap.get('maxsize', '') + } + + return yaml.dump(cloud_init) +``` + +### 6. Deployment Flow + +``` +┌──────────────────┐ +│ Administrator │ +│ edits │ +│ storage_config │ +└────────┬─────────┘ + │ + ↓ +┌──────────────────────────────────────────┐ +│ Ansible Playbook Execution │ +│ 1. Read storage_config.yml │ +│ 2. Read inventory (roles) │ +│ 3.
Read pxe_mapping_file.csv (groups) │ +└────────┬─────────────────────────────────┘ + │ + ↓ +┌──────────────────────────────────────────┐ +│ Resolution Engine │ +│ - Resolve all mounts/swaps to hostnames │ +│ - Build hostname_to_mounts mapping │ +└────────┬─────────────────────────────────┘ + │ + ↓ +┌──────────────────────────────────────────┐ +│ Cloud-Init Generator │ +│ For each hostname: │ +│ - Generate cloud-init YAML │ +│ - Write to /var/lib/cloud-init/... │ +└────────┬─────────────────────────────────┘ + │ + ↓ +┌──────────────────────────────────────────┐ +│ Node Provisioning │ +│ - Cloud-init reads configuration │ +│ - Mounts filesystems │ +│ - Creates swap files │ +└──────────────────────────────────────────┘ +``` + +## Data Flow Example + +### Input Configuration + +```yaml +mounts: + - name: "nfs_slurm_home" + fs_spec: "172.16.107.168:/mnt/share/omnia" + fs_file: "/home" + fs_vfstype: "nfs" + fs_mntops: "defaults,nofail,_netdev" + fs_freq: "0" + fs_passno: "0" + roles: ["slurm_control_node", "slurm_node"] + hostnames: [] + groups: [] + + - name: "local_data" + fs_spec: "/dev/sdc" + fs_file: "/opt/data" + fs_vfstype: "ext4" + fs_mntops: "defaults,nofail" + fs_freq: "0" + fs_passno: "2" + roles: [] + hostnames: [] + groups: ["grp1"] + +swap: + - name: "compute_swap" + filename: "/swapfile" + size: "4G" + maxsize: "8G" + roles: ["slurm_node"] + hostnames: [] + groups: [] +``` + +### Resolution Results + +**PXE Mapping Data (pxe_mapping_file.csv):** + +| HOSTNAME | NODE_ROLE | GROUP_NAME | +|----------|-----------|------------| +| manager01 | slurm_control_node | grp0 | +| compute01 | slurm_node | grp1 | +| compute02 | slurm_node | grp1 | +| compute03 | slurm_node | grp2 | + +**Resolution from pxe_mapping.csv:** +- Role `slurm_control_node`: `["manager01"]` +- Role `slurm_node`: `["compute01", "compute02", "compute03"]` +- Group `grp1`: `["compute01", "compute02"]` + +**Resolved Hostnames per Mount:** + +| Mount Name | Resolved Hostnames | 
+|------------|-------------------| +| nfs_slurm_home | manager01, compute01, compute02, compute03 | +| local_data | compute01, compute02 | + +**Resolved Hostnames per Swap:** + +| Swap Name | Resolved Hostnames | +|-----------|-------------------| +| compute_swap | compute01, compute02, compute03 | + +### Hostname-to-Mounts Mapping + +```python +{ + "manager01": { + "mounts": ["nfs_slurm_home"], + "swap": [] + }, + "compute01": { + "mounts": ["nfs_slurm_home", "local_data"], + "swap": ["compute_swap"] + }, + "compute02": { + "mounts": ["nfs_slurm_home", "local_data"], + "swap": ["compute_swap"] + }, + "compute03": { + "mounts": ["nfs_slurm_home"], + "swap": ["compute_swap"] + } +} +``` + +### Generated Cloud-Init for compute01 + +```yaml +#cloud-config +mounts: + - ["172.16.107.168:/mnt/share/omnia", "/home", "nfs", "defaults,nofail,_netdev", "0", "0"] + - ["/dev/sdc", "/opt/data", "ext4", "defaults,nofail", "0", "2"] + +mount_default_fields: [null, null, "auto", "defaults,nofail,x-systemd.after=cloud-init-network.service", "0", "2"] + +swap: + filename: /swapfile + size: 4G + maxsize: 8G +``` + +## Key Design Decisions + +### 1. Unique Mount Names +- Each mount/swap must have a unique `name` field +- Enables tracking, debugging, and idempotent operations +- Allows referencing specific configurations in logs and errors + +### 2. List-Based Targeting +- Support multiple targeting methods: roles, hostnames, groups +- Union of all targeting criteria (OR logic) +- Empty targeting = apply to ALL nodes + +### 3. Hostname-Centric Processing +- Convert all targeting to unique hostname lists early +- Build hostname-to-mounts mapping for efficient lookup +- Generate per-host cloud-init configurations + +### 4. Cloud-Init Compatibility +- Follow cloud-init mounts module specification exactly +- Support all /etc/fstab fields +- Provide sensible defaults via `mount_default_fields` + +### 5.
Separation of Concerns +- Configuration input (YAML) +- Resolution logic (roles/groups → hostnames) +- Cloud-init generation (per-host YAML) +- Deployment (Ansible/cloud-init) + +## Implementation Considerations + +### 1. Validation +- Validate unique mount/swap names +- Validate required fields (name, fs_spec, fs_file) +- Validate targeting criteria reference valid roles/groups +- Validate filesystem types are supported + +### 2. Error Handling +- Invalid role names → warning and skip +- Invalid group names → warning and skip +- Invalid hostnames → warning and skip +- No resolved hostnames → error + +### 3. Idempotency +- Use mount names for tracking applied configurations +- Support updates to existing mounts +- Support removal of mounts (empty fs_file) + +### 4. Performance +- Cache hostname resolution results +- Batch cloud-init generation +- Parallel deployment where possible + +## Security Considerations + +1. **Credential Management**: For CIFS/SMB mounts, credentials should be stored securely (e.g., `/root/.smbcreds` with 0600 permissions) +2. **Mount Options**: Use `nofail` to prevent boot failures +3. **Network Mounts**: Use `_netdev` to ensure network is available before mounting +4. **Validation**: Validate all user inputs to prevent injection attacks + +## Future Enhancements + +1. **Conditional Mounting**: Support conditional mounts based on hardware detection +2. **Mount Dependencies**: Support mount ordering/dependencies +3. **Dynamic Updates**: Support runtime mount updates without reboot +4. **Monitoring Integration**: Integration with monitoring systems for mount health +5. **Backup/Restore**: Configuration backup and restore capabilities + +## Conclusion + +This design provides a flexible, scalable approach to managing storage mounts and swap configurations across heterogeneous clusters. 
By converting roles, hostnames, and groups to unique hostname lists and building hostname-to-mounts mappings, the system can efficiently generate cloud-init configurations tailored to each node's requirements. diff --git a/docs/logos/Liqid.png b/docs/logos/Liqid.png deleted file mode 100644 index a0c91c23f1..0000000000 Binary files a/docs/logos/Liqid.png and /dev/null differ diff --git a/docs/logos/delltech.jpg b/docs/logos/delltech.jpg deleted file mode 100644 index 1d6faa37d5..0000000000 Binary files a/docs/logos/delltech.jpg and /dev/null differ diff --git a/docs/logos/omnia-logo-transparent.png b/docs/logos/omnia-logo-transparent.png deleted file mode 100644 index f2f2bf6692..0000000000 Binary files a/docs/logos/omnia-logo-transparent.png and /dev/null differ diff --git a/docs/logos/omnia-logo.png b/docs/logos/omnia-logo.png deleted file mode 100644 index ae65c0f61b..0000000000 Binary files a/docs/logos/omnia-logo.png and /dev/null differ diff --git a/docs/logos/pisa.png b/docs/logos/pisa.png deleted file mode 100644 index 58615a6383..0000000000 Binary files a/docs/logos/pisa.png and /dev/null differ diff --git a/input/omnia_config.yml b/input/omnia_config.yml index b0e9cb8850..d87fc62310 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -98,6 +98,9 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + mounts: # applicable to this cluster only + - nfs_slurm # Using nfs_slurm mount from storage_config.yml + - nfs_home # Using nfs_home mount from storage_config.yml # skip_merge: true # Uncomment to enable homogeneous discovery mode diff --git a/input/powervault_reason_runcmd.md b/input/powervault_reason_runcmd.md new file mode 100644 index 0000000000..ac8c4ed9f8 --- /dev/null +++ b/input/powervault_reason_runcmd.md @@ -0,0 +1,33 @@ +No. disk_setup cannot do this task. 
Here's why: + +What disk_setup + fs_setup can do + + • Partition a known, static block device path (e.g., /dev/sdb, /dev/sdc) + • Format the partition with a filesystem + • Works only on devices that exist at cloud-init boot time + +Why it cannot replace setup_iscsi_storage.sh + +The script does work that must happen before any device path is known: + +Step disk_setup capable? +Enable iscsid daemon No +Set /etc/iscsi/initiatorname.iscsi No +iSCSI target discovery (iscsiadm -m discovery) No +iSCSI login (iscsiadm -m node --login) No +Enable multipathd No +Identify the correct /dev/mapper/<device> by matching VOLUME_ID No +Create GPT partition if absent Yes (disk_setup) +Format with xfs if no filesystem exists Yes (fs_setup with overwrite: false) +Add fstab entry and mount Yes (mounts:) +Create bind mounts for mysql/spool subdirs Yes (mounts:) + +The hard blocker: disk_setup takes a static device path as the key (/dev/sdb, etc.). The multipath device (/dev/mapper/<device>) is not known until after iSCSI login and multipath scanning — +which is dynamic runtime work that disk_setup has no hook for. + +The iSCSI/multipath setup steps (top 6 rows) must remain in runcmd or a script. Once the device is discovered and the path is known, the partition+format+mount portion could theoretically +use disk_setup+fs_setup+mounts: — but only if you hardcode /dev/mapper/<device> in the config, which defeats the purpose since volume_id matching is what identifies the right device +dynamically. + +Bottom line: disk_setup is for simple, pre-known local block devices. iSCSI over multipath requires daemon setup, target discovery, and dynamic device resolution — none of which cloud-init +modules support. The script stays in runcmd.
\ No newline at end of file diff --git a/input/storage_config.yml b/input/storage_config.yml index 399bf42fd6..bd71898b97 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -19,56 +19,324 @@ # -----------------------------Powervault------------------------------------------- # powervault_config -# Mandatory when using PowerVault for persistent storage. -# Below parameters are mandatory when powervault_config is defined - # ip: A list of PowerVault controller ipv4 addresses used for iSCSI target discovery and login. - # iscsi_initiator: Specifies the InitiatorName used by the host when connecting to the iSCSI target. This IQN uniquely identifies the host to the storage array. - # volume_id: This is the unique WWN/identifier for the specific volume that should be used for persistent storage. This value is used for multipath scanning to select the correct mapped device. - -# Below are the optional parameters when powervault_config is defined - # port: Defines the TCP port for the iSCSI target service. When port is not specified, default port used will be 3260 - -# Below is an example on how to configure powervault_config -# In this configuration, a single controller portal is provided. - -#powervault_config: -# ip: -# - 172.1.2.3 -# port: 3260 -# iscsi_initiator: iqn.2025-01.com.dell:scontrol-node -# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 - - -# -----------------------------NFS------------------------------------------------ - -# This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node -# This takes a list of dicts with possible keys server_ip, server_share_path, client_share_path, client_mount_options -# In both the cases, the USER must manually update 'server_ip' and 'server_share_path' below with the correct values. 
-# If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr" -# Its mandatory to provide atleast one entry in nfs_client_params -# Example for single mount file system: -# nfs_client_params: -# nfs_name : str ,Name of the NFS storage resource. The default is "nfs_storage_default". -# The user can assign any custom string to specify a different NFS storage resource. -# - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} -# Example for supporting multiple mount points: -# nfs_client_params: -# - { server_ip: 198.168.0.1,server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} -# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} -# Example for multiple mount file system: -# nfs_client_params: -# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard"} -# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} -nfs_client_params: - - server_ip: "172.16.107.168" # Provide the IP of the NFS server - server_share_path: "/mnt/share/omnia" # Provide server share path of the NFS Server - client_share_path: /share_omnia - client_mount_options: "nosuid,rw,sync,hard,intr" - nfs_name: nfs_slurm - - - server_ip: "172.16.107.121" # Provide the IP of the NFS server - server_share_path: "/mnt/share/omnia_k8s" # Provide server share path of the NFS Server - client_share_path: /share_omnia_k8s - client_mount_options: "nosuid,rw,sync,hard,intr" - nfs_name: nfs_k8s - +# Processed entirely via runcmd script (setup_iscsi_storage.sh). 
+# The device path (/dev/mapper/XXX) is only known after iSCSI login + multipath scan, +# so powervault mounts CANNOT use the cloud-init mounts module. +# The runcmd script handles: iscsid enable, initiator name, discovery, login, +# multipathd, volume_id matching, partitioning, formatting, mount, and bind mounts. +# +# Mandatory parameters: +# - name: Unique identifier for this powervault entry. Required +# - ip: List of PowerVault controller IPv4 addresses for iSCSI target discovery. Required +# - iscsi_initiator: InitiatorName IQN for the host. Required +# - volume_id: WWN/identifier for the volume (used for multipath device matching). Required +# +# Mount and targeting parameters (each marked Required or Optional below): +# - port: TCP port for iSCSI target service. Default: 3260 +# - mount_point: Where the discovered device gets mounted. Required +# - mount_params: Named profile for fs_type/mnt_opts (read by the runcmd script). Optional +# - node_key: ds.meta_data key for per-node bind mounts (e.g., "local_hostname"). Optional +# - When present, implies bind mount: <mount_point>/<node_key value>/<node_mount_point> -> <node_mount_point> +# - fs_type forced to "none", mnt_opts forced to "bind" (automatic) +# - node_mount_point: List of bind mount targets. Required when node_key is set +# - Pattern: <mount_point>/<node_key value>/<node_mount_point> -> <node_mount_point> +# - slurm_conf_var: List of slurm.conf dir parameters to set to the union of node_mount_point values. Optional +# - Only directory params: StateSaveLocation, SlurmdSpoolDir +# - functional_group_prefix: List of functional group prefixes for node targeting.
Required + +# Example: +# powervault_config: +# - name: powervault1 +# ip: [172.1.2.3] +# port: 3260 +# iscsi_initiator: iqn.2025-01.com.dell:scontrol-node +# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 +# mount_point: "/mnt/slurm-persist" +# mount_params: "powervault_iscsi" +# node_key: "local_ipv4" +# node_mount_point: +# - "/var/lib/mysql" +# - "/var/spool/slurm" +# slurm_conf_var: +# - "StateSaveLocation" +# functional_group_prefix: ["slurm_control_node"] + +powervault_config: + - name: powervault1 + ip: + - 172.1.2.3 + port: 3260 + iscsi_initiator: iqn.2025-01.com.dell:scontrol-node + volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 + # mount params + mount_point: "/mnt/slurm-persist" + mount_params: "powervault_iscsi" + node_key: "local_hostname" + node_mount_point: + - "/var/lib/mysql" + - "/var/spool/slurm" + functional_group_prefix: ["slurm_control_node"] + + - name: powervault2 + ip: + - 172.1.2.4 + port: 3260 + iscsi_initiator: iqn.2025-01.com.dell:slurmd-node + volume_id: 00c0ff4343f1f1f1001c8c4e6901000001 + mount_point: "/mnt/slurmd-persist" + mount_params: "powervault_iscsi" + functional_group_prefix: ["slurm_node"] +# TODO: for powervault, one more possibility is to use one mount with source like powervault:powervault2 + +beegfs_config: # this is not part of fstab mounts, it is handled by beegfs client configuration + - name: "beegfs_scratch" + mgmtd_host: "192.168.1.100" + mount_point: "/mnt/beegfs" + conn_auth_file: "/opt/omnia/beegfs1/connAuth" + client_conf_overrides: + tuneNumWorkers: 8 + functional_group_prefix: ["k8s_node"] + + - name: "beegfs_shared" + mgmtd_host: "192.168.2.100" + mount_point: "/mnt/beegfs-shared" + conn_auth_file: "/opt/omnia/beegfs2/connAuth" # TODO: This file on beegfs management node needs to be generated + client_conf_overrides: + tuneNumWorkers: 8 + # TODO: add more overrides if needed + functional_group_prefix: ["slurm_node"] + +# -----------------------------Cloud-Init
Mounts------------------------------------------------ +# mounts +# Configure mount points compatible with cloud-init mounts module. +# Source must be known at boot time (NFS paths, UUIDs, local devices). +# For runtime-discovered sources (iSCSI/multipath), use powervault_config above. +# +# Each mount entry contains the following fields (matching /etc/fstab format): +# - name: Unique identifier for this mount entry. Required +# - source: Device or network path (e.g., /dev/sdc, UUID=xxx, 192.168.1.100:/share). Required +# - mount_point: Mount point path (e.g., /mnt, /opt/data). Required +# - fs_type: Filesystem type (e.g., ext4, xfs, nfs, nfs4, cifs, auto). Optional +# - If specified, takes PRIORITY over mount_params profile +# - mnt_opts: Mount options (e.g., defaults,noexec,nofail). Optional +# - If specified, takes PRIORITY over mount_params profile +# - dump_freq: Dump frequency (usually "0"). Optional +# - If specified, takes PRIORITY over mount_params profile +# - fsck_pass: Fsck pass number (usually "0" or "2"). Optional +# - If specified, takes PRIORITY over mount_params profile +# - mount_params: Name of a profile in mount_params section. Optional +# - Used ONLY for fields not explicitly specified in the mount entry +# - node_key: ds.meta_data key for per-node bind mounts (e.g., "local_hostname", "local_ipv4"). Optional +# - When present, implies bind mount with per-node source path +# - fs_type forced to "none", mnt_opts forced to "bind" (automatic) +# - Source becomes: <mount_point>/<node_key value>/<node_mount_point> +# - node_mount_point: List of bind mount targets. Required when node_key is set +# - Each target gets: <mount_point>/<node_key value>/<node_mount_point> -> <node_mount_point> +# - slurm_conf_var: List of slurm.conf dir parameters. Optional +# - Each param gets the union of all node_mount_point values +# - Only directory params: StateSaveLocation, SlurmdSpoolDir +# - functional_group_prefix: List of functional group prefixes.
Optional +# - All nodes whose role starts with any prefix get this mount +# - e.g., ["slurm"] matches slurm_control_node, slurm_node, etc. +# - If omitted, mount applies to all nodes +# +# PRIORITY ORDER for field resolution: +# 1. Explicit value in mount entry (HIGHEST PRIORITY) +# 2. Value from mount_params profile (if specified) +# 3. Auto-selected profile based on fs_type +# 4. Global fallback profile +# 5. Hardcoded system defaults (LOWEST PRIORITY) + +# Example: static mount with all explicit params (no profile) +# mounts: + +# - name: "atomic_mount" +# source: "192.168.1.100:/export" +# mount_point: "/mnt/data" +# fs_type: "nfs4" +# mnt_opts: "nfsvers=4.1,hard,intr,noatime,nconnect=16,rsize=1048576,wsize=1048576" +# dump_freq: "0" +# fsck_pass: "0" +# +# Example: static mount using profile +# - name: "vast_home" +# source: "192.168.1.100:/home" +# mount_point: "/home" +# mount_params: "vast_nfs" +# functional_group_prefix: ["slurm"] +# +# Example: per-node bind mount (node_key triggers bind behavior) +# - name: "scratch_isolation" +# source: "/mnt/scratch" +# mount_point: "/mnted/scratch" + +# node_key: "local_hostname" +# node_mount_point: +# - "/scratch" +# - "/tmp" + +# functional_group_prefix: ["slurm_node"] +# # On node001 generates fstab: +# # /mnted/scratch/node001/scratch /scratch none bind 0 0 +# # /mnted/scratch/node001/tmp /tmp none bind 0 0 +# # slurm.conf: SlurmdSpoolDir=/scratch,/tmp + +# /mnt/scratch /mnted/scratch nfs4 defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service 0 0 + +# /mnted/scratch/node001/var/log/state /var/log/state none bind 0 0 + +mounts: + # Static mount: all explicit params, no profile, applies to all nodes + - name: "atomic_mount" + # mount + source: "UUID=" + mount_point: "/mnt/atomic" + # fstab entries + fs_type: "nfs" + mnt_opts: "defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "0" + functional_group_prefix: ["all"] + + # VAST NFS: shared export for home
directories + - name: "vast_home" + source: "{{ vast_nfs_ip }}:/home" + mount_point: "/home" + mount_params: "vast_nfs" + # Target nodes + functional_group_prefix: ["slurm"] + + # VAST NFS: shared export for applications + - name: "vast_apps" + source: "{{ vast_nfs_ip }}:/apps" + mount_point: "/apps" + mount_params: "vast_nfs" + functional_group_prefix: ["slurm", "login"] + + # VAST NFS: shared export for slurm state (controller) + - name: "vast_slurm_state" + source: "{{ vast_nfs_ip }}:/slurm" + mount_point: "/var/slurm" + mount_params: "vast_nfs" + slurm_conf_var: + - "StateSaveLocation" + functional_group_prefix: ["slurm_control_node_x86_64"] + + # VAST NFS: shared scratch export + - name: "vast_scratch_shared" + source: "192.168.1.100:/scratch" + mount_point: "/mnt/scratch" + mount_params: "vast_nfs_performance" + functional_group_prefix: ["slurm", "login"] + + # Per-node bind mount from shared scratch (node_key triggers bind behavior) + - name: "scratch_isolation" + source: "/mnt/scratch" + mount_point: "/mounted/scratch" + node_key: "local_hostname" + node_mount_point: + - "/scratch" + - "/tmp" + slurm_conf_var: + - "SlurmdSpoolDir" + functional_group_prefix: ["slurm_node"] + + # PowerVault: persistent volume for slurm_node hosts (source references powervault_config entry powervault2) + - name: "powervault_slurm_state" + source: "powervault:powervault2" + mount_point: "/mnt/slurmd-persist" + mount_params: "powervault_iscsi" + functional_group_prefix: ["slurm_node"] + +# -----------------------------Mount Params (Profiles)------------------------------- +# Named default profiles for mount configurations. +# Apart from fs_type, mnt_opts, dump_freq, fsck_pass, additional custom fields +# can be defined and used in mount templates (e.g., vast_nfs_ip).
+ +mount_params: + # Default NFS mount - NFSv4.1 (nfsvers=4.1) with high-performance options + default: + fs_type: "nfs4" + mnt_opts: "nfsvers=4.1,hard,intr,noatime,nconnect=16,rsize=1048576,wsize=1048576" + dump_freq: "0" + fsck_pass: "0" + + # VAST NFS storage - standard configuration (NFSv3); vast_nfs_ip is a custom field for "{{ vast_nfs_ip }}" in mount sources + vast_nfs: + fs_type: "nfs" + mnt_opts: "nfsvers=3,hard,intr,noatime,nconnect=16,rsize=1048576,wsize=1048576" + dump_freq: "0" + fsck_pass: "0" + vast_nfs_ip: "192.168.1.100" + + # VAST NFS storage - performance variant: same options as vast_nfs plus nodiratime (rsize/wsize unchanged; no vast_nfs_ip field) + vast_nfs_performance: + fs_type: "nfs" + mnt_opts: "nfsvers=3,hard,intr,noatime,nodiratime,nconnect=16,rsize=1048576,wsize=1048576" + dump_freq: "0" + fsck_pass: "0" + + # PowerVault iSCSI storage - block device with XFS + powervault_iscsi: + fs_type: "xfs" + mnt_opts: "defaults,_netdev,noatime,x-systemd.requires=iscsi.service" + dump_freq: "0" + fsck_pass: "0" + + # BeeGFS parallel filesystem + beegfs: + fs_type: "beegfs" + mnt_opts: "cfgFile=/etc/beegfs/beegfs-client.conf" + dump_freq: "0" + fsck_pass: "0" + + # Network storage defaults (generic NFS, CIFS, etc.) + network_storage: + fs_type: "auto" + mnt_opts: "defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "0" + + # Local storage defaults (ext4, xfs, etc.)
+ local_storage: + fs_type: "auto" + mnt_opts: "defaults,nofail,noatime" + dump_freq: "0" + fsck_pass: "2" + + # Bind mount defaults (fs_type "none" with the "bind" option) + bind_mounts: + fs_type: "none" + mnt_opts: "bind" + dump_freq: "0" + fsck_pass: "0" + + # High-performance scratch storage (XFS) + scratch_storage: + fs_type: "xfs" + mnt_opts: "defaults,nofail,noatime,nodiratime,largeio,inode64" + dump_freq: "0" + fsck_pass: "2" + + # Global fallback defaults (the "Global fallback profile" step of the priority order) + global: + fs_type: "auto" + mnt_opts: "defaults,nofail,x-systemd.after=cloud-init-network.service" + dump_freq: "0" + fsck_pass: "2" + +# -----------------------------Swap------------------------------------------------- +# swap: Swap file configuration (list of swap configurations) +# Each swap entry contains: +# - name: Unique identifier. Required +# - filename: Path to the swap file (e.g., /swapfile). Required +# - size: Size in bytes, 'auto', or human-readable (e.g., "2G", "512M"). Required +# - maxsize: Maximum size (used with size: auto). Optional +# - functional_group_prefix: List of functional group prefixes.
Required + +swap: + - name: "compute_swap" + filename: "/swapfile" + size: "2G" + maxsize: "4G" + functional_group_prefix: ["slurm_node"] diff --git a/input/storage_profile.yml b/input/storage_profile.yml new file mode 100644 index 0000000000..a0ddf78a43 --- /dev/null +++ b/input/storage_profile.yml @@ -0,0 +1,209 @@ +# ======================================== +# FINAL STORAGE CONFIGURATION +# ======================================== +# Purpose: Multi-cluster, multi-architecture storage mounting configuration +# Supports: Pure x86_64, pure aarch64, and hybrid clusters +# Storage Types: PowerScale (NFS), VAST (NFS/RDMA), PowerVault (iSCSI), External NFS + + + +storage_config: + # PowerVault Configuration - iSCSI Block Storage (XFS volumes: options must not carry NFS-only flags such as hard/intr) + powervault_config: + default_options: "defaults,_netdev,noatime" + # Volume 1 - Control Node Database (100GB) + pv1_volume1: + ip: ["10.10.0.21", "powervault01.cluster.local"] + port: 3260 + volume_id: "00c0ff4343f1f1f1001c8c4e6901000000" + iscsi_initiator: "iqn.2025-01.com.dell:scontrol-node" + options: "defaults,_netdev,noatime" + filesystem: "xfs" + + # Volume 2 - Control Node Backup (500GB) + pv1_volume2: + ip: ["10.10.0.21", "powervault01.cluster.local"] + port: 3260 + volume_id: "00c0ff4343f1f1f1001c8c4e6901000001" + iscsi_initiator: "iqn.2025-02.com.dell:scontrol-node" + filesystem: "xfs" + options: "defaults,_netdev,noatime" + + # PowerScale Configuration - NFS File Storage (endpoints are given as an "ips" list, as in the VAST and NFS sections) + powerscale_config: + default_options: "defaults,_netdev,hard,intr,noatime" + ps1: + protocol: "nfs" + ips: ["ps.cluster.local"] + options: "defaults,_netdev,hard,intr,noatime" + ps2: + ips: ["ps.cluster.local"] + protocol: "nfs" + options: "defaults,_netdev,hard,intr,noatime" + + # VAST Configuration - NFS/RDMA Parallel File Storage + vast_config: + default_options: "proto=rdma,port=20049,nconnect=8,remoteports=dns,mdconnect=2,spread_reads,spread_writes,noidlexprt,forcerdirplus" + vast1: + protocol: "nfs_rdma" + ips:
["vast.cluster.local"] + client_share_base_path: "/mnt/vast1" + server_share_base_path: "/mnt/share/omnia_vast1" + options: "proto=rdma,port=20049,nconnect=8,remoteports=dns,mdconnect=2,spread_reads,spread_writes,noidlexprt,forcerdirplus" + + # External NFS Configuration (each entry uses its own client/server base path) + nfs_config: + default_options: "defaults,_netdev,soft,intr" + nfs1: + protocol: "nfs" + ips: ["nfs.cluster.local"] + client_share_base_path: "/mnt/nfs1" + server_share_base_path: "/mnt/share/omnia_nfs1" + options: "defaults,_netdev,soft,intr" + nfs2: + protocol: "nfs" + ips: ["nfs.cluster.local"] + client_share_base_path: "/mnt/nfs2" + server_share_base_path: "/mnt/share/omnia_nfs2" + options: "defaults,_netdev,soft,intr" + +# ======================================== +# STORAGE PROFILES CONFIGURATION +# ======================================== +# Reusable mount profiles for different storage types and node roles + +storage_profiles: + slurm_common_profile:  # NOTE(review): no value here unless slurm_control_nfs_profile is meant to nest under it; not referenced by any mounts entry - confirm or remove + slurm_control_nfs_profile: + ps1: + "/home": "/home" + "/data": "/data" + "/backup": "/backup" + "/var/slurm": "/slurm" + "/etc/slurm": "/etc/slurm" + "/var/log/slurm": "/var/log/slurm" + + slurm_control_mixed_profile: + ps1: + "/home": "/home" + "/data": "/data" + "/backup": "/backup" + "/var/slurm": "/slurm" + "/etc/slurm": "/etc/slurm" + "/var/log/slurm": "/var/log/slurm" + vast1: + "/opt/aarch64_tools": "/aarch64_tools" + "/lib/aarch64": "/lib_aarch64" + pv1_volume1: "/var/lib/mysql" + pv1_volume2: "/backup/object" + + # Slurm Login Node Profile - User Access + slurm_login_profile: + ps1: + "/home": "/home" + "/data": "/data" + "/shared": "/shared" + "/projects": "/projects" + "/tools": "/tools" + "/tmp": "/tmp" + + # Slurm Login/Compiler Node Profile - Development + Compilation + slurm_login_compiler_profile: + ps1: + "/home": "/home" + "/data": "/data" + "/shared": "/shared" + "/projects": "/projects" + "/tools": "/tools" + "/opt/compiler": "/compiler" + "/opt/build": "/build" + vast1: + "/scratch": "/scratch" + "/workspace":
"/workspace" + "/tmp": "/tmp" + + + # Slurm Compute Node Profile - HPC Workloads + slurm_compute_profile: + ps1: + "/home": "/home" + "/data": "/data" + "/opt/apps": "/apps" + "/opt/hpc_tools": "/hpc_tools" + vast1: + "/scratch": "/scratch" + "/tmp": "/tmp" + "/workspace": "/workspace" + "/io-intensive": "/io-intensive" + + slurm_compute_profile_aarch64: + nfs2: + "/home": "/home" + "/data": "/data" + "/opt/apps": "/apps" + "/opt/hpc_tools": "/hpc_tools" + + # Kubernetes Worker Node Profile - Container Workloads + kube_node_profile: + ps2: + "/var/lib/containers": "/var/lib/containers" + "/var/lib/kubelet": "/var/lib/kubelet" + "/var/lib/podman": "/var/lib/podman" + "/opt/cni": "/opt/cni" + + # Kubernetes Controller Node Profile - Control Plane + kube_controller_profile: + ps2: + "/backup": "/backup" + "/etc/kubernetes": "/k8s-config" + "/etcd": "/etcd" + "/tmp": "/tmp" + "/var/lib/etcd": "/var/lib/etcd" + "/var/lib/kubelet": "/var/lib/kubelet" + "/var/lib/containers": "/var/lib/containers" + "/etc/certs": "/etc/certs" + "/opt/cni": "/opt/cni" + +# ======================================== +# CLUSTER MOUNTS CONFIGURATION (Slurm and Kubernetes) +# ======================================== + +mounts: + slurm_cluster1: + # Control Nodes + slurm_control_x86_64: slurm_control_nfs_profile + slurm_control_aarch64: slurm_control_mixed_profile + + # Login Nodes (both architectures use the same login profile) + slurm_login_x86_64: slurm_login_profile + slurm_login_aarch64: slurm_login_profile + + # Login/Compiler Nodes + slurm_login_compiler_x86_64: slurm_login_compiler_profile + slurm_login_compiler_aarch64: slurm_login_compiler_profile + + # Compute Nodes - NOTE(review): the two aarch64 profiles both map /home, /data, /opt/apps and /opt/hpc_tools; confirm which storage takes precedence + slurm_compute_x86_64: slurm_compute_profile + slurm_compute_aarch64: + - slurm_compute_profile + - slurm_compute_profile_aarch64 + + k8s_cluster1: + # Controller Nodes + kube_controller_x86_64: kube_controller_profile + + # Worker Nodes + kube_worker_x86_64: kube_node_profile + kube_worker_aarch64: kube_node_profile + +# ======================================== +# CONFIGURATION NOTES +#
======================================== +# Multi-storage support: example with more than one PV, PS, VAST, etc. +# Multi-Cluster Support +# Multi-Architecture +# Complete Storage Types +# Mount Options +# Path Resolution Logic +# Volume Management +# Role Organization \ No newline at end of file