diff --git a/roles/helper/pci/README.md b/roles/helper/pci/README.md index 93b4aa5..36cd4a7 100644 --- a/roles/helper/pci/README.md +++ b/roles/helper/pci/README.md @@ -11,21 +11,22 @@ N/A Role Variables -------------- -| Name | Type | Default | Description | -|-----------------------------|--------|-----------|-------------------------------------------------------------------------------------| -| `pci_devices` | `list` | `[]` | PCI devices configuration. | -| `pci_devices[].excluded` | `bool` | `false` | Do not process matching PCI devices. | -| `pci_devices[].unguarded` | `bool` | `false` | Do not protect matching PCI devices (this may cause primary NIC connectivity loss). | -| `pci_devices[].unlisted` | `bool` | `true` | Do not pass matching PCI devices to OpenNebula. | -| `pci_devices[].virtual` | `bool` | `false` | Do not fail query on missing virtual devices (SR-IOV). | -| `pci_devices[].address` | `str` | undefined | Glob PCI devices by PCI or MAC address. | -| `pci_devices[].vendor` | `str` | `*` | Glob PCI devices by PCI Vendor (if address is undefined). | -| `pci_devices[].device` | `str` | `*` | Glob PCI devices by PCI Device (if address is undefined). | -| `pci_devices[].class` | `str` | `*` | Glob PCI devices by PCI Class (if address is undefined). | -| `pci_devices[].set_counter` | `str` | undefined | Reset the "set_counter" internal counter that can be used with set_name ("{3}"). | -| `pci_devices[].set_driver` | `str` | `omit` | Use driverctl to override driver (unless "omit"). | -| `pci_devices[].set_name` | `str` | `omit` | Rename device in udev (unless "omit"). | -| `pci_devices[].set_numvfs` | `str` | `0` | Enable Virtual Functions for SR-IOV capable devices (integer >= 0 or "max"). | +| Name | Type | Default | Description | +|-------------------------------|--------|-----------|-------------------------------------------------------------------------------------| +| `pci_devices` | `list` | `[]` | PCI devices configuration. 
| +| `pci_devices[].excluded` | `bool` | `false` | Do not process matching PCI devices. | +| `pci_devices[].unguarded` | `bool` | `false` | Do not protect matching PCI devices (this may cause primary NIC connectivity loss). | +| `pci_devices[].unlisted` | `bool` | `true` | Do not pass matching PCI devices to OpenNebula. | +| `pci_devices[].virtual` | `bool` | `false` | Do not fail query on missing virtual devices (SR-IOV). | +| `pci_devices[].address` | `str` | undefined | Glob PCI devices by PCI or MAC address. | +| `pci_devices[].vendor` | `str` | `*` | Glob PCI devices by PCI Vendor (if address is undefined). | +| `pci_devices[].device` | `str` | `*` | Glob PCI devices by PCI Device (if address is undefined). | +| `pci_devices[].class` | `str` | `*` | Glob PCI devices by PCI Class (if address is undefined). | +| `pci_devices[].set_counter` | `str` | undefined | Reset the "set_counter" internal counter that can be used with set_name ("{3}"). | +| `pci_devices[].set_driver` | `str` | `omit` | Use driverctl to override driver (unless "omit"). | +| `pci_devices[].set_name` | `str` | `omit` | Rename device in udev (unless "omit"). | +| `pci_devices[].set_numvfs` | `str` | `0` | Enable Virtual Functions for SR-IOV capable devices (integer >= 0 or "max"). | +| `pci_devices[].set_switchdev` | `bool` | `false` | Toggle legacy/switchdev modes for SR-IOV capable devices. | Dependencies ------------ @@ -138,11 +139,12 @@ Example Playbook - hosts: node vars: pci_devices: - # Enable all available VFs for all existing Mellanox PFs. + # Enable all available VFs for all existing Mellanox PFs, then enable switchdev mode. - vendor: "15b3" device: "1015" class: "0200" set_numvfs: max + set_switchdev: true # Rename all existing Mellanox VFs using custom counter (starting from 1), then pass them to OpenNebula. 
- vendor: "15b3" diff --git a/roles/helper/pci/tasks/devices.yml b/roles/helper/pci/tasks/devices.yml index fe133fb..966cd17 100644 --- a/roles/helper/pci/tasks/devices.yml +++ b/roles/helper/pci/tasks/devices.yml @@ -3,11 +3,11 @@ ansible.builtin.package: name: "{{ _common + _specific[ansible_os_family] }}" vars: - _common: [bash, coreutils, driverctl, findutils, grep, pciutils] + _common: [bash, coreutils, driverctl, grep, pciutils] _specific: - Debian: [] - RedHat: [] - Suse: [] + Debian: [iproute2] + RedHat: [iproute] + Suse: [iproute2] register: package until: package is success retries: 12 @@ -57,51 +57,25 @@ - when: lspci_devices | count > 0 block: - name: Render sriov-enable service unit - ansible.builtin.copy: + ansible.builtin.template: dest: "{{ item.dest }}" + src: "{{ item.src }}" owner: 0 group: 0 mode: "{{ item.mode }}" - content: "{{ item.cmd }}" loop: - dest: /usr/local/sbin/sriov-manage.sh + src: sriov-manage.sh.jinja mode: u=rwx,go=rx - cmd: | - #!/usr/bin/env bash - set -eu - - # Split the input (e.g., 0000:27:00.0-4) into address and count - IFS='-' read -r PCI_ADDR VF_COUNT <<< "$1" - - # Verify the device exists before writing - if [[ -d "/sys/bus/pci/devices/$PCI_ADDR" ]]; then - echo "Setting $VF_COUNT VFs on $PCI_ADDR" - echo "$VF_COUNT" > "/sys/bus/pci/devices/$PCI_ADDR/sriov_numvfs" - else - echo "Error: Device $PCI_ADDR not found" >&2 - exit 1 - fi - dest: /etc/systemd/system/sriov-enable@.service + src: sriov-enable@.service.jinja mode: u=rw,go=r - cmd: | - [Unit] - Description=Enable SR-IOV VFs on %I - After=network-pre.target - - [Service] - Type=oneshot - RemainAfterExit=yes - # %I is replaced by the string after the @ in the command - ExecStart=/usr/local/sbin/sriov-manage.sh %i - - [Install] - WantedBy=multi-user.target - register: copy_sriov_enable_service + register: template_sriov_enable_service - name: Reload systemd ansible.builtin.systemd_service: daemon_reload: true - when: copy_sriov_enable_service is changed + when: 
template_sriov_enable_service is changed - name: Override drivers (revert when needed) ansible.builtin.shell: @@ -141,7 +115,7 @@ file: "{{ role_path }}/tasks/query.yml" when: shell_revert_drivers is changed - - name: (Re)Enable VFs + - name: Enable VFs ansible.builtin.shell: cmd: | set -x -o errexit -o pipefail @@ -156,7 +130,11 @@ {% endif %} if [[ -n "$SRIOV_NUMVFS" ]]; then ALL="$(systemctl show --all -P Id 'sriov-enable@{{ v.Slot }}-*.service' | grep -E -v '^\s*$')" ||: + {% if v.Set_switchdev == 'yes' %} + TO_ENABLE="sriov-enable@{{ v.Slot }}-$SRIOV_NUMVFS-switchdev.service" + {% else %} TO_ENABLE="sriov-enable@{{ v.Slot }}-$SRIOV_NUMVFS.service" + {% endif %} if [[ "$(head -n1 '/sys/bus/pci/devices/{{ v.Slot }}/sriov_numvfs')" == 0 ]]; then # This handles the invalid case where VFs are no longer enabled but service is still active TO_DISABLE="$ALL" diff --git a/roles/helper/pci/tasks/query.yml b/roles/helper/pci/tasks/query.yml index 07266fe..dbb48fc 100644 --- a/roles/helper/pci/tasks/query.yml +++ b/roles/helper/pci/tasks/query.yml @@ -22,6 +22,7 @@ echo -e 'Set_driver:\t{{ v.set_driver | d('omit') }}' echo -e 'Set_name:\t{{ v.set_name | d('omit') }}' echo -e 'Set_numvfs:\t{{ v.set_numvfs | d(0) }}' + echo -e 'Set_switchdev:\t{{ v.set_switchdev | d(false) | bool | ternary('yes', 'no') }}' echo -e 'Virtual:\t{{ v.virtual | d(false) | bool | ternary('yes', 'no') }}' echo -e 'Unlisted:\t{{ v.unlisted | d(true) | bool | ternary('yes', 'no') }}' echo -e 'Unguarded:\t{{ v.unguarded | d(false) | bool | ternary('yes', 'no') }}' @@ -52,6 +53,7 @@ echo -e 'Set_driver:\t{{ v.set_driver | d('omit') }}' echo -e 'Set_name:\t{{ v.set_name | d('omit') }}' echo -e 'Set_numvfs:\t{{ v.set_numvfs | d(0) }}' + echo -e 'Set_switchdev:\t{{ v.set_switchdev | d(false) | bool | ternary('yes', 'no') }}' echo -e 'Virtual:\t{{ v.virtual | d(false) | bool | ternary('yes', 'no') }}' echo -e 'Unlisted:\t{{ v.unlisted | d(true) | bool | ternary('yes', 'no') }}' echo -e 'Unguarded:\t{{ 
v.unguarded | d(false) | bool | ternary('yes', 'no') }}' @@ -89,6 +91,7 @@ echo -e 'Set_driver:\t{{ v.set_driver | d('omit') }}' echo -e 'Set_name:\t{{ v.set_name | d('omit') }}' echo -e 'Set_numvfs:\t{{ v.set_numvfs | d(0) }}' + echo -e 'Set_switchdev:\t{{ v.set_switchdev | d(false) | bool | ternary('yes', 'no') }}' echo -e 'Virtual:\t{{ v.virtual | d(false) | bool | ternary('yes', 'no') }}' echo -e 'Unlisted:\t{{ v.unlisted | d(true) | bool | ternary('yes', 'no') }}' echo -e 'Unguarded:\t{{ v.unguarded | d(false) | bool | ternary('yes', 'no') }}' diff --git a/roles/helper/pci/tasks/udev.yml b/roles/helper/pci/tasks/udev.yml index 9aef27e..b4773f0 100644 --- a/roles/helper/pci/tasks/udev.yml +++ b/roles/helper/pci/tasks/udev.yml @@ -31,26 +31,28 @@ {{ _all | selectattr('IOMMUGroup', 'defined') }} _vf_to_pf: >- - {{ shell_vf_to_pf_fn.stdout_lines | d([]) - | map('split', ';') - | items2dict(key_name=0, value_name=1) }} + {{ shell_pf_vf_fn.stdout_lines | d([]) + | map('split', ';') + | items2dict(key_name=1, value_name=0) }} _vf_to_fn: >- - {{ shell_vf_to_pf_fn.stdout_lines | d([]) - | map('split', ';') - | items2dict(key_name=0, value_name=2) }} + {{ shell_pf_vf_fn.stdout_lines | d([]) + | map('split', ';') + | items2dict(key_name=1, value_name=2) }} block: - name: Scan /sys/bus/pci/devices/*/virtfn* (SR-IOV) ansible.builtin.shell: cmd: | - set -o errexit -o pipefail + set -o errexit -o pipefail; shopt -s nullglob {% for v in _sriov_devices %} - find -P "/sys/bus/pci/devices/{{ v.Slot }}/" -maxdepth 1 -type l -name 'virtfn*' -printf '%l/%P\n' | while IFS='/' read -r _ VF FN; do - echo "$VF;{{ v.Slot }};${FN#virtfn}" + for VF_PATH in '/sys/bus/pci/devices/{{ v.Slot }}/virtfn'*; do + VF="$(basename "$(realpath "$VF_PATH")")" + FN="$(basename "$VF_PATH")" + echo "{{ v.Slot }};$VF;${FN#virtfn}" done {% endfor %} executable: /bin/bash - register: shell_vf_to_pf_fn + register: shell_pf_vf_fn changed_when: false when: _sriov_devices | count > 0 diff --git 
a/roles/helper/pci/templates/sriov-enable@.service.jinja b/roles/helper/pci/templates/sriov-enable@.service.jinja
new file mode 100644
index 0000000..53a2d84
--- /dev/null
+++ b/roles/helper/pci/templates/sriov-enable@.service.jinja
@@ -0,0 +1,13 @@
+# managed by one-deploy; vim:syn=systemd:
+[Unit]
+Description=Enable SR-IOV VFs on %i
+After=network-pre.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+# %i is the instance name (the string after the @) passed as-is; %I would unescape "-" to "/"
+ExecStart=/usr/local/sbin/sriov-manage.sh %i
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/helper/pci/templates/sriov-manage.sh.jinja b/roles/helper/pci/templates/sriov-manage.sh.jinja
new file mode 100644
index 0000000..eb3ee34
--- /dev/null
+++ b/roles/helper/pci/templates/sriov-manage.sh.jinja
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# managed by one-deploy; vim:syn=bash:
+#
+# Usage: sriov-manage.sh <pci-address>-<vf-count>[-<option>...]
+#
+# Applies the requested options (currently only "switchdev") to the given
+# PCI device, then writes <vf-count> to its sriov_numvfs attribute.
+# Invoked by sriov-enable@.service with the unit instance name as $1.
+
+set -o errexit -o nounset -o pipefail; shopt -s nullglob
+
+# Fail early and name the missing tool, so the reason shows up in the unit's log
+for CMD in basename devlink realpath; do
+    if ! type -p "$CMD" &>/dev/null; then
+        echo "ERROR: Required command not found: $CMD" >&2
+        exit 1
+    fi
+done
+
+# Split the input (e.g., 0000:27:00.0-4-switchdev) into address, count and options
+IFS='-' read -r PCI_ADDR VF_COUNT OPTIONS <<< "$1"
+IFS='-' read -ra OPTIONS <<< "$OPTIONS"
+
+# Verify the device exists before writing
+if ! [[ -d "/sys/bus/pci/devices/$PCI_ADDR/" ]]; then
+    echo "ERROR: No such device: $PCI_ADDR" >&2
+    exit 1
+fi
+
+# Handle extra options before writing
+for OPT in "${OPTIONS[@]}"; do
+    case "$OPT" in
+        switchdev)
+            if ! ESWITCH_SHOW="$(devlink dev eswitch show "pci/$PCI_ADDR")"; then
+                echo "ERROR: Not an eswitch device: $PCI_ADDR" >&2
+                exit 1
+            fi
+
+            # Already in switchdev mode: leave the VFs and their drivers alone
+            if [[ "$ESWITCH_SHOW" =~ (^|[[:space:]])mode[[:space:]]+switchdev([[:space:]]|$) ]]; then
+                echo "WARNING: Nothing to do for: $OPT" >&2
+                continue
+            fi
+
+            # Remember the driver of every existing VF so that it can be rebound later
+            declare -A VF_DRIVER_MAP
+            for VF_PATH in "/sys/bus/pci/devices/$PCI_ADDR/virtfn"*; do
+                if ! VF_DRIVER="$(realpath -e "$VF_PATH/driver")"; then
+                    echo "WARNING: No driver found: $VF_PATH/driver" >&2
+                    continue
+                fi
+                VF_PCI_ADDR="$(basename "$(realpath "$VF_PATH")")"
+                VF_DRIVER_MAP["$VF_PCI_ADDR"]="$VF_DRIVER"
+            done
+
+            for VF_PCI_ADDR in "${!VF_DRIVER_MAP[@]}"; do
+                echo "Unbinding $VF_PCI_ADDR from ${VF_DRIVER_MAP["$VF_PCI_ADDR"]}" >&2
+                echo "$VF_PCI_ADDR" >"${VF_DRIVER_MAP["$VF_PCI_ADDR"]}/unbind"
+            done
+
+            # Do not abort right away on failure: the VFs unbound above must be rebound first
+            echo "Enabling switchdev mode on $PCI_ADDR" >&2
+            ESWITCH_RC=0
+            devlink dev eswitch set "pci/$PCI_ADDR" mode switchdev || ESWITCH_RC=$?
+
+            for VF_PCI_ADDR in "${!VF_DRIVER_MAP[@]}"; do
+                echo "Binding $VF_PCI_ADDR to ${VF_DRIVER_MAP["$VF_PCI_ADDR"]}" >&2
+                echo "$VF_PCI_ADDR" >"${VF_DRIVER_MAP["$VF_PCI_ADDR"]}/bind"
+            done
+
+            if [[ "$ESWITCH_RC" -ne 0 ]]; then
+                echo "ERROR: Failed to enable switchdev mode on $PCI_ADDR" >&2
+                exit "$ESWITCH_RC"
+            fi
+            ;;
+        *)
+            echo "ERROR: Unrecognized option: $OPT" >&2
+            exit 1
+            ;;
+    esac
+done
+
+echo "Setting $VF_COUNT VFs on $PCI_ADDR"
+echo "$VF_COUNT" >"/sys/bus/pci/devices/$PCI_ADDR/sriov_numvfs"