# Patch: roles/openvswitch -- DPDK socket-memory handling
#
# (Free text placed before the first "diff --git" header; patch tools skip it.)
#
# What the hunks below do:
#   * README.md: the example drops the "hugepagesz" kernel parameter, sets
#     "hugepages: 4", and switches "other_config:dpdk-socket-mem" to "auto".
#   * defaults/main.yml: adds "gawk" to every ovs_packages_dpdk list.
#   * tasks/main.yml: adds the _dpdk_socket_mem_defined helper, includes
#     tasks/memory.yml, creates the ovs-vswitchd.service.d/ directory and
#     installs/starts the new dpdk-socket-mem units, all gated on
#     "_dpdk_enabled and _dpdk_socket_mem_defined".
#   * tasks/memory.yml (new): reads each DPDK device's numa_node from sysfs
#     and, when dpdk-socket-mem is 'auto', replaces it in the "ovs" fact with
#     a per-NUMA-node estimate (shared or per-port model).
#   * templates/dpdk-socket-mem-scan.sh.jinja (new): prints a comma-separated
#     per-node total computed from /proc/<ovs-vswitchd pid>/numa_maps.
#   * templates/dpdk-socket-mem.sh.jinja (new): compares that value with the
#     wanted one, raises per-node 1 GiB nr_hugepages when free pages are
#     short, re-applies "ovs.set" via ovs-vsctl and restarts ovs-vswitchd if
#     anything was raised.
#   * templates/dpdk-socket-mem.service.jinja and ovs-vswitchd.service.jinja
#     (new): oneshot unit running the script, plus a drop-in ordering
#     ovs-vswitchd after it.
#
# NOTE(review): this patch was recovered from a copy whose newlines and
#   indentation had been collapsed. Line breaks were re-derived from the hunk
#   headers (all old/new counts add up), but the indentation of context and
#   removed lines in pre-existing files is a reconstruction -- verify against
#   the target tree or apply with whitespace tolerance (e.g. patch -l). The
#   three context lines opening the last tasks/main.yml hunk are the least
#   certain: only one of them was visible in the damaged copy.
# NOTE(review): in dpdk-socket-mem-scan.sh.jinja the three gawk regexes had
#   lost their \<...\> bodies (tag stripping); they are restored here from
#   the N[1]/N[2]/P[1] usage and the numa_maps field names -- confirm against
#   the upstream commit.
# NOTE(review): the END block of that script now coerces the for-in index
#   with "+ 0"; awk array subscripts are strings, so the uncoerced comparison
#   mis-orders NUMA node ids >= 10.
#
diff --git a/roles/openvswitch/README.md b/roles/openvswitch/README.md
index d060046..52ce98e 100644
--- a/roles/openvswitch/README.md
+++ b/roles/openvswitch/README.md
@@ -28,10 +28,12 @@ Example Playbook
       vars:
         kernel_ok_to_reboot: true
         kernel_params:
-          - default_hugepagesz: "1G"
-          - hugepagesz: "1G"
-          - hugepages: 3
           - intel_iommu: "on"
+          # NOTE: With the 'auto' dpdk-socket-mem and 2 x MTU 9000 this is not enough,
+          # the dpdk-socket-mem.service will attempt dynamic allocation.
+          # Pre-allocating hugepages in kernel's cmdline is highly recommended.
+          - default_hugepagesz: "1G"
+          - hugepages: 4
         kernel_modules:
           - load: vfio-pci
           - load: vfio_iommu_type1
@@ -58,7 +60,10 @@ Example Playbook
         ovs:
           set:
             - other_config:dpdk-init: 'true'
-            - other_config:dpdk-socket-mem: '1024,0'
+            - other_config:per-port-memory: 'false' # the default
+            # NOTE: With 'auto' one-deploy will automatically calculate and allocate memory for you.
+            # Please ensure you have enough memory available.
+            - other_config:dpdk-socket-mem: auto
           port:
             ovsbr0: # "internal" port
               set:
diff --git a/roles/openvswitch/defaults/main.yml b/roles/openvswitch/defaults/main.yml
index 59787a0..6c1433d 100644
--- a/roles/openvswitch/defaults/main.yml
+++ b/roles/openvswitch/defaults/main.yml
@@ -35,12 +35,14 @@ ovs_packages_dpdk:
   AlmaLinux:
     - dpdk-tools
     - ethtool
+    - gawk
     - iproute
     - iputils
     - openvswitch3.5
     - systemd-resolved
   Debian:
     - ethtool
+    - gawk
     - iproute2
     - iputils-arping
     - openvswitch-switch-dpdk
@@ -48,6 +50,7 @@ ovs_packages_dpdk:
   RedHat:
     - dpdk-tools
     - ethtool
+    - gawk
     - iproute
     - iputils
     - openvswitch3.6
@@ -55,6 +58,7 @@ ovs_packages_dpdk:
   Suse:
     - dpdk-tools
     - ethtool
+    - gawk
     - iproute2
     - iputils
     - openvswitch
diff --git a/roles/openvswitch/tasks/main.yml b/roles/openvswitch/tasks/main.yml
index 57c4e89..331b3f1 100644
--- a/roles/openvswitch/tasks/main.yml
+++ b/roles/openvswitch/tasks/main.yml
@@ -6,15 +6,24 @@
 - when: ((ovs.iface | count) + (ovs.bond | count) + (ovs.br | count)) > 0
   vars:
     # helpers
+
     _pci_addr_regex: >-
       ^[0-9a-fA-F]{4}[:][0-9a-fA-F]{2}[:][0-9a-fA-F]{2}[.][0-9a-fA-F]{1,2}$
+
     # general
+
     _dpdk_enabled: >-
       {{ ovs.set | d([])
          | selectattr('other_config:dpdk-init', 'defined')
          | selectattr('other_config:dpdk-init', 'in', ['true'])
          | count > 0 }}
+
+    _dpdk_socket_mem_defined: >-
+      {{ ovs.set | d([])
+         | selectattr('other_config:dpdk-socket-mem', 'defined')
+         | count > 0 }}
     # iface
+
     _dpdk_iface: >-
       {%- set output = [] -%}
       {%- for k, v in ovs.iface.items() -%}
@@ -25,6 +34,7 @@
         {%- endfor -%}
       {%- endfor -%}
       {{- dict(output) -}}
+
     _dpdk_pci_addrs_raw: >-
       {%- set output = [] -%}
       {%- for k, v in _dpdk_iface.items() -%}
@@ -38,14 +48,18 @@
         {%- endfor -%}
       {%- endfor -%}
       {{- output -}}
+
     _dpdk_pci_addrs_items: >-
       {{ _dpdk_pci_addrs_raw | select | unique }}
+
     _dpdk_pci_addrs: >-
       {{ dict(_dpdk_pci_addrs_items) }}
+
     _internal_iface: >-
       {{ ovs.iface | dict2items
          | selectattr('key', 'in', ovs.br.keys())
          | items2dict }}
+
     _system_iface: >-
       {%- set output = [] -%}
       {%- for k, v in ovs.iface.items() | rejectattr(0, 'in', _internal_iface.keys()) -%}
@@ -58,32 +72,39 @@
         {%- endif -%}
       {%- endfor -%}
       {{- dict(output) -}}
+
     _iface: >-
       {{ ovs.iface | dict2items
          | rejectattr('key', 'in', _dpdk_iface.keys())
          | rejectattr('key', 'in', _internal_iface.keys())
          | items2dict }}
+
     _pci_addrs: >-
       {{ command_udevadm_pci_addresses.stdout_lines | d([])
          | select
          | map('regex_replace', '^pci-', '') }}
     # bond
+
     _dpdk_bond: >-
       {{ ovs.bond | dict2items
          | selectattr('value.ifaces', 'defined')
          | selectattr('value.ifaces', 'subset', _dpdk_iface.keys())
          | items2dict }}
+
     _bond: >-
       {{ ovs.bond | dict2items
          | selectattr('value.ifaces', 'defined')
          | selectattr('value.ifaces', 'subset', _iface.keys())
          | items2dict }}
+
     _bond_ifaces: >-
       {{ ovs.bond | dict2items
          | selectattr('value.ifaces', 'defined')
          | map(attribute='value.ifaces')
          | flatten }}
+
     # br
+
     _dpdk_br: >-
       {%- set output = [] -%}
       {%- for k, v in ovs.br.items() -%}
@@ -94,10 +115,12 @@
         {%- endfor -%}
       {%- endfor -%}
       {{- dict(output) -}}
+
     _br: >-
       {{ ovs.br | dict2items
          | rejectattr('key', 'in', _dpdk_br.keys())
          | items2dict }}
+
     _br_ports: >-
       {{ ovs.br | dict2items
          | selectattr('value.ports', 'defined')
@@ -283,9 +306,7 @@
       retries: 12
       delay: 5
 
-    - when:
-        - ansible_os_family in ['Debian']
-        - _dpdk_enabled is true
+    - when: ansible_os_family in ['Debian'] and _dpdk_enabled
       block:
         - name: Use DPDK version of ovs-vswitchd
           community.general.alternatives:
@@ -357,32 +378,75 @@
                and (alternatives_ovs_vswitchd is changed))
             }}
 
-    - name: Install OVS-related scripts
-      ansible.builtin.template:
-        src: "{{ item.src }}"
-        dest: "{{ item.dest }}"
-        mode: "{{ item.mode }}"
-        owner: 0
-        group: 0
-      loop:
-        - src: opennebula-ovs.sh.jinja
-          dest: /usr/local/sbin/opennebula-ovs.sh
-          mode: u=rwx,go=rx
-        - src: opennebula-ovs.service.jinja
-          dest: /etc/systemd/system/opennebula-ovs.service
-          mode: u=rw,go=r
-      register: template
-
-    - name: Switch to OVS networking
-      ansible.builtin.systemd_service:
-        daemon_reload: "{{ item.daemon_reload | d(omit) }}"
-        name: "{{ item.name | d(omit) }}"
-        state: "{{ item.state | d(omit) }}"
-        enabled: "{{ item.enabled | d(omit) }}"
-      loop:
-        - daemon_reload: "{{ _changed }}"
-        - name: opennebula-ovs.service
-          state: "{{ 'restarted' if _changed else 'started' }}"
-          enabled: true
-      vars:
-        _changed: "{{ template is changed }}"
+    - when: _dpdk_enabled and _dpdk_socket_mem_defined
+      block:
+        - ansible.builtin.include_tasks:
+            file: "{{ role_path }}/tasks/memory.yml"
+
+        - name: Create /etc/systemd/system/ovs-vswitchd.service.d/
+          ansible.builtin.file:
+            path: /etc/systemd/system/ovs-vswitchd.service.d/
+            state: directory
+            owner: 0
+            group: 0
+            mode: u=rwx,go=rx
+
+    - block:
+        - name: Install OVS-related scripts
+          ansible.builtin.template:
+            src: "{{ item.src }}"
+            dest: "{{ item.dest }}"
+            mode: "{{ item.mode }}"
+            owner: 0
+            group: 0
+          when: item.when | d(true)
+          loop:
+            - when: "{{ _dpdk_enabled and _dpdk_socket_mem_defined }}"
+              src: dpdk-socket-mem-scan.sh.jinja
+              dest: /usr/local/sbin/dpdk-socket-mem-scan.sh
+              mode: u=rwx,go=rx
+
+            - when: "{{ _dpdk_enabled and _dpdk_socket_mem_defined }}"
+              src: dpdk-socket-mem.sh.jinja
+              dest: /usr/local/sbin/dpdk-socket-mem.sh
+              mode: u=rwx,go=rx
+
+            - when: "{{ _dpdk_enabled and _dpdk_socket_mem_defined }}"
+              src: dpdk-socket-mem.service.jinja
+              dest: /etc/systemd/system/dpdk-socket-mem.service
+              mode: u=rw,go=r
+
+            - when: "{{ _dpdk_enabled and _dpdk_socket_mem_defined }}"
+              src: ovs-vswitchd.service.jinja
+              dest: /etc/systemd/system/ovs-vswitchd.service.d/override.conf
+              mode: u=rw,go=r
+
+            - src: opennebula-ovs.sh.jinja
+              dest: /usr/local/sbin/opennebula-ovs.sh
+              mode: u=rwx,go=rx
+
+            - src: opennebula-ovs.service.jinja
+              dest: /etc/systemd/system/opennebula-ovs.service
+              mode: u=rw,go=r
+          register: template
+
+        - name: Switch to OVS networking
+          ansible.builtin.systemd_service:
+            daemon_reload: "{{ item.daemon_reload | d(omit) }}"
+            name: "{{ item.name | d(omit) }}"
+            state: "{{ item.state | d(omit) }}"
+            enabled: "{{ item.enabled | d(omit) }}"
+          when: item.when | d(true)
+          loop:
+            - daemon_reload: "{{ _changed }}"
+
+            - when: "{{ _dpdk_enabled and _dpdk_socket_mem_defined }}"
+              name: dpdk-socket-mem.service
+              state: "{{ 'restarted' if _changed else 'started' }}"
+              enabled: true
+
+            - name: opennebula-ovs.service
+              state: "{{ 'restarted' if _changed else 'started' }}"
+              enabled: true
+          vars:
+            _changed: "{{ template is changed }}"
diff --git a/roles/openvswitch/tasks/memory.yml b/roles/openvswitch/tasks/memory.yml
new file mode 100644
index 0000000..0c5dae2
--- /dev/null
+++ b/roles/openvswitch/tasks/memory.yml
@@ -0,0 +1,163 @@
+---
+- vars:
+    _pci_addr_regex: >-
+      ^[0-9a-fA-F]{4}[:][0-9a-fA-F]{2}[:][0-9a-fA-F]{2}[.][0-9a-fA-F]{1,2}$
+
+    _per_port_memory: >-
+      {{ ovs.set | d([])
+         | selectattr('other_config:per-port-memory', 'defined')
+         | map(attribute='other_config:per-port-memory')
+         | map('bool')
+         | first
+         | d(false) }}
+
+    _dpdk_socket_mem: >-
+      {{ ovs.set | d([])
+         | selectattr('other_config:dpdk-socket-mem', 'defined')
+         | map(attribute='other_config:dpdk-socket-mem')
+         | first
+         | d('') }}
+
+    _dpdk_iface: >-
+      {%- set output = [] -%}
+      {%- for k, v in ovs.iface.items() -%}
+        {%- for u in v.set | d([]) | selectattr('type', 'defined') -%}
+          {%- if u.type in ['dpdk'] -%}
+            {{- output.append([k, v]) -}}
+          {%- endif -%}
+        {%- endfor -%}
+      {%- endfor -%}
+      {{- dict(output) -}}
+
+    _dpdk_mtu_per_iface: >-
+      {%- set output = [] -%}
+      {%- for iface, v in _dpdk_iface.items() -%}
+        {{- output.append([iface, 1500]) -}}
+        {%- for o in v.set | d([]) | selectattr('mtu_request', 'defined') -%}
+          {{- output.append([iface, o.mtu_request]) -}}
+        {%- endfor -%}
+      {%- endfor -%}
+      {{- dict(output) -}}
+
+    _dpdk_pci_addr_per_iface: >-
+      {%- set output = [] -%}
+      {%- for iface, v in _dpdk_iface.items() -%}
+        {%- for u in v.set | selectattr('options:dpdk-devargs', 'defined') -%}
+          {%- set _devargs = u['options:dpdk-devargs'] | string -%}
+          {%- if _devargs | regex_search(_pci_addr_regex) -%}
+            {{- output.append([iface, _devargs]) -}}
+          {%- endif -%}
+        {%- endfor -%}
+      {%- endfor -%}
+      {{- dict(output) -}}
+
+    _dpdk_numa_node_per_iface: >-
+      {%- set output = [] -%}
+      {%- for iface, numa_node in shell_pci_addr_per_iface.stdout_lines | map('split', ';') -%}
+        {{- output.append([iface, numa_node | int]) -}}
+      {%- endfor -%}
+      {{- dict(output) -}}
+
+    _dpdk_ifaces_per_numa_node: >-
+      {%- set output = {} -%}
+      {%- for iface, numa_node in _dpdk_numa_node_per_iface.items() -%}
+        {{-
+          output.update(output | combine(
+            { numa_node: output.get(numa_node, []) + [iface] }
+          ))
+        -}}
+      {%- endfor -%}
+      {{- output -}}
+
+    # NOTE:
+    # 896 = 704 + 128 + 64 <- metadata overhead (constant for all MTUs)
+    # 704 <- dp_packet & rte_mbuf
+    # 128 <- RTE_PKTMBUF_HEADROOM
+    # 64 <- alignment
+    # 262_144 <- MAX_NB_MBUF
+    _dpdk_base_pool_sizes: >-
+      {%- set output = [] -%}
+      {%- for mtu in _dpdk_mtu_per_iface.values() -%}
+        {{-
+          output.append([
+            mtu,
+            ((((((mtu / 1024) | round(0, 'ceil') * 1024) | int + 896) / 64) | round(0, 'ceil') * 64) | int + 64) * 262_144,
+          ])
+        -}}
+      {%- endfor -%}
+      {{- dict(output) -}}
+
+    # NOTE:
+    # 536_870_912 <- overhead buffer (512M)
+    _dpdk_socket_mem_shared_model: >-
+      {%- set output = {} -%}
+      {%- for numa_node, ifaces in _dpdk_ifaces_per_numa_node.items() -%}
+        {%- for mtu in _dpdk_mtu_per_iface.items() | selectattr(0, 'in', ifaces) | map(attribute=1) | unique -%}
+          {{-
+            output.update(
+              { numa_node: (output.get(numa_node, 536_870_912) + _dpdk_base_pool_sizes.get(mtu)) }
+            )
+          -}}
+        {%- endfor -%}
+        {{-
+          output.update(
+            { numa_node: ((output.get(numa_node) / 1024**3) | round(0, 'ceil') * 1024) | int }
+          )
+        -}}
+      {%- endfor -%}
+      {{- range(output | max + 1) | zip((output | max + 1) * [0])
+          | items2dict(key_name=0, value_name=1)
+          | combine(output) -}}
+
+    # NOTE:
+    # 536_870_912 <- overhead buffer (512M)
+    _dpdk_socket_mem_per_port_model: >-
+      {%- set output = {} -%}
+      {%- for numa_node, ifaces in _dpdk_ifaces_per_numa_node.items() -%}
+        {%- for iface in ifaces -%}
+          {{-
+            output.update(
+              { numa_node: output.get(numa_node, 536_870_912) + _dpdk_base_pool_sizes.get(_dpdk_mtu_per_iface.get(iface)) }
+            )
+          -}}
+        {%- endfor -%}
+        {{-
+          output.update(
+            { numa_node: ((output.get(numa_node) / 1024**3) | round(0, 'ceil') * 1024) | int }
+          )
+        -}}
+      {%- endfor -%}
+      {{- range(output | max + 1) | zip((output | max + 1) * [0])
+          | items2dict(key_name=0, value_name=1)
+          | combine(output) -}}
+  block:
+    - name: Get NUMA info for DPDK devices
+      ansible.builtin.shell:
+        cmd: |
+          set -o errexit
+          {% for k, v in _dpdk_pci_addr_per_iface.items() %}
+          NODE="$(head -1 '/sys/bus/pci/devices/{{ v }}/numa_node' || echo '0')"
+          echo "{{ k }};$(( NODE < 0 ? 0 : NODE ))"
+          {% endfor %}
+        executable: /bin/bash
+      changed_when: false
+      register: shell_pci_addr_per_iface
+
+    - name: Decide what to do with 'other_config:dpdk-socket-mem' (update in-place)
+      ansible.builtin.set_fact:
+        ovs: >-
+          {{ ovs | combine({
+            "set": ovs.set | d([])
+                   | rejectattr('other_config:dpdk-socket-mem', 'defined')
+                   | union(_update),
+          }, recursive=true) }}
+      vars:
+        _update:
+          - "other_config:dpdk-socket-mem": >-
+              {{ _estimated
+                 if (_dpdk_socket_mem == 'auto') else
+                 _dpdk_socket_mem }}
+        _estimated: >-
+          {{ (_dpdk_socket_mem_per_port_model | dictsort | map(attribute=1) | join(','))
+             if _per_port_memory else
+             (_dpdk_socket_mem_shared_model | dictsort | map(attribute=1) | join(',')) }}
diff --git a/roles/openvswitch/templates/dpdk-socket-mem-scan.sh.jinja b/roles/openvswitch/templates/dpdk-socket-mem-scan.sh.jinja
new file mode 100644
index 0000000..95a94bf
--- /dev/null
+++ b/roles/openvswitch/templates/dpdk-socket-mem-scan.sh.jinja
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# managed by one-deploy; vim:syn=bash:
+set -o errexit
+
+VSWITCHD_PID="$(systemctl show --property MainPID --value ovs-vswitchd.service)"
+
+if ! (( VSWITCHD_PID > 0 )); then
+  exit
+fi
+
+gawk -f- "/proc/$VSWITCHD_PID/numa_maps" <<'AWK'
+BEGIN {
+  PROCINFO["sorted_in"] = "@ind_num_asc"
+}
+
+match($0, /\<huge\>/) && match($0, /\<N([0-9]+)=([0-9]+)\>/, N) && match($0, /\<kernelpagesize_kB=([0-9]+)\>/, P) {
+  seen[N[1]] += N[2] * int(P[1] / 1024)
+}
+
+END {
+  for (node in seen) {
+    max_node = (node + 0 > max_node) ? node + 0 : max_node
+  }
+
+  for (node = 0; node <= max_node; node++) {
+    joined = (joined == "" ? "" : joined ",") ((node in seen) ? seen[node] : 0)
+  }
+
+  print joined
+}
+AWK
diff --git a/roles/openvswitch/templates/dpdk-socket-mem.service.jinja b/roles/openvswitch/templates/dpdk-socket-mem.service.jinja
new file mode 100644
index 0000000..3e5d3cd
--- /dev/null
+++ b/roles/openvswitch/templates/dpdk-socket-mem.service.jinja
@@ -0,0 +1,22 @@
+# managed by one-deploy; vim:syn=systemd:
+[Unit]
+Description=OVS DPDK Socket Memory Pre-allocation
+After=ovsdb-server.service
+Requires=ovsdb-server.service
+DefaultDependencies=no
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+EnvironmentFile=
+ExecStart=/usr/local/sbin/dpdk-socket-mem.sh
+TimeoutStartSec=120
+StandardOutput=journal
+StandardError=journal
+# Ensure the service runs in a clean environment
+Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+# HugeTLB operations require root
+User=root
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/openvswitch/templates/dpdk-socket-mem.sh.jinja b/roles/openvswitch/templates/dpdk-socket-mem.sh.jinja
new file mode 100644
index 0000000..fd86471
--- /dev/null
+++ b/roles/openvswitch/templates/dpdk-socket-mem.sh.jinja
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# managed by one-deploy
+set -o errexit
+
+{% if not _dpdk_enabled %}
+echo 'DPDK has been disabled' >&2
+exit
+{% endif %}
+
+WANT='{{ ovs.set | d([]) | selectattr("other_config:dpdk-socket-mem", "defined") | map(attribute="other_config:dpdk-socket-mem") | first | d("") }}'
+HAVE="$(/usr/local/sbin/dpdk-socket-mem-scan.sh)"
+
+echo "'$HAVE' -> '$WANT'" >&2
+
+if [[ "$HAVE" == "$WANT" ]]; then
+  echo 'Nothing to do' >&2
+  exit
+fi
+
+IFS=',' read -ra _WANT <<< "$WANT"
+IFS=',' read -ra _HAVE <<< "$HAVE"
+
+for NODE in "${!_WANT[@]}"; do
+  HELD="$(head -1 "/sys/devices/system/node/node$NODE/hugepages/hugepages-1048576kB/nr_hugepages")"
+  FREE="$(head -1 "/sys/devices/system/node/node$NODE/hugepages/hugepages-1048576kB/free_hugepages")"
+
+  (( NEED = (_WANT[$NODE] - _HAVE[$NODE]) / 1024, 1 ))
+
+  if (( NEED > FREE )); then
+    echo "$(( HELD + NEED - FREE ))" > "/sys/devices/system/node/node$NODE/hugepages/hugepages-1048576kB/nr_hugepages"
+    CHANGED=1
+  fi
+done
+
+{% for v in ovs.set | d([]) | map('dict2items') | flatten %}
+ovs-vsctl --no-wait set Open_vSwitch . '{{ v.key }}={{ v.value }}'
+{% endfor %}
+
+if (( CHANGED > 0 )); then
+  systemctl restart --no-block ovs-vswitchd.service
+fi
+
+echo 'Done' >&2
+exit
diff --git a/roles/openvswitch/templates/opennebula-ovs.service.jinja b/roles/openvswitch/templates/opennebula-ovs.service.jinja
index 14e5866..5c5ec45 100644
--- a/roles/openvswitch/templates/opennebula-ovs.service.jinja
+++ b/roles/openvswitch/templates/opennebula-ovs.service.jinja
@@ -1,6 +1,7 @@
+# managed by one-deploy; vim:syn=systemd:
 [Unit]
-Description=OVS Bridge Interface Network configuration
-{% if ansible_os_family == 'Debian' %}
+Description=OVS Network Configuration
+{% if ansible_os_family in ['Debian'] %}
 After=openvswitch-switch.service network-pre.target
 Wants=network-pre.target
 Before=network.target
diff --git a/roles/openvswitch/templates/opennebula-ovs.sh.jinja b/roles/openvswitch/templates/opennebula-ovs.sh.jinja
index 3eaaf4f..0290f9d 100644
--- a/roles/openvswitch/templates/opennebula-ovs.sh.jinja
+++ b/roles/openvswitch/templates/opennebula-ovs.sh.jinja
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+# managed by one-deploy
 set -o errexit -o pipefail
 
 # --- Helper functions
@@ -9,7 +10,7 @@ die() { log "ERROR: $*"; exit 1; }
 # --- Basic assertions
 
 log 'Asserting required binaries and scripts are available'
-type -p arping find ip jq ovs-vsctl &>/dev/null
+type -p arping find ip jq ovs-vsctl systemctl &>/dev/null
 {% if _dpdk_enabled %}
 type -p dpdk-devbind.py ethtool &>/dev/null
 {% endif %}
@@ -64,7 +65,7 @@ done
 
 # --- Networking cleanup
 
-if type -p systemctl &>/dev/null && systemctl is-active --quiet NetworkManager; then
+if systemctl is-active --quiet NetworkManager; then
   log 'Stopping and disabling NetworkManager'
   systemctl disable NetworkManager || log 'WARNING: Failed to disable NetworkManager'
   systemctl stop NetworkManager || log 'WARNING: Failed to stop NetworkManager'
diff --git a/roles/openvswitch/templates/ovs-vswitchd.service.jinja b/roles/openvswitch/templates/ovs-vswitchd.service.jinja
new file mode 100644
index 0000000..fb610e8
--- /dev/null
+++ b/roles/openvswitch/templates/ovs-vswitchd.service.jinja
@@ -0,0 +1,4 @@
+# managed by one-deploy; vim:syn=systemd:
+[Unit]
+After=dpdk-socket-mem.service
+Requires=dpdk-socket-mem.service