From 4f90796d474ed52b7e37e49858d09bd66179afed Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 22 Mar 2024 10:44:22 +0000 Subject: [PATCH] Add StackHPC Ironic tunings --- doc/source/configuration/index.rst | 1 + doc/source/configuration/ironic.rst | 31 ++++++++++ etc/kayobe/kolla/config/ironic-inspector.conf | 13 ++++ .../kolla/config/ironic/ironic-api.conf | 6 ++ .../kolla/config/ironic/ironic-conductor.conf | 60 +++++++++++++++++++ etc/kayobe/kolla/config/neutron.conf | 12 ++++ etc/kayobe/kolla/config/nova.conf | 13 ++++ .../config/nova/nova-compute-ironic.conf | 15 ++++- etc/kayobe/kolla/globals.yml | 29 +++++++++ 9 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 doc/source/configuration/ironic.rst create mode 100644 etc/kayobe/kolla/config/ironic-inspector.conf create mode 100644 etc/kayobe/kolla/config/ironic/ironic-api.conf create mode 100644 etc/kayobe/kolla/config/ironic/ironic-conductor.conf create mode 100644 etc/kayobe/kolla/config/neutron.conf diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index 9c423c352c..5cca8ada2d 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -10,6 +10,7 @@ the various features provided. release-train host-images + ironic lvm cephadm monitoring diff --git a/doc/source/configuration/ironic.rst b/doc/source/configuration/ironic.rst new file mode 100644 index 0000000000..9bcdf121c5 --- /dev/null +++ b/doc/source/configuration/ironic.rst @@ -0,0 +1,31 @@ +====== +Ironic +====== + +Cleaning +======== + +Storage +------- + +Hardware assisted secure erase, i.e. the ``erase_devices`` clean step, is +enabled by default. Its priority is normally dependent on the `Hardware Manager +`__ +in use. For example, when using the GenericHardwareManager the priority would +be 10, whereas if using the `ProliantHardwareManager +`__ +it would be 0. 
The idea is that we will prevent the catastrophic case where +data could be leaked to another tenant, forcing you to explicitly relax +this setting if that is a risk you want to take. This can be customised by +editing the following variables: + +.. code-block:: + :caption: $KAYOBE_CONFIG_PATH/kolla/config/ironic/ironic-conductor.conf + + [deploy] + erase_devices_priority=10 + erase_devices_metadata_priority=0 + +See `Ironic documentation +`__ for more +details. diff --git a/etc/kayobe/kolla/config/ironic-inspector.conf b/etc/kayobe/kolla/config/ironic-inspector.conf new file mode 100644 index 0000000000..7b8635793f --- /dev/null +++ b/etc/kayobe/kolla/config/ironic-inspector.conf @@ -0,0 +1,13 @@ +[DEFAULT] +timeout = 0 +{% if "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. 
+rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/kolla/config/ironic/ironic-api.conf b/etc/kayobe/kolla/config/ironic/ironic-api.conf new file mode 100644 index 0000000000..117d6d3d77 --- /dev/null +++ b/etc/kayobe/kolla/config/ironic/ironic-api.conf @@ -0,0 +1,6 @@ +[DEFAULT] +# Avoid some timeouts of heartbeats and vif deletes +rpc_response_timeout = 360 + +[neutron] +timeout = 300 diff --git a/etc/kayobe/kolla/config/ironic/ironic-conductor.conf b/etc/kayobe/kolla/config/ironic/ironic-conductor.conf new file mode 100644 index 0000000000..ad03c56f2b --- /dev/null +++ b/etc/kayobe/kolla/config/ironic/ironic-conductor.conf @@ -0,0 +1,60 @@ +[DEFAULT] +# Make direct deploy faster, transfer sparse qcow2 images +force_raw_images = False +# Avoid some rpc timeouts +rpc_response_timeout = 360 + +[conductor] +automated_clean=true +# We have busy conductors failing to heartbeat +# Default is 10 secs +heartbeat_interval = 30 +# Default is 60 seconds +heartbeat_timeout = 360 +sync_local_state_interval = 360 + +# Normally this is 100. We see eventlet threads +# not making much progress, so for safety reduce +# this by half, should leave work on rabbit queue +workers_pool_size = 50 +# Normally this is 8, keep it the same +period_max_workers = 8 + +# Increase power sync interval to reduce load +sync_power_state_interval = 120 +power_failure_recovery_interval = 120 +# Check for orphan allocations less often to reduce load +check_allocations_interval = 120 + +# Wait much longer before provision timeout check, to reduce background load +# The default is 60 seconds +check_provision_state_interval = 120 +check_rescue_state_interval = 120 + +[database] +# Usually this is 50, reduce to stop DB connection timeouts +# and instead just make eventlet threads wait a bit longer +max_overflow = 5 +# By default this is 30 seconds, but as we reduce +# the pool overflow, some people will need to wait longer +pool_timeout = 60 + +[deploy] +# Force Hardware assisted secure erase by default. 
+erase_devices_priority=10 +erase_devices_metadata_priority=0 + +[pxe] +# Increase cache size to 120GB and TTL to 28 hours +image_cache_size = 122880 +image_cache_ttl = 100800 + +[neutron] +# Increase the neutron client timeout to allow for the slow management +# switches. +timeout = 300 +request_timeout = 300 + +[glance] +# Retry image download at least once on failure +num_retries = 1 diff --git a/etc/kayobe/kolla/config/neutron.conf b/etc/kayobe/kolla/config/neutron.conf new file mode 100644 index 0000000000..60587aecb0 --- /dev/null +++ b/etc/kayobe/kolla/config/neutron.conf @@ -0,0 +1,12 @@ +[DEFAULT] +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. +rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/kolla/config/nova.conf b/etc/kayobe/kolla/config/nova.conf index d1000be56b..585bd402cb 100644 --- a/etc/kayobe/kolla/config/nova.conf +++ b/etc/kayobe/kolla/config/nova.conf @@ -1,2 +1,15 @@ +[DEFAULT] +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. 
+rpc_response_timeout = 360 +{% endif %} + [libvirt] hw_machine_type = x86_64=q35 diff --git a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf index 9f6db7a552..ce046f3fcb 100644 --- a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf +++ b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf @@ -1,4 +1,17 @@ -{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %} [DEFAULT] +{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %} host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }} {% endif %} +# Raise the limit on the number of concurrent builds for the nova ironic compute +# service. +max_concurrent_builds = 35 + +force_config_drive = True + +[ironic] +# Ramp up maximum retries to allow time for baremetal node reboot and switch configs +api_max_retries = 720 + +[compute] +# Don't disable the compute service due to failed builds. +consecutive_build_service_disable_threshold = 0 diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index d7ef2198bc..f374a29d41 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -65,3 +65,32 @@ rabbitmq_image: "{% raw %}{{ docker_registry ~ '/' if docker_registry else '' }} # https://github.com/sysown/proxysql/issues/4877 or K-A bumps ProxySQL to 3.x. database_enable_tls_internal: false database_enable_tls_backend: false + +############################################################################# +# HAProxy tunings + +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# NOTE: We are increasing the HAProxy timeouts to 5 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. 
+# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. +haproxy_client_timeout: 5m30 +haproxy_server_timeout: 5m30 +# If using Neutron backend TLS: +neutron_tls_proxy_client_timeout: 5m30 +neutron_tls_proxy_server_timeout: 5m30 +{% endif %} + +############################################################################# +# Neutron tunings + +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +neutron_tls_proxy_client_timeout: 5m30 +neutron_tls_proxy_server_timeout: 5m30 +{% endif %} + +#############################################################################