diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index 9c423c352c..99fdaf3bf1 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -10,6 +10,7 @@ the various features provided. release-train host-images + ironic lvm cephadm monitoring @@ -22,3 +23,4 @@ the various features provided. ci-cd cloudkitty ipa + stackhpc-mixin-environments diff --git a/doc/source/configuration/ironic.rst b/doc/source/configuration/ironic.rst new file mode 100644 index 0000000000..f5b2e50fdc --- /dev/null +++ b/doc/source/configuration/ironic.rst @@ -0,0 +1,11 @@ +====== +Ironic +====== + +Mixin environments +------------------ + +The following mixin environments are provided to customise the Ironic configuration: + +* :ref:`mixin-baremetal` - StackHPC opinionated defaults for Ironic. +* :ref:`mixin-baremetal-policy` - Policy tweaks for Ironic. diff --git a/doc/source/configuration/stackhpc-mixin-environments.rst b/doc/source/configuration/stackhpc-mixin-environments.rst new file mode 100644 index 0000000000..fa8950e8b7 --- /dev/null +++ b/doc/source/configuration/stackhpc-mixin-environments.rst @@ -0,0 +1,35 @@ +.. _stackhpc-mixin-environments: + +=========================== +StackHPC Mixin Environments +=========================== + +StackHPC Kayobe configuration provides a set of mixin environments, which can +be used to apply configuration in a modular way. These provide a mechanism where +users can opt into new sets of configuration mid-cycle, at a time of their +choosing, and thereby facilitate gradual adoption of new features. Config may +be moved into the base configuration for the next major release. + +For more information about Kayobe environments, please see the `upstream Kayobe +documentation +`__. + +.. note:: + + To override settings in mixin environments, you will need to define the + overrides in an environment that inherits from that one, rather than in the + base configuration. + +.. 
_mixin-baremetal: + +baremetal +--------- + +.. include:: ../../../etc/kayobe/environments/baremetal/README.rst + +.. _mixin-baremetal-policy: + +baremetal-policy +---------------- + +.. include:: ../../../etc/kayobe/environments/baremetal-policy/README.rst diff --git a/etc/kayobe/environments/baremetal-policy/README.rst b/etc/kayobe/environments/baremetal-policy/README.rst index 88dafb4acd..1224c224e3 100644 --- a/etc/kayobe/environments/baremetal-policy/README.rst +++ b/etc/kayobe/environments/baremetal-policy/README.rst @@ -1,5 +1,5 @@ Policy for a baremetaluser role -=============================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When deploying Slurm on baremetal nodes, it is typical to select a specific baremetal node, and give it the expected hostname. We allow this via a tweak to @@ -11,3 +11,20 @@ not own the network. We should never use the admin role to do these operations, as it has far too much privilege. + +Consuming this environment +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add the ``baremetal-policy`` environment to your ``.kayobe-environment`` file: + +.. code-block:: yaml + :caption: $KAYOBE_CONFIG_PATH/$KAYOBE_ENVIRONMENT/.kayobe-environment + + dependencies: + - baremetal-policy + +Redeploy Neutron, and Nova: + +.. code-block:: console + + kayobe overcloud service deploy -kt neutron,nova diff --git a/etc/kayobe/environments/baremetal/README.rst b/etc/kayobe/environments/baremetal/README.rst new file mode 100644 index 0000000000..a5b65987d6 --- /dev/null +++ b/etc/kayobe/environments/baremetal/README.rst @@ -0,0 +1,49 @@ +StackHPC Ironic environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Mixin that adds StackHPC opinionated defaults for Ironic. + +Consuming this environment +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add the ``baremetal`` environment to your ``.kayobe-environment`` file: + +.. code-block:: yaml + :caption: $KAYOBE_CONFIG_PATH/$KAYOBE_ENVIRONMENT/.kayobe-environment + + dependencies: + - baremetal + +Redeploy the loadbalancer, Neutron, Nova, and Ironic: + +.. 
code-block:: console + + kayobe overcloud service deploy -kt loadbalancer,ironic,nova,neutron + +Cleaning +^^^^^^^^ + +Storage +""""""" + +Hardware assisted secure erase, i.e. the ``erase_devices`` clean step, is +enabled by default. This is normally dependent on the `Hardware Manager +`__ +in use. For example, when using the GenericHardwareManager the priority would +be 10, whereas if using the `ProliantHardwareManager +`__ +it would be 0. The idea is that we will prevent the catastrophic case where +data could be leaked to another tenant, forcing you to have to explicitly relax +this setting if this is a risk you want to take. This can be customised by +editing the following variables: + +.. code-block:: ini + :caption: $KAYOBE_CONFIG_PATH/$KAYOBE_ENVIRONMENT/kolla/config/ironic/ironic-conductor.conf + + [deploy] + erase_devices_priority=10 + erase_devices_metadata_priority=0 + +See `Ironic documentation +`__ for more +details. diff --git a/etc/kayobe/environments/baremetal/kolla/config/ironic-inspector.conf b/etc/kayobe/environments/baremetal/kolla/config/ironic-inspector.conf new file mode 100644 index 0000000000..5bd3990d3f --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/config/ironic-inspector.conf @@ -0,0 +1,13 @@ +[DEFAULT] +timeout = 0 +{% if "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. 
+rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/environments/baremetal/kolla/config/ironic/ironic-api.conf b/etc/kayobe/environments/baremetal/kolla/config/ironic/ironic-api.conf new file mode 100644 index 0000000000..117d6d3d77 --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/config/ironic/ironic-api.conf @@ -0,0 +1,6 @@ +[DEFAULT] +# Avoid some timeouts of heartbeats and vif deletes +rpc_response_timeout = 360 + +[neutron] +timeout = 300 diff --git a/etc/kayobe/environments/baremetal/kolla/config/ironic/ironic-conductor.conf b/etc/kayobe/environments/baremetal/kolla/config/ironic/ironic-conductor.conf new file mode 100644 index 0000000000..9a295bd4ef --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/config/ironic/ironic-conductor.conf @@ -0,0 +1,60 @@ +[DEFAULT] +# Make direct deploy faster, transfer sparse qcow2 images +force_raw_images = False +# Avoid some rpc timeouts +rpc_response_timeout = 360 + +[conductor] +automated_clean=true +# We have busy conductors failing to heartbeat +# Default is 10 secs +heartbeat_interval = 30 +# Default is 60 seconds +heartbeat_timeout = 360 +sync_local_state_interval = 360 + +# Normally this is 100. 
We see eventlet threads +# not making much progress, so for safety reduce +# this by half, should leave work on rabbit queue +workers_pool_size = 50 +# Normally this is 8, keep it same +period_max_workers = 8 + +# Increase power sync interval to reduce load +sync_power_state_interval = 120 +power_failure_recovery_interval = 120 +# Check for orphan allocations less frequently for now +check_allocations_interval = 120 + +# Wait much longer before provision timeout check, to reduce background load +# The default is 60 seconds +check_provision_state_interval = 120 +check_rescue_state_interval = 120 + +[database] +# Usually this is 50, reduce to stop DB connection timeouts +# and instead just make eventlet threads wait a bit longer +max_overflow = 5 +# By default this is 30 seconds, but as we reduce +# the pool overflow, some people will need to wait longer +pool_timeout = 60 + +[deploy] +# Force Hardware assisted secure erase by default. +erase_devices_priority=10 +erase_devices_metadata_priority=0 + +[pxe] +# Increase cache size to 120GB and TTL to 28 hours +image_cache_size = 122880 +image_cache_ttl = 100800 + +[neutron] +# Increase the neutron client timeout to allow for the slow management +# switches. +timeout = 300 +request_timeout = 300 + +[glance] +# Retry image download at least once if failure +num_retries = 1 diff --git a/etc/kayobe/environments/baremetal/kolla/config/neutron.conf b/etc/kayobe/environments/baremetal/kolla/config/neutron.conf new file mode 100644 index 0000000000..ff9f060c89 --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/config/neutron.conf @@ -0,0 +1,12 @@ +[DEFAULT] +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. 
+# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. +rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/environments/baremetal/kolla/config/nova.conf b/etc/kayobe/environments/baremetal/kolla/config/nova.conf new file mode 100644 index 0000000000..ff9f060c89 --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/config/nova.conf @@ -0,0 +1,12 @@ +[DEFAULT] +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# We are increasing the RPC response timeouts to 6 minutes due to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. +rpc_response_timeout = 360 +{% endif %} diff --git a/etc/kayobe/environments/baremetal/kolla/config/nova/nova-compute-ironic.conf b/etc/kayobe/environments/baremetal/kolla/config/nova/nova-compute-ironic.conf new file mode 100644 index 0000000000..5733a7862a --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/config/nova/nova-compute-ironic.conf @@ -0,0 +1,13 @@ +[DEFAULT] +# Increase the number of concurrent builds for the nova ironic compute service. +# This raises the default limit from 10 to 35. +max_concurrent_builds = 35 +force_config_drive = True + +[ironic] +# Ramp up maximum retries to allow time for baremetal node reboot and switch configs +api_max_retries = 720 + +[compute] +# Don't disable the compute service due to failed builds. 
+consecutive_build_service_disable_threshold = 0 diff --git a/etc/kayobe/environments/baremetal/kolla/globals.yml b/etc/kayobe/environments/baremetal/kolla/globals.yml new file mode 100644 index 0000000000..021b9f41cc --- /dev/null +++ b/etc/kayobe/environments/baremetal/kolla/globals.yml @@ -0,0 +1,25 @@ +# yamllint disable-file +--- +############################################################################# +# HAProxy tunings + +{% if kolla_enable_ironic | bool and "genericswitch" in kolla_neutron_ml2_mechanism_drivers %} +# NOTE: We are increasing the HAProxy timeouts to 5 minutes and 30 seconds due +# to the neutron +# generic switch driver, which synchronously applies switch configuration for +# each ironic port during node provisioning and tear down. +# The specific API calls that require this long timeout are: +# - Creation and deletion of VLAN networks. +# - Creation or update of ports, adding binding information. +# - Update of ports, removing binding information. +# - Deletion of ports. +haproxy_client_timeout: 5m30 +haproxy_server_timeout: 5m30 +neutron_tls_proxy_client_timeout: 5m30 +neutron_tls_proxy_server_timeout: 5m30 +{% endif %} + +############################################################################# +# Neutron tunings + +############################################################################# diff --git a/releasenotes/notes/add-baremetal-mixin-0475561bc5545d50.yaml b/releasenotes/notes/add-baremetal-mixin-0475561bc5545d50.yaml new file mode 100644 index 0000000000..21757cfb7a --- /dev/null +++ b/releasenotes/notes/add-baremetal-mixin-0475561bc5545d50.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Added the ``baremetal`` mixin environment. This is an opt-in feature. + Please see the `docs + `__ + for more details.