diff --git a/test/extended-priv/const.go b/test/extended-priv/const.go index d436969d24..6a9a6bab1e 100644 --- a/test/extended-priv/const.go +++ b/test/extended-priv/const.go @@ -47,10 +47,20 @@ const ( GCPPlatform = "gcp" // AzurePlatform value used to identify azure infrastructure AzurePlatform = "azure" + // NonePlatform value used to identify a None Platform value + NonePlatform = "none" // VspherePlatform value used to identify Vsphere infrastructure VspherePlatform = "vsphere" // BaremetalPlatform value used to identify BareMetal infrastructure BaremetalPlatform = "baremetal" + // KniPlatform value used to identify KNI infrastructure + KniPlatform = "kni" + // NutanixPlatform value used to identify Nutanix infrastructure + NutanixPlatform = "nutanix" + // OpenstackPlatform value used to identify Openstack infrastructure + OpenstackPlatform = "openstack" + // OvirtPlatform value used to identify Ovirt infrastructure + OvirtPlatform = "ovirt" // ExpirationDockerfileLabel Expiration label in Dockerfile ExpirationDockerfileLabel = `LABEL maintainer="mco-qe-team" quay.expires-after=24h` diff --git a/test/extended-priv/controller.go b/test/extended-priv/controller.go index 59ce5f2d43..612d12d325 100644 --- a/test/extended-priv/controller.go +++ b/test/extended-priv/controller.go @@ -133,6 +133,50 @@ func (mcc *Controller) HasAcquiredLease() (bool, error) { return re.MatchString(podAllLogs), nil } +// GetLogsAsList returns the MCO controller logs as a list of strings, one string per line +func (mcc *Controller) GetLogsAsList() ([]string, error) { + logs, err := mcc.GetLogs() + if err != nil { + return nil, err + } + + return strings.Split(logs, "\n"), nil +} + +// GetFilteredLogsAsList returns the filtered logs as a list of strings, one string per line. 
+func (mcc *Controller) GetFilteredLogsAsList(regex string) ([]string, error) { + logs, err := mcc.GetLogsAsList() + if err != nil { + return nil, err + } + + filteredLogs := []string{} + for _, line := range logs { + match, err := regexp.MatchString(regex, line) + if err != nil { + logger.Errorf("Error filtering log lines. Error: %s", err) + return nil, err + } + + if match { + filteredLogs = append(filteredLogs, line) + } + } + + return filteredLogs, nil +} + +// GetFilteredLogs returns the logs filtered by a regexp applied to every line. If the match is ok the log line is accepted. +// This function can return a big log so, please, try not to print the returned value in your tests +func (mcc *Controller) GetFilteredLogs(regex string) (string, error) { + logs, err := mcc.GetFilteredLogsAsList(regex) + if err != nil { + return "", err + } + + return strings.Join(logs, "\n"), nil +} + // GetNode return the node where the machine controller is running func (mcc *Controller) GetNode() (*Node, error) { controllerPodName, err := mcc.GetCachedPodName() diff --git a/test/extended-priv/machineconfig.go b/test/extended-priv/machineconfig.go index a2efcf77d5..982b5a30e8 100644 --- a/test/extended-priv/machineconfig.go +++ b/test/extended-priv/machineconfig.go @@ -12,11 +12,6 @@ import ( "k8s.io/apimachinery/pkg/util/wait" ) -// MachineConfigList handles list of nodes -type MachineConfigList struct { - ResourceList -} - // MachineConfig struct is used to handle MachineConfig resources in OCP type MachineConfig struct { Resource @@ -32,6 +27,16 @@ func NewMachineConfig(oc *exutil.CLI, name, pool string) *MachineConfig { return mc.SetTemplate(*NewMCOTemplate(oc, GenericMCTemplate)) } +// MachineConfigList handles list of MachineConfigs +type MachineConfigList struct { + ResourceList +} + +// NewMachineConfigList constructs a new MachineConfigList struct to handle all existing MachineConfigs +func NewMachineConfigList(oc *exutil.CLI) *MachineConfigList { + return &MachineConfigList{*NewResourceList(oc, 
"mc")} +} + // SetTemplate sets the template that will be used by the "create" method in order to create the MC func (mc *MachineConfig) SetTemplate(template Template) *MachineConfig { mc.Template = template @@ -149,3 +154,36 @@ func DisableSkew(machineConfiguration *MachineConfiguration) { o.Eventually(machineConfiguration.IsGenerationUpToDate, "2m", "10s").Should(o.BeTrue(), "MachineConfiguration observedGeneration did not catch up to generation") logger.Infof("Skew functionality has been disabled") } + +// GetRenderedMachineConfigForMaster returns a list with all the MCs whose name starts with "rendered-master" +func (mcl *MachineConfigList) GetRenderedMachineConfigForMaster() ([]*MachineConfig, error) { + mcl.SetItemsFilter(`?(@.metadata.ownerReferences[0].name=="master")`) + allMCs, err := mcl.GetAll() + if err != nil { + return nil, err + } + + returnMCs := []*MachineConfig{} + + for _, mc := range allMCs { + if strings.HasPrefix(mc.GetName(), "rendered-master") { + returnMCs = append(returnMCs, &MachineConfig{Resource: *mc}) + } + } + + return returnMCs, nil +} + +func (mcl *MachineConfigList) GetRenderedMachineConfigForMasterOrFail() []*MachineConfig { + renderedMcMasterList, err := mcl.GetRenderedMachineConfigForMaster() + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting the list of the machineconfigs that were created by a MCP ") + return renderedMcMasterList + +} + +// GetMCPRenderedMachineConfigsOrFail returns the list of machineconfigs reported by GetRenderedMachineConfigForMaster, failing the test on error +func (mcl *MachineConfigList) GetMCPRenderedMachineConfigsOrFail() []*MachineConfig { + renderedMcList, err := mcl.GetRenderedMachineConfigForMaster() + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting the list of the machineconfigs that were created by a MCP ") + return renderedMcList +} diff --git a/test/extended-priv/machineconfigpool.go b/test/extended-priv/machineconfigpool.go index a6c6ecd2a0..b8f41a500f 100644 --- a/test/extended-priv/machineconfigpool.go +++ 
b/test/extended-priv/machineconfigpool.go @@ -127,6 +127,20 @@ func (mcp *MachineConfigPool) GetMaxUnavailableInt() (int, error) { return maxUnavailableInt, nil } +// SetMaxUnavailable sets the value for maxUnavailable +func (mcp *MachineConfigPool) SetMaxUnavailable(maxUnavailable int) { + logger.Infof("patch mcp %v, change spec.maxUnavailable to %d", mcp.name, maxUnavailable) + err := mcp.Patch("merge", fmt.Sprintf(`{"spec":{"maxUnavailable": %d}}`, maxUnavailable)) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +// RemoveMaxUnavailable removes spec.maxUnavailable attribute from the pool config +func (mcp *MachineConfigPool) RemoveMaxUnavailable() { + logger.Infof("patch mcp %v, removing spec.maxUnavailable", mcp.name) + err := mcp.Patch("json", `[{ "op": "remove", "path": "/spec/maxUnavailable" }]`) + o.Expect(err).NotTo(o.HaveOccurred()) +} + // SetOsImageStream sets the osImageStream name for the MCP func (mcp *MachineConfigPool) SetOsImageStream(streamName string) error { logger.Infof("patch mcp %v, change spec.osImageStream.name to %s", mcp.name, streamName) @@ -1492,6 +1506,23 @@ func FilterExtensions(extensions map[string][]string, hasARM64, fips bool, osIma return filteredExtensions, extensionNames, packages } +// GetAllApplicableExtensionsToMCPOrFail returns all the extensions that are supported for the given MCP, and all the packages that will install those extensions +func GetAllApplicableExtensionsToMCPOrFail(mcp *MachineConfigPool) (extensions, packages []string) { + fips := isFIPSEnabledInClusterConfig(mcp.GetOC().AsAdmin()) + + armNodes, err := mcp.GetNodesByArchitecture(architecture.ARM64) + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting the list of ARM nodes in %s", mcp) + + osImageStream, err := GetEffectiveOsImageStream(mcp) + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting effective osImageStream from MCP %s", mcp.GetName()) + + _, extensions, packages = FilterExtensions(AllExtenstions, len(armNodes) > 0, fips, osImageStream) + + 
logger.Infof("All extensions that can be applied to %s: %s", mcp, extensions) + logger.Infof("All packages that will be installed with those extensions: %s", packages) + return extensions, packages +} + func (mcp *MachineConfigPool) GetNodesWithoutArchitecture(arch architecture.Architecture, archs ...architecture.Architecture) ([]*Node, error) { archsList := arch.String() for _, itemArch := range archs { @@ -1548,3 +1579,97 @@ func GetPoolWithArchDifferentFromOrFail(oc *exutil.CLI, arch architecture.Archit e2e.Failf("Something went wrong. There is no suitable pool to execute the test case. There is no pool with nodes using an architecture different from %s", arch) return nil } + +// GetSortedUpdatedNodes returns a list of nodes in the order that they are being updated by the MCO +// If maxUnavailable>0, then the function will fail if more than maxUnavailable nodes are being updated at the same time +func (mcp *MachineConfigPool) GetSortedUpdatedNodes(maxUnavailable int) []*Node { + timeToWait := mcp.estimateWaitDuration() + logger.Infof("Waiting %s in pool %s for all nodes to start updating.", timeToWait, mcp.name) + + poolNodes, errget := mcp.GetNodes() + o.Expect(errget).NotTo(o.HaveOccurred(), fmt.Sprintf("Cannot get nodes in pool %s", mcp.GetName())) + + pendingNodes := poolNodes + updatedNodes := []*Node{} + immediate := false + err := wait.PollUntilContextTimeout(context.TODO(), 20*time.Second, timeToWait, immediate, func(_ context.Context) (bool, error) { + // If there are degraded machines, stop polling, directly fail + degradedstdout, degradederr := mcp.getDegradedMachineCount() + if degradederr != nil { + logger.Errorf("the err:%v, and try next round", degradederr) + return false, nil + } + + if degradedstdout != 0 { + logger.Errorf("Degraded MC:\n%s", mcp.PrettyString()) + exutil.AssertWaitPollNoErr(fmt.Errorf("Degraded machines"), fmt.Sprintf("mcp %s has degraded %d machines", mcp.name, degradedstdout)) + } + + // Check that there aren't more than 
maxUpdatingNodes updating at the same time + if maxUnavailable > 0 { + totalUpdating := 0 + for _, node := range poolNodes { + isUpdating, err := node.IsUpdating() + if err != nil { + logger.Errorf("Error getting IsUpdating state for node %s: %v", node.GetName(), err) + return false, err + } + if isUpdating { + totalUpdating++ + } + } + if totalUpdating > maxUnavailable { + // print nodes for debug + mcp.oc.Run("get").Args("nodes").Execute() + exutil.AssertWaitPollNoErr(fmt.Errorf("maxUnavailable Not Honored. Pool %s, error: %d nodes were updating at the same time. Only %d nodes should be updating at the same time", mcp.GetName(), totalUpdating, maxUnavailable), "") + } + } + + remainingNodes := []*Node{} + for _, node := range pendingNodes { + isUpdating, err := node.IsUpdating() + if err != nil { + logger.Errorf("Error getting IsUpdating state for node %s: %v", node.GetName(), err) + return false, err + } + if isUpdating { + logger.Infof("Node %s is UPDATING", node.GetName()) + updatedNodes = append(updatedNodes, node) + } else { + remainingNodes = append(remainingNodes, node) + } + } + + if len(remainingNodes) == 0 { + logger.Infof("All nodes have started to be updated on mcp %s", mcp.name) + return true, nil + + } + logger.Infof(" %d remaining nodes", len(remainingNodes)) + pendingNodes = remainingNodes + return false, nil + }) + + exutil.AssertWaitPollNoErr(err, fmt.Sprintf("Could not get the list of updated nodes on mcp %s", mcp.name)) + return updatedNodes +} + +// IsOCL returns true if the pool is using On Cluster Layering functionality +func (mcp MachineConfigPool) IsOCL() (bool, error) { + isOCLEnabled, err := IsFeaturegateEnabled(mcp.GetOC(), "OnClusterBuild") + if err != nil { + return false, err + } + if !isOCLEnabled { + logger.Infof("IS pool %s OCL: false", mcp.GetName()) + return false, nil + } + + mosc, err := mcp.GetMOSC() + if err != nil { + return false, err + } + isOCL := mosc != nil + logger.Infof("IS pool %s OCL: %t", mcp.GetName(), isOCL) + 
return isOCL, err +} diff --git a/test/extended-priv/mco_drain.go b/test/extended-priv/mco_drain.go new file mode 100644 index 0000000000..85e5e42143 --- /dev/null +++ b/test/extended-priv/mco_drain.go @@ -0,0 +1,322 @@ +package extended + +import ( + "context" + "fmt" + "strings" + "time" + + logger "github.com/openshift/machine-config-operator/test/extended-priv/util/logext" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + exutil "github.com/openshift/machine-config-operator/test/extended-priv/util" + "k8s.io/apimachinery/pkg/util/wait" +) + +var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/longduration][Serial][Disruptive] MCO Drain", func() { + defer g.GinkgoRecover() + + var oc = exutil.NewCLI("mco", exutil.KubeConfigPath()) + + g.JustBeforeEach(func() { + PreChecks(oc) + }) + + g.It("[PolarionID:43245][OTP] bump initial drain sleeps down to 1min", func() { + exutil.By("Start machine-config-controller logs capture") + mcc := NewController(oc.AsAdmin()) + ignoreMccLogErr := mcc.IgnoreLogsBeforeNow() + o.Expect(ignoreMccLogErr).NotTo(o.HaveOccurred(), "Ignore mcc log failed") + + exutil.By("Create a pod disruption budget to set minAvailable to 1") + oc.SetupProject() + nsName := oc.Namespace() + pdbName := "dont-evict-43245" + pdbTemplate := generateTemplateAbsolutePath("pod-disruption-budget.yaml") + pdb := PodDisruptionBudget{name: pdbName, namespace: nsName, template: pdbTemplate} + defer pdb.delete(oc) + pdb.create(oc) + + exutil.By("Create new pod for pod disruption budget") + // Not all nodes are valid. 
We need to deploy the "dont-evict-pod" and we can only do that in schedulable nodes + // In "edge" clusters, the "edge" nodes are not schedulable, so we need to be careful and not to use them to deploy our pod + schedulableNodes := FilterSchedulableNodesOrFail(NewNodeList(oc.AsAdmin()).GetAllLinuxWorkerNodesOrFail()) + o.Expect(schedulableNodes).NotTo(o.BeEmpty(), "There are no schedulable worker nodes!!") + workerNode := schedulableNodes[0] + hostname, err := workerNode.GetNodeHostname() + o.Expect(err).NotTo(o.HaveOccurred()) + podName := "dont-evict-43245" + podTemplate := generateTemplateAbsolutePath("create-pod.yaml") + pod := exutil.Pod{Name: podName, Namespace: nsName, Template: podTemplate, Parameters: []string{"HOSTNAME=" + hostname}} + defer func() { o.Expect(pod.Delete(oc)).NotTo(o.HaveOccurred()) }() + pod.Create(oc) + + exutil.By("Create new mc to add new file on the node and trigger node drain") + mcName := "test-file" + mcTemplate := "add-mc-to-trigger-node-drain.yaml" + mc := NewMachineConfig(oc.AsAdmin(), mcName, MachineConfigPoolWorker).SetMCOTemplate(mcTemplate) + mc.skipWaitForMcp = true + defer mc.DeleteWithWait() + defer func() { o.Expect(pod.Delete(oc)).NotTo(o.HaveOccurred()) }() + mc.create() + + exutil.By("Wait until node is cordoned") + o.Eventually(workerNode.Poll(`{.spec.taints[?(@.effect=="NoSchedule")].effect}`), + "20m", "1m").Should(o.Equal("NoSchedule"), fmt.Sprintf("Node %s was not cordoned", workerNode.name)) + + exutil.By("Check MCC logs to see the early sleep interval b/w failed drains") + var podLogs string + // Wait until trying drain for 3 times + // Early sleep interval will last for 10m. During this interval MCO will wait 1 minute before every retry. + // Every failed drain operation will last 1 minute. So 1 minute execution + 1 minute delay = 2 minutes for every try. 
+ // In a 10 minutes span we only have time for at most 5 "Drain failed" early sleep failures + // To have a stable test case we will take only 3 of them + immediate := false + waitErr := wait.PollUntilContextTimeout(context.TODO(), 1*time.Minute, 15*time.Minute, immediate, func(_ context.Context) (bool, error) { + logs, err := mcc.GetFilteredLogsAsList(workerNode.GetName() + ".*Drain failed") + if err != nil { + return false, fmt.Errorf("Error getting filtered logs for node %s from %v: %w", workerNode.GetName(), mcc, err) + } + if len(logs) > 2 { + // Get only 3 lines to avoid flooding the test logs, ignore the rest if any. + podLogs = strings.Join(logs[0:3], "\n") + return true, nil + } + + return false, nil + }) + logger.Infof("Drain log lines for node %s:\n %s", workerNode.GetName(), podLogs) + o.Expect(waitErr).NotTo(o.HaveOccurred(), fmt.Sprintf("Cannot get 'Drain failed' log lines from controller for node %s", workerNode.GetName())) + timestamps := filterTimestampFromLogs(podLogs, 3) + logger.Infof("Timestamps %s", timestamps) + // First 3 retries should be queued every 1 minute. We check 1 min < time < 2.7 min + o.Expect(getTimeDifferenceInMinute(timestamps[0], timestamps[1])).Should(o.BeNumerically("<=", 2.7)) + o.Expect(getTimeDifferenceInMinute(timestamps[0], timestamps[1])).Should(o.BeNumerically(">=", 1)) + o.Expect(getTimeDifferenceInMinute(timestamps[1], timestamps[2])).Should(o.BeNumerically("<=", 2.7)) + o.Expect(getTimeDifferenceInMinute(timestamps[1], timestamps[2])).Should(o.BeNumerically(">=", 1)) + + exutil.By("Check MCC logs to see the increase in the sleep interval b/w failed drains") + lWaitErr := wait.PollUntilContextTimeout(context.TODO(), 1*time.Minute, 15*time.Minute, immediate, func(_ context.Context) (bool, error) { + logs, err := mcc.GetFilteredLogsAsList(workerNode.GetName() + ".*Drain has been failing for more than 10 minutes. 
Waiting 5 minutes") + if err != nil { + return false, fmt.Errorf("Error getting filtered logs for node %s from %v: %w", workerNode.GetName(), mcc, err) + } + if len(logs) > 1 { + // Get only 2 lines to avoid flooding the test logs, ignore the rest if any. + podLogs = strings.Join(logs[0:2], "\n") + return true, nil + } + + return false, nil + }) + logger.Infof("Long wait drain log lines for node %s:\n %s", workerNode.GetName(), podLogs) + o.Expect(lWaitErr).NotTo(o.HaveOccurred(), + fmt.Sprintf("Cannot get 'Drain has been failing for more than 10 minutes. Waiting 5 minutes' log lines from controller for node %s", + workerNode.GetName())) + // Following developers' advice we don't check the time span between long wait log lines. Read: + // https://github.com/openshift/machine-config-operator/pull/3178 + // https://bugzilla.redhat.com/show_bug.cgi?id=2092442 + }) + + g.It("[PolarionID:51381][OTP] cordon node before node drain. OCP >= 4.11", func() { + exutil.By("Capture initial migration-controller logs") + ctrlerContainer := "machine-config-controller" + ctrlerPod, podsErr := getMachineConfigControllerPod(oc) + o.Expect(podsErr).NotTo(o.HaveOccurred()) + o.Expect(ctrlerPod).NotTo(o.BeEmpty()) + + initialCtrlerLogs, initErr := exutil.GetSpecificPodLogs(oc, MachineConfigNamespace, ctrlerContainer, ctrlerPod, "") + o.Expect(initErr).NotTo(o.HaveOccurred()) + + exutil.By("Create a MC to deploy a config file") + fileMode := "0644" // decimal 420 + filePath := "/etc/chrony.conf" + fileContent := "pool 0.rhel.pool.ntp.org iburst\ndriftfile /var/lib/chrony/drift\nmakestep 1.0 3\nrtcsync\nlogdir /var/log/chrony" + fileConfig := getBase64EncodedFileConfig(filePath, fileContent, fileMode) + + mcName := "ztc-51381-change-workers-chrony-configuration" + mc := NewMachineConfig(oc.AsAdmin(), mcName, MachineConfigPoolWorker) + defer mc.DeleteWithWait() + + err := mc.Create("-p", "NAME="+mcName, "-p", "POOL=worker", "-p", fmt.Sprintf("FILES=[%s]", fileConfig)) + 
o.Expect(err).NotTo(o.HaveOccurred()) + + exutil.By("Check MCD logs to make sure that the node is cordoned before being drained") + mcp := NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + workerNode := mcp.GetSortedNodesOrFail()[0] + + o.Eventually(workerNode.IsCordoned, mcp.estimateWaitDuration().String(), "20s").Should(o.BeTrue(), "Worker node must be cordoned") + + searchRegexp := fmt.Sprintf("(?s)%s: initiating cordon", workerNode.GetName()) + if !workerNode.IsEdgeOrFail() { + // In edge nodes there is no node evicted because they are unschedulable so no pod is running + searchRegexp += fmt.Sprintf(".*node %s: Evicted pod", workerNode.GetName()) + } + searchRegexp += fmt.Sprintf(".*node %s: operation successful; applying completion annotation", workerNode.GetName()) + + o.Eventually(func() string { + podAllLogs, err := exutil.GetSpecificPodLogs(oc, MachineConfigNamespace, ctrlerContainer, ctrlerPod, "") + if err != nil { + return fmt.Sprintf("Error getting pod logs: %v", err) + } + // Remove the part of the log captured at the beginning of the test. + // We only check the part of the log that this TC generates and ignore the previously generated logs + return strings.Replace(podAllLogs, initialCtrlerLogs, "", 1) + }, "5m", "10s").Should(o.MatchRegexp(searchRegexp), "Node should be cordoned before being drained") + + exutil.By("Wait until worker MCP has finished the configuration. 
No machine should be degraded.") + mcp.waitForComplete() + + exutil.By("Verfiy file content and permissions") + rf := NewRemoteFile(workerNode, filePath) + rferr := rf.Fetch() + o.Expect(rferr).NotTo(o.HaveOccurred()) + + o.Expect(rf.GetTextContent()).To(o.Equal(fileContent)) + o.Expect(rf.GetNpermissions()).To(o.Equal(fileMode)) + }) + + g.It("[PolarionID:49568][OTP] Check nodes updating order maxUnavailable=1", func() { + + mcp := NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + + // In OCL nodes are scaled up using the original osImage, and then MCO applies an update on them + // To avoid problems we do not scale up new nodes if OCL is enabled + // Once OCL is able to boot the nodes directly with the right image we can scale up nodes if the pool is OCL + if exutil.OrFail[bool](WorkersCanBeScaled(oc.AsAdmin())) && !exutil.OrFail[bool](mcp.IsOCL()) { + exutil.By("Scale machinesets and 1 more replica to make sure we have at least 2 nodes per machineset") + platform := exutil.CheckPlatform(oc) + logger.Infof("Platform is %s", platform) + if platform != NonePlatform && platform != "" { + err := AddToAllMachineSets(oc, 1) + o.Expect(err).NotTo(o.HaveOccurred()) + defer func() { + o.Expect(AddToAllMachineSets(oc, -1)).NotTo(o.HaveOccurred()) + mcp.waitForComplete() + }() + } else { + logger.Infof("Platform is %s, skipping the MachineSets replica configuration", platform) + } + } else { + logger.Infof("The worker pool cannot be scaled using machinesets or it is OCL. 
Skip adding new nodes") + } + + exutil.By("Get the nodes in the worker pool sorted by update order") + workerNodes, errGet := mcp.GetSortedNodes() + o.Expect(errGet).NotTo(o.HaveOccurred()) + + exutil.By("Create a MC to deploy a config file") + filePath := "/etc/TC-49568-mco-test-file-order" + fileContent := "MCO test file order\n" + fileMode := "0400" // decimal 256 + fileConfig := getURLEncodedFileConfig(filePath, fileContent, fileMode) + + mcName := "mco-test-file-order" + mc := NewMachineConfig(oc.AsAdmin(), mcName, MachineConfigPoolWorker) + defer mc.DeleteWithWait() + + err := mc.Create("-p", "NAME="+mcName, "-p", "POOL=worker", "-p", fmt.Sprintf("FILES=[%s]", fileConfig)) + o.Expect(err).NotTo(o.HaveOccurred()) + + exutil.By("Poll the nodes sorted by the order they are updated") + maxUnavailable := 1 + updatedNodes := mcp.GetSortedUpdatedNodes(maxUnavailable) + for _, n := range updatedNodes { + logger.Infof("updated node: %s created: %s zone: %s", n.GetName(), n.GetOrFail(`{.metadata.creationTimestamp}`), n.GetOrFail(`{.metadata.labels.topology\.kubernetes\.io/zone}`)) + } + + exutil.By("Wait for the configuration to be applied in all nodes") + mcp.waitForComplete() + + exutil.By("Check that nodes were updated in the right order") + rightOrder := checkUpdatedLists(workerNodes, updatedNodes, maxUnavailable) + o.Expect(rightOrder).To(o.BeTrue(), "Expected update order %s, but found order %s", workerNodes, updatedNodes) + + exutil.By("Verfiy file content and permissions") + rf := NewRemoteFile(workerNodes[0], filePath) + rferr := rf.Fetch() + o.Expect(rferr).NotTo(o.HaveOccurred()) + + o.Expect(rf.GetTextContent()).To(o.Equal(fileContent)) + o.Expect(rf.GetNpermissions()).To(o.Equal(fileMode)) + }) + + g.It("[PolarionID:49672][OTP] Check nodes updating order maxUnavailable>1", func() { + + mcp := NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + + // In OCL nodes are scaled up using the original osImage, and then MCO applies an update on them + // 
To avoid problems we do not scale up new nodes if OCL is enabled + // Once OCL is able to boot the nodes directly with the right image we can scale up nodes if the pool is OCL + if exutil.OrFail[bool](WorkersCanBeScaled(oc.AsAdmin())) && !exutil.OrFail[bool](mcp.IsOCL()) { + exutil.By("Scale machinesets and 1 more replica to make sure we have at least 2 nodes per machineset") + platform := exutil.CheckPlatform(oc) + logger.Infof("Platform is %s", platform) + if platform != NonePlatform && platform != "" { + err := AddToAllMachineSets(oc, 1) + o.Expect(err).NotTo(o.HaveOccurred()) + defer func() { + o.Expect(AddToAllMachineSets(oc, -1)).NotTo(o.HaveOccurred()) + mcp.waitForComplete() + }() + } else { + logger.Infof("Platform is %s, skipping the MachineSets replica configuration", platform) + } + } else { + logger.Infof("The worker pool cannot be scaled using machinesets or it is OCL. Skip adding new nodes") + } + + // If the number of nodes is 2, since we are using maxUnavailable=2, all nodes will be cordoned at + // the same time and the eviction process will be stuck. In this case we need to skip the test case. + numWorkers := len(NewNodeList(oc.AsAdmin()).GetAllLinuxWorkerNodesOrFail()) + if numWorkers <= 2 { + g.Skip(fmt.Sprintf("The test case needs at least 3 worker nodes, because eviction will be stuck if not. 
Current num worker is %d, we skip the case", + numWorkers)) + } + + exutil.By("Get the nodes in the worker pool sorted by update order") + workerNodes, errGet := mcp.GetSortedNodes() + o.Expect(errGet).NotTo(o.HaveOccurred()) + + exutil.By("Set maxUnavailable value") + maxUnavailable := 2 + mcp.SetMaxUnavailable(maxUnavailable) + defer mcp.RemoveMaxUnavailable() + + exutil.By("Create a MC to deploy a config file") + filePath := "/etc/TC-49672-mco-test-file-order" + fileContent := "MCO test file order 2\n" + fileMode := "0400" // decimal 256 + fileConfig := getURLEncodedFileConfig(filePath, fileContent, fileMode) + + mcName := "mco-test-file-order2" + mc := NewMachineConfig(oc.AsAdmin(), mcName, MachineConfigPoolWorker) + defer mc.DeleteWithWait() + + err := mc.Create("-p", "NAME="+mcName, "-p", "POOL=worker", "-p", fmt.Sprintf("FILES=[%s]", fileConfig)) + o.Expect(err).NotTo(o.HaveOccurred()) + + exutil.By("Poll the nodes sorted by the order they are updated") + updatedNodes := mcp.GetSortedUpdatedNodes(maxUnavailable) + for _, n := range updatedNodes { + logger.Infof("updated node: %s created: %s zone: %s", n.GetName(), n.GetOrFail(`{.metadata.creationTimestamp}`), n.GetOrFail(`{.metadata.labels.topology\.kubernetes\.io/zone}`)) + } + + exutil.By("Wait for the configuration to be applied in all nodes") + mcp.waitForComplete() + + exutil.By("Check that nodes were updated in the right order") + rightOrder := checkUpdatedLists(workerNodes, updatedNodes, maxUnavailable) + o.Expect(rightOrder).To(o.BeTrue(), "Expected update order %s, but found order %s", workerNodes, updatedNodes) + + exutil.By("Verfiy file content and permissions") + rf := NewRemoteFile(workerNodes[0], filePath) + rferr := rf.Fetch() + o.Expect(rferr).NotTo(o.HaveOccurred()) + + o.Expect(rf.GetTextContent()).To(o.Equal(fileContent)) + o.Expect(rf.GetNpermissions()).To(o.Equal(fileMode)) + }) + +}) diff --git a/test/extended-priv/mco_machineconfigpool.go b/test/extended-priv/mco_machineconfigpool.go 
new file mode 100644 index 0000000000..289caa51f9 --- /dev/null +++ b/test/extended-priv/mco_machineconfigpool.go @@ -0,0 +1,585 @@ +package extended + +import ( + "fmt" + "regexp" + "strconv" + "strings" + + logger "github.com/openshift/machine-config-operator/test/extended-priv/util/logext" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + exutil "github.com/openshift/machine-config-operator/test/extended-priv/util" +) + +var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/longduration][Serial][Disruptive] MCO", func() { + defer g.GinkgoRecover() + + var oc = exutil.NewCLI("mco", exutil.KubeConfigPath()) + + g.JustBeforeEach(func() { + PreChecks(oc) + }) + + g.It("[PolarionID:43048][PolarionID:43064][OTP][LEVEL0] create/delete custom machine config pool", func() { + if IsCompactOrSNOCluster(oc) { + g.Skip("This test case cannot be executed in SNO or Compact clusters") + } + + mcpName := "infra" + + exutil.By("get worker node to change the label") + wMcp := NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + allWorkers := wMcp.GetNodesOrFail() + workerNode := allWorkers[0] + initialNumWorkers := len(allWorkers) + logger.Infof("OK!\n") + + exutil.By("Create custom infra mcp") + mcp, err := CreateCustomMCP(oc.AsAdmin(), mcpName, 0) + defer DeleteCustomMCP(oc.AsAdmin(), mcpName) + o.Expect(err).NotTo(o.HaveOccurred(), "Error creating a custom MCP") + logger.Infof("OK!\n") + + exutil.By("Check MCP status") + o.Consistently(mcp.pollDegradedMachineCount(), "30s", "10s").Should(o.Equal("0"), "There are degraded nodes in pool") + o.Eventually(mcp.pollDegradedStatus(), "1m", "20s").Should(o.Equal("False"), "The pool status is 'Degraded'") + o.Eventually(mcp.pollUpdatedStatus(), "1m", "20s").Should(o.Equal("True"), "The pool is reporting that it is not updated") + o.Eventually(mcp.pollMachineCount(), "1m", "10s").Should(o.Equal("0"), "The pool should report 0 machine count") + o.Eventually(mcp.pollReadyMachineCount(), "1m", 
"10s").Should(o.Equal("0"), "The pool should report 0 machine ready") + o.Eventually(wMcp.pollMachineCount(), "1m", "10s").Should(o.Equal(strconv.Itoa(initialNumWorkers)), + "The worker pool should report %d machine count", initialNumWorkers) + + logger.Infof("Custom mcp is created successfully!") + logger.Infof("OK!\n") + + exutil.By("Add label as infra to the existing node") + infraLabel := "node-role.kubernetes.io/infra" + o.Expect(workerNode.AddLabel(infraLabel, "")).To(o.Succeed()) + nodeLabel, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("nodes/" + workerNode.name).Output() + o.Expect(err).NotTo(o.HaveOccurred()) + o.Expect(nodeLabel).Should(o.ContainSubstring("infra")) + logger.Infof("OK!\n") + + exutil.By("Check MCP status") + o.Consistently(mcp.pollDegradedMachineCount(), "30s", "10s").Should(o.Equal("0"), "There are degraded nodes in pool") + o.Eventually(mcp.pollDegradedStatus(), "1m", "20s").Should(o.Equal("False"), "The pool status is 'Degraded'") + o.Eventually(mcp.pollUpdatedStatus(), "1m", "20s").Should(o.Equal("True"), "The pool is reporting that it is not updated") + o.Eventually(mcp.pollMachineCount(), "1m", "10s").Should(o.Equal("1"), "The pool should report 1 machine count") + o.Eventually(mcp.pollReadyMachineCount(), "1m", "10s").Should(o.Equal("1"), "The pool should report 1 machine ready") + o.Eventually(wMcp.pollMachineCount(), "1m", "10s").Should(o.Equal(strconv.Itoa(initialNumWorkers-1)), + "The worker pool should report %d machine count", initialNumWorkers-1) + + logger.Infof("Custom mcp is created successfully!") + logger.Infof("OK!\n") + + exutil.By("Remove custom label from the node") + o.Expect(workerNode.RemoveLabel(infraLabel)).To(o.Succeed(), "Error removing the infra label from %s", workerNode) + logger.Infof("Label removed") + logger.Infof("OK!\n") + + exutil.By("Check that the node was properly returned to the worker pool") + wMcp.waitForComplete() + logger.Infof("OK!\n") + + exutil.By("Verify that the information is 
updated in MCP") + o.Eventually(mcp.pollUpdatedStatus(), "5m", "20s").Should(o.Equal("True"), "The pool is reporting that it is not updated") + o.Eventually(mcp.pollMachineCount(), "5m", "20s").Should(o.Equal("0"), "The pool should report 0 machine count") + o.Eventually(mcp.pollReadyMachineCount(), "5m", "20s").Should(o.Equal("0"), "The pool should report 0 machine ready") + o.Consistently(mcp.pollDegradedMachineCount(), "30s", "10s").Should(o.Equal("0"), "There are degraded nodes in pool") + o.Eventually(mcp.pollDegradedStatus(), "5m", "20s").Should(o.Equal("False"), "The pool status is 'Degraded'") + logger.Infof("OK!\n") + + exutil.By("Remove custom infra mcp") + mcp.delete() + logger.Infof("OK!\n") + + exutil.By("Check custom infra mcp is deleted") + mcpOut, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("mcp/" + mcpName).Output() + o.Expect(err).Should(o.HaveOccurred()) + o.Expect(mcpOut).Should(o.ContainSubstring("NotFound")) + logger.Infof("Custom mcp is deleted successfully!") + logger.Infof("OK!\n") + }) + + /* Map of extensions and packages for each extension + { + "ipsec": {"NetworkManager-libreswan", "libreswan"}, + "usbguard": {"usbguard"}, + "kerberos": {"krb5-workstation", "libkadm5"}, + "kernel-devel": {"kernel-devel", "kernel-headers"}, + "sandboxed-containers": {"kata-containers"}, + "sysstat": {"sysstat"}, + } */ + g.It("[PolarionID:56131][PolarionID:77354][OTP][LEVEL0] Install all extensions", func() { + var ( + coreOSMcp = GetCoreOsCompatiblePool(oc.AsAdmin()) + node = coreOSMcp.GetCoreOsNodesOrFail()[0] + + query = `mcd_local_unsupported_packages{node="` + node.GetName() + `"}` + valueJSONPath = `data.result.0.value.1` + + mcName = fmt.Sprintf("mco-tc-%s-all-extensions", GetCurrentTestPolarionIDNumber()) + + applicableExtensions, expectedRpmInstalledPackages = GetAllApplicableExtensionsToMCPOrFail(coreOSMcp) + + skipDrainChecks = IsSNO(oc.AsAdmin()) // SNO clusters should NOT drain the nodes before rebooting them. 
The validator is not prepared for that. + behaviourValidatorApply = UpdateBehaviourValidator{ + SkipDrainNodesValidation: skipDrainChecks, + Checkers: []Checker{ + CommandOutputChecker{ + Command: append([]string{"rpm", "-q"}, expectedRpmInstalledPackages...), + Matcher: o.MatchRegexp("(?s)" + strings.Join(expectedRpmInstalledPackages, ".*")), + ErrorMsg: "Extensions were not properly installed", + Desc: "Checking that all available extensions were properly installed", + }, + }, + } + ) + + coreOSMcp.SetWaitingTimeForExtensionsChange() + behaviourValidatorApply.Initialize(coreOSMcp, nil) + + exutil.By("Create a MC to install all available extensions") + mc := NewMachineConfig(oc.AsAdmin(), mcName, coreOSMcp.GetName()) + mc.parameters = []string{fmt.Sprintf(`EXTENSIONS=%s`, string(MarshalOrFail(applicableExtensions)))} + mc.skipWaitForMcp = true + + defer mc.DeleteWithWait() + mc.create() + logger.Infof("OK!\n") + + behaviourValidatorApply.Validate() + + exutil.By("Check that no unsupported packages are reported") + monitor, err := exutil.NewMonitor(oc.AsAdmin()) + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting the monitor to query the metricts") + + o.Eventually(monitor.SimpleQuery, "10s", "2s").WithArguments(query).Should(HavePathWithValue(valueJSONPath, o.Equal("0")), + "There are reported unsupported packages in %s", node) + logger.Infof("OK!\n") + + CheckExtensions(node, applicableExtensions) + + exutil.By("Delete the MC") + mc.DeleteWithWait() + logger.Infof("OK!\n") + + exutil.By("Verify that extension packages where uninstalled after MC deletion") + for _, pkg := range expectedRpmInstalledPackages { + o.Expect(node.RpmIsInstalled(pkg)).To( + o.BeFalse(), + "Package %s should be uninstalled when we remove the extensions MC", pkg) + } + logger.Infof("OK!\n") + }) + + g.It("[PolarionID:42390][PolarionID:45318][OTP] add machine config without ignition version. 
Block the Machine-Config-Operator upgrade rollout if any of the pools are Degraded", func() { + createMcAndVerifyIgnitionVersion(oc, "empty ign version", "change-worker-ign-version-to-empty", "") + }) + + g.It("[PolarionID:52373][OTP] Modify proxy configuration in paused pools", func() { + + proxyValue := "http://user:pass@proxy-fake:1111" + noProxyValue := "test.52373.no-proxy.com" + + exutil.By("Get current proxy configuration") + proxy := NewResource(oc.AsAdmin(), "proxy", "cluster") + proxyInitialConfig := proxy.GetOrFail(`{.spec}`) + logger.Infof("Initial proxy configuration: %s", proxyInitialConfig) + + wmcp := NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + mmcp := NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolMaster) + + defer func() { + logger.Infof("Start TC defer block") + + logger.Infof("Restore original proxy config %s", proxyInitialConfig) + _ = proxy.Patch("json", `[{ "op": "add", "path": "/spec", "value": `+proxyInitialConfig+`}]`) + + logger.Infof("Wait for new machine configs to be rendered and paused pools to report updated status") + // We need to make sure that the config will NOT be applied, since the proxy is a fake one and if + // we dont make sure that the config proxy is reverted, the nodes will be broken and go into + // NotReady status + _ = wmcp.WaitForUpdatedStatus() + _ = mmcp.WaitForUpdatedStatus() + + logger.Infof("Unpause worker pool") + wmcp.pause(false) + + logger.Infof("Unpause master pool") + mmcp.pause(false) + + logger.Infof("End TC defer block") + }() + + exutil.By("Pause MCPs") + wmcp.pause(true) + mmcp.pause(true) + + exutil.By("Configure new proxy") + err := proxy.Patch("json", + `[{ "op": "add", "path": "/spec/httpProxy", "value": "`+proxyValue+`" }]`) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Error patching http proxy") + + err = proxy.Patch("json", + `[{ "op": "add", "path": "/spec/httpsProxy", "value": "`+proxyValue+`" }]`) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Error patching https 
proxy") + + err = proxy.Patch("json", + `[{ "op": "add", "path": "/spec/noProxy", "value": "`+noProxyValue+`" }]`) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Error patching noproxy") + + exutil.By("Verify that the proxy configuration was applied to daemonsets") + mcoDs := NewNamespacedResource(oc.AsAdmin(), "DaemonSet", MachineConfigNamespace, "machine-config-daemon") + // it should never take longer than 5 minutes to apply the proxy config under any circumstance, + // it should be considered a bug. + o.Eventually(mcoDs.Poll(`{.spec}`), "5m", "30s").Should(o.ContainSubstring(proxyValue), + "machine-config-daemon is not using the new proxy configuration: %s", proxyValue) + o.Eventually(mcoDs.Poll(`{.spec}`), "5m", "30s").Should(o.ContainSubstring(noProxyValue), + "machine-config-daemon is not using the new no-proxy value: %s", noProxyValue) + + exutil.By("Check that the operator has been marked as degraded") + mco := NewResource(oc.AsAdmin(), "co", "machine-config") + o.Eventually(mco.Poll(`{.status.conditions[?(@.type=="Degraded")].status}`), + "5m", "30s").Should(o.Equal("True"), + "machine-config Operator should report degraded status") + + o.Eventually(mco.Poll(`{.status.conditions[?(@.type=="Degraded")].message}`), + "5m", "30s").Should(o.ContainSubstring(`required MachineConfigPool master is paused and cannot sync until it is unpaused`), + "machine-config Operator is not reporting the right reason for degraded status") + + exutil.By("Restore original proxy configuration") + err = proxy.Patch("json", `[{ "op": "add", "path": "/spec", "value": `+proxyInitialConfig+`}]`) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Error patching and restoring original proxy config") + + exutil.By("Verify that the new configuration is applied to the daemonset") + // it should never take longer than 5 minutes to apply the proxy config under any circumstance, + // it should be considered a bug. 
+ o.Eventually(mcoDs.Poll(`{.spec}`), "5m", "30s").ShouldNot(o.ContainSubstring(proxyValue), + "machine-config-daemon has not restored the original proxy configuration") + o.Eventually(mcoDs.Poll(`{.spec}`), "5m", "30s").ShouldNot(o.ContainSubstring(noProxyValue), + "machine-config-daemon has not restored the original proxy configuration for 'no-proxy'") + + exutil.By("Check that the operator is not marked as degraded anymore") + o.Eventually(mco.Poll(`{.status.conditions[?(@.type=="Degraded")].status}`), + "5m", "30s").Should(o.Equal("False"), + "machine-config Operator should not report degraded status anymore") + + }) + + g.It("[PolarionID:56123][OTP] Invalid extensions should degrade the machine config pool", func() { + var ( + validExtension = "usbguard" + invalidExtension = "zsh" + mcName = "mco-tc-56123-invalid-extension" + mcp = GetCompactCompatiblePool(oc) + + expectedRDMessage = regexp.QuoteMeta(fmt.Sprintf("invalid extensions found: [%s]", invalidExtension)) // quotemeta to scape regex characters + expectedRDReason = "" + ) + + exutil.By("Create a MC with invalid extensions") + mc := NewMachineConfig(oc.AsAdmin(), mcName, mcp.GetName()) + mc.parameters = []string{fmt.Sprintf(`EXTENSIONS=["%s", "%s"]`, validExtension, invalidExtension)} + mc.skipWaitForMcp = true + + validateMcpRenderDegraded(mc, mcp, expectedRDMessage, expectedRDReason) + }) + + g.It("[PolarionID:70125][OTP] Test patch annotation way of updating a paused pool", func() { + + var ( + workerMcp = NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + mcName = "create-test-file-70125" + filePath = "/etc/test-file-70125" + fileConfig = getURLEncodedFileConfig(filePath, "test-70125", "420") + ) + + exutil.By("Pause worker pool") + workerMcp.pause(true) + o.Expect(workerMcp.IsPaused()).Should(o.BeTrue(), "worker pool is not paused") + + exutil.By("Create a MC for worker nodes") + mc := NewMachineConfig(oc.AsAdmin(), mcName, MachineConfigPoolWorker) + mc.SetMCOTemplate(GenericMCTemplate) 
+ mc.SetParams(fmt.Sprintf("FILES=[%s]", fileConfig)) + mc.skipWaitForMcp = true + + defer workerMcp.RecoverFromDegraded() + defer mc.DeleteWithWait() + // unpause the mcp first in defer logic, so nodes can be recovered automatically + defer workerMcp.pause(false) + mc.create() + + exutil.By("Patch desired MC annotation to trigger update") + // get desired rendered mc from mcp.spec.configuration.name + currentConfig, ccerr := workerMcp.getConfigNameOfStatus() + o.Expect(ccerr).NotTo(o.HaveOccurred(), "Get current MC of worker pool failed") + o.Eventually(workerMcp.getConfigNameOfSpec, "2m", "5s").ShouldNot(o.Equal(currentConfig)) + desiredConfig, dcerr := workerMcp.getConfigNameOfSpec() + o.Expect(dcerr).NotTo(o.HaveOccurred(), "Get desired MC of worker pool failed") + o.Expect(desiredConfig).NotTo(o.BeEmpty(), "Cannot get desired MC") + logger.Infof("Desired MC is: %s\n", desiredConfig) + + allWorkerNodes := NewNodeList(oc.AsAdmin()).GetAllLinuxWorkerNodesOrFail() + o.Expect(allWorkerNodes).NotTo(o.BeEmpty(), "Cannot get any worker node from worker pool") + workerNode := allWorkerNodes[0] + + logger.Infof("Start to patch annotation [machineconfiguration.openshift.io/desiredConfig] for worker node %s", workerNode.GetName()) + o.Expect(workerNode.PatchDesiredConfig(desiredConfig)).To(o.Succeed(), + "Failed to patch desiredConfig annotation on node %s", workerNode.GetName()) + + // wait update to complete + o.Eventually(workerNode.IsUpdating, "5m", "5s").Should(o.BeTrue(), "Node is not updating") + o.Eventually(workerNode.IsUpdated, "10m", "10s").Should(o.BeTrue(), "Node is not updated") + o.Eventually(workerMcp.getUpdatedMachineCount, "2m", "15s").Should(o.Equal(1), "The MCP is not properly reporting the updated node") + o.Eventually(NewRemoteFile(workerNode, filePath).Exists, "2m", "20s").Should(o.BeTrue(), "Cannot find expected file %s on node %s", filePath, workerNode.GetName()) + logger.Infof("Node %s is updated to desired MC %s", workerNode.GetName(), 
desiredConfig) + + exutil.By("Unpause worker pool") + workerMcp.pause(false) + o.Expect(workerMcp.IsPaused()).Should(o.BeFalse(), "worker pool is not unpaused") + logger.Infof("MCP worker is unpaused\n") + + exutil.By("Check worker pool is updated") + workerMcp.waitForComplete() + + exutil.By("Check file exists on all worker nodes") + for _, node := range allWorkerNodes { + o.Expect(NewRemoteFile(node, filePath).Exists()).Should(o.BeTrue(), "Cannot find expected file %s on node %s", filePath, node.GetName()) + logger.Infof("File %s can be found on node %s\n", filePath, node.GetName()) + } + + }) + + g.It("[PolarionID:72007][OTP] check node update frequencies", func() { + + exutil.By("To get node and display its nodeupdate frequiences") + + var ( + file = "/etc/kubernetes/kubelet.conf" + cmd = "nodeStatusUpdateFrequency|nodeStatusReportFrequency" + ) + nodeList, err := NewNodeList(oc.AsAdmin()).GetAllLinux() // Get all nodes + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting the list of nodes") + + for _, node := range nodeList { + if node.HasTaintEffectOrFail("NoExecute") { + logger.Infof("Node %s is tainted with 'NoExecute'. 
Validation skipped.", node.GetName()) + continue + } + nodeUpdate, err := node.DebugNodeWithChroot("grep", "-E", cmd, file) // To get nodeUpdate frequencies value + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting nodeupdate frequencies for %s", node.GetName()) + o.Expect(nodeUpdate).To(o.Or(o.ContainSubstring(`"nodeStatusUpdateFrequency": "10s"`), o.ContainSubstring(`nodeStatusUpdateFrequency: 10s`)), "Value for 'nodeStatusUpdateFrequency' is not same as expected.") + o.Expect(nodeUpdate).To(o.Or(o.ContainSubstring(`"nodeStatusReportFrequency": "5m0s"`), o.ContainSubstring(`nodeStatusReportFrequency: 5m0s`)), "Value for 'nodeStatusReportFrequency' is not same as expected.") + logger.Infof("node/%s %s", node, nodeUpdate) + } + }) + + g.It("[PolarionID:75149][OTP] Update pool with manually cordoned nodes", func() { + if IsCompactOrSNOCluster(oc.AsAdmin()) { + g.Skip("The cluster is SNO/Compact. This test cannot be executed in SNO/Compact clusters") + } + + var ( + mcp = NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + mcName = "mco-test-75149" + password = exutil.GetRandomString() + passwordHash = OrFail[string](getHashPasswd(password)) + nodeList = NewNodeList(oc.AsAdmin()) + initNumNodes = len(mcp.GetNodesOrFail()) + + checkDuration = "3m" + ) + if initNumNodes < 3 { + if !exutil.OrFail[bool](WorkersCanBeScaled(oc.AsAdmin())) { + g.Skip("A minimum of 3 worker nodes are needed to execute this test. The worker pool has less than 3 nodes and cannot be scaled up to create new nodes") + } + // TODO add only the necessary machines to reach 3 nodes + numAdd := 3 - initNumNodes + machineset := exutil.OrFail[*MachineSet](GetScalableMachineSet(oc.AsAdmin())) + o.Expect(machineset.AddToScale(numAdd)).To(o.Succeed(), + "Error addind new nodes to the cluster") + o.Expect(machineset.WaitUntilReady("15m")).To(o.Succeed(), + "Error waiting for the machineset to become ready. 
%s", machineset.PrettyString()) + mcp.waitForComplete() + + defer func() { + o.Expect(machineset.AddToScale(-numAdd)).To(o.Succeed(), + "Error removing the extra nodes from the cluster") + o.Eventually(mcp.GetNodes, "5m", "30s").Should(o.HaveLen(initNumNodes), + "The worker pool has not removed the new nodes created by the new Machineset.\n%s", mcp.PrettyString()) + mcp.waitForComplete() + }() + } + + exutil.By("Set the maxUnavailable value to 2") + mcp.SetMaxUnavailable(2) + defer mcp.RemoveMaxUnavailable() + logger.Infof("OK!\n") + + exutil.By("Manually cordon one of the nodes") + nodes := mcp.GetNodesOrFail() + cordonedNode := nodes[0] + defer cordonedNode.Uncordon() + o.Expect(cordonedNode.Cordon()).To(o.Succeed(), + "Could not cordon node %s", cordonedNode.GetName()) + logger.Infof("OK!\n") + + exutil.By("Create a new MachineConfiguration resource") + mc := NewMachineConfig(oc.AsAdmin(), mcName, mcp.GetName()) + mc.parameters = []string{fmt.Sprintf(`PWDUSERS=[{"name": "core", "passwordHash": "%s" }]`, passwordHash)} + mc.skipWaitForMcp = true + + defer mc.DeleteWithWait() + defer cordonedNode.Uncordon() + mc.create() + logger.Infof("OK!\n") + + exutil.By("Check that only one node is updated at a time (instead of 2) because the manually cordoned node counts as unavailable") + // get all nodes with status != Done + nodeList.SetItemsFilter(`?(@.metadata.annotations.machineconfiguration\.openshift\.io/state!="Done")`) + o.Consistently(func() (int, error) { + nodes, err := nodeList.GetAll() + return len(nodes), err + }, checkDuration, "10s").Should(o.BeNumerically("<", 2), + "The maximun number of nodes updated at a time should be 1, because the manually cordoned node should count as unavailable too") + logger.Infof("OK!\n") + + exutil.By("Check that all nodes are updated but the manually cordoned one") + numNodes := len(nodes) + waitDuration := mcp.estimateWaitDuration().String() + o.Eventually(mcp.getUpdatedMachineCount, waitDuration, 
"15s").Should(o.Equal(numNodes-1), + "All nodes but one should be updated. %d total nodes, expecting %d to be updated", numNodes, numNodes-1) + + // We check that the desired config for the manually cordoned node is the old config, and not the new one + o.Consistently(cordonedNode.GetDesiredMachineConfig, "2m", "20s").Should(o.Equal(mcp.getConfigNameOfStatusOrFail()), + "The manually cordoned node should not be updated. The desiredConfig value should be the old one.") + logger.Infof("OK!\n") + + exutil.By("Manually undordon the cordoned node") + o.Expect(cordonedNode.Uncordon()).To(o.Succeed(), + "Could not uncordon the manually cordoned node") + logger.Infof("OK!\n") + + exutil.By("All nodes should be updated now") + mcp.waitForComplete() + // Make sure that the cordoned node is now using the new configuration + o.Eventually(cordonedNode.GetDesiredMachineConfig, "30s", "10s").Should(o.Equal(mcp.getConfigNameOfSpecOrFail()), + "The manually cordoned node should be updated once it is uncordoned. The desiredConfig value should be the new one.") + logger.Infof("OK!\n") + }) + + g.It("[PolarionID:76108][OTP] MachineConfig inheritance. 
Canary rollout update", func() { + SkipIfCompactOrSNO(oc) // We can't create custom pools if only the master pool exists + + var ( + customMCPName = "worker-perf" + canaryMCPName = "worker-perf-canary" + mcName = "06-kdump-enable-worker-perf-tc-76108" + mcUnit = `{"enabled": true, "name": "kdump.service"}` + mcKernelArgs = "crashkernel=512M" + mc = NewMachineConfig(oc.AsAdmin(), mcName, customMCPName) + ) + + defer mc.Delete() + + exutil.By("Create custom MCP") + defer DeleteCustomMCP(oc.AsAdmin(), customMCPName) + customMcp, err := CreateCustomMCP(oc.AsAdmin(), customMCPName, 2) + o.Expect(err).NotTo(o.HaveOccurred(), "Could not create a new custom MCP") + logger.Infof("OK!\n") + + exutil.By("Create canary custom MCP") + defer DeleteCustomMCP(oc.AsAdmin(), canaryMCPName) + canaryMcp, err := CreateCustomMCP(oc.AsAdmin(), canaryMCPName, 0) + o.Expect(err).NotTo(o.HaveOccurred(), "Could not create a new custom MCP") + logger.Infof("OK!\n") + + exutil.By("Patch the canary MCP so that it uses the MCs of the custom MCP too") + o.Expect( + canaryMcp.Patch("json", `[{ "op": "add", "path": "/spec/machineConfigSelector/matchExpressions/0/values/-", "value":"`+customMCPName+`"}]`), + ).To(o.Succeed(), "Error patching MCP %s so that it uses the same MCs as MCP %s", canaryMcp.GetName(), customMcp.GetName()) + logger.Infof("OK!\n") + + exutil.By("Apply a new MC to the custom pool") + + err = mc.Create("-p", "NAME="+mcName, "-p", "POOL="+customMCPName, "-p", fmt.Sprintf("UNITS=[%s]", mcUnit), fmt.Sprintf(`KERNEL_ARGS=["%s"]`, mcKernelArgs)) + o.Expect(err).NotTo(o.HaveOccurred(), "Error creating MachineConfig %s", mc.GetName()) + + customMcp.waitForComplete() + + exutil.By("Check that the configuration was applied the nodes") + canaryNode := customMcp.GetNodesOrFail()[0] + o.Expect(canaryNode.IsKernelArgEnabled(mcKernelArgs)).Should(o.BeTrue(), "Kernel argument %s is not set in node %s", mcKernelArgs, canaryNode) + logger.Infof("OK!\n") + + exutil.By("Move one node from the 
custom pool to the canary custom pool") + startTime := canaryNode.GetDateOrFail() + + // We need to add and remove the label at the same time to avoid the node belonging to 2 custom nodes at the same time, which is forbidden + o.Expect( + MoveNodeToAnotherCustomPool(canaryNode, customMCPName, canaryMCPName), + ).To(o.Succeed(), "Error labeling node %s", canaryNode) + + o.Eventually(canaryMcp.getMachineCount, "5m", "20s").Should(o.Equal(1), + "A machine should be added to the canary MCP, but no machine was added: %s", canaryMcp.PrettyString()) + o.Eventually(customMcp.getMachineCount, "5m", "20s").Should(o.Equal(1), + "A machine should be removed from the custom MCP: %s", customMcp.PrettyString()) + canaryMcp.waitForComplete() + logger.Infof("OK!\n") + + exutil.By("Check that the configuration is still applied to the canary node") + o.Expect(canaryNode.IsKernelArgEnabled(mcKernelArgs)).Should(o.BeTrue(), "Kernel argument %s is not set in node %s", mcKernelArgs, canaryNode) + logger.Infof("OK!\n") + + exutil.By("Check that the node was not restarted when it was added to the canary pool") + checkRebootAction(false, canaryNode, startTime) + logger.Infof("OK!\n") + }) + + g.It("[PolarionID:85073][OTP] automatically re-cordons manually uncordoned node during update", func() { + var ( + tcID = GetCurrentTestPolarionIDNumber() + mcName = fmt.Sprintf("mco-tc-%s", tcID) + filePath = fmt.Sprintf("/etc/test-file-%s.test", tcID) + fileConfig = getBase64EncodedFileConfig(filePath, fmt.Sprintf("test-content-%s", tcID), "0644") + mcp = GetCompactCompatiblePool(oc.AsAdmin()) + firstNode = mcp.GetSortedNodesOrFail()[0] + ) + + exutil.By(fmt.Sprintf("Create MC that creates a file %s in worker pool nodes", filePath)) + mc := NewMachineConfig(oc.AsAdmin(), mcName, mcp.GetName()) + mc.SetParams(fmt.Sprintf("FILES=[%s]", fileConfig)) + mc.skipWaitForMcp = true + defer mc.DeleteWithWait() + mc.create() + logger.Infof("OK!\n") + + exutil.By("Wait until the first worker node is cordoned") + 
o.Eventually(firstNode.IsCordoned, "20m", "30s").Should(o.BeTrue(), + "Node %s was not cordoned after creating the MC", firstNode.GetName()) + logger.Infof("OK!\n") + + exutil.By("Uncordon the cordoned node") + o.Expect(firstNode.Uncordon()).To(o.Succeed(), "Could not uncordon node %s", firstNode.GetName()) + logger.Infof("OK!\n") + + exutil.By("Wait for the uncordoned node to be automatically cordoned again by MCO") + o.Eventually(firstNode.IsCordoned, "5m", "10s").Should(o.BeTrue(), + "Node %s was not automatically re-cordoned by MCO after manual uncordon", firstNode.GetName()) + logger.Infof("OK!\n") + + exutil.By("Wait for MCP to complete the update") + mcp.waitForComplete() + logger.Infof("OK!\n") + + exutil.By("Verify that the file was correctly created in the node") + o.Expect(NewRemoteFile(firstNode, filePath).Exists()).Should(o.BeTrue(), + "File %s was not created in node %s", filePath, firstNode.GetName()) + logger.Infof("OK!\n") + }) + +}) diff --git a/test/extended-priv/mco_prune.go b/test/extended-priv/mco_prune.go new file mode 100644 index 0000000000..d9cd4445cb --- /dev/null +++ b/test/extended-priv/mco_prune.go @@ -0,0 +1,266 @@ +package extended + +import ( + "fmt" + "regexp" + + logger "github.com/openshift/machine-config-operator/test/extended-priv/util/logext" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + exutil "github.com/openshift/machine-config-operator/test/extended-priv/util" +) + +var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/longduration][Serial][Disruptive] MCO", func() { + defer g.GinkgoRecover() + + var oc = exutil.NewCLI("mco", exutil.KubeConfigPath()) + + g.JustBeforeEach(func() { + PreChecks(oc) + }) + + g.It("[PolarionID:73148][OTP] prune renderedmachineconfigs", func() { + var ( + mcName = "fake-worker-pass-1" + mcList = NewMachineConfigList(oc.AsAdmin()) + wMcp = NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + mMcp = NewMachineConfigPool(oc.AsAdmin(), 
MachineConfigPoolMaster) + NewSortedRenderedMCMaster []*MachineConfig + matchString string + ) + + // create machine config + exutil.By("Create a new MachineConfig") + mc := NewMachineConfig(oc.AsAdmin(), mcName, MachineConfigPoolWorker) + mc.parameters = []string{fmt.Sprintf(`PWDUSERS=[{"name":"%s", "passwordHash": "%s" }]`, "core", "fake-b")} + defer mc.DeleteWithWait() + mc.create() + o.Expect(mMcp.WaitImmediateForUpdatedStatus()).To(o.Succeed(), "Master MCP did not reach Updated status") + logger.Infof("OK!\n") + + wSpecConf, specErr := wMcp.getConfigNameOfSpec() // get worker MCP name + o.Expect(specErr).NotTo(o.HaveOccurred()) + mSpecConf, specErr := mMcp.getConfigNameOfSpec() // get master MCP name + o.Expect(specErr).NotTo(o.HaveOccurred()) + logger.Infof("%s %s \n", wSpecConf, mSpecConf) + + // sort mcList by time and get rendered machine config + mcList.SortByTimestamp() + sortedRenderedMCs := mcList.GetMCPRenderedMachineConfigsOrFail() + logger.Infof(" %s", sortedRenderedMCs) + + sortedMCListMaster := mcList.GetRenderedMachineConfigForMasterOrFail() // to get master rendered machine config + // 1 To check for `oc adm prune renderedmachineconfigs` cmd + exutil.By("To run prune cmd to know which rendered machineconfigs would be deleted") + pruneMCOutput, err := oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config for pool") + logger.Infof("%s", pruneMCOutput) + + for _, mc := range sortedRenderedMCs { + matchString := "dry-run deleting rendered MachineConfig " + if mc.GetName() == wSpecConf || mc.GetName() == mSpecConf { + matchString = "Skip dry-run deleting rendered MachineConfig " + } + o.Expect(pruneMCOutput).To(o.ContainSubstring(matchString+mc.GetName()), "The %s is not same as in-use renderedMC in MCP", mc.GetName()) // to check correct rendered MC will be deleted or skipped + + o.Expect(mc.Exists()).To(o.BeTrue(), "The dry run deleted rendered 
MC is removed but should exist.") + } + logger.Infof("OK!\n") + + // 2 To check for `oc adm prune renderedmachineconfigs --count=1 --pool-name master` cmd + exutil.By("To get the rendered machineconfigs based on count and MCP name") + pruneMCOutput, err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "--count=1", "--pool-name", "master").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config for pool") + logger.Infof("%s", pruneMCOutput) + NewSortedRenderedMCMaster = mcList.GetRenderedMachineConfigForMasterOrFail() + + matchString = "dry-run deleting rendered MachineConfig " + if sortedMCListMaster[0].GetName() == mSpecConf { + matchString = "Skip dry-run deleting rendered MachineConfig " + } + o.Expect(pruneMCOutput).To(o.ContainSubstring(matchString+sortedMCListMaster[0].GetName()), "Oldest RenderedMachineConfig is not deleted") // to check old rendered master MC will be getting deleted + + o.Expect(NewSortedRenderedMCMaster).To(o.Equal(sortedMCListMaster), "The dry run deleted rendered MC is removed but should exist.") + logger.Infof("OK!\n") + + // 3 To check for 'oc adm prune renderedmachineconfigs list' cmd + exutil.By("Get the rendered machineconfigs list") + pruneMCOutput, err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "list").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config list") + logger.Infof("%s", pruneMCOutput) + o.Expect(pruneMCOutput).To(o.And(o.ContainSubstring(wSpecConf), o.ContainSubstring(mSpecConf)), "Error: Deleted in-use rendered machine configs") + for _, mc := range sortedRenderedMCs { + used := "Currently in use: false" + if mc.GetName() == wSpecConf || mc.GetName() == mSpecConf { + used = "Currently in use: true" + } + o.Expect(pruneMCOutput).To(o.MatchRegexp(regexp.QuoteMeta(mc.GetName()) + ".*-- .*" + regexp.QuoteMeta(used) + ".*")) // to check correct rendered MC is in-use or not + } + 
logger.Infof("OK!\n") + + 
// 4 To check for 'oc adm prune renderedmachineconfigs list --in-use --pool-name master' cmd + exutil.By("To get the in use rendered machineconfigs for each MCP") + pruneMCOutput, err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "list", "--in-use", "--pool-name", "master").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config list") + logger.Infof("%s", mSpecConf) + mStatusConf, err := mMcp.getConfigNameOfStatus() + o.Expect(err).NotTo(o.HaveOccurred()) + logger.Infof("%s", mStatusConf) + // to check renderedMC is same as `spec` and `status` + o.Expect(pruneMCOutput).To(o.ContainSubstring("spec: "+mSpecConf), "Value for `spec` is not same as expected") + o.Expect(pruneMCOutput).To(o.ContainSubstring("status: "+mStatusConf), "Value for `status` is not same as expected ") + logger.Infof("%s", pruneMCOutput) + logger.Infof("OK!\n") + + // 5 To check for `oc adm prune renderedmachineconfigs --count=1 --pool-name master --confirm` cmd + exutil.By("To delete the rendered machineconfigs based on count and MCP name") + pruneMCOutput, err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "--count=1", "--pool-name", "master", "--confirm").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config list") + NewSortedRenderedMCMaster = mcList.GetRenderedMachineConfigForMasterOrFail() + + logger.Infof("%s", pruneMCOutput) + + if sortedMCListMaster[0].GetName() == mSpecConf { + matchString = "Skip deleting rendered MachineConfig " + } else { + matchString = "deleting rendered MachineConfig " + for _, newMc := range NewSortedRenderedMCMaster { + o.Expect(newMc.GetName()).NotTo(o.ContainSubstring(sortedMCListMaster[0].GetName()), "Deleted rendered MachineConfig is still present in the new list") // check expected rendered-master MC is been deleted + } + } + o.Expect(pruneMCOutput).To(o.ContainSubstring(matchString+sortedMCListMaster[0].GetName()), "Oldest 
RenderedMachineConfig is not deleted") // check oldest rendered master MC is been deleted + + logger.Infof("OK!\n") + + // 6 To check for `oc adm prune renderedmachineconfigs --confirm` cmd + sortedRenderedMCs = mcList.GetMCPRenderedMachineConfigsOrFail() // Get the current list of rendered machine configs + exutil.By("To delete all the rendered machineconfigs that are not in use") + pruneMCOutput, err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "--confirm").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config list") + logger.Infof("%s", pruneMCOutput) + + for _, mc := range sortedRenderedMCs { + if mc.GetName() == mSpecConf || mc.GetName() == wSpecConf { + matchString = "Skip deleting rendered MachineConfig " + o.Expect(mc.Exists()).To(o.BeTrue(), "Deleted the in-use rendered MC") // check in-use rendered MC is not been deleted + } else { + matchString = "deleting rendered MachineConfig " + o.Expect(mc.Exists()).To(o.BeFalse(), "The expected rendered MC is not deleted") // check expected rendered MC is been deleted + } + o.Expect(pruneMCOutput).To(o.ContainSubstring(matchString+mc.GetName()), "Oldest RenderedMachineConfig is not deleted") + } + + logger.Infof("OK!\n") + + }) + + g.It("[PolarionID:73155][OTP] prune renderedmachineconfigs in updating pools", func() { + var ( + wMcp = NewMachineConfigPool(oc.AsAdmin(), MachineConfigPoolWorker) + mcList = NewMachineConfigList(oc.AsAdmin()) + node = wMcp.GetSortedNodesOrFail()[0] + fileMode = "420" + fileContent = "test1" + filePath = "/etc/mco-test-case-73155-" + mcName = "mco-tc-73155-" + ) + + mcList.SortByTimestamp() // sort by time + wSpecConf, specErr := wMcp.getConfigNameOfSpec() // get worker MCP name + o.Expect(specErr).NotTo(o.HaveOccurred()) + + exutil.By("Create new Machine config") + mc := NewMachineConfig(oc.AsAdmin(), mcName+"1", MachineConfigPoolWorker) + fileConfig := getBase64EncodedFileConfig(filePath+"1", fileContent, fileMode) + 
mc.parameters = []string{fmt.Sprintf("FILES=[%s]", fileConfig)} + mc.skipWaitForMcp = true // to wait to execute command + + defer func() { + exutil.By("Check Machine Config are deleted") + o.Expect(NewRemoteFile(node, filePath+"1")).NotTo(Exist(), + "The file %s should NOT exists", filePath+"1") + o.Expect(NewRemoteFile(node, filePath+"2")).NotTo(Exist(), + "The file %s should NOT exists", filePath+"2") + + exutil.By("Check the MCP status is not been degreaded") + wMcp.waitForComplete() + }() + + defer mc.DeleteWithWait() // Clean up after creation + mc.create() + + logger.Infof("OK\n") + + exutil.By("Wait for first nodes to be configured") + + o.Eventually(node.IsUpdating, "10m", "20s").Should(o.BeTrue()) + o.Eventually(node.IsUpdated, "10m", "20s").Should(o.BeTrue()) // check for first node is updated + + initialRenderedMC, specErr := wMcp.getConfigNameOfSpec() // check for new worker rendered MC configured + o.Expect(specErr).NotTo(o.HaveOccurred()) + logger.Infof("OK\n") + + exutil.By("Create new second Machine configs") + + fileConfig = getBase64EncodedFileConfig(filePath+"2", fileContent, fileMode) + mc = NewMachineConfig(oc.AsAdmin(), mcName+"2", MachineConfigPoolWorker) + mc.parameters = []string{fmt.Sprintf("FILES=[%s]", fileConfig)} + mc.skipWaitForMcp = true // to wait to execute command + + defer mc.Delete() // Clean up after creation + mc.create() + logger.Infof("OK\n") + + exutil.By("Run prune command and check new rendered MC is generated with MCP is still updating") + + o.Eventually(wMcp.getConfigNameOfSpec, "5m", "20s").ShouldNot(o.Equal(initialRenderedMC), "Second worker renderedMC is not configured yet") + newRenderedMC, specErr := wMcp.getConfigNameOfSpec() + o.Expect(specErr).NotTo(o.HaveOccurred(), "Get desired MC of worker pool failed") + + pruneMCOutput, err := oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "--pool-name", "worker", "--confirm").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get 
the rendered config list") + logger.Infof("%s", pruneMCOutput) + + renderedMCs := []string{wSpecConf, initialRenderedMC, newRenderedMC} + // as wMCP is still updating with previous in-use rendered MC and new generated MC from 1st and 2nd MC created are also in-use so we need to check they are not deleted + + for _, mc := range renderedMCs { + o.Expect(pruneMCOutput).To(o.ContainSubstring("Skip deleting rendered MachineConfig "+mc), "Deleted the in-use rendered MC: "+mc) + } + logger.Infof("OK\n") + + exutil.By("Check no worker MCP is degreaded") + wMcp.waitForComplete() + + exutil.By("Execute the prune command again after complete update") + pruneMCOutput, err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("prune", "renderedmachineconfigs", "--pool-name", "worker", "--confirm").Output() + o.Expect(err).NotTo(o.HaveOccurred(), "Cannot get the rendered config list") + logger.Infof("%s", pruneMCOutput) + o.Expect(pruneMCOutput).To(o.ContainSubstring("Skip deleting rendered MachineConfig "+newRenderedMC), "Deleted the in-use rendered MC") + + logger.Infof("OK\n") + + }) + + g.It("[PolarionID:74606][OTP] 'oc adm prune' report failures consistently when using wrong pool name", func() { + var expectedErrorMsg = "error: MachineConfigPool with name 'fake' not found" + + _, stderr, err := oc.AsAdmin().Run("adm").Args("prune", "renderedmachineconfigs", "list", "--pool-name", "fake").Outputs() + o.Expect(err).To(o.HaveOccurred(), "Expected oc command error to fail but it didn't") + o.Expect(IsExecShellError(err)).To(o.BeTrue(), "Unexpected error while executing prune command. 
%s", err) + exitCode, unwrapErr := UnwrapExecCode(err) + o.Expect(unwrapErr).NotTo(o.HaveOccurred(), "Could not unwrap exit code from prune command error") + o.Expect(exitCode).ShouldNot(o.Equal(0), "Unexpected return code when executing the prune command with a wrong pool name") + o.Expect(stderr).To(o.Equal(expectedErrorMsg), "Unexecpted error message when using wrong pool name in the prune command") + + _, stderr, err = oc.AsAdmin().Run("adm").Args("prune", "renderedmachineconfigs", "list", "--in-use", "--pool-name", "fake").Outputs() + o.Expect(err).To(o.HaveOccurred(), "Expected oc command error to fail but it didn't") + o.Expect(IsExecShellError(err)).To(o.BeTrue(), "Unexpected error while executing prune command with in-use flag. %s", err) + exitCode, unwrapErr = UnwrapExecCode(err) + o.Expect(unwrapErr).NotTo(o.HaveOccurred(), "Could not unwrap exit code from prune command error") + o.Expect(exitCode).ShouldNot(o.Equal(0), "Unexpected return code when executing the prune command with the in-use flag and a wrong pool name") + o.Expect(stderr).To(o.Equal(expectedErrorMsg), "Unexecpted error message when using in-use flag and a wrong pool name in the prune command") + }) + +}) diff --git a/test/extended-priv/node.go b/test/extended-priv/node.go index 8d59399aca..4e2c5215ef 100644 --- a/test/extended-priv/node.go +++ b/test/extended-priv/node.go @@ -1586,3 +1586,15 @@ func (n *Node) GetRHCOSVersion() (string, error) { return rhcosVersion, nil } + +// FilterSchedulableNodesOrFail removes from a list of nodes the nodes that are not schedulable +func FilterSchedulableNodesOrFail(nodes []*Node) []*Node { + returnNodes := []*Node{} + for _, item := range nodes { + node := item + if node.IsSchedulableOrFail() { + returnNodes = append(returnNodes, node) + } + } + return returnNodes +} diff --git a/test/extended-priv/testdata/files/add-mc-to-trigger-node-drain.yaml b/test/extended-priv/testdata/files/add-mc-to-trigger-node-drain.yaml new file mode 100644 index 
0000000000..6874bf875a --- /dev/null +++ b/test/extended-priv/testdata/files/add-mc-to-trigger-node-drain.yaml @@ -0,0 +1,25 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + name: mc-template +objects: +- kind: MachineConfig + apiVersion: machineconfiguration.openshift.io/v1 + metadata: + labels: + machineconfiguration.openshift.io/role: "${POOL}" + name: "${NAME}" + spec: + config: + ignition: + version: 3.2.0 + storage: + files: + - contents: + source: data:text/plain;charset=utf;base64,c2VydmVyIGZvby5leGFtcGxlLm5ldCBtYXhkZWxheSAwLjQgb2ZmbGluZQpzZXJ2ZXIgYmFyLmV4YW1wbGUubmV0IG1heGRlbGF5IDAuNCBvZmZsaW5lCnNlcnZlciBiYXouZXhhbXBsZS5uZXQgbWF4ZGVsYXkgMC40IG9mZmxpbmUK + filesystem: root + mode: 0644 + path: /etc/test +parameters: + - name: NAME + - name: POOL diff --git a/test/extended-priv/testdata/files/change-worker-ign-version.yaml b/test/extended-priv/testdata/files/change-worker-ign-version.yaml index 539237d690..850397cb05 100644 --- a/test/extended-priv/testdata/files/change-worker-ign-version.yaml +++ b/test/extended-priv/testdata/files/change-worker-ign-version.yaml @@ -18,4 +18,4 @@ objects: parameters: - name: NAME - name: POOL - - name: IGNITION_VERSION + - name: IGNITION_VERSION \ No newline at end of file diff --git a/test/extended-priv/testdata/files/create-pod.yaml b/test/extended-priv/testdata/files/create-pod.yaml new file mode 100644 index 0000000000..1c2bd43db6 --- /dev/null +++ b/test/extended-priv/testdata/files/create-pod.yaml @@ -0,0 +1,32 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + name: create-pod +objects: + - kind: Pod + apiVersion: v1 + metadata: + labels: + app: "${NAME}" + name: "${NAME}" + spec: + containers: + - args: + - sleep + - 4h + image: quay.io/openshifttest/busybox@sha256:c5439d7db88ab5423999530349d327b04279ad3161d7596d2126dfb5b02bfd1f + imagePullPolicy: Always + name: "${NAME}" + nodeSelector: + kubernetes.io/hostname: "${HOSTNAME}" + restartPolicy: Never + securityContext: + 
allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + seccompProfile: + type: "RuntimeDefault" +parameters: + - name: NAME + - name: HOSTNAME diff --git a/test/extended-priv/testdata/files/pod-disruption-budget.yaml b/test/extended-priv/testdata/files/pod-disruption-budget.yaml new file mode 100644 index 0000000000..35bfe37b7e --- /dev/null +++ b/test/extended-priv/testdata/files/pod-disruption-budget.yaml @@ -0,0 +1,16 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + name: pod-disruption-budget +objects: + - kind: PodDisruptionBudget + apiVersion: policy/v1 + metadata: + name: "${NAME}" + spec: + minAvailable: 1 + selector: + matchLabels: + app: "${NAME}" +parameters: + - name: NAME diff --git a/test/extended-priv/util.go b/test/extended-priv/util.go index ed097856b4..cf3c235409 100644 --- a/test/extended-priv/util.go +++ b/test/extended-priv/util.go @@ -8,6 +8,7 @@ import ( b64 "encoding/base64" "encoding/json" "encoding/pem" + "errors" "fmt" "net/url" "os" @@ -509,7 +510,7 @@ func WorkersCanBeScaled(oc *exutil.CLI) (bool, error) { logger.Infof("Checking if in this cluster workers can be scaled using machinesets") // Baremetal and None platforms cannot scale workers - if platform == "baremetal" || platform == "none" || platform == "" { + if platform == BaremetalPlatform || platform == NonePlatform || platform == "" { logger.Infof("Baremetal/None platform. Can't scale up nodes in Baremetal test environments. 
// IsExecShellError reports whether err was caused by the executed command
// exiting unsuccessfully, and not by a failure elsewhere (for example, a
// system failure previous to the shell command execution).
//
// The whole error chain is inspected with errors.As, so the *exec.ExitError is
// found whether err is one itself or wraps one at any depth. A nil err yields
// false.
func IsExecShellError(err error) bool {
	var exitErr *exec.ExitError
	return errors.As(err, &exitErr)
}

// UnwrapExecCode returns the exit code carried by the *exec.ExitError found in
// the error chain of err.
//
// It accepts exactly the errors for which IsExecShellError returns true: a bare
// *exec.ExitError as well as one wrapped at any depth. When no *exec.ExitError
// is present it returns -1 and a non-nil error.
func UnwrapExecCode(err error) (int, error) {
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		return exitErr.ExitCode(), nil
	}
	// %v instead of %s so that a nil err is rendered as "<nil>".
	return -1, fmt.Errorf("No exit code available in the provided error %v", err)
}
return 0 + } + } + + // Parse new timestamp + newTime, err := time.Parse("2006-01-02 15:04:05.999999999", datePrefix+newTimestamp) + if err != nil { + // Fallback: try parsing without fractional seconds + newTime, err = time.Parse("2006-01-02 15:04:05", datePrefix+strings.Split(newTimestamp, ".")[0]) + if err != nil { + logger.Errorf("Failed to parse new timestamp '%s': %v", newTimestamp, err) + return 0 + } + } + + return newTime.Sub(oldTime).Minutes() +} + +func filterTimestampFromLogs(logs string, numberOfTimestamp int) []string { + return regexp.MustCompile(`(?m)\b[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,6}\b`).FindAllString(logs, numberOfTimestamp) +} + +// AddToAllMachineSets adds a delta to all MachineSets replicas and wait for the MachineSets to be ready +func AddToAllMachineSets(oc *exutil.CLI, delta int) error { + allMs, err := NewMachineSetList(oc.AsAdmin(), "openshift-machine-api").GetAll() + o.Expect(err).NotTo(o.HaveOccurred()) + + var addErr error + modifiedMSs := []*MachineSet{} + for _, ms := range allMs { + addErr = ms.AddToScale(delta) + if addErr == nil { + modifiedMSs = append(modifiedMSs, ms) + } else { + break + } + } + + if addErr != nil { + logger.Infof("Error reconfiguring MachineSets. Restoring original replicas.") + for _, ms := range modifiedMSs { + _ = ms.AddToScale(-1 * delta) + } + + return addErr + } + + var waitErr error + for _, ms := range allMs { + immediate := true + waitErr = wait.PollUntilContextTimeout(context.TODO(), 30*time.Second, 20*time.Minute, immediate, func(_ context.Context) (bool, error) { return ms.GetIsReady(), nil }) + if waitErr != nil { + logger.Errorf("MachineSet %s is not ready. Restoring original replicas.", ms.GetName()) + for _, ms := range modifiedMSs { + _ = ms.AddToScale(-1 * delta) + } + break + } + } + + return waitErr +} + +// checkUpdatedLists Compares that 2 lists are ordered in steps. 
+// when we update nodes with maxUnavailable>1, since we are polling, we cannot make sure +// that the sorted lists have the same order one by one. We can only make sure that the steps +// defined by maxUnavailable have the right order. +// If step=1, it is the same as comparing that both lists are equal. +func checkUpdatedLists(l, r []*Node, step int) bool { + if len(l) != len(r) { + logger.Errorf("Compared lists have different size") + return false + } + + indexStart := 0 + for i := 0; i < len(l); i += step { + indexEnd := i + step + if (i + step) > (len(l)) { + indexEnd = len(l) + } + + // Create 2 sublists with the size of the step + stepL := l[indexStart:indexEnd] + stepR := r[indexStart:indexEnd] + indexStart += step + + // All elements in one sublist should exist in the other one + // but they dont have to be in the same order. + for _, nl := range stepL { + found := false + for _, nr := range stepR { + if nl.GetName() == nr.GetName() { + found = true + break + } + + } + if !found { + logger.Errorf("Nodes were not updated in the right order. Comparing steps %s and %s\n", stepL, stepR) + return false + } + } + + } + return true + +} diff --git a/test/extended-priv/util/client.go b/test/extended-priv/util/client.go index 96e78c0094..045ec18233 100644 --- a/test/extended-priv/util/client.go +++ b/test/extended-priv/util/client.go @@ -1021,3 +1021,7 @@ func (c *CLI) SilentOutput() (string, error) { return "", nil } } + +func (e *ExitError) Unwrap() error { + return e.ExitError +}