From 1564516320038ca2c0d83eb3ed48388aa2488579 Mon Sep 17 00:00:00 2001 From: Jack Challen Date: Tue, 9 Dec 2025 13:35:45 +0000 Subject: [PATCH 1/4] Fix slightly misleading help message --- scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh b/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh index e86268a..a8c5319 100755 --- a/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh +++ b/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh @@ -22,10 +22,8 @@ for PCI_DEVICE_ID in $(sudo lspci -mm | grep 'Non-Volatile memory controller' | echo "The NVMe device at PCI address ${PCI_DEVICE_ID} appears to have" echo "invalid IRQ routing. This is indicated by the presence of a negative number in the" echo "\"Interrupt:\" line from lspci." - echo "This might not cause a problem, but it might prevent an NVMe drive from being claimed" - echo "by a Weka process." - echo "This can be caused by the presence of an enabled APIC device. Review your hardware," - echo "firmware, and linux kernel settings if this is causing a problem" + echo "This can sometimes prevent a WEKA Process from receiving interrupts from the NVME" + echo "Please review your hardware, firmware, and linux kernel settings if this is causing a problem" fi done From d26fbcbe2fd3efff358fd4b240e0cbd92817d854 Mon Sep 17 00:00:00 2001 From: Jack Challen Date: Tue, 9 Dec 2025 13:54:30 +0000 Subject: [PATCH 2/4] Check for extended APIC availability following customer incident A customer saw vector space exhaustion, which meant NVMEs became unusable. This was caused by x2apic not being available (which can itself be caused by IOMMU being disabled). Therefore, check that x2apic (Intel) or ext_apic (AMD) is available, which should result in plenty of IRQ space. 
https://weka-support.slack.com/archives/C066DNGSAE5/p1764947984669029 --- scripts.d/ta/415_check_extended_apic.sh | 31 +++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 scripts.d/ta/415_check_extended_apic.sh diff --git a/scripts.d/ta/415_check_extended_apic.sh b/scripts.d/ta/415_check_extended_apic.sh new file mode 100755 index 0000000..dc95103 --- /dev/null +++ b/scripts.d/ta/415_check_extended_apic.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +#set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Check that extended APIC is available for assigning IRQs" +# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="parallel" +JIRA_REFERENCE="" +WTA_REFERENCE="" +KB_REFERENCE="" + +RETURN_CODE="0" + +#check that extended APIC (or x2apic) is available, because it's required for more +# space for IRQs + +if (grep -m1 -q -E '^flags.*(\ Date: Wed, 10 Dec 2025 12:53:56 +0000 Subject: [PATCH 3/4] Remove subshell call, handle successful run in if test --- scripts.d/ta/415_check_extended_apic.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts.d/ta/415_check_extended_apic.sh b/scripts.d/ta/415_check_extended_apic.sh index dc95103..24b3a03 100755 --- a/scripts.d/ta/415_check_extended_apic.sh +++ b/scripts.d/ta/415_check_extended_apic.sh @@ -14,7 +14,10 @@ RETURN_CODE="0" #check that extended APIC (or x2apic) is available, because it's required for more # space for IRQs -if (grep -m1 -q -E '^flags.*(\/dev/null +EXT_APIC_STATUS=$? +if [[ ${EXT_APIC_STATUS} -eq 0 ]] ; then + RETURN_CODE="0" else RETURN_CODE="254" echo "There is no extended APIC available. 
This can prevent the assignment" From ec21db82958dfa78c7a34fc2bb866ed75d7e181f Mon Sep 17 00:00:00 2001 From: Jack Challen Date: Thu, 11 Dec 2025 13:50:17 +0000 Subject: [PATCH 4/4] Wording; make clear kernel will be prevented from accessing devices --- scripts.d/ta/415_check_extended_apic.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts.d/ta/415_check_extended_apic.sh b/scripts.d/ta/415_check_extended_apic.sh index 24b3a03..c3b9828 100755 --- a/scripts.d/ta/415_check_extended_apic.sh +++ b/scripts.d/ta/415_check_extended_apic.sh @@ -22,7 +22,8 @@ else RETURN_CODE="254" echo "There is no extended APIC available. This can prevent the assignment" echo "of enough IRQs to support all hardware, resulting in the kernel" - echo "error message: vector space exhaustion" + echo "error message: vector space exhaustion. This in turn can completely" + echo "prevent the kernel accessing devices such as NVMEs." echo "A frequent cause of no extended APIC is the disabling of IOMMUs" fi