diff --git a/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh b/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh
index e86268a..a8c5319 100755
--- a/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh
+++ b/scripts.d/ta/410_nvme_controllers_with_invalid_irq.sh
@@ -22,10 +22,8 @@ for PCI_DEVICE_ID in $(sudo lspci -mm | grep 'Non-Volatile memory controller' |
         echo "The NVMe device at PCI address ${PCI_DEVICE_ID} appears to have"
         echo "invalid IRQ routing. This is indicated by the presence of a negative number in the"
         echo "\"Interrupt:\" line from lspci."
-        echo "This might not cause a problem, but it might prevent an NVMe drive from being claimed"
-        echo "by a Weka process."
-        echo "This can be caused by the presence of an enabled APIC device. Review your hardware,"
-        echo "firmware, and linux kernel settings if this is causing a problem"
+        echo "This can sometimes prevent a WEKA Process from receiving interrupts from the NVME"
+        echo "Please review your hardware, firmware, and linux kernel settings if this is causing a problem"
     fi
 done
 
diff --git a/scripts.d/ta/415_check_extended_apic.sh b/scripts.d/ta/415_check_extended_apic.sh
new file mode 100755
index 0000000..c3b9828
--- /dev/null
+++ b/scripts.d/ta/415_check_extended_apic.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+#set -ue # Fail with an error code if there's any sub-command/variable error
+
+DESCRIPTION="Check that extended APIC is available for assigning IRQs"
+# script type is single, parallel, sequential, or parallel-compare-backends
+SCRIPT_TYPE="parallel"
+JIRA_REFERENCE=""
+WTA_REFERENCE=""
+KB_REFERENCE=""
+
+RETURN_CODE="0"
+
+# Check that an extended APIC (x2apic or extapic CPU flag) is listed on the flags
+# line of /proc/cpuinfo, because it's required for more space for IRQs
+
+grep -m1 -q -E '^flags.*\b(x2apic|extapic)\b' /proc/cpuinfo 2>/dev/null
+EXT_APIC_STATUS=$?
+if [[ ${EXT_APIC_STATUS} -eq 0 ]] ; then
+    RETURN_CODE="0"
+else
+    RETURN_CODE="254"
+    echo "There is no extended APIC available. This can prevent the assignment"
+    echo "of enough IRQs to support all hardware, resulting in the kernel"
+    echo "error message: vector space exhaustion. This in turn can completely"
+    echo "prevent the kernel accessing devices such as NVMEs."
+    echo "A frequent cause of no extended APIC is the disabling of IOMMUs"
+fi
+
+if [[ ${RETURN_CODE} -eq "0" ]]; then
+    echo "Extended APIC reports available"
+else
+    echo "No extended APIC available"
+fi
+exit ${RETURN_CODE}