Skip to content

Commit 65a3e99

Browse files
KVM HA: fence by confirming host power state, not the power-off result
KVMHAProvider.fence() declared a host fenced only when the out-of-band power-off command reported success. Against an already-off chassis the BMC rejects the power-off (e.g. Redfish returns HTTP 409), so fence() failed and the host stayed stuck in the Fencing HA state, which maps to Disconnected (not Down). VM-HA therefore never restarted the VMs until the dead host was powered back on. Fencing now succeeds based on the actual chassis power state: - if the host is already powered off (OOBM STATUS == Off), treat it as fenced; - otherwise issue a best-effort power-off and confirm via OOBM STATUS; - only a confirmed Off state counts as success; if the state cannot be confirmed (e.g. unreachable BMC) the fence fails and is retried, to avoid split-brain. Also map Redfish PowerOperation.OFF to ForceOff (hard power-off) instead of GracefulShutdown, consistent with the ipmitool driver and appropriate for fencing an unresponsive host (SOFT remains the graceful ACPI shutdown). Fixes #13376
1 parent 21b2025 commit 65a3e99

3 files changed

Lines changed: 129 additions & 11 deletions

File tree

plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.apache.cloudstack.ha.provider.HARecoveryException;
3434
import org.apache.cloudstack.ha.provider.host.HAAbstractHostProvider;
3535
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation;
36+
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState;
3637
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService;
3738
import org.joda.time.DateTime;
3839

@@ -85,18 +86,57 @@ public boolean recover(Host r) throws HARecoveryException {
8586

8687
@Override
8788
public boolean fence(Host r) throws HAFenceException {
89+
if (!outOfBandManagementService.isOutOfBandManagementEnabled(r)) {
90+
logger.warn("Out-of-band management is not enabled/configured for host {}; cannot fence it.", r);
91+
return false;
92+
}
8893

94+
// Fencing must guarantee the host is powered off, and the only reliable signal for that is
95+
// the actual chassis power state - not the return code of the power-off command. An already-off
96+
// (or just-turned-off) host can make the power-off command report an error/conflict even though
97+
// the host is down; conversely, an unreachable BMC must NOT be treated as a successful fence, to
98+
// avoid split-brain. Therefore: (1) if already off, consider it fenced; (2) otherwise issue a
99+
// best-effort power-off; (3) declare the fence successful only if the power state is confirmed Off.
89100
try {
90-
if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){
91-
final OutOfBandManagementResponse resp = outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null);
92-
return resp.getSuccess();
93-
} else {
94-
logger.warn("OOBM fence operation failed for this host {}", r);
95-
return false;
101+
if (isHostPoweredOff(r)) {
102+
logger.info("Host {} is already powered off; considering it fenced.", r);
103+
return true;
96104
}
97-
} catch (Exception e){
98-
logger.warn("OOBM service is not configured or enabled for this host {} error is {}", r, e.getMessage());
99-
throw new HAFenceException(String.format("OBM service is not configured or enabled for this host %s", r.getName()), e);
105+
106+
try {
107+
outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null);
108+
} catch (Exception e) {
109+
// The power-off may legitimately fail (e.g. the chassis is already off or the command
110+
// conflicts with the current power state). Do not fail here - verify the actual power
111+
// state below instead of trusting the command result.
112+
logger.warn("Out-of-band power-off command for host {} did not complete successfully ({}); verifying the actual power state.", r, e.getMessage());
113+
}
114+
115+
if (isHostPoweredOff(r)) {
116+
logger.info("Confirmed host {} is powered off; fencing successful.", r);
117+
return true;
118+
}
119+
120+
logger.warn("Could not confirm host {} is powered off after the fence power-off; fencing will be retried.", r);
121+
return false;
122+
} catch (Exception e) {
123+
logger.warn("Out-of-band fence operation failed for host {}: {}", r, e.getMessage());
124+
throw new HAFenceException(String.format("Out-of-band fence operation failed for host %s", r.getName()), e);
125+
}
126+
}
127+
128+
/**
129+
* Returns {@code true} only when the host's chassis power is positively confirmed to be Off via
130+
* out-of-band management. Any failure to determine the power state (e.g. an unreachable BMC) returns
131+
* {@code false}, so that a host whose state cannot be confirmed is never treated as fenced.
132+
*/
133+
protected boolean isHostPoweredOff(Host r) {
134+
try {
135+
final OutOfBandManagementResponse statusResponse = outOfBandManagementService.executePowerOperation(r, PowerOperation.STATUS, null);
136+
return statusResponse != null && PowerState.Off.equals(statusResponse.getPowerState());
137+
} catch (Exception e) {
138+
logger.warn("Unable to determine the out-of-band power state of host {}: {}", r, e.getMessage());
139+
return false;
100140
}
101141
}
102142

plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,20 @@
2020

2121
import static org.junit.Assert.assertFalse;
2222
import static org.junit.Assert.assertTrue;
23+
import static org.mockito.ArgumentMatchers.eq;
24+
import static org.mockito.ArgumentMatchers.isNull;
2325
import static org.mockito.Mockito.lenient;
26+
import static org.mockito.Mockito.mock;
27+
import static org.mockito.Mockito.never;
28+
import static org.mockito.Mockito.verify;
2429
import static org.mockito.Mockito.when;
2530

31+
import org.apache.cloudstack.api.response.OutOfBandManagementResponse;
2632
import org.apache.cloudstack.ha.provider.HACheckerException;
33+
import org.apache.cloudstack.ha.provider.HAFenceException;
34+
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation;
35+
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState;
36+
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService;
2737
import org.joda.time.DateTime;
2838
import org.junit.Before;
2939
import org.junit.Test;
@@ -34,6 +44,7 @@
3444
import com.cloud.exception.StorageUnavailableException;
3545
import com.cloud.host.Host;
3646
import com.cloud.hypervisor.Hypervisor.HypervisorType;
47+
import com.cloud.utils.exception.CloudRuntimeException;
3748

3849
@RunWith(MockitoJUnitRunner.class)
3950
public class KVMHostHATest {
@@ -42,12 +53,21 @@ public class KVMHostHATest {
4253
private Host host;
4354
@Mock
4455
private KVMHostActivityChecker kvmHostActivityChecker;
56+
@Mock
57+
private OutOfBandManagementService outOfBandManagementService;
4558
private KVMHAProvider kvmHAProvider;
4659

4760
@Before
4861
public void setup() {
4962
kvmHAProvider = new KVMHAProvider();
5063
kvmHAProvider.hostActivityChecker = kvmHostActivityChecker;
64+
kvmHAProvider.outOfBandManagementService = outOfBandManagementService;
65+
}
66+
67+
private OutOfBandManagementResponse powerStatusResponse(PowerState state) {
68+
OutOfBandManagementResponse response = mock(OutOfBandManagementResponse.class);
69+
lenient().when(response.getPowerState()).thenReturn(state);
70+
return response;
5171
}
5272

5373
@Test
@@ -80,4 +100,60 @@ public void testHostActivityForDownHost() throws HACheckerException, StorageUnav
80100
assertFalse(kvmHAProvider.hasActivity(host, dt));
81101
}
82102

103+
@Test
104+
public void testFenceWhenOutOfBandManagementNotEnabled() throws HAFenceException {
105+
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(false);
106+
assertFalse(kvmHAProvider.fence(host));
107+
verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
108+
}
109+
110+
@Test
111+
public void testFenceWhenHostAlreadyPoweredOff() throws HAFenceException {
112+
OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off);
113+
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
114+
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())).thenReturn(offStatus);
115+
assertTrue(kvmHAProvider.fence(host));
116+
// The host is already off, so no power-off command should be issued.
117+
verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
118+
}
119+
120+
@Test
121+
public void testFenceConfirmedOffAfterPowerOff() throws HAFenceException {
122+
OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On);
123+
OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off);
124+
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
125+
// First STATUS (pre-check) returns On, second STATUS (post power-off) returns Off.
126+
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull()))
127+
.thenReturn(onStatus, offStatus);
128+
assertTrue(kvmHAProvider.fence(host));
129+
verify(outOfBandManagementService).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
130+
}
131+
132+
@Test
133+
public void testFencePowerOffCommandFailsButHostConfirmedOff() throws HAFenceException {
134+
// Regression: the power-off command fails (e.g. the chassis is already off and the BMC returns
135+
// HTTP 409), but the host is actually off. Fencing must succeed because the confirmed power state -
136+
// not the power-off command's result - is authoritative.
137+
OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On);
138+
OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off);
139+
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
140+
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull()))
141+
.thenReturn(onStatus, offStatus);
142+
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()))
143+
.thenThrow(new CloudRuntimeException("power-off failed: HTTP 409"));
144+
assertTrue(kvmHAProvider.fence(host));
145+
}
146+
147+
@Test
148+
public void testFenceFailsWhenPowerStateCannotBeConfirmedOff() throws HAFenceException {
149+
// The host cannot be confirmed off (e.g. an unreachable BMC). Fencing must NOT succeed, to avoid
150+
// restarting VMs while the host may still be running (split-brain).
151+
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
152+
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull()))
153+
.thenThrow(new CloudRuntimeException("BMC unreachable"));
154+
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()))
155+
.thenThrow(new CloudRuntimeException("BMC unreachable"));
156+
assertFalse(kvmHAProvider.fence(host));
157+
}
158+
83159
}

plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,17 @@ public RedfishClient.RedfishResetCmd parsePowerCommand(OutOfBandManagement.Power
3131
case ON:
3232
return RedfishClient.RedfishResetCmd.On;
3333
case OFF:
34-
return RedfishClient.RedfishResetCmd.GracefulShutdown;
34+
// OFF is a hard power-off (consistent with the ipmitool driver and required for HA
35+
// fencing of an unresponsive host); SOFT performs the graceful ACPI shutdown.
36+
return RedfishClient.RedfishResetCmd.ForceOff;
3537
case CYCLE:
3638
return RedfishClient.RedfishResetCmd.PowerCycle;
3739
case RESET:
3840
return RedfishClient.RedfishResetCmd.ForceRestart;
3941
case SOFT:
4042
return RedfishClient.RedfishResetCmd.GracefulShutdown;
4143
case STATUS:
42-
throw new IllegalStateException(String.format("%s is not a valid Redfish Reset command [%s]", operation));
44+
throw new IllegalStateException(String.format("%s is not a valid Redfish Reset command", operation));
4345
default:
4446
throw new IllegalStateException(String.format("Redfish does not support operation [%s]", operation));
4547
}

0 commit comments

Comments
 (0)