Skip to content

Commit 2aadffa

Browse files
author
Pearl Dsilva
committed
Added a small improvement to handle the scenario where the LV is inactive on all hosts, which can happen during a storage outage
1 parent 4b66876 commit 2aadffa

2 files changed

Lines changed: 314 additions & 10 deletions

File tree

server/src/main/java/com/cloud/storage/clvm/ClvmPoolManager.java

Lines changed: 98 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
import java.util.ArrayList;
2222
import java.util.Arrays;
23+
import java.util.Collections;
2324
import java.util.List;
25+
import java.util.stream.Collectors;
2426
import javax.inject.Inject;
2527

2628
import com.cloud.agent.AgentManager;
@@ -138,12 +140,16 @@ public void setClvmLockHostId(long volumeId, long hostId) {
138140
* Query LVM to find the actual current lock holder for a volume.
139141
* This is the SOURCE OF TRUTH - it queries the actual LVM state via sanlock/lvmlockd.
140142
*
143+
* <p>If no host holds the exclusive lock (e.g. after a storage outage), this method attempts
144+
* an exclusive activation on the best available host before giving up. Activation failure
145+
* is non-fatal: the method returns null so callers can apply their own fallback logic.
146+
*
141147
* @param volumeId The volume ID
142148
* @param volumeUuid The volume UUID
143149
* @param volumePath The LV path (e.g., "vm-123-disk-0")
144150
* @param pool The storage pool
145-
* @param updateDatabase If true, updates the database with the actual value (for debugging/audit)
146-
* @return Host ID of current lock holder, or null if no lock is held or query failed
151+
* @param updateDatabase If true, persists the discovered or newly-activated lock host to the DB
152+
* @return Host ID of current or newly-activated lock holder, or null if none found/activated
147153
*/
148154
public Long queryCurrentLockHolder(Long volumeId, String volumeUuid, String volumePath,
149155
StoragePool pool, boolean updateDatabase) {
@@ -169,7 +175,7 @@ public Long queryCurrentLockHolder(Long volumeId, String volumeUuid, String volu
169175
logger.debug("Fast path: volume {} confirmed active on DB host {}", volumeUuid, dbHostId);
170176
return dbHostId;
171177
}
172-
logger.info("Fast path miss: volume {} not active on DB host {} falling back to full fan-out",
178+
logger.info("Fast path miss: volume {} not active on DB host {} - falling back to full fan-out",
173179
volumeUuid, dbHostId);
174180
} else {
175181
logger.info("Fast path skip: DB host {} for volume {} is down/missing — falling back to full fan-out",
@@ -212,9 +218,21 @@ public Long queryCurrentLockHolder(Long volumeId, String volumeUuid, String volu
212218

213219
if (activeHostIds.isEmpty()) {
214220
logger.debug("Volume {} is not active on any reachable host — no exclusive lock held", volumeUuid);
221+
222+
// Recovery: attempt exclusive activation on the best available host before giving up.
223+
Long targetHostId = selectActivationTargetHost(dbHostId, hosts);
224+
if (targetHostId != null) {
225+
Long recoveredHostId = tryActivateExclusivelyOnHost(volumeId, volumeUuid, lvPath,
226+
targetHostId, updateDatabase);
227+
if (recoveredHostId != null) {
228+
return recoveredHostId;
229+
}
230+
}
231+
232+
// Activation failed or no eligible host - clean up stale DB record and give up
215233
if (updateDatabase && dbHostId != null) {
216234
VolumeDetailVO detail = _volsDetailsDao.findDetail(volumeId, CLVM_LOCK_HOST_ID);
217-
if (detail != null) {
235+
if (detail != null && String.valueOf(dbHostId).equals(detail.getValue())) {
218236
_volsDetailsDao.remove(detail.getId());
219237
}
220238
}
@@ -276,6 +294,82 @@ private Boolean querySingleHostLockState(Long hostId, String lvPath, String volu
276294
}
277295
}
278296

297+
/**
298+
* Selects the best host on which to exclusively activate an inactive CLVM volume.
299+
*
300+
* <p>Priority 1: the last known lock holder ({@code clvmLockHostId} from DB), if that host
301+
* is UP and KVM.
302+
*
303+
* <p>Priority 2: a random UP KVM routing host from the cluster/zone list (fallback).
304+
*
305+
* @param dbHostId last known lock holder host ID from the DB (may be null)
306+
* @param hosts routing hosts in the cluster or zone collected during fan-out
307+
* @return host ID to activate on, or null if no eligible host found
308+
*/
309+
private Long selectActivationTargetHost(Long dbHostId, List<HostVO> hosts) {
310+
if (dbHostId != null) {
311+
HostVO dbHost = _hostDao.findById(dbHostId);
312+
if (dbHost != null && dbHost.getStatus() == Status.Up
313+
&& dbHost.getHypervisorType() == Hypervisor.HypervisorType.KVM) {
314+
logger.debug("selectActivationTargetHost: preferring DB host {} (last known lock holder)", dbHostId);
315+
return dbHostId;
316+
}
317+
}
318+
if (hosts != null) {
319+
List<HostVO> eligible = hosts.stream()
320+
.filter(h -> h.getStatus() == Status.Up
321+
&& h.getType() == Host.Type.Routing
322+
&& h.getHypervisorType() == Hypervisor.HypervisorType.KVM)
323+
.collect(Collectors.toList());
324+
if (!eligible.isEmpty()) {
325+
Collections.shuffle(eligible);
326+
HostVO chosen = eligible.get(0);
327+
logger.debug("selectActivationTargetHost: falling back to random UP KVM host {} in cluster/zone",
328+
chosen.getId());
329+
return chosen.getId();
330+
}
331+
}
332+
logger.warn("selectActivationTargetHost: no eligible UP KVM host found");
333+
return null;
334+
}
335+
336+
/**
337+
* Sends an {@code ACTIVATE_EXCLUSIVE} command to {@code targetHostId} and optionally
338+
* persists the new lock host to the database.
339+
*
340+
* <p>Activation failure is non-fatal: returns null so the caller can apply its own fallback.
341+
*
342+
* @param volumeId volume DB ID
343+
* @param volumeUuid volume UUID (for logging)
344+
* @param lvPath full LV device path, e.g. {@code /dev/vgname/vol-path}
345+
* @param targetHostId host to activate on
346+
* @param updateDatabase if true, persists the new lock host on success
347+
* @return {@code targetHostId} on success, {@code null} if the command failed or threw
348+
*/
349+
private Long tryActivateExclusivelyOnHost(Long volumeId, String volumeUuid, String lvPath,
350+
Long targetHostId, boolean updateDatabase) {
351+
try {
352+
ClvmLockTransferCommand activateCmd = new ClvmLockTransferCommand(
353+
ClvmLockTransferCommand.Operation.ACTIVATE_EXCLUSIVE, lvPath, volumeUuid);
354+
Answer activateAnswer = _agentMgr.send(targetHostId, activateCmd);
355+
if (activateAnswer != null && activateAnswer.getResult()) {
356+
logger.info("Recovery: exclusively activated volume {} on host {} (was inactive on all hosts)",
357+
volumeUuid, targetHostId);
358+
if (updateDatabase) {
359+
setClvmLockHostId(volumeId, targetHostId);
360+
}
361+
return targetHostId;
362+
}
363+
logger.warn("Recovery activation of volume {} on host {} failed: {}",
364+
volumeUuid, targetHostId,
365+
activateAnswer != null ? activateAnswer.getDetails() : "null answer");
366+
} catch (AgentUnavailableException | OperationTimedoutException e) {
367+
logger.warn("Recovery activation of volume {} on host {} threw exception: {}",
368+
volumeUuid, targetHostId, e.getMessage());
369+
}
370+
return null;
371+
}
372+
279373
/**
280374
* Cleans up CLVM lock host tracking detail from volume_details table.
281375
* Called after successful volume deletion to prevent orphaned records.

0 commit comments

Comments
 (0)