2020
2121import java .util .ArrayList ;
2222import java .util .Arrays ;
23+ import java .util .Collections ;
2324import java .util .List ;
25+ import java .util .stream .Collectors ;
2426import javax .inject .Inject ;
2527
2628import com .cloud .agent .AgentManager ;
@@ -138,12 +140,16 @@ public void setClvmLockHostId(long volumeId, long hostId) {
138140 * Query LVM to find the actual current lock holder for a volume.
139141 * This is the SOURCE OF TRUTH - it queries the actual LVM state via sanlock/lvmlockd.
140142 *
143+ * <p>If no host holds the exclusive lock (e.g. after a storage outage), this method attempts
144+ * an exclusive activation on the best available host before giving up. Activation failure
145+ * is non-fatal: the method returns null so callers can apply their own fallback logic.
146+ *
141147 * @param volumeId The volume ID
142148 * @param volumeUuid The volume UUID
143149 * @param volumePath The LV path (e.g., "vm-123-disk-0")
144150 * @param pool The storage pool
145- * @param updateDatabase If true, updates the database with the actual value (for debugging/audit)
146- * @return Host ID of current lock holder, or null if no lock is held or query failed
151+ * @param updateDatabase If true, persists the discovered or newly-activated lock host to the DB
152+ * @return Host ID of current or newly-activated lock holder, or null if none found/activated
147153 */
148154 public Long queryCurrentLockHolder (Long volumeId , String volumeUuid , String volumePath ,
149155 StoragePool pool , boolean updateDatabase ) {
@@ -169,7 +175,7 @@ public Long queryCurrentLockHolder(Long volumeId, String volumeUuid, String volu
169175 logger .debug ("Fast path: volume {} confirmed active on DB host {}" , volumeUuid , dbHostId );
170176 return dbHostId ;
171177 }
172- logger .info ("Fast path miss: volume {} not active on DB host {} — falling back to full fan-out" ,
178+ logger .info ("Fast path miss: volume {} not active on DB host {} - falling back to full fan-out" ,
173179 volumeUuid , dbHostId );
174180 } else {
175181 logger .info ("Fast path skip: DB host {} for volume {} is down/missing — falling back to full fan-out" ,
@@ -212,9 +218,21 @@ public Long queryCurrentLockHolder(Long volumeId, String volumeUuid, String volu
212218
213219 if (activeHostIds .isEmpty ()) {
214220 logger .debug ("Volume {} is not active on any reachable host — no exclusive lock held" , volumeUuid );
221+
222+ // Recovery: attempt exclusive activation on the best available host before giving up.
223+ Long targetHostId = selectActivationTargetHost (dbHostId , hosts );
224+ if (targetHostId != null ) {
225+ Long recoveredHostId = tryActivateExclusivelyOnHost (volumeId , volumeUuid , lvPath ,
226+ targetHostId , updateDatabase );
227+ if (recoveredHostId != null ) {
228+ return recoveredHostId ;
229+ }
230+ }
231+
232+ // Activation failed or no eligible host - clean up stale DB record and give up
215233 if (updateDatabase && dbHostId != null ) {
216234 VolumeDetailVO detail = _volsDetailsDao .findDetail (volumeId , CLVM_LOCK_HOST_ID );
217- if (detail != null ) {
235+ if (detail != null && String . valueOf ( dbHostId ). equals ( detail . getValue ()) ) {
218236 _volsDetailsDao .remove (detail .getId ());
219237 }
220238 }
@@ -276,6 +294,82 @@ private Boolean querySingleHostLockState(Long hostId, String lvPath, String volu
276294 }
277295 }
278296
297+ /**
298+ * Selects the best host on which to exclusively activate an inactive CLVM volume.
299+ *
300+ * <p>Priority 1: the last known lock holder ({@code clvmLockHostId} from DB), if that host
301+ * is UP and KVM.
302+ *
303+ * <p>Priority 2: a random UP KVM routing host from the cluster/zone list (fallback).
304+ *
305+ * @param dbHostId last known lock holder host ID from the DB (may be null)
306+ * @param hosts routing hosts in the cluster or zone collected during fan-out
307+ * @return host ID to activate on, or null if no eligible host found
308+ */
309+ private Long selectActivationTargetHost (Long dbHostId , List <HostVO > hosts ) {
310+ if (dbHostId != null ) {
311+ HostVO dbHost = _hostDao .findById (dbHostId );
312+ if (dbHost != null && dbHost .getStatus () == Status .Up
313+ && dbHost .getHypervisorType () == Hypervisor .HypervisorType .KVM ) {
314+ logger .debug ("selectActivationTargetHost: preferring DB host {} (last known lock holder)" , dbHostId );
315+ return dbHostId ;
316+ }
317+ }
318+ if (hosts != null ) {
319+ List <HostVO > eligible = hosts .stream ()
320+ .filter (h -> h .getStatus () == Status .Up
321+ && h .getType () == Host .Type .Routing
322+ && h .getHypervisorType () == Hypervisor .HypervisorType .KVM )
323+ .collect (Collectors .toList ());
324+ if (!eligible .isEmpty ()) {
325+ Collections .shuffle (eligible );
326+ HostVO chosen = eligible .get (0 );
327+ logger .debug ("selectActivationTargetHost: falling back to random UP KVM host {} in cluster/zone" ,
328+ chosen .getId ());
329+ return chosen .getId ();
330+ }
331+ }
332+ logger .warn ("selectActivationTargetHost: no eligible UP KVM host found" );
333+ return null ;
334+ }
335+
336+ /**
337+ * Sends an {@code ACTIVATE_EXCLUSIVE} command to {@code targetHostId} and optionally
338+ * persists the new lock host to the database.
339+ *
340+ * <p>Activation failure is non-fatal: returns null so the caller can apply its own fallback.
341+ *
342+ * @param volumeId volume DB ID
343+ * @param volumeUuid volume UUID (for logging)
344+ * @param lvPath full LV device path, e.g. {@code /dev/vgname/vol-path}
345+ * @param targetHostId host to activate on
346+ * @param updateDatabase if true, persists the new lock host on success
347+ * @return {@code targetHostId} on success, {@code null} if the command failed or threw
348+ */
349+ private Long tryActivateExclusivelyOnHost (Long volumeId , String volumeUuid , String lvPath ,
350+ Long targetHostId , boolean updateDatabase ) {
351+ try {
352+ ClvmLockTransferCommand activateCmd = new ClvmLockTransferCommand (
353+ ClvmLockTransferCommand .Operation .ACTIVATE_EXCLUSIVE , lvPath , volumeUuid );
354+ Answer activateAnswer = _agentMgr .send (targetHostId , activateCmd );
355+ if (activateAnswer != null && activateAnswer .getResult ()) {
356+ logger .info ("Recovery: exclusively activated volume {} on host {} (was inactive on all hosts)" ,
357+ volumeUuid , targetHostId );
358+ if (updateDatabase ) {
359+ setClvmLockHostId (volumeId , targetHostId );
360+ }
361+ return targetHostId ;
362+ }
363+ logger .warn ("Recovery activation of volume {} on host {} failed: {}" ,
364+ volumeUuid , targetHostId ,
365+ activateAnswer != null ? activateAnswer .getDetails () : "null answer" );
366+ } catch (AgentUnavailableException | OperationTimedoutException e ) {
367+ logger .warn ("Recovery activation of volume {} on host {} threw exception: {}" ,
368+ volumeUuid , targetHostId , e .getMessage ());
369+ }
370+ return null ;
371+ }
372+
279373 /**
280374 * Cleans up CLVM lock host tracking detail from volume_details table.
281375 * Called after successful volume deletion to prevent orphaned records.
0 commit comments