Skip to content

Commit 3e2e144

Browse files
committed
fix(nas-backup): address Copilot review on PR #13074
* nasbackup.sh: emit INCREMENTAL_FALLBACK= and BITMAP_CREATED= on stdout (Script.executePipedCommands in LibvirtTakeBackupCommandWrapper reads stdout only, not stderr — so these markers were never parsed). Fixes stopped-VM fallback recording and bitmap persistence. * nasbackup.sh: fix `2>&1 > /dev/null` → `> /dev/null 2>&1` (the original ordering leaves stderr pointed at the now-redirected stdout, dropping virsh error details). * nasbackup.sh: capture qemu-agent thaw response correctly (was being swallowed by `> /dev/null`). * nasbackup.sh: usage text now reflects the implemented --parent-paths (plural, comma-separated) flag instead of misleading --parent-path. * NASBackupProvider: default nas.backup.incremental.enabled to false so existing zones keep legacy full-only behavior on upgrade; opt in per-zone when ready. Description updated to make this explicit. * NASBackupProvider.composeParentBackupPaths: sort current VM volumes by deviceId before positional comparison with parent's backed-up volumes, and verify per-position UUID alignment. If any disk was detached and a different one attached in its place, return null and force a full instead of silently rebasing onto the wrong parent file. * NASBackupChainKeys.TYPE: doc now explicitly notes the lowercase casing difference from Backup.Status uppercase enum values. * test_backup_recovery_nas.py: capture original zone-scoped nas.backup.full.every via Configurations.list at the start of each incremental test and restore that exact value in finally, instead of hardcoding 10 (which leaked into shared environments).
1 parent 5be1910 commit 3e2e144

4 files changed

Lines changed: 77 additions & 26 deletions

File tree

plugins/backup/nas/src/main/java/org/apache/cloudstack/backup/NASBackupChainKeys.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,13 @@ public final class NASBackupChainKeys {
3636
/** Position within the chain: 0 for the full, 1 for the first incremental, and so on. */
3737
public static final String CHAIN_POSITION = "nas.chain_position";
3838

39-
/** Backup type marker: {@value #TYPE_FULL} or {@value #TYPE_INCREMENTAL}. Mirrors {@code backups.type} for fast lookup without a join. */
39+
/**
40+
* Backup type marker: {@value #TYPE_FULL} or {@value #TYPE_INCREMENTAL}.
41+
* Stored in {@code backup_details} as a lowercase string for fast lookup without a join
42+
* against {@code backups.type} (which is an upper-case Backup.Status enum value).
43+
* NOTE: do NOT compare directly against {@code Backup.Status} string casing; use the
44+
* constants below.
45+
*/
4046
public static final String TYPE = "nas.type";
4147

4248
public static final String TYPE_FULL = "full";

plugins/backup/nas/src/main/java/org/apache/cloudstack/backup/NASBackupProvider.java

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,13 @@ public class NASBackupProvider extends AdapterBase implements BackupProvider, Co
100100

101101
ConfigKey<Boolean> NASBackupIncrementalEnabled = new ConfigKey<>("Advanced", Boolean.class,
102102
"nas.backup.incremental.enabled",
103-
"true",
104-
"Master switch for NAS incremental backups. When false, every NAS backup is taken as a full " +
105-
"regardless of nas.backup.full.every. Toggling this is safe at any time: switching off " +
106-
"forces the next backup to be a fresh full anchor (existing chains stay restorable), " +
107-
"switching back on resumes incrementals on the next full + incremental cycle.",
103+
"false",
104+
"Master switch for NAS incremental backups. Defaults to false so existing zones keep the " +
105+
"legacy full-only behavior on upgrade; opt in per-zone when ready to use chains. " +
106+
"When false, every NAS backup is taken as a full regardless of nas.backup.full.every. " +
107+
"Toggling this is safe at any time: switching off forces the next backup to be a fresh " +
108+
"full anchor (existing chains stay restorable), switching back on resumes incrementals " +
109+
"on the next full + incremental cycle.",
108110
true,
109111
ConfigKey.Scope.Zone,
110112
BackupFrameworkEnabled.key());
@@ -391,20 +393,38 @@ private List<String> composeParentBackupPaths(Backup parent, long vmId) {
391393
return null;
392394
}
393395

394-
// Sanity: the current VM must have the same number of volumes. If it doesn't (volume
395-
// added or removed since the parent), positional alignment is unsafe — fall back to
396-
// full at the caller.
396+
// Sanity 1: the current VM must have the same number of volumes. If it doesn't (volume
397+
// added or removed since the parent), positional alignment is unsafe — caller falls back to full.
397398
List<VolumeVO> currentVols = volumeDao.findByInstance(vmId);
398399
if (currentVols == null || currentVols.size() != parentVols.size()) {
399400
return null;
400401
}
401402

402-
List<String> paths = new ArrayList<>(parentVols.size());
403+
// Sanity 2: VolumeDao.findByInstance() has no ordering guarantee. Sort the current
404+
// volumes by deviceId so positional comparison against parentVols (also deviceId-ordered
405+
// at backup time) is meaningful.
406+
List<VolumeVO> currentSorted = new ArrayList<>(currentVols);
407+
currentSorted.sort(Comparator.comparing(Volume::getDeviceId));
408+
409+
// Sanity 3: verify each current volume's UUID matches the parent's recorded UUID at the
410+
// same position. If any disk was detached + a different one attached in its place, the
411+
// chain cannot be safely continued — force a full instead of silently rebasing onto the
412+
// wrong parent file.
403413
for (int i = 0; i < parentVols.size(); i++) {
404-
String volUuid = parentVols.get(i).getUuid();
405-
if (volUuid == null || volUuid.isEmpty()) {
414+
String parentUuid = parentVols.get(i).getUuid();
415+
String currentUuid = currentSorted.get(i).getUuid();
416+
if (parentUuid == null || parentUuid.isEmpty()
417+
|| currentUuid == null || !parentUuid.equals(currentUuid)) {
418+
LOG.debug("Volume identity mismatch at position {} for VM {}: parent uuid {} vs current uuid {}. " +
419+
"Forcing a full backup to start a fresh chain.",
420+
i, vmId, parentUuid, currentUuid);
406421
return null;
407422
}
423+
}
424+
425+
List<String> paths = new ArrayList<>(parentVols.size());
426+
for (int i = 0; i < parentVols.size(); i++) {
427+
String volUuid = parentVols.get(i).getUuid();
408428
String prefix = (i == 0) ? "root" : "datadisk";
409429
paths.add(dir + "/" + prefix + "." + volUuid + ".qcow2");
410430
}

scripts/vm/hypervisor/kvm/nasbackup.sh

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -237,17 +237,19 @@ XML
237237
# Start push backup, atomically registering the new checkpoint when applicable.
238238
local backup_begin=0
239239
if [[ $make_checkpoint -eq 1 ]]; then
240-
if virsh -c qemu:///system backup-begin --domain $VM --backupxml $dest/backup.xml --checkpointxml $dest/checkpoint.xml 2>&1 > /dev/null; then
240+
# Order matters: redirect stdout to /dev/null first, then merge stderr into stdout.
241+
# The reversed `2>&1 > /dev/null` form leaves stderr pointing at the original tty.
242+
if virsh -c qemu:///system backup-begin --domain $VM --backupxml $dest/backup.xml --checkpointxml $dest/checkpoint.xml > /dev/null 2>&1; then
241243
backup_begin=1;
242244
fi
243245
else
244-
if virsh -c qemu:///system backup-begin --domain $VM --backupxml $dest/backup.xml 2>&1 > /dev/null; then
246+
if virsh -c qemu:///system backup-begin --domain $VM --backupxml $dest/backup.xml > /dev/null 2>&1; then
245247
backup_begin=1;
246248
fi
247249
fi
248250

249251
if [[ $thaw -eq 1 ]]; then
250-
if ! response=$(virsh -c qemu:///system qemu-agent-command "$VM" '{"execute":"guest-fsfreeze-thaw"}' 2>&1 > /dev/null); then
252+
if ! response=$(virsh -c qemu:///system qemu-agent-command "$VM" '{"execute":"guest-fsfreeze-thaw"}' 2>&1); then
251253
echo "Failed to thaw the filesystem for vm $VM: $response"
252254
cleanup
253255
exit 1
@@ -358,7 +360,9 @@ backup_stopped_vm() {
358360
# to full and signal the fallback so the orchestrator can record it as a full
359361
# in the chain.
360362
if [[ "$MODE" == "incremental" ]]; then
361-
echo "INCREMENTAL_FALLBACK=full (VM stopped — incremental requires running VM)" >&2
363+
# Emit on stdout so Script.executePipedCommands in LibvirtTakeBackupCommandWrapper
364+
# can parse it and record the backup as FULL.
365+
echo "INCREMENTAL_FALLBACK=full (VM stopped — incremental requires running VM)"
362366
fi
363367

364368
mount_operation
@@ -404,8 +408,10 @@ backup_stopped_vm() {
404408
# Surface the bitmap name we created so the orchestrator can persist it as
405409
# the VM's active_checkpoint_id. Empty when sources weren't file-backed or
406410
# qemu-img bitmap failed — orchestrator handles either case.
411+
# Stdout (not stderr) so Script.executePipedCommands in the Java wrapper
412+
# can parse it — matches the backup_running_vm path.
407413
if [[ "$bitmap_seeded" == "1" ]]; then
408-
echo "BITMAP_CREATED=$BITMAP_NEW" >&2
414+
echo "BITMAP_CREATED=$BITMAP_NEW"
409415
fi
410416

411417
ls -l --numeric-uid-gid $dest | awk '{print $5}'
@@ -505,12 +511,14 @@ cleanup() {
505511
function usage {
506512
echo ""
507513
echo "Usage: $0 -o <operation> -v|--vm <domain name> -t <storage type> -s <storage address> -m <mount options> -p <backup path> -d <disks path> -q|--quiesce <true|false>"
508-
echo " [-M|--mode <full|incremental>] [--bitmap-new <name>] [--bitmap-parent <name>] [--parent-path <path>]"
514+
echo " [-M|--mode <full|incremental>] [--bitmap-new <name>] [--bitmap-parent <name>] [--parent-paths <p1,p2,...>]"
509515
echo ""
510516
echo "Incremental backup options (running VMs only; requires QEMU >= 4.2 and libvirt >= 7.2):"
511517
echo " -M|--mode full Take a full backup AND create a checkpoint (--bitmap-new required) for future incrementals."
512518
echo " -M|--mode incremental Take an incremental backup since --bitmap-parent and create new checkpoint --bitmap-new."
513-
echo " Requires --bitmap-parent, --bitmap-new, and --parent-path (parent backup file for rebase)."
519+
echo " Requires --bitmap-parent, --bitmap-new, and --parent-paths (comma-separated list, one"
520+
echo " parent qcow2 path per disk: root.<uuid>.qcow2, datadisk.<uuid>.qcow2, … same order"
521+
echo " as -d|--disks)."
514522
echo " Without -M, behaves as legacy full-only backup with no checkpoint creation."
515523
echo ""
516524
exit 1

test/integration/smoke/test_backup_recovery_nas.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -274,13 +274,25 @@ def test_vm_backup_create_vm_from_backup_in_another_zone(self):
274274
# repair, refuse-delete-full-with-children, and stopped-VM fallback.
275275
#
276276
# All tests set nas.backup.full.every to a small value (3) so a chain
277-
# forms quickly without needing many backup iterations. They restore
278-
# the original value at teardown.
277+
# forms quickly without needing many backup iterations. The original
278+
# zone value (whatever the test environment has configured) is captured
279+
# before the test runs and restored verbatim in finally, so we don't
280+
# leak config changes across tests on shared environments.
279281

280282
def _set_full_every(self, value):
281283
Configurations.update(self.apiclient, name='nas.backup.full.every',
282284
value=str(value), zoneid=self.zone.id)
283285

286+
def _get_full_every(self):
287+
"""Read the current zone-scoped (or global fallback) value of nas.backup.full.every."""
288+
configs = Configurations.list(self.apiclient, name='nas.backup.full.every',
289+
zoneid=self.zone.id)
290+
if configs and len(configs) > 0 and configs[0].value is not None:
291+
return configs[0].value
292+
# Fall back to global default — Configurations.list returns the global value
293+
# when no zone override exists. Defensive fallback to '10' (the framework default).
294+
return '10'
295+
284296
def _backup_type(self, backup):
285297
# Backup objects expose `type`; for chained backups it's "INCREMENTAL", else "FULL".
286298
return getattr(backup, 'type', 'FULL') or 'FULL'
@@ -292,6 +304,7 @@ def test_incremental_chain_cadence(self):
292304
FULL, INCREMENTAL, INCREMENTAL, FULL, INCREMENTAL, ...
293305
"""
294306
self.backup_offering.assignOffering(self.apiclient, self.vm.id)
307+
original_full_every = self._get_full_every()
295308
self._set_full_every(3)
296309
try:
297310
ssh_client_vm = self.vm.get_ssh_client(reconnect=True)
@@ -318,7 +331,7 @@ def test_incremental_chain_cadence(self):
318331
for b in reversed(created):
319332
Backup.delete(self.apiclient, b.id)
320333
finally:
321-
self._set_full_every(10)
334+
self._set_full_every(original_full_every)
322335
self.backup_offering.removeOffering(self.apiclient, self.vm.id)
323336

324337
@attr(tags=["advanced", "backup"], required_hardware="true")
@@ -328,6 +341,7 @@ def test_restore_from_incremental(self):
328341
latest incremental and verify all three markers are present (chain flatten).
329342
"""
330343
self.backup_offering.assignOffering(self.apiclient, self.vm.id)
344+
original_full_every = self._get_full_every()
331345
self._set_full_every(5)
332346
try:
333347
ssh_client_vm = self.vm.get_ssh_client(reconnect=True)
@@ -365,7 +379,7 @@ def test_restore_from_incremental(self):
365379
for b in reversed(backups):
366380
Backup.delete(self.apiclient, b.id)
367381
finally:
368-
self._set_full_every(10)
382+
self._set_full_every(original_full_every)
369383
self.backup_offering.removeOffering(self.apiclient, self.vm.id)
370384

371385
@attr(tags=["advanced", "backup"], required_hardware="true")
@@ -376,6 +390,7 @@ def test_delete_middle_incremental_repairs_chain(self):
376390
should still produce a working VM with all expected blocks.
377391
"""
378392
self.backup_offering.assignOffering(self.apiclient, self.vm.id)
393+
original_full_every = self._get_full_every()
379394
self._set_full_every(5)
380395
try:
381396
ssh_client_vm = self.vm.get_ssh_client(reconnect=True)
@@ -416,7 +431,7 @@ def test_delete_middle_incremental_repairs_chain(self):
416431
Backup.delete(self.apiclient, inc2.id)
417432
Backup.delete(self.apiclient, full.id)
418433
finally:
419-
self._set_full_every(10)
434+
self._set_full_every(original_full_every)
420435
self.backup_offering.removeOffering(self.apiclient, self.vm.id)
421436

422437
@attr(tags=["advanced", "backup"], required_hardware="true")
@@ -426,6 +441,7 @@ def test_refuse_delete_full_with_children(self):
426441
With forced=true it must succeed and remove the entire chain.
427442
"""
428443
self.backup_offering.assignOffering(self.apiclient, self.vm.id)
444+
original_full_every = self._get_full_every()
429445
self._set_full_every(5)
430446
try:
431447
Backup.create(self.apiclient, self.vm.id, "rdc_full")
@@ -449,7 +465,7 @@ def test_refuse_delete_full_with_children(self):
449465
remaining = Backup.list(self.apiclient, self.vm.id)
450466
self.assertIsNone(remaining, "Forced delete of FULL should remove the entire chain")
451467
finally:
452-
self._set_full_every(10)
468+
self._set_full_every(original_full_every)
453469
self.backup_offering.removeOffering(self.apiclient, self.vm.id)
454470

455471
@attr(tags=["advanced", "backup"], required_hardware="true")
@@ -460,6 +476,7 @@ def test_stopped_vm_falls_back_to_full(self):
460476
new chain. The incrementalFallback flag should be reflected in backup.type=FULL.
461477
"""
462478
self.backup_offering.assignOffering(self.apiclient, self.vm.id)
479+
original_full_every = self._get_full_every()
463480
self._set_full_every(2) # next backup after the first should be incremental
464481
try:
465482
Backup.create(self.apiclient, self.vm.id, "svf_first")
@@ -482,5 +499,5 @@ def test_stopped_vm_falls_back_to_full(self):
482499
for b in reversed(backups):
483500
Backup.delete(self.apiclient, b.id)
484501
finally:
485-
self._set_full_every(10)
502+
self._set_full_every(original_full_every)
486503
self.backup_offering.removeOffering(self.apiclient, self.vm.id)

0 commit comments

Comments
 (0)