Skip to content

Commit e67f5ef

Browse files
committed
"Improve WAL archiving verification and error handling in backup script"
1 parent 87d9434 commit e67f5ef

1 file changed

Lines changed: 70 additions & 35 deletions

File tree

scripts/backup-functions.sh

Lines changed: 70 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -487,21 +487,82 @@ EOF
487487

488488
if [ $((wait_time % 15)) -eq 0 ]; then
489489
log "INFO" "Still waiting for WAL archiving... (${wait_time}s/${max_wait}s)"
490+
# Force another WAL switch to trigger archiving
491+
if ! su-exec postgres psql -d "$pg_database" -c "SELECT pg_switch_wal();" 2>/dev/null; then
492+
log "WARN" "Failed to force WAL switch"
493+
fi
490494
fi
491495
done
492496

493497
if [ "$archived_count" -eq 0 ]; then
494-
log "WARN" "No archived WAL files found after ${max_wait} seconds"
495-
log "WARN" "This may cause backup failures. Check PostgreSQL logs for archive errors."
498+
log "ERROR" "No archived WAL files found after ${max_wait} seconds"
499+
log "ERROR" "WAL archiving is not working properly. This will cause backup failures."
496500

497501
# Show PostgreSQL log for debugging
498502
log "INFO" "Recent PostgreSQL log entries:"
499503
tail -20 "$pgdata/log/"*.log 2>/dev/null || log "WARN" "Could not read PostgreSQL logs"
504+
505+
# Show current archive_command
506+
local current_archive_cmd=$(su-exec postgres psql -d "$pg_database" -t -c "SHOW archive_command;" 2>/dev/null | sed 's/^[ \t]*//;s/[ \t]*$//')
507+
log "ERROR" "Current archive_command: $current_archive_cmd"
508+
509+
return 1
500510
fi
501511

502512
return 0
503513
}
504514

515+
# Verify WAL archiving is working before backup
516+
# Verify WAL archiving is working before backup.
#
# Reads the current WAL LSN, forces a WAL segment switch, then polls the
# pgBackRest archive directory until a WAL file written after the switch
# appears or the timeout expires.
#
# Globals:
#   PGBACKREST_STANZA (read) - stanza name, defaults to "main"
#   POSTGRES_DB       (read) - database used for the psql calls, defaults to
#                              "postgres"
# Arguments: none
# Returns:
#   0 - a newly archived WAL file was found within 60 seconds
#   1 - the LSN could not be read, the WAL switch failed, or nothing was
#       archived in time
verify_wal_archiving() {
    local stanza_name="${PGBACKREST_STANZA:-main}"
    local pg_database="${POSTGRES_DB:-postgres}"
    local archive_dir="/var/lib/pgbackrest/archive/${stanza_name}"
    local max_wait=60
    local wait_time=0
    local result=1
    # Declared separately from their assignments so that `local` does not
    # mask the exit status of the command substitutions.
    local pre_switch_lsn archived_count time_marker newer_than

    log "INFO" "Verifying WAL archiving is working..."

    # Force a WAL switch and check if it gets archived
    pre_switch_lsn=$(su-exec postgres psql -d "$pg_database" -t -c "SELECT pg_current_wal_lsn();" 2>/dev/null | tr -d '[:space:]')

    if [ -z "$pre_switch_lsn" ]; then
        log "ERROR" "Failed to get current WAL LSN"
        return 1
    fi

    log "INFO" "Current WAL LSN before switch: $pre_switch_lsn"

    # Take a timestamp reference BEFORE the switch. Comparing against the
    # archive directory's own mtime is unreliable: that mtime only changes
    # when direct children change, so WAL files archived by earlier runs into
    # nested subdirectories would already count as "newer" and produce a
    # false positive.
    if time_marker=$(mktemp 2>/dev/null); then
        newer_than="$time_marker"
    else
        # Best effort: fall back to the directory itself rather than
        # blocking the backup because a temp file could not be created.
        time_marker=""
        newer_than="$archive_dir"
        log "WARN" "Could not create timestamp marker; falling back to archive directory mtime"
    fi

    # Force WAL switch
    if su-exec postgres psql -d "$pg_database" -c "SELECT pg_switch_wal();" 2>/dev/null; then
        log "INFO" "WAL switch forced, waiting for archiving..."

        # Wait up to $max_wait seconds for the WAL file to be archived
        while [ "$wait_time" -lt "$max_wait" ]; do
            if [ -d "$archive_dir" ]; then
                archived_count=$(find "$archive_dir" -type f \( -name "*.gz" -o -name "*.lz4" -o -name "*.xz" -o -name "*.bz2" -o -name "*-*" \) -newer "$newer_than" 2>/dev/null | wc -l)
                if [ "$archived_count" -gt 0 ]; then
                    log "INFO" "WAL archiving verified - found newly archived WAL files"
                    result=0
                    break
                fi
            fi

            sleep 2
            wait_time=$((wait_time + 2))

            if [ $((wait_time % 10)) -eq 0 ]; then
                log "INFO" "Still waiting for WAL archiving... (${wait_time}s/${max_wait}s)"
            fi
        done

        if [ "$result" -ne 0 ]; then
            log "ERROR" "WAL archiving verification failed - no WAL files archived within ${max_wait} seconds"
        fi
    else
        log "ERROR" "Failed to force WAL switch"
    fi

    # Single cleanup point for the marker, reached on every path after its
    # creation.
    if [ -n "$time_marker" ]; then
        rm -f -- "$time_marker"
    fi

    return "$result"
}
565+
505566
# Perform full backup using pgbackrest
506567
perform_pgbackrest_backup() {
507568
local stanza_name="${PGBACKREST_STANZA:-main}"
@@ -515,42 +576,16 @@ perform_pgbackrest_backup() {
515576
return 1
516577
fi
517578

518-
# For full backups, ensure we have some WAL files archived first
579+
# For full backups, ensure WAL archiving is working
519580
if [ "$backup_type" = "full" ]; then
520-
log "INFO" "Checking for archived WAL files before starting full backup..."
521-
522-
# Force a few WAL switches to ensure we have archived WAL files
523-
for i in 1 2 3; do
524-
log "INFO" "Forcing WAL switch $i/3..."
525-
su-exec postgres psql -d "${POSTGRES_DB:-postgres}" -c "SELECT pg_switch_wal();" || true
526-
sleep 2
527-
done
528-
529-
# Wait for archiving to complete
530-
sleep 10
581+
log "INFO" "Verifying WAL archiving before starting full backup..."
531582

532-
# Check if we have archived WAL files
533-
if [ -d "/var/lib/pgbackrest/archive/${stanza_name}" ]; then
534-
archived_count=$(find "/var/lib/pgbackrest/archive/${stanza_name}" -type f | wc -l)
535-
log "INFO" "Found ${archived_count} archived WAL files"
536-
537-
if [ "$archived_count" -eq 0 ]; then
538-
log "WARN" "No archived WAL files found. Backup may fail."
539-
log "INFO" "Attempting to run archive-push manually..."
540-
541-
# Try to manually archive any pending WAL files
542-
wal_dir="${PGDATA:-/var/lib/postgresql/data}/pg_wal"
543-
if [ -d "$wal_dir" ]; then
544-
for wal_file in "$wal_dir"/[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]; do
545-
if [ -f "$wal_file" ]; then
546-
log "INFO" "Trying to archive WAL file: $(basename "$wal_file")"
547-
su-exec postgres pgbackrest --stanza="${stanza_name}" archive-push "$wal_file" || true
548-
break
549-
fi
550-
done
551-
fi
552-
fi
583+
if ! verify_wal_archiving; then
584+
log "ERROR" "WAL archiving verification failed - backup will likely fail"
585+
return 1
553586
fi
587+
588+
log "INFO" "WAL archiving verified successfully"
554589
fi
555590

556591
# Perform the backup using su-exec

0 commit comments

Comments
 (0)