diff --git a/qa/1671 b/qa/1671 index 25c04500ea..ec48819eb2 100755 --- a/qa/1671 +++ b/qa/1671 @@ -25,12 +25,19 @@ _cleanup() status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 +_filter_warnings() +{ + sed \ + -e 's/\[.*\]/[DATE]/' \ + -e 's/([0-9][0-9]*)/(PID)/' +} + # real QA test starts here for t in 1s 5s 30s 1m 5m 10m; do echo;echo == testing replay of corrupted archive with interval $t for max 10000 samples | tee -a $seq_full pmrep -Dinterp,log -s 10000 -z -a archives/multi-corrupted -t$t -pf%c kernel.percpu.cpu.user >>$seq_full 2>&1 pmrep -s 10000 -z -a archives/multi-corrupted -t$t -pf%c kernel.percpu.cpu.user 2>$tmp.err - cat $tmp.err + _filter_warnings <$tmp.err done echo if unexpected output, see $seq.full diff --git a/qa/1671.out b/qa/1671.out index 18c2b609cd..1658fcd2e8 100644 --- a/qa/1671.out +++ b/qa/1671.out @@ -748,7 +748,10 @@ Fri May 8 11:56:24 2015 965.000 364.000 998.000 780.000 Fri May 8 11:56:25 2015 965.000 363.000 997.000 781.000 Fri May 8 11:56:26 2015 965.000 364.000 998.000 780.000 Fri May 8 11:56:27 2015 965.000 363.000 998.000 781.000 -pmrep: Corrupted record in a PCP archive +Fri May 8 11:57:54 2015 9.264 8.931 10.507 7.320 +Fri May 8 11:58:54 2015 1897.798 2042.051 2188.803 1490.997 +Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526 +[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0 == testing replay of corrupted archive with interval 5s for max 10000 samples k.p.c.user k.p.c.user k.p.c.user k.p.c.user @@ -903,7 +906,10 @@ Fri May 8 11:56:09 2015 965.000 363.400 997.800 780.600 Fri May 8 11:56:14 2015 965.200 363.400 997.600 780.600 Fri May 8 11:56:19 2015 965.000 363.600 997.600 780.400 Fri May 8 11:56:24 2015 965.200 363.400 997.600 780.600 -pmrep: Corrupted record in a PCP archive +Fri May 8 11:57:54 2015 49.849 30.103 53.465 39.948 +Fri May 8 11:58:54 2015 1884.785 2028.055 2173.808 1480.784 +Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526 +[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0 == testing replay of corrupted archive with interval 30s for max 10000 samples k.p.c.user k.p.c.user k.p.c.user k.p.c.user @@ -934,7 +940,10 @@ Fri May 8 11:54:34 2015 961.600 589.233 972.633 593.800 Fri May 8 11:55:04 2015 950.033 951.367 988.867 212.800 Fri May 8 11:55:34 2015 953.433 817.133 990.833 342.433 Fri May 8 11:56:04 2015 965.133 363.433 997.600 780.533 -pmrep: Corrupted record in a PCP archive +Fri May 8 11:57:54 2015 251.940 128.991 266.273 202.646 +Fri May 8 11:58:54 2015 1819.756 1958.078 2098.799 1429.684 +Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526 +[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0 == testing replay of corrupted archive with interval 1m for max 10000 samples k.p.c.user k.p.c.user k.p.c.user k.p.c.user @@ -953,7 +962,10 @@ Fri May 8 11:53:04 2015 708.567 518.800 977.150 530.800 Fri May 8 11:54:04 2015 804.050 554.583 972.183 539.767 Fri May 8 11:55:04 2015 955.817 770.300 980.750 403.300 Fri May 8 11:56:04 2015 959.283 590.283 994.217 561.483 -pmrep: Corrupted record in a PCP archive +Fri May 8 11:57:54 2015 465.061 358.300 512.071 370.076 +Fri May 8 11:58:54 2015 1429.550 1538.232 1648.762 1123.134 +Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526 +[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0 == testing replay of corrupted archive with interval 5m for max 10000 samples k.p.c.user k.p.c.user k.p.c.user k.p.c.user @@ -962,7 +974,9 @@ pmrep: Corrupted record in a PCP archive Fri May 8 11:44:04 2015 N/A N/A N/A N/A Fri May 8 11:49:04 2015 N/A N/A N/A N/A Fri May 8 11:54:04 2015 N/A N/A N/A N/A -pmrep: Corrupted record in a PCP archive +Fri May 8 11:57:54 2015 N/A N/A N/A N/A +Fri May 8 11:59:54 2015 365.216 168.500 370.757 349.483 +[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0 == testing replay of corrupted archive with interval 10m for max 10000 samples k.p.c.user k.p.c.user k.p.c.user k.p.c.user @@ -970,5 +984,6 @@ pmrep: Corrupted record in a PCP archive ms/s ms/s ms/s ms/s Fri May 8 11:44:04 2015 N/A N/A N/A N/A Fri May 8 11:54:04 2015 N/A N/A N/A N/A -pmrep: Corrupted record in a PCP archive +Fri May 8 11:57:54 2015 N/A N/A N/A N/A +[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0 if unexpected output, see 1671.full diff --git a/qa/722 b/qa/722 index f5ea8573d3..06e71abff2 100755 --- a/qa/722 +++ b/qa/722 @@ -57,7 +57,7 @@ _check_stderr echo && echo == Archive list, expect pass: TEST_SET_ARCHIVELIST=archives/pcp-atop $script 2>$tmp.stderr _check_stderr -echo && echo == Archive list, expect pass: +echo && echo == Archive list, expect fail: TEST_SET_ARCHIVELIST=archives/pcp-free,archives/pcp-uptime $script 2>$tmp.stderr _check_stderr diff --git a/qa/722.out b/qa/722.out index dee95b5ec6..aa317090b7 100644 --- a/qa/722.out +++ b/qa/722.out @@ -51,11 +51,11 @@ Hosts: None Archives: ['archives/pcp-atop'] pass -== Archive list, expect pass: +== Archive list, expect fail: == Test == Hosts: None Archives: ['archives/pcp-free,archives/pcp-uptime'] -pass +fail == Folio, expect pass: == Test == diff --git a/src/libpcp/src/context.c b/src/libpcp/src/context.c index b7b208d460..bbd4bcc7df 100644 --- a/src/libpcp/src/context.c +++ b/src/libpcp/src/context.c @@ -946,6 +946,15 @@ initarchive(__pmContext *ctxp, const char *name) */ sts = __pmFindOrOpenArchive(ctxp, current, multi_arch); if (sts < 0) { + if (multi_arch) { + pmNotifyErr(LOG_WARNING, + "initarchive: skipping corrupt/unreadable archive \"%s\": %s", + current, pmErrStr(sts)); + if (end == NULL) + break; + current = end + 1; + continue; + } if (pmDebugOptions.log && pmDebugOptions.desperate) { char errmsg[PM_MAXERRMSGLEN]; fprintf(stderr, "initarchive(..., %s, ...): __pmFindOrOpenArchive: %s\n", @@ -1048,6 +1057,30 @@ initarchive(__pmContext *ctxp, const char *name) } free(namelist); namelist = NULL; + + if (acp->ac_num_logs == 0) { + pmNotifyErr(LOG_ERR, + "initarchive: all archives are corrupt/unreadable"); + sts = PM_ERR_LOGFILE; + goto error; + } + + /* + * If the active log control was lost because a failed archive open + * in multi-archive mode cleaned up the shared __pmLogCtl (setting + * acp->ac_log to NULL), rebuild it by re-opening all the archives + * that were previously opened successfully. This re-accumulates + * their metadata (PMNS, hash tables, etc.) into a fresh __pmLogCtl. + */ + if (acp->ac_log == NULL) { + for (i = 0; i < acp->ac_num_logs; i++) { + sts = __pmFindOrOpenArchive(ctxp, + acp->ac_log_list[i]->name, multi_arch); + if (sts < 0) + goto error; + } + } + acp->ac_meta_loaded = 1; /* diff --git a/src/libpcp/src/interp.c b/src/libpcp/src/interp.c index 5cc1328d91..bad1197de3 100644 --- a/src/libpcp/src/interp.c +++ b/src/libpcp/src/interp.c @@ -863,10 +863,11 @@ do_roll(__pmContext *ctxp, double t_req, int *seen_mark) if (err == PM_ERR_LOGREC) { if (pmDebugOptions.interp || pmDebugOptions.log) { - fprintf(stderr, "Error: corrupted archive '%s', vol %d\n", - ctxp->c_archctl ->ac_log->name, ctxp->c_archctl->ac_curvol); + fprintf(stderr, "Warning: corrupted archive '%s', vol %d\n", + ctxp->c_archctl->ac_log->name, ctxp->c_archctl->ac_curvol); } - return err; + if (ctxp->c_archctl->ac_num_logs <= 1) + return err; } return 0; diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c index c23922c637..c544fb3ad3 100644 --- a/src/libpcp/src/logutil.c +++ b/src/libpcp/src/logutil.c @@ -1642,6 +1642,7 @@ __pmLogRead_ctx(__pmContext *ctxp, int mode, __pmFILE *peekf, __pmResult **resul acp->ac_curvol, (long)offset); } +read_retry: if (mode == PM_MODE_BACK) { for ( ; ; ) { if (offset <= __pmLogLabelSize(lcp)) { @@ -1962,6 +1963,70 @@ __pmLogRead_ctx(__pmContext *ctxp, int mode, __pmFILE *peekf, __pmResult **resul func_return: + /* + * Recovery for corrupted archives in a multi-archive context: + * remove the corrupted archive from the log list entirely and + * attempt to continue with the next (or previous) archive. + */ + if (sts == PM_ERR_LOGREC && peekf == NULL && acp->ac_num_logs > 1) { + int corrupt_idx = acp->ac_cur_log; + __pmMultiLogCtl *bad; + + pmNotifyErr(LOG_WARNING, + "__pmLogRead: removing corrupted archive \"%s\" vol %d", + acp->ac_log->name, acp->ac_curvol); + + if (corrupt_idx >= 0 && corrupt_idx < acp->ac_num_logs) { + bad = acp->ac_log_list[corrupt_idx]; + if (bad->name) free(bad->name); + if (bad->hostname) free(bad->hostname); + if (bad->timezone) free(bad->timezone); + if (bad->zoneinfo) free(bad->zoneinfo); + free(bad); + acp->ac_num_logs--; + if (corrupt_idx < acp->ac_num_logs) + memmove(&acp->ac_log_list[corrupt_idx], + &acp->ac_log_list[corrupt_idx + 1], + (acp->ac_num_logs - corrupt_idx) * + sizeof(*acp->ac_log_list)); + acp->ac_cur_log = -1; + + if (mode == PM_MODE_FORW && + corrupt_idx < acp->ac_num_logs) { + acp->ac_mark_done = 0; + if (__pmLogChangeArchive(ctxp, corrupt_idx) == 0) { + lcp = acp->ac_log; + f = acp->ac_mfp; + acp->ac_offset = __pmLogLabelSize(lcp); + acp->ac_vol = acp->ac_curvol; + goto again; + } + } + else if (mode == PM_MODE_BACK && corrupt_idx > 0) { + int j; + acp->ac_mark_done = 0; + if (__pmLogChangeArchive(ctxp, corrupt_idx - 1) == 0) { + lcp = acp->ac_log; + for (j = lcp->maxvol; j >= lcp->minvol; j--) { + if (__pmLogChangeVol(acp, j) >= 0) + break; + } + if (j >= lcp->minvol) { + __pmFseek(acp->ac_mfp, (long)0, SEEK_END); + acp->ac_offset = __pmFtell(acp->ac_mfp); + assert(acp->ac_offset >= 0); + acp->ac_vol = acp->ac_curvol; + f = acp->ac_mfp; + offset = __pmFtell(f); + assert(offset >= 0); + goto read_retry; + } + } + } + } + sts = PM_ERR_EOL; + } + if (ctx_ctl.need_ctx_unlock) PM_UNLOCK(ctx_ctl.ctxp->c_lock); @@ -3312,7 +3377,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp) */ save_origin = ctxp->c_origin; save_mode = ctxp->c_mode; - /* Switch to the next archive. */ + /* Switch to the previous archive. */ __pmLogChangeArchive(ctxp, acp->ac_cur_log - 1); lcp = acp->ac_log; ctxp->c_origin = save_origin;