Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion qa/1671
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,19 @@ _cleanup()
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15

# Make pmNotifyErr-style warning lines deterministic for the expected
# output: reads stdin, writes the filtered text to stdout.
#   - the first [...] group on a line (the timestamp) becomes [DATE]
#   - the first (digits) group on a line (the process id) becomes (PID)
# Lines containing neither pattern pass through unchanged.
_filter_warnings()
{
    # [^]]* stops at the first closing bracket; a greedy .* would also
    # swallow any later [...] text appearing in the message itself.
    sed \
	-e 's/\[[^]]*\]/[DATE]/' \
	-e 's/([0-9][0-9]*)/(PID)/'
}

# real QA test starts here
for t in 1s 5s 30s 1m 5m 10m; do
echo;echo == testing replay of corrupted archive with interval $t for max 10000 samples | tee -a $seq_full
pmrep -Dinterp,log -s 10000 -z -a archives/multi-corrupted -t$t -pf%c kernel.percpu.cpu.user >>$seq_full 2>&1
pmrep -s 10000 -z -a archives/multi-corrupted -t$t -pf%c kernel.percpu.cpu.user 2>$tmp.err
cat $tmp.err
_filter_warnings <$tmp.err
done

echo if unexpected output, see $seq.full
Expand Down
27 changes: 21 additions & 6 deletions qa/1671.out
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,10 @@ Fri May 8 11:56:24 2015 965.000 364.000 998.000 780.000
Fri May 8 11:56:25 2015 965.000 363.000 997.000 781.000
Fri May 8 11:56:26 2015 965.000 364.000 998.000 780.000
Fri May 8 11:56:27 2015 965.000 363.000 998.000 781.000
pmrep: Corrupted record in a PCP archive
Fri May 8 11:57:54 2015 9.264 8.931 10.507 7.320
Fri May 8 11:58:54 2015 1897.798 2042.051 2188.803 1490.997
Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526
[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0

== testing replay of corrupted archive with interval 5s for max 10000 samples
k.p.c.user k.p.c.user k.p.c.user k.p.c.user
Expand Down Expand Up @@ -903,7 +906,10 @@ Fri May 8 11:56:09 2015 965.000 363.400 997.800 780.600
Fri May 8 11:56:14 2015 965.200 363.400 997.600 780.600
Fri May 8 11:56:19 2015 965.000 363.600 997.600 780.400
Fri May 8 11:56:24 2015 965.200 363.400 997.600 780.600
pmrep: Corrupted record in a PCP archive
Fri May 8 11:57:54 2015 49.849 30.103 53.465 39.948
Fri May 8 11:58:54 2015 1884.785 2028.055 2173.808 1480.784
Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526
[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0

== testing replay of corrupted archive with interval 30s for max 10000 samples
k.p.c.user k.p.c.user k.p.c.user k.p.c.user
Expand Down Expand Up @@ -934,7 +940,10 @@ Fri May 8 11:54:34 2015 961.600 589.233 972.633 593.800
Fri May 8 11:55:04 2015 950.033 951.367 988.867 212.800
Fri May 8 11:55:34 2015 953.433 817.133 990.833 342.433
Fri May 8 11:56:04 2015 965.133 363.433 997.600 780.533
pmrep: Corrupted record in a PCP archive
Fri May 8 11:57:54 2015 251.940 128.991 266.273 202.646
Fri May 8 11:58:54 2015 1819.756 1958.078 2098.799 1429.684
Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526
[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0

== testing replay of corrupted archive with interval 1m for max 10000 samples
k.p.c.user k.p.c.user k.p.c.user k.p.c.user
Expand All @@ -953,7 +962,10 @@ Fri May 8 11:53:04 2015 708.567 518.800 977.150 530.800
Fri May 8 11:54:04 2015 804.050 554.583 972.183 539.767
Fri May 8 11:55:04 2015 955.817 770.300 980.750 403.300
Fri May 8 11:56:04 2015 959.283 590.283 994.217 561.483
pmrep: Corrupted record in a PCP archive
Fri May 8 11:57:54 2015 465.061 358.300 512.071 370.076
Fri May 8 11:58:54 2015 1429.550 1538.232 1648.762 1123.134
Fri May 8 11:59:54 2015 878.368 405.260 891.705 840.526
[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0

== testing replay of corrupted archive with interval 5m for max 10000 samples
k.p.c.user k.p.c.user k.p.c.user k.p.c.user
Expand All @@ -962,13 +974,16 @@ pmrep: Corrupted record in a PCP archive
Fri May 8 11:44:04 2015 N/A N/A N/A N/A
Fri May 8 11:49:04 2015 N/A N/A N/A N/A
Fri May 8 11:54:04 2015 N/A N/A N/A N/A
pmrep: Corrupted record in a PCP archive
Fri May 8 11:57:54 2015 N/A N/A N/A N/A
Fri May 8 11:59:54 2015 365.216 168.500 370.757 349.483
[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0

== testing replay of corrupted archive with interval 10m for max 10000 samples
k.p.c.user k.p.c.user k.p.c.user k.p.c.user
cpu0 cpu1 cpu2 cpu3
ms/s ms/s ms/s ms/s
Fri May 8 11:44:04 2015 N/A N/A N/A N/A
Fri May 8 11:54:04 2015 N/A N/A N/A N/A
pmrep: Corrupted record in a PCP archive
Fri May 8 11:57:54 2015 N/A N/A N/A N/A
[DATE] pmrep(PID) Warning: __pmLogRead: removing corrupted archive "archives/multi-corrupted/20150508.11.50" vol 0
if unexpected output, see 1671.full
2 changes: 1 addition & 1 deletion qa/722
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ _check_stderr
echo && echo == Archive list, expect pass:
TEST_SET_ARCHIVELIST=archives/pcp-atop $script 2>$tmp.stderr
_check_stderr
echo && echo == Archive list, expect pass:
echo && echo == Archive list, expect fail:
TEST_SET_ARCHIVELIST=archives/pcp-free,archives/pcp-uptime $script 2>$tmp.stderr
_check_stderr

Expand Down
4 changes: 2 additions & 2 deletions qa/722.out
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ Hosts: None
Archives: ['archives/pcp-atop']
pass

== Archive list, expect pass:
== Archive list, expect fail:
== Test ==
Hosts: None
Archives: ['archives/pcp-free,archives/pcp-uptime']
pass
fail

== Folio, expect pass:
== Test ==
Expand Down
33 changes: 33 additions & 0 deletions src/libpcp/src/context.c
Original file line number Diff line number Diff line change
Expand Up @@ -946,6 +946,15 @@ initarchive(__pmContext *ctxp, const char *name)
*/
sts = __pmFindOrOpenArchive(ctxp, current, multi_arch);
if (sts < 0) {
if (multi_arch) {
pmNotifyErr(LOG_WARNING,
"initarchive: skipping corrupt/unreadable archive \"%s\": %s",
current, pmErrStr(sts));
if (end == NULL)
break;
current = end + 1;
continue;
}
if (pmDebugOptions.log && pmDebugOptions.desperate) {
char errmsg[PM_MAXERRMSGLEN];
fprintf(stderr, "initarchive(..., %s, ...): __pmFindOrOpenArchive: %s\n",
Expand Down Expand Up @@ -1048,6 +1057,30 @@ initarchive(__pmContext *ctxp, const char *name)
}
free(namelist);
namelist = NULL;

if (acp->ac_num_logs == 0) {
pmNotifyErr(LOG_ERR,
"initarchive: all archives are corrupt/unreadable");
sts = PM_ERR_LOGFILE;
goto error;
}

/*
* If the active log control was lost because a failed archive open
* in multi-archive mode cleaned up the shared __pmLogCtl (setting
* acp->ac_log to NULL), rebuild it by re-opening all the archives
* that were previously opened successfully. This re-accumulates
* their metadata (PMNS, hash tables, etc.) into a fresh __pmLogCtl.
*/
if (acp->ac_log == NULL) {
for (i = 0; i < acp->ac_num_logs; i++) {
sts = __pmFindOrOpenArchive(ctxp,
acp->ac_log_list[i]->name, multi_arch);
if (sts < 0)
goto error;
}
}

acp->ac_meta_loaded = 1;

/*
Expand Down
7 changes: 4 additions & 3 deletions src/libpcp/src/interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -863,10 +863,11 @@ do_roll(__pmContext *ctxp, double t_req, int *seen_mark)

if (err == PM_ERR_LOGREC) {
if (pmDebugOptions.interp || pmDebugOptions.log) {
fprintf(stderr, "Error: corrupted archive '%s', vol %d\n",
ctxp->c_archctl ->ac_log->name, ctxp->c_archctl->ac_curvol);
fprintf(stderr, "Warning: corrupted archive '%s', vol %d\n",
ctxp->c_archctl->ac_log->name, ctxp->c_archctl->ac_curvol);
}
return err;
if (ctxp->c_archctl->ac_num_logs <= 1)
return err;
}

return 0;
Expand Down
67 changes: 66 additions & 1 deletion src/libpcp/src/logutil.c
Original file line number Diff line number Diff line change
Expand Up @@ -1642,6 +1642,7 @@ __pmLogRead_ctx(__pmContext *ctxp, int mode, __pmFILE *peekf, __pmResult **resul
acp->ac_curvol, (long)offset);
}

read_retry:
if (mode == PM_MODE_BACK) {
for ( ; ; ) {
if (offset <= __pmLogLabelSize(lcp)) {
Expand Down Expand Up @@ -1962,6 +1963,70 @@ __pmLogRead_ctx(__pmContext *ctxp, int mode, __pmFILE *peekf, __pmResult **resul

func_return:

/*
* Recovery for corrupted archives in a multi-archive context:
* remove the corrupted archive from the log list entirely and
* attempt to continue with the next (or previous) archive.
*/
if (sts == PM_ERR_LOGREC && peekf == NULL && acp->ac_num_logs > 1) {
int corrupt_idx = acp->ac_cur_log;
__pmMultiLogCtl *bad;

pmNotifyErr(LOG_WARNING,
"__pmLogRead: removing corrupted archive \"%s\" vol %d",
acp->ac_log->name, acp->ac_curvol);

if (corrupt_idx >= 0 && corrupt_idx < acp->ac_num_logs) {
bad = acp->ac_log_list[corrupt_idx];
if (bad->name) free(bad->name);
if (bad->hostname) free(bad->hostname);
if (bad->timezone) free(bad->timezone);
if (bad->zoneinfo) free(bad->zoneinfo);
free(bad);
acp->ac_num_logs--;
if (corrupt_idx < acp->ac_num_logs)
memmove(&acp->ac_log_list[corrupt_idx],
&acp->ac_log_list[corrupt_idx + 1],
(acp->ac_num_logs - corrupt_idx) *
sizeof(*acp->ac_log_list));
acp->ac_cur_log = -1;

if (mode == PM_MODE_FORW &&
corrupt_idx < acp->ac_num_logs) {
acp->ac_mark_done = 0;
if (__pmLogChangeArchive(ctxp, corrupt_idx) == 0) {
lcp = acp->ac_log;
f = acp->ac_mfp;
acp->ac_offset = __pmLogLabelSize(lcp);
acp->ac_vol = acp->ac_curvol;
goto again;
}
}
else if (mode == PM_MODE_BACK && corrupt_idx > 0) {
int j;
acp->ac_mark_done = 0;
if (__pmLogChangeArchive(ctxp, corrupt_idx - 1) == 0) {
lcp = acp->ac_log;
for (j = lcp->maxvol; j >= lcp->minvol; j--) {
if (__pmLogChangeVol(acp, j) >= 0)
break;
}
if (j >= lcp->minvol) {
__pmFseek(acp->ac_mfp, (long)0, SEEK_END);
acp->ac_offset = __pmFtell(acp->ac_mfp);
assert(acp->ac_offset >= 0);
acp->ac_vol = acp->ac_curvol;
f = acp->ac_mfp;
offset = __pmFtell(f);
assert(offset >= 0);
goto read_retry;
}
}
}
}
sts = PM_ERR_EOL;
}

if (ctx_ctl.need_ctx_unlock)
PM_UNLOCK(ctx_ctl.ctxp->c_lock);

Expand Down Expand Up @@ -3312,7 +3377,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
*/
save_origin = ctxp->c_origin;
save_mode = ctxp->c_mode;
/* Switch to the next archive. */
/* Switch to the previous archive. */
__pmLogChangeArchive(ctxp, acp->ac_cur_log - 1);
lcp = acp->ac_log;
ctxp->c_origin = save_origin;
Expand Down
Loading