From e548c41ab3c55fa7de88581c347bdaa4b04155a9 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 22 Feb 2026 16:36:54 -0800 Subject: [PATCH 1/8] instrumentation: Rename INSTR_TIME_LT macro to INSTR_TIME_GT This was incorrectly named "LT" for "larger than" in e5a5e0a90750d66, but that is against existing conventions, where "LT" means "less than". Clarify by using "GT" for "greater than" in macro name, and add a missing comment at the top of instr_time.h to note the macro's existence. Reported by: Peter Smith Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/CAHut%2BPut94CTpjQsqOJHdHkgJ2ZXq%2BqVSfMEcmDKLiWLW-hPfA%40mail.gmail.com#0690d99bebc6dd9b035724f66d9986c1 --- src/backend/executor/instrument.c | 2 +- src/include/portability/instr_time.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index edab92a0ebeda..4c3930a4f8680 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -170,7 +170,7 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->firsttuple = add->firsttuple; } else if (dst->running && add->running && - INSTR_TIME_LT(dst->firsttuple, add->firsttuple)) + INSTR_TIME_GT(dst->firsttuple, add->firsttuple)) dst->firsttuple = add->firsttuple; INSTR_TIME_ADD(dst->counter, add->counter); diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 490593d182512..87b9ec95531b9 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -28,6 +28,8 @@ * * INSTR_TIME_ACCUM_DIFF(x, y, z) x += (y - z) * + * INSTR_TIME_GT(x, y) x > y + * * INSTR_TIME_GET_DOUBLE(t) convert t to double (in seconds) * * INSTR_TIME_GET_MILLISEC(t) convert t to double (in milliseconds) @@ -184,7 +186,7 @@ GetTimerFrequency(void) #define INSTR_TIME_ACCUM_DIFF(x,y,z) \ ((x).ticks += (y).ticks - (z).ticks) -#define INSTR_TIME_LT(x,y) \ +#define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) #define INSTR_TIME_GET_DOUBLE(t) \ From 1d70ca7afcf324730e3860cfeac3d13fedf71299 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 31 Aug 2025 16:34:42 -0700 Subject: [PATCH 2/8] instrumentation: Replace direct changes of pgBufferUsage/pgWalUsage with INSTR_* macros This encapsulates the ownership of these globals better, and will allow a subsequent refactoring. Author: Lukas Fittl Reviewed-by: Andres Freund Discussion: https://www.postgresql.org/message-id/flat/CAP53PkzZ3UotnRrrnXWAv%3DF4avRq9MQ8zU%2BbxoN9tpovEu6fGQ%40mail.gmail.com#fc7140e8af21e07a90a09d7e76b300c4 --- src/backend/access/transam/xlog.c | 10 +++++----- src/backend/storage/buffer/bufmgr.c | 24 ++++++++++++------------ src/backend/storage/buffer/localbuf.c | 6 +++--- src/backend/storage/file/buffile.c | 8 ++++---- src/backend/utils/activity/pgstat_io.c | 8 ++++---- src/include/executor/instrument.h | 19 +++++++++++++++++++ 6 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 13cce9b49f16e..6a635bcdf1182 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1080,10 +1080,10 @@ XLogInsertRecord(XLogRecData *rdata, /* Report WAL traffic to the instrumentation. */ if (inserted) { - pgWalUsage.wal_bytes += rechdr->xl_tot_len; - pgWalUsage.wal_records++; - pgWalUsage.wal_fpi += num_fpi; - pgWalUsage.wal_fpi_bytes += fpi_bytes; + INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len); + INSTR_WALUSAGE_INCR(wal_records); + INSTR_WALUSAGE_ADD(wal_fpi, num_fpi); + INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes); /* Required for the flush of pending stats WAL data */ pgstat_report_fixed = true; @@ -2063,7 +2063,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - pgWalUsage.wal_buffers_full++; + INSTR_WALUSAGE_INCR(wal_buffers_full); TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d1babaff02351..511efaba516e7 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -826,7 +826,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { PinLocalBuffer(bufHdr, true); - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); return true; } @@ -847,7 +847,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { if (BufferTagsEqual(&tag, &bufHdr->tag)) { - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); return true; } UnpinBuffer(bufHdr); @@ -1248,14 +1248,14 @@ PinBufferForBlock(Relation rel, { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); } else { bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, strategy, foundPtr, io_context); if (*foundPtr) - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); } if (rel) { @@ -1989,9 +1989,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) true); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_hit += 1; + INSTR_BUFUSAGE_INCR(local_blks_hit); else - pgBufferUsage.shared_blks_hit += 1; + INSTR_BUFUSAGE_INCR(shared_blks_hit); if (operation->rel) pgstat_count_buffer_hit(operation->rel); @@ -2059,9 +2059,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_start, 1, io_buffers_len * BLCKSZ); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len); else - pgBufferUsage.shared_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len); /* * Track vacuum cost when issuing IO, not after waiting for it. @@ -2954,7 +2954,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } - pgBufferUsage.shared_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by); *extended_by = extend_by; @@ -3099,7 +3099,7 @@ MarkBufferDirty(Buffer buffer) */ if (!(old_buf_state & BM_DIRTY)) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } @@ -4528,7 +4528,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_start, 1, BLCKSZ); - pgBufferUsage.shared_blks_written++; + INSTR_BUFUSAGE_INCR(shared_blks_written); /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and @@ -5689,7 +5689,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (dirtied) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 04a540379a22f..e6054e745e819 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Mark not-dirty */ TerminateLocalBufferIO(bufHdr, true, 0, false); - pgBufferUsage.local_blks_written++; + INSTR_BUFUSAGE_INCR(local_blks_written); } static Buffer @@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, *extended_by = extend_by; - pgBufferUsage.local_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(local_blks_written, extend_by); return first_block; } @@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u64(&bufHdr->state); if (!(buf_state & BM_DIRTY)) - pgBufferUsage.local_blks_dirtied++; + INSTR_BUFUSAGE_INCR(local_blks_dirtied); buf_state |= BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index ddf3a410d6f54..1726b3a3b3370 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -474,13 +474,13 @@ BufFileLoadBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start); } /* we choose not to advance curOffset here */ if (file->nbytes > 0) - pgBufferUsage.temp_blks_read++; + INSTR_BUFUSAGE_INCR(temp_blks_read); } /* @@ -548,13 +548,13 @@ BufFileDumpBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start); } file->curOffset += bytestowrite; wpos += bytestowrite; - pgBufferUsage.temp_blks_written++; + INSTR_BUFUSAGE_INCR(temp_blks_written); } file->dirty = false; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 28de24538dced..9e7a88ec0d07e 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time); } else if (io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time); } } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9759f3ea5d8d9..64e91373b5d58 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -119,4 +119,23 @@ extern void BufferUsageAccumDiff(BufferUsage *dst, extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_BUFUSAGE_INCR(fld) do { \ + pgBufferUsage.fld++; \ + } while(0) +#define INSTR_BUFUSAGE_ADD(fld,val) do { \ + pgBufferUsage.fld += val; \ + } while(0) +#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ + INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + } while (0) +#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ + INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + } while (0) +#define INSTR_WALUSAGE_INCR(fld) do { \ + pgWalUsage.fld++; \ + } while(0) +#define INSTR_WALUSAGE_ADD(fld,val) do { \ + pgWalUsage.fld += val; \ + } while(0) + #endif /* INSTRUMENT_H */ From 0bc83c0cac3f2c9987a7bb67cdb1e5a15e8701b9 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 1 Mar 2025 19:31:30 -0800 Subject: [PATCH 3/8] instrumentation: Separate per-node and trigger logic from other uses Previously different places (e.g. query "total time") were repurposing the Instrumentation struct initially introduced for capturing per-node statistics during execution. This overuse of the same struct is confusing, e.g. by cluttering calls of InstrStartNode/InstrStopNode in unrelated code paths, and prevents future refactorings. Instead, simplify the Instrumentation struct to only track time and WAL/buffer usage. Similarly, drop the use of InstrEndLoop outside of per-node instrumentation - these calls were added without any apparent benefit since the relevant fields were never read. Introduce the NodeInstrumentation struct to carry forward the per-node instrumentation information, and introduce TriggerInstrumentation to capture trigger timing and firings (previously counted in "ntuples"). WorkerInstrumentation is renamed to WorkerNodeInstrumentation for clarity. In passing, drop the "n" argument to InstrAlloc, as all remaining callers need exactly one Instrumentation struct. Author: Lukas Fittl Reviewed-by: Discussion: --- contrib/auto_explain/auto_explain.c | 8 +- .../pg_stat_statements/pg_stat_statements.c | 8 +- contrib/postgres_fdw/postgres_fdw.c | 2 +- src/backend/commands/explain.c | 27 ++-- src/backend/commands/trigger.c | 22 ++-- src/backend/executor/execMain.c | 10 +- src/backend/executor/execParallel.c | 24 ++-- src/backend/executor/execProcnode.c | 4 +- src/backend/executor/instrument.c | 123 ++++++++++++++---- src/include/executor/instrument.h | 66 ++++++++-- src/include/nodes/execnodes.h | 8 +- src/tools/pgindent/typedefs.list | 4 +- 12 files changed, 210 insertions(+), 96 deletions(-) diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index e856cd35a6f0f..39bf2543b701d 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -315,7 +315,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -381,12 +381,6 @@ explain_ExecutorEnd(QueryDesc *queryDesc) */ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. */ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); if (msec >= auto_explain_log_min_duration) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 4a427533bd88d..388b068cceccf 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -1023,7 +1023,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -1082,12 +1082,6 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 60d90329a651d..6f0cb2a285b38 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node) if (!resultRelInfo->ri_projectReturning) { TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - Instrumentation *instr = node->ss.ps.instrument; + NodeInstrumentation *instr = node->ss.ps.instrument; Assert(!dmstate->has_returning); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 93918a223b8d7..d7f33644d79a2 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1099,18 +1099,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) { Trigger *trig = rInfo->ri_TrigDesc->triggers + nt; - Instrumentation *instr = rInfo->ri_TrigInstrument + nt; + TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt; char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); - /* * We ignore triggers that were never invoked; they likely aren't * relevant to the current query type. */ - if (instr->ntuples == 0) + if (tginstr->firings == 0) continue; ExplainOpenGroup("Trigger", NULL, true, es); @@ -1135,11 +1132,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) if (show_relname) appendStringInfo(es->str, " on %s", relname); if (es->timing) - appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - INSTR_TIME_GET_MILLISEC(instr->total), - instr->ntuples); + appendStringInfo(es->str, ": time=%.3f calls=%d\n", + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), + tginstr->firings); else - appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); + appendStringInfo(es->str, ": calls=%d\n", tginstr->firings); } else { @@ -1149,9 +1146,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Relation", relname, es); if (es->timing) ExplainPropertyFloat("Time", "ms", - INSTR_TIME_GET_MILLISEC(instr->total), 3, + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3, es); - ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); + ExplainPropertyInteger("Calls", NULL, tginstr->firings, es); } if (conname) @@ -1891,11 +1888,11 @@ ExplainNode(PlanState *planstate, List *ancestors, /* prepare per-worker general execution details */ if (es->workers_state && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; double startup_ms; double total_ms; @@ -2298,11 +2295,11 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; if (nloops <= 0) diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 98d402c0a3be7..c336007314121 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -90,7 +90,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -2309,7 +2309,7 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2389,10 +2389,10 @@ ExecCallTriggerFunc(TriggerData *trigdata, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); return (HeapTuple) DatumGetPointer(result); } @@ -3936,7 +3936,7 @@ static void AfterTriggerExecute(EState *estate, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2); @@ -4330,7 +4330,7 @@ AfterTriggerExecute(EState *estate, ResultRelInfo *src_relInfo, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, - FmgrInfo *finfo, Instrumentation *instr, + FmgrInfo *finfo, TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2) @@ -4371,7 +4371,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Fetch the required tuple(s). @@ -4588,10 +4588,10 @@ AfterTriggerExecute(EState *estate, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); } @@ -4707,7 +4707,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, Relation rel = NULL; TriggerDesc *trigdesc = NULL; FmgrInfo *finfo = NULL; - Instrumentation *instr = NULL; + TriggerInstrumentation *instr = NULL; TupleTableSlot *slot1 = NULL, *slot2 = NULL; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index bfd3ebc601ec6..c0b174cfbc02a 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -331,7 +331,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* * extract information from the query descriptor and the query feature. @@ -383,7 +383,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, estate->es_processed); + InstrStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); } @@ -433,7 +433,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -443,7 +443,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) AfterTriggerEndQuery(estate); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, 0); + InstrStop(queryDesc->totaltime); MemoryContextSwitchTo(oldcontext); @@ -1270,7 +1270,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); } else { diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index f87978c137e7d..fb9a4afbbf18a 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -86,7 +86,7 @@ typedef struct FixedParallelExecutorState * instrument_options: Same meaning here as in instrument.c. * * instrument_offset: Offset, relative to the start of this structure, - * of the first Instrumentation object. This will depend on the length of + * of the first NodeInstrumentation object. This will depend on the length of * the plan_node_id array. * * num_workers: Number of workers. @@ -103,11 +103,15 @@ struct SharedExecutorInstrumentation int num_workers; int num_plan_nodes; int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; - /* array of num_plan_nodes * num_workers Instrumentation objects follows */ + + /* + * array of num_plan_nodes * num_workers NodeInstrumentation objects + * follows + */ }; #define GetInstrumentationArray(sei) \ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ - (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ typedef struct ExecParallelEstimateContext @@ -724,7 +728,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation_len = MAXALIGN(instrumentation_len); instrument_offset = instrumentation_len; instrumentation_len += - mul_size(sizeof(Instrumentation), + mul_size(sizeof(NodeInstrumentation), mul_size(e.nnodes, nworkers)); shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); shm_toc_estimate_keys(&pcxt->estimator, 1); @@ -810,7 +814,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, */ if (estate->es_instrument) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); @@ -820,7 +824,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInit(&instrument[i], estate->es_instrument); + InstrInitNode(&instrument[i], estate->es_instrument); shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instrumentation); pei->instrumentation = instrumentation; @@ -1052,7 +1056,7 @@ static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; int n; int ibytes; @@ -1080,9 +1084,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, * Switch into per-query memory context. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); - ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); planstate->worker_instrument = - palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument)); MemoryContextSwitchTo(oldcontext); planstate->worker_instrument->num_workers = instrumentation->num_workers; @@ -1312,7 +1316,7 @@ ExecParallelReportInstrumentation(PlanState *planstate, { int i; int plan_node_id = planstate->plan->plan_node_id; - Instrumentation *instrument; + NodeInstrumentation *instrument; InstrEndLoop(planstate->instrument); diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7e40b85251718..1846661b503f6 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -413,8 +413,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAlloc(1, estate->es_instrument, - result->async_capable); + result->instrument = InstrAllocNode(estate->es_instrument, + result->async_capable); return result; } diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 4c3930a4f8680..34daadc88fc60 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -26,38 +26,117 @@ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); -/* Allocate new instrumentation structure(s) */ +/* General purpose instrumentation handling */ Instrumentation * -InstrAlloc(int n, int instrument_options, bool async_mode) +InstrAlloc(int instrument_options) { Instrumentation *instr; /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); + instr = palloc0(sizeof(Instrumentation)); if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; - - for (i = 0; i < n; i++) - { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - instr[i].async_mode = async_mode; - } + instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; + instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } return instr; } +void +InstrStart(Instrumentation *instr) +{ + if (instr->need_timer && + !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStart called twice in a row"); + + if (instr->need_bufusage) + instr->bufusage_start = pgBufferUsage; + + if (instr->need_walusage) + instr->walusage_start = pgWalUsage; +} + +void +InstrStop(Instrumentation *instr) +{ + instr_time endtime; + + /* let's update the time only if the timer was requested */ + if (instr->need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStop called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); + } + + /* Add delta of buffer usage since entry to node's totals */ + if (instr->need_bufusage) + BufferUsageAccumDiff(&instr->bufusage, + &pgBufferUsage, &instr->bufusage_start); + + if (instr->need_walusage) + WalUsageAccumDiff(&instr->walusage, + &pgWalUsage, &instr->walusage_start); +} + +/* Trigger instrumentation handling */ +TriggerInstrumentation * +InstrAllocTrigger(int n, int instrument_options) +{ + TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + int i; + + for (i = 0; i < n; i++) + { + tginstr[i].instr.need_bufusage = need_buffers; + tginstr[i].instr.need_walusage = need_wal; + tginstr[i].instr.need_timer = need_timer; + } + + return tginstr; +} + +void +InstrStartTrigger(TriggerInstrumentation *tginstr) +{ + InstrStart(&tginstr->instr); +} + +void +InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) +{ + InstrStop(&tginstr->instr); + tginstr->firings += firings; +} + +/* Node instrumentation handling */ + +/* Allocate new node instrumentation structure */ +NodeInstrumentation * +InstrAllocNode(int instrument_options, bool async_mode) +{ + NodeInstrumentation *instr = palloc(sizeof(NodeInstrumentation)); + + InstrInitNode(instr, instrument_options); + instr->async_mode = async_mode; + + return instr; +} + /* Initialize a pre-allocated instrumentation structure. */ void -InstrInit(Instrumentation *instr, int instrument_options) +InstrInitNode(NodeInstrumentation *instr, int instrument_options) { - memset(instr, 0, sizeof(Instrumentation)); + memset(instr, 0, sizeof(NodeInstrumentation)); instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; @@ -65,7 +144,7 @@ InstrInit(Instrumentation *instr, int instrument_options) /* Entry to a plan node */ void -InstrStartNode(Instrumentation *instr) +InstrStartNode(NodeInstrumentation *instr) { if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) @@ -81,7 +160,7 @@ InstrStartNode(Instrumentation *instr) /* Exit from a plan node */ void -InstrStopNode(Instrumentation *instr, double nTuples) +InstrStopNode(NodeInstrumentation *instr, double nTuples) { double save_tuplecount = instr->tuplecount; instr_time endtime; @@ -129,7 +208,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) /* Update tuple count */ void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) { /* count the returned tuples */ instr->tuplecount += nTuples; @@ -137,7 +216,7 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) /* Finish a run cycle for a plan node */ void -InstrEndLoop(Instrumentation *instr) +InstrEndLoop(NodeInstrumentation *instr) { /* Skip if nothing has happened, or already shut down */ if (!instr->running) @@ -162,7 +241,7 @@ InstrEndLoop(Instrumentation *instr) /* aggregate instrumentation information */ void -InstrAggNode(Instrumentation *dst, Instrumentation *add) +InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) { if (!dst->running && add->running) { diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 64e91373b5d58..16688b5fdb42e 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -67,7 +67,40 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* + * General purpose instrumentation that can capture time and WAL/buffer usage + * + * Initialized through InstrAlloc, followed by one or more calls to a pair of + * InstrStart/InstrStop (activity is measured inbetween). + */ typedef struct Instrumentation +{ + /* Parameters set at creation: */ + bool need_timer; /* true if we need timer data */ + bool need_bufusage; /* true if we need buffer usage data */ + bool need_walusage; /* true if we need WAL usage data */ + /* Internal state keeping: */ + instr_time starttime; /* start time of last InstrStart */ + BufferUsage bufusage_start; /* buffer usage at start */ + WalUsage walusage_start; /* WAL usage at start */ + /* Accumulated statistics: */ + instr_time total; /* total runtime */ + BufferUsage bufusage; /* total buffer usage */ + WalUsage walusage; /* total WAL usage */ +} Instrumentation; + +/* Trigger instrumentation */ +typedef struct TriggerInstrumentation +{ + Instrumentation instr; + int firings; /* number of times the instrumented trigger + * was fired */ +} TriggerInstrumentation; + +/* + * Specialized instrumentation for per-node execution statistics + */ +typedef struct NodeInstrumentation { /* Parameters set at node creation: */ bool need_timer; /* true if we need timer data */ @@ -92,25 +125,34 @@ typedef struct Instrumentation double nfiltered2; /* # of tuples removed by "other" quals */ BufferUsage bufusage; /* total buffer usage */ WalUsage walusage; /* total WAL usage */ -} Instrumentation; +} NodeInstrumentation; -typedef struct WorkerInstrumentation +typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; -} WorkerInstrumentation; + NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} WorkerNodeInstrumentation; extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int n, int instrument_options, - bool async_mode); -extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); -extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); +extern Instrumentation *InstrAlloc(int instrument_options); +extern void InstrStart(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr); + +extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); +extern void InstrStartTrigger(TriggerInstrumentation *tginstr); +extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); + +extern NodeInstrumentation *InstrAllocNode(int instrument_options, + bool async_mode); +extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrStartNode(NodeInstrumentation *instr); +extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); +extern void InstrEndLoop(NodeInstrumentation *instr); +extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); + extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 63c067d5aae61..605c7a6cc3945 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -524,7 +524,7 @@ typedef struct ResultRelInfo ExprState **ri_TrigWhenExprs; /* optional runtime measurements for triggers */ - Instrumentation *ri_TrigInstrument; + TriggerInstrumentation *ri_TrigInstrument; /* On-demand created slots for triggers / returning processing */ TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */ @@ -1175,8 +1175,10 @@ typedef struct PlanState ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a * wrapper */ - Instrumentation *instrument; /* Optional runtime stats for this node */ - WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ + NodeInstrumentation *instrument; /* Optional runtime stats for this + * node */ + WorkerNodeInstrumentation *worker_instrument; /* per-worker + * instrumentation */ /* Per-worker JIT instrumentation */ struct SharedJitInstrumentation *worker_jit_instrument; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 241945734ec86..e1075c709f538 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1784,6 +1784,7 @@ NextSampleBlock_function NextSampleTuple_function NextValueExpr Node +NodeInstrumentation NodeTag NonEmptyRange NoneCompressorState @@ -3155,6 +3156,7 @@ TriggerDesc TriggerEvent TriggerFlags TriggerInfo +TriggerInstrumentation TriggerTransition TruncateStmt TsmRoutine @@ -3375,9 +3377,9 @@ WorkTableScan WorkTableScanState WorkerInfo WorkerInfoData -WorkerInstrumentation WorkerJobDumpPtrType WorkerJobRestorePtrType +WorkerNodeInstrumentation Working_State WriteBufPtrType WriteBytePtrType From 3d9b1a6589a0c70b326c82d20f13e64c19d69a2c Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 22 Feb 2026 16:12:48 -0800 Subject: [PATCH 4/8] instrumentation: Add additional regression tests covering buffer usage This adds regression tests that cover some of the expected behaviour around the buffer statistics reported in EXPLAIN ANALYZE, specifically how they behave in parallel query, nested function calls and abort situations. Testing this is challenging because there can be different sources of buffer activity, so we rely on temporary tables where we can to prove that activity was captured and not lost. This supports a future commit that will rework some of the instrumentation logic that could cause areas covered by these tests to fail. Author: Lukas Fittl Reviewed-by: Discussion: --- .../pg_stat_statements/expected/utility.out | 70 +++++++ contrib/pg_stat_statements/expected/wal.out | 48 +++++ contrib/pg_stat_statements/sql/utility.sql | 56 +++++ contrib/pg_stat_statements/sql/wal.sql | 33 +++ src/test/regress/expected/explain.out | 197 ++++++++++++++++++ src/test/regress/sql/explain.sql | 194 +++++++++++++++++ 6 files changed, 598 insertions(+) diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index e4d6564ea5b5a..cba487f6be582 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (3 rows) +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT run_explain_buffers_test(); + run_explain_buffers_test +-------------------------- + +(1 row) + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + query | has_buffer_stats +-----------------------------------+------------------ + SELECT run_explain_buffers_test() | t +(1 row) + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +CALL pgss_call_rollback_proc(); +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------------+-------+---------------+---------------------+----------------------- + CALL pgss_call_rollback_proc() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out index 977e382d84894..611213daef6c2 100644 --- a/contrib/pg_stat_statements/expected/wal.out +++ b/contrib/pg_stat_statements/expected/wal.out @@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT pgss_error_func(); + pgss_error_func +----------------- + +(1 row) + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------+-------+---------------+---------------------+----------------------- + SELECT pgss_error_func() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql index dd97203c21025..7540e49c73caf 100644 --- a/contrib/pg_stat_statements/sql/utility.sql +++ b/contrib/pg_stat_statements/sql/utility.sql @@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +SELECT run_explain_buffers_test(); + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; + +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); + +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +CALL pgss_call_rollback_proc(); + +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; + -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql index 1dc1552a81ebc..467e321b2062e 100644 --- a/contrib/pg_stat_statements/sql/wal.sql +++ b/contrib/pg_stat_statements/sql/wal.sql @@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated, wal_records >= rows as wal_records_ge_rows FROM pg_stat_statements ORDER BY query COLLATE "C"; SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); + +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT pgss_error_func(); + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 7c1f26b182cb0..e28e754369314 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -822,3 +822,200 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove (9 rows) reset work_mem; +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + exception_buffers_visible +--------------------------- + t +(1 row) + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + exception_buffers_nested_visible +---------------------------------- + t +(1 row) + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_cursor_explain_buffers(); + noscroll_ok | scroll_ok +-------------+----------- + t | t +(1 row) + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_parallel_explain_buffers(); + ratio +------- + 1 +(1 row) + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index ebdab42604beb..cf5c6335a1922 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -188,3 +188,197 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove -- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk) select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))'); reset work_mem; + +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); + +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; + +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. + +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; + +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_cursor_explain_buffers(); + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; + +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_parallel_explain_buffers(); + +DROP FUNCTION check_parallel_explain_buffers; From 77161a6f2c0eb78189e8595c0f276c5fe4fcfa2a Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:16:59 -0700 Subject: [PATCH 5/8] Optimize measuring WAL/buffer usage through stack-based instrumentation Previously, in order to determine the buffer/WAL usage of a given code section, we utilized continuously incrementing global counters that get updated when the actual activity (e.g. shared block read) occurred, and then calculated a diff when the code section ended. This resulted in a bottleneck for executor node instrumentation specifically, with the function BufferUsageAccumDiff showing up in profiles and in some cases adding up to 10% overhead to an EXPLAIN (ANALYZE, BUFFERS) run. Instead, introduce a stack-based mechanism, where the actual activity writes into the current stack entry. In the case of executor nodes, this means that each node gets its own stack entry that is pushed at InstrStartNode, and popped at InstrEndNode. Stack entries are zero initialized (avoiding the diff mechanism) and get added to their parent entry when they are finalized, i.e. no more modifications can occur. To correctly handle abort situations, any use of instrumentation stacks must involve either a top-level Instrumentation struct, and its associated InstrStart/InstrStop helpers (which use resource owners to handle aborts), or dedicated PG_TRY/PG_FINALLY calls that ensure the stack is in a consistent state after an abort. Author: Lukas Fittl Reviewed-by: Discussion: --- .../pg_stat_statements/pg_stat_statements.c | 107 +++--- src/backend/access/brin/brin.c | 6 +- src/backend/access/gin/gininsert.c | 6 +- src/backend/access/nbtree/nbtsort.c | 6 +- src/backend/commands/explain.c | 8 +- src/backend/commands/vacuumparallel.c | 6 +- src/backend/executor/execMain.c | 60 ++- src/backend/executor/execParallel.c | 6 +- src/backend/executor/execProcnode.c | 79 ++++ src/backend/executor/instrument.c | 346 +++++++++++++++--- src/include/executor/executor.h | 2 + src/include/executor/instrument.h | 116 +++++- src/include/utils/resowner.h | 1 + src/tools/pgindent/typedefs.list | 1 + 14 files changed, 621 insertions(+), 129 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 388b068cceccf..fe3237fc665fd 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -911,21 +911,13 @@ pgss_planner(Query *parse, { instr_time start; instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + /* We need to track buffer/WAL usage as the planner can access them. */ + InstrPushStack(&stack); + nesting_level++; PG_TRY(); { @@ -938,6 +930,7 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrPopAndFinalizeStack(&stack); nesting_level--; } PG_END_TRY(); @@ -945,14 +938,6 @@ pgss_planner(Query *parse, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, @@ -960,8 +945,8 @@ pgss_planner(Query *parse, PGSS_PLAN, INSTR_TIME_GET_MILLISEC(duration), 0, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1082,6 +1067,13 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) if (queryId != INT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { + /* + * Check if stack is initialized - it is not when ExecutorRun wasn't + * called + */ + const BufferUsage *bufusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->bufusage : NULL; + const WalUsage *walusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->walusage : NULL; + pgss_store(queryDesc->sourceText, queryId, queryDesc->plannedstmt->stmt_location, @@ -1089,8 +1081,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) PGSS_EXEC, INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + bufusage, + walusage, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1157,14 +1149,10 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, instr_time start; instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + InstrPushStack(&stack); nesting_level++; PG_TRY(); @@ -1180,6 +1168,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrPopAndFinalizeStack(&stack); nesting_level--; } PG_END_TRY(); @@ -1208,14 +1197,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, @@ -1223,8 +1204,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, PGSS_EXEC, INSTR_TIME_GET_MILLISEC(duration), rows, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1454,27 +1435,33 @@ pgss_store(const char *query, int64 queryId, } } entry->counters.rows += rows; - entry->counters.shared_blks_hit += bufusage->shared_blks_hit; - entry->counters.shared_blks_read += bufusage->shared_blks_read; - entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; - entry->counters.shared_blks_written += bufusage->shared_blks_written; - entry->counters.local_blks_hit += bufusage->local_blks_hit; - entry->counters.local_blks_read += bufusage->local_blks_read; - entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; - entry->counters.local_blks_written += bufusage->local_blks_written; - entry->counters.temp_blks_read += bufusage->temp_blks_read; - entry->counters.temp_blks_written += bufusage->temp_blks_written; - entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); - entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); - entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); - entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); - entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); - entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + if (bufusage) + { + entry->counters.shared_blks_hit += bufusage->shared_blks_hit; + entry->counters.shared_blks_read += bufusage->shared_blks_read; + entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; + entry->counters.shared_blks_written += bufusage->shared_blks_written; + entry->counters.local_blks_hit += bufusage->local_blks_hit; + entry->counters.local_blks_read += bufusage->local_blks_read; + entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; + entry->counters.local_blks_written += bufusage->local_blks_written; + entry->counters.temp_blks_read += bufusage->temp_blks_read; + entry->counters.temp_blks_written += bufusage->temp_blks_written; + entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); + entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); + entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); + entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); + entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); + entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + } entry->counters.usage += USAGE_EXEC(total_time); - entry->counters.wal_records += walusage->wal_records; - entry->counters.wal_fpi += walusage->wal_fpi; - entry->counters.wal_bytes += walusage->wal_bytes; - entry->counters.wal_buffers_full += walusage->wal_buffers_full; + if (walusage) + { + entry->counters.wal_records += walusage->wal_records; + entry->counters.wal_fpi += walusage->wal_fpi; + entry->counters.wal_bytes += walusage->wal_bytes; + entry->counters.wal_buffers_full += walusage->wal_buffers_full; + } if (jitusage) { entry->counters.jit_functions += jitusage->created_functions; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 6887e421442ee..c1c3d03b6edb6 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2885,6 +2885,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2934,7 +2935,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2949,7 +2950,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 0d63fb4ba27b6..9149d735d59b3 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -2108,6 +2108,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2176,7 +2177,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2191,7 +2192,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 3a45508f62ea4..600f23ff2a6f9 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1750,6 +1750,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1825,7 +1826,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1835,7 +1836,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d7f33644d79a2..17405aa862103 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2288,9 +2288,9 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) @@ -2307,9 +2307,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index c3b3c9ea21ab8..10ba717bb6b24 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -994,6 +994,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1083,7 +1084,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1091,7 +1092,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index c0b174cfbc02a..f01f1c864c409 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -329,10 +330,27 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrAlloc(estate->es_instrument); + if (queryDesc->totaltime) + { + /* Allow instrumentation of Executor overall runtime */ InstrStart(queryDesc->totaltime); + /* + * Remember all node stacks for abort recovery. We do this once here + * after the first call to InstrStart has allocated the parent stack. + */ + if (queryDesc->totaltime->stack && !queryDesc->already_executed) + ExecRememberNodeInstrumentation(queryDesc->planstate, + queryDesc->totaltime->stack); + } + /* * extract information from the query descriptor and the query feature. */ @@ -383,7 +401,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, false); MemoryContextSwitchTo(oldcontext); } @@ -442,8 +460,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per-node and trigger statistics to their respective parent + * instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and the + * leader's own ExecFinalizeNodeInstrumentation handles propagation. If + * we accumulated here, the leader would double-count: worker parent nodes + * would already include their children's stats, and then the leader's + * accumulation would add the children again. + */ + if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + if (queryDesc->totaltime) - InstrStop(queryDesc->totaltime); + InstrStop(queryDesc->totaltime, true); MemoryContextSwitchTo(oldcontext); @@ -1484,6 +1520,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti && ti->instr.stack) + InstrStackAdd(CurrentInstrStack, ti->instr.stack); + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index fb9a4afbbf18a..dee8bf99c936a 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1455,6 +1455,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1515,7 +1516,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1531,7 +1532,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 1846661b503f6..e2e21c66dc9ae 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,8 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -828,6 +830,83 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecRememberNodeInstrumentation + * + * Register all per-node instrumentation stacks as unfinalized children of the + * executor's instrumentation stack. This is needed for abort recovery: if the + * executor aborts, we need to walk each per-node instrumentation stack to + * recover buffer/WAL data from nodes that never got finalized, that someone + * might be interested in as an aggregate. + */ +void +ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent) +{ + (void) ExecRememberNodeInstrumentation_walker(node, parent); +} + +static bool +ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) +{ + InstrStack *parent = (InstrStack *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + if (node->instrument && (node->instrument->need_bufusage || + node->instrument->need_walusage)) + { + InstrRememberNodeStack(parent, &node->instrument->stack); + } + + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); +} + +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation stack). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, (InstrStack *) CurrentInstrStack); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + InstrStack *parent = (InstrStack *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing our stack + * as the parent context. This ensures children can accumulate to us even + * if they were never executed by the leader (e.g. nodes beneath Gather + * that only workers ran, where stack.previous would not be initialized). + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + node->instrument ? &node->instrument->stack : parent); + + if (!node->instrument) + return false; + + node->instrument = InstrFinalizeNode(node->instrument, parent); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 34daadc88fc60..1223a59879f73 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,15 +16,139 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +InstrStack TopInstrStack; +InstrStack *CurrentInstrStack = &TopInstrStack; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +static void InstrFinalizeNodesOnAbort(InstrStack *stack); + +/* + * Use ResourceOwner mechanism to correctly reset CurrentInstrStack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStack *stack) +{ + ResourceOwnerRemember(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStack *stack) +{ + ResourceOwnerForget(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static bool +StackIsParent(InstrStack *stack, InstrStack *entry) +{ + if (entry->previous == NULL) + return false; + + if (entry->previous == stack) + return true; + + return StackIsParent(stack, entry->previous); +} + +/* + * OrphanSkippedStacks + * + * Orphans all stack entries from the current stack entry to the provided + * stack which is assumed to be a parent stack of the current stack, and + * terminates once the current stack entry has reached the provided stack. + * + * This sets previous pointers of intermediate stack entries to NULL, so we + * don't have to worry about calling StackIsParent with a bad pointer in + * ResOwnerReleaseInstrumentation. + * + * This matters because we may process stack entries out of order in aborts + * because (1) we might have a mix of ResOwner and PG_FINALLY owned stacks + * (2) ResOwnerReleaseInstrumentation might be called out of order. + */ +static void +OrphanSkippedStacks(InstrStack *stack) +{ + if (CurrentInstrStack == stack || !StackIsParent(stack, CurrentInstrStack)) + return; + + while (CurrentInstrStack != stack) + { + InstrStack *previous = CurrentInstrStack->previous; + + Assert(previous != NULL); + CurrentInstrStack->previous = NULL; + CurrentInstrStack = previous; + } +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + InstrStack *stack = (InstrStack *) DatumGetPointer(res); + + /* + * Because registered resources are *not* cleaned up in a guaranteed + * order, we may get a child context after we've processed the parent. + * Thus, we only pop the stack if its not already a parent of the stack + * being released. Note that OrphanSkippedStacks may have set our previous + * stack entry to NULL, in which case we don't modify the stack either. + * + * Note that StackIsParent will recurse as needed, so it is inadvisible to + * use deeply nested stacks. + */ + if (stack->previous && !StackIsParent(CurrentInstrStack, stack)) + { + OrphanSkippedStacks(stack); + InstrPopStack(stack); + } + + /* Accumulate data from all unfinalized child node stacks. */ + InstrFinalizeNodesOnAbort(stack); + + /* + * Accumulate the stack associated with the ResOwner to the active stack. + * + * Note that we intentionally directly add to the current stack instead of + * the parent of the stack being released, because this can execute out of + * order. Explicit PG_FINALLY blocks might have modified the stack as + * well. + */ + InstrStackAdd(CurrentInstrStack, stack); + + /* Free the stack entry now since InstrStop won't be called */ + pfree(stack); +} + +/* + * Pops the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not + * CurrentInstrStack, as can happen with PG_FINALLY, and orphans any + * intermediate stacks that were skipped. + */ +void +InstrPopAndFinalizeStack(InstrStack *stack) +{ + OrphanSkippedStacks(stack); + InstrPopStack(stack); + InstrStackAdd(CurrentInstrStack, stack); +} /* General purpose instrumentation handling */ Instrumentation * @@ -51,15 +175,31 @@ InstrStart(Instrumentation *instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStart called twice in a row"); - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + Assert(CurrentResourceOwner != NULL); + instr->owner = CurrentResourceOwner; + + /* + * Allocate the stack resource in a memory context that survives + * during an abort. This will be freed by InstrStop (regular + * execution) or ResOwnerReleaseInstrumentation (abort). + * + * We don't do this in InstrAlloc to avoid allocating when InstrStart + * + InstrStop isn't called. + */ + if (instr->stack == NULL) + instr->stack = MemoryContextAllocZero(TopMemoryContext, sizeof(InstrStack)); + + ResourceOwnerEnlarge(instr->owner); + ResourceOwnerRememberInstrStack(instr->owner, instr->stack); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(instr->stack); + } } void -InstrStop(Instrumentation *instr) +InstrStop(Instrumentation *instr, bool finalize) { instr_time endtime; @@ -75,14 +215,31 @@ InstrStop(Instrumentation *instr) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + InstrPopStack(instr->stack); + + if (finalize) + InstrStackAdd(CurrentInstrStack, instr->stack); + + Assert(instr->owner != NULL); + ResourceOwnerForgetInstrStack(instr->owner, instr->stack); + instr->owner = NULL; + + if (finalize) + { + /* + * To avoid keeping memory allocated beyond when its needed, copy + * the result to the current memory context, and free it in the + * transaction context. + */ + InstrStack *stack = palloc(sizeof(InstrStack)); + + memcpy(stack, instr->stack, sizeof(InstrStack)); + pfree(instr->stack); + instr->stack = stack; + } + } } /* Trigger instrumentation handling */ @@ -90,16 +247,16 @@ TriggerInstrumentation * InstrAllocTrigger(int n, int instrument_options) { TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; int i; for (i = 0; i < n; i++) { + tginstr[i].instr.need_timer = need_timer; tginstr[i].instr.need_bufusage = need_buffers; tginstr[i].instr.need_walusage = need_wal; - tginstr[i].instr.need_timer = need_timer; } return tginstr; @@ -114,7 +271,12 @@ InstrStartTrigger(TriggerInstrumentation *tginstr) void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) { - InstrStop(&tginstr->instr); + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ + InstrStop(&tginstr->instr, false); tginstr->firings += firings; } @@ -124,7 +286,13 @@ InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) NodeInstrumentation * InstrAllocNode(int instrument_options, bool async_mode) { - NodeInstrumentation *instr = palloc(sizeof(NodeInstrumentation)); + /* + * We can utilize TopTransactionContext instead of TopMemoryContext here + * (despite the inlined InstrStack in NodeInstrumentation) because nodes + * don't get used for utility commands that restart transactions, which + * would require a context that survives longer (EXPLAIN ANALYZE is fine). + */ + NodeInstrumentation *instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation)); InstrInitNode(instr, instrument_options); instr->async_mode = async_mode; @@ -142,6 +310,30 @@ InstrInitNode(NodeInstrumentation *instr, int instrument_options) instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } +/* + * InstrRememberNodeStack - register a child node stack for abort processing. + * + * On abort, InstrFinalizeNodesOnAbort will walk the parent's list to recover + * buffer/WAL data from stacks that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + * + * The passed in node stack can either be the NodeInstrumentation stack or an + * additional stack that is associated with a node. This must not be called + * with other (non-node) instrumentation stacks as the child that perform their + * own cleanup. The parent must be a non-node stack that can handle aborts. + */ +void +InstrRememberNodeStack(InstrStack *parent, InstrStack *node_stack) +{ + /* + * We do not support nesting, to avoid recursion in + * InstrFinalizeNodesOnAbort + */ + Assert(parent->unfinalized_node.next == NULL); + + slist_push_head(&parent->unfinalized_children, &node_stack->unfinalized_node); +} + /* Entry to a plan node */ void InstrStartNode(NodeInstrumentation *instr) @@ -150,12 +342,13 @@ InstrStartNode(NodeInstrumentation *instr) !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(CurrentInstrStack != &TopInstrStack); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(&instr->stack); + } } /* Exit from a plan node */ @@ -180,14 +373,14 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(instr->stack.previous != NULL); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* Adding to parent is handled by ExecFinalizeNodeInstrumentation */ + InstrPopStack(&instr->stack); + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -206,6 +399,50 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples) } } +/* Add per-node instrumentation to the parent and move into per-query memory context */ +NodeInstrumentation * +InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent) +{ + NodeInstrumentation *dst = palloc(sizeof(NodeInstrumentation)); + + memcpy(dst, instr, sizeof(NodeInstrumentation)); + pfree(instr); + + /* Avoid stale pointer references */ + dst->stack.previous = NULL; + + InstrStackAdd(parent, &dst->stack); + + return dst; +} + +/* + * InstrFinalizeNodesOnAbort + * + * Accumulates unfinalized child per-node stacks into the resource owner stack, + * and resets the list so a theoretical second call is a safe no-op. + */ +static void +InstrFinalizeNodesOnAbort(InstrStack *stack) +{ + slist_iter iter; + + slist_foreach(iter, &stack->unfinalized_children) + { + InstrStack *child = slist_container(InstrStack, unfinalized_node, iter.cur); + + InstrStackAdd(stack, child); + + /* + * Note we don't free the child here since its usually contained + * within NodeInstrumentation and we don't have an easy way to access + * that, it will be instead be cleaned up by the transaction ending. + */ + } + + slist_init(&stack->unfinalized_children); +} + /* Update tuple count */ void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) @@ -265,38 +502,65 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } -/* note current values during parallel executor startup */ -void +/* start instrumentation during parallel executor startup */ +Instrumentation * InstrStartParallelQuery(void) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; + Instrumentation *instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrStart(instr); + return instr; } /* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage) { + InstrStop(instr, true); memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); + memcpy(bufusage, &instr->stack->bufusage, sizeof(BufferUsage)); memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); + memcpy(walusage, &instr->stack->walusage, sizeof(WalUsage)); } -/* accumulate work done by workers in leader's stats */ +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If its active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) { + BufferUsageAdd(&CurrentInstrStack->bufusage, bufusage); + WalUsageAdd(&CurrentInstrStack->walusage, walusage); + BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } +void +InstrStackAdd(InstrStack *dst, InstrStack *add) +{ + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); +} + /* dst += add */ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index d46ba59895d62..4406be9ed7b86 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 16688b5fdb42e..7b3e9a21733e3 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -67,12 +68,33 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack entry for current WAL/buffer usage */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; + + /* + * Tracking of instr stacks that need to be cleaned up on abort that are + * not registered as a resource owner themselves + */ + slist_head unfinalized_children; /* head of unfinalized children list */ + slist_node unfinalized_node; /* node in parent's unfinalized list */ +} InstrStack; + /* * General purpose instrumentation that can capture time and WAL/buffer usage * * Initialized through InstrAlloc, followed by one or more calls to a pair of * InstrStart/InstrStop (activity is measured inbetween). + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of top level transaction between InstrStart/InstrStop + * calls in regular execution. If this is needed, directly use InstrPushStack / + * InstrPopStack in a PG_TRY/PG_FINALLY block instead. */ +struct ResourceOwnerData; typedef struct Instrumentation { /* Parameters set at creation: */ @@ -81,12 +103,10 @@ typedef struct Instrumentation bool need_walusage; /* true if we need WAL usage data */ /* Internal state keeping: */ instr_time starttime; /* start time of last InstrStart */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics: */ instr_time total; /* total runtime */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack *stack; /* stack tracking buffer/WAL usage */ + struct ResourceOwnerData *owner; } Instrumentation; /* Trigger instrumentation */ @@ -99,6 +119,11 @@ typedef struct TriggerInstrumentation /* * Specialized instrumentation for per-node execution statistics + * + * Requires use of an outer InstrStart/InstrStop to handle the stack used for + * WAL/buffer usage statistics, and relies on it for managing aborts. Solely + * intended for the executor and anyone reporting about its activities (e.g. + * EXPLAIN ANALYZE). */ typedef struct NodeInstrumentation { @@ -113,8 +138,6 @@ typedef struct NodeInstrumentation instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ instr_time total; /* total time */ @@ -123,10 +146,13 @@ typedef struct NodeInstrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ + InstrStack stack; /* stack tracking buffer/WAL usage */ } NodeInstrumentation; +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ @@ -136,9 +162,68 @@ typedef struct WorkerNodeInstrumentation extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; +/* + * The top instrumentation stack represents a running total of the current + * backend WAL/buffer usage information. This will not be updated immediately, + * but rather when the current stack entry gets accumulated which typically + * happens at query end (see CurrentInstrStack below). + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT InstrStack TopInstrStack; + +/* + * The currently active stack entry that is getting updated as activity + * happens, and will be accumulated to parent stacks when it gets finalized + * by InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStack *CurrentInstrStack; + +extern void InstrStackAdd(InstrStack *dst, InstrStack *add); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * stack entry. + * + * Any caller using this directly must manage the passed in stack and call + * InstrPopStack on its own again, typically by using a PG_FINALLY block to + * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart + * instead when you want automatic handling of abort cases using the resource + * owner infrastructure. + */ +static inline void +InstrPushStack(InstrStack *stack) +{ + stack->previous = CurrentInstrStack; + CurrentInstrStack = stack; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, instead call InstrPopAndFinalizeStack which can skip intermediate + * stack entries, or instead use InstrStart/InstrStop. + */ +static inline void +InstrPopStack(InstrStack *stack) +{ + Assert(stack != NULL); + CurrentInstrStack = stack->previous; +} + +extern void InstrPopAndFinalizeStack(InstrStack *stack); + extern Instrumentation *InstrAlloc(int instrument_options); extern void InstrStart(Instrumentation *instr); -extern void InstrStop(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr, bool finalize); extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); extern void InstrStartTrigger(TriggerInstrumentation *tginstr); @@ -147,14 +232,16 @@ extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); extern NodeInstrumentation *InstrAllocNode(int instrument_options, bool async_mode); extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrRememberNodeStack(InstrStack *parent, InstrStack *instr); extern void InstrStartNode(NodeInstrumentation *instr); extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent); extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); +pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); extern void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub); @@ -163,21 +250,28 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, #define INSTR_BUFUSAGE_INCR(fld) do { \ pgBufferUsage.fld++; \ + CurrentInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ pgBufferUsage.fld += val; \ + CurrentInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ INSTR_TIME_ADD(pgBufferUsage.fld, val); \ + INSTR_TIME_ADD(CurrentInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ + INSTR_TIME_ACCUM_DIFF(CurrentInstrStack->bufusage.fld, endval, startval); \ } while (0) + #define INSTR_WALUSAGE_INCR(fld) do { \ pgWalUsage.fld++; \ + CurrentInstrStack->walusage.fld++; \ } while(0) #define INSTR_WALUSAGE_ADD(fld,val) do { \ pgWalUsage.fld += val; \ + CurrentInstrStack->walusage.fld += val; \ } while(0) #endif /* INSTRUMENT_H */ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb65..5463bc921f06e 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index e1075c709f538..b3146b0a1651e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1319,6 +1319,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStack Instrumentation Int128AggState Int8TransTypeData From c452122c00d677d32752b2225b4bc3db51768462 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 9 Sep 2025 02:26:56 -0700 Subject: [PATCH 6/8] Convert remaining users of pgBufferUsage to use InstrStart/InstrStop, drop the global --- src/backend/access/heap/vacuumlazy.c | 15 ++++++------ src/backend/commands/analyze.c | 31 ++++++++++++------------ src/backend/commands/explain.c | 26 +++++++-------------- src/backend/commands/explain_dr.c | 31 +++++++++++++----------- src/backend/commands/prepare.c | 27 ++++++++------------- src/backend/executor/instrument.c | 35 +--------------------------- src/include/executor/instrument.h | 8 +------ 7 files changed, 60 insertions(+), 113 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4be267ff657e1..51f86ff1ef7bf 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -638,8 +638,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + Instrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -655,6 +654,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -980,14 +981,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr, true); + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -996,12 +997,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufferusage = instr->stack->bufusage; + WalUsage walusage = instr->stack->walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); total_blks_hit = bufferusage.shared_blks_hit + bufferusage.local_blks_hit; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index a483424152c5f..47cac75bc20b5 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + Instrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } pg_rusage_init(&ru0); + + instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr, true); + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double write_rate = 0; char *msgfmt; @@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->stack->bufusage; + WalUsage walusage = instr->stack->walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 17405aa862103..b35aea246cf5d 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrAlloc(INSTRUMENT_TIMER); + if (es->memory) { /* @@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(instr, true); if (es->memory) { @@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->total, (es->buffers ? &instr->stack->bufusage : NULL), es->memory ? &mem_counters : NULL); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 3c96061cf32ab..9e4cd8ad7d083 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + Instrumentation *instr = NULL; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (myState->es->timing || myState->es->buffers) + { + InstrumentOption instrument_options = 0; + + if (myState->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (myState->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + instr = InstrAlloc(instrument_options); + InstrStart(instr); + } /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +191,16 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); + if (myState->es->timing || myState->es->buffers) + InstrStop(instr, true); + /* Update timing data */ if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } + INSTR_TIME_ADD(myState->metrics.timeSpent, instr->total); /* Update buffer metrics */ if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - &pgBufferUsage, - &instr_start); + BufferUsageAdd(&myState->metrics.bufferUsage, &instr->stack->bufusage); return true; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 5b86a727587b2..2a1a6d1cf2e6d 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -578,13 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + int instrument_options = INSTRUMENT_TIMER; + + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + instr = InstrAlloc(instrument_options); if (es->memory) { @@ -596,9 +599,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -633,8 +634,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(instr, true); if (es->memory) { @@ -642,13 +642,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -658,7 +651,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->total, (es->buffers ? &instr->stack->bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 1223a59879f73..fc7ac85434b2a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -19,12 +19,10 @@ #include "utils/memutils.h" #include "utils/resowner.h" -BufferUsage pgBufferUsage; WalUsage pgWalUsage; InstrStack TopInstrStack; InstrStack *CurrentInstrStack = &TopInstrStack; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); static void InstrFinalizeNodesOnAbort(InstrStack *stack); @@ -547,7 +545,6 @@ InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) BufferUsageAdd(&CurrentInstrStack->bufusage, bufusage); WalUsageAdd(&CurrentInstrStack->walusage, walusage); - BufferUsageAdd(&pgBufferUsage, bufusage); WalUsageAdd(&pgWalUsage, walusage); } @@ -562,7 +559,7 @@ InstrStackAdd(InstrStack *dst, InstrStack *add) } /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -583,36 +580,6 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ -void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - /* helper functions for WAL usage accumulation */ static void WalUsageAdd(WalUsage *dst, WalUsage *add) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 7b3e9a21733e3..7a5d0597cd5b9 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -159,7 +159,6 @@ typedef struct WorkerNodeInstrumentation NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; } WorkerNodeInstrumentation; -extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; /* @@ -243,25 +242,20 @@ extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); #define INSTR_BUFUSAGE_INCR(fld) do { \ - pgBufferUsage.fld++; \ CurrentInstrStack->bufusage.fld++; \ } while(0) #define INSTR_BUFUSAGE_ADD(fld,val) do { \ - pgBufferUsage.fld += val; \ CurrentInstrStack->bufusage.fld += val; \ } while(0) #define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ - INSTR_TIME_ADD(pgBufferUsage.fld, val); \ INSTR_TIME_ADD(CurrentInstrStack->bufusage.fld, val); \ } while (0) #define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \ INSTR_TIME_ACCUM_DIFF(CurrentInstrStack->bufusage.fld, endval, startval); \ } while (0) From f47ac425817f6b59f3b68384e7c8640070241bab Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Mon, 12 Jan 2026 18:23:03 -0800 Subject: [PATCH 7/8] Index scans: Show table buffer accesses separately in EXPLAIN ANALYZE This sets up a separate instrumentation stack that is used whilst an Index Scan does scanning on the table, for example due to additional data being needed. EXPLAIN ANALYZE will now show "Table Buffers" that represent such activity. The activity is also included in regular "Buffers" together with index activity and that of any child nodes. Author: Lukas Fittl Suggested-by: Andres Freund Reviewed-by: Discussion: --- doc/src/sgml/perform.sgml | 13 ++++-- doc/src/sgml/ref/explain.sgml | 1 + src/backend/commands/explain.c | 55 +++++++++++++++----------- src/backend/executor/execProcnode.c | 28 +++++++++++++ src/backend/executor/instrument.c | 53 +++++++++++++++++++++++++ src/backend/executor/nodeIndexscan.c | 23 ++++++++++- src/include/executor/instrument.h | 5 +++ src/include/executor/instrument_node.h | 5 +++ src/include/nodes/execnodes.h | 7 ++++ 9 files changed, 162 insertions(+), 28 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5f6f1db0467e9..9219625faf6e3 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 7dee77fd366b0..912c96f2ff51a 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index b35aea246cf5d..054c5bce837f6 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1962,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTableStack->bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2280,7 +2283,7 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->stack.bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage, NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->stack.walusage); @@ -2299,7 +2302,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->stack.bufusage); + show_buffer_usage(es, &instrument->stack.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); @@ -4099,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4124,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4179,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4220,44 +4227,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { - ExplainPropertyInteger("Shared Hit Blocks", NULL, + char *prefix = title ? psprintf("%s ", title) : pstrdup(""); + + ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL, usage->shared_blks_hit, es); - ExplainPropertyInteger("Shared Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL, usage->shared_blks_read, es); - ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL, usage->shared_blks_dirtied, es); - ExplainPropertyInteger("Shared Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL, usage->shared_blks_written, es); - ExplainPropertyInteger("Local Hit Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL, usage->local_blks_hit, es); - ExplainPropertyInteger("Local Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL, usage->local_blks_read, es); - ExplainPropertyInteger("Local Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL, usage->local_blks_dirtied, es); - ExplainPropertyInteger("Local Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL, usage->local_blks_written, es); - ExplainPropertyInteger("Temp Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL, usage->temp_blks_read, es); - ExplainPropertyInteger("Temp Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL, usage->temp_blks_written, es); if (track_io_timing) { - ExplainPropertyFloat("Shared I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); - ExplainPropertyFloat("Shared I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time), 3, es); - ExplainPropertyFloat("Local I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time), 3, es); - ExplainPropertyFloat("Local I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time), 3, es); - ExplainPropertyFloat("Temp I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time), 3, es); - ExplainPropertyFloat("Temp I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index e2e21c66dc9ae..c3ff03f805bb0 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -415,9 +415,19 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) + { result->instrument = InstrAllocNode(estate->es_instrument, result->async_capable); + /* IndexScan tracks table access separately from index access. */ + if (IsA(result, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, result); + + iss->iss_InstrumentTableStack = InstrAllocAdditionalNodeStack(result->instrument); + } + } + return result; } @@ -859,6 +869,15 @@ ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) node->instrument->need_walusage)) { InstrRememberNodeStack(parent, &node->instrument->stack); + + /* IndexScan has a separate stack to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTableStack) + InstrRememberNodeStack(parent, iss->iss_InstrumentTableStack); + } } return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); @@ -902,6 +921,15 @@ ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) if (!node->instrument) return false; + /* IndexScan has a separate stack to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTableStack) + iss->iss_InstrumentTableStack = InstrFinalizeAdditionalNodeStack(iss->iss_InstrumentTableStack, node->instrument); + } + node->instrument = InstrFinalizeNode(node->instrument, parent); return false; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index fc7ac85434b2a..0c2d769416e91 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -506,6 +506,59 @@ InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); } +/* + * Allocate an additional InstrStack for a node, e.g. for tracking table + * buffer usage separately from index buffer usage. Allocated in + * TopTransactionContext so it survives long enough for abort recovery. + */ +InstrStack * +InstrAllocAdditionalNodeStack(NodeInstrumentation *instr) +{ + if (instr->need_bufusage || instr->need_walusage) + return MemoryContextAllocZero(TopTransactionContext, sizeof(InstrStack)); + + return NULL; +} + +void +InstrStartNodeStack(NodeInstrumentation *instr, InstrStack *stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure the executor set up a parent node below the top level stack */ + Assert(CurrentInstrStack != &TopInstrStack); + + InstrPushStack(stack); + } +} + +void +InstrStopNodeStack(NodeInstrumentation *instr, InstrStack *stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Adding to parent is handled by InstrFinalizeAdditionalNodeStack */ + InstrPopStack(stack); + } +} + +/* Add additional node stacks to the parent and move into per-query memory context */ +InstrStack * +InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr) +{ + InstrStack *dst = palloc(sizeof(InstrStack)); + + memcpy(dst, stack, sizeof(InstrStack)); + pfree(stack); + + /* Avoid stale pointer references */ + dst->previous = NULL; + + InstrStackAdd(&instr->stack, dst); + + return dst; +} + /* start instrumentation during parallel executor startup */ Instrumentation * InstrStartParallelQuery(void) diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index a616abff04cf0..fb62ff7a8be7a 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,6 +83,7 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; /* @@ -128,8 +129,20 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->ss.ps.instrument) + InstrStartNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack); + + if (unlikely(!index_fetch_heap(scandesc, slot))) + continue; + + if (node->ss.ps.instrument) + InstrStopNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack); + + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + CHECK_FOR_INTERRUPTS(); /* @@ -812,6 +825,8 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument.nsearches; + if (node->iss_InstrumentTableStack) + InstrStackAdd(&winstrument->worker_table_stack, node->iss_InstrumentTableStack); } /* @@ -1819,4 +1834,10 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's stack */ + if (node->iss_InstrumentTableStack) + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + InstrStackAdd(node->iss_InstrumentTableStack, + &node->iss_SharedInfo->winstrument[i].worker_table_stack); } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 7a5d0597cd5b9..8fb6c911fcb84 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -239,6 +239,11 @@ extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); extern void InstrEndLoop(NodeInstrumentation *instr); extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); +extern InstrStack *InstrAllocAdditionalNodeStack(NodeInstrumentation *instr); +extern void InstrStartNodeStack(NodeInstrumentation *dst, InstrStack *stack); +extern void InstrStopNodeStack(NodeInstrumentation *dst, InstrStack *stack); +extern InstrStack *InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr); + pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa57..dd45d72559dd7 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Used for passing iss_InstrumentTableStack data from parallel workers */ + InstrStack worker_table_stack; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 605c7a6cc3945..6a6427afbd2f7 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1728,6 +1728,13 @@ typedef struct IndexScanState IndexScanInstrumentation iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + /* + * Instrumentation stack utilized for tracking table access. This is + * separate from iss_Instrument since it needs to be allocated in the + * right context and IndexScanInstrumentation shouldn't contain pointers. + */ + InstrStack *iss_InstrumentTableStack; + /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; bool iss_ReachedEnd; From c415884d23c5741d6207aa021b6d0c23fae7a5be Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sun, 22 Feb 2026 16:12:55 -0800 Subject: [PATCH 8/8] Add pg_session_buffer_usage contrib module This is intended for testing instrumentation related logic as it pertains to the top level stack that is maintained as a running total. There is currently no in-core user that utilizes the top-level values in this manner, and especially during abort situations this helps ensure we don't lose activity due to incorrect handling of unfinalized node stacks. --- contrib/meson.build | 1 + contrib/pg_session_buffer_usage/Makefile | 23 ++ .../expected/pg_session_buffer_usage.out | 283 ++++++++++++++++++ contrib/pg_session_buffer_usage/meson.build | 34 +++ .../pg_session_buffer_usage--1.0.sql | 31 ++ .../pg_session_buffer_usage.c | 96 ++++++ .../pg_session_buffer_usage.control | 5 + .../sql/pg_session_buffer_usage.sql | 204 +++++++++++++ 8 files changed, 677 insertions(+) create mode 100644 contrib/pg_session_buffer_usage/Makefile create mode 100644 contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out create mode 100644 contrib/pg_session_buffer_usage/meson.build create mode 100644 contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql create mode 100644 contrib/pg_session_buffer_usage/pg_session_buffer_usage.c create mode 100644 contrib/pg_session_buffer_usage/pg_session_buffer_usage.control create mode 100644 contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql diff --git a/contrib/meson.build b/contrib/meson.build index def13257cbead..cab1b21167895 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -50,6 +50,7 @@ subdir('pg_logicalinspect') subdir('pg_overexplain') subdir('pg_prewarm') subdir('pgrowlocks') +subdir('pg_session_buffer_usage') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') diff --git a/contrib/pg_session_buffer_usage/Makefile b/contrib/pg_session_buffer_usage/Makefile new file mode 100644 index 0000000000000..75bd8e09b3d3d --- /dev/null +++ b/contrib/pg_session_buffer_usage/Makefile @@ -0,0 +1,23 @@ +# contrib/pg_session_buffer_usage/Makefile + +MODULE_big = pg_session_buffer_usage +OBJS = \ + $(WIN32RES) \ + pg_session_buffer_usage.o + +EXTENSION = pg_session_buffer_usage +DATA = pg_session_buffer_usage--1.0.sql +PGFILEDESC = "pg_session_buffer_usage - show buffer usage statistics for the current session" + +REGRESS = pg_session_buffer_usage + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_session_buffer_usage +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out new file mode 100644 index 0000000000000..242b400395043 --- /dev/null +++ b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out @@ -0,0 +1,283 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + ok +---- + t +(1 row) + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + count +------- + 1000 +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + blocks_increased +------------------ + t +(1 row) + +DROP TABLE test_buf_activity; +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + leader_buffers_match +---------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_dc_tab, dc_serial_result; +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; +ERROR: division by zero +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + exception_buffers_visible +--------------------------- + t +(1 row) + +DROP TABLE exc_tab; +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +INSERT INTO unique_tab VALUES (1, 'duplicate'); +ERROR: duplicate key value violates unique constraint "unique_tab_a_key" +DETAIL: Key (a)=(1) already exists. +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + unique_violation_buffers_visible +---------------------------------- + t +(1 row) + +DROP TABLE unique_tab; +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT subxact_exc_func(); + subxact_exc_func +------------------ + caught +(1 row) + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + subxact_buffers_visible +------------------------- + t +(1 row) + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT cursor_exc_func(); + cursor_exc_func +----------------------- + caught after 250 rows +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + cursor_subxact_buffers_visible +-------------------------------- + t +(1 row) + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CONTEXT: parallel worker +RESET debug_parallel_query; +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + worker_abort_buffers_not_propagated +------------------------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_abort_tab, par_abort_serial_result; +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_session_buffer_usage/meson.build b/contrib/pg_session_buffer_usage/meson.build new file mode 100644 index 0000000000000..34c7502beb463 --- /dev/null +++ b/contrib/pg_session_buffer_usage/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +pg_session_buffer_usage_sources = files( + 'pg_session_buffer_usage.c', +) + +if host_system == 'windows' + pg_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_session_buffer_usage', + '--FILEDESC', 'pg_session_buffer_usage - show buffer usage statistics for the current session',]) +endif + +pg_session_buffer_usage = shared_module('pg_session_buffer_usage', + pg_session_buffer_usage_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_session_buffer_usage + +install_data( + 'pg_session_buffer_usage--1.0.sql', + 'pg_session_buffer_usage.control', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_session_buffer_usage', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_session_buffer_usage', + ], + }, +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql new file mode 100644 index 0000000000000..b300fdbc64397 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql @@ -0,0 +1,31 @@ +/* contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_session_buffer_usage" to load this file. \quit + +CREATE FUNCTION pg_session_buffer_usage( + OUT shared_blks_hit bigint, + OUT shared_blks_read bigint, + OUT shared_blks_dirtied bigint, + OUT shared_blks_written bigint, + OUT local_blks_hit bigint, + OUT local_blks_read bigint, + OUT local_blks_dirtied bigint, + OUT local_blks_written bigint, + OUT temp_blks_read bigint, + OUT temp_blks_written bigint, + OUT shared_blk_read_time double precision, + OUT shared_blk_write_time double precision, + OUT local_blk_read_time double precision, + OUT local_blk_write_time double precision, + OUT temp_blk_read_time double precision, + OUT temp_blk_write_time double precision +) +RETURNS record +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage' +LANGUAGE C PARALLEL RESTRICTED; + +CREATE FUNCTION pg_session_buffer_usage_reset() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage_reset' +LANGUAGE C PARALLEL RESTRICTED; diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c new file mode 100644 index 0000000000000..20ed3c11ebda8 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * pg_session_buffer_usage.c + * show buffer usage statistics for the current session + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * contrib/pg_session_buffer_usage/pg_session_buffer_usage.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "libpq/auth.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +PG_MODULE_MAGIC_EXT( + .name = "pg_session_buffer_usage", + .version = PG_VERSION +); + +#define NUM_BUFFER_USAGE_COLUMNS 16 + +PG_FUNCTION_INFO_V1(pg_session_buffer_usage); +PG_FUNCTION_INFO_V1(pg_session_buffer_usage_reset); + +#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack + * change */ + +/* + * SQL function: pg_session_buffer_usage() + * + * Returns a single row with all BufferUsage counters accumulated since the + * start of the session. Excludes any usage not yet added to the top of the + * stack (e.g. if this gets called inside a statement that also had buffer + * activity). + */ +Datum +pg_session_buffer_usage(PG_FUNCTION_ARGS) +{ + TupleDesc tupdesc; + Datum values[NUM_BUFFER_USAGE_COLUMNS]; + bool nulls[NUM_BUFFER_USAGE_COLUMNS]; + BufferUsage *usage; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + memset(nulls, 0, sizeof(nulls)); + +#if HAVE_INSTR_STACK + usage = &TopInstrStack.bufusage; +#else + usage = &pgBufferUsage; +#endif + + values[0] = Int64GetDatum(usage->shared_blks_hit); + values[1] = Int64GetDatum(usage->shared_blks_read); + values[2] = Int64GetDatum(usage->shared_blks_dirtied); + values[3] = Int64GetDatum(usage->shared_blks_written); + values[4] = Int64GetDatum(usage->local_blks_hit); + values[5] = Int64GetDatum(usage->local_blks_read); + values[6] = Int64GetDatum(usage->local_blks_dirtied); + values[7] = Int64GetDatum(usage->local_blks_written); + values[8] = Int64GetDatum(usage->temp_blks_read); + values[9] = Int64GetDatum(usage->temp_blks_written); + values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time)); + values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time)); + values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time)); + values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time)); + values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time)); + values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time)); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * SQL function: pg_session_buffer_usage_reset() + * + * Resets all BufferUsage counters on the top instrumentation stack to zero. + * Useful in tests to avoid the baseline/delta pattern. + */ +Datum +pg_session_buffer_usage_reset(PG_FUNCTION_ARGS) +{ +#if HAVE_INSTR_STACK + memset(&TopInstrStack.bufusage, 0, sizeof(BufferUsage)); +#else + memset(&pgBufferUsage, 0, sizeof(BufferUsage)); +#endif + + PG_RETURN_VOID(); +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control new file mode 100644 index 0000000000000..fabd05ee02458 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control @@ -0,0 +1,5 @@ +# pg_session_buffer_usage extension +comment = 'show buffer usage statistics for the current session' +default_version = '1.0' +module_pathname = '$libdir/pg_session_buffer_usage' +relocatable = true diff --git a/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql new file mode 100644 index 0000000000000..8f5810fadd337 --- /dev/null +++ b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql @@ -0,0 +1,204 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; + +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + +DROP TABLE test_buf_activity; + +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +SELECT count(*) FROM par_dc_tab; + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_dc_tab, dc_serial_result; + +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- + +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); + +SELECT pg_session_buffer_usage_reset(); + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; + +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE exc_tab; + +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); + +SELECT pg_session_buffer_usage_reset(); +INSERT INTO unique_tab VALUES (1, 'duplicate'); + +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE unique_tab; + +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); + +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT subxact_exc_func(); + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; + +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT cursor_exc_func(); + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; + +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +RESET debug_parallel_query; + +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_abort_tab, par_abort_serial_result; + +-- Cleanup +DROP EXTENSION pg_session_buffer_usage;