diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index e856cd35a6f0f..39bf2543b701d 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -315,7 +315,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL);
MemoryContextSwitchTo(oldcxt);
}
}
@@ -381,12 +381,6 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
*/
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
/* Log plan if duration is exceeded. */
msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total);
if (msec >= auto_explain_log_min_duration)
diff --git a/contrib/meson.build b/contrib/meson.build
index def13257cbead..cab1b21167895 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -50,6 +50,7 @@ subdir('pg_logicalinspect')
subdir('pg_overexplain')
subdir('pg_prewarm')
subdir('pgrowlocks')
+subdir('pg_session_buffer_usage')
subdir('pg_stat_statements')
subdir('pgstattuple')
subdir('pg_surgery')
diff --git a/contrib/pg_session_buffer_usage/Makefile b/contrib/pg_session_buffer_usage/Makefile
new file mode 100644
index 0000000000000..75bd8e09b3d3d
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/Makefile
@@ -0,0 +1,23 @@
+# contrib/pg_session_buffer_usage/Makefile
+
+MODULE_big = pg_session_buffer_usage
+OBJS = \
+ $(WIN32RES) \
+ pg_session_buffer_usage.o
+
+EXTENSION = pg_session_buffer_usage
+DATA = pg_session_buffer_usage--1.0.sql
+PGFILEDESC = "pg_session_buffer_usage - show buffer usage statistics for the current session"
+
+REGRESS = pg_session_buffer_usage
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_session_buffer_usage
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out
new file mode 100644
index 0000000000000..242b400395043
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out
@@ -0,0 +1,283 @@
+LOAD 'pg_session_buffer_usage';
+CREATE EXTENSION pg_session_buffer_usage;
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+ ok
+----
+ t
+(1 row)
+
+-- Verify counters increase after buffer activity
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+ count
+-------
+ 1000
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM pg_session_buffer_usage();
+ blocks_increased
+------------------
+ t
+(1 row)
+
+DROP TABLE test_buf_activity;
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT count(*) FROM par_dc_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM pg_session_buffer_usage(), dc_serial_result s;
+ leader_buffers_match
+----------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_dc_tab, dc_serial_result;
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+ERROR: division by zero
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM pg_session_buffer_usage();
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+DROP TABLE exc_tab;
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+ERROR: duplicate key value violates unique constraint "unique_tab_a_key"
+DETAIL: Key (a)=(1) already exists.
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM pg_session_buffer_usage();
+ unique_violation_buffers_visible
+----------------------------------
+ t
+(1 row)
+
+DROP TABLE unique_tab;
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT subxact_exc_func();
+ subxact_exc_func
+------------------
+ caught
+(1 row)
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM pg_session_buffer_usage();
+ subxact_buffers_visible
+-------------------------
+ t
+(1 row)
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT cursor_exc_func();
+ cursor_exc_func
+-----------------------
+ caught after 250 rows
+(1 row)
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM pg_session_buffer_usage();
+ cursor_subxact_buffers_visible
+--------------------------------
+ t
+(1 row)
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+ count
+-------
+ 5000
+(1 row)
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+SELECT pg_session_buffer_usage_reset();
+ pg_session_buffer_usage_reset
+-------------------------------
+
+(1 row)
+
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+CONTEXT: parallel worker
+RESET debug_parallel_query;
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM pg_session_buffer_usage(), par_abort_serial_result s;
+ worker_abort_buffers_not_propagated
+-------------------------------------
+ t
+(1 row)
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+DROP TABLE par_abort_tab, par_abort_serial_result;
+-- Cleanup
+DROP EXTENSION pg_session_buffer_usage;
diff --git a/contrib/pg_session_buffer_usage/meson.build b/contrib/pg_session_buffer_usage/meson.build
new file mode 100644
index 0000000000000..34c7502beb463
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/meson.build
@@ -0,0 +1,34 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+pg_session_buffer_usage_sources = files(
+ 'pg_session_buffer_usage.c',
+)
+
+if host_system == 'windows'
+ pg_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_session_buffer_usage',
+ '--FILEDESC', 'pg_session_buffer_usage - show buffer usage statistics for the current session',])
+endif
+
+pg_session_buffer_usage = shared_module('pg_session_buffer_usage',
+ pg_session_buffer_usage_sources,
+ kwargs: contrib_mod_args,
+)
+contrib_targets += pg_session_buffer_usage
+
+install_data(
+ 'pg_session_buffer_usage--1.0.sql',
+ 'pg_session_buffer_usage.control',
+ kwargs: contrib_data_args,
+)
+
+tests += {
+ 'name': 'pg_session_buffer_usage',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'pg_session_buffer_usage',
+ ],
+ },
+}
diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql
new file mode 100644
index 0000000000000..b300fdbc64397
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql
@@ -0,0 +1,31 @@
+/* contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_session_buffer_usage" to load this file. \quit
+
+CREATE FUNCTION pg_session_buffer_usage(
+ OUT shared_blks_hit bigint,
+ OUT shared_blks_read bigint,
+ OUT shared_blks_dirtied bigint,
+ OUT shared_blks_written bigint,
+ OUT local_blks_hit bigint,
+ OUT local_blks_read bigint,
+ OUT local_blks_dirtied bigint,
+ OUT local_blks_written bigint,
+ OUT temp_blks_read bigint,
+ OUT temp_blks_written bigint,
+ OUT shared_blk_read_time double precision,
+ OUT shared_blk_write_time double precision,
+ OUT local_blk_read_time double precision,
+ OUT local_blk_write_time double precision,
+ OUT temp_blk_read_time double precision,
+ OUT temp_blk_write_time double precision
+)
+RETURNS record
+AS 'MODULE_PATHNAME', 'pg_session_buffer_usage'
+LANGUAGE C PARALLEL RESTRICTED;
+
+CREATE FUNCTION pg_session_buffer_usage_reset()
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_session_buffer_usage_reset'
+LANGUAGE C PARALLEL RESTRICTED;
diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c
new file mode 100644
index 0000000000000..20ed3c11ebda8
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c
@@ -0,0 +1,96 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_session_buffer_usage.c
+ * show buffer usage statistics for the current session
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * contrib/pg_session_buffer_usage/pg_session_buffer_usage.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/instrument.h"
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC_EXT(
+ .name = "pg_session_buffer_usage",
+ .version = PG_VERSION
+);
+
+#define NUM_BUFFER_USAGE_COLUMNS 16
+
+PG_FUNCTION_INFO_V1(pg_session_buffer_usage);
+PG_FUNCTION_INFO_V1(pg_session_buffer_usage_reset);
+
+#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack
+ * change */
+
+/*
+ * SQL function: pg_session_buffer_usage()
+ *
+ * Returns a single row with all BufferUsage counters accumulated since the
+ * start of the session. Excludes any usage not yet added to the top of the
+ * stack (e.g. if this gets called inside a statement that also had buffer
+ * activity).
+ */
+Datum
+pg_session_buffer_usage(PG_FUNCTION_ARGS)
+{
+ TupleDesc tupdesc;
+ Datum values[NUM_BUFFER_USAGE_COLUMNS];
+ bool nulls[NUM_BUFFER_USAGE_COLUMNS];
+ BufferUsage *usage;
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ memset(nulls, 0, sizeof(nulls));
+
+#if HAVE_INSTR_STACK
+ usage = &TopInstrStack.bufusage;
+#else
+ usage = &pgBufferUsage;
+#endif
+
+ values[0] = Int64GetDatum(usage->shared_blks_hit);
+ values[1] = Int64GetDatum(usage->shared_blks_read);
+ values[2] = Int64GetDatum(usage->shared_blks_dirtied);
+ values[3] = Int64GetDatum(usage->shared_blks_written);
+ values[4] = Int64GetDatum(usage->local_blks_hit);
+ values[5] = Int64GetDatum(usage->local_blks_read);
+ values[6] = Int64GetDatum(usage->local_blks_dirtied);
+ values[7] = Int64GetDatum(usage->local_blks_written);
+ values[8] = Int64GetDatum(usage->temp_blks_read);
+ values[9] = Int64GetDatum(usage->temp_blks_written);
+ values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time));
+ values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time));
+ values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time));
+ values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time));
+ values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+ values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+/*
+ * SQL function: pg_session_buffer_usage_reset()
+ *
+ * Resets all BufferUsage counters on the top instrumentation stack to zero.
+ * Useful in tests to avoid the baseline/delta pattern.
+ */
+Datum
+pg_session_buffer_usage_reset(PG_FUNCTION_ARGS)
+{
+#if HAVE_INSTR_STACK
+ memset(&TopInstrStack.bufusage, 0, sizeof(BufferUsage));
+#else
+ memset(&pgBufferUsage, 0, sizeof(BufferUsage));
+#endif
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control
new file mode 100644
index 0000000000000..fabd05ee02458
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control
@@ -0,0 +1,5 @@
+# pg_session_buffer_usage extension
+comment = 'show buffer usage statistics for the current session'
+default_version = '1.0'
+module_pathname = '$libdir/pg_session_buffer_usage'
+relocatable = true
diff --git a/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql
new file mode 100644
index 0000000000000..8f5810fadd337
--- /dev/null
+++ b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql
@@ -0,0 +1,204 @@
+LOAD 'pg_session_buffer_usage';
+CREATE EXTENSION pg_session_buffer_usage;
+
+-- Verify all columns are non-negative
+SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage()
+WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0
+ AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0
+ AND local_blks_hit >= 0 AND local_blks_read >= 0
+ AND local_blks_dirtied >= 0 AND local_blks_written >= 0
+ AND temp_blks_read >= 0 AND temp_blks_written >= 0
+ AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0
+ AND local_blk_read_time >= 0 AND local_blk_write_time >= 0
+ AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0;
+
+-- Verify counters increase after buffer activity
+SELECT pg_session_buffer_usage_reset();
+
+CREATE TEMP TABLE test_buf_activity (id int, data text);
+INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i;
+SELECT count(*) FROM test_buf_activity;
+
+SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased
+FROM pg_session_buffer_usage();
+
+DROP TABLE test_buf_activity;
+
+-- Parallel query test
+CREATE TABLE par_dc_tab (a int, b char(200));
+INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+SELECT count(*) FROM par_dc_tab;
+
+-- Measure serial scan delta (leader does all the work)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+CREATE TEMP TABLE dc_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+
+-- Measure parallel scan delta with leader NOT participating in scanning.
+-- Workers do all table scanning; leader only runs the Gather node.
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT count(*) FROM par_dc_tab;
+
+-- Confirm we got a similar hit counter through parallel worker accumulation
+SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2
+ AS leader_buffers_match
+FROM pg_session_buffer_usage(), dc_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_dc_tab, dc_serial_result;
+
+--
+-- Abort/exception tests: verify buffer usage survives various error paths.
+--
+
+-- Rolled-back divide-by-zero under EXPLAIN ANALYZE
+CREATE TEMP TABLE exc_tab (a int, b char(20));
+
+SELECT pg_session_buffer_usage_reset();
+
+EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 FROM ins;
+
+SELECT local_blks_dirtied > 0 AS exception_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP TABLE exc_tab;
+
+-- Unique constraint violation in regular query
+CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20));
+INSERT INTO unique_tab VALUES (1, 'first');
+
+SELECT pg_session_buffer_usage_reset();
+INSERT INTO unique_tab VALUES (1, 'duplicate');
+
+SELECT local_blks_hit > 0 AS unique_violation_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP TABLE unique_tab;
+
+-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION)
+CREATE TEMP TABLE subxact_tab (a int, b char(20));
+
+CREATE FUNCTION subxact_exc_func() RETURNS text AS $$
+BEGIN
+ BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF)
+ WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a)
+ SELECT a / 0 FROM ins';
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT subxact_exc_func();
+
+SELECT local_blks_dirtied > 0 AS subxact_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP FUNCTION subxact_exc_func;
+DROP TABLE subxact_tab;
+
+-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking
+CREATE TEMP TABLE cursor_tab (a int, b char(200));
+INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i;
+
+CREATE FUNCTION cursor_exc_func() RETURNS text AS $$
+DECLARE
+ rec record;
+ cnt int := 0;
+BEGIN
+ BEGIN
+ FOR rec IN SELECT * FROM cursor_tab LOOP
+ cnt := cnt + 1;
+ IF cnt = 250 THEN
+ PERFORM 1 / 0;
+ END IF;
+ END LOOP;
+ EXCEPTION WHEN division_by_zero THEN
+ RETURN 'caught after ' || cnt || ' rows';
+ END;
+ RETURN 'not reached';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT cursor_exc_func();
+
+SELECT local_blks_hit + local_blks_read > 0
+ AS cursor_subxact_buffers_visible
+FROM pg_session_buffer_usage();
+
+DROP FUNCTION cursor_exc_func;
+DROP TABLE cursor_tab;
+
+-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort.
+--
+-- When a parallel worker aborts, InstrEndParallelQuery and
+-- ExecParallelReportInstrumentation never run, so the worker's buffer
+-- activity is never written to shared memory, despite the information having been
+-- captured by the res owner release instrumentation handling.
+CREATE TABLE par_abort_tab (a int, b char(200));
+INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+-- Warm shared buffers so all reads become hits
+SELECT count(*) FROM par_abort_tab;
+
+-- Measure serial scan delta as a reference (leader reads all blocks)
+SET max_parallel_workers_per_gather = 0;
+
+SELECT pg_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+CREATE TABLE par_abort_serial_result AS
+SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage();
+
+-- Now force parallel with leader NOT participating in scanning
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET max_parallel_workers_per_gather = 2;
+SET parallel_leader_participation = off;
+SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently
+
+SELECT pg_session_buffer_usage_reset();
+SELECT b::int2 FROM par_abort_tab WHERE a > 1000;
+
+RESET debug_parallel_query;
+
+-- Workers scanned the table but aborted before reporting stats back.
+-- The leader's delta should be much less than a serial scan, documenting
+-- that worker buffer activity is lost on abort.
+SELECT shared_blks_hit < s.serial_delta / 2
+ AS worker_abort_buffers_not_propagated
+FROM pg_session_buffer_usage(), par_abort_serial_result s;
+
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+RESET min_parallel_table_scan_size;
+RESET max_parallel_workers_per_gather;
+RESET parallel_leader_participation;
+
+DROP TABLE par_abort_tab, par_abort_serial_result;
+
+-- Cleanup
+DROP EXTENSION pg_session_buffer_usage;
diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out
index e4d6564ea5b5a..cba487f6be582 100644
--- a/contrib/pg_stat_statements/expected/utility.out
+++ b/contrib/pg_stat_statements/expected/utility.out
@@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
(3 rows)
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT run_explain_buffers_test();
+ run_explain_buffers_test
+--------------------------
+
+(1 row)
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+ query | has_buffer_stats
+-----------------------------------+------------------
+ SELECT run_explain_buffers_test() | t
+(1 row)
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+CALL pgss_call_rollback_proc();
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------------+-------+---------------+---------------------+-----------------------
+ CALL pgss_call_rollback_proc() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out
index 977e382d84894..611213daef6c2 100644
--- a/contrib/pg_stat_statements/expected/wal.out
+++ b/contrib/pg_stat_statements/expected/wal.out
@@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
(1 row)
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
+SELECT pgss_error_func();
+ pgss_error_func
+-----------------
+
+(1 row)
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+ query | calls | local_hitread | wal_bytes_generated | wal_records_generated
+--------------------------+-------+---------------+---------------------+-----------------------
+ SELECT pgss_error_func() | 1 | t | t | t
+(1 row)
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+ t
+---
+ t
+(1 row)
+
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 4a427533bd88d..fe3237fc665fd 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -911,21 +911,13 @@ pgss_planner(Query *parse,
{
instr_time start;
instr_time duration;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ InstrStack stack = {0};
- /* We need to track buffer usage as the planner can access them. */
- bufusage_start = pgBufferUsage;
-
- /*
- * Similarly the planner could write some WAL records in some cases
- * (e.g. setting a hint bit with those being WAL-logged)
- */
- walusage_start = pgWalUsage;
INSTR_TIME_SET_CURRENT(start);
+ /* We need to track buffer/WAL usage as the planner can access them. */
+ InstrPushStack(&stack);
+
nesting_level++;
PG_TRY();
{
@@ -938,6 +930,7 @@ pgss_planner(Query *parse,
}
PG_FINALLY();
{
+ InstrPopAndFinalizeStack(&stack);
nesting_level--;
}
PG_END_TRY();
@@ -945,14 +938,6 @@ pgss_planner(Query *parse,
INSTR_TIME_SET_CURRENT(duration);
INSTR_TIME_SUBTRACT(duration, start);
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(query_string,
parse->queryId,
parse->stmt_location,
@@ -960,8 +945,8 @@ pgss_planner(Query *parse,
PGSS_PLAN,
INSTR_TIME_GET_MILLISEC(duration),
0,
- &bufusage,
- &walusage,
+ &stack.bufusage,
+ &stack.walusage,
NULL,
NULL,
0,
@@ -1023,7 +1008,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL);
MemoryContextSwitchTo(oldcxt);
}
}
@@ -1083,10 +1068,11 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
pgss_enabled(nesting_level))
{
/*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
+ * Check if stack is initialized - it is not when ExecutorRun wasn't
+ * called
*/
- InstrEndLoop(queryDesc->totaltime);
+ const BufferUsage *bufusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->bufusage : NULL;
+ const WalUsage *walusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->walusage : NULL;
pgss_store(queryDesc->sourceText,
queryId,
@@ -1095,8 +1081,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
PGSS_EXEC,
INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total),
queryDesc->estate->es_total_processed,
- &queryDesc->totaltime->bufusage,
- &queryDesc->totaltime->walusage,
+ bufusage,
+ walusage,
queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
NULL,
queryDesc->estate->es_parallel_workers_to_launch,
@@ -1163,14 +1149,10 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
instr_time start;
instr_time duration;
uint64 rows;
- BufferUsage bufusage_start,
- bufusage;
- WalUsage walusage_start,
- walusage;
+ InstrStack stack = {0};
- bufusage_start = pgBufferUsage;
- walusage_start = pgWalUsage;
INSTR_TIME_SET_CURRENT(start);
+ InstrPushStack(&stack);
nesting_level++;
PG_TRY();
@@ -1186,6 +1168,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
}
PG_FINALLY();
{
+ InstrPopAndFinalizeStack(&stack);
nesting_level--;
}
PG_END_TRY();
@@ -1214,14 +1197,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
qc->nprocessed : 0;
- /* calc differences of buffer counters. */
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
-
- /* calc differences of WAL counters. */
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
-
pgss_store(queryString,
saved_queryId,
saved_stmt_location,
@@ -1229,8 +1204,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
PGSS_EXEC,
INSTR_TIME_GET_MILLISEC(duration),
rows,
- &bufusage,
- &walusage,
+ &stack.bufusage,
+ &stack.walusage,
NULL,
NULL,
0,
@@ -1460,27 +1435,33 @@ pgss_store(const char *query, int64 queryId,
}
}
entry->counters.rows += rows;
- entry->counters.shared_blks_hit += bufusage->shared_blks_hit;
- entry->counters.shared_blks_read += bufusage->shared_blks_read;
- entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
- entry->counters.shared_blks_written += bufusage->shared_blks_written;
- entry->counters.local_blks_hit += bufusage->local_blks_hit;
- entry->counters.local_blks_read += bufusage->local_blks_read;
- entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
- entry->counters.local_blks_written += bufusage->local_blks_written;
- entry->counters.temp_blks_read += bufusage->temp_blks_read;
- entry->counters.temp_blks_written += bufusage->temp_blks_written;
- entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time);
- entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time);
- entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time);
- entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time);
- entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
- entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
+ if (bufusage)
+ {
+ entry->counters.shared_blks_hit += bufusage->shared_blks_hit;
+ entry->counters.shared_blks_read += bufusage->shared_blks_read;
+ entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
+ entry->counters.shared_blks_written += bufusage->shared_blks_written;
+ entry->counters.local_blks_hit += bufusage->local_blks_hit;
+ entry->counters.local_blks_read += bufusage->local_blks_read;
+ entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
+ entry->counters.local_blks_written += bufusage->local_blks_written;
+ entry->counters.temp_blks_read += bufusage->temp_blks_read;
+ entry->counters.temp_blks_written += bufusage->temp_blks_written;
+ entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time);
+ entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time);
+ entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time);
+ entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time);
+ entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
+ entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
+ }
entry->counters.usage += USAGE_EXEC(total_time);
- entry->counters.wal_records += walusage->wal_records;
- entry->counters.wal_fpi += walusage->wal_fpi;
- entry->counters.wal_bytes += walusage->wal_bytes;
- entry->counters.wal_buffers_full += walusage->wal_buffers_full;
+ if (walusage)
+ {
+ entry->counters.wal_records += walusage->wal_records;
+ entry->counters.wal_fpi += walusage->wal_fpi;
+ entry->counters.wal_bytes += walusage->wal_bytes;
+ entry->counters.wal_buffers_full += walusage->wal_buffers_full;
+ }
if (jitusage)
{
entry->counters.jit_functions += jitusage->created_functions;
diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql
index dd97203c21025..7540e49c73caf 100644
--- a/contrib/pg_stat_statements/sql/utility.sql
+++ b/contrib/pg_stat_statements/sql/utility.sql
@@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7;
SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
+-- Buffer stats should flow through EXPLAIN ANALYZE
+CREATE TEMP TABLE flow_through_test (a int, b char(200));
+INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i;
+
+CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$
+DECLARE
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test';
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+SELECT run_explain_buffers_test();
+
+-- EXPLAIN entries should have non-zero buffer stats
+SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats
+FROM pg_stat_statements
+WHERE query LIKE 'SELECT run_explain_buffers_test%'
+ORDER BY query COLLATE "C";
+
+DROP FUNCTION run_explain_buffers_test;
+DROP TABLE flow_through_test;
+
+-- Validate buffer/WAL counting during abort
+SET pg_stat_statements.track = 'all';
+CREATE TEMP TABLE pgss_call_tab (a int, b char(20));
+CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20));
+INSERT INTO pgss_call_tab VALUES (0, 'zzz');
+
+CREATE PROCEDURE pgss_call_rollback_proc() AS $$
+DECLARE
+ v int;
+BEGIN
+ EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+CALL pgss_call_rollback_proc();
+
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_call_rollback_proc%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_call_tab2;
+DROP TABLE pgss_call_tab;
+DROP PROCEDURE pgss_call_rollback_proc;
+SET pg_stat_statements.track = 'top';
+
-- CALL
CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$
DECLARE
diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql
index 1dc1552a81ebc..467e321b2062e 100644
--- a/contrib/pg_stat_statements/sql/wal.sql
+++ b/contrib/pg_stat_statements/sql/wal.sql
@@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated,
wal_records >= rows as wal_records_ge_rows
FROM pg_stat_statements ORDER BY query COLLATE "C";
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+
+--
+-- Validate buffer/WAL counting with caught exception in PL/pgSQL
+--
+CREATE TEMP TABLE pgss_error_tab (a int, b char(20));
+INSERT INTO pgss_error_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION pgss_error_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
+SELECT pgss_error_func();
+
+-- Buffer/WAL usage from the wCTE INSERT should survive the exception
+SELECT query, calls,
+local_blks_hit + local_blks_read > 0 as local_hitread,
+wal_bytes > 0 as wal_bytes_generated,
+wal_records > 0 as wal_records_generated
+FROM pg_stat_statements
+WHERE query LIKE '%pgss_error_func%'
+ORDER BY query COLLATE "C";
+
+DROP TABLE pgss_error_tab;
+DROP FUNCTION pgss_error_func;
+SELECT pg_stat_statements_reset() IS NOT NULL AS t;
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 60d90329a651d..6f0cb2a285b38 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node)
if (!resultRelInfo->ri_projectReturning)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
- Instrumentation *instr = node->ss.ps.instrument;
+ NodeInstrumentation *instr = node->ss.ps.instrument;
Assert(!dmstate->has_returning);
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 5f6f1db0467e9..9219625faf6e3 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2;
-> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10)
Index Cond: (unique2 = t1.unique2)
Index Searches: 10
+ Table Buffers: shared hit=10
Buffers: shared hit=24 read=6
Planning:
Buffers: shared hit=15 dirtied=9
@@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
Index Cond: (f1 @> '((0.5,2))'::polygon)
Rows Removed by Index Recheck: 1
Index Searches: 1
- Buffers: shared hit=1
+ Table Buffers: shared hit=1
+ Buffers: shared hit=2
Planning Time: 0.039 ms
Execution Time: 0.098 ms
@@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)';
then rejected by a recheck of the index condition. This happens because a
GiST index is lossy
for polygon containment tests: it actually
returns the rows with polygons that overlap the target, and then we have
- to do the exact containment test on those rows.
+ to do the exact containment test on those rows. The Table Buffers
+ counts indicate how many operations were performed on the table instead of
+ the index. This number is included in the Buffers counts.
@@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000
QUERY PLAN
-------------------------------------------------------------------&zwsp;------------------------------------------------------------
Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1)
- Buffers: shared hit=16
+ Buffers: shared hit=14
-> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1)
Index Cond: (unique2 > 9000)
Filter: (unique1 < 100)
Rows Removed by Filter: 287
Index Searches: 1
- Buffers: shared hit=16
+ Table Buffers: shared hit=11
+ Buffers: shared hit=14
Planning Time: 0.077 ms
Execution Time: 0.086 ms
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 7dee77fd366b0..912c96f2ff51a 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200);
-> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1)
Index Cond: ((id > 100) AND (id < 200))
Index Searches: 1
+ Table Buffers: shared hit=1
Buffers: shared hit=4
Planning Time: 0.244 ms
Execution Time: 0.073 ms
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 6887e421442ee..c1c3d03b6edb6 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -2885,6 +2885,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
+ Instrumentation *instr;
WalUsage *walusage;
BufferUsage *bufferusage;
int sortmem;
@@ -2934,7 +2935,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2949,7 +2950,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
/* Report WAL/buffer usage during parallel execution */
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &bufferusage[ParallelWorkerNumber],
&walusage[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 0d63fb4ba27b6..9149d735d59b3 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -2108,6 +2108,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
+ Instrumentation *instr;
WalUsage *walusage;
BufferUsage *bufferusage;
int sortmem;
@@ -2176,7 +2177,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
tuplesort_attach_shared(sharedsort, seg);
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Might as well use reliable figure when doling out maintenance_work_mem
@@ -2191,7 +2192,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
/* Report WAL/buffer usage during parallel execution */
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &bufferusage[ParallelWorkerNumber],
&walusage[ParallelWorkerNumber]);
index_close(indexRel, indexLockmode);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 4be267ff657e1..51f86ff1ef7bf 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -638,8 +638,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
TimestampTz starttime = 0;
PgStat_Counter startreadtime = 0,
startwritetime = 0;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
+ Instrumentation *instr = NULL;
ErrorContextCallback errcallback;
char **indnames = NULL;
Size dead_items_max_bytes = 0;
@@ -655,6 +654,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
startreadtime = pgStatBlockReadTime;
startwritetime = pgStatBlockWriteTime;
}
+ instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrStart(instr);
}
/* Used for instrumentation and stats report */
@@ -980,14 +981,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ InstrStop(instr, true);
+
if (verbose || params.log_vacuum_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params.log_vacuum_min_duration))
{
long secs_dur;
int usecs_dur;
- WalUsage walusage;
- BufferUsage bufferusage;
StringInfoData buf;
char *msgfmt;
int32 diff;
@@ -996,12 +997,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
+ BufferUsage bufferusage = instr->stack->bufusage;
+ WalUsage walusage = instr->stack->walusage;
TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
total_blks_hit = bufferusage.shared_blks_hit +
bufferusage.local_blks_hit;
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 3a45508f62ea4..600f23ff2a6f9 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1750,6 +1750,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Relation indexRel;
LOCKMODE heapLockmode;
LOCKMODE indexLockmode;
+ Instrumentation *instr;
WalUsage *walusage;
BufferUsage *bufferusage;
int sortmem;
@@ -1825,7 +1826,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
}
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Perform sorting of spool, and possibly a spool2 */
sortmem = maintenance_work_mem / btshared->scantuplesortstates;
@@ -1835,7 +1836,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
/* Report WAL/buffer usage during parallel execution */
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &bufferusage[ParallelWorkerNumber],
&walusage[ParallelWorkerNumber]);
#ifdef BTREE_BUILD_STATS
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 13cce9b49f16e..6a635bcdf1182 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1080,10 +1080,10 @@ XLogInsertRecord(XLogRecData *rdata,
/* Report WAL traffic to the instrumentation. */
if (inserted)
{
- pgWalUsage.wal_bytes += rechdr->xl_tot_len;
- pgWalUsage.wal_records++;
- pgWalUsage.wal_fpi += num_fpi;
- pgWalUsage.wal_fpi_bytes += fpi_bytes;
+ INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len);
+ INSTR_WALUSAGE_INCR(wal_records);
+ INSTR_WALUSAGE_ADD(wal_fpi, num_fpi);
+ INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes);
/* Required for the flush of pending stats WAL data */
pgstat_report_fixed = true;
@@ -2063,7 +2063,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
WriteRqst.Flush = InvalidXLogRecPtr;
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
- pgWalUsage.wal_buffers_full++;
+ INSTR_WALUSAGE_INCR(wal_buffers_full);
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
/*
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index a483424152c5f..47cac75bc20b5 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
Oid save_userid;
int save_sec_context;
int save_nestlevel;
- WalUsage startwalusage = pgWalUsage;
- BufferUsage startbufferusage = pgBufferUsage;
- BufferUsage bufferusage;
+ Instrumentation *instr = NULL;
PgStat_Counter startreadtime = 0;
PgStat_Counter startwritetime = 0;
@@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
}
pg_rusage_init(&ru0);
+
+ instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+ InstrStart(instr);
}
/* Used for instrumentation and stats report */
@@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
{
TimestampTz endtime = GetCurrentTimestamp();
+ InstrStop(instr, true);
+
if (verbose || params.log_analyze_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
params.log_analyze_min_duration))
{
long delay_in_ms;
- WalUsage walusage;
double read_rate = 0;
double write_rate = 0;
char *msgfmt;
@@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params,
int64 total_blks_hit;
int64 total_blks_read;
int64 total_blks_dirtied;
-
- memset(&bufferusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
- memset(&walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
-
- total_blks_hit = bufferusage.shared_blks_hit +
- bufferusage.local_blks_hit;
- total_blks_read = bufferusage.shared_blks_read +
- bufferusage.local_blks_read;
- total_blks_dirtied = bufferusage.shared_blks_dirtied +
- bufferusage.local_blks_dirtied;
+ BufferUsage bufusage = instr->stack->bufusage;
+ WalUsage walusage = instr->stack->walusage;
+
+ total_blks_hit = bufusage.shared_blks_hit +
+ bufusage.local_blks_hit;
+ total_blks_read = bufusage.shared_blks_read +
+ bufusage.local_blks_read;
+ total_blks_dirtied = bufusage.shared_blks_dirtied +
+ bufusage.local_blks_dirtied;
/*
* We do not expect an analyze to take > 25 days and it simplifies
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 93918a223b8d7..054c5bce837f6 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
static const char *explain_get_index_name(Oid indexId);
static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
static void show_wal_usage(ExplainState *es, const WalUsage *usage);
static void show_memory_counters(ExplainState *es,
const MemoryContextCounters *mem_counters);
@@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
QueryEnvironment *queryEnv)
{
PlannedStmt *plan;
- instr_time planstart,
- planduration;
- BufferUsage bufusage_start,
- bufusage;
+ Instrumentation *instr = NULL;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ if (es->buffers)
+ instr = InstrAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS);
+ else
+ instr = InstrAlloc(INSTRUMENT_TIMER);
+
if (es->memory)
{
/*
@@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrStart(instr);
/* plan the query */
plan = pg_plan_query(query, queryString, cursorOptions, params, es);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ InstrStop(instr, true);
if (es->memory)
{
@@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
/* run it (if needed) and produce output */
ExplainOnePlan(plan, into, es, queryString, params, queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &instr->total, (es->buffers ? &instr->stack->bufusage : NULL),
es->memory ? &mem_counters : NULL);
}
@@ -611,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
}
if (bufusage)
- show_buffer_usage(es, bufusage);
+ show_buffer_usage(es, bufusage, NULL);
if (mem_counters)
show_memory_counters(es, mem_counters);
@@ -1028,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage))
{
es->indent++;
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->bufferUsage, NULL);
es->indent--;
}
}
@@ -1042,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
BYTES_TO_KILOBYTES(metrics->bytesSent), es);
ExplainPropertyText("Format", format, es);
if (es->buffers)
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->bufferUsage, NULL);
}
ExplainCloseGroup("Serialization", "Serialization", true, es);
@@ -1099,18 +1091,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++)
{
Trigger *trig = rInfo->ri_TrigDesc->triggers + nt;
- Instrumentation *instr = rInfo->ri_TrigInstrument + nt;
+ TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt;
char *relname;
char *conname = NULL;
- /* Must clean up instrumentation state */
- InstrEndLoop(instr);
-
/*
* We ignore triggers that were never invoked; they likely aren't
* relevant to the current query type.
*/
- if (instr->ntuples == 0)
+ if (tginstr->firings == 0)
continue;
ExplainOpenGroup("Trigger", NULL, true, es);
@@ -1135,11 +1124,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
if (show_relname)
appendStringInfo(es->str, " on %s", relname);
if (es->timing)
- appendStringInfo(es->str, ": time=%.3f calls=%.0f\n",
- INSTR_TIME_GET_MILLISEC(instr->total),
- instr->ntuples);
+ appendStringInfo(es->str, ": time=%.3f calls=%d\n",
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total),
+ tginstr->firings);
else
- appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples);
+ appendStringInfo(es->str, ": calls=%d\n", tginstr->firings);
}
else
{
@@ -1149,9 +1138,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
ExplainPropertyText("Relation", relname, es);
if (es->timing)
ExplainPropertyFloat("Time", "ms",
- INSTR_TIME_GET_MILLISEC(instr->total), 3,
+ INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3,
es);
- ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es);
+ ExplainPropertyInteger("Calls", NULL, tginstr->firings, es);
}
if (conname)
@@ -1891,11 +1880,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* prepare per-worker general execution details */
if (es->workers_state && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
double startup_ms;
double total_ms;
@@ -1973,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTableStack->bufusage, "Table");
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -2291,18 +2283,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->stack.bufusage, NULL);
if (es->wal && planstate->instrument)
- show_wal_usage(es, &planstate->instrument->walusage);
+ show_wal_usage(es, &planstate->instrument->stack.walusage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
{
- WorkerInstrumentation *w = planstate->worker_instrument;
+ WorkerNodeInstrumentation *w = planstate->worker_instrument;
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
if (nloops <= 0)
@@ -2310,9 +2302,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->stack.bufusage, NULL);
if (es->wal)
- show_wal_usage(es, &instrument->walusage);
+ show_wal_usage(es, &instrument->stack.walusage);
ExplainCloseWorker(n, es);
}
}
@@ -4110,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage)
* Show buffer usage details. This better be sync with peek_buffer_usage.
*/
static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title)
{
if (es->format == EXPLAIN_FORMAT_TEXT)
{
@@ -4135,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared || has_local || has_temp)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "Buffers:");
if (has_shared)
@@ -4190,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared_timing || has_local_timing || has_temp_timing)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "I/O Timings:");
if (has_shared_timing)
@@ -4231,44 +4227,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
}
else
{
- ExplainPropertyInteger("Shared Hit Blocks", NULL,
+ char *prefix = title ? psprintf("%s ", title) : pstrdup("");
+
+ ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL,
usage->shared_blks_hit, es);
- ExplainPropertyInteger("Shared Read Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL,
usage->shared_blks_read, es);
- ExplainPropertyInteger("Shared Dirtied Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL,
usage->shared_blks_dirtied, es);
- ExplainPropertyInteger("Shared Written Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL,
usage->shared_blks_written, es);
- ExplainPropertyInteger("Local Hit Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL,
usage->local_blks_hit, es);
- ExplainPropertyInteger("Local Read Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL,
usage->local_blks_read, es);
- ExplainPropertyInteger("Local Dirtied Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL,
usage->local_blks_dirtied, es);
- ExplainPropertyInteger("Local Written Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL,
usage->local_blks_written, es);
- ExplainPropertyInteger("Temp Read Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL,
usage->temp_blks_read, es);
- ExplainPropertyInteger("Temp Written Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL,
usage->temp_blks_written, es);
if (track_io_timing)
{
- ExplainPropertyFloat("Shared I/O Read Time", "ms",
+ ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time),
3, es);
- ExplainPropertyFloat("Shared I/O Write Time", "ms",
+ ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time),
3, es);
- ExplainPropertyFloat("Local I/O Read Time", "ms",
+ ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time),
3, es);
- ExplainPropertyFloat("Local I/O Write Time", "ms",
+ ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time),
3, es);
- ExplainPropertyFloat("Temp I/O Read Time", "ms",
+ ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time),
3, es);
- ExplainPropertyFloat("Temp I/O Write Time", "ms",
+ ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
3, es);
}
diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c
index 3c96061cf32ab..9e4cd8ad7d083 100644
--- a/src/backend/commands/explain_dr.c
+++ b/src/backend/commands/explain_dr.c
@@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContext oldcontext;
StringInfo buf = &myState->buf;
int natts = typeinfo->natts;
- instr_time start,
- end;
- BufferUsage instr_start;
+ Instrumentation *instr = NULL;
/* only measure time, buffers if requested */
- if (myState->es->timing)
- INSTR_TIME_SET_CURRENT(start);
- if (myState->es->buffers)
- instr_start = pgBufferUsage;
+ if (myState->es->timing || myState->es->buffers)
+ {
+ InstrumentOption instrument_options = 0;
+
+ if (myState->es->timing)
+ instrument_options |= INSTRUMENT_TIMER;
+ if (myState->es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+ instr = InstrAlloc(instrument_options);
+ InstrStart(instr);
+ }
/* Set or update my derived attribute info, if needed */
if (myState->attrinfo != typeinfo || myState->nattrs != natts)
@@ -186,18 +191,16 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self)
MemoryContextSwitchTo(oldcontext);
MemoryContextReset(myState->tmpcontext);
+ if (myState->es->timing || myState->es->buffers)
+ InstrStop(instr, true);
+
/* Update timing data */
if (myState->es->timing)
- {
- INSTR_TIME_SET_CURRENT(end);
- INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start);
- }
+ INSTR_TIME_ADD(myState->metrics.timeSpent, instr->total);
/* Update buffer metrics */
if (myState->es->buffers)
- BufferUsageAccumDiff(&myState->metrics.bufferUsage,
- &pgBufferUsage,
- &instr_start);
+ BufferUsageAdd(&myState->metrics.bufferUsage, &instr->stack->bufusage);
return true;
}
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 5b86a727587b2..2a1a6d1cf2e6d 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -578,13 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
ListCell *p;
ParamListInfo paramLI = NULL;
EState *estate = NULL;
- instr_time planstart;
- instr_time planduration;
- BufferUsage bufusage_start,
- bufusage;
+ Instrumentation *instr = NULL;
MemoryContextCounters mem_counters;
MemoryContext planner_ctx = NULL;
MemoryContext saved_ctx = NULL;
+ int instrument_options = INSTRUMENT_TIMER;
+
+ if (es->buffers)
+ instrument_options |= INSTRUMENT_BUFFERS;
+
+ instr = InstrAlloc(instrument_options);
if (es->memory)
{
@@ -596,9 +599,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
saved_ctx = MemoryContextSwitchTo(planner_ctx);
}
- if (es->buffers)
- bufusage_start = pgBufferUsage;
- INSTR_TIME_SET_CURRENT(planstart);
+ InstrStart(instr);
/* Look it up in the hash table */
entry = FetchPreparedStatement(execstmt->name, true);
@@ -633,8 +634,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
cplan = GetCachedPlan(entry->plansource, paramLI,
CurrentResourceOwner, pstate->p_queryEnv);
- INSTR_TIME_SET_CURRENT(planduration);
- INSTR_TIME_SUBTRACT(planduration, planstart);
+ InstrStop(instr, true);
if (es->memory)
{
@@ -642,13 +642,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
MemoryContextMemConsumed(planner_ctx, &mem_counters);
}
- /* calc differences of buffer counters. */
- if (es->buffers)
- {
- memset(&bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
- }
-
plan_list = cplan->stmt_list;
/* Explain each query */
@@ -658,7 +651,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es,
if (pstmt->commandType != CMD_UTILITY)
ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv,
- &planduration, (es->buffers ? &bufusage : NULL),
+ &instr->total, (es->buffers ? &instr->stack->bufusage : NULL),
es->memory ? &mem_counters : NULL);
else
ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI);
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 98d402c0a3be7..c336007314121 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -90,7 +90,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context);
static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
ResultRelInfo *src_partinfo,
@@ -2309,7 +2309,7 @@ static HeapTuple
ExecCallTriggerFunc(TriggerData *trigdata,
int tgindx,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context)
{
LOCAL_FCINFO(fcinfo, 0);
@@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata,
* If doing EXPLAIN ANALYZE, start charging time to this trigger.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(instr + tgindx);
/*
* Do the function evaluation in the per-tuple memory context, so that
@@ -2389,10 +2389,10 @@ ExecCallTriggerFunc(TriggerData *trigdata,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
return (HeapTuple) DatumGetPointer(result);
}
@@ -3936,7 +3936,7 @@ static void AfterTriggerExecute(EState *estate,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
FmgrInfo *finfo,
- Instrumentation *instr,
+ TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2);
@@ -4330,7 +4330,7 @@ AfterTriggerExecute(EState *estate,
ResultRelInfo *src_relInfo,
ResultRelInfo *dst_relInfo,
TriggerDesc *trigdesc,
- FmgrInfo *finfo, Instrumentation *instr,
+ FmgrInfo *finfo, TriggerInstrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
TupleTableSlot *trig_tuple_slot2)
@@ -4371,7 +4371,7 @@ AfterTriggerExecute(EState *estate,
* to include time spent re-fetching tuples in the trigger cost.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStartTrigger(instr + tgindx);
/*
* Fetch the required tuple(s).
@@ -4588,10 +4588,10 @@ AfterTriggerExecute(EState *estate,
/*
* If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count
- * one "tuple returned" (really the number of firings).
+ * the firing of the trigger.
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStopTrigger(instr + tgindx, 1);
}
@@ -4707,7 +4707,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
Relation rel = NULL;
TriggerDesc *trigdesc = NULL;
FmgrInfo *finfo = NULL;
- Instrumentation *instr = NULL;
+ TriggerInstrumentation *instr = NULL;
TupleTableSlot *slot1 = NULL,
*slot2 = NULL;
diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c
index c3b3c9ea21ab8..10ba717bb6b24 100644
--- a/src/backend/commands/vacuumparallel.c
+++ b/src/backend/commands/vacuumparallel.c
@@ -994,6 +994,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
PVIndStats *indstats;
PVShared *shared;
TidStore *dead_items;
+ Instrumentation *instr;
BufferUsage *buffer_usage;
WalUsage *wal_usage;
int nindexes;
@@ -1083,7 +1084,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
error_context_stack = &errcallback;
/* Prepare to track buffer usage during parallel execution */
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/* Process indexes to perform vacuum/cleanup */
parallel_vacuum_process_safe_indexes(&pvs);
@@ -1091,7 +1092,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
/* Report buffer/WAL usage during parallel execution */
buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &buffer_usage[ParallelWorkerNumber],
&wal_usage[ParallelWorkerNumber]);
/* Report any remaining cost-based vacuum delay time */
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index bfd3ebc601ec6..f01f1c864c409 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL;
/* decls for local routines only used within this module */
static void InitPlan(QueryDesc *queryDesc, int eflags);
static void CheckValidRowMarkRel(Relation rel, RowMarkType markType);
+static void ExecFinalizeTriggerInstrumentation(EState *estate);
static void ExecPostprocessPlan(EState *estate);
static void ExecEndPlan(PlanState *planstate, EState *estate);
static void ExecutePlan(QueryDesc *queryDesc,
@@ -329,9 +330,26 @@ standard_ExecutorRun(QueryDesc *queryDesc,
*/
oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
- /* Allow instrumentation of Executor overall runtime */
+ /*
+ * Start up required top-level instrumentation stack for WAL/buffer
+ * tracking
+ */
+ if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL)))
+ queryDesc->totaltime = InstrAlloc(estate->es_instrument);
+
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ {
+ /* Allow instrumentation of Executor overall runtime */
+ InstrStart(queryDesc->totaltime);
+
+ /*
+ * Remember all node stacks for abort recovery. We do this once here
+ * after the first call to InstrStart has allocated the parent stack.
+ */
+ if (queryDesc->totaltime->stack && !queryDesc->already_executed)
+ ExecRememberNodeInstrumentation(queryDesc->planstate,
+ queryDesc->totaltime->stack);
+ }
/*
* extract information from the query descriptor and the query feature.
@@ -383,7 +401,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
dest->rShutdown(dest);
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, estate->es_processed);
+ InstrStop(queryDesc->totaltime, false);
MemoryContextSwitchTo(oldcontext);
}
@@ -433,7 +451,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
/* Allow instrumentation of Executor overall runtime */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrStart(queryDesc->totaltime);
/* Run ModifyTable nodes to completion */
ExecPostprocessPlan(estate);
@@ -442,8 +460,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
AfterTriggerEndQuery(estate);
+ /*
+ * Accumulate per-node and trigger statistics to their respective parent
+ * instrumentation stacks.
+ *
+ * We skip this in parallel workers because their per-node stats are
+ * reported individually via ExecParallelReportInstrumentation, and the
+ * leader's own ExecFinalizeNodeInstrumentation handles propagation. If
+ * we accumulated here, the leader would double-count: worker parent nodes
+ * would already include their children's stats, and then the leader's
+ * accumulation would add the children again.
+ */
+ if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker())
+ {
+ ExecFinalizeNodeInstrumentation(queryDesc->planstate);
+
+ ExecFinalizeTriggerInstrumentation(estate);
+ }
+
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, 0);
+ InstrStop(queryDesc->totaltime, true);
MemoryContextSwitchTo(oldcontext);
@@ -1270,7 +1306,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
resultRelInfo->ri_TrigWhenExprs = (ExprState **)
palloc0_array(ExprState *, n);
if (instrument_options)
- resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false);
+ resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options);
}
else
{
@@ -1484,6 +1520,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo)
return resultRelInfo->ri_ancestorResultRels;
}
+static void
+ExecFinalizeTriggerInstrumentation(EState *estate)
+{
+ List *rels = NIL;
+
+ rels = list_concat(rels, estate->es_tuple_routing_result_relations);
+ rels = list_concat(rels, estate->es_opened_result_relations);
+ rels = list_concat(rels, estate->es_trig_target_relations);
+
+ foreach_node(ResultRelInfo, rInfo, rels)
+ {
+ TriggerInstrumentation *ti = rInfo->ri_TrigInstrument;
+
+ if (ti && ti->instr.stack)
+ InstrStackAdd(CurrentInstrStack, ti->instr.stack);
+ }
+}
+
/* ----------------------------------------------------------------
* ExecPostprocessPlan
*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index f87978c137e7d..dee8bf99c936a 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -86,7 +86,7 @@ typedef struct FixedParallelExecutorState
* instrument_options: Same meaning here as in instrument.c.
*
* instrument_offset: Offset, relative to the start of this structure,
- * of the first Instrumentation object. This will depend on the length of
+ * of the first NodeInstrumentation object. This will depend on the length of
* the plan_node_id array.
*
* num_workers: Number of workers.
@@ -103,11 +103,15 @@ struct SharedExecutorInstrumentation
int num_workers;
int num_plan_nodes;
int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
- /* array of num_plan_nodes * num_workers Instrumentation objects follows */
+
+ /*
+ * array of num_plan_nodes * num_workers NodeInstrumentation objects
+ * follows
+ */
};
#define GetInstrumentationArray(sei) \
(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
- (Instrumentation *) (((char *) sei) + sei->instrument_offset))
+ (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
/* Context object for ExecParallelEstimate. */
typedef struct ExecParallelEstimateContext
@@ -724,7 +728,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation_len = MAXALIGN(instrumentation_len);
instrument_offset = instrumentation_len;
instrumentation_len +=
- mul_size(sizeof(Instrumentation),
+ mul_size(sizeof(NodeInstrumentation),
mul_size(e.nnodes, nworkers));
shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
@@ -810,7 +814,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
*/
if (estate->es_instrument)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
@@ -820,7 +824,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation->num_plan_nodes = e.nnodes;
instrument = GetInstrumentationArray(instrumentation);
for (i = 0; i < nworkers * e.nnodes; ++i)
- InstrInit(&instrument[i], estate->es_instrument);
+ InstrInitNode(&instrument[i], estate->es_instrument);
shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
instrumentation);
pei->instrumentation = instrumentation;
@@ -1052,7 +1056,7 @@ static bool
ExecParallelRetrieveInstrumentation(PlanState *planstate,
SharedExecutorInstrumentation *instrumentation)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
int n;
int ibytes;
@@ -1080,9 +1084,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
* Switch into per-query memory context.
*/
oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt);
- ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation));
+ ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation));
planstate->worker_instrument =
- palloc(ibytes + offsetof(WorkerInstrumentation, instrument));
+ palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument));
MemoryContextSwitchTo(oldcontext);
planstate->worker_instrument->num_workers = instrumentation->num_workers;
@@ -1312,7 +1316,7 @@ ExecParallelReportInstrumentation(PlanState *planstate,
{
int i;
int plan_node_id = planstate->plan->plan_node_id;
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
InstrEndLoop(planstate->instrument);
@@ -1451,6 +1455,7 @@ void
ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
{
FixedParallelExecutorState *fpes;
+ Instrumentation *instr;
BufferUsage *buffer_usage;
WalUsage *wal_usage;
DestReceiver *receiver;
@@ -1511,7 +1516,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
* leader, which also doesn't count buffer accesses and WAL activity that
* occur during executor startup.
*/
- InstrStartParallelQuery();
+ instr = InstrStartParallelQuery();
/*
* Run the plan. If we specified a tuple bound, be careful not to demand
@@ -1527,7 +1532,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc)
/* Report buffer/WAL usage during parallel execution. */
buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
- InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
+ InstrEndParallelQuery(instr,
+ &buffer_usage[ParallelWorkerNumber],
&wal_usage[ParallelWorkerNumber]);
/* Report instrumentation data if any instrumentation options are set. */
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7e40b85251718..c3ff03f805bb0 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -122,6 +122,8 @@
static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
static TupleTableSlot *ExecProcNodeInstr(PlanState *node);
static bool ExecShutdownNode_walker(PlanState *node, void *context);
+static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context);
+static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context);
/* ------------------------------------------------------------------------
@@ -413,8 +415,18 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
/* Set up instrumentation for this node if requested */
if (estate->es_instrument)
- result->instrument = InstrAlloc(1, estate->es_instrument,
- result->async_capable);
+ {
+ result->instrument = InstrAllocNode(estate->es_instrument,
+ result->async_capable);
+
+ /* IndexScan tracks table access separately from index access. */
+ if (IsA(result, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, result);
+
+ iss->iss_InstrumentTableStack = InstrAllocAdditionalNodeStack(result->instrument);
+ }
+ }
return result;
}
@@ -828,6 +840,101 @@ ExecShutdownNode_walker(PlanState *node, void *context)
return false;
}
+/*
+ * ExecRememberNodeInstrumentation
+ *
+ * Register all per-node instrumentation stacks as unfinalized children of the
+ * executor's instrumentation stack. This is needed for abort recovery: if the
+ * executor aborts, we need to walk each per-node instrumentation stack to
+ * recover buffer/WAL data from nodes that never got finalized, that someone
+ * might be interested in as an aggregate.
+ */
+void
+ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent)
+{
+ (void) ExecRememberNodeInstrumentation_walker(node, parent);
+}
+
+static bool
+ExecRememberNodeInstrumentation_walker(PlanState *node, void *context)
+{
+ InstrStack *parent = (InstrStack *) context;
+
+ Assert(parent != NULL);
+
+ if (node == NULL)
+ return false;
+
+ if (node->instrument && (node->instrument->need_bufusage ||
+ node->instrument->need_walusage))
+ {
+ InstrRememberNodeStack(parent, &node->instrument->stack);
+
+ /* IndexScan has a separate stack to track table access */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ if (iss->iss_InstrumentTableStack)
+ InstrRememberNodeStack(parent, iss->iss_InstrumentTableStack);
+ }
+ }
+
+ return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context);
+}
+
+/*
+ * ExecFinalizeNodeInstrumentation
+ *
+ * Accumulate instrumentation stats from all execution nodes to their respective
+ * parents (or the original parent instrumentation stack).
+ *
+ * This must run after the cleanup done by ExecShutdownNode, and not rely on any
+ * resources cleaned up by it. We also expect shutdown actions to have occurred,
+ * e.g. parallel worker instrumentation to have been added to the leader.
+ */
+void
+ExecFinalizeNodeInstrumentation(PlanState *node)
+{
+ (void) ExecFinalizeNodeInstrumentation_walker(node, (InstrStack *) CurrentInstrStack);
+}
+
+static bool
+ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context)
+{
+ InstrStack *parent = (InstrStack *) context;
+
+ Assert(parent != NULL);
+
+ if (node == NULL)
+ return false;
+
+ /*
+ * Recurse into children first (bottom-up accumulation), passing our stack
+ * as the parent context. This ensures children can accumulate to us even
+ * if they were never executed by the leader (e.g. nodes beneath Gather
+ * that only workers ran, where stack.previous would not be initialized).
+ */
+ planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker,
+ node->instrument ? &node->instrument->stack : parent);
+
+ if (!node->instrument)
+ return false;
+
+ /* IndexScan has a separate stack to track table access */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ if (iss->iss_InstrumentTableStack)
+ iss->iss_InstrumentTableStack = InstrFinalizeAdditionalNodeStack(iss->iss_InstrumentTableStack, node->instrument);
+ }
+
+ node->instrument = InstrFinalizeNode(node->instrument, parent);
+
+ return false;
+}
+
/*
* ExecSetTupleBound
*
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index edab92a0ebeda..0c2d769416e91 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,72 +16,342 @@
#include
#include "executor/instrument.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
-BufferUsage pgBufferUsage;
-static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
-static WalUsage save_pgWalUsage;
+InstrStack TopInstrStack;
+InstrStack *CurrentInstrStack = &TopInstrStack;
-static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
+static void InstrFinalizeNodesOnAbort(InstrStack *stack);
-/* Allocate new instrumentation structure(s) */
+/*
+ * Use ResourceOwner mechanism to correctly reset CurrentInstrStack on abort.
+ */
+static void ResOwnerReleaseInstrumentation(Datum res);
+static const ResourceOwnerDesc instrumentation_resowner_desc =
+{
+ .name = "instrumentation",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_INSTRUMENTATION,
+ .ReleaseResource = ResOwnerReleaseInstrumentation,
+ .DebugPrint = NULL, /* default message is fine */
+};
+
+static inline void
+ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStack *stack)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(stack), &instrumentation_resowner_desc);
+}
+
+static inline void
+ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStack *stack)
+{
+ ResourceOwnerForget(owner, PointerGetDatum(stack), &instrumentation_resowner_desc);
+}
+
+static bool
+StackIsParent(InstrStack *stack, InstrStack *entry)
+{
+ if (entry->previous == NULL)
+ return false;
+
+ if (entry->previous == stack)
+ return true;
+
+ return StackIsParent(stack, entry->previous);
+}
+
+/*
+ * OrphanSkippedStacks
+ *
+ * Orphans all stack entries from the current stack entry to the provided
+ * stack which is assumed to be a parent stack of the current stack, and
+ * terminates once the current stack entry has reached the provided stack.
+ *
+ * This sets previous pointers of intermediate stack entries to NULL, so we
+ * don't have to worry about calling StackIsParent with a bad pointer in
+ * ResOwnerReleaseInstrumentation.
+ *
+ * This matters because we may process stack entries out of order in aborts
+ * because (1) we might have a mix of ResOwner and PG_FINALLY owned stacks
+ * (2) ResOwnerReleaseInstrumentation might be called out of order.
+ */
+static void
+OrphanSkippedStacks(InstrStack *stack)
+{
+ if (CurrentInstrStack == stack || !StackIsParent(stack, CurrentInstrStack))
+ return;
+
+ while (CurrentInstrStack != stack)
+ {
+ InstrStack *previous = CurrentInstrStack->previous;
+
+ Assert(previous != NULL);
+ CurrentInstrStack->previous = NULL;
+ CurrentInstrStack = previous;
+ }
+}
+
+static void
+ResOwnerReleaseInstrumentation(Datum res)
+{
+ InstrStack *stack = (InstrStack *) DatumGetPointer(res);
+
+ /*
+ * Because registered resources are *not* cleaned up in a guaranteed
+ * order, we may get a child context after we've processed the parent.
+ * Thus, we only pop the stack if its not already a parent of the stack
+ * being released. Note that OrphanSkippedStacks may have set our previous
+ * stack entry to NULL, in which case we don't modify the stack either.
+ *
+ * Note that StackIsParent will recurse as needed, so it is inadvisible to
+ * use deeply nested stacks.
+ */
+ if (stack->previous && !StackIsParent(CurrentInstrStack, stack))
+ {
+ OrphanSkippedStacks(stack);
+ InstrPopStack(stack);
+ }
+
+ /* Accumulate data from all unfinalized child node stacks. */
+ InstrFinalizeNodesOnAbort(stack);
+
+ /*
+ * Accumulate the stack associated with the ResOwner to the active stack.
+ *
+ * Note that we intentionally directly add to the current stack instead of
+ * the parent of the stack being released, because this can execute out of
+ * order. Explicit PG_FINALLY blocks might have modified the stack as
+ * well.
+ */
+ InstrStackAdd(CurrentInstrStack, stack);
+
+ /* Free the stack entry now since InstrStop won't be called */
+ pfree(stack);
+}
+
+/*
+ * Pops the stack entry and accumulates to its parent.
+ *
+ * Note that this intentionally allows passing a stack that is not
+ * CurrentInstrStack, as can happen with PG_FINALLY, and orphans any
+ * intermediate stacks that were skipped.
+ */
+void
+InstrPopAndFinalizeStack(InstrStack *stack)
+{
+ OrphanSkippedStacks(stack);
+ InstrPopStack(stack);
+ InstrStackAdd(CurrentInstrStack, stack);
+}
+
+/* General purpose instrumentation handling */
Instrumentation *
-InstrAlloc(int n, int instrument_options, bool async_mode)
+InstrAlloc(int instrument_options)
{
Instrumentation *instr;
/* initialize all fields to zeroes, then modify as needed */
- instr = palloc0(n * sizeof(Instrumentation));
+ instr = palloc0(sizeof(Instrumentation));
if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
{
- bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
- bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
- bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
- int i;
+ instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
+ instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
+ instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+ }
+
+ return instr;
+}
- for (i = 0; i < n; i++)
+void
+InstrStart(Instrumentation *instr)
+{
+ if (instr->need_timer &&
+ !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
+ elog(ERROR, "InstrStart called twice in a row");
+
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ Assert(CurrentResourceOwner != NULL);
+ instr->owner = CurrentResourceOwner;
+
+ /*
+ * Allocate the stack resource in a memory context that survives
+ * during an abort. This will be freed by InstrStop (regular
+ * execution) or ResOwnerReleaseInstrumentation (abort).
+ *
+ * We don't do this in InstrAlloc to avoid allocating when InstrStart
+ * + InstrStop isn't called.
+ */
+ if (instr->stack == NULL)
+ instr->stack = MemoryContextAllocZero(TopMemoryContext, sizeof(InstrStack));
+
+ ResourceOwnerEnlarge(instr->owner);
+ ResourceOwnerRememberInstrStack(instr->owner, instr->stack);
+
+ InstrPushStack(instr->stack);
+ }
+}
+
+void
+InstrStop(Instrumentation *instr, bool finalize)
+{
+ instr_time endtime;
+
+ /* let's update the time only if the timer was requested */
+ if (instr->need_timer)
+ {
+ if (INSTR_TIME_IS_ZERO(instr->starttime))
+ elog(ERROR, "InstrStop called without start");
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime);
+
+ INSTR_TIME_SET_ZERO(instr->starttime);
+ }
+
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ InstrPopStack(instr->stack);
+
+ if (finalize)
+ InstrStackAdd(CurrentInstrStack, instr->stack);
+
+ Assert(instr->owner != NULL);
+ ResourceOwnerForgetInstrStack(instr->owner, instr->stack);
+ instr->owner = NULL;
+
+ if (finalize)
{
- instr[i].need_bufusage = need_buffers;
- instr[i].need_walusage = need_wal;
- instr[i].need_timer = need_timer;
- instr[i].async_mode = async_mode;
+ /*
+ * To avoid keeping memory allocated beyond when its needed, copy
+ * the result to the current memory context, and free it in the
+ * transaction context.
+ */
+ InstrStack *stack = palloc(sizeof(InstrStack));
+
+ memcpy(stack, instr->stack, sizeof(InstrStack));
+ pfree(instr->stack);
+ instr->stack = stack;
}
}
+}
+
+/* Trigger instrumentation handling */
+TriggerInstrumentation *
+InstrAllocTrigger(int n, int instrument_options)
+{
+ TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation));
+ bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+ bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
+ bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
+ int i;
+
+ for (i = 0; i < n; i++)
+ {
+ tginstr[i].instr.need_timer = need_timer;
+ tginstr[i].instr.need_bufusage = need_buffers;
+ tginstr[i].instr.need_walusage = need_wal;
+ }
+
+ return tginstr;
+}
+
+void
+InstrStartTrigger(TriggerInstrumentation *tginstr)
+{
+ InstrStart(&tginstr->instr);
+}
+
+void
+InstrStopTrigger(TriggerInstrumentation *tginstr, int firings)
+{
+ /*
+ * This trigger may be called again, so we don't finalize instrumentation
+ * here. Accumulation to the parent happens at ExecutorFinish through
+ * ExecFinalizeTriggerInstrumentation.
+ */
+ InstrStop(&tginstr->instr, false);
+ tginstr->firings += firings;
+}
+
+/* Node instrumentation handling */
+
+/* Allocate new node instrumentation structure */
+NodeInstrumentation *
+InstrAllocNode(int instrument_options, bool async_mode)
+{
+ /*
+ * We can utilize TopTransactionContext instead of TopMemoryContext here
+ * (despite the inlined InstrStack in NodeInstrumentation) because nodes
+ * don't get used for utility commands that restart transactions, which
+ * would require a context that survives longer (EXPLAIN ANALYZE is fine).
+ */
+ NodeInstrumentation *instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation));
+
+ InstrInitNode(instr, instrument_options);
+ instr->async_mode = async_mode;
return instr;
}
/* Initialize a pre-allocated instrumentation structure. */
void
-InstrInit(Instrumentation *instr, int instrument_options)
+InstrInitNode(NodeInstrumentation *instr, int instrument_options)
{
- memset(instr, 0, sizeof(Instrumentation));
+ memset(instr, 0, sizeof(NodeInstrumentation));
instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
}
+/*
+ * InstrRememberNodeStack - register a child node stack for abort processing.
+ *
+ * On abort, InstrFinalizeNodesOnAbort will walk the parent's list to recover
+ * buffer/WAL data from stacks that were never finalized, in order for
+ * aggregate totals to be accurate despite the query erroring out.
+ *
+ * The passed in node stack can either be the NodeInstrumentation stack or an
+ * additional stack that is associated with a node. This must not be called
+ * with other (non-node) instrumentation stacks as the child that perform their
+ * own cleanup. The parent must be a non-node stack that can handle aborts.
+ */
+void
+InstrRememberNodeStack(InstrStack *parent, InstrStack *node_stack)
+{
+ /*
+ * We do not support nesting, to avoid recursion in
+ * InstrFinalizeNodesOnAbort
+ */
+ Assert(parent->unfinalized_node.next == NULL);
+
+ slist_push_head(&parent->unfinalized_children, &node_stack->unfinalized_node);
+}
+
/* Entry to a plan node */
void
-InstrStartNode(Instrumentation *instr)
+InstrStartNode(NodeInstrumentation *instr)
{
if (instr->need_timer &&
!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
elog(ERROR, "InstrStartNode called twice in a row");
- /* save buffer usage totals at node entry, if needed */
- if (instr->need_bufusage)
- instr->bufusage_start = pgBufferUsage;
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure that we always have a parent, even at the top most node */
+ Assert(CurrentInstrStack != &TopInstrStack);
- if (instr->need_walusage)
- instr->walusage_start = pgWalUsage;
+ InstrPushStack(&instr->stack);
+ }
}
/* Exit from a plan node */
void
-InstrStopNode(Instrumentation *instr, double nTuples)
+InstrStopNode(NodeInstrumentation *instr, double nTuples)
{
double save_tuplecount = instr->tuplecount;
instr_time endtime;
@@ -101,14 +371,14 @@ InstrStopNode(Instrumentation *instr, double nTuples)
INSTR_TIME_SET_ZERO(instr->starttime);
}
- /* Add delta of buffer usage since entry to node's totals */
- if (instr->need_bufusage)
- BufferUsageAccumDiff(&instr->bufusage,
- &pgBufferUsage, &instr->bufusage_start);
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure that we always have a parent, even at the top most node */
+ Assert(instr->stack.previous != NULL);
- if (instr->need_walusage)
- WalUsageAccumDiff(&instr->walusage,
- &pgWalUsage, &instr->walusage_start);
+ /* Adding to parent is handled by ExecFinalizeNodeInstrumentation */
+ InstrPopStack(&instr->stack);
+ }
/* Is this the first tuple of this cycle? */
if (!instr->running)
@@ -127,9 +397,53 @@ InstrStopNode(Instrumentation *instr, double nTuples)
}
}
+/* Add per-node instrumentation to the parent and move into per-query memory context */
+NodeInstrumentation *
+InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent)
+{
+ NodeInstrumentation *dst = palloc(sizeof(NodeInstrumentation));
+
+ memcpy(dst, instr, sizeof(NodeInstrumentation));
+ pfree(instr);
+
+ /* Avoid stale pointer references */
+ dst->stack.previous = NULL;
+
+ InstrStackAdd(parent, &dst->stack);
+
+ return dst;
+}
+
+/*
+ * InstrFinalizeNodesOnAbort
+ *
+ * Accumulates unfinalized child per-node stacks into the resource owner stack,
+ * and resets the list so a theoretical second call is a safe no-op.
+ */
+static void
+InstrFinalizeNodesOnAbort(InstrStack *stack)
+{
+ slist_iter iter;
+
+ slist_foreach(iter, &stack->unfinalized_children)
+ {
+ InstrStack *child = slist_container(InstrStack, unfinalized_node, iter.cur);
+
+ InstrStackAdd(stack, child);
+
+ /*
+ * Note we don't free the child here since its usually contained
+ * within NodeInstrumentation and we don't have an easy way to access
+ * that, it will be instead be cleaned up by the transaction ending.
+ */
+ }
+
+ slist_init(&stack->unfinalized_children);
+}
+
/* Update tuple count */
void
-InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
+InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
{
/* count the returned tuples */
instr->tuplecount += nTuples;
@@ -137,7 +451,7 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
/* Finish a run cycle for a plan node */
void
-InstrEndLoop(Instrumentation *instr)
+InstrEndLoop(NodeInstrumentation *instr)
{
/* Skip if nothing has happened, or already shut down */
if (!instr->running)
@@ -162,7 +476,7 @@ InstrEndLoop(Instrumentation *instr)
/* aggregate instrumentation information */
void
-InstrAggNode(Instrumentation *dst, Instrumentation *add)
+InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add)
{
if (!dst->running && add->running)
{
@@ -170,7 +484,7 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
dst->firsttuple = add->firsttuple;
}
else if (dst->running && add->running &&
- INSTR_TIME_LT(dst->firsttuple, add->firsttuple))
+ INSTR_TIME_GT(dst->firsttuple, add->firsttuple))
dst->firsttuple = add->firsttuple;
INSTR_TIME_ADD(dst->counter, add->counter);
@@ -186,40 +500,119 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
/* Add delta of buffer usage since entry to node's totals */
if (dst->need_bufusage)
- BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage);
if (dst->need_walusage)
- WalUsageAdd(&dst->walusage, &add->walusage);
+ WalUsageAdd(&dst->stack.walusage, &add->stack.walusage);
+}
+
+/*
+ * Allocate an additional InstrStack for a node, e.g. for tracking table
+ * buffer usage separately from index buffer usage. Allocated in
+ * TopTransactionContext so it survives long enough for abort recovery.
+ */
+InstrStack *
+InstrAllocAdditionalNodeStack(NodeInstrumentation *instr)
+{
+ if (instr->need_bufusage || instr->need_walusage)
+ return MemoryContextAllocZero(TopTransactionContext, sizeof(InstrStack));
+
+ return NULL;
}
-/* note current values during parallel executor startup */
void
+InstrStartNodeStack(NodeInstrumentation *instr, InstrStack *stack)
+{
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure the executor set up a parent node below the top level stack */
+ Assert(CurrentInstrStack != &TopInstrStack);
+
+ InstrPushStack(stack);
+ }
+}
+
+void
+InstrStopNodeStack(NodeInstrumentation *instr, InstrStack *stack)
+{
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Adding to parent is handled by InstrFinalizeAdditionalNodeStack */
+ InstrPopStack(stack);
+ }
+}
+
+/* Add additional node stacks to the parent and move into per-query memory context */
+InstrStack *
+InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr)
+{
+ InstrStack *dst = palloc(sizeof(InstrStack));
+
+ memcpy(dst, stack, sizeof(InstrStack));
+ pfree(stack);
+
+ /* Avoid stale pointer references */
+ dst->previous = NULL;
+
+ InstrStackAdd(&instr->stack, dst);
+
+ return dst;
+}
+
+/* start instrumentation during parallel executor startup */
+Instrumentation *
InstrStartParallelQuery(void)
{
- save_pgBufferUsage = pgBufferUsage;
- save_pgWalUsage = pgWalUsage;
+ Instrumentation *instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL);
+
+ InstrStart(instr);
+ return instr;
}
/* report usage after parallel executor shutdown */
void
-InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
+InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage)
{
+ InstrStop(instr, true);
memset(bufusage, 0, sizeof(BufferUsage));
- BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage);
+ memcpy(bufusage, &instr->stack->bufusage, sizeof(BufferUsage));
memset(walusage, 0, sizeof(WalUsage));
- WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage);
+ memcpy(walusage, &instr->stack->walusage, sizeof(WalUsage));
}
-/* accumulate work done by workers in leader's stats */
+/*
+ * Accumulate work done by parallel workers in the leader's stats.
+ *
+ * Note that what gets added here effectively depends on whether per-node
+ * instrumentation is active. If its active the parallel worker intentionally
+ * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would
+ * cause double counting. Instead, this only accumulates any extra activity
+ * outside of nodes.
+ *
+ * Otherwise this is responsible for making sure that the complete query
+ * activity is accumulated.
+ */
void
InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
{
- BufferUsageAdd(&pgBufferUsage, bufusage);
+ BufferUsageAdd(&CurrentInstrStack->bufusage, bufusage);
+ WalUsageAdd(&CurrentInstrStack->walusage, walusage);
+
WalUsageAdd(&pgWalUsage, walusage);
}
+void
+InstrStackAdd(InstrStack *dst, InstrStack *add)
+{
+ Assert(dst != NULL);
+ Assert(add != NULL);
+
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ WalUsageAdd(&dst->walusage, &add->walusage);
+}
+
/* dst += add */
-static void
+void
BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
{
dst->shared_blks_hit += add->shared_blks_hit;
@@ -240,36 +633,6 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
}
-/* dst += add - sub */
-void
-BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add,
- const BufferUsage *sub)
-{
- dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
- dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
- dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied;
- dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
- dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit;
- dst->local_blks_read += add->local_blks_read - sub->local_blks_read;
- dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied;
- dst->local_blks_written += add->local_blks_written - sub->local_blks_written;
- dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
- dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time,
- add->shared_blk_read_time, sub->shared_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time,
- add->shared_blk_write_time, sub->shared_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time,
- add->local_blk_read_time, sub->local_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time,
- add->local_blk_write_time, sub->local_blk_write_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
- add->temp_blk_read_time, sub->temp_blk_read_time);
- INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
- add->temp_blk_write_time, sub->temp_blk_write_time);
-}
-
/* helper functions for WAL usage accumulation */
static void
WalUsageAdd(WalUsage *dst, WalUsage *add)
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index a616abff04cf0..fb62ff7a8be7a 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -83,6 +83,7 @@ IndexNext(IndexScanState *node)
ExprContext *econtext;
ScanDirection direction;
IndexScanDesc scandesc;
+ ItemPointer tid;
TupleTableSlot *slot;
/*
@@ -128,8 +129,20 @@ IndexNext(IndexScanState *node)
/*
* ok, now that we have what we need, fetch the next tuple.
*/
- while (index_getnext_slot(scandesc, direction, slot))
+ while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
{
+ if (node->ss.ps.instrument)
+ InstrStartNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack);
+
+ if (unlikely(!index_fetch_heap(scandesc, slot)))
+ continue;
+
+ if (node->ss.ps.instrument)
+ InstrStopNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack);
+
+ if (scandesc->xs_heap_continue)
+ elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
CHECK_FOR_INTERRUPTS();
/*
@@ -812,6 +825,8 @@ ExecEndIndexScan(IndexScanState *node)
* which will have a new IndexOnlyScanState and zeroed stats.
*/
winstrument->nsearches += node->iss_Instrument.nsearches;
+ if (node->iss_InstrumentTableStack)
+ InstrStackAdd(&winstrument->worker_table_stack, node->iss_InstrumentTableStack);
}
/*
@@ -1819,4 +1834,10 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node)
SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
node->iss_SharedInfo = palloc(size);
memcpy(node->iss_SharedInfo, SharedInfo, size);
+
+ /* Aggregate workers' table buffer/WAL usage into leader's stack */
+ if (node->iss_InstrumentTableStack)
+ for (int i = 0; i < node->iss_SharedInfo->num_workers; i++)
+ InstrStackAdd(node->iss_InstrumentTableStack,
+ &node->iss_SharedInfo->winstrument[i].worker_table_stack);
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index d1babaff02351..511efaba516e7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -826,7 +826,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
PinLocalBuffer(bufHdr, true);
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
return true;
}
@@ -847,7 +847,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
if (BufferTagsEqual(&tag, &bufHdr->tag))
{
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
return true;
}
UnpinBuffer(bufHdr);
@@ -1248,14 +1248,14 @@ PinBufferForBlock(Relation rel,
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
if (*foundPtr)
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
}
else
{
bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
strategy, foundPtr, io_context);
if (*foundPtr)
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
}
if (rel)
{
@@ -1989,9 +1989,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
true);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
else
- pgBufferUsage.shared_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
if (operation->rel)
pgstat_count_buffer_hit(operation->rel);
@@ -2059,9 +2059,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
io_start, 1, io_buffers_len * BLCKSZ);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len);
else
- pgBufferUsage.shared_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len);
/*
* Track vacuum cost when issuing IO, not after waiting for it.
@@ -2954,7 +2954,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}
- pgBufferUsage.shared_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by);
*extended_by = extend_by;
@@ -3099,7 +3099,7 @@ MarkBufferDirty(Buffer buffer)
*/
if (!(old_buf_state & BM_DIRTY))
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
@@ -4528,7 +4528,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
IOOP_WRITE, io_start, 1, BLCKSZ);
- pgBufferUsage.shared_blks_written++;
+ INSTR_BUFUSAGE_INCR(shared_blks_written);
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
@@ -5689,7 +5689,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
if (dirtied)
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 04a540379a22f..e6054e745e819 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
/* Mark not-dirty */
TerminateLocalBufferIO(bufHdr, true, 0, false);
- pgBufferUsage.local_blks_written++;
+ INSTR_BUFUSAGE_INCR(local_blks_written);
}
static Buffer
@@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
*extended_by = extend_by;
- pgBufferUsage.local_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(local_blks_written, extend_by);
return first_block;
}
@@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer)
buf_state = pg_atomic_read_u64(&bufHdr->state);
if (!(buf_state & BM_DIRTY))
- pgBufferUsage.local_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(local_blks_dirtied);
buf_state |= BM_DIRTY;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index ddf3a410d6f54..1726b3a3b3370 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -474,13 +474,13 @@ BufFileLoadBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start);
}
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
- pgBufferUsage.temp_blks_read++;
+ INSTR_BUFUSAGE_INCR(temp_blks_read);
}
/*
@@ -548,13 +548,13 @@ BufFileDumpBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start);
}
file->curOffset += bytestowrite;
wpos += bytestowrite;
- pgBufferUsage.temp_blks_written++;
+ INSTR_BUFUSAGE_INCR(temp_blks_written);
}
file->dirty = false;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 28de24538dced..9e7a88ec0d07e 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
{
pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time);
}
else if (io_op == IOOP_READ)
{
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time);
}
}
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index d46ba59895d62..4406be9ed7b86 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function);
extern Node *MultiExecProcNode(PlanState *node);
extern void ExecEndNode(PlanState *node);
extern void ExecShutdownNode(PlanState *node);
+extern void ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent);
+extern void ExecFinalizeNodeInstrumentation(PlanState *node);
extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 9759f3ea5d8d9..8fb6c911fcb84 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -13,6 +13,7 @@
#ifndef INSTRUMENT_H
#define INSTRUMENT_H
+#include "lib/ilist.h"
#include "portability/instr_time.h"
@@ -67,7 +68,64 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/* Stack entry for current WAL/buffer usage */
+typedef struct InstrStack
+{
+ struct InstrStack *previous;
+ BufferUsage bufusage;
+ WalUsage walusage;
+
+ /*
+ * Tracking of instr stacks that need to be cleaned up on abort that are
+ * not registered as a resource owner themselves
+ */
+ slist_head unfinalized_children; /* head of unfinalized children list */
+ slist_node unfinalized_node; /* node in parent's unfinalized list */
+} InstrStack;
+
+/*
+ * General purpose instrumentation that can capture time and WAL/buffer usage
+ *
+ * Initialized through InstrAlloc, followed by one or more calls to a pair of
+ * InstrStart/InstrStop (activity is measured inbetween).
+ *
+ * Uses resource owner mechanism for handling aborts, as such, the caller
+ * *must* not exit out of top level transaction between InstrStart/InstrStop
+ * calls in regular execution. If this is needed, directly use InstrPushStack /
+ * InstrPopStack in a PG_TRY/PG_FINALLY block instead.
+ */
+struct ResourceOwnerData;
typedef struct Instrumentation
+{
+ /* Parameters set at creation: */
+ bool need_timer; /* true if we need timer data */
+ bool need_bufusage; /* true if we need buffer usage data */
+ bool need_walusage; /* true if we need WAL usage data */
+ /* Internal state keeping: */
+ instr_time starttime; /* start time of last InstrStart */
+ /* Accumulated statistics: */
+ instr_time total; /* total runtime */
+ InstrStack *stack; /* stack tracking buffer/WAL usage */
+ struct ResourceOwnerData *owner;
+} Instrumentation;
+
+/* Trigger instrumentation */
+typedef struct TriggerInstrumentation
+{
+ Instrumentation instr;
+ int firings; /* number of times the instrumented trigger
+ * was fired */
+} TriggerInstrumentation;
+
+/*
+ * Specialized instrumentation for per-node execution statistics
+ *
+ * Requires use of an outer InstrStart/InstrStop to handle the stack used for
+ * WAL/buffer usage statistics, and relies on it for managing aborts. Solely
+ * intended for the executor and anyone reporting about its activities (e.g.
+ * EXPLAIN ANALYZE).
+ */
+typedef struct NodeInstrumentation
{
/* Parameters set at node creation: */
bool need_timer; /* true if we need timer data */
@@ -80,8 +138,6 @@ typedef struct Instrumentation
instr_time counter; /* accumulated runtime for this node */
instr_time firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
- BufferUsage bufusage_start; /* buffer usage at start */
- WalUsage walusage_start; /* WAL usage at start */
/* Accumulated statistics across all completed cycles: */
instr_time startup; /* total startup time */
instr_time total; /* total time */
@@ -90,33 +146,131 @@ typedef struct Instrumentation
double nloops; /* # of run cycles for this node */
double nfiltered1; /* # of tuples removed by scanqual or joinqual */
double nfiltered2; /* # of tuples removed by "other" quals */
- BufferUsage bufusage; /* total buffer usage */
- WalUsage walusage; /* total WAL usage */
-} Instrumentation;
+ InstrStack stack; /* stack tracking buffer/WAL usage */
+} NodeInstrumentation;
-typedef struct WorkerInstrumentation
+/*
+ * Care must be taken with any pointers contained within this struct, as this
+ * gets copied across processes during parallel query execution.
+ */
+typedef struct WorkerNodeInstrumentation
{
int num_workers; /* # of structures that follow */
- Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
-} WorkerInstrumentation;
+ NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
+} WorkerNodeInstrumentation;
-extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
-extern Instrumentation *InstrAlloc(int n, int instrument_options,
- bool async_mode);
-extern void InstrInit(Instrumentation *instr, int instrument_options);
-extern void InstrStartNode(Instrumentation *instr);
-extern void InstrStopNode(Instrumentation *instr, double nTuples);
-extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples);
-extern void InstrEndLoop(Instrumentation *instr);
-extern void InstrAggNode(Instrumentation *dst, Instrumentation *add);
-extern void InstrStartParallelQuery(void);
-extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
+/*
+ * The top instrumentation stack represents a running total of the current
+ * backend WAL/buffer usage information. This will not be updated immediately,
+ * but rather when the current stack entry gets accumulated which typically
+ * happens at query end (see CurrentInstrStack below).
+ *
+ * Care must be taken when utilizing this in the parallel worker context:
+ * Parallel workers will report back their instrumentation to the caller,
+ * and this gets added to the caller's stack. If this were to be used in the
+ * shared memory stats infrastructure it would need to be skipped on parallel
+ * workers to avoid double counting.
+ */
+extern PGDLLIMPORT InstrStack TopInstrStack;
+
+/*
+ * The currently active stack entry that is getting updated as activity
+ * happens, and will be accumulated to parent stacks when it gets finalized
+ * by InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation
+ * (executor finish) or ResOwnerReleaseInstrumentation on abort.
+ */
+extern PGDLLIMPORT InstrStack *CurrentInstrStack;
+
+extern void InstrStackAdd(InstrStack *dst, InstrStack *add);
+
+/*
+ * Pushes the stack so that all WAL/buffer usage updates go to the passed in
+ * stack entry.
+ *
+ * Any caller using this directly must manage the passed in stack and call
+ * InstrPopStack on its own again, typically by using a PG_FINALLY block to
+ * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart
+ * instead when you want automatic handling of abort cases using the resource
+ * owner infrastructure.
+ */
+static inline void
+InstrPushStack(InstrStack *stack)
+{
+ stack->previous = CurrentInstrStack;
+ CurrentInstrStack = stack;
+}
+
+/*
+ * Pops the stack entry back to the previous one that was effective at
+ * InstrPushStack.
+ *
+ * Callers must ensure that no intermediate stack entries are skipped, to
+ * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY
+ * block, instead call InstrPopAndFinalizeStack which can skip intermediate
+ * stack entries, or instead use InstrStart/InstrStop.
+ */
+static inline void
+InstrPopStack(InstrStack *stack)
+{
+ Assert(stack != NULL);
+ CurrentInstrStack = stack->previous;
+}
+
+extern void InstrPopAndFinalizeStack(InstrStack *stack);
+
+extern Instrumentation *InstrAlloc(int instrument_options);
+extern void InstrStart(Instrumentation *instr);
+extern void InstrStop(Instrumentation *instr, bool finalize);
+
+extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options);
+extern void InstrStartTrigger(TriggerInstrumentation *tginstr);
+extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings);
+
+extern NodeInstrumentation *InstrAllocNode(int instrument_options,
+ bool async_mode);
+extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options);
+extern void InstrRememberNodeStack(InstrStack *parent, InstrStack *instr);
+extern void InstrStartNode(NodeInstrumentation *instr);
+extern void InstrStopNode(NodeInstrumentation *instr, double nTuples);
+extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent);
+extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples);
+extern void InstrEndLoop(NodeInstrumentation *instr);
+extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add);
+
+extern InstrStack *InstrAllocAdditionalNodeStack(NodeInstrumentation *instr);
+extern void InstrStartNodeStack(NodeInstrumentation *dst, InstrStack *stack);
+extern void InstrStopNodeStack(NodeInstrumentation *dst, InstrStack *stack);
+extern InstrStack *InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr);
+
+pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void);
+extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage);
extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-extern void BufferUsageAccumDiff(BufferUsage *dst,
- const BufferUsage *add, const BufferUsage *sub);
+extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
const WalUsage *sub);
+#define INSTR_BUFUSAGE_INCR(fld) do { \
+ CurrentInstrStack->bufusage.fld++; \
+ } while(0)
+#define INSTR_BUFUSAGE_ADD(fld,val) do { \
+ CurrentInstrStack->bufusage.fld += val; \
+ } while(0)
+#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
+ INSTR_TIME_ADD(CurrentInstrStack->bufusage.fld, val); \
+ } while (0)
+#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
+ INSTR_TIME_ACCUM_DIFF(CurrentInstrStack->bufusage.fld, endval, startval); \
+ } while (0)
+
+#define INSTR_WALUSAGE_INCR(fld) do { \
+ pgWalUsage.fld++; \
+ CurrentInstrStack->walusage.fld++; \
+ } while(0)
+#define INSTR_WALUSAGE_ADD(fld,val) do { \
+ pgWalUsage.fld += val; \
+ CurrentInstrStack->walusage.fld += val; \
+ } while(0)
+
#endif /* INSTRUMENT_H */
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 8847d7f94fa57..dd45d72559dd7 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -18,6 +18,8 @@
#ifndef INSTRUMENT_NODE_H
#define INSTRUMENT_NODE_H
+#include "executor/instrument.h"
+
/* ---------------------
* Instrumentation information for aggregate function execution
@@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation
{
/* Index search count (incremented with pgstat_count_index_scan call) */
uint64 nsearches;
+
+ /* Used for passing iss_InstrumentTableStack data from parallel workers */
+ InstrStack worker_table_stack;
} IndexScanInstrumentation;
/*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 63c067d5aae61..6a6427afbd2f7 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -524,7 +524,7 @@ typedef struct ResultRelInfo
ExprState **ri_TrigWhenExprs;
/* optional runtime measurements for triggers */
- Instrumentation *ri_TrigInstrument;
+ TriggerInstrumentation *ri_TrigInstrument;
/* On-demand created slots for triggers / returning processing */
TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */
@@ -1175,8 +1175,10 @@ typedef struct PlanState
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper */
- Instrumentation *instrument; /* Optional runtime stats for this node */
- WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
+ NodeInstrumentation *instrument; /* Optional runtime stats for this
+ * node */
+ WorkerNodeInstrumentation *worker_instrument; /* per-worker
+ * instrumentation */
/* Per-worker JIT instrumentation */
struct SharedJitInstrumentation *worker_jit_instrument;
@@ -1726,6 +1728,13 @@ typedef struct IndexScanState
IndexScanInstrumentation iss_Instrument;
SharedIndexScanInstrumentation *iss_SharedInfo;
+ /*
+ * Instrumentation stack utilized for tracking table access. This is
+ * separate from iss_Instrument since it needs to be allocated in the
+ * right context and IndexScanInstrumentation shouldn't contain pointers.
+ */
+ InstrStack *iss_InstrumentTableStack;
+
/* These are needed for re-checking ORDER BY expr ordering */
pairingheap *iss_ReorderQueue;
bool iss_ReachedEnd;
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 490593d182512..87b9ec95531b9 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -28,6 +28,8 @@
*
* INSTR_TIME_ACCUM_DIFF(x, y, z) x += (y - z)
*
+ * INSTR_TIME_GT(x, y) x > y
+ *
* INSTR_TIME_GET_DOUBLE(t) convert t to double (in seconds)
*
* INSTR_TIME_GET_MILLISEC(t) convert t to double (in milliseconds)
@@ -184,7 +186,7 @@ GetTimerFrequency(void)
#define INSTR_TIME_ACCUM_DIFF(x,y,z) \
((x).ticks += (y).ticks - (z).ticks)
-#define INSTR_TIME_LT(x,y) \
+#define INSTR_TIME_GT(x,y) \
((x).ticks > (y).ticks)
#define INSTR_TIME_GET_DOUBLE(t) \
diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h
index eb6033b4fdb65..5463bc921f06e 100644
--- a/src/include/utils/resowner.h
+++ b/src/include/utils/resowner.h
@@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority;
#define RELEASE_PRIO_SNAPSHOT_REFS 500
#define RELEASE_PRIO_FILES 600
#define RELEASE_PRIO_WAITEVENTSETS 700
+#define RELEASE_PRIO_INSTRUMENTATION 800
/* 0 is considered invalid */
#define RELEASE_PRIO_FIRST 1
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 7c1f26b182cb0..e28e754369314 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -822,3 +822,200 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
(9 rows)
reset work_mem;
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+ exception_buffers_visible
+---------------------------
+ t
+(1 row)
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+ exception_buffers_nested_visible
+----------------------------------
+ t
+(1 row)
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly accumulated.
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_cursor_explain_buffers();
+ noscroll_ok | scroll_ok
+-------------+-----------
+ t | t
+(1 row)
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+-- Parallel query buffer double-counting test.
+--
+-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers.
+-- They scan the same table so the buffer count should be similar.
+-- Double-counting would make the parallel count ~2x larger.
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+SELECT * FROM check_parallel_explain_buffers();
+ ratio
+-------
+ 1
+(1 row)
+
+DROP FUNCTION check_parallel_explain_buffers;
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index ebdab42604beb..cf5c6335a1922 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -188,3 +188,197 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove
-- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk)
select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))');
reset work_mem;
+
+-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL
+-- EXCEPTION blocks, even after subtransaction rollback.
+CREATE TEMP TABLE explain_exc_tab (a int, b char(20));
+INSERT INTO explain_exc_tab VALUES (0, 'zzz');
+
+CREATE FUNCTION explain_exc_func() RETURNS void AS $$
+DECLARE
+ v int;
+BEGIN
+ WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a)
+ SELECT a / 0 INTO v FROM ins;
+EXCEPTION WHEN division_by_zero THEN
+ NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT explain_exc_func()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers() AS exception_buffers_visible;
+
+-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation)
+CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$
+DECLARE
+ plan_json json;
+ node json;
+ total_buffers int;
+BEGIN
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT check_explain_exception_buffers()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ total_buffers :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+ RETURN total_buffers > 0;
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible;
+
+DROP FUNCTION check_explain_exception_buffers_nested;
+DROP FUNCTION check_explain_exception_buffers;
+DROP FUNCTION explain_exc_func;
+DROP TABLE explain_exc_tab;
+
+-- Cursor instrumentation test.
+-- Verify that buffer usage is correctly tracked through cursor execution paths.
+-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun
+-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down
+-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor
+-- scan should be correctly accumulated.
+
+CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1;
+
+CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$
+DECLARE
+ cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test;
+ rec RECORD;
+ cnt bigint := 0;
+BEGIN
+ OPEN cur;
+ LOOP
+ FETCH NEXT FROM cur INTO rec;
+ EXIT WHEN NOT FOUND;
+ cnt := cnt + 1;
+ END LOOP;
+ CLOSE cur;
+ RETURN cnt;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$
+DECLARE
+ plan_json json;
+ node json;
+ direct_buf int;
+ noscroll_buf int;
+ scroll_buf int;
+BEGIN
+ -- Direct scan: get leaf Seq Scan node buffers as baseline
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT * FROM cursor_buf_test' INTO plan_json;
+ node := plan_json->0->'Plan';
+ WHILE node->'Plans' IS NOT NULL LOOP
+ node := node->'Plans'->0;
+ END LOOP;
+ direct_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_noscroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ noscroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Scrollable cursor path: ExecShutdownNode is skipped
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT cursor_scroll_scan()' INTO plan_json;
+ node := plan_json->0->'Plan';
+ scroll_buf :=
+ COALESCE((node->>'Local Hit Blocks')::int, 0) +
+ COALESCE((node->>'Local Read Blocks')::int, 0);
+
+ -- Both cursor paths should report buffer counts about as high as
+ -- the direct scan (same data plus minor catalog overhead), and not
+ -- double-counted (< 2x the direct scan)
+ RETURN QUERY SELECT
+ (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2),
+ (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2);
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_cursor_explain_buffers();
+
+DROP FUNCTION check_cursor_explain_buffers;
+DROP FUNCTION cursor_noscroll_scan;
+DROP FUNCTION cursor_scroll_scan;
+DROP TABLE cursor_buf_test;
+
+-- Parallel query buffer double-counting test.
+--
+-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers.
+-- They scan the same table so the buffer count should be similar.
+-- Double-counting would make the parallel count ~2x larger.
+CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$
+DECLARE
+ plan_json json;
+ serial_buffers int;
+ parallel_buffers int;
+ node json;
+BEGIN
+ -- Serial --
+ SET LOCAL max_parallel_workers_per_gather = 0;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ serial_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ -- Parallel --
+ SET LOCAL parallel_setup_cost = 0;
+ SET LOCAL parallel_tuple_cost = 0;
+ SET LOCAL min_parallel_table_scan_size = 0;
+ SET LOCAL max_parallel_workers_per_gather = 2;
+ SET LOCAL parallel_leader_participation = off;
+ EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON)
+ SELECT count(*) FROM tenk1' INTO plan_json;
+ node := plan_json->0->'Plan';
+ parallel_buffers :=
+ COALESCE((node->>'Shared Hit Blocks')::int, 0) +
+ COALESCE((node->>'Shared Read Blocks')::int, 0);
+
+ RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1));
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT * FROM check_parallel_explain_buffers();
+
+DROP FUNCTION check_parallel_explain_buffers;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 241945734ec86..b3146b0a1651e 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1319,6 +1319,7 @@ InjectionPointSharedState
InjectionPointsCtl
InlineCodeBlock
InsertStmt
+InstrStack
Instrumentation
Int128AggState
Int8TransTypeData
@@ -1784,6 +1785,7 @@ NextSampleBlock_function
NextSampleTuple_function
NextValueExpr
Node
+NodeInstrumentation
NodeTag
NonEmptyRange
NoneCompressorState
@@ -3155,6 +3157,7 @@ TriggerDesc
TriggerEvent
TriggerFlags
TriggerInfo
+TriggerInstrumentation
TriggerTransition
TruncateStmt
TsmRoutine
@@ -3375,9 +3378,9 @@ WorkTableScan
WorkTableScanState
WorkerInfo
WorkerInfoData
-WorkerInstrumentation
WorkerJobDumpPtrType
WorkerJobRestorePtrType
+WorkerNodeInstrumentation
Working_State
WriteBufPtrType
WriteBytePtrType