diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c index e856cd35a6f0f..39bf2543b701d 100644 --- a/contrib/auto_explain/auto_explain.c +++ b/contrib/auto_explain/auto_explain.c @@ -315,7 +315,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -381,12 +381,6 @@ explain_ExecutorEnd(QueryDesc *queryDesc) */ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. */ msec = INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total); if (msec >= auto_explain_log_min_duration) diff --git a/contrib/meson.build b/contrib/meson.build index def13257cbead..cab1b21167895 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -50,6 +50,7 @@ subdir('pg_logicalinspect') subdir('pg_overexplain') subdir('pg_prewarm') subdir('pgrowlocks') +subdir('pg_session_buffer_usage') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') diff --git a/contrib/pg_session_buffer_usage/Makefile b/contrib/pg_session_buffer_usage/Makefile new file mode 100644 index 0000000000000..75bd8e09b3d3d --- /dev/null +++ b/contrib/pg_session_buffer_usage/Makefile @@ -0,0 +1,23 @@ +# contrib/pg_session_buffer_usage/Makefile + +MODULE_big = pg_session_buffer_usage +OBJS = \ + $(WIN32RES) \ + pg_session_buffer_usage.o + +EXTENSION = pg_session_buffer_usage +DATA = pg_session_buffer_usage--1.0.sql +PGFILEDESC = "pg_session_buffer_usage - show buffer usage statistics for the current session" + +REGRESS = pg_session_buffer_usage + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_session_buffer_usage +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out new file mode 100644 index 0000000000000..242b400395043 --- /dev/null +++ b/contrib/pg_session_buffer_usage/expected/pg_session_buffer_usage.out @@ -0,0 +1,283 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + ok +---- + t +(1 row) + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + count +------- + 1000 +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + blocks_increased +------------------ + t +(1 row) + +DROP TABLE test_buf_activity; +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT count(*) FROM par_dc_tab; + count +------- + 5000 +(1 row) + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + leader_buffers_match +---------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_dc_tab, dc_serial_result; +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; +ERROR: division by zero +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + exception_buffers_visible +--------------------------- + t +(1 row) + +DROP TABLE exc_tab; +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +INSERT INTO unique_tab VALUES (1, 'duplicate'); +ERROR: duplicate key value violates unique constraint "unique_tab_a_key" +DETAIL: Key (a)=(1) already exists. +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + unique_violation_buffers_visible +---------------------------------- + t +(1 row) + +DROP TABLE unique_tab; +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT subxact_exc_func(); + subxact_exc_func +------------------ + caught +(1 row) + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + subxact_buffers_visible +------------------------- + t +(1 row) + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT cursor_exc_func(); + cursor_exc_func +----------------------- + caught after 250 rows +(1 row) + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + cursor_subxact_buffers_visible +-------------------------------- + t +(1 row) + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + count +------- + 5000 +(1 row) + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently +SELECT pg_session_buffer_usage_reset(); + pg_session_buffer_usage_reset +------------------------------- + +(1 row) + +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; +ERROR: invalid input syntax for type smallint: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +CONTEXT: parallel worker +RESET debug_parallel_query; +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + worker_abort_buffers_not_propagated +------------------------------------- + t +(1 row) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; +DROP TABLE par_abort_tab, par_abort_serial_result; +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_session_buffer_usage/meson.build b/contrib/pg_session_buffer_usage/meson.build new file mode 100644 index 0000000000000..34c7502beb463 --- /dev/null +++ b/contrib/pg_session_buffer_usage/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +pg_session_buffer_usage_sources = files( + 'pg_session_buffer_usage.c', +) + +if host_system == 'windows' + pg_session_buffer_usage_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_session_buffer_usage', + '--FILEDESC', 'pg_session_buffer_usage - show buffer usage statistics for the current session',]) +endif + +pg_session_buffer_usage = shared_module('pg_session_buffer_usage', + pg_session_buffer_usage_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_session_buffer_usage + +install_data( + 'pg_session_buffer_usage--1.0.sql', + 'pg_session_buffer_usage.control', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_session_buffer_usage', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_session_buffer_usage', + ], + }, +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql new file mode 100644 index 0000000000000..b300fdbc64397 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql @@ -0,0 +1,31 @@ +/* contrib/pg_session_buffer_usage/pg_session_buffer_usage--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_session_buffer_usage" to load this file. \quit + +CREATE FUNCTION pg_session_buffer_usage( + OUT shared_blks_hit bigint, + OUT shared_blks_read bigint, + OUT shared_blks_dirtied bigint, + OUT shared_blks_written bigint, + OUT local_blks_hit bigint, + OUT local_blks_read bigint, + OUT local_blks_dirtied bigint, + OUT local_blks_written bigint, + OUT temp_blks_read bigint, + OUT temp_blks_written bigint, + OUT shared_blk_read_time double precision, + OUT shared_blk_write_time double precision, + OUT local_blk_read_time double precision, + OUT local_blk_write_time double precision, + OUT temp_blk_read_time double precision, + OUT temp_blk_write_time double precision +) +RETURNS record +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage' +LANGUAGE C PARALLEL RESTRICTED; + +CREATE FUNCTION pg_session_buffer_usage_reset() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_session_buffer_usage_reset' +LANGUAGE C PARALLEL RESTRICTED; diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c new file mode 100644 index 0000000000000..20ed3c11ebda8 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.c @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * pg_session_buffer_usage.c + * show buffer usage statistics for the current session + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * contrib/pg_session_buffer_usage/pg_session_buffer_usage.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "libpq/auth.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +PG_MODULE_MAGIC_EXT( + .name = "pg_session_buffer_usage", + .version = PG_VERSION +); + +#define NUM_BUFFER_USAGE_COLUMNS 16 + +PG_FUNCTION_INFO_V1(pg_session_buffer_usage); +PG_FUNCTION_INFO_V1(pg_session_buffer_usage_reset); + +#define HAVE_INSTR_STACK 1 /* Change to 0 when testing before stack + * change */ + +/* + * SQL function: pg_session_buffer_usage() + * + * Returns a single row with all BufferUsage counters accumulated since the + * start of the session. Excludes any usage not yet added to the top of the + * stack (e.g. if this gets called inside a statement that also had buffer + * activity). + */ +Datum +pg_session_buffer_usage(PG_FUNCTION_ARGS) +{ + TupleDesc tupdesc; + Datum values[NUM_BUFFER_USAGE_COLUMNS]; + bool nulls[NUM_BUFFER_USAGE_COLUMNS]; + BufferUsage *usage; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + memset(nulls, 0, sizeof(nulls)); + +#if HAVE_INSTR_STACK + usage = &TopInstrStack.bufusage; +#else + usage = &pgBufferUsage; +#endif + + values[0] = Int64GetDatum(usage->shared_blks_hit); + values[1] = Int64GetDatum(usage->shared_blks_read); + values[2] = Int64GetDatum(usage->shared_blks_dirtied); + values[3] = Int64GetDatum(usage->shared_blks_written); + values[4] = Int64GetDatum(usage->local_blks_hit); + values[5] = Int64GetDatum(usage->local_blks_read); + values[6] = Int64GetDatum(usage->local_blks_dirtied); + values[7] = Int64GetDatum(usage->local_blks_written); + values[8] = Int64GetDatum(usage->temp_blks_read); + values[9] = Int64GetDatum(usage->temp_blks_written); + values[10] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time)); + values[11] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time)); + values[12] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time)); + values[13] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time)); + values[14] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time)); + values[15] = Float8GetDatum(INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time)); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * SQL function: pg_session_buffer_usage_reset() + * + * Resets all BufferUsage counters on the top instrumentation stack to zero. + * Useful in tests to avoid the baseline/delta pattern. + */ +Datum +pg_session_buffer_usage_reset(PG_FUNCTION_ARGS) +{ +#if HAVE_INSTR_STACK + memset(&TopInstrStack.bufusage, 0, sizeof(BufferUsage)); +#else + memset(&pgBufferUsage, 0, sizeof(BufferUsage)); +#endif + + PG_RETURN_VOID(); +} diff --git a/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control new file mode 100644 index 0000000000000..fabd05ee02458 --- /dev/null +++ b/contrib/pg_session_buffer_usage/pg_session_buffer_usage.control @@ -0,0 +1,5 @@ +# pg_session_buffer_usage extension +comment = 'show buffer usage statistics for the current session' +default_version = '1.0' +module_pathname = '$libdir/pg_session_buffer_usage' +relocatable = true diff --git a/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql new file mode 100644 index 0000000000000..8f5810fadd337 --- /dev/null +++ b/contrib/pg_session_buffer_usage/sql/pg_session_buffer_usage.sql @@ -0,0 +1,204 @@ +LOAD 'pg_session_buffer_usage'; +CREATE EXTENSION pg_session_buffer_usage; + +-- Verify all columns are non-negative +SELECT count(*) = 1 AS ok FROM pg_session_buffer_usage() +WHERE shared_blks_hit >= 0 AND shared_blks_read >= 0 + AND shared_blks_dirtied >= 0 AND shared_blks_written >= 0 + AND local_blks_hit >= 0 AND local_blks_read >= 0 + AND local_blks_dirtied >= 0 AND local_blks_written >= 0 + AND temp_blks_read >= 0 AND temp_blks_written >= 0 + AND shared_blk_read_time >= 0 AND shared_blk_write_time >= 0 + AND local_blk_read_time >= 0 AND local_blk_write_time >= 0 + AND temp_blk_read_time >= 0 AND temp_blk_write_time >= 0; + +-- Verify counters increase after buffer activity +SELECT pg_session_buffer_usage_reset(); + +CREATE TEMP TABLE test_buf_activity (id int, data text); +INSERT INTO test_buf_activity SELECT i, repeat('x', 100) FROM generate_series(1, 1000) AS i; +SELECT count(*) FROM test_buf_activity; + +SELECT local_blks_hit + local_blks_read > 0 AS blocks_increased +FROM pg_session_buffer_usage(); + +DROP TABLE test_buf_activity; + +-- Parallel query test +CREATE TABLE par_dc_tab (a int, b char(200)); +INSERT INTO par_dc_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +SELECT count(*) FROM par_dc_tab; + +-- Measure serial scan delta (leader does all the work) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +CREATE TEMP TABLE dc_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Measure parallel scan delta with leader NOT participating in scanning. +-- Workers do all table scanning; leader only runs the Gather node. +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; + +SELECT pg_session_buffer_usage_reset(); +SELECT count(*) FROM par_dc_tab; + +-- Confirm we got a similar hit counter through parallel worker accumulation +SELECT shared_blks_hit > s.serial_delta / 2 AND shared_blks_hit < s.serial_delta * 2 + AS leader_buffers_match +FROM pg_session_buffer_usage(), dc_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_dc_tab, dc_serial_result; + +-- +-- Abort/exception tests: verify buffer usage survives various error paths. +-- + +-- Rolled-back divide-by-zero under EXPLAIN ANALYZE +CREATE TEMP TABLE exc_tab (a int, b char(20)); + +SELECT pg_session_buffer_usage_reset(); + +EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 FROM ins; + +SELECT local_blks_dirtied > 0 AS exception_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE exc_tab; + +-- Unique constraint violation in regular query +CREATE TEMP TABLE unique_tab (a int UNIQUE, b char(20)); +INSERT INTO unique_tab VALUES (1, 'first'); + +SELECT pg_session_buffer_usage_reset(); +INSERT INTO unique_tab VALUES (1, 'duplicate'); + +SELECT local_blks_hit > 0 AS unique_violation_buffers_visible +FROM pg_session_buffer_usage(); + +DROP TABLE unique_tab; + +-- Caught exception in PL/pgSQL subtransaction (BEGIN...EXCEPTION) +CREATE TEMP TABLE subxact_tab (a int, b char(20)); + +CREATE FUNCTION subxact_exc_func() RETURNS text AS $$ +BEGIN + BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF) + WITH ins AS (INSERT INTO subxact_tab VALUES (1, ''aaa'') RETURNING a) + SELECT a / 0 FROM ins'; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT subxact_exc_func(); + +SELECT local_blks_dirtied > 0 AS subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION subxact_exc_func; +DROP TABLE subxact_tab; + +-- Cursor (FOR loop) in aborted subtransaction; verify post-exception tracking +CREATE TEMP TABLE cursor_tab (a int, b char(200)); +INSERT INTO cursor_tab SELECT i, repeat('x', 200) FROM generate_series(1, 500) AS i; + +CREATE FUNCTION cursor_exc_func() RETURNS text AS $$ +DECLARE + rec record; + cnt int := 0; +BEGIN + BEGIN + FOR rec IN SELECT * FROM cursor_tab LOOP + cnt := cnt + 1; + IF cnt = 250 THEN + PERFORM 1 / 0; + END IF; + END LOOP; + EXCEPTION WHEN division_by_zero THEN + RETURN 'caught after ' || cnt || ' rows'; + END; + RETURN 'not reached'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_session_buffer_usage_reset(); +SELECT cursor_exc_func(); + +SELECT local_blks_hit + local_blks_read > 0 + AS cursor_subxact_buffers_visible +FROM pg_session_buffer_usage(); + +DROP FUNCTION cursor_exc_func; +DROP TABLE cursor_tab; + +-- Parallel worker abort: worker buffer activity is currently NOT propagated on abort. +-- +-- When a parallel worker aborts, InstrEndParallelQuery and +-- ExecParallelReportInstrumentation never run, so the worker's buffer +-- activity is never written to shared memory, despite the information having been +-- captured by the res owner release instrumentation handling. +CREATE TABLE par_abort_tab (a int, b char(200)); +INSERT INTO par_abort_tab SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +-- Warm shared buffers so all reads become hits +SELECT count(*) FROM par_abort_tab; + +-- Measure serial scan delta as a reference (leader reads all blocks) +SET max_parallel_workers_per_gather = 0; + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +CREATE TABLE par_abort_serial_result AS +SELECT shared_blks_hit AS serial_delta FROM pg_session_buffer_usage(); + +-- Now force parallel with leader NOT participating in scanning +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 2; +SET parallel_leader_participation = off; +SET debug_parallel_query = on; -- Ensure we get CONTEXT line consistently + +SELECT pg_session_buffer_usage_reset(); +SELECT b::int2 FROM par_abort_tab WHERE a > 1000; + +RESET debug_parallel_query; + +-- Workers scanned the table but aborted before reporting stats back. +-- The leader's delta should be much less than a serial scan, documenting +-- that worker buffer activity is lost on abort. +SELECT shared_blks_hit < s.serial_delta / 2 + AS worker_abort_buffers_not_propagated +FROM pg_session_buffer_usage(), par_abort_serial_result s; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; +RESET max_parallel_workers_per_gather; +RESET parallel_leader_participation; + +DROP TABLE par_abort_tab, par_abort_serial_result; + +-- Cleanup +DROP EXTENSION pg_session_buffer_usage; diff --git a/contrib/pg_stat_statements/expected/utility.out b/contrib/pg_stat_statements/expected/utility.out index e4d6564ea5b5a..cba487f6be582 100644 --- a/contrib/pg_stat_statements/expected/utility.out +++ b/contrib/pg_stat_statements/expected/utility.out @@ -289,6 +289,76 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; 1 | 1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t (3 rows) +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT run_explain_buffers_test(); + run_explain_buffers_test +-------------------------- + +(1 row) + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + query | has_buffer_stats +-----------------------------------+------------------ + SELECT run_explain_buffers_test() | t +(1 row) + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +CALL pgss_call_rollback_proc(); +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------------+-------+---------------+---------------------+----------------------- + CALL pgss_call_rollback_proc() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/expected/wal.out b/contrib/pg_stat_statements/expected/wal.out index 977e382d84894..611213daef6c2 100644 --- a/contrib/pg_stat_statements/expected/wal.out +++ b/contrib/pg_stat_statements/expected/wal.out @@ -28,3 +28,51 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t; t (1 row) +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT pgss_error_func(); + pgss_error_func +----------------- + +(1 row) + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + query | calls | local_hitread | wal_bytes_generated | wal_records_generated +--------------------------+-------+---------------+---------------------+----------------------- + SELECT pgss_error_func() | 1 | t | t | t +(1 row) + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 4a427533bd88d..fe3237fc665fd 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -911,21 +911,13 @@ pgss_planner(Query *parse, { instr_time start; instr_time duration; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - /* We need to track buffer usage as the planner can access them. */ - bufusage_start = pgBufferUsage; - - /* - * Similarly the planner could write some WAL records in some cases - * (e.g. setting a hint bit with those being WAL-logged) - */ - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + /* We need to track buffer/WAL usage as the planner can access them. */ + InstrPushStack(&stack); + nesting_level++; PG_TRY(); { @@ -938,6 +930,7 @@ pgss_planner(Query *parse, } PG_FINALLY(); { + InstrPopAndFinalizeStack(&stack); nesting_level--; } PG_END_TRY(); @@ -945,14 +938,6 @@ pgss_planner(Query *parse, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(query_string, parse->queryId, parse->stmt_location, @@ -960,8 +945,8 @@ pgss_planner(Query *parse, PGSS_PLAN, INSTR_TIME_GET_MILLISEC(duration), 0, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1023,7 +1008,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + queryDesc->totaltime = InstrAlloc(INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } @@ -1083,10 +1068,11 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) pgss_enabled(nesting_level)) { /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) + * Check if stack is initialized - it is not when ExecutorRun wasn't + * called */ - InstrEndLoop(queryDesc->totaltime); + const BufferUsage *bufusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->bufusage : NULL; + const WalUsage *walusage = queryDesc->totaltime->stack ? &queryDesc->totaltime->stack->walusage : NULL; pgss_store(queryDesc->sourceText, queryId, @@ -1095,8 +1081,8 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) PGSS_EXEC, INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), queryDesc->estate->es_total_processed, - &queryDesc->totaltime->bufusage, - &queryDesc->totaltime->walusage, + bufusage, + walusage, queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, @@ -1163,14 +1149,10 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, instr_time start; instr_time duration; uint64 rows; - BufferUsage bufusage_start, - bufusage; - WalUsage walusage_start, - walusage; + InstrStack stack = {0}; - bufusage_start = pgBufferUsage; - walusage_start = pgWalUsage; INSTR_TIME_SET_CURRENT(start); + InstrPushStack(&stack); nesting_level++; PG_TRY(); @@ -1186,6 +1168,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, } PG_FINALLY(); { + InstrPopAndFinalizeStack(&stack); nesting_level--; } PG_END_TRY(); @@ -1214,14 +1197,6 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ? qc->nprocessed : 0; - /* calc differences of buffer counters. */ - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - - /* calc differences of WAL counters. */ - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start); - pgss_store(queryString, saved_queryId, saved_stmt_location, @@ -1229,8 +1204,8 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, PGSS_EXEC, INSTR_TIME_GET_MILLISEC(duration), rows, - &bufusage, - &walusage, + &stack.bufusage, + &stack.walusage, NULL, NULL, 0, @@ -1460,27 +1435,33 @@ pgss_store(const char *query, int64 queryId, } } entry->counters.rows += rows; - entry->counters.shared_blks_hit += bufusage->shared_blks_hit; - entry->counters.shared_blks_read += bufusage->shared_blks_read; - entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; - entry->counters.shared_blks_written += bufusage->shared_blks_written; - entry->counters.local_blks_hit += bufusage->local_blks_hit; - entry->counters.local_blks_read += bufusage->local_blks_read; - entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; - entry->counters.local_blks_written += bufusage->local_blks_written; - entry->counters.temp_blks_read += bufusage->temp_blks_read; - entry->counters.temp_blks_written += bufusage->temp_blks_written; - entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); - entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); - entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); - entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); - entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); - entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + if (bufusage) + { + entry->counters.shared_blks_hit += bufusage->shared_blks_hit; + entry->counters.shared_blks_read += bufusage->shared_blks_read; + entry->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied; + entry->counters.shared_blks_written += bufusage->shared_blks_written; + entry->counters.local_blks_hit += bufusage->local_blks_hit; + entry->counters.local_blks_read += bufusage->local_blks_read; + entry->counters.local_blks_dirtied += bufusage->local_blks_dirtied; + entry->counters.local_blks_written += bufusage->local_blks_written; + entry->counters.temp_blks_read += bufusage->temp_blks_read; + entry->counters.temp_blks_written += bufusage->temp_blks_written; + entry->counters.shared_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_read_time); + entry->counters.shared_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->shared_blk_write_time); + entry->counters.local_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_read_time); + entry->counters.local_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->local_blk_write_time); + entry->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time); + entry->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time); + } entry->counters.usage += USAGE_EXEC(total_time); - entry->counters.wal_records += walusage->wal_records; - entry->counters.wal_fpi += walusage->wal_fpi; - entry->counters.wal_bytes += walusage->wal_bytes; - entry->counters.wal_buffers_full += walusage->wal_buffers_full; + if (walusage) + { + entry->counters.wal_records += walusage->wal_records; + entry->counters.wal_fpi += walusage->wal_fpi; + entry->counters.wal_bytes += walusage->wal_bytes; + entry->counters.wal_buffers_full += walusage->wal_buffers_full; + } if (jitusage) { entry->counters.jit_functions += jitusage->created_functions; diff --git a/contrib/pg_stat_statements/sql/utility.sql b/contrib/pg_stat_statements/sql/utility.sql index dd97203c21025..7540e49c73caf 100644 --- a/contrib/pg_stat_statements/sql/utility.sql +++ b/contrib/pg_stat_statements/sql/utility.sql @@ -152,6 +152,62 @@ EXPLAIN (costs off) SELECT a FROM generate_series(1,10) AS tab(a) WHERE a = 7; SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"; +-- Buffer stats should flow through EXPLAIN ANALYZE +CREATE TEMP TABLE flow_through_test (a int, b char(200)); +INSERT INTO flow_through_test SELECT i, repeat('x', 200) FROM generate_series(1, 5000) AS i; + +CREATE FUNCTION run_explain_buffers_test() RETURNS void AS $$ +DECLARE +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM flow_through_test'; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +SELECT run_explain_buffers_test(); + +-- EXPLAIN entries should have non-zero buffer stats +SELECT query, local_blks_hit + local_blks_read > 0 as has_buffer_stats +FROM pg_stat_statements +WHERE query LIKE 'SELECT run_explain_buffers_test%' +ORDER BY query COLLATE "C"; + +DROP FUNCTION run_explain_buffers_test; +DROP TABLE flow_through_test; + +-- Validate buffer/WAL counting during abort +SET pg_stat_statements.track = 'all'; +CREATE TEMP TABLE pgss_call_tab (a int, b char(20)); +CREATE TEMP TABLE pgss_call_tab2 (a int, b char(20)); +INSERT INTO pgss_call_tab VALUES (0, 'zzz'); + +CREATE PROCEDURE pgss_call_rollback_proc() AS $$ +DECLARE + v int; +BEGIN + EXPLAIN ANALYZE WITH ins AS (INSERT INTO pgss_call_tab2 SELECT * FROM pgss_call_tab RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +CALL pgss_call_rollback_proc(); + +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_call_rollback_proc%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_call_tab2; +DROP TABLE pgss_call_tab; +DROP PROCEDURE pgss_call_rollback_proc; +SET pg_stat_statements.track = 'top'; + -- CALL CREATE OR REPLACE PROCEDURE sum_one(i int) AS $$ DECLARE diff --git a/contrib/pg_stat_statements/sql/wal.sql b/contrib/pg_stat_statements/sql/wal.sql index 1dc1552a81ebc..467e321b2062e 100644 --- a/contrib/pg_stat_statements/sql/wal.sql +++ b/contrib/pg_stat_statements/sql/wal.sql @@ -18,3 +18,36 @@ wal_records > 0 as wal_records_generated, wal_records >= rows as wal_records_ge_rows FROM pg_stat_statements ORDER BY query COLLATE "C"; SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- +-- Validate buffer/WAL counting with caught exception in PL/pgSQL +-- +CREATE TEMP TABLE pgss_error_tab (a int, b char(20)); +INSERT INTO pgss_error_tab VALUES (0, 'zzz'); + +CREATE FUNCTION pgss_error_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO pgss_error_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +SELECT pg_stat_statements_reset() IS NOT NULL AS t; +SELECT pgss_error_func(); + +-- Buffer/WAL usage from the wCTE INSERT should survive the exception +SELECT query, calls, +local_blks_hit + local_blks_read > 0 as local_hitread, +wal_bytes > 0 as wal_bytes_generated, +wal_records > 0 as wal_records_generated +FROM pg_stat_statements +WHERE query LIKE '%pgss_error_func%' +ORDER BY query COLLATE "C"; + +DROP TABLE pgss_error_tab; +DROP FUNCTION pgss_error_func; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 60d90329a651d..6f0cb2a285b38 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node) if (!resultRelInfo->ri_projectReturning) { TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - Instrumentation *instr = node->ss.ps.instrument; + NodeInstrumentation *instr = node->ss.ps.instrument; Assert(!dmstate->has_returning); diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5f6f1db0467e9..9219625faf6e3 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 7dee77fd366b0..912c96f2ff51a 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 6887e421442ee..c1c3d03b6edb6 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2885,6 +2885,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2934,7 +2935,7 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2949,7 +2950,8 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 0d63fb4ba27b6..9149d735d59b3 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -2108,6 +2108,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -2176,7 +2177,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) tuplesort_attach_shared(sharedsort, seg); /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Might as well use reliable figure when doling out maintenance_work_mem @@ -2191,7 +2192,8 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); index_close(indexRel, indexLockmode); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4be267ff657e1..51f86ff1ef7bf 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -638,8 +638,7 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, TimestampTz starttime = 0; PgStat_Counter startreadtime = 0, startwritetime = 0; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; + Instrumentation *instr = NULL; ErrorContextCallback errcallback; char **indnames = NULL; Size dead_items_max_bytes = 0; @@ -655,6 +654,8 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, startreadtime = pgStatBlockReadTime; startwritetime = pgStatBlockWriteTime; } + instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -980,14 +981,14 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr, true); + if (verbose || params.log_vacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_vacuum_min_duration)) { long secs_dur; int usecs_dur; - WalUsage walusage; - BufferUsage bufferusage; StringInfoData buf; char *msgfmt; int32 diff; @@ -996,12 +997,10 @@ heap_vacuum_rel(Relation rel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; + BufferUsage bufferusage = instr->stack->bufusage; + WalUsage walusage = instr->stack->walusage; TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); total_blks_hit = bufferusage.shared_blks_hit + bufferusage.local_blks_hit; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 3a45508f62ea4..600f23ff2a6f9 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1750,6 +1750,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) Relation indexRel; LOCKMODE heapLockmode; LOCKMODE indexLockmode; + Instrumentation *instr; WalUsage *walusage; BufferUsage *bufferusage; int sortmem; @@ -1825,7 +1826,7 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) } /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Perform sorting of spool, and possibly a spool2 */ sortmem = maintenance_work_mem / btshared->scantuplesortstates; @@ -1835,7 +1836,8 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &bufferusage[ParallelWorkerNumber], &walusage[ParallelWorkerNumber]); #ifdef BTREE_BUILD_STATS diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 13cce9b49f16e..6a635bcdf1182 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1080,10 +1080,10 @@ XLogInsertRecord(XLogRecData *rdata, /* Report WAL traffic to the instrumentation. */ if (inserted) { - pgWalUsage.wal_bytes += rechdr->xl_tot_len; - pgWalUsage.wal_records++; - pgWalUsage.wal_fpi += num_fpi; - pgWalUsage.wal_fpi_bytes += fpi_bytes; + INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len); + INSTR_WALUSAGE_INCR(wal_records); + INSTR_WALUSAGE_ADD(wal_fpi, num_fpi); + INSTR_WALUSAGE_ADD(wal_fpi_bytes, fpi_bytes); /* Required for the flush of pending stats WAL data */ pgstat_report_fixed = true; @@ -2063,7 +2063,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - pgWalUsage.wal_buffers_full++; + INSTR_WALUSAGE_INCR(wal_buffers_full); TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); /* diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index a483424152c5f..47cac75bc20b5 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -308,9 +308,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, Oid save_userid; int save_sec_context; int save_nestlevel; - WalUsage startwalusage = pgWalUsage; - BufferUsage startbufferusage = pgBufferUsage; - BufferUsage bufferusage; + Instrumentation *instr = NULL; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -361,6 +359,9 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } pg_rusage_init(&ru0); + + instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + InstrStart(instr); } /* Used for instrumentation and stats report */ @@ -741,12 +742,13 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { TimestampTz endtime = GetCurrentTimestamp(); + InstrStop(instr, true); + if (verbose || params.log_analyze_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, params.log_analyze_min_duration)) { long delay_in_ms; - WalUsage walusage; double read_rate = 0; double write_rate = 0; char *msgfmt; @@ -754,18 +756,15 @@ do_analyze_rel(Relation onerel, const VacuumParams params, int64 total_blks_hit; int64 total_blks_read; int64 total_blks_dirtied; - - memset(&bufferusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage); - memset(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage); - - total_blks_hit = bufferusage.shared_blks_hit + - bufferusage.local_blks_hit; - total_blks_read = bufferusage.shared_blks_read + - bufferusage.local_blks_read; - total_blks_dirtied = bufferusage.shared_blks_dirtied + - bufferusage.local_blks_dirtied; + BufferUsage bufusage = instr->stack->bufusage; + WalUsage walusage = instr->stack->walusage; + + total_blks_hit = bufusage.shared_blks_hit + + bufusage.local_blks_hit; + total_blks_read = bufusage.shared_blks_read + + bufusage.local_blks_read; + total_blks_dirtied = bufusage.shared_blks_dirtied + + bufusage.local_blks_dirtied; /* * We do not expect an analyze to take > 25 days and it simplifies diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 93918a223b8d7..054c5bce837f6 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -322,14 +322,16 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, QueryEnvironment *queryEnv) { PlannedStmt *plan; - instr_time planstart, - planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + if (es->buffers) + instr = InstrAlloc(INSTRUMENT_TIMER | INSTRUMENT_BUFFERS); + else + instr = InstrAlloc(INSTRUMENT_TIMER); + if (es->memory) { /* @@ -346,15 +348,12 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(instr); /* plan the query */ plan = pg_plan_query(query, queryString, cursorOptions, params, es); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(instr, true); if (es->memory) { @@ -362,16 +361,9 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params, queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->total, (es->buffers ? &instr->stack->bufusage : NULL), es->memory ? &mem_counters : NULL); } @@ -611,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1028,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1042,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1099,18 +1091,15 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) for (nt = 0; nt < rInfo->ri_TrigDesc->numtriggers; nt++) { Trigger *trig = rInfo->ri_TrigDesc->triggers + nt; - Instrumentation *instr = rInfo->ri_TrigInstrument + nt; + TriggerInstrumentation *tginstr = rInfo->ri_TrigInstrument + nt; char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); - /* * We ignore triggers that were never invoked; they likely aren't * relevant to the current query type. */ - if (instr->ntuples == 0) + if (tginstr->firings == 0) continue; ExplainOpenGroup("Trigger", NULL, true, es); @@ -1135,11 +1124,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) if (show_relname) appendStringInfo(es->str, " on %s", relname); if (es->timing) - appendStringInfo(es->str, ": time=%.3f calls=%.0f\n", - INSTR_TIME_GET_MILLISEC(instr->total), - instr->ntuples); + appendStringInfo(es->str, ": time=%.3f calls=%d\n", + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), + tginstr->firings); else - appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples); + appendStringInfo(es->str, ": calls=%d\n", tginstr->firings); } else { @@ -1149,9 +1138,9 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) ExplainPropertyText("Relation", relname, es); if (es->timing) ExplainPropertyFloat("Time", "ms", - INSTR_TIME_GET_MILLISEC(instr->total), 3, + INSTR_TIME_GET_MILLISEC(tginstr->instr.total), 3, es); - ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es); + ExplainPropertyInteger("Calls", NULL, tginstr->firings, es); } if (conname) @@ -1891,11 +1880,11 @@ ExplainNode(PlanState *planstate, List *ancestors, /* prepare per-worker general execution details */ if (es->workers_state && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; double startup_ms; double total_ms; @@ -1973,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTableStack->bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2291,18 +2283,18 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->bufusage); + show_buffer_usage(es, &planstate->instrument->stack.bufusage, NULL); if (es->wal && planstate->instrument) - show_wal_usage(es, &planstate->instrument->walusage); + show_wal_usage(es, &planstate->instrument->stack.walusage); /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { - WorkerInstrumentation *w = planstate->worker_instrument; + WorkerNodeInstrumentation *w = planstate->worker_instrument; for (int n = 0; n < w->num_workers; n++) { - Instrumentation *instrument = &w->instrument[n]; + NodeInstrumentation *instrument = &w->instrument[n]; double nloops = instrument->nloops; if (nloops <= 0) @@ -2310,9 +2302,9 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->bufusage); + show_buffer_usage(es, &instrument->stack.bufusage, NULL); if (es->wal) - show_wal_usage(es, &instrument->walusage); + show_wal_usage(es, &instrument->stack.walusage); ExplainCloseWorker(n, es); } } @@ -4110,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4135,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4190,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4231,44 +4227,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { - ExplainPropertyInteger("Shared Hit Blocks", NULL, + char *prefix = title ? psprintf("%s ", title) : pstrdup(""); + + ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL, usage->shared_blks_hit, es); - ExplainPropertyInteger("Shared Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL, usage->shared_blks_read, es); - ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL, usage->shared_blks_dirtied, es); - ExplainPropertyInteger("Shared Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL, usage->shared_blks_written, es); - ExplainPropertyInteger("Local Hit Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL, usage->local_blks_hit, es); - ExplainPropertyInteger("Local Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL, usage->local_blks_read, es); - ExplainPropertyInteger("Local Dirtied Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL, usage->local_blks_dirtied, es); - ExplainPropertyInteger("Local Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL, usage->local_blks_written, es); - ExplainPropertyInteger("Temp Read Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL, usage->temp_blks_read, es); - ExplainPropertyInteger("Temp Written Blocks", NULL, + ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL, usage->temp_blks_written, es); if (track_io_timing) { - ExplainPropertyFloat("Shared I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); - ExplainPropertyFloat("Shared I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time), 3, es); - ExplainPropertyFloat("Local I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time), 3, es); - ExplainPropertyFloat("Local I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time), 3, es); - ExplainPropertyFloat("Temp I/O Read Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time), 3, es); - ExplainPropertyFloat("Temp I/O Write Time", "ms", + ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); } diff --git a/src/backend/commands/explain_dr.c b/src/backend/commands/explain_dr.c index 3c96061cf32ab..9e4cd8ad7d083 100644 --- a/src/backend/commands/explain_dr.c +++ b/src/backend/commands/explain_dr.c @@ -110,15 +110,20 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContext oldcontext; StringInfo buf = &myState->buf; int natts = typeinfo->natts; - instr_time start, - end; - BufferUsage instr_start; + Instrumentation *instr = NULL; /* only measure time, buffers if requested */ - if (myState->es->timing) - INSTR_TIME_SET_CURRENT(start); - if (myState->es->buffers) - instr_start = pgBufferUsage; + if (myState->es->timing || myState->es->buffers) + { + InstrumentOption instrument_options = 0; + + if (myState->es->timing) + instrument_options |= INSTRUMENT_TIMER; + if (myState->es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + instr = InstrAlloc(instrument_options); + InstrStart(instr); + } /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) @@ -186,18 +191,16 @@ serializeAnalyzeReceive(TupleTableSlot *slot, DestReceiver *self) MemoryContextSwitchTo(oldcontext); MemoryContextReset(myState->tmpcontext); + if (myState->es->timing || myState->es->buffers) + InstrStop(instr, true); + /* Update timing data */ if (myState->es->timing) - { - INSTR_TIME_SET_CURRENT(end); - INSTR_TIME_ACCUM_DIFF(myState->metrics.timeSpent, end, start); - } + INSTR_TIME_ADD(myState->metrics.timeSpent, instr->total); /* Update buffer metrics */ if (myState->es->buffers) - BufferUsageAccumDiff(&myState->metrics.bufferUsage, - &pgBufferUsage, - &instr_start); + BufferUsageAdd(&myState->metrics.bufferUsage, &instr->stack->bufusage); return true; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 5b86a727587b2..2a1a6d1cf2e6d 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -578,13 +578,16 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, ListCell *p; ParamListInfo paramLI = NULL; EState *estate = NULL; - instr_time planstart; - instr_time planduration; - BufferUsage bufusage_start, - bufusage; + Instrumentation *instr = NULL; MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; + int instrument_options = INSTRUMENT_TIMER; + + if (es->buffers) + instrument_options |= INSTRUMENT_BUFFERS; + + instr = InstrAlloc(instrument_options); if (es->memory) { @@ -596,9 +599,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, saved_ctx = MemoryContextSwitchTo(planner_ctx); } - if (es->buffers) - bufusage_start = pgBufferUsage; - INSTR_TIME_SET_CURRENT(planstart); + InstrStart(instr); /* Look it up in the hash table */ entry = FetchPreparedStatement(execstmt->name, true); @@ -633,8 +634,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, cplan = GetCachedPlan(entry->plansource, paramLI, CurrentResourceOwner, pstate->p_queryEnv); - INSTR_TIME_SET_CURRENT(planduration); - INSTR_TIME_SUBTRACT(planduration, planstart); + InstrStop(instr, true); if (es->memory) { @@ -642,13 +642,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextMemConsumed(planner_ctx, &mem_counters); } - /* calc differences of buffer counters. */ - if (es->buffers) - { - memset(&bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); - } - plan_list = cplan->stmt_list; /* Explain each query */ @@ -658,7 +651,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, if (pstmt->commandType != CMD_UTILITY) ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, - &planduration, (es->buffers ? &bufusage : NULL), + &instr->total, (es->buffers ? &instr->stack->bufusage : NULL), es->memory ? &mem_counters : NULL); else ExplainOneUtility(pstmt->utilityStmt, into, es, pstate, paramLI); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 98d402c0a3be7..c336007314121 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -90,7 +90,7 @@ static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -2309,7 +2309,7 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, int tgindx, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context) { LOCAL_FCINFO(fcinfo, 0); @@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata, * If doing EXPLAIN ANALYZE, start charging time to this trigger. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Do the function evaluation in the per-tuple memory context, so that @@ -2389,10 +2389,10 @@ ExecCallTriggerFunc(TriggerData *trigdata, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); return (HeapTuple) DatumGetPointer(result); } @@ -3936,7 +3936,7 @@ static void AfterTriggerExecute(EState *estate, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, - Instrumentation *instr, + TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2); @@ -4330,7 +4330,7 @@ AfterTriggerExecute(EState *estate, ResultRelInfo *src_relInfo, ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, - FmgrInfo *finfo, Instrumentation *instr, + FmgrInfo *finfo, TriggerInstrumentation *instr, MemoryContext per_tuple_context, TupleTableSlot *trig_tuple_slot1, TupleTableSlot *trig_tuple_slot2) @@ -4371,7 +4371,7 @@ AfterTriggerExecute(EState *estate, * to include time spent re-fetching tuples in the trigger cost. */ if (instr) - InstrStartNode(instr + tgindx); + InstrStartTrigger(instr + tgindx); /* * Fetch the required tuple(s). @@ -4588,10 +4588,10 @@ AfterTriggerExecute(EState *estate, /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count - * one "tuple returned" (really the number of firings). + * the firing of the trigger. */ if (instr) - InstrStopNode(instr + tgindx, 1); + InstrStopTrigger(instr + tgindx, 1); } @@ -4707,7 +4707,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, Relation rel = NULL; TriggerDesc *trigdesc = NULL; FmgrInfo *finfo = NULL; - Instrumentation *instr = NULL; + TriggerInstrumentation *instr = NULL; TupleTableSlot *slot1 = NULL, *slot2 = NULL; diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index c3b3c9ea21ab8..10ba717bb6b24 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -994,6 +994,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PVIndStats *indstats; PVShared *shared; TidStore *dead_items; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1083,7 +1084,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) error_context_stack = &errcallback; /* Prepare to track buffer usage during parallel execution */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* Process indexes to perform vacuum/cleanup */ parallel_vacuum_process_safe_indexes(&pvs); @@ -1091,7 +1092,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution */ buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report any remaining cost-based vacuum delay time */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index bfd3ebc601ec6..f01f1c864c409 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -76,6 +76,7 @@ ExecutorCheckPerms_hook_type ExecutorCheckPerms_hook = NULL; /* decls for local routines only used within this module */ static void InitPlan(QueryDesc *queryDesc, int eflags); static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); +static void ExecFinalizeTriggerInstrumentation(EState *estate); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecutePlan(QueryDesc *queryDesc, @@ -329,9 +330,26 @@ standard_ExecutorRun(QueryDesc *queryDesc, */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - /* Allow instrumentation of Executor overall runtime */ + /* + * Start up required top-level instrumentation stack for WAL/buffer + * tracking + */ + if (!queryDesc->totaltime && (estate->es_instrument & (INSTRUMENT_BUFFERS | INSTRUMENT_WAL))) + queryDesc->totaltime = InstrAlloc(estate->es_instrument); + if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + { + /* Allow instrumentation of Executor overall runtime */ + InstrStart(queryDesc->totaltime); + + /* + * Remember all node stacks for abort recovery. We do this once here + * after the first call to InstrStart has allocated the parent stack. + */ + if (queryDesc->totaltime->stack && !queryDesc->already_executed) + ExecRememberNodeInstrumentation(queryDesc->planstate, + queryDesc->totaltime->stack); + } /* * extract information from the query descriptor and the query feature. @@ -383,7 +401,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, dest->rShutdown(dest); if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, estate->es_processed); + InstrStop(queryDesc->totaltime, false); MemoryContextSwitchTo(oldcontext); } @@ -433,7 +451,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc) /* Allow instrumentation of Executor overall runtime */ if (queryDesc->totaltime) - InstrStartNode(queryDesc->totaltime); + InstrStart(queryDesc->totaltime); /* Run ModifyTable nodes to completion */ ExecPostprocessPlan(estate); @@ -442,8 +460,26 @@ standard_ExecutorFinish(QueryDesc *queryDesc) if (!(estate->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS)) AfterTriggerEndQuery(estate); + /* + * Accumulate per-node and trigger statistics to their respective parent + * instrumentation stacks. + * + * We skip this in parallel workers because their per-node stats are + * reported individually via ExecParallelReportInstrumentation, and the + * leader's own ExecFinalizeNodeInstrumentation handles propagation. If + * we accumulated here, the leader would double-count: worker parent nodes + * would already include their children's stats, and then the leader's + * accumulation would add the children again. + */ + if (queryDesc->totaltime && estate->es_instrument && !IsParallelWorker()) + { + ExecFinalizeNodeInstrumentation(queryDesc->planstate); + + ExecFinalizeTriggerInstrumentation(estate); + } + if (queryDesc->totaltime) - InstrStopNode(queryDesc->totaltime, 0); + InstrStop(queryDesc->totaltime, true); MemoryContextSwitchTo(oldcontext); @@ -1270,7 +1306,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigWhenExprs = (ExprState **) palloc0_array(ExprState *, n); if (instrument_options) - resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false); + resultRelInfo->ri_TrigInstrument = InstrAllocTrigger(n, instrument_options); } else { @@ -1484,6 +1520,24 @@ ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) return resultRelInfo->ri_ancestorResultRels; } +static void +ExecFinalizeTriggerInstrumentation(EState *estate) +{ + List *rels = NIL; + + rels = list_concat(rels, estate->es_tuple_routing_result_relations); + rels = list_concat(rels, estate->es_opened_result_relations); + rels = list_concat(rels, estate->es_trig_target_relations); + + foreach_node(ResultRelInfo, rInfo, rels) + { + TriggerInstrumentation *ti = rInfo->ri_TrigInstrument; + + if (ti && ti->instr.stack) + InstrStackAdd(CurrentInstrStack, ti->instr.stack); + } +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index f87978c137e7d..dee8bf99c936a 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -86,7 +86,7 @@ typedef struct FixedParallelExecutorState * instrument_options: Same meaning here as in instrument.c. * * instrument_offset: Offset, relative to the start of this structure, - * of the first Instrumentation object. This will depend on the length of + * of the first NodeInstrumentation object. This will depend on the length of * the plan_node_id array. * * num_workers: Number of workers. @@ -103,11 +103,15 @@ struct SharedExecutorInstrumentation int num_workers; int num_plan_nodes; int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; - /* array of num_plan_nodes * num_workers Instrumentation objects follows */ + + /* + * array of num_plan_nodes * num_workers NodeInstrumentation objects + * follows + */ }; #define GetInstrumentationArray(sei) \ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ - (Instrumentation *) (((char *) sei) + sei->instrument_offset)) + (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ typedef struct ExecParallelEstimateContext @@ -724,7 +728,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation_len = MAXALIGN(instrumentation_len); instrument_offset = instrumentation_len; instrumentation_len += - mul_size(sizeof(Instrumentation), + mul_size(sizeof(NodeInstrumentation), mul_size(e.nnodes, nworkers)); shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len); shm_toc_estimate_keys(&pcxt->estimator, 1); @@ -810,7 +814,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, */ if (estate->es_instrument) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len); @@ -820,7 +824,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, instrumentation->num_plan_nodes = e.nnodes; instrument = GetInstrumentationArray(instrumentation); for (i = 0; i < nworkers * e.nnodes; ++i) - InstrInit(&instrument[i], estate->es_instrument); + InstrInitNode(&instrument[i], estate->es_instrument); shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION, instrumentation); pei->instrumentation = instrumentation; @@ -1052,7 +1056,7 @@ static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) { - Instrumentation *instrument; + NodeInstrumentation *instrument; int i; int n; int ibytes; @@ -1080,9 +1084,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, * Switch into per-query memory context. */ oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt); - ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); + ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation)); planstate->worker_instrument = - palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); + palloc(ibytes + offsetof(WorkerNodeInstrumentation, instrument)); MemoryContextSwitchTo(oldcontext); planstate->worker_instrument->num_workers = instrumentation->num_workers; @@ -1312,7 +1316,7 @@ ExecParallelReportInstrumentation(PlanState *planstate, { int i; int plan_node_id = planstate->plan->plan_node_id; - Instrumentation *instrument; + NodeInstrumentation *instrument; InstrEndLoop(planstate->instrument); @@ -1451,6 +1455,7 @@ void ParallelQueryMain(dsm_segment *seg, shm_toc *toc) { FixedParallelExecutorState *fpes; + Instrumentation *instr; BufferUsage *buffer_usage; WalUsage *wal_usage; DestReceiver *receiver; @@ -1511,7 +1516,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) * leader, which also doesn't count buffer accesses and WAL activity that * occur during executor startup. */ - InstrStartParallelQuery(); + instr = InstrStartParallelQuery(); /* * Run the plan. If we specified a tuple bound, be careful not to demand @@ -1527,7 +1532,8 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Report buffer/WAL usage during parallel execution. */ buffer_usage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); wal_usage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); - InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], + InstrEndParallelQuery(instr, + &buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); /* Report instrumentation data if any instrumentation options are set. */ diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7e40b85251718..c3ff03f805bb0 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -122,6 +122,8 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); +static bool ExecRememberNodeInstrumentation_walker(PlanState *node, void *context); +static bool ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context); /* ------------------------------------------------------------------------ @@ -413,8 +415,18 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) - result->instrument = InstrAlloc(1, estate->es_instrument, - result->async_capable); + { + result->instrument = InstrAllocNode(estate->es_instrument, + result->async_capable); + + /* IndexScan tracks table access separately from index access. */ + if (IsA(result, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, result); + + iss->iss_InstrumentTableStack = InstrAllocAdditionalNodeStack(result->instrument); + } + } return result; } @@ -828,6 +840,101 @@ ExecShutdownNode_walker(PlanState *node, void *context) return false; } +/* + * ExecRememberNodeInstrumentation + * + * Register all per-node instrumentation stacks as unfinalized children of the + * executor's instrumentation stack. This is needed for abort recovery: if the + * executor aborts, we need to walk each per-node instrumentation stack to + * recover buffer/WAL data from nodes that never got finalized, that someone + * might be interested in as an aggregate. + */ +void +ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent) +{ + (void) ExecRememberNodeInstrumentation_walker(node, parent); +} + +static bool +ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) +{ + InstrStack *parent = (InstrStack *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + if (node->instrument && (node->instrument->need_bufusage || + node->instrument->need_walusage)) + { + InstrRememberNodeStack(parent, &node->instrument->stack); + + /* IndexScan has a separate stack to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTableStack) + InstrRememberNodeStack(parent, iss->iss_InstrumentTableStack); + } + } + + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); +} + +/* + * ExecFinalizeNodeInstrumentation + * + * Accumulate instrumentation stats from all execution nodes to their respective + * parents (or the original parent instrumentation stack). + * + * This must run after the cleanup done by ExecShutdownNode, and not rely on any + * resources cleaned up by it. We also expect shutdown actions to have occurred, + * e.g. parallel worker instrumentation to have been added to the leader. + */ +void +ExecFinalizeNodeInstrumentation(PlanState *node) +{ + (void) ExecFinalizeNodeInstrumentation_walker(node, (InstrStack *) CurrentInstrStack); +} + +static bool +ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) +{ + InstrStack *parent = (InstrStack *) context; + + Assert(parent != NULL); + + if (node == NULL) + return false; + + /* + * Recurse into children first (bottom-up accumulation), passing our stack + * as the parent context. This ensures children can accumulate to us even + * if they were never executed by the leader (e.g. nodes beneath Gather + * that only workers ran, where stack.previous would not be initialized). + */ + planstate_tree_walker(node, ExecFinalizeNodeInstrumentation_walker, + node->instrument ? &node->instrument->stack : parent); + + if (!node->instrument) + return false; + + /* IndexScan has a separate stack to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTableStack) + iss->iss_InstrumentTableStack = InstrFinalizeAdditionalNodeStack(iss->iss_InstrumentTableStack, node->instrument); + } + + node->instrument = InstrFinalizeNode(node->instrument, parent); + + return false; +} + /* * ExecSetTupleBound * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index edab92a0ebeda..0c2d769416e91 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -16,72 +16,342 @@ #include #include "executor/instrument.h" +#include "utils/memutils.h" +#include "utils/resowner.h" -BufferUsage pgBufferUsage; -static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; -static WalUsage save_pgWalUsage; +InstrStack TopInstrStack; +InstrStack *CurrentInstrStack = &TopInstrStack; -static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +static void InstrFinalizeNodesOnAbort(InstrStack *stack); -/* Allocate new instrumentation structure(s) */ +/* + * Use ResourceOwner mechanism to correctly reset CurrentInstrStack on abort. + */ +static void ResOwnerReleaseInstrumentation(Datum res); +static const ResourceOwnerDesc instrumentation_resowner_desc = +{ + .name = "instrumentation", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_INSTRUMENTATION, + .ReleaseResource = ResOwnerReleaseInstrumentation, + .DebugPrint = NULL, /* default message is fine */ +}; + +static inline void +ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStack *stack) +{ + ResourceOwnerRemember(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static inline void +ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStack *stack) +{ + ResourceOwnerForget(owner, PointerGetDatum(stack), &instrumentation_resowner_desc); +} + +static bool +StackIsParent(InstrStack *stack, InstrStack *entry) +{ + if (entry->previous == NULL) + return false; + + if (entry->previous == stack) + return true; + + return StackIsParent(stack, entry->previous); +} + +/* + * OrphanSkippedStacks + * + * Orphans all stack entries from the current stack entry to the provided + * stack which is assumed to be a parent stack of the current stack, and + * terminates once the current stack entry has reached the provided stack. + * + * This sets previous pointers of intermediate stack entries to NULL, so we + * don't have to worry about calling StackIsParent with a bad pointer in + * ResOwnerReleaseInstrumentation. + * + * This matters because we may process stack entries out of order in aborts + * because (1) we might have a mix of ResOwner and PG_FINALLY owned stacks + * (2) ResOwnerReleaseInstrumentation might be called out of order. + */ +static void +OrphanSkippedStacks(InstrStack *stack) +{ + if (CurrentInstrStack == stack || !StackIsParent(stack, CurrentInstrStack)) + return; + + while (CurrentInstrStack != stack) + { + InstrStack *previous = CurrentInstrStack->previous; + + Assert(previous != NULL); + CurrentInstrStack->previous = NULL; + CurrentInstrStack = previous; + } +} + +static void +ResOwnerReleaseInstrumentation(Datum res) +{ + InstrStack *stack = (InstrStack *) DatumGetPointer(res); + + /* + * Because registered resources are *not* cleaned up in a guaranteed + * order, we may get a child context after we've processed the parent. + * Thus, we only pop the stack if its not already a parent of the stack + * being released. Note that OrphanSkippedStacks may have set our previous + * stack entry to NULL, in which case we don't modify the stack either. + * + * Note that StackIsParent will recurse as needed, so it is inadvisible to + * use deeply nested stacks. + */ + if (stack->previous && !StackIsParent(CurrentInstrStack, stack)) + { + OrphanSkippedStacks(stack); + InstrPopStack(stack); + } + + /* Accumulate data from all unfinalized child node stacks. */ + InstrFinalizeNodesOnAbort(stack); + + /* + * Accumulate the stack associated with the ResOwner to the active stack. + * + * Note that we intentionally directly add to the current stack instead of + * the parent of the stack being released, because this can execute out of + * order. Explicit PG_FINALLY blocks might have modified the stack as + * well. + */ + InstrStackAdd(CurrentInstrStack, stack); + + /* Free the stack entry now since InstrStop won't be called */ + pfree(stack); +} + +/* + * Pops the stack entry and accumulates to its parent. + * + * Note that this intentionally allows passing a stack that is not + * CurrentInstrStack, as can happen with PG_FINALLY, and orphans any + * intermediate stacks that were skipped. + */ +void +InstrPopAndFinalizeStack(InstrStack *stack) +{ + OrphanSkippedStacks(stack); + InstrPopStack(stack); + InstrStackAdd(CurrentInstrStack, stack); +} + +/* General purpose instrumentation handling */ Instrumentation * -InstrAlloc(int n, int instrument_options, bool async_mode) +InstrAlloc(int instrument_options) { Instrumentation *instr; /* initialize all fields to zeroes, then modify as needed */ - instr = palloc0(n * sizeof(Instrumentation)); + instr = palloc0(sizeof(Instrumentation)); if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) { - bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; - bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; - bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; - int i; + instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; + instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; + instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + } + + return instr; +} - for (i = 0; i < n; i++) +void +InstrStart(Instrumentation *instr) +{ + if (instr->need_timer && + !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStart called twice in a row"); + + if (instr->need_bufusage || instr->need_walusage) + { + Assert(CurrentResourceOwner != NULL); + instr->owner = CurrentResourceOwner; + + /* + * Allocate the stack resource in a memory context that survives + * during an abort. This will be freed by InstrStop (regular + * execution) or ResOwnerReleaseInstrumentation (abort). + * + * We don't do this in InstrAlloc to avoid allocating when InstrStart + * + InstrStop isn't called. + */ + if (instr->stack == NULL) + instr->stack = MemoryContextAllocZero(TopMemoryContext, sizeof(InstrStack)); + + ResourceOwnerEnlarge(instr->owner); + ResourceOwnerRememberInstrStack(instr->owner, instr->stack); + + InstrPushStack(instr->stack); + } +} + +void +InstrStop(Instrumentation *instr, bool finalize) +{ + instr_time endtime; + + /* let's update the time only if the timer was requested */ + if (instr->need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStop called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); + } + + if (instr->need_bufusage || instr->need_walusage) + { + InstrPopStack(instr->stack); + + if (finalize) + InstrStackAdd(CurrentInstrStack, instr->stack); + + Assert(instr->owner != NULL); + ResourceOwnerForgetInstrStack(instr->owner, instr->stack); + instr->owner = NULL; + + if (finalize) { - instr[i].need_bufusage = need_buffers; - instr[i].need_walusage = need_wal; - instr[i].need_timer = need_timer; - instr[i].async_mode = async_mode; + /* + * To avoid keeping memory allocated beyond when its needed, copy + * the result to the current memory context, and free it in the + * transaction context. + */ + InstrStack *stack = palloc(sizeof(InstrStack)); + + memcpy(stack, instr->stack, sizeof(InstrStack)); + pfree(instr->stack); + instr->stack = stack; } } +} + +/* Trigger instrumentation handling */ +TriggerInstrumentation * +InstrAllocTrigger(int n, int instrument_options) +{ + TriggerInstrumentation *tginstr = palloc0(n * sizeof(TriggerInstrumentation)); + bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; + bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; + int i; + + for (i = 0; i < n; i++) + { + tginstr[i].instr.need_timer = need_timer; + tginstr[i].instr.need_bufusage = need_buffers; + tginstr[i].instr.need_walusage = need_wal; + } + + return tginstr; +} + +void +InstrStartTrigger(TriggerInstrumentation *tginstr) +{ + InstrStart(&tginstr->instr); +} + +void +InstrStopTrigger(TriggerInstrumentation *tginstr, int firings) +{ + /* + * This trigger may be called again, so we don't finalize instrumentation + * here. Accumulation to the parent happens at ExecutorFinish through + * ExecFinalizeTriggerInstrumentation. + */ + InstrStop(&tginstr->instr, false); + tginstr->firings += firings; +} + +/* Node instrumentation handling */ + +/* Allocate new node instrumentation structure */ +NodeInstrumentation * +InstrAllocNode(int instrument_options, bool async_mode) +{ + /* + * We can utilize TopTransactionContext instead of TopMemoryContext here + * (despite the inlined InstrStack in NodeInstrumentation) because nodes + * don't get used for utility commands that restart transactions, which + * would require a context that survives longer (EXPLAIN ANALYZE is fine). + */ + NodeInstrumentation *instr = MemoryContextAlloc(TopTransactionContext, sizeof(NodeInstrumentation)); + + InstrInitNode(instr, instrument_options); + instr->async_mode = async_mode; return instr; } /* Initialize a pre-allocated instrumentation structure. */ void -InstrInit(Instrumentation *instr, int instrument_options) +InstrInitNode(NodeInstrumentation *instr, int instrument_options) { - memset(instr, 0, sizeof(Instrumentation)); + memset(instr, 0, sizeof(NodeInstrumentation)); instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } +/* + * InstrRememberNodeStack - register a child node stack for abort processing. + * + * On abort, InstrFinalizeNodesOnAbort will walk the parent's list to recover + * buffer/WAL data from stacks that were never finalized, in order for + * aggregate totals to be accurate despite the query erroring out. + * + * The passed in node stack can either be the NodeInstrumentation stack or an + * additional stack that is associated with a node. This must not be called + * with other (non-node) instrumentation stacks as the child that perform their + * own cleanup. The parent must be a non-node stack that can handle aborts. + */ +void +InstrRememberNodeStack(InstrStack *parent, InstrStack *node_stack) +{ + /* + * We do not support nesting, to avoid recursion in + * InstrFinalizeNodesOnAbort + */ + Assert(parent->unfinalized_node.next == NULL); + + slist_push_head(&parent->unfinalized_children, &node_stack->unfinalized_node); +} + /* Entry to a plan node */ void -InstrStartNode(Instrumentation *instr) +InstrStartNode(NodeInstrumentation *instr) { if (instr->need_timer && !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(CurrentInstrStack != &TopInstrStack); - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; + InstrPushStack(&instr->stack); + } } /* Exit from a plan node */ void -InstrStopNode(Instrumentation *instr, double nTuples) +InstrStopNode(NodeInstrumentation *instr, double nTuples) { double save_tuplecount = instr->tuplecount; instr_time endtime; @@ -101,14 +371,14 @@ InstrStopNode(Instrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); } - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure that we always have a parent, even at the top most node */ + Assert(instr->stack.previous != NULL); - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); + /* Adding to parent is handled by ExecFinalizeNodeInstrumentation */ + InstrPopStack(&instr->stack); + } /* Is this the first tuple of this cycle? */ if (!instr->running) @@ -127,9 +397,53 @@ InstrStopNode(Instrumentation *instr, double nTuples) } } +/* Add per-node instrumentation to the parent and move into per-query memory context */ +NodeInstrumentation * +InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent) +{ + NodeInstrumentation *dst = palloc(sizeof(NodeInstrumentation)); + + memcpy(dst, instr, sizeof(NodeInstrumentation)); + pfree(instr); + + /* Avoid stale pointer references */ + dst->stack.previous = NULL; + + InstrStackAdd(parent, &dst->stack); + + return dst; +} + +/* + * InstrFinalizeNodesOnAbort + * + * Accumulates unfinalized child per-node stacks into the resource owner stack, + * and resets the list so a theoretical second call is a safe no-op. + */ +static void +InstrFinalizeNodesOnAbort(InstrStack *stack) +{ + slist_iter iter; + + slist_foreach(iter, &stack->unfinalized_children) + { + InstrStack *child = slist_container(InstrStack, unfinalized_node, iter.cur); + + InstrStackAdd(stack, child); + + /* + * Note we don't free the child here since its usually contained + * within NodeInstrumentation and we don't have an easy way to access + * that, it will be instead be cleaned up by the transaction ending. + */ + } + + slist_init(&stack->unfinalized_children); +} + /* Update tuple count */ void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples) { /* count the returned tuples */ instr->tuplecount += nTuples; @@ -137,7 +451,7 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) /* Finish a run cycle for a plan node */ void -InstrEndLoop(Instrumentation *instr) +InstrEndLoop(NodeInstrumentation *instr) { /* Skip if nothing has happened, or already shut down */ if (!instr->running) @@ -162,7 +476,7 @@ InstrEndLoop(Instrumentation *instr) /* aggregate instrumentation information */ void -InstrAggNode(Instrumentation *dst, Instrumentation *add) +InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add) { if (!dst->running && add->running) { @@ -170,7 +484,7 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->firsttuple = add->firsttuple; } else if (dst->running && add->running && - INSTR_TIME_LT(dst->firsttuple, add->firsttuple)) + INSTR_TIME_GT(dst->firsttuple, add->firsttuple)) dst->firsttuple = add->firsttuple; INSTR_TIME_ADD(dst->counter, add->counter); @@ -186,40 +500,119 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) /* Add delta of buffer usage since entry to node's totals */ if (dst->need_bufusage) - BufferUsageAdd(&dst->bufusage, &add->bufusage); + BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage); if (dst->need_walusage) - WalUsageAdd(&dst->walusage, &add->walusage); + WalUsageAdd(&dst->stack.walusage, &add->stack.walusage); +} + +/* + * Allocate an additional InstrStack for a node, e.g. for tracking table + * buffer usage separately from index buffer usage. Allocated in + * TopTransactionContext so it survives long enough for abort recovery. + */ +InstrStack * +InstrAllocAdditionalNodeStack(NodeInstrumentation *instr) +{ + if (instr->need_bufusage || instr->need_walusage) + return MemoryContextAllocZero(TopTransactionContext, sizeof(InstrStack)); + + return NULL; } -/* note current values during parallel executor startup */ void +InstrStartNodeStack(NodeInstrumentation *instr, InstrStack *stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Ensure the executor set up a parent node below the top level stack */ + Assert(CurrentInstrStack != &TopInstrStack); + + InstrPushStack(stack); + } +} + +void +InstrStopNodeStack(NodeInstrumentation *instr, InstrStack *stack) +{ + if (instr->need_bufusage || instr->need_walusage) + { + /* Adding to parent is handled by InstrFinalizeAdditionalNodeStack */ + InstrPopStack(stack); + } +} + +/* Add additional node stacks to the parent and move into per-query memory context */ +InstrStack * +InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr) +{ + InstrStack *dst = palloc(sizeof(InstrStack)); + + memcpy(dst, stack, sizeof(InstrStack)); + pfree(stack); + + /* Avoid stale pointer references */ + dst->previous = NULL; + + InstrStackAdd(&instr->stack, dst); + + return dst; +} + +/* start instrumentation during parallel executor startup */ +Instrumentation * InstrStartParallelQuery(void) { - save_pgBufferUsage = pgBufferUsage; - save_pgWalUsage = pgWalUsage; + Instrumentation *instr = InstrAlloc(INSTRUMENT_BUFFERS | INSTRUMENT_WAL); + + InstrStart(instr); + return instr; } /* report usage after parallel executor shutdown */ void -InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage) +InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage) { + InstrStop(instr, true); memset(bufusage, 0, sizeof(BufferUsage)); - BufferUsageAccumDiff(bufusage, &pgBufferUsage, &save_pgBufferUsage); + memcpy(bufusage, &instr->stack->bufusage, sizeof(BufferUsage)); memset(walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(walusage, &pgWalUsage, &save_pgWalUsage); + memcpy(walusage, &instr->stack->walusage, sizeof(WalUsage)); } -/* accumulate work done by workers in leader's stats */ +/* + * Accumulate work done by parallel workers in the leader's stats. + * + * Note that what gets added here effectively depends on whether per-node + * instrumentation is active. If its active the parallel worker intentionally + * skips ExecFinalizeNodeInstrumentation on executor shutdown, because it would + * cause double counting. Instead, this only accumulates any extra activity + * outside of nodes. + * + * Otherwise this is responsible for making sure that the complete query + * activity is accumulated. + */ void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage) { - BufferUsageAdd(&pgBufferUsage, bufusage); + BufferUsageAdd(&CurrentInstrStack->bufusage, bufusage); + WalUsageAdd(&CurrentInstrStack->walusage, walusage); + WalUsageAdd(&pgWalUsage, walusage); } +void +InstrStackAdd(InstrStack *dst, InstrStack *add) +{ + Assert(dst != NULL); + Assert(add != NULL); + + BufferUsageAdd(&dst->bufusage, &add->bufusage); + WalUsageAdd(&dst->walusage, &add->walusage); +} + /* dst += add */ -static void +void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) { dst->shared_blks_hit += add->shared_blks_hit; @@ -240,36 +633,6 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time); } -/* dst += add - sub */ -void -BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, - const BufferUsage *sub) -{ - dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; - dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; - dst->shared_blks_dirtied += add->shared_blks_dirtied - sub->shared_blks_dirtied; - dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; - dst->local_blks_hit += add->local_blks_hit - sub->local_blks_hit; - dst->local_blks_read += add->local_blks_read - sub->local_blks_read; - dst->local_blks_dirtied += add->local_blks_dirtied - sub->local_blks_dirtied; - dst->local_blks_written += add->local_blks_written - sub->local_blks_written; - dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; - dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_read_time, - add->shared_blk_read_time, sub->shared_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->shared_blk_write_time, - add->shared_blk_write_time, sub->shared_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_read_time, - add->local_blk_read_time, sub->local_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->local_blk_write_time, - add->local_blk_write_time, sub->local_blk_write_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time, - add->temp_blk_read_time, sub->temp_blk_read_time); - INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time, - add->temp_blk_write_time, sub->temp_blk_write_time); -} - /* helper functions for WAL usage accumulation */ static void WalUsageAdd(WalUsage *dst, WalUsage *add) diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index a616abff04cf0..fb62ff7a8be7a 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,6 +83,7 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; /* @@ -128,8 +129,20 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->ss.ps.instrument) + InstrStartNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack); + + if (unlikely(!index_fetch_heap(scandesc, slot))) + continue; + + if (node->ss.ps.instrument) + InstrStopNodeStack(node->ss.ps.instrument, node->iss_InstrumentTableStack); + + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + CHECK_FOR_INTERRUPTS(); /* @@ -812,6 +825,8 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument.nsearches; + if (node->iss_InstrumentTableStack) + InstrStackAdd(&winstrument->worker_table_stack, node->iss_InstrumentTableStack); } /* @@ -1819,4 +1834,10 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's stack */ + if (node->iss_InstrumentTableStack) + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + InstrStackAdd(node->iss_InstrumentTableStack, + &node->iss_SharedInfo->winstrument[i].worker_table_stack); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d1babaff02351..511efaba516e7 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -826,7 +826,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { PinLocalBuffer(bufHdr, true); - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); return true; } @@ -847,7 +847,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN { if (BufferTagsEqual(&tag, &bufHdr->tag)) { - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); return true; } UnpinBuffer(bufHdr); @@ -1248,14 +1248,14 @@ PinBufferForBlock(Relation rel, { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) - pgBufferUsage.local_blks_hit++; + INSTR_BUFUSAGE_INCR(local_blks_hit); } else { bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, strategy, foundPtr, io_context); if (*foundPtr) - pgBufferUsage.shared_blks_hit++; + INSTR_BUFUSAGE_INCR(shared_blks_hit); } if (rel) { @@ -1989,9 +1989,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) true); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_hit += 1; + INSTR_BUFUSAGE_INCR(local_blks_hit); else - pgBufferUsage.shared_blks_hit += 1; + INSTR_BUFUSAGE_INCR(shared_blks_hit); if (operation->rel) pgstat_count_buffer_hit(operation->rel); @@ -2059,9 +2059,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_start, 1, io_buffers_len * BLCKSZ); if (persistence == RELPERSISTENCE_TEMP) - pgBufferUsage.local_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len); else - pgBufferUsage.shared_blks_read += io_buffers_len; + INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len); /* * Track vacuum cost when issuing IO, not after waiting for it. @@ -2954,7 +2954,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); } - pgBufferUsage.shared_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by); *extended_by = extend_by; @@ -3099,7 +3099,7 @@ MarkBufferDirty(Buffer buffer) */ if (!(old_buf_state & BM_DIRTY)) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } @@ -4528,7 +4528,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_start, 1, BLCKSZ); - pgBufferUsage.shared_blks_written++; + INSTR_BUFUSAGE_INCR(shared_blks_written); /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and @@ -5689,7 +5689,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) if (dirtied) { - pgBufferUsage.shared_blks_dirtied++; + INSTR_BUFUSAGE_INCR(shared_blks_dirtied); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 04a540379a22f..e6054e745e819 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -217,7 +217,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Mark not-dirty */ TerminateLocalBufferIO(bufHdr, true, 0, false); - pgBufferUsage.local_blks_written++; + INSTR_BUFUSAGE_INCR(local_blks_written); } static Buffer @@ -478,7 +478,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, *extended_by = extend_by; - pgBufferUsage.local_blks_written += extend_by; + INSTR_BUFUSAGE_ADD(local_blks_written, extend_by); return first_block; } @@ -509,7 +509,7 @@ MarkLocalBufferDirty(Buffer buffer) buf_state = pg_atomic_read_u64(&bufHdr->state); if (!(buf_state & BM_DIRTY)) - pgBufferUsage.local_blks_dirtied++; + INSTR_BUFUSAGE_INCR(local_blks_dirtied); buf_state |= BM_DIRTY; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index ddf3a410d6f54..1726b3a3b3370 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -474,13 +474,13 @@ BufFileLoadBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start); } /* we choose not to advance curOffset here */ if (file->nbytes > 0) - pgBufferUsage.temp_blks_read++; + INSTR_BUFUSAGE_INCR(temp_blks_read); } /* @@ -548,13 +548,13 @@ BufFileDumpBuffer(BufFile *file) if (track_io_timing) { INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start); } file->curOffset += bytestowrite; wpos += bytestowrite; - pgBufferUsage.temp_blks_written++; + INSTR_BUFUSAGE_INCR(temp_blks_written); } file->dirty = false; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 28de24538dced..9e7a88ec0d07e 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, { pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time); } else if (io_op == IOOP_READ) { pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); if (io_object == IOOBJECT_RELATION) - INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time); else if (io_object == IOOBJECT_TEMP_RELATION) - INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time); + INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time); } } diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index d46ba59895d62..4406be9ed7b86 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -300,6 +300,8 @@ extern void ExecSetExecProcNode(PlanState *node, ExecProcNodeMtd function); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); +extern void ExecRememberNodeInstrumentation(PlanState *node, InstrStack *parent); +extern void ExecFinalizeNodeInstrumentation(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9759f3ea5d8d9..8fb6c911fcb84 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -13,6 +13,7 @@ #ifndef INSTRUMENT_H #define INSTRUMENT_H +#include "lib/ilist.h" #include "portability/instr_time.h" @@ -67,7 +68,64 @@ typedef enum InstrumentOption INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; +/* Stack entry for current WAL/buffer usage */ +typedef struct InstrStack +{ + struct InstrStack *previous; + BufferUsage bufusage; + WalUsage walusage; + + /* + * Tracking of instr stacks that need to be cleaned up on abort that are + * not registered as a resource owner themselves + */ + slist_head unfinalized_children; /* head of unfinalized children list */ + slist_node unfinalized_node; /* node in parent's unfinalized list */ +} InstrStack; + +/* + * General purpose instrumentation that can capture time and WAL/buffer usage + * + * Initialized through InstrAlloc, followed by one or more calls to a pair of + * InstrStart/InstrStop (activity is measured inbetween). + * + * Uses resource owner mechanism for handling aborts, as such, the caller + * *must* not exit out of top level transaction between InstrStart/InstrStop + * calls in regular execution. If this is needed, directly use InstrPushStack / + * InstrPopStack in a PG_TRY/PG_FINALLY block instead. + */ +struct ResourceOwnerData; typedef struct Instrumentation +{ + /* Parameters set at creation: */ + bool need_timer; /* true if we need timer data */ + bool need_bufusage; /* true if we need buffer usage data */ + bool need_walusage; /* true if we need WAL usage data */ + /* Internal state keeping: */ + instr_time starttime; /* start time of last InstrStart */ + /* Accumulated statistics: */ + instr_time total; /* total runtime */ + InstrStack *stack; /* stack tracking buffer/WAL usage */ + struct ResourceOwnerData *owner; +} Instrumentation; + +/* Trigger instrumentation */ +typedef struct TriggerInstrumentation +{ + Instrumentation instr; + int firings; /* number of times the instrumented trigger + * was fired */ +} TriggerInstrumentation; + +/* + * Specialized instrumentation for per-node execution statistics + * + * Requires use of an outer InstrStart/InstrStop to handle the stack used for + * WAL/buffer usage statistics, and relies on it for managing aborts. Solely + * intended for the executor and anyone reporting about its activities (e.g. + * EXPLAIN ANALYZE). + */ +typedef struct NodeInstrumentation { /* Parameters set at node creation: */ bool need_timer; /* true if we need timer data */ @@ -80,8 +138,6 @@ typedef struct Instrumentation instr_time counter; /* accumulated runtime for this node */ instr_time firsttuple; /* time for first tuple of this cycle */ double tuplecount; /* # of tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* buffer usage at start */ - WalUsage walusage_start; /* WAL usage at start */ /* Accumulated statistics across all completed cycles: */ instr_time startup; /* total startup time */ instr_time total; /* total time */ @@ -90,33 +146,131 @@ typedef struct Instrumentation double nloops; /* # of run cycles for this node */ double nfiltered1; /* # of tuples removed by scanqual or joinqual */ double nfiltered2; /* # of tuples removed by "other" quals */ - BufferUsage bufusage; /* total buffer usage */ - WalUsage walusage; /* total WAL usage */ -} Instrumentation; + InstrStack stack; /* stack tracking buffer/WAL usage */ +} NodeInstrumentation; -typedef struct WorkerInstrumentation +/* + * Care must be taken with any pointers contained within this struct, as this + * gets copied across processes during parallel query execution. + */ +typedef struct WorkerNodeInstrumentation { int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; -} WorkerInstrumentation; + NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} WorkerNodeInstrumentation; -extern PGDLLIMPORT BufferUsage pgBufferUsage; extern PGDLLIMPORT WalUsage pgWalUsage; -extern Instrumentation *InstrAlloc(int n, int instrument_options, - bool async_mode); -extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); -extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); -extern void InstrStartParallelQuery(void); -extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); +/* + * The top instrumentation stack represents a running total of the current + * backend WAL/buffer usage information. This will not be updated immediately, + * but rather when the current stack entry gets accumulated which typically + * happens at query end (see CurrentInstrStack below). + * + * Care must be taken when utilizing this in the parallel worker context: + * Parallel workers will report back their instrumentation to the caller, + * and this gets added to the caller's stack. If this were to be used in the + * shared memory stats infrastructure it would need to be skipped on parallel + * workers to avoid double counting. + */ +extern PGDLLIMPORT InstrStack TopInstrStack; + +/* + * The currently active stack entry that is getting updated as activity + * happens, and will be accumulated to parent stacks when it gets finalized + * by InstrStop (for non-executor use cases), ExecFinalizeNodeInstrumentation + * (executor finish) or ResOwnerReleaseInstrumentation on abort. + */ +extern PGDLLIMPORT InstrStack *CurrentInstrStack; + +extern void InstrStackAdd(InstrStack *dst, InstrStack *add); + +/* + * Pushes the stack so that all WAL/buffer usage updates go to the passed in + * stack entry. + * + * Any caller using this directly must manage the passed in stack and call + * InstrPopStack on its own again, typically by using a PG_FINALLY block to + * ensure the stack gets reset via InstrPopStack on abort. Use InstrStart + * instead when you want automatic handling of abort cases using the resource + * owner infrastructure. + */ +static inline void +InstrPushStack(InstrStack *stack) +{ + stack->previous = CurrentInstrStack; + CurrentInstrStack = stack; +} + +/* + * Pops the stack entry back to the previous one that was effective at + * InstrPushStack. + * + * Callers must ensure that no intermediate stack entries are skipped, to + * handle aborts correctly. If you're thinking of calling this in a PG_FINALLY + * block, instead call InstrPopAndFinalizeStack which can skip intermediate + * stack entries, or instead use InstrStart/InstrStop. + */ +static inline void +InstrPopStack(InstrStack *stack) +{ + Assert(stack != NULL); + CurrentInstrStack = stack->previous; +} + +extern void InstrPopAndFinalizeStack(InstrStack *stack); + +extern Instrumentation *InstrAlloc(int instrument_options); +extern void InstrStart(Instrumentation *instr); +extern void InstrStop(Instrumentation *instr, bool finalize); + +extern TriggerInstrumentation *InstrAllocTrigger(int n, int instrument_options); +extern void InstrStartTrigger(TriggerInstrumentation *tginstr); +extern void InstrStopTrigger(TriggerInstrumentation *tginstr, int firings); + +extern NodeInstrumentation *InstrAllocNode(int instrument_options, + bool async_mode); +extern void InstrInitNode(NodeInstrumentation *instr, int instrument_options); +extern void InstrRememberNodeStack(InstrStack *parent, InstrStack *instr); +extern void InstrStartNode(NodeInstrumentation *instr); +extern void InstrStopNode(NodeInstrumentation *instr, double nTuples); +extern NodeInstrumentation *InstrFinalizeNode(NodeInstrumentation *instr, InstrStack *parent); +extern void InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples); +extern void InstrEndLoop(NodeInstrumentation *instr); +extern void InstrAggNode(NodeInstrumentation *dst, NodeInstrumentation *add); + +extern InstrStack *InstrAllocAdditionalNodeStack(NodeInstrumentation *instr); +extern void InstrStartNodeStack(NodeInstrumentation *dst, InstrStack *stack); +extern void InstrStopNodeStack(NodeInstrumentation *dst, InstrStack *stack); +extern InstrStack *InstrFinalizeAdditionalNodeStack(InstrStack *stack, NodeInstrumentation *instr); + +pg_nodiscard extern Instrumentation *InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage); extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage); -extern void BufferUsageAccumDiff(BufferUsage *dst, - const BufferUsage *add, const BufferUsage *sub); +extern void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +#define INSTR_BUFUSAGE_INCR(fld) do { \ + CurrentInstrStack->bufusage.fld++; \ + } while(0) +#define INSTR_BUFUSAGE_ADD(fld,val) do { \ + CurrentInstrStack->bufusage.fld += val; \ + } while(0) +#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \ + INSTR_TIME_ADD(CurrentInstrStack->bufusage.fld, val); \ + } while (0) +#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \ + INSTR_TIME_ACCUM_DIFF(CurrentInstrStack->bufusage.fld, endval, startval); \ + } while (0) + +#define INSTR_WALUSAGE_INCR(fld) do { \ + pgWalUsage.fld++; \ + CurrentInstrStack->walusage.fld++; \ + } while(0) +#define INSTR_WALUSAGE_ADD(fld,val) do { \ + pgWalUsage.fld += val; \ + CurrentInstrStack->walusage.fld += val; \ + } while(0) + #endif /* INSTRUMENT_H */ diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa57..dd45d72559dd7 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Used for passing iss_InstrumentTableStack data from parallel workers */ + InstrStack worker_table_stack; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 63c067d5aae61..6a6427afbd2f7 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -524,7 +524,7 @@ typedef struct ResultRelInfo ExprState **ri_TrigWhenExprs; /* optional runtime measurements for triggers */ - Instrumentation *ri_TrigInstrument; + TriggerInstrumentation *ri_TrigInstrument; /* On-demand created slots for triggers / returning processing */ TupleTableSlot *ri_ReturningSlot; /* for trigger output tuples */ @@ -1175,8 +1175,10 @@ typedef struct PlanState ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a * wrapper */ - Instrumentation *instrument; /* Optional runtime stats for this node */ - WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ + NodeInstrumentation *instrument; /* Optional runtime stats for this + * node */ + WorkerNodeInstrumentation *worker_instrument; /* per-worker + * instrumentation */ /* Per-worker JIT instrumentation */ struct SharedJitInstrumentation *worker_jit_instrument; @@ -1726,6 +1728,13 @@ typedef struct IndexScanState IndexScanInstrumentation iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + /* + * Instrumentation stack utilized for tracking table access. This is + * separate from iss_Instrument since it needs to be allocated in the + * right context and IndexScanInstrumentation shouldn't contain pointers. + */ + InstrStack *iss_InstrumentTableStack; + /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; bool iss_ReachedEnd; diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 490593d182512..87b9ec95531b9 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -28,6 +28,8 @@ * * INSTR_TIME_ACCUM_DIFF(x, y, z) x += (y - z) * + * INSTR_TIME_GT(x, y) x > y + * * INSTR_TIME_GET_DOUBLE(t) convert t to double (in seconds) * * INSTR_TIME_GET_MILLISEC(t) convert t to double (in milliseconds) @@ -184,7 +186,7 @@ GetTimerFrequency(void) #define INSTR_TIME_ACCUM_DIFF(x,y,z) \ ((x).ticks += (y).ticks - (z).ticks) -#define INSTR_TIME_LT(x,y) \ +#define INSTR_TIME_GT(x,y) \ ((x).ticks > (y).ticks) #define INSTR_TIME_GET_DOUBLE(t) \ diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index eb6033b4fdb65..5463bc921f06e 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -75,6 +75,7 @@ typedef uint32 ResourceReleasePriority; #define RELEASE_PRIO_SNAPSHOT_REFS 500 #define RELEASE_PRIO_FILES 600 #define RELEASE_PRIO_WAITEVENTSETS 700 +#define RELEASE_PRIO_INSTRUMENTATION 800 /* 0 is considered invalid */ #define RELEASE_PRIO_FIRST 1 diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 7c1f26b182cb0..e28e754369314 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -822,3 +822,200 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove (9 rows) reset work_mem; +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + exception_buffers_visible +--------------------------- + t +(1 row) + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + exception_buffers_nested_visible +---------------------------------- + t +(1 row) + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_cursor_explain_buffers(); + noscroll_ok | scroll_ok +-------------+----------- + t | t +(1 row) + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; +SELECT * FROM check_parallel_explain_buffers(); + ratio +------- + 1 +(1 row) + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index ebdab42604beb..cf5c6335a1922 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -188,3 +188,197 @@ select explain_filter('explain (analyze,buffers off,costs off) select sum(n) ove -- Test tuplestore storage usage in Window aggregate (memory and disk case, final result is disk) select explain_filter('explain (analyze,buffers off,costs off) select sum(n) over(partition by m) from (SELECT n < 3 as m, n from generate_series(1,2500) a(n))'); reset work_mem; + +-- EXPLAIN (ANALYZE, BUFFERS) should report buffer usage from PL/pgSQL +-- EXCEPTION blocks, even after subtransaction rollback. +CREATE TEMP TABLE explain_exc_tab (a int, b char(20)); +INSERT INTO explain_exc_tab VALUES (0, 'zzz'); + +CREATE FUNCTION explain_exc_func() RETURNS void AS $$ +DECLARE + v int; +BEGIN + WITH ins AS (INSERT INTO explain_exc_tab VALUES (1, 'aaa') RETURNING a) + SELECT a / 0 INTO v FROM ins; +EXCEPTION WHEN division_by_zero THEN + NULL; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_explain_exception_buffers() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT explain_exc_func()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers() AS exception_buffers_visible; + +-- Also test with nested EXPLAIN ANALYZE (two levels of instrumentation) +CREATE FUNCTION check_explain_exception_buffers_nested() RETURNS boolean AS $$ +DECLARE + plan_json json; + node json; + total_buffers int; +BEGIN + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT check_explain_exception_buffers()' INTO plan_json; + node := plan_json->0->'Plan'; + total_buffers := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + RETURN total_buffers > 0; +END; +$$ LANGUAGE plpgsql; + +SELECT check_explain_exception_buffers_nested() AS exception_buffers_nested_visible; + +DROP FUNCTION check_explain_exception_buffers_nested; +DROP FUNCTION check_explain_exception_buffers; +DROP FUNCTION explain_exc_func; +DROP TABLE explain_exc_tab; + +-- Cursor instrumentation test. +-- Verify that buffer usage is correctly tracked through cursor execution paths. +-- Non-scrollable cursors exercise ExecShutdownNode after each ExecutorRun +-- (EXEC_FLAG_BACKWARD is not set), while scrollable cursors only shut down +-- nodes in ExecutorFinish. In both cases, buffer usage from the inner cursor +-- scan should be correctly accumulated. + +CREATE TEMP TABLE cursor_buf_test AS SELECT * FROM tenk1; + +CREATE FUNCTION cursor_noscroll_scan() RETURNS bigint AS $$ +DECLARE + cur NO SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION cursor_scroll_scan() RETURNS bigint AS $$ +DECLARE + cur SCROLL CURSOR FOR SELECT * FROM cursor_buf_test; + rec RECORD; + cnt bigint := 0; +BEGIN + OPEN cur; + LOOP + FETCH NEXT FROM cur INTO rec; + EXIT WHEN NOT FOUND; + cnt := cnt + 1; + END LOOP; + CLOSE cur; + RETURN cnt; +END; +$$ LANGUAGE plpgsql; + +CREATE FUNCTION check_cursor_explain_buffers() RETURNS TABLE(noscroll_ok boolean, scroll_ok boolean) AS $$ +DECLARE + plan_json json; + node json; + direct_buf int; + noscroll_buf int; + scroll_buf int; +BEGIN + -- Direct scan: get leaf Seq Scan node buffers as baseline + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT * FROM cursor_buf_test' INTO plan_json; + node := plan_json->0->'Plan'; + WHILE node->'Plans' IS NOT NULL LOOP + node := node->'Plans'->0; + END LOOP; + direct_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Non-scrollable cursor path: ExecShutdownNode runs after each ExecutorRun + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_noscroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + noscroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Scrollable cursor path: ExecShutdownNode is skipped + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT cursor_scroll_scan()' INTO plan_json; + node := plan_json->0->'Plan'; + scroll_buf := + COALESCE((node->>'Local Hit Blocks')::int, 0) + + COALESCE((node->>'Local Read Blocks')::int, 0); + + -- Both cursor paths should report buffer counts about as high as + -- the direct scan (same data plus minor catalog overhead), and not + -- double-counted (< 2x the direct scan) + RETURN QUERY SELECT + (noscroll_buf >= direct_buf * 0.5 AND noscroll_buf < direct_buf * 2), + (scroll_buf >= direct_buf * 0.5 AND scroll_buf < direct_buf * 2); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_cursor_explain_buffers(); + +DROP FUNCTION check_cursor_explain_buffers; +DROP FUNCTION cursor_noscroll_scan; +DROP FUNCTION cursor_scroll_scan; +DROP TABLE cursor_buf_test; + +-- Parallel query buffer double-counting test. +-- +-- Compares serial Seq Scan buffers vs parallel Seq Scan buffers. +-- They scan the same table so the buffer count should be similar. +-- Double-counting would make the parallel count ~2x larger. +CREATE FUNCTION check_parallel_explain_buffers() RETURNS TABLE(ratio numeric) AS $$ +DECLARE + plan_json json; + serial_buffers int; + parallel_buffers int; + node json; +BEGIN + -- Serial -- + SET LOCAL max_parallel_workers_per_gather = 0; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + serial_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + -- Parallel -- + SET LOCAL parallel_setup_cost = 0; + SET LOCAL parallel_tuple_cost = 0; + SET LOCAL min_parallel_table_scan_size = 0; + SET LOCAL max_parallel_workers_per_gather = 2; + SET LOCAL parallel_leader_participation = off; + EXECUTE 'EXPLAIN (ANALYZE, BUFFERS, COSTS OFF, FORMAT JSON) + SELECT count(*) FROM tenk1' INTO plan_json; + node := plan_json->0->'Plan'; + parallel_buffers := + COALESCE((node->>'Shared Hit Blocks')::int, 0) + + COALESCE((node->>'Shared Read Blocks')::int, 0); + + RETURN QUERY SELECT round(parallel_buffers::numeric / GREATEST(serial_buffers, 1)); +END; +$$ LANGUAGE plpgsql; + +SELECT * FROM check_parallel_explain_buffers(); + +DROP FUNCTION check_parallel_explain_buffers; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 241945734ec86..b3146b0a1651e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1319,6 +1319,7 @@ InjectionPointSharedState InjectionPointsCtl InlineCodeBlock InsertStmt +InstrStack Instrumentation Int128AggState Int8TransTypeData @@ -1784,6 +1785,7 @@ NextSampleBlock_function NextSampleTuple_function NextValueExpr Node +NodeInstrumentation NodeTag NonEmptyRange NoneCompressorState @@ -3155,6 +3157,7 @@ TriggerDesc TriggerEvent TriggerFlags TriggerInfo +TriggerInstrumentation TriggerTransition TruncateStmt TsmRoutine @@ -3375,9 +3378,9 @@ WorkTableScan WorkTableScanState WorkerInfo WorkerInfoData -WorkerInstrumentation WorkerJobDumpPtrType WorkerJobRestorePtrType +WorkerNodeInstrumentation Working_State WriteBufPtrType WriteBytePtrType