diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 025215fcc9065..b5000bc14b78e 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -50,7 +50,6 @@
#include "access/htup_details.h"
#include "access/parallel.h"
#include "catalog/pg_authid.h"
-#include "common/int.h"
#include "executor/instrument.h"
#include "funcapi.h"
#include "jit/jit.h"
@@ -59,7 +58,6 @@
#include "nodes/queryjumble.h"
#include "optimizer/planner.h"
#include "parser/analyze.h"
-#include "parser/scanner.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
@@ -339,7 +337,7 @@ PG_FUNCTION_INFO_V1(pg_stat_statements_info);
static void pgss_shmem_shutdown(int code, Datum arg);
static void pgss_post_parse_analyze(ParseState *pstate, Query *query,
- JumbleState *jstate);
+ const JumbleState *jstate);
static PlannedStmt *pgss_planner(Query *parse,
const char *query_string,
int cursorOptions,
@@ -363,7 +361,7 @@ static void pgss_store(const char *query, int64 queryId,
const BufferUsage *bufusage,
const WalUsage *walusage,
const struct JitInstrumentation *jitusage,
- JumbleState *jstate,
+ const JumbleState *jstate,
int parallel_workers_to_launch,
int parallel_workers_launched,
PlannedStmtOrigin planOrigin);
@@ -381,12 +379,9 @@ static char *qtext_fetch(Size query_offset, int query_len,
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
-static char *generate_normalized_query(JumbleState *jstate, const char *query,
+static char *generate_normalized_query(const JumbleState *jstate,
+ const char *query,
int query_loc, int *query_len_p);
-static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
- int query_loc);
-static int comp_location(const void *a, const void *b);
-
/*
* Module load callback
@@ -836,7 +831,7 @@ pgss_shmem_shutdown(int code, Datum arg)
* Post-parse-analysis hook: mark query with a queryId
*/
static void
-pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
+pgss_post_parse_analyze(ParseState *pstate, Query *query, const JumbleState *jstate)
{
if (prev_post_parse_analyze_hook)
prev_post_parse_analyze_hook(pstate, query, jstate);
@@ -1287,7 +1282,7 @@ pgss_store(const char *query, int64 queryId,
const BufferUsage *bufusage,
const WalUsage *walusage,
const struct JitInstrumentation *jitusage,
- JumbleState *jstate,
+ const JumbleState *jstate,
int parallel_workers_to_launch,
int parallel_workers_launched,
PlannedStmtOrigin planOrigin)
@@ -2824,7 +2819,7 @@ entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only)
* Returns a palloc'd string.
*/
static char *
-generate_normalized_query(JumbleState *jstate, const char *query,
+generate_normalized_query(const JumbleState *jstate, const char *query,
int query_loc, int *query_len_p)
{
char *norm_query;
@@ -2836,12 +2831,14 @@ generate_normalized_query(JumbleState *jstate, const char *query,
last_off = 0, /* Offset from start for previous tok */
last_tok_len = 0; /* Length (in bytes) of that tok */
int num_constants_replaced = 0;
+ LocationLen *locs = NULL;
/*
- * Get constants' lengths (core system only gives us locations). Note
- * this also ensures the items are sorted by location.
+ * Determine constants' lengths (core system only gives us locations), and
+ * obtain a sorted copy of jstate's LocationLen data with lengths filled
+ * in.
*/
- fill_in_constant_lengths(jstate, query, query_loc);
+ locs = ComputeConstantLengths(jstate, query, query_loc);
/*
* Allow for $n symbols to be longer than the constants they replace.
@@ -2867,15 +2864,15 @@ generate_normalized_query(JumbleState *jstate, const char *query,
* the parameter in the next iteration (or after the loop is done),
* which is a bit odd but seems to work okay in most cases.
*/
- if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
+ if (locs[i].extern_param && !jstate->has_squashed_lists)
continue;
- off = jstate->clocations[i].location;
+ off = locs[i].location;
/* Adjust recorded location if we're dealing with partial string */
off -= query_loc;
- tok_len = jstate->clocations[i].length;
+ tok_len = locs[i].length;
if (tok_len < 0)
continue; /* ignore any duplicates */
@@ -2894,7 +2891,7 @@ generate_normalized_query(JumbleState *jstate, const char *query,
*/
n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
num_constants_replaced + 1 + jstate->highest_extern_param_id,
- jstate->clocations[i].squashed ? " /*, ... */" : "");
+ locs[i].squashed ? " /*, ... */" : "");
num_constants_replaced++;
/* move forward */
@@ -2903,6 +2900,10 @@ generate_normalized_query(JumbleState *jstate, const char *query,
last_tok_len = tok_len;
}
+ /* Clean up, if needed */
+ if (locs)
+ pfree(locs);
+
/*
* We've copied up until the last ignorable constant. Copy over the
* remaining bytes of the original query string.
@@ -2919,140 +2920,3 @@ generate_normalized_query(JumbleState *jstate, const char *query,
*query_len_p = n_quer_loc;
return norm_query;
}
-
-/*
- * Given a valid SQL string and an array of constant-location records,
- * fill in the textual lengths of those constants.
- *
- * The constants may use any allowed constant syntax, such as float literals,
- * bit-strings, single-quoted strings and dollar-quoted strings. This is
- * accomplished by using the public API for the core scanner.
- *
- * It is the caller's job to ensure that the string is a valid SQL statement
- * with constants at the indicated locations. Since in practice the string
- * has already been parsed, and the locations that the caller provides will
- * have originated from within the authoritative parser, this should not be
- * a problem.
- *
- * Multiple constants can have the same location. We reset lengths of those
- * past the first to -1 so that they can later be ignored.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate. (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * N.B. There is an assumption that a '-' character at a Const location begins
- * a negative numeric constant. This precludes there ever being another
- * reason for a constant to start with a '-'.
- */
-static void
-fill_in_constant_lengths(JumbleState *jstate, const char *query,
- int query_loc)
-{
- LocationLen *locs;
- core_yyscan_t yyscanner;
- core_yy_extra_type yyextra;
- core_YYSTYPE yylval;
- YYLTYPE yylloc;
-
- /*
- * Sort the records by location so that we can process them in order while
- * scanning the query text.
- */
- if (jstate->clocations_count > 1)
- qsort(jstate->clocations, jstate->clocations_count,
- sizeof(LocationLen), comp_location);
- locs = jstate->clocations;
-
- /* initialize the flex scanner --- should match raw_parser() */
- yyscanner = scanner_init(query,
- &yyextra,
- &ScanKeywords,
- ScanKeywordTokens);
-
- /* Search for each constant, in sequence */
- for (int i = 0; i < jstate->clocations_count; i++)
- {
- int loc;
- int tok;
-
- /* Ignore constants after the first one in the same location */
- if (i > 0 && locs[i].location == locs[i - 1].location)
- {
- locs[i].length = -1;
- continue;
- }
-
- if (locs[i].squashed)
- continue; /* squashable list, ignore */
-
- /* Adjust recorded location if we're dealing with partial string */
- loc = locs[i].location - query_loc;
- Assert(loc >= 0);
-
- /*
- * We have a valid location for a constant that's not a dupe. Lex
- * tokens until we find the desired constant.
- */
- for (;;)
- {
- tok = core_yylex(&yylval, &yylloc, yyscanner);
-
- /* We should not hit end-of-string, but if we do, behave sanely */
- if (tok == 0)
- break; /* out of inner for-loop */
-
- /*
- * We should find the token position exactly, but if we somehow
- * run past it, work with that.
- */
- if (yylloc >= loc)
- {
- if (query[loc] == '-')
- {
- /*
- * It's a negative value - this is the one and only case
- * where we replace more than a single token.
- *
- * Do not compensate for the core system's special-case
- * adjustment of location to that of the leading '-'
- * operator in the event of a negative constant. It is
- * also useful for our purposes to start from the minus
- * symbol. In this way, queries like "select * from foo
- * where bar = 1" and "select * from foo where bar = -2"
- * will have identical normalized query strings.
- */
- tok = core_yylex(&yylval, &yylloc, yyscanner);
- if (tok == 0)
- break; /* out of inner for-loop */
- }
-
- /*
- * We now rely on the assumption that flex has placed a zero
- * byte after the text of the current token in scanbuf.
- */
- locs[i].length = strlen(yyextra.scanbuf + loc);
- break; /* out of inner for-loop */
- }
- }
-
- /* If we hit end-of-string, give up, leaving remaining lengths -1 */
- if (tok == 0)
- break;
- }
-
- scanner_finish(yyscanner);
-}
-
-/*
- * comp_location: comparator for qsorting LocationLen structs by location
- */
-static int
-comp_location(const void *a, const void *b)
-{
- int l = ((const LocationLen *) a)->location;
- int r = ((const LocationLen *) b)->location;
-
- return pg_cmp_s32(l, r);
-}
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 3324d2d3c49e1..13e95afa97c0e 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2533,6 +2533,77 @@ include_dir 'conf.d'
+
+ Timing
+
+
+
+ timing_clock_source (enum)
+
+ timing_clock_source configuration parameter
+
+ RDTSC
+
+ Time-Stamp Counter
+ TSC
+
+ TSC
+
+
+
+ Selects the method for making timing measurements using the OS or
+ specialized CPU instructions. Possible values are:
+
+
+
+ auto (automatically chooses TSC
+ clock source on supported x86-64 CPUs, otherwise uses the OS system
+ clock)
+
+
+
+
+ system (measures timing using the OS system clock)
+
+
+
+
+ tsc (measures timing with a CPU instruction, e.g.
+ using RDTSC/RDTSCP on x86-64)
+
+
+
+ The default is auto. Only superusers can change this
+ setting. Changing the setting during query execution is not recommended
+ and may cause interval timings to jump significantly or produce negative
+ values.
+
+
+ If enabled, the TSC clock source, named after the
+ Time-Stamp Counter on x86-64, will use specialized CPU instructions when
+ measuring time intervals. This lowers timing overhead compared to reading
+ the OS system clock, and reduces the measurement error on top of the
+ actual runtime, for example with EXPLAIN ANALYZE.
+
+
+ On x86-64 CPUs the TSC clock source utilizes the
+ RDTSC instruction for EXPLAIN ANALYZE.
+ For timings that require higher precision the RDTSCP
+ instruction is used, which avoids inaccuracies due to CPU instruction
+ re-ordering. Use of the TSC clock source is not
+ supported on older x86-64 CPUs and other architectures, and is not
+ advised on systems that utilize an emulated TSC, as it
+ is likely slower than the system clock source.
+
+
+ To help decide which clock source to use you can run the
+ pg_test_timing utility to check TSC
+ availability and perform timing measurements.
+
+
+
+
+
Background Writer
diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml
index afe6a12be4b30..342f4425c65c7 100644
--- a/doc/src/sgml/ref/pgtesttiming.sgml
+++ b/doc/src/sgml/ref/pgtesttiming.sgml
@@ -32,9 +32,10 @@ PostgreSQL documentation
pg_test_timing is a tool to measure the
timing overhead on your system and confirm that the system time never
- moves backwards. It simply reads the system clock over and over again
+ moves backwards. It reads supported clock sources over and over again
as fast as it can for a specified length of time, and then prints
- statistics about the observed differences in successive clock readings.
+ statistics about the observed differences in successive clock readings,
+ as well as which clock source will be used.
Smaller (but not zero) differences are better, since they imply both
@@ -45,7 +46,10 @@ PostgreSQL documentation
This tool is also helpful to determine if
the track_io_timing configuration parameter is likely
- to produce useful results.
+ to produce useful results, and whether the
+ TSC clock source (see
+ timing_clock_source) is available and will be
+ used by default.
@@ -151,47 +155,134 @@ PostgreSQL documentation
However, the largest observed difference is always shown.
- The example results below show that 99.99% of timing loops took between
- 8 and 31 nanoseconds, with the worst case somewhere between 32768 and
- 65535 nanoseconds. In the second block, we can see that typical loop
- time is 16 nanoseconds, and the readings appear to have full nanosecond
- precision.
+ On platforms that support the TSC clock source,
+ additional output sections are shown for the RDTSCP
+ instruction (used for general timing needs, such as
+ track_io_timing) and the RDTSC
+ instruction (used for EXPLAIN ANALYZE). At the end
+ of the output, the TSC frequency, which may be sourced either
+ from CPU information directly or from the alternate calibration
+ mechanism, is shown, as well as whether the TSC clock
+ source will be used by default.
+
+
+
+ The example results below show system clock timing where 99.99% of loops
+ took between 16 and 63 nanoseconds, followed by TSC
+ clock source results. The RDTSCP instruction shows
+ most loops completing in 20–30 nanoseconds, while the
+ RDTSC instruction is the fastest at
+ 9–30 nanoseconds. In this example the TSC
+ clock source will be used by default, but can be disabled by setting
+ timing_clock_source to system.
@@ -203,6 +294,7 @@ Observed timing durations up to 99.9900%:
+
Wiki
discussion about timing
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index eea45106a3ffe..b354723be4435 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -280,8 +280,10 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights)
if (type->lt_opr == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("column \"%s\" cannot be used in multivariate statistics because its type %s has no default btree operator class",
- attname, format_type_be(attForm->atttypid))));
+ errmsg("cannot create multivariate statistics on column \"%s\"",
+ attname),
+ errdetail("The type %s has no default btree operator class.",
+ format_type_be(attForm->atttypid))));
}
/* Treat virtual generated columns as expressions */
@@ -325,8 +327,10 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights)
if (type->lt_opr == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("column \"%s\" cannot be used in multivariate statistics because its type %s has no default btree operator class",
- get_attname(relid, var->varattno, false), format_type_be(var->vartype))));
+ errmsg("cannot create multivariate statistics on column \"%s\"",
+ get_attname(relid, var->varattno, false)),
+ errdetail("The type %s has no default btree operator class.",
+ format_type_be(var->vartype))));
}
/* Treat virtual generated columns as expressions */
@@ -375,8 +379,9 @@ CreateStatistics(CreateStatsStmt *stmt, bool check_rights)
if (type->lt_opr == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
- format_type_be(atttype))));
+ errmsg("cannot create multivariate statistics on this expression"),
+ errdetail("The type %s has no default btree operator class.",
+ format_type_be(atttype))));
}
stxexprs = lappend(stxexprs, expr);
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 4d4e96a530236..c41005ba44e62 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -3940,6 +3940,13 @@ typedef struct AfterTriggerCallbackItem
static AfterTriggersData afterTriggers;
+/*
+ * Incremented before invoking afterTriggerInvokeEvents() and decremented
+ * once that firing pass has finished. Used by AfterTriggerIsActive() to
+ * determine whether batch callbacks will fire, so that RI trigger functions can take the batched fast path.
+ */
+static int afterTriggerFiringDepth = 0;
+
static void AfterTriggerExecute(EState *estate,
AfterTriggerEvent event,
ResultRelInfo *relInfo,
@@ -5113,6 +5120,7 @@ AfterTriggerBeginXact(void)
Assert(afterTriggers.events.head == NULL);
Assert(afterTriggers.trans_stack == NULL);
Assert(afterTriggers.maxtransdepth == 0);
+ Assert(afterTriggerFiringDepth == 0);
}
@@ -5184,6 +5192,7 @@ AfterTriggerEndQuery(EState *estate)
*/
qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+ afterTriggerFiringDepth++;
for (;;)
{
if (afterTriggerMarkEvents(&qs->events, &afterTriggers.events, true))
@@ -5234,6 +5243,7 @@ AfterTriggerEndQuery(EState *estate)
AfterTriggerFreeQuery(&afterTriggers.query_stack[afterTriggers.query_depth]);
afterTriggers.query_depth--;
+ afterTriggerFiringDepth--;
}
@@ -5329,6 +5339,7 @@ AfterTriggerFireDeferred(void)
* Run all the remaining triggers. Loop until they are all gone, in case
* some trigger queues more for us to do.
*/
+ afterTriggerFiringDepth++;
while (afterTriggerMarkEvents(events, NULL, false))
{
CommandId firing_id = afterTriggers.firing_counter++;
@@ -5340,6 +5351,8 @@ AfterTriggerFireDeferred(void)
/* Flush any fast-path batches accumulated by the triggers just fired. */
FireAfterTriggerBatchCallbacks();
+ afterTriggerFiringDepth--;
+
/*
* We don't bother freeing the event list, since it will go away anyway
* (and more efficiently than via pfree) in AfterTriggerEndXact.
@@ -5404,6 +5417,8 @@ AfterTriggerEndXact(bool isCommit)
/* No more afterTriggers manipulation until next transaction starts. */
afterTriggers.query_depth = -1;
+
+ afterTriggerFiringDepth = 0;
}
/*
@@ -6053,6 +6068,7 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt)
AfterTriggerEventList *events = &afterTriggers.events;
bool snapshot_set = false;
+ afterTriggerFiringDepth++;
while (afterTriggerMarkEvents(events, NULL, true))
{
CommandId firing_id = afterTriggers.firing_counter++;
@@ -6086,6 +6102,7 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt)
* Flush any fast-path batches accumulated by the triggers just fired.
*/
FireAfterTriggerBatchCallbacks();
+ afterTriggerFiringDepth--;
if (snapshot_set)
PopActiveSnapshot();
@@ -6806,10 +6823,10 @@ RegisterAfterTriggerBatchCallback(AfterTriggerBatchCallback callback,
* Allocate in TopTransactionContext so the item survives for the duration
* of the batch, which may span multiple trigger invocations.
*
- * Must be called while afterTriggers is active (query_depth >= 0);
- * callbacks registered outside a trigger-firing context would never fire.
+ * Must be called while afterTriggers is active; callbacks registered
+ * outside a trigger-firing context would never fire.
*/
- Assert(afterTriggers.query_depth >= 0);
+ Assert(afterTriggerFiringDepth > 0);
oldcxt = MemoryContextSwitchTo(TopTransactionContext);
item = palloc(sizeof(AfterTriggerCallbackItem));
item->callback = callback;
@@ -6836,6 +6853,7 @@ FireAfterTriggerBatchCallbacks(void)
if (afterTriggers.query_depth > 0)
return;
+ Assert(afterTriggerFiringDepth > 0);
foreach(lc, afterTriggers.batch_callbacks)
{
AfterTriggerCallbackItem *item = lfirst(lc);
@@ -6858,5 +6876,5 @@ FireAfterTriggerBatchCallbacks(void)
bool
AfterTriggerIsActive(void)
{
- return afterTriggers.query_depth >= 0;
+ return afterTriggerFiringDepth > 0;
}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 011a9684df0d5..b1f31d59a4d6a 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,6 +16,8 @@
#include
#include "executor/instrument.h"
+#include "portability/instr_time.h"
+#include "utils/guc_hooks.h"
BufferUsage pgBufferUsage;
static BufferUsage save_pgBufferUsage;
@@ -52,7 +54,7 @@ InstrStart(Instrumentation *instr)
if (!INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStart called twice in a row");
else
- INSTR_TIME_SET_CURRENT(instr->starttime);
+ INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
}
/* save buffer usage totals at start, if needed */
@@ -78,7 +80,7 @@ InstrStopCommon(Instrumentation *instr, instr_time *accum_time)
if (INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStop called without start");
- INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_SET_CURRENT_FAST(endtime);
INSTR_TIME_ACCUM_DIFF(*accum_time, endtime, instr->starttime);
INSTR_TIME_SET_ZERO(instr->starttime);
@@ -345,3 +347,75 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes;
dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full;
}
+
+/* GUC hooks for timing_clock_source */
+
+bool
+check_timing_clock_source(int *newval, void **extra, GucSource source)
+{
+ /*
+ * Do nothing if timing is not initialized. This is only expected on child
+ * processes in EXEC_BACKEND builds, as GUC hooks can be called during
+ * InitializeGUCOptions() before InitProcessGlobals() has had a chance to
+ * run pg_initialize_timing(). Instead, timing is initialized and the clock
+ * source is applied in restore_backend_variables().
+ */
+#ifdef EXEC_BACKEND
+ if (!timing_initialized)
+ return true;
+#else
+ Assert(timing_initialized);
+#endif
+
+#if PG_INSTR_TSC_CLOCK
+ pg_initialize_timing_tsc();
+
+ if (*newval == TIMING_CLOCK_SOURCE_TSC && timing_tsc_frequency_khz <= 0)
+ {
+ GUC_check_errdetail("TSC is not supported as timing clock source");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+void
+assign_timing_clock_source(int newval, void *extra)
+{
+#ifdef EXEC_BACKEND
+ if (!timing_initialized)
+ return;
+#else
+ Assert(timing_initialized);
+#endif
+
+ /*
+ * Ignore the return code since the check hook already verified TSC is
+ * usable if it's explicitly requested.
+ */
+ pg_set_timing_clock_source(newval);
+}
+
+const char *
+show_timing_clock_source(void)
+{
+ switch (timing_clock_source)
+ {
+ case TIMING_CLOCK_SOURCE_AUTO:
+#if PG_INSTR_TSC_CLOCK
+ if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+ return "auto (tsc)";
+#endif
+ return "auto (system)";
+ case TIMING_CLOCK_SOURCE_SYSTEM:
+ return "system";
+#if PG_INSTR_TSC_CLOCK
+ case TIMING_CLOCK_SOURCE_TSC:
+ return "tsc";
+#endif
+ }
+
+ /* unreachable unless timing_clock_source holds a value not handled above */
+ return "?";
+}
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 87db8dc1a32f1..7c63766a51c5d 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -40,10 +40,12 @@
#include "access/transam.h"
#include "catalog/pg_proc.h"
#include "common/hashfn.h"
+#include "common/int.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "nodes/queryjumble.h"
#include "utils/lsyscache.h"
+#include "parser/scanner.h"
#include "parser/scansup.h"
#define JUMBLE_SIZE 1024 /* query serialization buffer size */
@@ -773,3 +775,156 @@ _jumbleRangeTblEntry_eref(JumbleState *jstate,
*/
JUMBLE_STRING(aliasname);
}
+
+/*
+ * CompLocation: qsort comparator ordering LocationLen structs by their location field
+ */
+static int
+CompLocation(const void *a, const void *b)
+{
+ int l = ((const LocationLen *) a)->location;
+ int r = ((const LocationLen *) b)->location;
+
+ return pg_cmp_s32(l, r);
+}
+
+/*
+ * Given a valid SQL string and an array of constant-location records, return
+ * the textual lengths of those constants in a newly allocated LocationLen
+ * array, or NULL if there are no constants.
+ *
+ * The constants may use any allowed constant syntax, such as float literals,
+ * bit-strings, single-quoted strings and dollar-quoted strings. This is
+ * accomplished by using the public API for the core scanner.
+ *
+ * It is the caller's job to ensure that the string is a valid SQL statement
+ * with constants at the indicated locations. Since in practice the string
+ * has already been parsed, and the locations that the caller provides will
+ * have originated from within the authoritative parser, this should not be
+ * a problem.
+ *
+ * Multiple constants can have the same location. We reset lengths of those
+ * past the first to -1 so that they can later be ignored.
+ *
+ * If query_loc > 0, then "query" has been advanced by that much compared to
+ * the original string start, as is the case with multi-statement strings, so
+ * we need to translate the provided locations to compensate. (This lets us
+ * avoid re-scanning statements before the one of interest, so it's worth
+ * doing.)
+ *
+ * N.B. There is an assumption that a '-' character at a Const location begins
+ * a negative numeric constant. This precludes there ever being another
+ * reason for a constant to start with a '-'.
+ *
+ * It is the caller's responsibility to free the result, if necessary.
+ */
+LocationLen *
+ComputeConstantLengths(const JumbleState *jstate, const char *query,
+ int query_loc)
+{
+ LocationLen *locs;
+ core_yyscan_t yyscanner;
+ core_yy_extra_type yyextra;
+ core_YYSTYPE yylval;
+ YYLTYPE yylloc;
+
+ if (jstate->clocations_count == 0)
+ return NULL;
+
+ /* Copy constant locations to avoid modifying jstate */
+ locs = palloc_array(LocationLen, jstate->clocations_count);
+ memcpy(locs, jstate->clocations, jstate->clocations_count * sizeof(LocationLen));
+
+ /*
+ * Sort the records by location so that we can process them in order while
+ * scanning the query text.
+ */
+ if (jstate->clocations_count > 1)
+ qsort(locs, jstate->clocations_count,
+ sizeof(LocationLen), CompLocation);
+
+ /* initialize the flex scanner --- should match raw_parser() */
+ yyscanner = scanner_init(query,
+ &yyextra,
+ &ScanKeywords,
+ ScanKeywordTokens);
+
+ /* Search for each constant, in sequence */
+ for (int i = 0; i < jstate->clocations_count; i++)
+ {
+ int loc;
+ int tok;
+
+ /* Ignore constants after the first one in the same location */
+ if (i > 0 && locs[i].location == locs[i - 1].location)
+ {
+ locs[i].length = -1;
+ continue;
+ }
+
+ if (locs[i].squashed)
+ continue; /* squashable list: keep the length as copied from jstate */
+
+ /*
+ * Adjust the constant's location using the provided starting location
+ * of the current statement. This allows us to avoid scanning a
+ * multi-statement string from the beginning.
+ */
+ loc = locs[i].location - query_loc;
+ Assert(loc >= 0);
+
+ /*
+ * We have a valid location for a constant that's not a dupe. Lex
+ * tokens until we find the desired constant.
+ */
+ for (;;)
+ {
+ tok = core_yylex(&yylval, &yylloc, yyscanner);
+
+ /* We should not hit end-of-string, but if we do, behave sanely */
+ if (tok == 0)
+ break; /* out of inner for-loop */
+
+ /*
+ * We should find the token position exactly, but if we somehow
+ * run past it, work with that.
+ */
+ if (yylloc >= loc)
+ {
+ if (query[loc] == '-')
+ {
+ /*
+ * It's a negative value - this is the one and only case
+ * where we replace more than a single token.
+ *
+ * Do not compensate for the special-case adjustment of
+ * location to that of the leading '-' operator in the
+ * event of a negative constant (see doNegate() in
+ * gram.y). It is also useful for our purposes to start
+ * from the minus symbol. In this way, queries like
+ * "select * from foo where bar = 1" and "select * from
+ * foo where bar = -2" can be treated similarly.
+ */
+ tok = core_yylex(&yylval, &yylloc, yyscanner);
+ if (tok == 0)
+ break; /* out of inner for-loop */
+ }
+
+ /*
+ * We now rely on the assumption that flex has placed a zero
+ * byte after the text of the current token in scanbuf.
+ */
+ locs[i].length = strlen(yyextra.scanbuf + loc);
+ break; /* out of inner for-loop */
+ }
+ }
+
+ /* If we hit end-of-string, give up; remaining lengths stay as copied (presumably -1) */
+ if (tok == 0)
+ break;
+ }
+
+ scanner_finish(yyscanner);
+
+ return locs;
+}
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index ed0f4f2d23436..8f3cfea880c3c 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -57,6 +57,7 @@
#ifdef EXEC_BACKEND
#include "nodes/queryjumble.h"
+#include "portability/instr_time.h"
#include "storage/pg_shmem.h"
#include "storage/spin.h"
#endif
@@ -129,6 +130,8 @@ typedef struct
int MyPMChildSlot;
+ int32 timing_tsc_frequency_khz;
+
/*
* These are only used by backend processes, but are here because passing
* a socket needs some special handling on Windows. 'client_sock' is an
@@ -750,6 +753,8 @@ save_backend_variables(BackendParameters *param,
param->MaxBackends = MaxBackends;
param->num_pmchild_slots = num_pmchild_slots;
+ param->timing_tsc_frequency_khz = timing_tsc_frequency_khz;
+
#ifdef WIN32
param->PostmasterHandle = PostmasterHandle;
if (!write_duplicated_handle(&param->initial_signal_pipe,
@@ -1004,6 +1009,12 @@ restore_backend_variables(BackendParameters *param)
MaxBackends = param->MaxBackends;
num_pmchild_slots = param->num_pmchild_slots;
+ timing_tsc_frequency_khz = param->timing_tsc_frequency_khz;
+
+ /* Re-run logic usually done by assign_timing_clock_source */
+ pg_initialize_timing();
+ pg_set_timing_clock_source(timing_clock_source);
+
#ifdef WIN32
PostmasterHandle = param->PostmasterHandle;
pgwin32_initial_signal_pipe = param->initial_signal_pipe;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 6f13e8f40a0be..26bf4cfe2f5c7 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1954,6 +1954,11 @@ InitProcessGlobals(void)
MyStartTimestamp = GetCurrentTimestamp();
MyStartTime = timestamptz_to_time_t(MyStartTimestamp);
+ /*
+ * Initialize timing infrastructure
+ */
+ pg_initialize_timing();
+
/*
* Set a different global seed in every process. We want something
* unpredictable, so if possible, use high-quality random bits for the
diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c
index 955faf5ebc7d2..b8f354c818a06 100644
--- a/src/backend/utils/activity/pgstat_shmem.c
+++ b/src/backend/utils/activity/pgstat_shmem.c
@@ -150,8 +150,7 @@ StatsShmemSize(void)
continue;
Assert(kind_info->shared_size != 0);
-
- sz += MAXALIGN(kind_info->shared_size);
+ sz = add_size(sz, MAXALIGN(kind_info->shared_size));
}
return sz;
@@ -189,6 +188,7 @@ StatsShmemInit(void *arg)
* efficiency win.
*/
ctl->raw_dsa_area = p;
+ p += pgstat_dsa_init_size();
dsa = dsa_create_in_place(ctl->raw_dsa_area,
pgstat_dsa_init_size(),
LWTRANCHE_PGSTATS_DSA, NULL);
@@ -242,7 +242,8 @@ StatsShmemInit(void *arg)
int idx = kind - PGSTAT_KIND_CUSTOM_MIN;
Assert(kind_info->shared_size != 0);
- ctl->custom_data[idx] = ShmemAlloc(kind_info->shared_size);
+ ctl->custom_data[idx] = p;
+ p += MAXALIGN(kind_info->shared_size);
ptr = ctl->custom_data[idx];
}
diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c
index f14675dea409f..923c5af54f8bf 100644
--- a/src/backend/utils/adt/mac.c
+++ b/src/backend/utils/adt/mac.c
@@ -14,11 +14,9 @@
#include "postgres.h"
#include "common/hashfn.h"
-#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "port/pg_bswap.h"
#include "utils/fmgrprotos.h"
-#include "utils/guc.h"
#include "utils/inet.h"
#include "utils/sortsupport.h"
@@ -33,15 +31,6 @@
#define lobits(addr) \
((unsigned long)(((addr)->d<<16)|((addr)->e<<8)|((addr)->f)))
-/* sortsupport for macaddr */
-typedef struct
-{
- int64 input_count; /* number of non-null values seen */
- bool estimating; /* true if estimating cardinality */
-
- hyperLogLogState abbr_card; /* cardinality estimator */
-} macaddr_sortsupport_state;
-
static int macaddr_cmp_internal(macaddr *a1, macaddr *a2);
static int macaddr_fast_cmp(Datum x, Datum y, SortSupport ssup);
static bool macaddr_abbrev_abort(int memtupcount, SortSupport ssup);
@@ -369,24 +358,10 @@ macaddr_sortsupport(PG_FUNCTION_ARGS)
if (ssup->abbreviate)
{
- macaddr_sortsupport_state *uss;
- MemoryContext oldcontext;
-
- oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
-
- uss = palloc_object(macaddr_sortsupport_state);
- uss->input_count = 0;
- uss->estimating = true;
- initHyperLogLog(&uss->abbr_card, 10);
-
- ssup->ssup_extra = uss;
-
ssup->comparator = ssup_datum_unsigned_cmp;
ssup->abbrev_converter = macaddr_abbrev_convert;
ssup->abbrev_abort = macaddr_abbrev_abort;
ssup->abbrev_full_comparator = macaddr_fast_cmp;
-
- MemoryContextSwitchTo(oldcontext);
}
PG_RETURN_VOID();
@@ -406,61 +381,13 @@ macaddr_fast_cmp(Datum x, Datum y, SortSupport ssup)
}
/*
- * Callback for estimating effectiveness of abbreviated key optimization.
- *
- * We pay no attention to the cardinality of the non-abbreviated data, because
- * there is no equality fast-path within authoritative macaddr comparator.
+ * Abbreviation is never aborted for macaddr because the 6-byte MAC address
+ * fits entirely within a 64-bit Datum, making the abbreviated key
+ * authoritative.
*/
static bool
macaddr_abbrev_abort(int memtupcount, SortSupport ssup)
{
- macaddr_sortsupport_state *uss = ssup->ssup_extra;
- double abbr_card;
-
- if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating)
- return false;
-
- abbr_card = estimateHyperLogLog(&uss->abbr_card);
-
- /*
- * If we have >100k distinct values, then even if we were sorting many
- * billion rows we'd likely still break even, and the penalty of undoing
- * that many rows of abbrevs would probably not be worth it. At this point
- * we stop counting because we know that we're now fully committed.
- */
- if (abbr_card > 100000.0)
- {
- if (trace_sort)
- elog(LOG,
- "macaddr_abbrev: estimation ends at cardinality %f"
- " after " INT64_FORMAT " values (%d rows)",
- abbr_card, uss->input_count, memtupcount);
- uss->estimating = false;
- return false;
- }
-
- /*
- * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row
- * fudge factor allows us to abort earlier on genuinely pathological data
- * where we've had exactly one abbreviated value in the first 2k
- * (non-null) rows.
- */
- if (abbr_card < uss->input_count / 2000.0 + 0.5)
- {
- if (trace_sort)
- elog(LOG,
- "macaddr_abbrev: aborting abbreviation at cardinality %f"
- " below threshold %f after " INT64_FORMAT " values (%d rows)",
- abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count,
- memtupcount);
- return true;
- }
-
- if (trace_sort)
- elog(LOG,
- "macaddr_abbrev: cardinality %f after " INT64_FORMAT
- " values (%d rows)", abbr_card, uss->input_count, memtupcount);
-
return false;
}
@@ -469,14 +396,13 @@ macaddr_abbrev_abort(int memtupcount, SortSupport ssup)
* to abbreviated key representation.
*
* Packs the bytes of a 6-byte MAC address into a Datum and treats it as an
- * unsigned integer for purposes of comparison. On a 64-bit machine, there
- * will be two zeroed bytes of padding. The integer is converted to native
- * endianness to facilitate easy comparison.
+ * unsigned integer for purposes of comparison. There will be two zeroed bytes
+ * of padding. The integer is converted to native endianness to facilitate
+ * easy comparison.
*/
static Datum
macaddr_abbrev_convert(Datum original, SortSupport ssup)
{
- macaddr_sortsupport_state *uss = ssup->ssup_extra;
macaddr *authoritative = DatumGetMacaddrP(original);
Datum res;
@@ -489,21 +415,6 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup)
"Datum is too small for macaddr");
memset(&res, 0, sizeof(res));
memcpy(&res, authoritative, sizeof(macaddr));
- uss->input_count += 1;
-
- /*
- * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit
- * halves together to produce slightly more entropy. The two zeroed bytes
- * won't have any practical impact on this operation.
- */
- if (uss->estimating)
- {
- uint32 tmp;
-
- tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
-
- addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
- }
/*
* Byteswap on little-endian machines.
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index fcb6ab8058309..31e5b85dc4f34 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -3052,6 +3052,17 @@
assign_hook => 'assign_timezone_abbreviations',
},
+{ name => 'timing_clock_source', type => 'enum', context => 'PGC_SUSET', group => 'RESOURCES_TIME',
+ short_desc => 'Controls the clock source used for collecting timing measurements.',
+ long_desc => 'This enables the use of specialized clock sources, specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.',
+ variable => 'timing_clock_source',
+ boot_val => 'TIMING_CLOCK_SOURCE_AUTO',
+ options => 'timing_clock_source_options',
+ check_hook => 'check_timing_clock_source',
+ assign_hook => 'assign_timing_clock_source',
+ show_hook => 'show_timing_clock_source',
+},
+
{ name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS',
short_desc => 'Logs details of pre-authentication connection handshake.',
flags => 'GUC_NOT_IN_SAMPLE',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index d9ca13baff97d..9f9d8d17be917 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -92,6 +92,7 @@
#include "tcop/tcopprot.h"
#include "tsearch/ts_cache.h"
#include "utils/builtins.h"
+#include "portability/instr_time.h"
#include "utils/bytea.h"
#include "utils/float.h"
#include "utils/guc_hooks.h"
@@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = {
{NULL, 0, false}
};
+static const struct config_enum_entry timing_clock_source_options[] = {
+ {"auto", TIMING_CLOCK_SOURCE_AUTO, false},
+ {"system", TIMING_CLOCK_SOURCE_SYSTEM, false},
+#if PG_INSTR_TSC_CLOCK
+ {"tsc", TIMING_CLOCK_SOURCE_TSC, false},
+#endif
+ {NULL, 0, false}
+};
+
static const struct config_enum_entry huge_pages_status_options[] = {
{"off", HUGE_PAGES_OFF, false},
{"on", HUGE_PAGES_ON, false},
@@ -731,6 +741,7 @@ const char *const config_group_names[] =
[CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"),
[CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"),
[CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"),
+ [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"),
[RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"),
[RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"),
[RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e3e462f3efb90..5fc7323440ab7 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -196,6 +196,10 @@
#max_files_per_process = 1000 # min 64
# (change requires restart)
+# - Time -
+
+#timing_clock_source = auto # auto, system, tsc (if supported)
+
# - Background Writer -
#bgwriter_delay = 200ms # 10-10000ms between rounds
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index aee41dbe3f9b7..84388b74aceaa 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -30,22 +30,29 @@ static long long int largest_diff_count;
static void handle_args(int argc, char *argv[]);
-static uint64 test_timing(unsigned int duration);
+static void test_system_timing(void);
+#if PG_INSTR_TSC_CLOCK
+static void test_tsc_timing(void);
+#endif
+static uint64 test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing);
static void output(uint64 loop_count);
int
main(int argc, char *argv[])
{
- uint64 loop_count;
-
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_timing"));
progname = get_progname(argv[0]);
handle_args(argc, argv);
- loop_count = test_timing(test_duration);
+ /* initialize timing infrastructure (required for INSTR_* calls) */
+ pg_initialize_timing();
- output(loop_count);
+ test_system_timing();
+
+#if PG_INSTR_TSC_CLOCK
+ test_tsc_timing();
+#endif
return 0;
}
@@ -143,20 +150,99 @@ handle_args(int argc, char *argv[])
exit(1);
}
- printf(ngettext("Testing timing overhead for %u second.\n",
- "Testing timing overhead for %u seconds.\n",
+ printf(ngettext("Testing timing overhead for %u second.\n\n",
+ "Testing timing overhead for %u seconds.\n\n",
test_duration),
test_duration);
}
+/*
+ * This tests default (non-fast) timing code. A clock source for that is
+ * always available. Hence, we can unconditionally output the result.
+ */
+static void
+test_system_timing(void)
+{
+ uint64 loop_count;
+
+ loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_SYSTEM, false);
+ output(loop_count);
+}
+
+/*
+ * If on a supported architecture, test the TSC clock source. This clock
+ * source is not always available. In that case we print an informational
+ * message indicating as such.
+ *
+ * We first emit "slow" timings (RDTSCP on x86), which are used for higher
+ * precision measurements when the TSC clock source is enabled. We emit
+ * "fast" timings second (RDTSC on x86), which is used for faster timing
+ * measurements with lower precision.
+ */
+#if PG_INSTR_TSC_CLOCK
+static void
+test_tsc_timing(void)
+{
+ uint64 loop_count;
+ uint32 calibrated_freq;
+
+ printf("\n");
+
+ /* test_timing() returns 0 when the TSC clock source cannot be selected */
+ loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, false);
+ if (loop_count > 0)
+ {
+ output(loop_count);
+ printf("\n");
+
+ /* Now, emit fast timing measurements */
+ loop_count = test_timing(test_duration, TIMING_CLOCK_SOURCE_TSC, true);
+ output(loop_count);
+ printf("\n");
+
+ /* timing_tsc_frequency_khz is a signed int32, hence %d rather than %u */
+ printf(_("TSC frequency in use: %d kHz\n"), timing_tsc_frequency_khz);
+
+ /* Also report the frequency obtained from the calibration loop */
+ calibrated_freq = pg_tsc_calibrate_frequency();
+ if (calibrated_freq > 0)
+ printf(_("TSC frequency from calibration: %u kHz\n"), calibrated_freq);
+ else
+ printf(_("TSC calibration did not converge\n"));
+
+ /* Switch to "auto" to find out which clock source it resolves to */
+ pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_AUTO);
+ if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+ printf(_("TSC clock source will be used by default, unless timing_clock_source is set to 'system'.\n"));
+ else
+ printf(_("TSC clock source will not be used by default, unless timing_clock_source is set to 'tsc'.\n"));
+ }
+ else
+ printf(_("TSC clock source is not usable. Likely unable to determine TSC frequency. Are you running in an unsupported virtualized environment?\n"));
+}
+#endif
+
static uint64
-test_timing(unsigned int duration)
+test_timing(unsigned int duration, TimingClockSourceType source, bool fast_timing)
{
uint64 loop_count = 0;
instr_time start_time,
end_time,
prev,
cur;
+ char *time_source = NULL;
+
+ if (!pg_set_timing_clock_source(source))
+ return 0;
+
+ time_source = PG_INSTR_SYSTEM_CLOCK_NAME;
+
+#if PG_INSTR_TSC_CLOCK
+ if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC)
+ time_source = fast_timing ? PG_INSTR_TSC_CLOCK_NAME_FAST : PG_INSTR_TSC_CLOCK_NAME;
+#endif
+
+ if (fast_timing)
+ printf(_("Fast clock source: %s\n"), time_source);
+ else if (source == TIMING_CLOCK_SOURCE_SYSTEM)
+ printf(_("System clock source: %s\n"), time_source);
+ else
+ printf(_("Clock source: %s\n"), time_source);
/*
* Pre-zero the statistics data structures. They're already zero by
@@ -181,7 +267,11 @@ test_timing(unsigned int duration)
instr_time diff_time;
prev = cur;
- INSTR_TIME_SET_CURRENT(cur);
+
+ if (fast_timing)
+ INSTR_TIME_SET_CURRENT_FAST(cur);
+ else
+ INSTR_TIME_SET_CURRENT(cur);
diff_time = cur;
INSTR_TIME_SUBTRACT(diff_time, prev);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 1dae918cc09d2..c969afab3a595 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -6820,6 +6820,9 @@ main(int argc, char **argv)
int exit_code = 0;
struct timeval tv;
+ /* initialize timing infrastructure (required for INSTR_* calls) */
+ pg_initialize_timing();
+
/*
* Record difference between Unix time and instr_time time. We'll use
* this for logging and aggregation.
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 9a397ec87b736..69d044d405d5b 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
#include "help.h"
#include "input.h"
#include "mainloop.h"
+#include "portability/instr_time.h"
#include "settings.h"
/*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
+ /* initialize timing infrastructure (required for INSTR_* calls) */
+ pg_initialize_timing();
+
SyncVariables();
if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 2c720caa50972..1a2fbbe887f22 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
file_perm.o \
file_utils.o \
hashfn.o \
+ instr_time.o \
ip.o \
jsonapi.o \
keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 0000000000000..14ab4579d37b8
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,438 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ * Non-inline parts of the portable high-precision interval timing
+ * implementation
+ *
+ * Portions Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
+#include <math.h>
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#include "port/pg_cpu.h"
+#include "portability/instr_time.h"
+
+/*
+ * Stores what the number of ticks needs to be multiplied with to end up
+ * with nanoseconds using integer math.
+ *
+ * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
+ * the ticks to nanoseconds conversion requires floating point math because:
+ *
+ * sec = ticks / frequency_hz
+ * ns = ticks / frequency_hz * 1,000,000,000
+ * ns = ticks * (1,000,000,000 / frequency_hz)
+ * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
+ *
+ * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU
+ * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4.
+ *
+ * To be able to use integer math we work around the lack of precision. We
+ * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
+ * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
+ * the same amount.
+ *
+ * We remember the maximum number of ticks that can be multiplied by the scale
+ * factor without overflowing so we can check via a * b > max <=> a > max / b.
+ *
+ * However, as this is meant for interval measurements, it is unlikely that the
+ * overflow path is actually taken in typical scenarios, since overflows would
+ * only occur for intervals longer than 6.5 days.
+ *
+ * Note we utilize unsigned integers even though ticks are stored as a signed
+ * value to encourage compilers to generate better assembly, since we can be
+ * sure these values are not negative.
+ *
+ * In all other cases we are using clock_gettime(), which uses nanoseconds
+ * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
+ * to return the original value.
+ */
+uint64 ticks_per_ns_scaled = 0;
+uint64 max_ticks_no_overflow = 0;
+bool timing_initialized = false;
+int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
+
+bool timing_tsc_enabled = false;
+int32 timing_tsc_frequency_khz = -1;
+
+static void set_ticks_per_ns(void);
+static void set_ticks_per_ns_system(void);
+
+#if PG_INSTR_TSC_CLOCK
+static bool tsc_use_by_default(void);
+static void set_ticks_per_ns_for_tsc(void);
+#endif
+
+/*
+ * Initializes timing infrastructure. Must be called before making any use
+ * of INSTR* macros.
+ */
+void
+pg_initialize_timing(void)
+{
+ if (timing_initialized)
+ return;
+
+ set_ticks_per_ns_system();
+ timing_initialized = true;
+}
+
+bool
+pg_set_timing_clock_source(TimingClockSourceType source)
+{
+ Assert(timing_initialized);
+
+#if PG_INSTR_TSC_CLOCK
+ pg_initialize_timing_tsc();
+
+ switch (source)
+ {
+ case TIMING_CLOCK_SOURCE_AUTO:
+ timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
+ break;
+ case TIMING_CLOCK_SOURCE_SYSTEM:
+ timing_tsc_enabled = false;
+ break;
+ case TIMING_CLOCK_SOURCE_TSC:
+ /* Tell caller TSC is not usable */
+ if (timing_tsc_frequency_khz <= 0)
+ return false;
+ timing_tsc_enabled = true;
+ break;
+ }
+#endif
+
+ set_ticks_per_ns();
+ timing_clock_source = source;
+ return true;
+}
+
+static void
+set_ticks_per_ns(void)
+{
+#if PG_INSTR_TSC_CLOCK
+ if (timing_tsc_enabled)
+ {
+ set_ticks_per_ns_for_tsc();
+ return;
+ }
+#endif
+ set_ticks_per_ns_system();
+}
+
+#ifndef WIN32
+
+static void
+set_ticks_per_ns_system(void)
+{
+ ticks_per_ns_scaled = 0;
+ max_ticks_no_overflow = 0;
+}
+
+#else /* WIN32 */
+
+/* GetTimerFrequency returns counts per second */
+static inline double
+GetTimerFrequency(void)
+{
+ LARGE_INTEGER f;
+
+ QueryPerformanceFrequency(&f);
+ return (double) f.QuadPart;
+}
+
+static void
+set_ticks_per_ns_system(void)
+{
+ ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
+ max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#endif /* WIN32 */
+
+/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */
+
+#if PG_INSTR_TSC_CLOCK
+
+static void tsc_detect_frequency(void);
+
+/*
+ * Initialize the TSC clock source by determining its usability and frequency.
+ *
+ * This can be called multiple times without causing repeated work, as
+ * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
+ * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
+ * set by restore_backend_variables.
+ */
+void
+pg_initialize_timing_tsc(void)
+{
+ if (timing_tsc_frequency_khz < 0)
+ tsc_detect_frequency();
+}
+
+static void
+set_ticks_per_ns_for_tsc(void)
+{
+ ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
+ max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+/*
+ * x86-64 TSC specific logic
+ */
+
+/*
+ * Detect the TSC frequency and whether RDTSCP is available on x86-64.
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ */
+static void
+tsc_detect_frequency(void)
+{
+ timing_tsc_frequency_khz = 0;
+
+ /* We require RDTSCP support and an invariant TSC, bail if not available */
+ if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT))
+ return;
+
+ /* Determine speed at which the TSC advances */
+ timing_tsc_frequency_khz = x86_tsc_frequency_khz();
+ if (timing_tsc_frequency_khz > 0)
+ return;
+
+ /*
+ * CPUID did not give us the TSC frequency. We can instead measure the
+ * frequency by comparing ticks against walltime in a calibration loop.
+ */
+ timing_tsc_frequency_khz = pg_tsc_calibrate_frequency();
+}
+
+/*
+ * Decides whether to use the TSC clock source if the user did not specify it
+ * one way or the other, and it is available (checked separately).
+ *
+ * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
+ * in 2021 to reflect the reliability of the TSC on Intel platforms, see
+ * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
+ * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
+ * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
+ * for reference.
+ *
+ * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
+ * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
+ * trustworthy by default, matching the Linux kernel.
+ *
+ * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
+ * an easy way to determine the TSC's reliability. If on Linux, we can check if
+ * TSC is the active clocksource, based on it having run the watchdog logic to
+ * monitor TSC correctness. For other platforms the user must explicitly enable
+ * it via GUC instead.
+ */
+static bool
+tsc_use_by_default(void)
+{
+ if (x86_feature_available(PG_TSC_ADJUST))
+ return true;
+
+#if defined(__linux__)
+ {
+ FILE *fp;
+ char buf[128];
+
+ fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+ if (fp)
+ {
+ bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
+ strcmp(buf, "tsc\n") == 0);
+
+ fclose(fp);
+ if (is_tsc)
+ return true;
+ }
+ }
+#endif
+
+ return false;
+}
+
+/*
+ * Calibrate the TSC frequency by comparing TSC ticks against walltime.
+ *
+ * Takes initial TSC and system clock snapshots, then loops, recomputing the
+ * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC
+ * ticks divided by elapsed time.
+ *
+ * Once the frequency estimate stabilizes (consecutive iterations agree), we
+ * consider it converged and the frequency in kHz is returned. If either too
+ * many iterations or a time limit passes without convergence, 0 is returned.
+ */
+#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
+#define TSC_CALIBRATION_ITERATIONS 1000000
+#define TSC_CALIBRATION_SKIPS 100
+#define TSC_CALIBRATION_STABLE_CYCLES 10
+
+uint32
+pg_tsc_calibrate_frequency(void)
+{
+ instr_time initial_wall;
+ int64 initial_tsc;
+ double freq_khz = 0;
+ double prev_freq_khz = 0;
+ int stable_count = 0;
+ int64 prev_tsc;
+ int saved_clock_source = timing_clock_source;
+
+ /*
+ * Frequency must be initialized to avoid recursion via
+ * pg_set_timing_clock_source
+ */
+ Assert(timing_tsc_frequency_khz >= 0);
+
+ /* Ensure INSTR_* calls below work on system time */
+ pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
+
+ INSTR_TIME_SET_CURRENT(initial_wall);
+
+ initial_tsc = pg_rdtscp();
+ prev_tsc = initial_tsc;
+
+ for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
+ {
+ instr_time now_wall;
+ int64 now_tsc;
+ int64 elapsed_ns;
+ int64 elapsed_ticks;
+
+ INSTR_TIME_SET_CURRENT(now_wall);
+
+ now_tsc = pg_rdtscp();
+
+ INSTR_TIME_SUBTRACT(now_wall, initial_wall);
+ elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
+
+ /* Safety: bail out if we've taken too long */
+ if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
+ break;
+
+ elapsed_ticks = now_tsc - initial_tsc;
+
+ /*
+ * Skip if TSC hasn't advanced, or we walked backwards for some
+ * reason.
+ */
+ if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
+ continue;
+
+ /*
+ * We only measure frequency every TSC_CALIBRATION_SKIPS iterations to
+ * avoid stabilizing based on just a handful of RDTSC instructions.
+ */
+ if (i % TSC_CALIBRATION_SKIPS != 0)
+ continue;
+
+ freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
+
+ /*
+ * Once freq_khz and prev_freq_khz nearly agree, check if it stays that
+ * way. If it does for long enough, we've got a winner frequency.
+ */
+ if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
+ {
+ stable_count++;
+ if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
+ break;
+ }
+ else
+ stable_count = 0;
+
+ prev_tsc = now_tsc;
+ prev_freq_khz = freq_khz;
+ }
+
+ /* Restore the previous clock source */
+ pg_set_timing_clock_source(saved_clock_source);
+
+ if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
+ return 0; /* did not converge */
+
+ return (uint32) freq_khz;
+}
+
+#elif defined(__aarch64__)
+
+/*
+ * Check whether this is a heterogeneous Apple Silicon P+E core system
+ * where CNTVCT_EL0 may tick at different rates on different core types.
+ */
+static bool
+aarch64_has_heterogeneous_cores(void)
+{
+#if defined(__APPLE__)
+ int nperflevels = 0;
+ size_t len = sizeof(nperflevels);
+
+ if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0)
+ return nperflevels > 1;
+#endif
+
+ return false;
+}
+
+/*
+ * Detect the generic timer frequency on AArch64.
+ */
+static void
+tsc_detect_frequency(void)
+{
+ if (aarch64_has_heterogeneous_cores())
+ {
+ timing_tsc_frequency_khz = 0;
+ return;
+ }
+
+ timing_tsc_frequency_khz = aarch64_cntvct_frequency_khz();
+}
+
+/*
+ * The ARM generic timer is architecturally guaranteed to be monotonic and
+ * synchronized across cores of the same type, so we always use it by default
+ * when available and cores are homogeneous.
+ */
+static bool
+tsc_use_by_default(void)
+{
+ return true;
+}
+
+uint32
+pg_tsc_calibrate_frequency(void)
+{
+ /* No calibration loop on AArch64; frequency comes from CNTFRQ_EL0 */
+ return 0;
+}
+
+#endif /* defined(__aarch64__) */
+
+#endif /* PG_INSTR_TSC_CLOCK */
diff --git a/src/common/meson.build b/src/common/meson.build
index 4f9b8b8263d55..9bd55cda95b10 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
'file_perm.c',
'file_utils.c',
'hashfn.c',
+ 'instr_time.c',
'ip.c',
'jsonapi.c',
'keywords.c',
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index 9f81893003c24..f331449ba78f6 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -91,6 +91,9 @@ extern PGDLLIMPORT int compute_query_id;
extern const char *CleanQuerytext(const char *query, int *location, int *len);
+extern LocationLen *ComputeConstantLengths(const JumbleState *jstate,
+ const char *query,
+ int query_loc);
extern JumbleState *JumbleQuery(Query *query);
extern void EnableQueryId(void);
diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h
index 92c1c502945ee..9da833e40e5ae 100644
--- a/src/include/parser/analyze.h
+++ b/src/include/parser/analyze.h
@@ -21,7 +21,7 @@
/* Hook for plugins to get control at end of parse analysis */
typedef void (*post_parse_analyze_hook_type) (ParseState *pstate,
Query *query,
- JumbleState *jstate);
+ const JumbleState *jstate);
extern PGDLLIMPORT post_parse_analyze_hook_type post_parse_analyze_hook;
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index c5d96bb4f479f..aee501a4ecdc4 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -32,8 +32,16 @@ typedef enum X86FeatureId
PG_AVX512_VL,
PG_AVX512_VPCLMULQDQ,
PG_AVX512_VPOPCNTDQ,
+
+ /* identification */
+ PG_HYPERVISOR,
+
+ /* Time-Stamp Counter (TSC) flags */
+ PG_RDTSCP,
+ PG_TSC_INVARIANT,
+ PG_TSC_ADJUST,
} X86FeatureId;
-#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
+#define X86FeaturesSize (PG_TSC_ADJUST + 1)
extern PGDLLIMPORT bool X86Features[];
@@ -48,6 +56,14 @@ x86_feature_available(X86FeatureId feature)
return X86Features[feature];
}
+extern uint32 x86_tsc_frequency_khz(void);
+
#endif /* defined(USE_SSE2) || defined(__i386__) */
+#if defined(__aarch64__)
+
+extern uint32 aarch64_cntvct_frequency_khz(void);
+
+#endif /* defined(__aarch64__) */
+
#endif /* PG_CPU_H */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 0a1fff7c487ae..dfebdfbf461c1 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,9 +4,11 @@
* portable high-precision interval timing
*
* This file provides an abstraction layer to hide portability issues in
- * interval timing. On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter(). These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on
+ * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or
+ * alternatively clock_gettime() on Unix-like systems and
+ * QueryPerformanceCounter() on Windows. These macros also give some breathing
+ * room to use other high-precision-timing APIs.
*
* The basic data type is instr_time, which all callers should treat as an
* opaque typedef. instr_time can store either an absolute time (of
@@ -17,7 +19,11 @@
*
* INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too)
*
- * INSTR_TIME_SET_CURRENT(t) set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting
+ * for instructions in out-of-order window
+ *
+ * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for
+ * instructions in OOO to retire
*
*
* INSTR_TIME_ADD(x, y) x += y
@@ -80,11 +86,108 @@ typedef struct instr_time
#define NS_PER_MS INT64CONST(1000000)
#define NS_PER_US INT64CONST(1000)
+/* Shift amount for fixed-point ticks-to-nanoseconds conversion. */
+#define TICKS_TO_NS_SHIFT 14
-#ifndef WIN32
+/*
+ * PG_INSTR_TICKS_TO_NS controls whether pg_ticks_to_ns/pg_ns_to_ticks needs to
+ * check ticks_per_ns_scaled and potentially convert ticks <=> nanoseconds.
+ *
+ * PG_INSTR_TSC_CLOCK controls whether the TSC clock source is compiled in, and
+ * potentially used based on timing_tsc_enabled.
+ */
+#if defined(__x86_64__) || defined(_M_X64) || (defined(__aarch64__) && !defined(_MSC_VER))
+#define PG_INSTR_TICKS_TO_NS 1
+#define PG_INSTR_TSC_CLOCK 1
+#elif defined(WIN32)
+#define PG_INSTR_TICKS_TO_NS 1
+#define PG_INSTR_TSC_CLOCK 0
+#else
+#define PG_INSTR_TICKS_TO_NS 0
+#define PG_INSTR_TSC_CLOCK 0
+#endif
+
+/*
+ * Variables used to translate ticks to nanoseconds, initialized by
+ * pg_initialize_timing and adjusted by pg_set_timing_clock_source calls or
+ * changes of the "timing_clock_source" GUC.
+ *
+ * Note that changing these values after setting an instr_time and before
+ * reading/converting it will lead to incorrect results. This is technically
+ * possible because the GUC can be changed at runtime, but unlikely, and we
+ * allow changing this at runtime to simplify testing of different sources.
+ */
+extern PGDLLIMPORT uint64 ticks_per_ns_scaled;
+extern PGDLLIMPORT uint64 max_ticks_no_overflow;
+extern PGDLLIMPORT bool timing_initialized;
+
+/*
+ * Clock sources selectable via pg_set_timing_clock_source() or the
+ * "timing_clock_source" GUC. TIMING_CLOCK_SOURCE_TSC only exists in builds
+ * where PG_INSTR_TSC_CLOCK is enabled.
+ */
+typedef enum
+{
+ TIMING_CLOCK_SOURCE_AUTO,
+ TIMING_CLOCK_SOURCE_SYSTEM,
+#if PG_INSTR_TSC_CLOCK
+ TIMING_CLOCK_SOURCE_TSC
+#endif
+} TimingClockSourceType;
+
+/* Requested clock source (a TimingClockSourceType value), stored as int */
+extern PGDLLIMPORT int timing_clock_source;
+
+/*
+ * Initialize timing infrastructure
+ *
+ * This must be called at least once before using INSTR_TIME_SET_CURRENT*
+ * macros.
+ *
+ * If you want to use the TSC clock source in a client program you must also
+ * call pg_set_timing_clock_source afterwards.
+ */
+extern void pg_initialize_timing(void);
+
+/*
+ * Sets the time source to be used. Mainly intended for frontend programs,
+ * the backend should set it via the timing_clock_source GUC instead.
+ *
+ * Returns false if the clock source could not be set, for example when TSC
+ * is not available despite being explicitly set.
+ */
+extern bool pg_set_timing_clock_source(TimingClockSourceType source);
+
+/* Whether to actually use TSC based on availability and GUC settings. */
+extern PGDLLIMPORT bool timing_tsc_enabled;
+
+/*
+ * TSC frequency in kHz, set during initialization.
+ *
+ * -1 = not yet initialized, 0 = TSC not usable, >0 = frequency in kHz.
+ */
+extern PGDLLIMPORT int32 timing_tsc_frequency_khz;
+
+#if PG_INSTR_TSC_CLOCK
+extern void pg_initialize_timing_tsc(void);
-/* Use clock_gettime() */
+extern uint32 pg_tsc_calibrate_frequency(void);
+
+#endif /* PG_INSTR_TSC_CLOCK */
+
+/*
+ * Returns the current timing clock source effectively in use, resolving
+ * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or
+ * TIMING_CLOCK_SOURCE_TSC.
+ */
+static inline TimingClockSourceType
+pg_current_timing_clock_source(void)
+{
+#if PG_INSTR_TSC_CLOCK
+ if (timing_tsc_enabled)
+ return TIMING_CLOCK_SOURCE_TSC;
+#endif
+ return TIMING_CLOCK_SOURCE_SYSTEM;
+}
+
+#ifndef WIN32
+
+/* On POSIX, use clock_gettime() for system clock source */
#include
@@ -99,76 +202,258 @@ typedef struct instr_time
* than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides
* CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than
* their version of CLOCK_MONOTONIC.
+ *
+ * Note this does not get used in case the TSC clock source logic is used,
+ * which directly calls architecture specific timing instructions (e.g. RDTSC).
*/
#if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
-#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)"
#elif defined(CLOCK_MONOTONIC)
-#define PG_INSTR_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)"
#else
-#define PG_INSTR_CLOCK CLOCK_REALTIME
+#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME
+#define PG_INSTR_SYSTEM_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)"
#endif
-/* helper for INSTR_TIME_SET_CURRENT */
static inline instr_time
-pg_clock_gettime_ns(void)
+pg_get_ticks_system(void)
{
instr_time now;
struct timespec tmp;
- clock_gettime(PG_INSTR_CLOCK, &tmp);
+ Assert(timing_initialized);
+
+ clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp);
now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec;
return now;
}
-#define INSTR_TIME_SET_CURRENT(t) \
- ((t) = pg_clock_gettime_ns())
-
-#define INSTR_TIME_GET_NANOSEC(t) \
- ((int64) (t).ticks)
-
-#define INSTR_TIME_ADD_NANOSEC(t, n) \
- ((t).ticks += (n))
-
-
#else /* WIN32 */
+/* On Windows, use QueryPerformanceCounter() for system clock source */
-/* Use QueryPerformanceCounter() */
-
-/* helper for INSTR_TIME_SET_CURRENT */
+#define PG_INSTR_SYSTEM_CLOCK_NAME "QueryPerformanceCounter"
static inline instr_time
-pg_query_performance_counter(void)
+pg_get_ticks_system(void)
{
instr_time now;
LARGE_INTEGER tmp;
+ Assert(timing_initialized);
+
QueryPerformanceCounter(&tmp);
now.ticks = tmp.QuadPart;
return now;
}
-static inline double
-GetTimerFrequency(void)
+#endif /* WIN32 */
+
+static inline int64
+pg_ticks_to_ns(int64 ticks)
{
- LARGE_INTEGER f;
+#if PG_INSTR_TICKS_TO_NS
+ int64 ns = 0;
+
+ Assert(timing_initialized);
+
+ /*
+ * Avoid doing work if we don't use scaled ticks, e.g. system clock on
+ * Unix (in that case ticks is counted in nanoseconds)
+ */
+ if (ticks_per_ns_scaled == 0)
+ return ticks;
+
+ /*
+ * Would multiplication overflow? If so perform computation in two parts.
+ */
+ if (unlikely(ticks > (int64) max_ticks_no_overflow))
+ {
+ /*
+ * To avoid overflow, first scale total ticks down by the fixed
+ * factor, and *afterwards* multiply them by the frequency-based scale
+ * factor.
+ *
+ * The remaining ticks can follow the regular formula, since they
+ * won't overflow.
+ */
+ int64 count = ticks >> TICKS_TO_NS_SHIFT;
+
+ ns = count * ticks_per_ns_scaled;
+ ticks -= (count << TICKS_TO_NS_SHIFT);
+ }
+
+ ns += (ticks * ticks_per_ns_scaled) >> TICKS_TO_NS_SHIFT;
+
+ return ns;
+#else
+ Assert(timing_initialized);
- QueryPerformanceFrequency(&f);
- return (double) f.QuadPart;
+ return ticks;
+#endif /* PG_INSTR_TICKS_TO_NS */
}
-#define INSTR_TIME_SET_CURRENT(t) \
- ((t) = pg_query_performance_counter())
+static inline int64
+pg_ns_to_ticks(int64 ns)
+{
+#if PG_INSTR_TICKS_TO_NS
+ int64 ticks = 0;
-#define INSTR_TIME_GET_NANOSEC(t) \
- ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+ Assert(timing_initialized);
-#define INSTR_TIME_ADD_NANOSEC(t, n) \
- ((t).ticks += ((n) / ((double) NS_PER_S / GetTimerFrequency())))
+ /*
+ * If ticks_per_ns_scaled is zero, ticks are already in nanoseconds (e.g.
+ * system clock on Unix).
+ */
+ if (ticks_per_ns_scaled == 0)
+ return ns;
-#endif /* WIN32 */
+ /*
+ * The reverse of pg_ticks_to_ns to avoid a similar overflow problem.
+ */
+ if (unlikely(ns > (INT64_MAX >> TICKS_TO_NS_SHIFT)))
+ {
+ int64 count = ns / ticks_per_ns_scaled;
+ ticks = count << TICKS_TO_NS_SHIFT;
+ ns -= count * ticks_per_ns_scaled;
+ }
+
+ ticks += (ns << TICKS_TO_NS_SHIFT) / ticks_per_ns_scaled;
+
+ return ticks;
+#else
+ Assert(timing_initialized);
+
+ return ns;
+#endif /* PG_INSTR_TICKS_TO_NS */
+}
+
+#if PG_INSTR_TSC_CLOCK
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+#define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC"
+#define PG_INSTR_TSC_CLOCK_NAME "RDTSCP"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif /* defined(_MSC_VER) */
+
+/* Helpers to abstract compiler differences for reading the x86 TSC. */
+static inline int64
+pg_rdtsc(void)
+{
+#ifdef _MSC_VER
+ return __rdtsc();
+#else
+ return __builtin_ia32_rdtsc();
+#endif /* defined(_MSC_VER) */
+}
+
+static inline int64
+pg_rdtscp(void)
+{
+ uint32 unused;
+
+#ifdef _MSC_VER
+ return __rdtscp(&unused);
+#else
+ return __builtin_ia32_rdtscp(&unused);
+#endif /* defined(_MSC_VER) */
+}
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks_fast(void)
+{
+ if (likely(timing_tsc_enabled))
+ {
+ instr_time now;
+
+ now.ticks = pg_rdtsc();
+ return now;
+ }
+
+ return pg_get_ticks_system();
+}
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks(void)
+{
+ if (likely(timing_tsc_enabled))
+ {
+ instr_time now;
+
+ now.ticks = pg_rdtscp();
+ return now;
+ }
+
+ return pg_get_ticks_system();
+}
+
+#elif defined(__aarch64__) && !defined(_MSC_VER)
+
+#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0"
+#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)"
+
+/*
+ * Read the ARM generic timer counter (CNTVCT_EL0).
+ *
+ * The "fast" variant reads the counter without a barrier, analogous to RDTSC
+ * on x86. The regular variant issues an ISB (Instruction Synchronization
+ * Barrier) first, which acts as a serializing instruction analogous to RDTSCP,
+ * ensuring all preceding instructions have completed before reading the
+ * counter.
+ */
+static pg_attribute_always_inline instr_time
+pg_get_ticks_fast(void)
+{
+ if (likely(timing_tsc_enabled))
+ {
+ instr_time now;
+
+ now.ticks = __builtin_arm_rsr64("cntvct_el0");
+ return now;
+ }
+
+ return pg_get_ticks_system();
+}
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks(void)
+{
+ if (likely(timing_tsc_enabled))
+ {
+ instr_time now;
+
+ __builtin_arm_isb(0xf);
+ now.ticks = __builtin_arm_rsr64("cntvct_el0");
+ return now;
+ }
+
+ return pg_get_ticks_system();
+}
+
+#endif /* defined(__aarch64__) */
+
+#else /* !PG_INSTR_TSC_CLOCK */
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks_fast(void)
+{
+ return pg_get_ticks_system();
+}
+
+static pg_attribute_always_inline instr_time
+pg_get_ticks(void)
+{
+ return pg_get_ticks_system();
+}
+
+#endif /* PG_INSTR_TSC_CLOCK */
/*
* Common macros
@@ -178,10 +463,19 @@ GetTimerFrequency(void)
#define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0)
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+ ((t) = pg_get_ticks_fast())
+
+#define INSTR_TIME_SET_CURRENT(t) \
+ ((t) = pg_get_ticks())
+
#define INSTR_TIME_ADD(x,y) \
((x).ticks += (y).ticks)
+#define INSTR_TIME_ADD_NANOSEC(t, n) \
+ ((t).ticks += pg_ns_to_ticks(n))
+
#define INSTR_TIME_SUBTRACT(x,y) \
((x).ticks -= (y).ticks)
@@ -191,6 +485,9 @@ GetTimerFrequency(void)
#define INSTR_TIME_GT(x,y) \
((x).ticks > (y).ticks)
+#define INSTR_TIME_GET_NANOSEC(t) \
+ (pg_ticks_to_ns((t).ticks))
+
#define INSTR_TIME_GET_DOUBLE(t) \
((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index b01697c1f606d..307f4fbaefe08 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -163,6 +163,9 @@ extern const char *show_timezone(void);
extern bool check_timezone_abbreviations(char **newval, void **extra,
GucSource source);
extern void assign_timezone_abbreviations(const char *newval, void *extra);
+extern void assign_timing_clock_source(int newval, void *extra);
+extern bool check_timing_clock_source(int *newval, void **extra, GucSource source);
+extern const char *show_timing_clock_source(void);
extern bool check_transaction_buffers(int *newval, void **extra, GucSource source);
extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source);
extern bool check_transaction_isolation(int *newval, void **extra, GucSource source);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 71a8016196138..63440b8e36c83 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -60,6 +60,7 @@ enum config_group
CONN_AUTH_TCP,
CONN_AUTH_AUTH,
CONN_AUTH_SSL,
+ RESOURCES_TIME,
RESOURCES_MEM,
RESOURCES_DISK,
RESOURCES_KERNEL,
diff --git a/src/port/meson.build b/src/port/meson.build
index 922b3f646768d..d695f92b769e1 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
'noblock.c',
'path.c',
'pg_bitutils.c',
+ 'pg_cpu_arm.c',
'pg_cpu_x86.c',
'pg_getopt_ctx.c',
'pg_localeconv_r.c',
diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c
new file mode 100644
index 0000000000000..2814a9477065d
--- /dev/null
+++ b/src/port/pg_cpu_arm.c
@@ -0,0 +1,45 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_cpu_arm.c
+ * Runtime CPU feature detection for AArch64
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_cpu_arm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#if defined(__aarch64__) && !defined(_MSC_VER)
+
+#include "port/pg_cpu.h"
+
+/*
+ * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz.
+ *
+ * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable
+ * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets
+ * this at boot and it does not change.
+ *
+ * Returns 0 if the frequency is not available (should not happen on conforming
+ * implementations).
+ */
+uint32
+aarch64_cntvct_frequency_khz(void)
+{
+ uint64 freq;
+
+ freq = __builtin_arm_rsr64("cntfrq_el0");
+
+ if (freq == 0)
+ return 0;
+
+ return (uint32) (freq / 1000);
+}
+
+#endif /* defined(__aarch64__) */
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 40ff78633ca3f..8951e7a0811ce 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -80,13 +80,13 @@ pg_cpuid(int leaf, unsigned int *reg)
static inline bool
pg_cpuid_subleaf(int leaf, int subleaf, unsigned int *reg)
{
+ memset(reg, 0, 4 * sizeof(unsigned int));
#if defined(HAVE__GET_CPUID_COUNT)
return __get_cpuid_count(leaf, subleaf, &reg[EAX], &reg[EBX], &reg[ECX], &reg[EDX]) == 1;
#elif defined(HAVE__CPUIDEX)
__cpuidex((int *) reg, leaf, subleaf);
return true;
#else
- memset(reg, 0, 4 * sizeof(unsigned int));
return false;
#endif
}
@@ -101,19 +101,24 @@ void
set_x86_features(void)
{
unsigned int reg[4] = {0};
+ bool have_osxsave;
pg_cpuid(0x01, reg);
X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1;
X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1;
+ X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1;
+ have_osxsave = reg[ECX] >> 27 & 1;
+
+ pg_cpuid_subleaf(0x07, 0, reg);
+
+ X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1;
/* leaf 7 features that depend on OSXSAVE */
- if (reg[ECX] & (1 << 27))
+ if (have_osxsave)
{
uint32 xcr0_val = 0;
- pg_cpuid_subleaf(0x07, 0, reg);
-
#ifdef HAVE_XSAVE_INTRINSICS
/* get value of Extended Control Register */
xcr0_val = _xgetbv(0);
@@ -135,7 +140,126 @@ set_x86_features(void)
}
}
+ /* Check for other TSC related flags */
+ pg_cpuid(0x80000001, reg);
+ X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1;
+
+ pg_cpuid(0x80000007, reg);
+ X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1;
+
X86Features[INIT_PG_X86] = true;
}
+/* TSC (Time-stamp Counter) handling code */
+
+static uint32 x86_hypervisor_tsc_frequency_khz(void);
+
+/*
+ * Determine the TSC frequency of the CPU through CPUID, where supported.
+ *
+ * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of
+ * 0 indicates the frequency information was not accessible via CPUID.
+ */
+uint32
+x86_tsc_frequency_khz(void)
+{
+ unsigned int reg[4] = {0};
+
+ if (x86_feature_available(PG_HYPERVISOR))
+ {
+ uint32 freq = x86_hypervisor_tsc_frequency_khz();
+
+ if (freq > 0)
+ return freq;
+ }
+
+ /*
+ * On modern Intel CPUs, the TSC is implemented by invariant timekeeping
+ * hardware, also called "Always Running Timer", or ART. The ART stays
+ * consistent even if the CPU changes frequency due to changing power
+ * levels.
+ *
+ * As documented in "Determining the Processor Base Frequency" in the
+ * "Intel® 64 and IA-32 Architectures Software Developer's Manual",
+ * February 2026 Edition, we can get the TSC frequency as follows:
+ *
+ * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) /
+ * CPUID.15H:EAX[31:0]
+ *
+ * With CPUID.15H:ECX representing the nominal core crystal clock
+ * frequency, and EAX/EBX representing values used to translate the TSC
+ * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of
+ * that manual.
+ *
+ * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as
+ * such we fall back to alternate approaches.
+ */
+ pg_cpuid(0x15, reg);
+ if (reg[ECX] > 0)
+ {
+ /*
+ * EBX not being set indicates invariant TSC is not available. Require
+ * EAX being non-zero too, to avoid a theoretical divide by zero.
+ */
+ if (reg[EAX] == 0 || reg[EBX] == 0)
+ return 0;
+
+ return reg[ECX] / 1000 * reg[EBX] / reg[EAX];
+ }
+
+ /*
+ * When CPUID.15H is not available/incomplete, we can instead try to get
+ * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor
+ * Frequency Information Leaf".
+ */
+ pg_cpuid(0x16, reg);
+ if (reg[EAX] > 0)
+ return reg[EAX] * 1000;
+
+ return 0;
+}
+
+/*
+ * Support for reading TSC frequency for hypervisors passing it to a guest VM.
+ *
+ * Two Hypervisors (VMware and KVM) are known to make TSC frequency in kHz
+ * available at the vendor-specific 0x40000010 leaf in the EAX register.
+ *
+ * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would
+ * need to access a model-specific register (MSR) to get the frequency. MSRs are
+ * separate from CPUID and typically not available for unprivileged processes,
+ * so we can't get the frequency this way.
+ */
+#define CPUID_HYPERVISOR_VMWARE(r) (r[EBX] == 0x61774d56 && r[ECX] == 0x4d566572 && r[EDX] == 0x65726177) /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(r) (r[EBX] == 0x4b4d564b && r[ECX] == 0x564b4d56 && r[EDX] == 0x0000004d) /* KVMKVMKVM */
+static uint32
+x86_hypervisor_tsc_frequency_khz(void)
+{
+ unsigned int reg[4] = {0};
+
+#if defined(HAVE__CPUIDEX)
+
+ /*
+ * The hypervisor is determined using the 0x40000000 Hypervisor
+ * information leaf, which requires use of __cpuidex to set ECX to 0 to
+ * access it.
+ *
+ * The similar __get_cpuid_count function does not work as expected since
+ * it contains a check for __get_cpuid_max, which has been observed to be
+ * lower than the special Hypervisor leaf, despite it being available.
+ */
+ __cpuidex((int *) reg, 0x40000000, 0);
+
+ if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg)))
+ {
+ __cpuidex((int *) reg, 0x40000010, 0);
+ if (reg[EAX] > 0)
+ return reg[EAX];
+ }
+#endif /* HAVE__CPUIDEX */
+
+ return 0;
+}
+
+
#endif /* defined(USE_SSE2) || defined(__i386__) */
diff --git a/src/test/modules/test_misc/t/011_lock_stats.pl b/src/test/modules/test_misc/t/011_lock_stats.pl
index 7662db160173a..45d7d26f70ccb 100644
--- a/src/test/modules/test_misc/t/011_lock_stats.pl
+++ b/src/test/modules/test_misc/t/011_lock_stats.pl
@@ -36,9 +36,9 @@ sub setup_sessions
$s2 = $node->background_psql('postgres');
# Setup injection points for the waiting session
- $s2->query_safe(
- q[
- SELECT injection_points_set_local();
+ $s2->query_until(
+ qr/attaching_injection_point/, q[
+ \echo attaching_injection_point
SELECT injection_points_attach('deadlock-timeout-fired', 'wait');
]);
}
@@ -64,10 +64,11 @@ sub wait_and_detach
my ($node, $point_name) = @_;
$node->wait_for_event('client backend', $point_name);
- $node->safe_psql('postgres',
- "SELECT injection_points_detach('$point_name');");
- $node->safe_psql('postgres',
- "SELECT injection_points_wakeup('$point_name');");
+ $node->safe_psql(
+ 'postgres', qq[
+SELECT injection_points_detach('$point_name');
+SELECT injection_points_wakeup('$point_name');
+]);
}
# Node initialization
diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out
index cf55cdf3688d9..c3261bff209fb 100644
--- a/src/test/regress/expected/misc_functions.out
+++ b/src/test/regress/expected/misc_functions.out
@@ -850,3 +850,14 @@ SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats();
RESET ROLE;
DROP ROLE regress_multixact_funcs;
+-- test instr_time nanosecond<->ticks conversion
+CREATE FUNCTION test_instr_time()
+ RETURNS bool
+ AS :'regresslib'
+ LANGUAGE C;
+SELECT test_instr_time();
+ test_instr_time
+-----------------
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index c6ba2479413c8..37070c1a89639 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -99,7 +99,8 @@ CREATE STATISTICS tst (ndistinct) ON (y + z) FROM ext_stats_test;
ERROR: cannot specify statistics kinds when building univariate statistics
-- multivariate statistics without a less-than operator not supported
CREATE STATISTICS tst (ndistinct) ON x, w from ext_stats_test;
-ERROR: column "w" cannot be used in multivariate statistics because its type xid has no default btree operator class
+ERROR: cannot create multivariate statistics on column "w"
+DETAIL: The type xid has no default btree operator class.
DROP TABLE ext_stats_test;
-- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index 9a918156437b2..0c0620569829b 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -2181,6 +2181,8 @@ regression_main(int argc, char *argv[],
progname = get_progname(argv[0]);
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress"));
+ pg_initialize_timing();
+
get_restricted_token();
atexit(stop_postmaster);
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 68a01a1dde014..c2eaa96f08605 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -38,6 +38,7 @@
#include "optimizer/plancat.h"
#include "parser/parse_coerce.h"
#include "port/atomics.h"
+#include "portability/instr_time.h"
#include "postmaster/postmaster.h" /* for MAX_BACKENDS */
#include "storage/spin.h"
#include "tcop/tcopprot.h"
@@ -1384,3 +1385,38 @@ test_translation(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+
+/* Verify that pg_ticks_to_ns behaves correctly, including overflow */
+PG_FUNCTION_INFO_V1(test_instr_time);
+Datum
+test_instr_time(PG_FUNCTION_ARGS)
+{
+ instr_time t;
+ int64 test_ns[] = {0, 1000, INT64CONST(1000000000000000)};
+ int64 max_err;
+
+ /*
+ * The ns-to-ticks-to-ns roundtrip may lose precision due to integer
+ * truncation in the fixed-point conversion. The maximum error depends on
+ * ticks_per_ns_scaled relative to the shift factor.
+ */
+ max_err = (ticks_per_ns_scaled >> TICKS_TO_NS_SHIFT) + 1;
+
+ for (int i = 0; i < lengthof(test_ns); i++)
+ {
+ int64 result;
+
+ INSTR_TIME_SET_ZERO(t);
+ INSTR_TIME_ADD_NANOSEC(t, test_ns[i]);
+ result = INSTR_TIME_GET_NANOSEC(t);
+
+ if (result < test_ns[i] - max_err || result > test_ns[i])
+ elog(ERROR,
+ "INSTR_TIME_GET_NANOSEC(t) yielded " INT64_FORMAT
+ ", expected " INT64_FORMAT " (max_err " INT64_FORMAT
+ ") in file \"%s\" line %u",
+ result, test_ns[i], max_err, __FILE__, __LINE__);
+ }
+
+ PG_RETURN_BOOL(true);
+}
diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql
index c8226652f2c94..946ee5726cdd7 100644
--- a/src/test/regress/sql/misc_functions.sql
+++ b/src/test/regress/sql/misc_functions.sql
@@ -349,3 +349,10 @@ SET ROLE regress_multixact_funcs;
SELECT oldest_multixact IS NULL AS null_result FROM pg_get_multixact_stats();
RESET ROLE;
DROP ROLE regress_multixact_funcs;
+
+-- test instr_time nanosecond<->ticks conversion
+CREATE FUNCTION test_instr_time()
+ RETURNS bool
+ AS :'regresslib'
+ LANGUAGE C;
+SELECT test_instr_time();
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9e6a39f560833..07ac380cf976a 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3185,6 +3185,7 @@ TimeoutId
TimeoutType
Timestamp
TimestampTz
+TimingClockSourceType
TmFromChar
TmToChar
ToastAttrInfo