diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index e8c99c3773dc..ad8d06692efe 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3429,8 +3429,8 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
 			INSTR_TIME_SET_CURRENT(currenttime);
 			elapsed = currenttime;
 			INSTR_TIME_SUBTRACT(elapsed, starttime);
-			if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
-				>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
+			if (INSTR_TIME_GET_MILLISEC(elapsed) >=
+				VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
 			{
 				if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
 				{
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 9e11c662a7c1..3940b59d70c1 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
 void
 InstrStartNode(Instrumentation *instr)
 {
-	if (instr->need_timer &&
-		!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
-		elog(ERROR, "InstrStartNode called twice in a row");
+	if (instr->need_timer)
+	{
+		if (!INSTR_TIME_IS_ZERO(instr->starttime))
+			elog(ERROR, "InstrStartNode called twice in a row");
+		else
+			INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
+	}
 
 	/* save buffer usage totals at node entry, if needed */
 	if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 	if (INSTR_TIME_IS_ZERO(instr->starttime))
 		elog(ERROR, "InstrStopNode called without start");
 
-	INSTR_TIME_SET_CURRENT(endtime);
+	INSTR_TIME_SET_CURRENT_FAST(endtime);
 	INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
 	INSTR_TIME_SET_ZERO(instr->starttime);
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 4ed69ac7ba29..952e710babae 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -806,6 +806,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	/* Initialize portal manager */
 	EnablePortalManager();
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	/*
 	 * Load relcache entries for the shared system catalogs.  This must create
 	 * at least entries for pg_database and catalogs used for authentication.
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index a5621251afce..62e308dd9653 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -30,7 +30,7 @@ static long long int largest_diff_count;
 
 static void handle_args(int argc, char *argv[]);
-static uint64 test_timing(unsigned int duration);
+static uint64 test_timing(unsigned int duration, bool fast_timing);
 static void output(uint64 loop_count);
 
 int
@@ -43,10 +43,22 @@ main(int argc, char *argv[])
 	handle_args(argc, argv);
 
-	loop_count = test_timing(test_duration);
-
+	/*
+	 * First, test the default (non-fast) timing code.  A clock source for
+	 * that is always available, so we can unconditionally output the result.
+	 */
+	loop_count = test_timing(test_duration, false);
 	output(loop_count);
 
+	/*
+	 * Second, test the fast timing code.  This clock source is not always
+	 * available; in that case the loop count will be 0 and we don't print
+	 * the results.
+	 */
+	printf("\n");
+	loop_count = test_timing(test_duration, true);
+	if (loop_count > 0)
+		output(loop_count);
+
 	return 0;
 }
@@ -78,7 +90,7 @@ handle_args(int argc, char *argv[])
 		}
 	}
 
-	while ((option = getopt_long(argc, argv, "d:c:",
+	while ((option = getopt_long(argc, argv, "d:c:f:",
 								 long_options, &optindex)) != -1)
 	{
 		switch (option)
@@ -143,23 +155,44 @@ handle_args(int argc, char *argv[])
 		exit(1);
 	}
 
-	printf(ngettext("Testing timing overhead for %u second.\n",
-					"Testing timing overhead for %u seconds.\n",
+	printf(ngettext("Testing timing overhead for %u second.\n\n",
+					"Testing timing overhead for %u seconds.\n\n",
 					test_duration),
 		   test_duration);
 }
 
 static uint64
-test_timing(unsigned int duration)
+test_timing(unsigned int duration, bool fast_timing)
 {
 	uint64		total_time;
 	int64		time_elapsed = 0;
 	uint64		loop_count = 0;
-	uint64		prev,
-				cur;
 	instr_time	start_time,
 				end_time,
-				temp;
+				prev,
+				cur;
+	char	   *time_source = NULL;
+	bool		fast_timing_available = false;
+
+	INSTR_TIME_INITIALIZE();
+
+#if !defined(WIN32) && defined(__x86_64__) && defined(__linux__)
+	if (fast_timing && has_rdtsc)
+	{
+		time_source = "RDTSC";
+		fast_timing_available = true;
+	}
+	else if (has_rdtscp)
+		time_source = "RDTSCP";
+	else
+		time_source = PG_INSTR_CLOCK_NAME;
+#else
+	time_source = PG_INSTR_CLOCK_NAME;
+#endif
+
+	if (fast_timing && !fast_timing_available)
+		return 0;
+
+	printf(_("Time source: %s\n"), time_source);
 
 	/*
 	 * Pre-zero the statistics data structures.  They're already zero by
@@ -173,8 +206,11 @@ test_timing(unsigned int duration)
 
 	total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
 
-	INSTR_TIME_SET_CURRENT(start_time);
-	cur = INSTR_TIME_GET_NANOSEC(start_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(start_time);
+	else
+		INSTR_TIME_SET_CURRENT(start_time);
+	cur = start_time;
 
 	while (time_elapsed < total_time)
 	{
@@ -182,9 +218,11 @@ test_timing(unsigned int duration)
 				bits;
 
 		prev = cur;
-		INSTR_TIME_SET_CURRENT(temp);
-		cur = INSTR_TIME_GET_NANOSEC(temp);
-		diff = cur - prev;
+		if (fast_timing)
+			INSTR_TIME_SET_CURRENT_FAST(cur);
+		else
+			INSTR_TIME_SET_CURRENT(cur);
+		diff = INSTR_TIME_DIFF_NANOSEC(cur, prev);
 
 		/* Did time go backwards? */
 		if (unlikely(diff < 0))
@@ -217,11 +255,13 @@ test_timing(unsigned int duration)
 			largest_diff_count++;
 
 		loop_count++;
-		INSTR_TIME_SUBTRACT(temp, start_time);
-		time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
+		time_elapsed = INSTR_TIME_DIFF_NANOSEC(cur, start_time);
 	}
 
-	INSTR_TIME_SET_CURRENT(end_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(end_time);
+	else
+		INSTR_TIME_SET_CURRENT(end_time);
 
 	INSTR_TIME_SUBTRACT(end_time, start_time);
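For context, the pg_test_timing measurement loop above reduces to sampling the clock back to back and converting tick deltas to nanoseconds only when needed. A minimal sketch of that pattern using the macros this patch introduces (variable names here are illustrative, not the patch's):

    instr_time  start, cur, prev;
    int64       elapsed_ns = 0;
    int64       duration_ns = INT64CONST(3000000000);   /* e.g. 3 seconds */

    INSTR_TIME_INITIALIZE();            /* detect TSC and set up scaling */
    INSTR_TIME_SET_CURRENT_FAST(start);
    cur = start;
    while (elapsed_ns < duration_ns)
    {
        int64       diff;

        prev = cur;
        INSTR_TIME_SET_CURRENT_FAST(cur);
        diff = INSTR_TIME_DIFF_NANOSEC(cur, prev);      /* per-sample overhead */
        elapsed_ns = INSTR_TIME_DIFF_NANOSEC(cur, start);
    }

Note that instr_time values now stay in the tick domain; the conversion to nanoseconds happens only at the two INSTR_TIME_DIFF_NANOSEC() calls.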
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 45b340d3da57..f500b9de2b07 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -7334,6 +7334,9 @@ main(int argc, char **argv)
 		initRandomState(&state[i].cs_func_rs);
 	}
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	/* opening connection... */
 	con = doConnect();
 	if (con == NULL)
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 249b6aa51690..d615df593c70 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
 #include "help.h"
 #include "input.h"
 #include "mainloop.h"
+#include "portability/instr_time.h"
 #include "settings.h"
 
 /*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
 
 	PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	SyncVariables();
 
 	if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 2c720caa5097..1a2fbbe887f2 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
 	file_perm.o \
 	file_utils.o \
 	hashfn.o \
+	instr_time.o \
 	ip.o \
 	jsonapi.o \
 	keywords.o \
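All three frontend call sites follow the same rule: run INSTR_TIME_INITIALIZE() once at startup, before the first timing macro executes. A self-contained sketch of a hypothetical frontend tool using the API the same way (do_work() is a stand-in, not part of the patch):

    #include "postgres_fe.h"
    #include "portability/instr_time.h"

    int
    main(int argc, char *argv[])
    {
        instr_time  t0, t1;

        INSTR_TIME_INITIALIZE();    /* cpuid probing on Linux/x86, no-op on Windows */
        INSTR_TIME_SET_CURRENT(t0);
        do_work();                  /* hypothetical workload */
        INSTR_TIME_SET_CURRENT(t1);
        INSTR_TIME_SUBTRACT(t1, t0);
        printf("elapsed: %.3f ms\n", INSTR_TIME_GET_MILLISEC(t1));
        return 0;
    }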
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 000000000000..fdf47699f207
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *	  Non-inline parts of the portable high-precision interval timing
+ *	  implementation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#if defined(HAVE__GET_CPUID) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
+#include <cpuid.h>
+#endif
+
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
+#include <intrin.h>
+#endif
+
+#include "portability/instr_time.h"
+
+#ifndef WIN32
+/*
+ * Stores the multiplier needed to convert a number of ticks (cycles) to
+ * nanoseconds using integer math.  See the comment in pg_initialize_rdtsc()
+ * for more details.
+ *
+ * By default assume we are using clock_gettime() as a fallback, which uses
+ * nanoseconds as ticks.  Hence, we set the multiplier to the precision
+ * scalar so that the division in INSTR_TIME_GET_NANOSEC() won't change the
+ * nanoseconds.
+ *
+ * When using the RDTSC instruction directly, this is filled in during
+ * initialization based on the relevant CPUID fields.
+ */
+int64		ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
+int64		ticks_per_sec = NS_PER_S;
+int64		max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
+
+#if defined(__x86_64__) && defined(__linux__)
+/*
+ * Indicates if RDTSC can be used (Linux/x86 only, when the OS uses the TSC
+ * clocksource).
+ */
+bool		has_rdtsc = false;
+
+/*
+ * Indicates if RDTSCP can be used.  True if RDTSC can be used and RDTSCP is
+ * available.
+ */
+bool		has_rdtscp = false;
+
+#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d)	/* KVMKVMKVM */
+
+static bool
+get_tsc_frequency_khz(uint32 *tsc_freq)
+{
+	uint32		r[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x15);
+#else
+#error cpuid instruction not available
+#endif
+
+	if (r[2] > 0)
+	{
+		if (r[0] == 0 || r[1] == 0)
+			return false;
+
+		*tsc_freq = r[2] / 1000 * r[1] / r[0];
+		return true;
+	}
+
+	/* Some CPUs only report the frequency in leaf 16H */
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x16);
+#else
+#error cpuid instruction not available
+#endif
+
+	if (r[0] > 0)
+	{
+		*tsc_freq = r[0] * 1000;
+		return true;
+	}
+
+	/*
+	 * Check if we have a KVM or VMware hypervisor passing down the TSC
+	 * frequency to us in a guest VM.
+	 *
+	 * Note that accessing the 0x40000000 leaf for hypervisor info requires
+	 * use of __cpuidex to set ECX to 0.  The similar __get_cpuid_count
+	 * function does not work as expected since it contains a check for
+	 * __get_cpuid_max, which has been observed to be lower than the special
+	 * hypervisor leaf.
+	 */
+#if defined(HAVE__CPUIDEX)
+	__cpuidex((int32 *) r, 0x40000000, 0);
+	if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
+	{
+		__cpuidex((int32 *) r, 0x40000010, 0);
+		if (r[0] > 0)
+		{
+			*tsc_freq = r[0];
+			return true;
+		}
+	}
+#endif
+
+	return false;
+}
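To make the leaf 0x15 arithmetic concrete: EAX (r[0]) is the crystal-to-TSC ratio denominator, EBX (r[1]) the numerator, and ECX (r[2]) the core crystal frequency in Hz. With hypothetical values of a 24 MHz crystal and a 200/2 ratio, the function computes, in kHz:

    uint32      tsc_freq = 24000000 / 1000 * 200 / 2;   /* = 2400000 kHz, i.e. 2.4 GHz */

These particular numbers are illustrative; real CPUs report their own crystal frequency and ratio.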
+
+static bool
+is_rdtscp_available(void)
+{
+	uint32		r[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	if (!__get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]))
+		return false;
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x80000001);
+#else
+#error cpuid instruction not available
+#endif
+
+	/* RDTSCP support is indicated by bit 27 of EDX in leaf 80000001H */
+	return (r[3] & (1 << 27)) != 0;
+}
+
+/*
+ * Decide at runtime whether we use the RDTSC instruction on Linux/x86,
+ * instead of incurring the overhead of a full clock_gettime() call.
+ *
+ * This can't be reliably determined at compile time, since the availability
+ * of an "invariant" TSC (one that is not affected by CPU frequency changes)
+ * depends on the CPU architecture.  Additionally, there are cases where TSC
+ * availability is impacted by virtualization, where a simple cpuid feature
+ * check would not be enough.
+ *
+ * Since Linux already does a significant amount of work to determine
+ * whether TSC is a viable clock source, decide based on that.
+ */
+void
+pg_initialize_rdtsc(void)
+{
+	FILE	   *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+
+	if (fp)
+	{
+		char		buf[128];
+
+		if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
+		{
+			/*
+			 * Determine the baseline CPU frequency, which is the rate at
+			 * which RDTSC advances.
+			 */
+			uint32		tsc_freq;
+
+			if (get_tsc_frequency_khz(&tsc_freq))
+			{
+				/*
+				 * Converting ticks to nanoseconds requires a fractional
+				 * scaling factor:
+				 *
+				 *   sec = ticks / frequency_hz
+				 *   ns  = ticks / frequency_hz * 1,000,000,000
+				 *   ns  = ticks * (1,000,000,000 / frequency_hz)
+				 *   ns  = ticks * (1,000,000 / frequency_khz)   <-- now in kilohertz
+				 *
+				 * Here, the scaling factor is usually a fractional number.
+				 * For example, for a 2.5 GHz CPU it becomes 1,000,000 /
+				 * 2,500,000 = 0.4.
+				 *
+				 * To be able to use integer math, we work around the lack of
+				 * precision: we first scale the integer up, and after the
+				 * multiplication by the number of ticks in
+				 * INSTR_TIME_GET_NANOSEC() we divide again by the same
+				 * value.  We picked the scalar such that it provides enough
+				 * precision and is a power of two, which allows for shifting
+				 * instead of an integer division.
+				 */
+				ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq;
+				ticks_per_sec = tsc_freq * 1000;	/* kHz->Hz */
+				max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+
+				has_rdtsc = true;
+				has_rdtscp = is_rdtscp_available();
+			}
+		}
+
+		fclose(fp);
+	}
+}
+#endif							/* defined(__x86_64__) && defined(__linux__) */
+
+#endif							/* WIN32 */
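Plugging the comment's 2.5 GHz example into these assignments shows the fixed-point representation at work. With tsc_freq = 2,500,000 kHz and TICKS_TO_NS_PRECISION = 1<<14 = 16384:

    ticks_per_ns_scaled   = 1000000 * 16384 / 2500000;  /* = 6553 (exact: 6553.6) */
    ticks_per_sec         = 2500000 * 1000;             /* 2.5e9 ticks per second */
    max_ticks_no_overflow = PG_INT64_MAX / 6553;        /* ~1.4e15 ticks */

A tick then converts as ticks * 6553 / 16384, about 0.39996 ns against the exact 0.4 ns per tick, a systematic error of roughly 0.01% bounded by the 14-bit scale.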
diff --git a/src/common/meson.build b/src/common/meson.build
index 1540ba67cca4..62b90b3e609c 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
   'file_perm.c',
   'file_utils.c',
   'hashfn.c',
+  'instr_time.c',
   'ip.c',
   'jsonapi.c',
   'keywords.c',
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index f71a851b18d8..f02296f1026b 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,9 +4,11 @@
  * portable high-precision interval timing
  *
  * This file provides an abstraction layer to hide portability issues in
- * interval timing.  On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter().  These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing.  On Linux/x86 we use the rdtsc instruction when a TSC
+ * clocksource is also used on the host OS.  Otherwise, and on other
+ * Unix-like systems, we use clock_gettime(), and on Windows we use
+ * QueryPerformanceCounter().  These macros also give some breathing room
+ * to use other high-precision-timing APIs.
  *
  * The basic data type is instr_time, which all callers should treat as an
  * opaque typedef.  instr_time can store either an absolute time (of
@@ -17,10 +19,11 @@
  *
  * INSTR_TIME_SET_ZERO(t)			set t to zero (memset is acceptable too)
  *
- * INSTR_TIME_SET_CURRENT(t)		set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t)	set t to current time without waiting
+ *									for instructions in the out-of-order window
 *
- * INSTR_TIME_SET_CURRENT_LAZY(t)	set t to current time if t is zero,
- *									evaluates to whether t changed
+ * INSTR_TIME_SET_CURRENT(t)		set t to current time while waiting for
+ *									instructions in the out-of-order window to retire
 *
 * INSTR_TIME_ADD(x, y)				x += y
@@ -81,6 +84,15 @@ typedef struct instr_time
 
 #ifndef WIN32
 
+/*
+ * Make sure this is a power of two, so that the compiler can turn the
+ * multiplications and divisions into shifts.
+ */
+#define TICKS_TO_NS_PRECISION (1<<14)
+
+extern int64 ticks_per_ns_scaled;
+extern int64 ticks_per_sec;
+extern int64 max_ticks_no_overflow;
 
 /* Use clock_gettime() */
 
@@ -100,15 +112,27 @@ typedef struct instr_time
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
 #define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW
+#define PG_INSTR_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC_RAW)"
 #elif defined(CLOCK_MONOTONIC)
 #define PG_INSTR_CLOCK CLOCK_MONOTONIC
+#define PG_INSTR_CLOCK_NAME "clock_gettime (CLOCK_MONOTONIC)"
 #else
 #define PG_INSTR_CLOCK CLOCK_REALTIME
+#define PG_INSTR_CLOCK_NAME "clock_gettime (CLOCK_REALTIME)"
+#endif
+
+#if defined(__x86_64__) && defined(__linux__)
+#include <cpuid.h>
+#include <x86intrin.h>
+
+extern bool has_rdtsc;
+extern bool has_rdtscp;
+
+extern void pg_initialize_rdtsc(void);
 #endif
 
-/* helper for INSTR_TIME_SET_CURRENT */
 static inline instr_time
-pg_clock_gettime_ns(void)
+pg_clock_gettime(void)
 {
 	instr_time	now;
 	struct timespec tmp;
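The two extern flags map to the two instruction variants used below: RDTSC may execute ahead of earlier instructions still in flight, while RDTSCP waits for all prior instructions to complete before reading the counter, which makes it the safer default for interval endpoints. With the GCC/Clang intrinsics the difference is just:

    #include <x86intrin.h>

    uint64_t
    sample_fast(void)
    {
        return __rdtsc();           /* may be reordered into earlier work */
    }

    uint64_t
    sample_ordered(void)
    {
        uint32_t    aux;

        return __rdtscp(&aux);      /* waits for prior instructions to complete */
    }

(sample_fast/sample_ordered are illustrative names; the patch wraps these calls in pg_get_ticks_fast() and pg_get_ticks() below.)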
@@ -119,19 +143,102 @@
 	return now;
 }
 
-#define INSTR_TIME_SET_CURRENT(t) \
-	((t) = pg_clock_gettime_ns())
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+	if (has_rdtsc)
+	{
+		instr_time	now;
+
+		now.ticks = __rdtsc();
+		return now;
+	}
+#endif
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) (t).ticks)
+	return pg_clock_gettime();
+}
+
+static inline instr_time
+pg_get_ticks(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+	if (has_rdtscp)
+	{
+		instr_time	now;
+		uint32		unused;
+
+		now.ticks = __rdtscp(&unused);
+		return now;
+	}
+#endif
+
+	return pg_clock_gettime();
+}
+
+static inline int64
+pg_ticks_to_ns(int64 ticks)
+{
+	/*
+	 * Would the multiplication overflow?  If so, perform the computation in
+	 * two parts.  Check for overflow without actually overflowing via:
+	 * a * b > max <=> a > max / b
+	 */
+	int64		ns = 0;
+
+	if (unlikely(ticks > max_ticks_no_overflow))
+	{
+		/*
+		 * Compute how often the maximum number of ticks fits completely into
+		 * the number of elapsed ticks and convert that number into
+		 * nanoseconds.  Then multiply by the count to arrive at the final
+		 * value.  In a second step we adjust the number of elapsed ticks and
+		 * convert the remaining ticks.
+		 */
+		int64		count = ticks / max_ticks_no_overflow;
+		int64		max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+
+		ns = max_ns * count;
+
+		/*
+		 * Subtract the ticks that we now already accounted for, so that they
+		 * don't get counted twice.
+		 */
+		ticks -= count * max_ticks_no_overflow;
+		Assert(ticks >= 0);
+	}
+
+	ns += ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+	return ns;
+}
+
+static inline void
+pg_initialize_get_ticks(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+	pg_initialize_rdtsc();
+#endif
+}
+
+#define INSTR_TIME_INITIALIZE() \
+	pg_initialize_get_ticks()
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+	((t) = pg_get_ticks_fast())
+
+#define INSTR_TIME_SET_CURRENT(t) \
+	((t) = pg_get_ticks())
+
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	(pg_ticks_to_ns(ticks))
 
 #else							/* WIN32 */
 
 /* Use QueryPerformanceCounter() */
+#define PG_INSTR_CLOCK_NAME "QueryPerformanceCounter"
 
-/* helper for INSTR_TIME_SET_CURRENT */
+/* helper for INSTR_TIME_SET_CURRENT / INSTR_TIME_SET_CURRENT_FAST */
 static inline instr_time
 pg_query_performance_counter(void)
 {
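To see when the overflow guard in pg_ticks_to_ns() actually fires, take the 2.5 GHz numbers again: max_ticks_no_overflow is about 1.4e15 ticks, which is only ~5.6e5 seconds (about 6.5 days) of counting, and the TSC counts from boot, so absolute tick values routinely exceed it. A sketch of the split with assumed values:

    /* illustrative numbers at the 2.5 GHz scaling (6553/16384) */
    int64       ticks = INT64CONST(3000000000000000);   /* ~2 weeks of ticks */
    int64       count = ticks / max_ticks_no_overflow;  /* = 2 whole chunks */

    /* ns = count * max_ns + remaining_ticks * 6553 / 16384; no step overflows */

Each partial product stays below PG_INT64_MAX, which is exactly what the a > max / b check guarantees.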
@@ -153,11 +260,16 @@ GetTimerFrequency(void)
 	return (double) f.QuadPart;
 }
 
+#define INSTR_TIME_INITIALIZE()
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+	((t) = pg_query_performance_counter())
+
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_query_performance_counter())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	((int64) ((ticks) * ((double) NS_PER_S / GetTimerFrequency())))
 
 #endif							/* WIN32 */
 
@@ -168,22 +280,22 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_IS_ZERO(t)	((t).ticks == 0)
 
-
 #define INSTR_TIME_SET_ZERO(t)	((t).ticks = 0)
 
-#define INSTR_TIME_SET_CURRENT_LAZY(t) \
-	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
-
-
 #define INSTR_TIME_ADD(x,y) \
 	((x).ticks += (y).ticks)
 
 #define INSTR_TIME_SUBTRACT(x,y) \
 	((x).ticks -= (y).ticks)
 
+#define INSTR_TIME_DIFF_NANOSEC(x,y) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((x).ticks - (y).ticks))
+
 #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
 	((x).ticks += (y).ticks - (z).ticks)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((t).ticks))
+
 #define INSTR_TIME_GET_DOUBLE(t) \
 	((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
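End to end, the executor usage from the instrument.c hunks stays in the tick domain until reporting; a condensed sketch of that pattern:

    instr_time  total, start, stop;

    INSTR_TIME_SET_ZERO(total);

    INSTR_TIME_SET_CURRENT_FAST(start);         /* as in InstrStartNode */
    /* ... execute plan node ... */
    INSTR_TIME_SET_CURRENT_FAST(stop);          /* as in InstrStopNode */
    INSTR_TIME_ACCUM_DIFF(total, stop, start);  /* total += stop - start, in ticks */

    /* ticks are converted to wall time only when reporting */
    printf("%.6f s\n", INSTR_TIME_GET_DOUBLE(total));

Keeping the arithmetic in raw ticks is what makes the RDTSC path cheap: INSTR_TIME_ADD, SUBTRACT, and ACCUM_DIFF are plain integer operations, and the scaled division in pg_ticks_to_ns() runs once per report rather than once per sample.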