Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * instr_time.c
4 : : * Non-inline parts of the portable high-precision interval timing
5 : : * implementation
6 : : *
7 : : * Portions Copyright (c) 2026, PostgreSQL Global Development Group
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/common/instr_time.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #ifndef FRONTEND
16 : : #include "postgres.h"
17 : : #else
18 : : #include "postgres_fe.h"
19 : : #endif
20 : :
21 : : #include <math.h>
22 : :
23 : : #include "port/pg_cpu.h"
24 : : #include "portability/instr_time.h"
25 : :
26 : : /*
27 : : * Stores what the number of ticks needs to be multiplied with to end up
28 : : * with nanoseconds using integer math.
29 : : *
30 : : * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
31 : : * the ticks to nanoseconds conversion requires floating point math because:
32 : : *
33 : : * sec = ticks / frequency_hz
34 : : * ns = ticks / frequency_hz * 1,000,000,000
35 : : * ns = ticks * (1,000,000,000 / frequency_hz)
36 : : * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
37 : : *
38 : : * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU
39 : : * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4.
40 : : *
41 : : * To be able to use integer math we work around the lack of precision. We
42 : : * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
43 : : * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
44 : : * the same amount.
45 : : *
46 : : * We remember the maximum number of ticks that can be multiplied by the scale
47 : : * factor without overflowing so we can check via a * b > max <=> a > max / b.
48 : : *
49 : : * However, as this is meant for interval measurements, it is unlikely that the
50 : : * overflow path is actually taken in typical scenarios, since overflows would
51 : : * only occur for intervals longer than 6.5 days.
52 : : *
53 : : * Note we utilize unsigned integers even though ticks are stored as a signed
54 : : * value to encourage compilers to generate better assembly, since we can be
55 : : * sure these values are not negative.
56 : : *
57 : : * In all other cases we are using clock_gettime(), which uses nanoseconds
58 : : * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
59 : : * to return the original value.
60 : : */
61 : : uint64 ticks_per_ns_scaled = 0;
62 : : uint64 max_ticks_no_overflow = 0;
63 : : bool timing_initialized = false;
64 : : int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
65 : :
66 : : bool timing_tsc_enabled = false;
67 : : int32 timing_tsc_frequency_khz = -1;
68 : :
69 : : static void set_ticks_per_ns(void);
70 : : static void set_ticks_per_ns_system(void);
71 : :
72 : : #if PG_INSTR_TSC_CLOCK
73 : : static TscClockSourceInfo tsc_info = {.calibrated_frequency_khz = -1};
74 : :
75 : : static bool tsc_use_by_default(void);
76 : : static void set_ticks_per_ns_for_tsc(void);
77 : : #endif
78 : :
79 : : /*
80 : : * Initializes timing infrastructure. Must be called before making any use
81 : : * of INSTR* macros.
82 : : */
83 : : void
53 andres@anarazel.de 84 :GNC 35916 : pg_initialize_timing(void)
85 : : {
86 [ + + ]: 35916 : if (timing_initialized)
87 : 23510 : return;
88 : :
89 : 12406 : set_ticks_per_ns_system();
90 : 12406 : timing_initialized = true;
91 : : }
92 : :
93 : : bool
94 : 3882 : pg_set_timing_clock_source(TimingClockSourceType source)
95 : : {
96 [ - + ]: 3882 : Assert(timing_initialized);
97 : :
98 : : #if PG_INSTR_TSC_CLOCK
99 : 3882 : pg_initialize_timing_tsc();
100 : :
101 [ + + + - ]: 3882 : switch (source)
102 : : {
103 : 2586 : case TIMING_CLOCK_SOURCE_AUTO:
104 [ + + + - ]: 2586 : timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
105 : 2586 : break;
106 : 1294 : case TIMING_CLOCK_SOURCE_SYSTEM:
107 : 1294 : timing_tsc_enabled = false;
108 : 1294 : break;
109 : 2 : case TIMING_CLOCK_SOURCE_TSC:
110 : : /* Tell caller TSC is not usable */
111 [ - + ]: 2 : if (timing_tsc_frequency_khz <= 0)
53 andres@anarazel.de 112 :UNC 0 : return false;
53 andres@anarazel.de 113 :GNC 2 : timing_tsc_enabled = true;
114 : 2 : break;
115 : : }
116 : : #endif
117 : :
118 : 3882 : set_ticks_per_ns();
119 : 3882 : timing_clock_source = source;
120 : 3882 : return true;
121 : : }
122 : :
/*
 * Recompute the tick conversion factors for whichever clock is currently
 * selected: the TSC when timing_tsc_enabled is set (only in builds with
 * PG_INSTR_TSC_CLOCK), the system clock otherwise.
 */
static void
set_ticks_per_ns(void)
{
#if PG_INSTR_TSC_CLOCK
	if (timing_tsc_enabled)
		set_ticks_per_ns_for_tsc();
	else
#endif
		set_ticks_per_ns_system();
}
135 : :
136 : : #ifndef WIN32
137 : :
138 : : static void
139 : 14993 : set_ticks_per_ns_system(void)
140 : : {
141 : 14993 : ticks_per_ns_scaled = 0;
142 : 14993 : max_ticks_no_overflow = 0;
143 : 14993 : }
144 : :
145 : : #else /* WIN32 */
146 : :
147 : : /* GetTimerFrequency returns counts per second */
148 : : static inline double
149 : : GetTimerFrequency(void)
150 : : {
151 : : LARGE_INTEGER f;
152 : :
153 : : QueryPerformanceFrequency(&f);
154 : : return (double) f.QuadPart;
155 : : }
156 : :
157 : : static void
158 : : set_ticks_per_ns_system(void)
159 : : {
160 : : ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
161 : : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
162 : : }
163 : :
164 : : #endif /* WIN32 */
165 : :
166 : : /* TSC specific logic */
167 : :
168 : : #if PG_INSTR_TSC_CLOCK
169 : :
170 : : static void tsc_detect_frequency(void);
171 : : static uint32 pg_tsc_calibrate_frequency(void);
172 : :
173 : : /*
174 : : * Initialize the TSC clock source by determining its usability and frequency.
175 : : *
176 : : * This can be called multiple times without causing repeated work, as
177 : : * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
178 : : * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
179 : : * set by restore_backend_variables.
180 : : */
181 : : void
182 : 5174 : pg_initialize_timing_tsc(void)
183 : : {
184 [ + + ]: 5174 : if (timing_tsc_frequency_khz < 0)
185 : 1293 : tsc_detect_frequency();
186 : 5174 : }
187 : :
188 : : static void
189 : 1295 : set_ticks_per_ns_for_tsc(void)
190 : : {
191 : 1295 : ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
192 : 1295 : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
193 : 1295 : }
194 : :
195 : : /*
196 : : * Detect the TSC frequency and whether RDTSCP is available on x86-64.
197 : : *
198 : : * This can't be reliably determined at compile time, since the
199 : : * availability of an "invariant" TSC (that is not affected by CPU
200 : : * frequency changes) is dependent on the CPU architecture. Additionally,
201 : : * there are cases where TSC availability is impacted by virtualization,
202 : : * where a simple cpuid feature check would not be enough.
203 : : */
204 : : static void
205 : 1293 : tsc_detect_frequency(void)
206 : : {
207 : 1293 : timing_tsc_frequency_khz = 0;
14 208 : 1293 : tsc_info.frequency_khz = 0;
209 : 1293 : tsc_info.frequency_source[0] = '\0';
210 : :
211 : 1293 : strlcat(tsc_info.frequency_source, "x86",
212 : : sizeof(tsc_info.frequency_source));
213 : :
214 : : /* We require RDTSCP support and an invariant TSC, bail if not available */
215 [ - + ]: 1293 : if (!x86_feature_available(PG_RDTSCP))
216 : : {
14 andres@anarazel.de 217 :UNC 0 : strlcat(tsc_info.frequency_source, ", no rdtscp",
218 : : sizeof(tsc_info.frequency_source));
53 219 : 0 : return;
220 : : }
221 : :
14 andres@anarazel.de 222 [ - + ]:GNC 1293 : if (!x86_feature_available(PG_TSC_INVARIANT))
223 : : {
14 andres@anarazel.de 224 :UNC 0 : strlcat(tsc_info.frequency_source, ", not invariant",
225 : : sizeof(tsc_info.frequency_source));
226 : 0 : return;
227 : : }
228 : :
229 : : /* Determine speed at which the TSC advances */
14 andres@anarazel.de 230 :GNC 1293 : timing_tsc_frequency_khz = x86_tsc_frequency_khz(tsc_info.frequency_source,
231 : : sizeof(tsc_info.frequency_source));
53 232 [ - + ]: 1293 : if (timing_tsc_frequency_khz > 0)
233 : : {
14 andres@anarazel.de 234 :UNC 0 : tsc_info.frequency_khz = timing_tsc_frequency_khz;
53 235 : 0 : return;
236 : : }
237 : :
238 : : /*
239 : : * CPUID did not give us the TSC frequency. We can instead measure the
240 : : * frequency by comparing ticks against walltime in a calibration loop.
241 : : */
14 andres@anarazel.de 242 [ + - ]:GNC 1293 : if (tsc_info.calibrated_frequency_khz < 0)
243 : 1293 : tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
244 : :
245 : 1293 : timing_tsc_frequency_khz = tsc_info.calibrated_frequency_khz;
246 [ + - ]: 1293 : if (timing_tsc_frequency_khz > 0)
247 : : {
248 : 1293 : strlcat(tsc_info.frequency_source, ", calibration",
249 : : sizeof(tsc_info.frequency_source));
250 : 1293 : tsc_info.frequency_khz = timing_tsc_frequency_khz;
251 : : }
252 : : }
253 : :
254 : : /*
255 : : * Decides whether to use the TSC clock source if the user did not specify it
256 : : * one way or the other, and it is available (checked separately).
257 : : *
258 : : * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
259 : : * in 2021 to reflect the reliability of the TSC on Intel platforms, see
260 : : * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
261 : : * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
262 : : * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
263 : : * for reference.
264 : : *
265 : : * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
266 : : * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
267 : : * trustworthy by default, matching the Linux kernel.
268 : : *
269 : : * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
270 : : * an easy way to determine the TSC's reliability. If on Linux, we can check if
271 : : * TSC is the active clocksource, based on it having run the watchdog logic to
272 : : * monitor TSC correctness. For other platforms the user must explicitly enable
273 : : * it via GUC instead.
274 : : */
275 : : static bool
53 276 : 1293 : tsc_use_by_default(void)
277 : : {
278 [ - + ]: 1293 : if (x86_feature_available(PG_TSC_ADJUST))
53 andres@anarazel.de 279 :UNC 0 : return true;
280 : :
281 : : #if defined(__linux__)
282 : : {
283 : : FILE *fp;
284 : : char buf[128];
285 : :
53 andres@anarazel.de 286 :GNC 1293 : fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
287 [ + - ]: 1293 : if (fp)
288 : : {
289 [ + - ]: 2586 : bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
290 [ + - ]: 1293 : strcmp(buf, "tsc\n") == 0);
291 : :
292 : 1293 : fclose(fp);
293 [ + - ]: 1293 : if (is_tsc)
294 : 1293 : return true;
295 : : }
296 : : }
297 : : #endif
298 : :
53 andres@anarazel.de 299 :UNC 0 : return false;
300 : : }
301 : :
302 : : /*
303 : : * Calibrate the TSC frequency by comparing TSC ticks against walltime.
304 : : *
305 : : * Takes initial TSC and system clock snapshots, then loops, recomputing the
306 : : * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC
307 : : * ticks divided by elapsed time.
308 : : *
309 : : * Once the frequency estimate stabilizes (consecutive iterations agree), we
310 : : * consider it converged and the frequency in KHz is returned. If either too
311 : : * many iterations or a time limit passes without convergence, 0 is returned.
312 : : */
313 : : #define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
314 : : #define TSC_CALIBRATION_ITERATIONS 1000000
315 : : #define TSC_CALIBRATION_SKIPS 100
316 : : #define TSC_CALIBRATION_STABLE_CYCLES 10
317 : : static uint32
53 andres@anarazel.de 318 :GNC 1293 : pg_tsc_calibrate_frequency(void)
319 : : {
320 : : instr_time initial_wall;
321 : : int64 initial_tsc;
322 : 1293 : double freq_khz = 0;
323 : 1293 : double prev_freq_khz = 0;
324 : 1293 : int stable_count = 0;
325 : : int64 prev_tsc;
326 : 1293 : int saved_clock_source = timing_clock_source;
327 : :
328 : : /*
329 : : * Frequency must be initialized to avoid recursion via
330 : : * pg_set_timing_clock_source.
331 : : */
332 [ - + ]: 1293 : Assert(timing_tsc_frequency_khz >= 0);
333 : :
334 : : /* Ensure INSTR_* calls below work on system time */
335 : 1293 : pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
336 : :
337 : 1293 : INSTR_TIME_SET_CURRENT(initial_wall);
338 : :
339 : 1293 : initial_tsc = pg_rdtscp();
340 : 1293 : prev_tsc = initial_tsc;
341 : :
342 [ + - ]: 3272693 : for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
343 : : {
344 : : instr_time now_wall;
345 : : int64 now_tsc;
346 : : int64 elapsed_ns;
347 : : int64 elapsed_ticks;
348 : :
349 : 3272693 : INSTR_TIME_SET_CURRENT(now_wall);
350 : :
351 : 3272693 : now_tsc = pg_rdtscp();
352 : :
353 : 3272693 : INSTR_TIME_SUBTRACT(now_wall, initial_wall);
354 : 3272693 : elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
355 : :
356 : : /* Safety: bail out if we've taken too long */
357 [ - + ]: 3272693 : if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
358 : 1293 : break;
359 : :
360 : 3272693 : elapsed_ticks = now_tsc - initial_tsc;
361 : :
362 : : /*
363 : : * Skip if TSC hasn't advanced, or we walked backwards for some
364 : : * reason.
365 : : */
366 [ + - + - : 3272693 : if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
- + ]
367 : 3238686 : continue;
368 : :
369 : : /*
370 : : * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
371 : : * stabilizing based on just a handful of RDTSC instructions.
372 : : */
373 [ + + ]: 3272693 : if (i % TSC_CALIBRATION_SKIPS != 0)
374 : 3238686 : continue;
375 : :
376 : 34007 : freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
377 : :
378 : : /*
379 : : * Once freq_khz / prev_freq_khz is small, check if it stays that way.
380 : : * If it does for long enough, we've got a winner frequency.
381 : : */
382 [ + + + + ]: 34007 : if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
383 : : {
384 : 19740 : stable_count++;
385 [ + + ]: 19740 : if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
386 : 1293 : break;
387 : : }
388 : : else
389 : 14267 : stable_count = 0;
390 : :
391 : 32714 : prev_tsc = now_tsc;
392 : 32714 : prev_freq_khz = freq_khz;
393 : : }
394 : :
395 : : /* Restore the previous clock source */
396 : 1293 : pg_set_timing_clock_source(saved_clock_source);
397 : :
398 [ - + ]: 1293 : if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
53 andres@anarazel.de 399 :UNC 0 : return 0; /* did not converge */
400 : :
53 andres@anarazel.de 401 :GNC 1293 : return (uint32) freq_khz;
402 : : }
403 : :
404 : : /*
405 : : * Returns TSC clock source information for diagnostic purposes.
406 : : *
407 : : * On first call, may run the TSC calibration loop (if not already done during
408 : : * frequency detection) which can take up to TSC_CALIBRATION_MAX_NS.
409 : : * Subsequent calls return cached results.
410 : : *
411 : : * Note: This won't return the right info in EXEC_BACKEND builds if this were
412 : : * used in the backend (which it currently is not), as tsc_info is not copied
413 : : * using read_backend_variables - only the TSC frequency is.
414 : : */
415 : : const TscClockSourceInfo *
14 416 : 1 : pg_timing_tsc_clock_source_info(void)
417 : : {
418 [ + - - + ]: 1 : if (tsc_info.frequency_khz > 0 && tsc_info.calibrated_frequency_khz < 0)
14 andres@anarazel.de 419 :UNC 0 : tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
420 : :
14 andres@anarazel.de 421 :GNC 1 : return &tsc_info;
422 : : }
423 : :
424 : : #endif /* PG_INSTR_TSC_CLOCK */
|