Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * instr_time.c
4 : : * Non-inline parts of the portable high-precision interval timing
5 : : * implementation
6 : : *
7 : : * Portions Copyright (c) 2026, PostgreSQL Global Development Group
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/common/instr_time.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #ifndef FRONTEND
16 : : #include "postgres.h"
17 : : #else
18 : : #include "postgres_fe.h"
19 : : #endif
20 : :
21 : : #include <math.h>
22 : :
23 : : #include "port/pg_cpu.h"
24 : : #include "portability/instr_time.h"
25 : :
26 : : /*
27 : : * Stores what the number of ticks needs to be multiplied with to end up
28 : : * with nanoseconds using integer math.
29 : : *
30 : : * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
31 : : * the ticks to nanoseconds conversion requires floating point math because:
32 : : *
33 : : * sec = ticks / frequency_hz
34 : : * ns = ticks / frequency_hz * 1,000,000,000
35 : : * ns = ticks * (1,000,000,000 / frequency_hz)
36 : : * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
37 : : *
38 : : * Here, the scale factor (1,000,000 / frequency_khz) is usually not a whole
39 : : * number. For example, for a 2.5 GHz CPU it becomes 1,000,000 / 2,500,000 = 0.4.
40 : : *
41 : : * To be able to use integer math we work around the lack of precision. We
42 : : * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
43 : : * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
44 : : * the same amount.
45 : : *
46 : : * We remember the maximum number of ticks that can be multiplied by the scale
47 : : * factor without overflowing so we can check via a * b > max <=> a > max / b.
48 : : *
49 : : * However, as this is meant for interval measurements, it is unlikely that the
50 : : * overflow path is actually taken in typical scenarios, since overflows would
51 : : * only occur for intervals longer than 6.5 days.
52 : : *
53 : : * Note we utilize unsigned integers even though ticks are stored as a signed
54 : : * value to encourage compilers to generate better assembly, since we can be
55 : : * sure these values are not negative.
56 : : *
57 : : * In all other cases we are using clock_gettime(), which uses nanoseconds
58 : : * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
59 : : * to return the original value.
60 : : */
61 : : uint64 ticks_per_ns_scaled = 0;
62 : : uint64 max_ticks_no_overflow = 0;
63 : : bool timing_initialized = false;
64 : : int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
65 : :
66 : : bool timing_tsc_enabled = false;
67 : : int32 timing_tsc_frequency_khz = -1;
68 : :
69 : : static void set_ticks_per_ns(void);
70 : : static void set_ticks_per_ns_system(void);
71 : :
72 : : #if PG_INSTR_TSC_CLOCK
73 : : static bool tsc_use_by_default(void);
74 : : static void set_ticks_per_ns_for_tsc(void);
75 : : #endif
76 : :
77 : : /*
78 : : * Initializes timing infrastructure. Must be called before making any use
79 : : * of INSTR* macros.
80 : : */
81 : : void
28 andres@anarazel.de 82 :GNC 35669 : pg_initialize_timing(void)
83 : : {
84 [ + + ]: 35669 : if (timing_initialized)
85 : 23364 : return;
86 : :
87 : 12305 : set_ticks_per_ns_system();
88 : 12305 : timing_initialized = true;
89 : : }
90 : :
91 : : bool
92 : 3866 : pg_set_timing_clock_source(TimingClockSourceType source)
93 : : {
94 [ - + ]: 3866 : Assert(timing_initialized);
95 : :
96 : : #if PG_INSTR_TSC_CLOCK
97 : 3866 : pg_initialize_timing_tsc();
98 : :
99 [ + + + - ]: 3866 : switch (source)
100 : : {
101 : 2574 : case TIMING_CLOCK_SOURCE_AUTO:
102 [ + + + - ]: 2574 : timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
103 : 2574 : break;
104 : 1289 : case TIMING_CLOCK_SOURCE_SYSTEM:
105 : 1289 : timing_tsc_enabled = false;
106 : 1289 : break;
107 : 3 : case TIMING_CLOCK_SOURCE_TSC:
108 : : /* Tell caller TSC is not usable */
109 [ - + ]: 3 : if (timing_tsc_frequency_khz <= 0)
28 andres@anarazel.de 110 :UNC 0 : return false;
28 andres@anarazel.de 111 :GNC 3 : timing_tsc_enabled = true;
112 : 3 : break;
113 : : }
114 : : #endif
115 : :
116 : 3866 : set_ticks_per_ns();
117 : 3866 : timing_clock_source = source;
118 : 3866 : return true;
119 : : }
120 : :
/*
 * Recomputes the tick conversion factors for whichever clock is active:
 * the TSC when timing_tsc_enabled is set (TSC-capable builds only), the
 * system clock otherwise.
 */
static void
set_ticks_per_ns(void)
{
#if PG_INSTR_TSC_CLOCK
	if (!timing_tsc_enabled)
		set_ticks_per_ns_system();
	else
		set_ticks_per_ns_for_tsc();
#else
	set_ticks_per_ns_system();
#endif
}
133 : :
134 : : #ifndef WIN32
135 : :
136 : : static void
137 : 14881 : set_ticks_per_ns_system(void)
138 : : {
139 : 14881 : ticks_per_ns_scaled = 0;
140 : 14881 : max_ticks_no_overflow = 0;
141 : 14881 : }
142 : :
143 : : #else /* WIN32 */
144 : :
145 : : /* GetTimerFrequency returns counts per second */
146 : : static inline double
147 : : GetTimerFrequency(void)
148 : : {
149 : : LARGE_INTEGER f;
150 : :
151 : : QueryPerformanceFrequency(&f);
152 : : return (double) f.QuadPart;
153 : : }
154 : :
155 : : static void
156 : : set_ticks_per_ns_system(void)
157 : : {
158 : : ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
159 : : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
160 : : }
161 : :
162 : : #endif /* WIN32 */
163 : :
164 : : /* TSC specific logic */
165 : :
166 : : #if PG_INSTR_TSC_CLOCK
167 : :
168 : : static void tsc_detect_frequency(void);
169 : :
170 : : /*
171 : : * Initialize the TSC clock source by determining its usability and frequency.
172 : : *
173 : : * This can be called multiple times without causing repeated work, as
174 : : * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
175 : : * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
176 : : * set by restore_backend_variables.
177 : : */
178 : : void
179 : 5152 : pg_initialize_timing_tsc(void)
180 : : {
181 [ + + ]: 5152 : if (timing_tsc_frequency_khz < 0)
182 : 1287 : tsc_detect_frequency();
183 : 5152 : }
184 : :
185 : : static void
186 : 1290 : set_ticks_per_ns_for_tsc(void)
187 : : {
188 : 1290 : ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
189 : 1290 : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
190 : 1290 : }
191 : :
192 : : /*
193 : : * Detect the TSC frequency and whether RDTSCP is available on x86-64.
194 : : *
195 : : * This can't be reliably determined at compile time, since the
196 : : * availability of an "invariant" TSC (that is not affected by CPU
197 : : * frequency changes) is dependent on the CPU architecture. Additionally,
198 : : * there are cases where TSC availability is impacted by virtualization,
199 : : * where a simple cpuid feature check would not be enough.
200 : : */
201 : : static void
202 : 1287 : tsc_detect_frequency(void)
203 : : {
204 : 1287 : timing_tsc_frequency_khz = 0;
205 : :
206 : : /* We require RDTSCP support and an invariant TSC, bail if not available */
207 [ + - - + ]: 1287 : if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT))
28 andres@anarazel.de 208 :UNC 0 : return;
209 : :
210 : : /* Determine speed at which the TSC advances */
28 andres@anarazel.de 211 :GNC 1287 : timing_tsc_frequency_khz = x86_tsc_frequency_khz();
212 [ - + ]: 1287 : if (timing_tsc_frequency_khz > 0)
28 andres@anarazel.de 213 :UNC 0 : return;
214 : :
215 : : /*
216 : : * CPUID did not give us the TSC frequency. We can instead measure the
217 : : * frequency by comparing ticks against walltime in a calibration loop.
218 : : */
28 andres@anarazel.de 219 :GNC 1287 : timing_tsc_frequency_khz = pg_tsc_calibrate_frequency();
220 : : }
221 : :
222 : : /*
223 : : * Decides whether to use the TSC clock source if the user did not specify it
224 : : * one way or the other, and it is available (checked separately).
225 : : *
226 : : * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
227 : : * in 2021 to reflect the reliability of the TSC on Intel platforms, see
228 : : * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
229 : : * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
230 : : * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
231 : : * for reference.
232 : : *
233 : : * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
234 : : * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
235 : : * trustworthy by default, matching the Linux kernel.
236 : : *
237 : : * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
238 : : * an easy way to determine the TSC's reliability. If on Linux, we can check if
239 : : * TSC is the active clocksource, based on it having run the watchdog logic to
240 : : * monitor TSC correctness. For other platforms the user must explicitly enable
241 : : * it via GUC instead.
242 : : */
243 : : static bool
244 : 1287 : tsc_use_by_default(void)
245 : : {
246 [ - + ]: 1287 : if (x86_feature_available(PG_TSC_ADJUST))
28 andres@anarazel.de 247 :UNC 0 : return true;
248 : :
249 : : #if defined(__linux__)
250 : : {
251 : : FILE *fp;
252 : : char buf[128];
253 : :
28 andres@anarazel.de 254 :GNC 1287 : fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
255 [ + - ]: 1287 : if (fp)
256 : : {
257 [ + - ]: 2574 : bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
258 [ + - ]: 1287 : strcmp(buf, "tsc\n") == 0);
259 : :
260 : 1287 : fclose(fp);
261 [ + - ]: 1287 : if (is_tsc)
262 : 1287 : return true;
263 : : }
264 : : }
265 : : #endif
266 : :
28 andres@anarazel.de 267 :UNC 0 : return false;
268 : : }
269 : :
270 : : /*
271 : : * Calibrate the TSC frequency by comparing TSC ticks against walltime.
272 : : *
273 : : * Takes initial TSC and system clock snapshots, then loops, recomputing the
274 : : * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC
275 : : * ticks divided by elapsed time.
276 : : *
277 : : * Once the frequency estimate stabilizes (consecutive iterations agree), we
278 : : * consider it converged and the frequency in KHz is returned. If either too
279 : : * many iterations or a time limit passes without convergence, 0 is returned.
280 : : */
281 : : #define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
282 : : #define TSC_CALIBRATION_ITERATIONS 1000000
283 : : #define TSC_CALIBRATION_SKIPS 100
284 : : #define TSC_CALIBRATION_STABLE_CYCLES 10
285 : : uint32
28 andres@anarazel.de 286 :GNC 1288 : pg_tsc_calibrate_frequency(void)
287 : : {
288 : : instr_time initial_wall;
289 : : int64 initial_tsc;
290 : 1288 : double freq_khz = 0;
291 : 1288 : double prev_freq_khz = 0;
292 : 1288 : int stable_count = 0;
293 : : int64 prev_tsc;
294 : 1288 : int saved_clock_source = timing_clock_source;
295 : :
296 : : /*
297 : : * Frequency must be initialized to avoid recursion via
298 : : * pg_set_timing_clock_source.
299 : : */
300 [ - + ]: 1288 : Assert(timing_tsc_frequency_khz >= 0);
301 : :
302 : : /* Ensure INSTR_* calls below work on system time */
303 : 1288 : pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
304 : :
305 : 1288 : INSTR_TIME_SET_CURRENT(initial_wall);
306 : :
307 : 1288 : initial_tsc = pg_rdtscp();
308 : 1288 : prev_tsc = initial_tsc;
309 : :
310 [ + - ]: 3143388 : for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
311 : : {
312 : : instr_time now_wall;
313 : : int64 now_tsc;
314 : : int64 elapsed_ns;
315 : : int64 elapsed_ticks;
316 : :
317 : 3143388 : INSTR_TIME_SET_CURRENT(now_wall);
318 : :
319 : 3143388 : now_tsc = pg_rdtscp();
320 : :
321 : 3143388 : INSTR_TIME_SUBTRACT(now_wall, initial_wall);
322 : 3143388 : elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
323 : :
324 : : /* Safety: bail out if we've taken too long */
325 [ - + ]: 3143388 : if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
326 : 1288 : break;
327 : :
328 : 3143388 : elapsed_ticks = now_tsc - initial_tsc;
329 : :
330 : : /*
331 : : * Skip if TSC hasn't advanced, or we walked backwards for some
332 : : * reason.
333 : : */
334 [ + - + - : 3143388 : if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
- + ]
335 : 3110679 : continue;
336 : :
337 : : /*
338 : : * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
339 : : * stabilizing based on just a handful of RDTSC instructions.
340 : : */
341 [ + + ]: 3143388 : if (i % TSC_CALIBRATION_SKIPS != 0)
342 : 3110679 : continue;
343 : :
344 : 32709 : freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
345 : :
346 : : /*
347 : : * Once freq_khz / prev_freq_khz is small, check if it stays that way.
348 : : * If it does for long enough, we've got a winner frequency.
349 : : */
350 [ + + + + ]: 32709 : if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
351 : : {
352 : 18866 : stable_count++;
353 [ + + ]: 18866 : if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
354 : 1288 : break;
355 : : }
356 : : else
357 : 13843 : stable_count = 0;
358 : :
359 : 31421 : prev_tsc = now_tsc;
360 : 31421 : prev_freq_khz = freq_khz;
361 : : }
362 : :
363 : : /* Restore the previous clock source */
364 : 1288 : pg_set_timing_clock_source(saved_clock_source);
365 : :
366 [ - + ]: 1288 : if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
28 andres@anarazel.de 367 :UNC 0 : return 0; /* did not converge */
368 : :
28 andres@anarazel.de 369 :GNC 1288 : return (uint32) freq_khz;
370 : : }
371 : :
372 : : #endif /* PG_INSTR_TSC_CLOCK */
|