/*-------------------------------------------------------------------------
 *
 * freelist.c
 *    routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"

#define INT_ACCESS_ONCE(var)    ((int) (*((volatile int *) &(var))))

/*
 * The shared freelist control information.
 */
typedef struct
{
    /* Spinlock: protects the values below */
    slock_t     buffer_strategy_lock;

    /*
     * clock-sweep hand: index of next buffer to consider grabbing. Note that
     * this isn't a concrete buffer - we only ever increase the value. So, to
     * get an actual buffer, it needs to be used modulo NBuffers.
     */
    pg_atomic_uint32 nextVictimBuffer;

    /*
     * Statistics.  These counters should be wide enough that they can't
     * overflow during a single bgwriter cycle.
     */
    uint32      completePasses; /* Complete cycles of the clock-sweep */
    pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */

    /*
     * Bgworker process to be notified upon activity or -1 if none. See
     * StrategyNotifyBgWriter.
     */
    int         bgwprocno;
} BufferStrategyControl;

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
    /* Overall strategy type */
    BufferAccessStrategyType btype;
    /* Number of elements in buffers[] array */
    int         nbuffers;

    /*
     * Index of the "current" slot in the ring, ie, the one most recently
     * returned by GetBufferFromRing.
     */
    int         current;

    /*
     * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
     * have not yet selected a buffer for this ring slot.  For allocation
     * simplicity this is palloc'd together with the fixed fields of the
     * struct.
     */
    Buffer      buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;

/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
                                     uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
                            BufferDesc *buf);

/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 */
static inline uint32
ClockSweepTick(void)
{
    uint32      victim;

    /*
     * Atomically move hand ahead one buffer - if several processes do this
     * concurrently, buffers can be returned slightly out of apparent order.
     */
    victim =
        pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

    if (victim >= NBuffers)
    {
        uint32      originalVictim = victim;

        /* always wrap what we look up in BufferDescriptors */
        victim = victim % NBuffers;

        /*
         * If we're the one that just caused a wraparound, force
         * completePasses to be incremented while holding the spinlock. We
         * need the spinlock so StrategySyncStart() can return a consistent
         * value consisting of nextVictimBuffer and completePasses.
         */
        if (victim == 0)
        {
            uint32      expected;
            uint32      wrapped;
            bool        success = false;

            expected = originalVictim + 1;

            while (!success)
            {
                /*
                 * Acquire the spinlock while increasing completePasses. That
                 * allows other readers to read nextVictimBuffer and
                 * completePasses in a consistent manner which is required for
                 * StrategySyncStart().  In theory delaying the increment
                 * could lead to an overflow of nextVictimBuffer, but that's
                 * highly unlikely and wouldn't be particularly harmful.
                 */
                SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

                wrapped = expected % NBuffers;

                success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
                                                         &expected, wrapped);
                if (success)
                    StrategyControl->completePasses++;
                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
            }
        }
    }
    return victim;
}
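
/*
 * Worked example of the wraparound handling above (illustrative numbers):
 * suppose NBuffers = 128 and nextVictimBuffer has reached 127.  The next
 * tick's fetch-add returns 127 and hands out buffer 127; the tick after that
 * returns 128, which maps to buffer 128 % 128 = 0, so that caller is the one
 * that caused the wraparound.  It then CASes nextVictimBuffer from 129
 * (originalVictim + 1) down to 129 % 128 = 1 and bumps completePasses under
 * the spinlock.  If other backends advanced the hand in the meantime, the
 * CAS fails and is retried with the freshly read value, so no ticks are
 * lost.
 */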

/*
 * StrategyGetBuffer
 *
 *  Called by the bufmgr to get the next candidate buffer to use in
 *  GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
 *  the selected buffer must not currently be pinned by anyone.
 *
 *  strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *  It is the caller's responsibility to ensure the buffer ownership can be
 *  tracked via TrackNewBufferPin().
 *
 *  The buffer is pinned and marked as owned, using TrackNewBufferPin(),
 *  before returning.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
{
    BufferDesc *buf;
    int         bgwprocno;
    int         trycounter;

    *from_ring = false;

    /*
     * If given a strategy object, see whether it can select a buffer. We
     * assume strategy objects don't need buffer_strategy_lock.
     */
    if (strategy != NULL)
    {
        buf = GetBufferFromRing(strategy, buf_state);
        if (buf != NULL)
        {
            *from_ring = true;
            return buf;
        }
    }

    /*
     * If asked, we need to waken the bgwriter. Since we don't want to rely on
     * a spinlock for this we force a read from shared memory once, and then
     * set the latch based on that value. We need to go to this length because
     * otherwise bgwprocno might be reset while/after we check, since the
     * compiler might just reread it from memory.
     *
     * This can possibly set the latch of the wrong process if the bgwriter
     * dies at the wrong moment. But since PGPROC->procLatch is never
     * deallocated the worst consequence of that is that we set the latch of
     * some arbitrary process.
     */
    bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
    if (bgwprocno != -1)
    {
        /* reset bgwprocno first, before setting the latch */
        StrategyControl->bgwprocno = -1;

        /*
         * Not acquiring ProcArrayLock here which is slightly icky. It's
         * actually fine because procLatch isn't ever freed, so we just can
         * potentially set the wrong process' (or no process') latch.
         */
        SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
    }

    /*
     * We count buffer allocation requests so that the bgwriter can estimate
     * the rate of buffer consumption.  Note that buffers recycled by a
     * strategy object are intentionally not counted here.
     */
    pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

    /* Use the "clock sweep" algorithm to find a free buffer */
    trycounter = NBuffers;
    for (;;)
    {
        uint32      old_buf_state;
        uint32      local_buf_state;

        buf = GetBufferDescriptor(ClockSweepTick());

        /*
         * Check whether the buffer can be used and pin it if so.  Do this
         * using a CAS loop, to avoid having to lock the buffer header.
         */
        old_buf_state = pg_atomic_read_u32(&buf->state);
        for (;;)
        {
            local_buf_state = old_buf_state;

            /*
             * If the buffer is pinned or has a nonzero usage_count, we cannot
             * use it; decrement the usage_count (unless pinned) and keep
             * scanning.
             */
            if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
            {
                if (--trycounter == 0)
                {
                    /*
                     * We've scanned all the buffers without making any state
                     * changes, so all the buffers are pinned (or were when we
                     * looked at them).  We could hope that someone will free
                     * one eventually, but it's probably better to fail than
                     * to risk getting stuck in an infinite loop.
                     */
                    elog(ERROR, "no unpinned buffers available");
                }
                break;
            }

            if (unlikely(local_buf_state & BM_LOCKED))
            {
                old_buf_state = WaitBufHdrUnlocked(buf);
                continue;
            }

            if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
            {
                local_buf_state -= BUF_USAGECOUNT_ONE;

                if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                                   local_buf_state))
                {
                    trycounter = NBuffers;
                    break;
                }
            }
            else
            {
                /* pin the buffer if the CAS succeeds */
                local_buf_state += BUF_REFCOUNT_ONE;

                if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                                   local_buf_state))
                {
                    /* Found a usable buffer */
                    if (strategy != NULL)
                        AddBufferToRing(strategy, buf);
                    *buf_state = local_buf_state;

                    TrackNewBufferPin(BufferDescriptorGetBuffer(buf));

                    return buf;
                }
            }
        }
    }
}
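
/*
 * Worked example of the clock sweep above (illustrative numbers): an
 * unpinned buffer whose usage_count is 3 is skipped by the hand three
 * times, its usage_count dropping 3 -> 2 -> 1 -> 0, and only on the fourth
 * visit does the CAS that adds BUF_REFCOUNT_ONE make it the victim.  Each
 * successful usage_count decrement also resets trycounter to NBuffers, so
 * the "no unpinned buffers available" error is reached only after a full
 * rotation in which every buffer examined was pinned and no state change
 * was made.
 */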

/*
 * StrategySyncStart -- tell BgBufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BgBufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
    uint32      nextVictimBuffer;
    int         result;

    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
    nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
    result = nextVictimBuffer % NBuffers;

    if (complete_passes)
    {
        *complete_passes = StrategyControl->completePasses;

        /*
         * Additionally add the number of wraparounds that happened before
         * completePasses could be incremented.  C.f. ClockSweepTick().
         */
        *complete_passes += nextVictimBuffer / NBuffers;
    }

    if (num_buf_alloc)
    {
        *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
    }
    SpinLockRelease(&StrategyControl->buffer_strategy_lock);
    return result;
}
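
/*
 * Worked example for StrategySyncStart() (illustrative numbers): with
 * NBuffers = 128, completePasses = 5 and nextVictimBuffer read as 172
 * (the hand has wrapped, but the wrapping backend has not yet completed
 * its CAS), the function returns 172 % 128 = 44 as the starting buffer
 * index and reports 5 + 172 / 128 = 6 complete passes.  Reading both
 * values under buffer_strategy_lock is what makes the pair consistent
 * with the increment done in ClockSweepTick().
 */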

/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
    /*
     * We acquire buffer_strategy_lock just to ensure that the store appears
     * atomic to StrategyGetBuffer.  The bgwriter should call this rather
     * infrequently, so there's no performance penalty from being safe.
     */
    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
    StrategyControl->bgwprocno = bgwprocno;
    SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}


/*
 * StrategyShmemSize
 *
 * estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
    Size        size = 0;

    /* size of lookup hash table ... see comment in StrategyInitialize */
    size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

    /* size of the shared replacement strategy control block */
    size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

    return size;
}

/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *      strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *      Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
    bool        found;

    /*
     * Initialize the shared buffer lookup hashtable.
     *
     * Since we can't tolerate running out of lookup table entries, we must be
     * sure to specify an adequate table size here.  The maximum steady-state
     * usage is of course NBuffers entries, but BufferAlloc() tries to insert
     * a new entry before deleting the old.  In principle this could be
     * happening in each partition concurrently, so we could need as many as
     * NBuffers + NUM_BUFFER_PARTITIONS entries.
     */
    InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

    /*
     * Get or create the shared strategy control block
     */
    StrategyControl = (BufferStrategyControl *)
        ShmemInitStruct("Buffer Strategy Status",
                        sizeof(BufferStrategyControl),
                        &found);

    if (!found)
    {
        /*
         * Only done once, usually in postmaster
         */
        Assert(init);

        SpinLockInit(&StrategyControl->buffer_strategy_lock);

        /* Initialize the clock-sweep pointer */
        pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

        /* Clear statistics */
        StrategyControl->completePasses = 0;
        pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

        /* No pending notification */
        StrategyControl->bgwprocno = -1;
    }
    else
        Assert(!init);
}


/* ----------------------------------------------------------------
 *              Backend-private buffer ring management
 * ----------------------------------------------------------------
 */


/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
    int         ring_size_kb;

    /*
     * Select ring size to use.  See buffer/README for rationales.
     *
     * Note: if you change the ring size for BAS_BULKREAD, see also
     * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
     */
    switch (btype)
    {
        case BAS_NORMAL:
            /* if someone asks for NORMAL, just give 'em a "default" object */
            return NULL;

        case BAS_BULKREAD:
            {
                int         ring_max_kb;

                /*
                 * The ring always needs to be large enough to allow some
                 * separation in time between providing a buffer to the user
                 * of the strategy and that buffer being reused. Otherwise the
                 * user's pin will prevent reuse of the buffer, even without
                 * concurrent activity.
                 *
                 * We also need to ensure the ring always is large enough for
                 * SYNC_SCAN_REPORT_INTERVAL, as noted above.
                 *
                 * Thus we start out at a minimal size and increase the size
                 * further if appropriate.
                 */
                ring_size_kb = 256;

                /*
                 * There's no point in a larger ring if we won't be allowed to
                 * pin sufficiently many buffers.  But we never limit to less
                 * than the minimal size above.
                 */
                ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
                ring_max_kb = Max(ring_size_kb, ring_max_kb);

                /*
                 * We would like the ring to additionally have space for the
                 * configured degree of IO concurrency.  While being read in,
                 * buffers can obviously not yet be reused.
                 *
                 * Each IO can be up to io_combine_limit blocks large, and we
                 * want to start up to effective_io_concurrency IOs.
                 *
                 * Note that effective_io_concurrency may be 0, which disables
                 * AIO.
                 */
                ring_size_kb += (BLCKSZ / 1024) *
                    io_combine_limit * effective_io_concurrency;

                if (ring_size_kb > ring_max_kb)
                    ring_size_kb = ring_max_kb;
                break;
            }
        case BAS_BULKWRITE:
            ring_size_kb = 16 * 1024;
            break;
        case BAS_VACUUM:
            ring_size_kb = 2048;
            break;

        default:
            elog(ERROR, "unrecognized buffer access strategy: %d",
                 (int) btype);
            return NULL;        /* keep compiler quiet */
    }

    return GetAccessStrategyWithSize(btype, ring_size_kb);
}
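
/*
 * Example of the ring sizing above (illustrative, assuming the default
 * BLCKSZ of 8192): a BAS_BULKREAD ring starts at 256 kB = 32 buffers; with
 * io_combine_limit = 16 and effective_io_concurrency = 16 it grows by
 * 8 kB * 16 * 16 = 2048 kB, and is then clamped to ring_max_kb as derived
 * from GetPinLimit().  BAS_BULKWRITE uses 16 MB = 2048 buffers and
 * BAS_VACUUM 2 MB = 256 buffers, all still subject to the NBuffers / 8 cap
 * applied in GetAccessStrategyWithSize().
 */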

/*
 * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
 *      number of buffers equivalent to the passed in size.
 *
 * If the given ring size is 0, no BufferAccessStrategy will be created and
 * the function will return NULL.  ring_size_kb must not be negative.
 */
BufferAccessStrategy
GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
{
    int         ring_buffers;
    BufferAccessStrategy strategy;

    Assert(ring_size_kb >= 0);

    /* Figure out how many buffers ring_size_kb is */
    ring_buffers = ring_size_kb / (BLCKSZ / 1024);

    /* 0 means unlimited, so no BufferAccessStrategy required */
    if (ring_buffers == 0)
        return NULL;

    /* Cap to 1/8th of shared_buffers */
    ring_buffers = Min(NBuffers / 8, ring_buffers);

    /* NBuffers should never be less than 16, so this shouldn't happen */
    Assert(ring_buffers > 0);

    /* Allocate the object and initialize all elements to zeroes */
    strategy = (BufferAccessStrategy)
        palloc0(offsetof(BufferAccessStrategyData, buffers) +
                ring_buffers * sizeof(Buffer));

    /* Set fields that don't start out zero */
    strategy->btype = btype;
    strategy->nbuffers = ring_buffers;

    return strategy;
}
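
/*
 * Worked example of the clamping above (illustrative, BLCKSZ = 8192): with
 * shared_buffers = 16MB, NBuffers is 2048, so a caller asking for a
 * 16384 kB (2048-buffer) ring gets Min(2048 / 8, 2048) = 256 buffers,
 * i.e. a 2 MB ring.  Passing ring_size_kb = 0 instead yields a NULL
 * strategy, which callers treat as "use the default replacement strategy".
 */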

/*
 * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
 *      the ring
 *
 * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
 * returning NULL with 0 size.
 */
int
GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
{
    if (strategy == NULL)
        return 0;

    return strategy->nbuffers;
}

/*
 * GetAccessStrategyPinLimit -- get cap of number of buffers that should be pinned
 *
 * When pinning extra buffers to look ahead, users of a ring-based strategy are
 * in danger of pinning too much of the ring at once.  For some strategies,
 * that means "escaping" from the ring, and in others it means forcing dirty
 * data to disk very frequently with associated WAL flushing.  Since external
 * code has no insight into any of that, allow individual strategy types to
 * expose a clamp that should be applied when deciding on a maximum number of
 * buffers to pin at once.
 *
 * Callers should combine this number with other relevant limits and take the
 * minimum.
 */
int
GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
{
    if (strategy == NULL)
        return NBuffers;

    switch (strategy->btype)
    {
        case BAS_BULKREAD:

            /*
             * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
             * shouldn't be a problem and the caller is free to pin up to the
             * entire ring at once.
             */
            return strategy->nbuffers;

        default:

            /*
             * Tell caller not to pin more than half the buffers in the ring.
             * This is a trade-off between look ahead distance and deferring
             * writeback and associated WAL traffic.
             */
            return strategy->nbuffers / 2;
    }
}
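
/*
 * For instance (illustrative, BLCKSZ = 8192): a 2048 kB BAS_VACUUM ring
 * holds 256 buffers, so GetAccessStrategyPinLimit() reports 128 for it,
 * keeping look-ahead from turning over more than half the ring before the
 * oldest ring member can be reused.  A BAS_BULKREAD ring of the same size
 * would report the full 256.
 */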

/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
    /* don't crash if called on a "default" strategy */
    if (strategy != NULL)
        pfree(strategy);
}
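
/*
 * A typical caller pattern for the strategy API, sketched for illustration
 * (rel and blkno stand in for the caller's relation and block number):
 *
 *      BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKWRITE);
 *      ...
 *      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 *                               strategy);
 *      ...
 *      ReleaseBuffer(buf);
 *      ...
 *      FreeAccessStrategy(strategy);
 *
 * The strategy object only steers victim selection inside the buffer
 * manager; pinning, locking and releasing buffers work the same as with
 * the default (NULL) strategy.
 */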

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *      ring is empty / not usable.
 *
 * The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
 * returning.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
    BufferDesc *buf;
    Buffer      bufnum;
    uint32      old_buf_state;
    uint32      local_buf_state;    /* to avoid repeated (de-)referencing */

    /* Advance to next ring slot */
    if (++strategy->current >= strategy->nbuffers)
        strategy->current = 0;

    /*
     * If the slot hasn't been filled yet, tell the caller to allocate a new
     * buffer with the normal allocation strategy.  He will then fill this
     * slot by calling AddBufferToRing with the new buffer.
     */
    bufnum = strategy->buffers[strategy->current];
    if (bufnum == InvalidBuffer)
        return NULL;

    buf = GetBufferDescriptor(bufnum - 1);

    /*
     * Check whether the buffer can be used and pin it if so.  Do this using a
     * CAS loop, to avoid having to lock the buffer header.
     */
    old_buf_state = pg_atomic_read_u32(&buf->state);
    for (;;)
    {
        local_buf_state = old_buf_state;

        /*
         * If the buffer is pinned we cannot use it under any circumstances.
         *
         * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
         * since our own previous usage of the ring element would have left it
         * there, but it might've been decremented by clock-sweep since then).
         * A higher usage_count indicates someone else has touched the buffer,
         * so we shouldn't re-use it.
         */
        if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
            || BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
            break;

        if (unlikely(local_buf_state & BM_LOCKED))
        {
            old_buf_state = WaitBufHdrUnlocked(buf);
            continue;
        }

        /* pin the buffer if the CAS succeeds */
        local_buf_state += BUF_REFCOUNT_ONE;

        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           local_buf_state))
        {
            *buf_state = local_buf_state;

            TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
            return buf;
        }
    }

    /*
     * Tell caller to allocate a new buffer with the normal allocation
     * strategy.  He'll then replace this ring element via AddBufferToRing.
     */
    return NULL;
}

/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
    strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}

/*
 * Utility function returning the IOContext of a given BufferAccessStrategy's
 * strategy ring.
 */
IOContext
IOContextForStrategy(BufferAccessStrategy strategy)
{
    if (!strategy)
        return IOCONTEXT_NORMAL;

    switch (strategy->btype)
    {
        case BAS_NORMAL:

            /*
             * Currently, GetAccessStrategy() returns NULL for
             * BufferAccessStrategyType BAS_NORMAL, so this case is
             * unreachable.
             */
            pg_unreachable();
            return IOCONTEXT_NORMAL;
        case BAS_BULKREAD:
            return IOCONTEXT_BULKREAD;
        case BAS_BULKWRITE:
            return IOCONTEXT_BULKWRITE;
        case BAS_VACUUM:
            return IOCONTEXT_VACUUM;
    }

    elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
    pg_unreachable();
}

/*
 * StrategyRejectBuffer -- consider rejecting a dirty buffer
 *
 * When a nondefault strategy is used, the buffer manager calls this function
 * when it turns out that the buffer selected by StrategyGetBuffer needs to
 * be written out and doing so would require flushing WAL too.  This gives us
 * a chance to choose a different victim.
 *
 * Returns true if buffer manager should ask for a new victim, and false
 * if this buffer should be written and re-used.
 */
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
{
    /* We only do this in bulkread mode */
    if (strategy->btype != BAS_BULKREAD)
        return false;

    /* Don't muck with behavior of normal buffer-replacement strategy */
    if (!from_ring ||
        strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
        return false;

    /*
     * Remove the dirty buffer from the ring; necessary to prevent infinite
     * loop if all ring members are dirty.
     */
    strategy->buffers[strategy->current] = InvalidBuffer;

    return true;
}