Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * buf_internals.h
4 : : * Internal definitions for buffer manager and the buffer replacement
5 : : * strategy.
6 : : *
7 : : *
8 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 : : * Portions Copyright (c) 1994, Regents of the University of California
10 : : *
11 : : * src/include/storage/buf_internals.h
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #ifndef BUFMGR_INTERNALS_H
16 : : #define BUFMGR_INTERNALS_H
17 : :
18 : : #include "pgstat.h"
19 : : #include "port/atomics.h"
20 : : #include "storage/aio_types.h"
21 : : #include "storage/buf.h"
22 : : #include "storage/bufmgr.h"
23 : : #include "storage/condition_variable.h"
24 : : #include "storage/lwlock.h"
25 : : #include "storage/procnumber.h"
26 : : #include "storage/shmem.h"
27 : : #include "storage/smgr.h"
28 : : #include "storage/spin.h"
29 : : #include "utils/relcache.h"
30 : : #include "utils/resowner.h"
31 : :
32 : : /*
33 : : * Buffer state is a single 32-bit variable in which the following data is combined.
34 : : *
35 : : * - 18 bits refcount
36 : : * - 4 bits usage count
37 : : * - 10 bits of flags
38 : : *
39 : : * Combining these values allows some operations to be performed without locking
40 : : * the buffer header, by modifying them together with a CAS loop.
41 : : *
42 : : * The definition of buffer state components is below.
43 : : */
44 : : #define BUF_REFCOUNT_BITS 18
45 : : #define BUF_USAGECOUNT_BITS 4
46 : : #define BUF_FLAG_BITS 10
47 : :
48 : : StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
49 : : "parts of buffer state space need to equal 32");
50 : :
/* The refcount occupies the lowest BUF_REFCOUNT_BITS bits of the state. */
#define BUF_REFCOUNT_ONE		1
#define BUF_REFCOUNT_MASK		((1U << BUF_REFCOUNT_BITS) - 1)

/* The usage count sits directly above the refcount. */
#define BUF_USAGECOUNT_SHIFT	BUF_REFCOUNT_BITS
#define BUF_USAGECOUNT_ONE		(1U << BUF_USAGECOUNT_SHIFT)
#define BUF_USAGECOUNT_MASK \
	(((1U << BUF_USAGECOUNT_BITS) - 1) << BUF_USAGECOUNT_SHIFT)

/* The flag bits take whatever remains above the usage count. */
#define BUF_FLAG_MASK \
	(((1U << BUF_FLAG_BITS) - 1) << (BUF_USAGECOUNT_SHIFT + BUF_USAGECOUNT_BITS))
57 : :
/*
 * Extract individual fields from a buffer state value.  The usage count is
 * masked first and then shifted down so that it is returned as a small
 * integer rather than in its in-place bit position.
 */
#define BUF_STATE_GET_REFCOUNT(state) \
	((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) \
	(((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
61 : :
62 : : /*
63 : : * Flags for buffer descriptors
64 : : *
65 : : * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
66 : : * entry associated with the buffer's tag.
67 : : */
/*
 * Flag bits live immediately above the refcount and usagecount fields.
 * Deriving each bit position from the field widths (rather than hard-coding
 * 22..31) keeps the flags from silently overlapping the usage count if the
 * widths are ever rebalanced.  With the current widths the values are
 * exactly (1U << 22) through (1U << 31).
 */
#define BUF_FLAG_SHIFT			(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS)
#define BUF_DEFINE_FLAG(flagno)	(1U << (BUF_FLAG_SHIFT + (flagno)))

#define BM_LOCKED				BUF_DEFINE_FLAG(0)	/* buffer header is locked */
#define BM_DIRTY				BUF_DEFINE_FLAG(1)	/* data needs writing */
#define BM_VALID				BUF_DEFINE_FLAG(2)	/* data is valid */
#define BM_TAG_VALID			BUF_DEFINE_FLAG(3)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		BUF_DEFINE_FLAG(4)	/* read or write in progress */
#define BM_IO_ERROR				BUF_DEFINE_FLAG(5)	/* previous I/O failed */
#define BM_JUST_DIRTIED			BUF_DEFINE_FLAG(6)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		BUF_DEFINE_FLAG(7)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	BUF_DEFINE_FLAG(8)	/* must write for checkpoint */
#define BM_PERMANENT			BUF_DEFINE_FLAG(9)	/* permanent buffer (not unlogged,
													 * or init fork) */
79 : : /*
80 : : * The maximum allowed value of usage_count represents a tradeoff between
81 : : * accuracy and speed of the clock-sweep buffer management algorithm. A
82 : : * large value (comparable to NBuffers) would approximate LRU semantics.
83 : : * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of the
84 : : * clock-sweep hand to find a free buffer, so in practice we don't want the
85 : : * value to be very large.
86 : : */
87 : : #define BM_MAX_USAGE_COUNT 5
88 : :
89 : : StaticAssertDecl(BM_MAX_USAGE_COUNT < (1 << BUF_USAGECOUNT_BITS),
90 : : "BM_MAX_USAGE_COUNT doesn't fit in BUF_USAGECOUNT_BITS bits");
91 : : StaticAssertDecl(MAX_BACKENDS_BITS <= BUF_REFCOUNT_BITS,
92 : : "MAX_BACKENDS_BITS needs to be <= BUF_REFCOUNT_BITS");
93 : :
94 : : /*
95 : : * Buffer tag identifies which disk block the buffer contains.
96 : : *
97 : : * Note: the BufferTag data must be sufficient to determine where to write the
98 : : * block, without reference to pg_class or pg_tablespace entries. It's
99 : : * possible that the backend flushing the buffer doesn't even believe the
100 : : * relation is visible yet (its xact may have started before the xact that
101 : : * created the rel). The storage manager must be able to cope anyway.
102 : : *
103 : : * Note: if there are any pad bytes in the struct, InitBufferTag will have
104 : : * to be fixed to zero them, since this struct is used as a hash key.
105 : : */
106 : : typedef struct buftag
107 : : {
108 : : Oid spcOid; /* tablespace oid */
109 : : Oid dbOid; /* database oid */
110 : : RelFileNumber relNumber; /* relation file number */
111 : : ForkNumber forkNum; /* fork number */
112 : : BlockNumber blockNum; /* blknum relative to begin of reln */
113 : : } BufferTag;
114 : :
115 : : static inline RelFileNumber
1109 rhaas@postgresql.org 116 :CBC 146974209 : BufTagGetRelNumber(const BufferTag *tag)
117 : : {
1074 118 : 146974209 : return tag->relNumber;
119 : : }
120 : :
121 : : static inline ForkNumber
1109 122 : 21042980 : BufTagGetForkNum(const BufferTag *tag)
123 : : {
1074 124 : 21042980 : return tag->forkNum;
125 : : }
126 : :
127 : : static inline void
1109 128 : 67867081 : BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
129 : : ForkNumber forknum)
130 : : {
1074 131 : 67867081 : tag->relNumber = relnumber;
132 : 67867081 : tag->forkNum = forknum;
1109 133 : 67867081 : }
134 : :
135 : : static inline RelFileLocator
136 : 17126682 : BufTagGetRelFileLocator(const BufferTag *tag)
137 : : {
138 : : RelFileLocator rlocator;
139 : :
140 : 17126682 : rlocator.spcOid = tag->spcOid;
141 : 17126682 : rlocator.dbOid = tag->dbOid;
142 : 17126682 : rlocator.relNumber = BufTagGetRelNumber(tag);
143 : :
144 : 17126682 : return rlocator;
145 : : }
146 : :
147 : : static inline void
1137 148 : 10730339 : ClearBufferTag(BufferTag *tag)
149 : : {
1109 150 : 10730339 : tag->spcOid = InvalidOid;
151 : 10730339 : tag->dbOid = InvalidOid;
152 : 10730339 : BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
1137 153 : 10730339 : tag->blockNum = InvalidBlockNumber;
154 : 10730339 : }
155 : :
156 : : static inline void
157 : 57136742 : InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
158 : : ForkNumber forkNum, BlockNumber blockNum)
159 : : {
1109 160 : 57136742 : tag->spcOid = rlocator->spcOid;
161 : 57136742 : tag->dbOid = rlocator->dbOid;
162 : 57136742 : BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
1137 163 : 57136742 : tag->blockNum = blockNum;
164 : 57136742 : }
165 : :
166 : : static inline bool
167 : 2828764 : BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
168 : : {
1109 169 : 5657528 : return (tag1->spcOid == tag2->spcOid) &&
170 [ + - ]: 2828764 : (tag1->dbOid == tag2->dbOid) &&
1074 171 [ + + ]: 2828764 : (tag1->relNumber == tag2->relNumber) &&
172 [ + - + + ]: 8486233 : (tag1->blockNum == tag2->blockNum) &&
173 [ + - ]: 2828705 : (tag1->forkNum == tag2->forkNum);
174 : : }
175 : :
176 : : static inline bool
1109 177 : 398814775 : BufTagMatchesRelFileLocator(const BufferTag *tag,
178 : : const RelFileLocator *rlocator)
179 : : {
180 : 563124173 : return (tag->spcOid == rlocator->spcOid) &&
181 [ + + + + ]: 528341851 : (tag->dbOid == rlocator->dbOid) &&
182 [ + + ]: 129527076 : (BufTagGetRelNumber(tag) == rlocator->relNumber);
183 : : }
184 : :
185 : :
186 : : /*
187 : : * The shared buffer mapping table is partitioned to reduce contention.
188 : : * To determine which partition lock a given tag requires, compute the tag's
189 : : * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
190 : : * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
191 : : */
192 : : static inline uint32
1137 193 : 56878309 : BufTableHashPartition(uint32 hashcode)
194 : : {
195 : 56878309 : return hashcode % NUM_BUFFER_PARTITIONS;
196 : : }
197 : :
198 : : static inline LWLock *
199 : 56878309 : BufMappingPartitionLock(uint32 hashcode)
200 : : {
201 : 56878309 : return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
202 : 56878309 : BufTableHashPartition(hashcode)].lock;
203 : : }
204 : :
205 : : static inline LWLock *
206 : : BufMappingPartitionLockByIndex(uint32 index)
207 : : {
208 : : return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
209 : : }
210 : :
211 : : /*
212 : : * BufferDesc -- shared descriptor/state data for a single shared buffer.
213 : : *
214 : : * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
215 : : * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
216 : : * is a spinlock which is combined with flags, refcount and usagecount into
217 : : * single atomic variable. This layout allows us to do some operations in a
218 : : * single atomic operation, without actually acquiring and releasing spinlock;
219 : : * for instance, increase or decrease refcount. buf_id field never changes
220 : : * after initialization, so does not need locking. The LWLock can take care
221 : : * of itself. The buffer header lock is *not* used to control access to the
222 : : * data in the buffer!
223 : : *
224 : : * It's assumed that nobody changes the state field while buffer header lock
225 : : * is held. Thus buffer header lock holder can do complex updates of the
226 : : * state variable in single write, simultaneously with lock release (cleaning
227 : : * BM_LOCKED flag). On the other hand, updating of state without holding
228 : : * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
229 : : * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
230 : : *
231 : : * An exception is that if we have the buffer pinned, its tag can't change
232 : : * underneath us, so we can examine the tag without locking the buffer header.
233 : : * Also, in places we do one-time reads of the flags without bothering to
234 : : * lock the buffer header; this is generally for situations where we don't
235 : : * expect the flag bit being tested to be changing.
236 : : *
237 : : * We can't physically remove items from a disk page if another backend has
238 : : * the buffer pinned. Hence, a backend may need to wait for all other pins
239 : : * to go away. This is signaled by storing its own pgprocno into
240 : : * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
241 : : * there can be only one such waiter per buffer.
242 : : *
243 : : * We use this same struct for local buffer headers, but the locks are not
244 : : * used and not all of the flag bits are useful either. To avoid unnecessary
245 : : * overhead, manipulations of the state field should be done without actual
246 : : * atomic operations (i.e. only pg_atomic_read_u32() and
247 : : * pg_atomic_unlocked_write_u32()).
248 : : *
249 : : * Be careful to avoid increasing the size of the struct when adding or
250 : : * reordering members. Keeping it below 64 bytes (the most common CPU
251 : : * cache line size) is fairly important for performance.
252 : : *
253 : : * Per-buffer I/O condition variables are currently kept outside this struct in
254 : : * a separate array. They could be moved in here and still fit within that
255 : : * limit on common systems, but for now that is not done.
256 : : */
257 : : typedef struct BufferDesc
258 : : {
259 : : BufferTag tag; /* ID of page contained in buffer */
260 : : int buf_id; /* buffer's index number (from 0) */
261 : :
262 : : /* state of the tag, containing flags, refcount and usagecount */
263 : : pg_atomic_uint32 state;
264 : :
265 : : int wait_backend_pgprocno; /* backend of pin-count waiter */
266 : :
267 : : PgAioWaitRef io_wref; /* set iff AIO is in progress */
268 : : LWLock content_lock; /* to lock access to buffer contents */
269 : : } BufferDesc;
270 : :
271 : : /*
272 : : * Concurrent access to buffer headers has proven to be more efficient if
273 : : * they're cache line aligned. So we force the start of the BufferDescriptors
274 : : * array to be on a cache line boundary and force the elements to be cache
275 : : * line sized.
276 : : *
277 : : * XXX: As this primarily matters in highly concurrent workloads, which
278 : : * probably all are 64bit these days, and the space wastage would be a bit
279 : : * more noticeable on 32bit systems, we don't force the stride to be cache
280 : : * line sized on those. If somebody does actual performance testing, we can
281 : : * reevaluate.
282 : : *
283 : : * Note that local buffer descriptors aren't forced to be aligned - as there's
284 : : * no concurrent access to those it's unlikely to be beneficial.
285 : : *
286 : : * We use a 64-byte cache line size here, because that's the most common
287 : : * size. Making it bigger would be a waste of memory. Even if running on a
288 : : * platform with either 32 or 128 byte line sizes, it's good to align to
289 : : * boundaries and avoid false sharing.
290 : : */
291 : : #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
292 : :
293 : : typedef union BufferDescPadded
294 : : {
295 : : BufferDesc bufferdesc;
296 : : char pad[BUFFERDESC_PAD_TO_SIZE];
297 : : } BufferDescPadded;
298 : :
299 : : /*
300 : : * The PendingWriteback & WritebackContext structures are used to keep
301 : : * information about pending flush requests to be issued to the OS.
302 : : */
303 : : typedef struct PendingWriteback
304 : : {
305 : : /* could store different types of pending flushes here */
306 : : BufferTag tag;
307 : : } PendingWriteback;
308 : :
309 : : /* struct forward declared in bufmgr.h */
310 : : typedef struct WritebackContext
311 : : {
312 : : /* pointer to the max number of writeback requests to coalesce */
313 : : int *max_pending;
314 : :
315 : : /* current number of pending writeback requests */
316 : : int nr_pending;
317 : :
318 : : /* pending requests */
319 : : PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
320 : : } WritebackContext;
321 : :
322 : : /* in buf_init.c */
323 : : extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
324 : : extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
325 : : extern PGDLLIMPORT WritebackContext BackendWritebackContext;
326 : :
327 : : /* in localbuf.c */
328 : : extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
329 : :
330 : :
331 : : static inline BufferDesc *
332 : 725821719 : GetBufferDescriptor(uint32 id)
333 : : {
334 : 725821719 : return &(BufferDescriptors[id]).bufferdesc;
335 : : }
336 : :
337 : : static inline BufferDesc *
338 : 14802347 : GetLocalBufferDescriptor(uint32 id)
339 : : {
340 : 14802347 : return &LocalBufferDescriptors[id];
341 : : }
342 : :
343 : : static inline Buffer
344 : 291522322 : BufferDescriptorGetBuffer(const BufferDesc *bdesc)
345 : : {
346 : 291522322 : return (Buffer) (bdesc->buf_id + 1);
347 : : }
348 : :
349 : : static inline ConditionVariable *
350 : 11714496 : BufferDescriptorGetIOCV(const BufferDesc *bdesc)
351 : : {
352 : 11714496 : return &(BufferIOCVArray[bdesc->buf_id]).cv;
353 : : }
354 : :
355 : : static inline LWLock *
356 : 275154868 : BufferDescriptorGetContentLock(const BufferDesc *bdesc)
357 : : {
358 : 275154868 : return (LWLock *) (&bdesc->content_lock);
359 : : }
360 : :
361 : : /*
362 : : * Functions for acquiring/releasing a shared buffer header's spinlock. Do
363 : : * not apply these to local buffers!
364 : : */
365 : : extern uint32 LockBufHdr(BufferDesc *desc);
366 : :
367 : : static inline void
368 : 34321126 : UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
369 : : {
370 : 34321126 : pg_write_barrier();
371 : 34321126 : pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
372 : 34321126 : }
373 : :
374 : : /* in bufmgr.c */
375 : :
376 : : /*
377 : : * Structure to sort buffers per file on checkpoints.
378 : : *
379 : : * This structure is allocated per buffer in shared memory, so it should be
380 : : * kept as small as possible.
381 : : */
382 : : typedef struct CkptSortItem
383 : : {
384 : : Oid tsId;
385 : : RelFileNumber relNumber;
386 : : ForkNumber forkNum;
387 : : BlockNumber blockNum;
388 : : int buf_id;
389 : : } CkptSortItem;
390 : :
391 : : extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
392 : :
393 : : /* ResourceOwner callbacks to hold buffer I/Os and pins */
394 : : extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
395 : : extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
396 : :
397 : : /* Convenience wrappers over ResourceOwnerRemember/Forget */
398 : : static inline void
668 heikki.linnakangas@i 399 : 67904343 : ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
400 : : {
401 : 67904343 : ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
402 : 67904343 : }
403 : : static inline void
404 : 67896923 : ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
405 : : {
406 : 67896923 : ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
407 : 67896923 : }
408 : : static inline void
409 : 2441102 : ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
410 : : {
411 : 2441102 : ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
412 : 2441102 : }
413 : : static inline void
414 : 2441087 : ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
415 : : {
416 : 2441087 : ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
417 : 2441087 : }
418 : :
419 : : /*
420 : : * Internal buffer management routines
421 : : */
422 : : /* bufmgr.c */
423 : : extern void WritebackContextInit(WritebackContext *context, int *max_pending);
424 : : extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
425 : : extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
426 : : IOContext io_context, BufferTag *tag);
427 : :
428 : : /* solely to make it easier to write tests */
429 : : extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
430 : : extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
431 : : bool forget_owner, bool release_aio);
432 : :
433 : :
434 : : /* freelist.c */
435 : : extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
436 : : extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
437 : : uint32 *buf_state, bool *from_ring);
438 : : extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
439 : : BufferDesc *buf, bool from_ring);
440 : :
441 : : extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
442 : : extern void StrategyNotifyBgWriter(int bgwprocno);
443 : :
444 : : extern Size StrategyShmemSize(void);
445 : : extern void StrategyInitialize(bool init);
446 : :
447 : : /* buf_table.c */
448 : : extern Size BufTableShmemSize(int size);
449 : : extern void InitBufTable(int size);
450 : : extern uint32 BufTableHashCode(BufferTag *tagPtr);
451 : : extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
452 : : extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
453 : : extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
454 : :
455 : : /* localbuf.c */
456 : : extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
457 : : extern void UnpinLocalBuffer(Buffer buffer);
458 : : extern void UnpinLocalBufferNoOwner(Buffer buffer);
459 : : extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
460 : : ForkNumber forkNum,
461 : : BlockNumber blockNum);
462 : : extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
463 : : BlockNumber blockNum, bool *foundPtr);
464 : : extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
465 : : ForkNumber fork,
466 : : uint32 flags,
467 : : uint32 extend_by,
468 : : BlockNumber extend_upto,
469 : : Buffer *buffers,
470 : : uint32 *extended_by);
471 : : extern void MarkLocalBufferDirty(Buffer buffer);
472 : : extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty,
473 : : uint32 set_flag_bits, bool release_aio);
474 : : extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait);
475 : : extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln);
476 : : extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced);
477 : : extern void DropRelationLocalBuffers(RelFileLocator rlocator,
478 : : ForkNumber *forkNum, int nforks,
479 : : BlockNumber *firstDelBlock);
480 : : extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
481 : : extern void AtEOXact_LocalBuffers(bool isCommit);
482 : :
483 : : #endif /* BUFMGR_INTERNALS_H */
|