Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * bufmgr.c
4 : : * buffer manager interface routines
5 : : *
6 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/buffer/bufmgr.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : /*
16 : : * Principal entry points:
17 : : *
18 : : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : : * and pin it so that no one can destroy it while this process
20 : : * is using it.
21 : : *
22 : : * StartReadBuffer() -- as above, with separate wait step
23 : : * StartReadBuffers() -- multiple block version
24 : : * WaitReadBuffers() -- second step of above
25 : : *
26 : : * ReleaseBuffer() -- unpin a buffer
27 : : *
28 : : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 : : * The disk write is delayed until buffer replacement or checkpoint.
30 : : *
31 : : * See also these files:
32 : : * freelist.c -- chooses victim for buffer replacement
33 : : * buf_table.c -- manages the buffer lookup table
34 : : */
35 : : #include "postgres.h"
36 : :
37 : : #include <sys/file.h>
38 : : #include <unistd.h>
39 : :
40 : : #include "access/tableam.h"
41 : : #include "access/xloginsert.h"
42 : : #include "access/xlogutils.h"
43 : : #ifdef USE_ASSERT_CHECKING
44 : : #include "catalog/pg_tablespace_d.h"
45 : : #endif
46 : : #include "catalog/storage.h"
47 : : #include "catalog/storage_xlog.h"
48 : : #include "executor/instrument.h"
49 : : #include "lib/binaryheap.h"
50 : : #include "miscadmin.h"
51 : : #include "pg_trace.h"
52 : : #include "pgstat.h"
53 : : #include "postmaster/bgwriter.h"
54 : : #include "storage/aio.h"
55 : : #include "storage/buf_internals.h"
56 : : #include "storage/bufmgr.h"
57 : : #include "storage/fd.h"
58 : : #include "storage/ipc.h"
59 : : #include "storage/lmgr.h"
60 : : #include "storage/proc.h"
61 : : #include "storage/read_stream.h"
62 : : #include "storage/smgr.h"
63 : : #include "storage/standby.h"
64 : : #include "utils/memdebug.h"
65 : : #include "utils/ps_status.h"
66 : : #include "utils/rel.h"
67 : : #include "utils/resowner.h"
68 : : #include "utils/timestamp.h"
69 : :
70 : :
71 : : /* Note: these two macros only work on shared buffers, not local ones! */
72 : : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
73 : : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
74 : :
75 : : /* Note: this macro only works on local buffers, not shared ones! */
76 : : #define LocalBufHdrGetBlock(bufHdr) \
77 : : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
78 : :
79 : : /* Bits in SyncOneBuffer's return value */
80 : : #define BUF_WRITTEN 0x01
81 : : #define BUF_REUSABLE 0x02
82 : :
83 : : #define RELS_BSEARCH_THRESHOLD 20
84 : :
85 : : /*
86 : : * This is the size (in the number of blocks) above which we scan the
87 : : * entire buffer pool to remove the buffers for all the pages of the relation
88 : : * being dropped. For relations with size below this threshold, we find
89 : : * the buffers by doing lookups in the BufMapping table.
90 : : */
91 : : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
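/*
 * Illustrative arithmetic (a sketch assuming default settings, not taken from
 * the code above): with the default 8kB BLCKSZ and shared_buffers = 128MB,
 * NBuffers is 16384, so BUF_DROP_FULL_SCAN_THRESHOLD works out to
 * 16384 / 32 = 512 blocks, i.e. 4MB. A relation fork larger than that is
 * handled by scanning the whole buffer pool; a smaller one by per-block
 * BufMapping lookups.
 */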
92 : :
93 : : /*
94 : : * This is separated out from PrivateRefCountEntry to allow for copying all
95 : : * the data members via struct assignment.
96 : : */
97 : : typedef struct PrivateRefCountData
98 : : {
99 : : /*
100 : : * How many times the buffer has been pinned by this backend.
101 : : */
102 : : int32 refcount;
103 : : } PrivateRefCountData;
104 : :
105 : : typedef struct PrivateRefCountEntry
106 : : {
107 : : /*
108 : : * Note that this needs to be the same as the entry's corresponding
109 : : * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
110 : : * store it in both places as this is used for the hashtable key and
111 : : * because it is more convenient (passing around a PrivateRefCountEntry
112 : : * suffices to identify the buffer) and faster (checking the keys array is
113 : : * faster when checking many entries, checking the entry is faster if just
114 : : * checking a single entry).
115 : : */
116 : : Buffer buffer;
117 : :
118 : : PrivateRefCountData data;
119 : : } PrivateRefCountEntry;
120 : :
121 : : /* 64 bytes, about the size of a cache line on common systems */
122 : : #define REFCOUNT_ARRAY_ENTRIES 8
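/*
 * Illustrative sizing (a sketch assuming a 4-byte Buffer and no struct
 * padding): each PrivateRefCountEntry is a 4-byte buffer key plus a 4-byte
 * refcount, so the 8-entry PrivateRefCountArray occupies about 8 * 8 = 64
 * bytes, while the separate PrivateRefCountArrayKeys array takes 8 * 4 = 32
 * bytes.
 */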
123 : :
124 : : /*
125 : : * Status of buffers to checkpoint for a particular tablespace, used
126 : : * internally in BufferSync.
127 : : */
128 : : typedef struct CkptTsStatus
129 : : {
130 : : /* oid of the tablespace */
131 : : Oid tsId;
132 : :
133 : : /*
134 : : * Checkpoint progress for this tablespace. To make progress comparable
135 : : * between tablespaces the progress is, for each tablespace, measured as a
136 : : * number between 0 and the total number of to-be-checkpointed pages. Each
137 : : * page checkpointed in this tablespace increments this space's progress
138 : : * by progress_slice.
139 : : */
140 : : float8 progress;
141 : : float8 progress_slice;
142 : :
143 : : /* number of to-be-checkpointed pages in this tablespace */
144 : : int num_to_scan;
145 : : /* already processed pages in this tablespace */
146 : : int num_scanned;
147 : :
148 : : /* current offset in CkptBufferIds for this tablespace */
149 : : int index;
150 : : } CkptTsStatus;
151 : :
152 : : /*
153 : : * Type for array used to sort SMgrRelations
154 : : *
155 : : * FlushRelationsAllBuffers shares the same comparator function with
156 : : * DropRelationsAllBuffers. Pointers to this struct and to RelFileLocator
157 : : * must be compatible.
158 : : */
159 : : typedef struct SMgrSortArray
160 : : {
161 : : RelFileLocator rlocator; /* This must be the first member */
162 : : SMgrRelation srel;
163 : : } SMgrSortArray;
164 : :
165 : : /* GUC variables */
166 : : bool zero_damaged_pages = false;
167 : : int bgwriter_lru_maxpages = 100;
168 : : double bgwriter_lru_multiplier = 2.0;
169 : : bool track_io_timing = false;
170 : :
171 : : /*
172 : : * How many buffers PrefetchBuffer callers should try to stay ahead of their
173 : : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
174 : : * for buffers not belonging to tablespaces that have their
175 : : * effective_io_concurrency parameter set.
176 : : */
177 : : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
178 : :
179 : : /*
180 : : * Like effective_io_concurrency, but used by maintenance code paths that might
181 : : * benefit from a higher setting because they work on behalf of many sessions.
182 : : * Overridden by the tablespace setting of the same name.
183 : : */
184 : : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
185 : :
186 : : /*
187 : : * Limit on how many blocks should be handled in a single I/O operation.
188 : : * StartReadBuffers() callers should respect it, as should other operations
189 : : * that call smgr APIs directly. It is computed as the minimum of underlying
190 : : * GUCs io_combine_limit_guc and io_max_combine_limit.
191 : : */
192 : : int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
193 : : int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
194 : : int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
195 : :
196 : : /*
197 : : * GUC variables about triggering kernel writeback for buffers written; OS
198 : : * dependent defaults are set via the GUC mechanism.
199 : : */
200 : : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
201 : : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
202 : : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
203 : :
204 : : /* local state for LockBufferForCleanup */
205 : : static BufferDesc *PinCountWaitBuf = NULL;
206 : :
207 : : /*
208 : : * Backend-Private refcount management:
209 : : *
210 : : * Each buffer also has a private refcount that keeps track of the number of
211 : : * times the buffer is pinned in the current process. This is so that the
212 : : * shared refcount needs to be modified only once if a buffer is pinned more
213 : : * than once by an individual backend. It's also used to check that no buffers
214 : : * are still pinned at the end of transactions and when exiting.
215 : : *
216 : : *
217 : : * To avoid - as we used to - requiring an array with NBuffers entries to keep
218 : : * track of local buffers, we use a small sequentially searched array
219 : : * (PrivateRefCountArrayKeys, with the corresponding data stored in
220 : : * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
221 : : * keep track of backend local pins.
222 : : *
223 : : * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
224 : : * all refcounts are tracked in the array; after that, new array entries
225 : : * displace old ones into the hash table. That way a frequently used entry
226 : : * can't get "stuck" in the hashtable while infrequent ones clog the array.
227 : : *
228 : : * Note that in most scenarios the number of pinned buffers will not exceed
229 : : * REFCOUNT_ARRAY_ENTRIES.
230 : : *
231 : : *
232 : : * To enter a buffer into the refcount tracking mechanism, first reserve a free
233 : : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
234 : : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
235 : : * memory allocations in NewPrivateRefCountEntry() which can be important
236 : : * because in some scenarios it's called with a spinlock held...
237 : : */
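/*
 * A minimal usage sketch (hedged; the real call sites, e.g. PinBuffer(),
 * appear later in this file): callers reserve an entry while it is still
 * safe to allocate, and only fill it once the pin is certain:
 *
 *		ReservePrivateRefCountEntry();		-- may displace an old entry into the hash
 *		... bump the buffer's shared refcount ...
 *		ref = NewPrivateRefCountEntry(buffer);	-- uses the reservation, no allocation
 *		ref->data.refcount++;
 */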
238 : : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
239 : : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
240 : : static HTAB *PrivateRefCountHash = NULL;
241 : : static int32 PrivateRefCountOverflowed = 0;
242 : : static uint32 PrivateRefCountClock = 0;
243 : : static int ReservedRefCountSlot = -1;
244 : : static int PrivateRefCountEntryLast = -1;
245 : :
246 : : static uint32 MaxProportionalPins;
247 : :
248 : : static void ReservePrivateRefCountEntry(void);
249 : : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
250 : : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
251 : : static inline int32 GetPrivateRefCount(Buffer buffer);
252 : : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
253 : :
254 : : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
255 : : static void ResOwnerReleaseBufferIO(Datum res);
256 : : static char *ResOwnerPrintBufferIO(Datum res);
257 : : static void ResOwnerReleaseBufferPin(Datum res);
258 : : static char *ResOwnerPrintBufferPin(Datum res);
259 : :
260 : : const ResourceOwnerDesc buffer_io_resowner_desc =
261 : : {
262 : : .name = "buffer io",
263 : : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
264 : : .release_priority = RELEASE_PRIO_BUFFER_IOS,
265 : : .ReleaseResource = ResOwnerReleaseBufferIO,
266 : : .DebugPrint = ResOwnerPrintBufferIO
267 : : };
268 : :
269 : : const ResourceOwnerDesc buffer_pin_resowner_desc =
270 : : {
271 : : .name = "buffer pin",
272 : : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
273 : : .release_priority = RELEASE_PRIO_BUFFER_PINS,
274 : : .ReleaseResource = ResOwnerReleaseBufferPin,
275 : : .DebugPrint = ResOwnerPrintBufferPin
276 : : };
277 : :
278 : : /*
279 : : * Ensure that the PrivateRefCountArray has sufficient space to store one more
280 : : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
281 : : * a new entry - but it's perfectly fine to not use a reserved entry.
282 : : */
283 : : static void
3985 andres@anarazel.de 284 :CBC 61339639 : ReservePrivateRefCountEntry(void)
285 : : {
286 : : /* Already reserved (or freed), nothing to do */
3 andres@anarazel.de 287 [ + + ]:GNC 61339639 : if (ReservedRefCountSlot != -1)
3985 andres@anarazel.de 288 :CBC 57522310 : return;
289 : :
290 : : /*
291 : : * First search for a free entry in the array; that'll be sufficient in the
292 : : * majority of cases.
293 : : */
294 : : {
295 : : int i;
296 : :
297 [ + + ]: 34355961 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
298 : : {
3 andres@anarazel.de 299 [ + + ]:GNC 30538632 : if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
300 : : {
301 : 22421376 : ReservedRefCountSlot = i;
302 : :
303 : : /*
304 : : * We could return immediately, but iterating to the end of
305 : : * the array allows the compiler to auto-vectorize the loop.
306 : : */
307 : : }
308 : : }
309 : :
310 [ + + ]: 3817329 : if (ReservedRefCountSlot != -1)
311 : 3643470 : return;
312 : : }
313 : :
314 : : /*
315 : : * No luck. All array entries are full. Move one array entry into the hash
316 : : * table.
317 : : */
318 : : {
319 : : /*
320 : : * Move entry from the current clock position in the array into the
321 : : * hashtable. Use that slot.
322 : : */
323 : : int victim_slot;
324 : : PrivateRefCountEntry *victim_entry;
325 : : PrivateRefCountEntry *hashent;
326 : : bool found;
327 : :
328 : : /* select victim slot */
329 : 173859 : victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
330 : 173859 : victim_entry = &PrivateRefCountArray[victim_slot];
331 : 173859 : ReservedRefCountSlot = victim_slot;
332 : :
333 : : /* Better be used, otherwise we shouldn't get here. */
334 [ - + ]: 173859 : Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
335 [ - + ]: 173859 : Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
336 [ - + ]: 173859 : Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
337 : :
338 : : /* enter victim array entry into hashtable */
3985 andres@anarazel.de 339 :CBC 173859 : hashent = hash_search(PrivateRefCountHash,
3 andres@anarazel.de 340 :GNC 173859 : &PrivateRefCountArrayKeys[victim_slot],
341 : : HASH_ENTER,
342 : : &found);
3985 andres@anarazel.de 343 [ - + ]:CBC 173859 : Assert(!found);
344 : : /* move data from the entry in the array to the hash entry */
3 andres@anarazel.de 345 :GNC 173859 : hashent->data = victim_entry->data;
346 : :
347 : : /* clear the now free array slot */
348 : 173859 : PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
349 : 173859 : victim_entry->buffer = InvalidBuffer;
350 : :
351 : : /* clear the whole data member, just for future proofing */
352 : 173859 : memset(&victim_entry->data, 0, sizeof(victim_entry->data));
353 : 173859 : victim_entry->data.refcount = 0;
354 : :
3985 andres@anarazel.de 355 :CBC 173859 : PrivateRefCountOverflowed++;
356 : : }
357 : : }
358 : :
359 : : /*
360 : : * Fill a previously reserved refcount entry.
361 : : */
362 : : static PrivateRefCountEntry *
363 : 55395505 : NewPrivateRefCountEntry(Buffer buffer)
364 : : {
365 : : PrivateRefCountEntry *res;
366 : :
367 : : /* only allowed to be called when a reservation has been made */
3 andres@anarazel.de 368 [ - + ]:GNC 55395505 : Assert(ReservedRefCountSlot != -1);
369 : :
370 : : /* use up the reserved entry */
371 : 55395505 : res = &PrivateRefCountArray[ReservedRefCountSlot];
372 : :
373 : : /* and fill it */
374 : 55395505 : PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
3985 andres@anarazel.de 375 :CBC 55395505 : res->buffer = buffer;
3 andres@anarazel.de 376 :GNC 55395505 : res->data.refcount = 0;
377 : :
378 : : /* update cache for the next lookup */
379 : 55395505 : PrivateRefCountEntryLast = ReservedRefCountSlot;
380 : :
381 : 55395505 : ReservedRefCountSlot = -1;
382 : :
3985 andres@anarazel.de 383 :CBC 55395505 : return res;
384 : : }
385 : :
386 : : /*
387 : : * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
388 : : * inlining. This particularly seems to be true if the compiler is capable of
389 : : * auto-vectorizing the code, as that imposes additional stack-alignment
390 : : * requirements etc.
391 : : */
392 : : static pg_noinline PrivateRefCountEntry *
3 andres@anarazel.de 393 :GNC 74819804 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
394 : : {
395 : : PrivateRefCountEntry *res;
396 : 74819804 : int match = -1;
397 : : int i;
398 : :
399 : : /*
400 : : * First search for references in the array, that'll be sufficient in the
401 : : * majority of cases.
402 : : */
4127 andres@anarazel.de 403 [ + + ]:CBC 673378236 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
404 : : {
3 andres@anarazel.de 405 [ + + ]:GNC 598558432 : if (PrivateRefCountArrayKeys[i] == buffer)
406 : : {
407 : 19996688 : match = i;
408 : : /* see ReservePrivateRefCountEntry() for why we don't return */
409 : : }
410 : : }
411 : :
412 [ + + ]: 74819804 : if (likely(match != -1))
413 : : {
414 : : /* update cache for the next lookup */
415 : 19996688 : PrivateRefCountEntryLast = match;
416 : :
417 : 19996688 : return &PrivateRefCountArray[match];
418 : : }
419 : :
420 : : /*
421 : : * By here we know that the buffer, if already pinned, isn't residing in
422 : : * the array.
423 : : *
424 : : * Only look up the buffer in the hashtable if we've previously overflowed
425 : : * into it.
426 : : */
3985 andres@anarazel.de 427 [ + + ]:CBC 54823116 : if (PrivateRefCountOverflowed == 0)
428 : 53205399 : return NULL;
429 : :
1045 peter@eisentraut.org 430 : 1617717 : res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
431 : :
3985 andres@anarazel.de 432 [ + + ]: 1617717 : if (res == NULL)
433 : 215179 : return NULL;
434 [ + + ]: 1402538 : else if (!do_move)
435 : : {
436 : : /* caller doesn't want us to move the hash entry into the array */
437 : 1381024 : return res;
438 : : }
439 : : else
440 : : {
441 : : /* move buffer from hashtable into the free array slot */
442 : : bool found;
443 : : PrivateRefCountEntry *free;
444 : :
445 : : /* Ensure there's a free array slot */
446 : 21514 : ReservePrivateRefCountEntry();
447 : :
448 : : /* Use up the reserved slot */
3 andres@anarazel.de 449 [ - + ]:GNC 21514 : Assert(ReservedRefCountSlot != -1);
450 : 21514 : free = &PrivateRefCountArray[ReservedRefCountSlot];
451 [ - + ]: 21514 : Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
3985 andres@anarazel.de 452 [ - + ]:CBC 21514 : Assert(free->buffer == InvalidBuffer);
453 : :
454 : : /* and fill it */
455 : 21514 : free->buffer = buffer;
3 andres@anarazel.de 456 :GNC 21514 : free->data = res->data;
457 : 21514 : PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
458 : : /* update cache for the next lookup */
459 : 21514 : PrivateRefCountEntryLast = match;
460 : :
461 : 21514 : ReservedRefCountSlot = -1;
462 : :
463 : :
464 : : /* delete from hashtable */
1045 peter@eisentraut.org 465 :CBC 21514 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
3985 andres@anarazel.de 466 [ - + ]: 21514 : Assert(found);
467 [ - + ]: 21514 : Assert(PrivateRefCountOverflowed > 0);
468 : 21514 : PrivateRefCountOverflowed--;
469 : :
470 : 21514 : return free;
471 : : }
472 : : }
473 : :
474 : : /*
475 : : * Return the PrivateRefCount entry for the passed buffer.
476 : : *
477 : : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
478 : : * do_move is true and the entry resides in the hashtable, the entry is
479 : : * optimized for frequent access by moving it to the array.
480 : : */
481 : : static inline PrivateRefCountEntry *
3 andres@anarazel.de 482 :GNC 594724143 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
483 : : {
484 [ - + ]: 594724143 : Assert(BufferIsValid(buffer));
485 [ - + ]: 594724143 : Assert(!BufferIsLocal(buffer));
486 : :
487 : : /*
488 : : * It's very common to look up the same buffer repeatedly. To make that
489 : : * fast, we have a one-entry cache.
490 : : *
491 : : * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
492 : : * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
493 : : * fewer addresses are computed and fewer cachelines are accessed. Whereas
494 : : * in GetPrivateRefCountEntrySlow()'s case, checking
495 : : * PrivateRefCountArrayKeys saves a lot of memory accesses.
496 : : */
497 [ + + ]: 594724143 : if (likely(PrivateRefCountEntryLast != -1) &&
498 [ + + ]: 594690023 : likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
499 : : {
500 : 519904339 : return &PrivateRefCountArray[PrivateRefCountEntryLast];
501 : : }
502 : :
503 : : /*
504 : : * The code for the cached lookup is small enough to be worth inlining
505 : : * into the caller. In the miss case however, that empirically doesn't
506 : : * seem worth it.
507 : : */
508 : 74819804 : return GetPrivateRefCountEntrySlow(buffer, do_move);
509 : : }
510 : :
511 : : /*
512 : : * Returns how many times the passed buffer is pinned by this backend.
513 : : *
514 : : * Only works for shared memory buffers!
515 : : */
516 : : static inline int32
4127 andres@anarazel.de 517 :CBC 460672473 : GetPrivateRefCount(Buffer buffer)
518 : : {
519 : : PrivateRefCountEntry *ref;
520 : :
521 [ - + ]: 460672473 : Assert(BufferIsValid(buffer));
522 [ - + ]: 460672473 : Assert(!BufferIsLocal(buffer));
523 : :
524 : : /*
525 : : * Not moving the entry - that's ok for the current users, but we might
526 : : * want to change this one day.
527 : : */
3985 528 : 460672473 : ref = GetPrivateRefCountEntry(buffer, false);
529 : :
4127 530 [ + + ]: 460672473 : if (ref == NULL)
531 : 11473 : return 0;
3 andres@anarazel.de 532 :GNC 460661000 : return ref->data.refcount;
533 : : }
534 : :
535 : : /*
536 : : * Release resources used to track the reference count of a buffer which we no
537 : : * longer have pinned and don't want to pin again immediately.
538 : : */
539 : : static void
4127 andres@anarazel.de 540 :CBC 55395505 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
541 : : {
3 andres@anarazel.de 542 [ - + ]:GNC 55395505 : Assert(ref->data.refcount == 0);
543 : :
4127 andres@anarazel.de 544 [ + - + + ]:CBC 55395505 : if (ref >= &PrivateRefCountArray[0] &&
545 : : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
546 : : {
547 : 55243160 : ref->buffer = InvalidBuffer;
3 andres@anarazel.de 548 :GNC 55243160 : PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
549 : :
550 : :
551 : : /*
552 : : * Mark the just used entry as reserved - in many scenarios that
553 : : * allows us to avoid ever having to search the array/hash for free
554 : : * entries.
555 : : */
556 : 55243160 : ReservedRefCountSlot = ref - PrivateRefCountArray;
557 : : }
558 : : else
559 : : {
560 : : bool found;
3861 bruce@momjian.us 561 :CBC 152345 : Buffer buffer = ref->buffer;
562 : :
1045 peter@eisentraut.org 563 : 152345 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
4127 andres@anarazel.de 564 [ - + ]: 152345 : Assert(found);
565 [ - + ]: 152345 : Assert(PrivateRefCountOverflowed > 0);
566 : 152345 : PrivateRefCountOverflowed--;
567 : : }
568 : 55395505 : }
569 : :
570 : : /*
571 : : * BufferIsPinned
572 : : * True iff the buffer is pinned (also checks for valid buffer number).
573 : : *
574 : : * NOTE: what we check here is that *this* backend holds a pin on
575 : : * the buffer. We do not care whether some other backend does.
576 : : */
577 : : #define BufferIsPinned(bufnum) \
578 : : ( \
579 : : !BufferIsValid(bufnum) ? \
580 : : false \
581 : : : \
582 : : BufferIsLocal(bufnum) ? \
583 : : (LocalRefCount[-(bufnum) - 1] > 0) \
584 : : : \
585 : : (GetPrivateRefCount(bufnum) > 0) \
586 : : )
587 : :
588 : :
589 : : static Buffer ReadBuffer_common(Relation rel,
590 : : SMgrRelation smgr, char smgr_persistence,
591 : : ForkNumber forkNum, BlockNumber blockNum,
592 : : ReadBufferMode mode, BufferAccessStrategy strategy);
593 : : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
594 : : ForkNumber fork,
595 : : BufferAccessStrategy strategy,
596 : : uint32 flags,
597 : : uint32 extend_by,
598 : : BlockNumber extend_upto,
599 : : Buffer *buffers,
600 : : uint32 *extended_by);
601 : : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
602 : : ForkNumber fork,
603 : : BufferAccessStrategy strategy,
604 : : uint32 flags,
605 : : uint32 extend_by,
606 : : BlockNumber extend_upto,
607 : : Buffer *buffers,
608 : : uint32 *extended_by);
609 : : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
610 : : bool skip_if_not_valid);
611 : : static void PinBuffer_Locked(BufferDesc *buf);
612 : : static void UnpinBuffer(BufferDesc *buf);
613 : : static void UnpinBufferNoOwner(BufferDesc *buf);
614 : : static void BufferSync(int flags);
615 : : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
616 : : WritebackContext *wb_context);
617 : : static void WaitIO(BufferDesc *buf);
618 : : static void AbortBufferIO(Buffer buffer);
619 : : static void shared_buffer_write_error_callback(void *arg);
620 : : static void local_buffer_write_error_callback(void *arg);
621 : : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
622 : : char relpersistence,
623 : : ForkNumber forkNum,
624 : : BlockNumber blockNum,
625 : : BufferAccessStrategy strategy,
626 : : bool *foundPtr, IOContext io_context);
627 : : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
628 : : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
629 : : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
630 : : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
631 : : IOObject io_object, IOContext io_context);
632 : : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
633 : : IOObject io_object, IOContext io_context);
634 : : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
635 : : ForkNumber forkNum,
636 : : BlockNumber nForkBlock,
637 : : BlockNumber firstDelBlock);
638 : : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
639 : : RelFileLocator dstlocator,
640 : : ForkNumber forkNum, bool permanent);
641 : : static void AtProcExit_Buffers(int code, Datum arg);
642 : : static void CheckForBufferLeaks(void);
643 : : #ifdef USE_ASSERT_CHECKING
644 : : static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
645 : : void *unused_context);
646 : : #endif
647 : : static int rlocator_comparator(const void *p1, const void *p2);
648 : : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
649 : : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
650 : : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
651 : :
652 : :
653 : : /*
654 : : * Implementation of PrefetchBuffer() for shared buffers.
655 : : */
656 : : PrefetchBufferResult
2079 tmunro@postgresql.or 657 : 32278 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
658 : : ForkNumber forkNum,
659 : : BlockNumber blockNum)
660 : : {
661 : 32278 : PrefetchBufferResult result = {InvalidBuffer, false};
662 : : BufferTag newTag; /* identity of requested block */
663 : : uint32 newHash; /* hash value for newTag */
664 : : LWLock *newPartitionLock; /* buffer partition lock for it */
665 : : int buf_id;
666 : :
667 [ - + ]: 32278 : Assert(BlockNumberIsValid(blockNum));
668 : :
669 : : /* create a tag so we can lookup the buffer */
1239 rhaas@postgresql.org 670 : 32278 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
671 : : forkNum, blockNum);
672 : :
673 : : /* determine its hash code and partition lock ID */
2079 tmunro@postgresql.or 674 : 32278 : newHash = BufTableHashCode(&newTag);
675 : 32278 : newPartitionLock = BufMappingPartitionLock(newHash);
676 : :
677 : : /* see if the block is in the buffer pool already */
678 : 32278 : LWLockAcquire(newPartitionLock, LW_SHARED);
679 : 32278 : buf_id = BufTableLookup(&newTag, newHash);
680 : 32278 : LWLockRelease(newPartitionLock);
681 : :
682 : : /* If not in buffers, initiate prefetch */
683 [ + + ]: 32278 : if (buf_id < 0)
684 : : {
685 : : #ifdef USE_PREFETCH
686 : : /*
687 : : * Try to initiate an asynchronous read. This returns false in
688 : : * recovery if the relation file doesn't exist.
689 : : */
984 690 [ + + + - ]: 16586 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
732 691 : 8181 : smgrprefetch(smgr_reln, forkNum, blockNum, 1))
692 : : {
2079 693 : 8181 : result.initiated_io = true;
694 : : }
695 : : #endif /* USE_PREFETCH */
696 : : }
697 : : else
698 : : {
699 : : /*
700 : : * Report the buffer it was in at that time. The caller may be able
701 : : * to avoid a buffer table lookup, but it's not pinned and it must be
702 : : * rechecked!
703 : : */
704 : 23873 : result.recent_buffer = buf_id + 1;
705 : : }
706 : :
707 : : /*
708 : : * If the block *is* in buffers, we do nothing. This is not really ideal:
709 : : * the block might be just about to be evicted, which would be stupid
710 : : * since we know we are going to need it soon. But the only easy answer
711 : : * is to bump the usage_count, which does not seem like a great solution:
712 : : * when the caller does ultimately touch the block, usage_count would get
713 : : * bumped again, resulting in too much favoritism for blocks that are
714 : : * involved in a prefetch sequence. A real fix would involve some
715 : : * additional per-buffer state, and it's not clear that there's enough of
716 : : * a problem to justify that.
717 : : */
718 : :
719 : 32278 : return result;
720 : : }
721 : :
722 : : /*
723 : : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
724 : : *
725 : : * This is named by analogy to ReadBuffer but doesn't actually allocate a
726 : : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
727 : : * block will not be delayed by the I/O. Prefetching is optional.
728 : : *
729 : : * There are three possible outcomes:
730 : : *
731 : : * 1. If the block is already cached, the result includes a valid buffer that
732 : : * could be used by the caller to avoid the need for a later buffer lookup, but
733 : : * it's not pinned, so the caller must recheck it.
734 : : *
735 : : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
736 : : * true. Currently there is no way to know if the data was already cached by
737 : : * the kernel and therefore didn't really initiate I/O, and no way to know when
738 : : * the I/O completes other than using synchronous ReadBuffer().
739 : : *
740 : : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
741 : : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
742 : : * lack of a kernel facility), direct I/O is enabled, or the underlying
743 : : * relation file wasn't found and we are in recovery. (If the relation file
744 : : * wasn't found and we are not in recovery, an error is raised).
745 : : */
746 : : PrefetchBufferResult
6183 tgl@sss.pgh.pa.us 747 : 22561 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
748 : : {
749 [ - + ]: 22561 : Assert(RelationIsValid(reln));
750 [ - + ]: 22561 : Assert(BlockNumberIsValid(blockNum));
751 : :
5483 rhaas@postgresql.org 752 [ + + ]: 22561 : if (RelationUsesLocalBuffers(reln))
753 : : {
754 : : /* see comments in ReadBufferExtended */
6105 tgl@sss.pgh.pa.us 755 [ + - - + ]: 799 : if (RELATION_IS_OTHER_TEMP(reln))
6105 tgl@sss.pgh.pa.us 756 [ # # ]:UBC 0 : ereport(ERROR,
757 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
758 : : errmsg("cannot access temporary tables of other sessions")));
759 : :
760 : : /* pass it off to localbuf.c */
1619 tgl@sss.pgh.pa.us 761 :CBC 799 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
762 : : }
763 : : else
764 : : {
765 : : /* pass it to the shared buffer version */
766 : 21762 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
767 : : }
768 : : }
769 : :
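/*
 * A hedged caller-side sketch (variable names here are hypothetical, not from
 * this file): a caller can remember recent_buffer from the prefetch result and
 * later try ReadRecentBuffer() before falling back to a regular read:
 *
 *		PrefetchBufferResult r = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
 *		...
 *		if (BufferIsValid(r.recent_buffer) &&
 *			ReadRecentBuffer(rlocator, MAIN_FORKNUM, blkno, r.recent_buffer))
 *			buf = r.recent_buffer;
 *		else
 *			buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
 */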
770 : : /*
771 : : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
772 : : *
773 : : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
774 : : * successful. Return true if the buffer is valid and still has the expected
775 : : * tag. In that case, the buffer is pinned and the usage count is bumped.
776 : : */
777 : : bool
1260 rhaas@postgresql.org 778 : 4104 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
779 : : Buffer recent_buffer)
780 : : {
781 : : BufferDesc *bufHdr;
782 : : BufferTag tag;
783 : : uint32 buf_state;
784 : :
1714 tmunro@postgresql.or 785 [ - + ]: 4104 : Assert(BufferIsValid(recent_buffer));
786 : :
770 heikki.linnakangas@i 787 : 4104 : ResourceOwnerEnlarge(CurrentResourceOwner);
1714 tmunro@postgresql.or 788 : 4104 : ReservePrivateRefCountEntry();
1239 rhaas@postgresql.org 789 : 4104 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
790 : :
1714 tmunro@postgresql.or 791 [ + + ]: 4104 : if (BufferIsLocal(recent_buffer))
792 : : {
1241 heikki.linnakangas@i 793 : 48 : int b = -recent_buffer - 1;
794 : :
795 : 48 : bufHdr = GetLocalBufferDescriptor(b);
1714 tmunro@postgresql.or 796 : 48 : buf_state = pg_atomic_read_u32(&bufHdr->state);
797 : :
798 : : /* Is it still valid and holding the right tag? */
1239 rhaas@postgresql.org 799 [ + - + - ]: 48 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
800 : : {
987 andres@anarazel.de 801 : 48 : PinLocalBuffer(bufHdr, true);
802 : :
1350 tmunro@postgresql.or 803 : 48 : pgBufferUsage.local_blks_hit++;
804 : :
1714 805 : 48 : return true;
806 : : }
807 : : }
808 : : else
809 : : {
810 : 4056 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
811 : :
812 : : /*
813 : : * Is it still valid and holding the right tag? We do an unlocked tag
814 : : * comparison first, to make it unlikely that we'll increment the
815 : : * usage counter of the wrong buffer, if someone calls us with a very
816 : : * out of date recent_buffer. Then we'll check it again if we get the
817 : : * pin.
818 : : */
70 andres@anarazel.de 819 [ + + + + ]:GNC 8064 : if (BufferTagsEqual(&tag, &bufHdr->tag) &&
820 : 4008 : PinBuffer(bufHdr, NULL, true))
821 : : {
822 [ + - ]: 4005 : if (BufferTagsEqual(&tag, &bufHdr->tag))
823 : : {
824 : 4005 : pgBufferUsage.shared_blks_hit++;
825 : 4005 : return true;
826 : : }
70 andres@anarazel.de 827 :UNC 0 : UnpinBuffer(bufHdr);
828 : : }
829 : : }
830 : :
1714 tmunro@postgresql.or 831 :CBC 51 : return false;
832 : : }
833 : :
834 : : /*
835 : : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
836 : : * fork with RBM_NORMAL mode and default strategy.
837 : : */
838 : : Buffer
6256 heikki.linnakangas@i 839 : 40927513 : ReadBuffer(Relation reln, BlockNumber blockNum)
840 : : {
841 : 40927513 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
842 : : }
843 : :
844 : : /*
845 : : * ReadBufferExtended -- returns a buffer containing the requested
846 : : * block of the requested relation. If the blknum
847 : : * requested is P_NEW, extend the relation file and
848 : : * allocate a new block. (Caller is responsible for
849 : : * ensuring that only one backend tries to extend a
850 : : * relation at the same time!)
851 : : *
852 : : * Returns: the buffer number for the buffer containing
853 : : * the block read. The returned buffer has been pinned.
854 : : * Does not return on error --- elog's instead.
855 : : *
856 : : * Assume that when this function is called, reln has been opened already.
857 : : *
858 : : * In RBM_NORMAL mode, the page is read from disk, and the page header is
859 : : * validated. An error is thrown if the page header is not valid. (But
860 : : * note that an all-zero page is considered "valid"; see
861 : : * PageIsVerified().)
862 : : *
863 : : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
864 : : * valid, the page is zeroed instead of throwing an error. This is intended
865 : : * for non-critical data, where the caller is prepared to repair errors.
866 : : *
867 : : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
868 : : * filled with zeros instead of reading it from disk. Useful when the caller
869 : : * is going to fill the page from scratch, since this saves I/O and avoids
870 : : * unnecessary failure if the page-on-disk has corrupt page headers.
871 : : * The page is returned locked to ensure that the caller has a chance to
872 : : * initialize the page before it's made visible to others.
873 : : * Caution: do not use this mode to read a page that is beyond the relation's
874 : : * current physical EOF; that is likely to cause problems in md.c when
875 : : * the page is modified and written out. P_NEW is OK, though.
876 : : *
877 : : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
878 : : * a cleanup-strength lock on the page.
879 : : *
880 : : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
881 : : *
882 : : * If strategy is not NULL, a nondefault buffer access strategy is used.
883 : : * See buffer/README for details.
884 : : */
885 : : inline Buffer
886 : 49268811 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
887 : : ReadBufferMode mode, BufferAccessStrategy strategy)
888 : : {
889 : : Buffer buf;
890 : :
891 : : /*
892 : : * Reject attempts to read non-local temporary relations; we would be
893 : : * likely to get wrong data since we have no visibility into the owning
894 : : * session's local buffers.
895 : : */
6105 tgl@sss.pgh.pa.us 896 [ + + - + ]: 49268811 : if (RELATION_IS_OTHER_TEMP(reln))
6105 tgl@sss.pgh.pa.us 897 [ # # ]:UBC 0 : ereport(ERROR,
898 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
899 : : errmsg("cannot access temporary tables of other sessions")));
900 : :
901 : : /*
902 : : * Read the buffer, and update pgstat counters to reflect a cache hit or
903 : : * miss.
904 : : */
623 tmunro@postgresql.or 905 :CBC 49268811 : buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
906 : : forkNum, blockNum, mode, strategy);
907 : :
6397 heikki.linnakangas@i 908 : 49268787 : return buf;
909 : : }
910 : :
911 : :
912 : : /*
913 : : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
914 : : * a relcache entry for the relation.
915 : : *
916 : : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
917 : : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
918 : : * cannot be used for temporary relations (and making that work might be
919 : : * difficult, unless we only want to read temporary relations for our own
920 : : * ProcNumber).
921 : : */
922 : : Buffer
1260 rhaas@postgresql.org 923 : 5770044 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
924 : : BlockNumber blockNum, ReadBufferMode mode,
925 : : BufferAccessStrategy strategy, bool permanent)
926 : : {
654 heikki.linnakangas@i 927 : 5770044 : SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
928 : :
623 tmunro@postgresql.or 929 [ + - ]: 5770044 : return ReadBuffer_common(NULL, smgr,
930 : : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
931 : : forkNum, blockNum,
932 : : mode, strategy);
933 : : }
934 : :
935 : : /*
936 : : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
937 : : */
938 : : Buffer
847 939 : 45470 : ExtendBufferedRel(BufferManagerRelation bmr,
940 : : ForkNumber forkNum,
941 : : BufferAccessStrategy strategy,
942 : : uint32 flags)
943 : : {
944 : : Buffer buf;
987 andres@anarazel.de 945 : 45470 : uint32 extend_by = 1;
946 : :
847 tmunro@postgresql.or 947 : 45470 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
948 : : &buf, &extend_by);
949 : :
987 andres@anarazel.de 950 : 45470 : return buf;
951 : : }
952 : :
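/*
 * A hedged usage sketch (names are illustrative): callers that need a new,
 * zero-filled page typically request it locked and then initialize it:
 *
 *		buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
 *		page = BufferGetPage(buf);
 *		PageInit(page, BufferGetPageSize(buf), 0);
 *		...
 *		MarkBufferDirty(buf);
 *		UnlockReleaseBuffer(buf);
 */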
953 : : /*
954 : : * Extend relation by multiple blocks.
955 : : *
956 : : * Tries to extend the relation by extend_by blocks. Depending on the
957 : : * availability of resources the relation may end up being extended by a
958 : : * smaller number of pages (unless an error is thrown, always by at least one
959 : : * page). *extended_by is updated to the number of pages by which the relation
960 : : * was actually extended.
961 : : *
962 : : * buffers needs to be an array that is at least extend_by long. Upon
963 : : * completion, the first extend_by array elements will point to a pinned
964 : : * buffer.
965 : : *
966 : : * If EB_LOCK_FIRST is part of flags, the first returned buffer is
967 : : * locked. This is useful for callers that want a buffer that is guaranteed to
968 : : * be empty.
969 : : */
970 : : BlockNumber
847 tmunro@postgresql.or 971 : 160056 : ExtendBufferedRelBy(BufferManagerRelation bmr,
972 : : ForkNumber fork,
973 : : BufferAccessStrategy strategy,
974 : : uint32 flags,
975 : : uint32 extend_by,
976 : : Buffer *buffers,
977 : : uint32 *extended_by)
978 : : {
979 [ - + ]: 160056 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
57 alvherre@kurilemu.de 980 [ - + - - ]:GNC 160056 : Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
987 andres@anarazel.de 981 [ - + ]:CBC 160056 : Assert(extend_by > 0);
982 : :
57 alvherre@kurilemu.de 983 [ + - ]:GNC 160056 : if (bmr.relpersistence == '\0')
847 tmunro@postgresql.or 984 :CBC 160056 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
985 : :
986 : 160056 : return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
987 : : extend_by, InvalidBlockNumber,
988 : : buffers, extended_by);
989 : : }
990 : :
991 : : /*
992 : : * Extend the relation so it is at least extend_to blocks large, and return
993 : : * the buffer for block (extend_to - 1).
994 : : *
995 : : * This is useful for callers that want to write a specific page, regardless
996 : : * of the current size of the relation (e.g. useful for visibilitymap and for
997 : : * crash recovery).
998 : : */
999 : : Buffer
1000 : 50749 : ExtendBufferedRelTo(BufferManagerRelation bmr,
1001 : : ForkNumber fork,
1002 : : BufferAccessStrategy strategy,
1003 : : uint32 flags,
1004 : : BlockNumber extend_to,
1005 : : ReadBufferMode mode)
1006 : : {
1007 : : BlockNumber current_size;
987 andres@anarazel.de 1008 : 50749 : uint32 extended_by = 0;
1009 : 50749 : Buffer buffer = InvalidBuffer;
1010 : : Buffer buffers[64];
1011 : :
847 tmunro@postgresql.or 1012 [ - + ]: 50749 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
57 alvherre@kurilemu.de 1013 [ + + - + ]:GNC 50749 : Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
987 andres@anarazel.de 1014 [ + - - + ]:CBC 50749 : Assert(extend_to != InvalidBlockNumber && extend_to > 0);
1015 : :
57 alvherre@kurilemu.de 1016 [ + + ]:GNC 50749 : if (bmr.relpersistence == '\0')
847 tmunro@postgresql.or 1017 :CBC 7178 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1018 : :
1019 : : /*
1020 : : * If desired, create the file if it doesn't exist. If
1021 : : * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1022 : : * an smgrexists call.
1023 : : */
987 andres@anarazel.de 1024 [ + + ]: 50749 : if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
57 alvherre@kurilemu.de 1025 [ + - + + ]:GNC 7178 : (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1026 [ + - - + ]: 20 : BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1027 [ + - + + ]: 7158 : !smgrexists(BMR_GET_SMGR(bmr), fork))
1028 : : {
847 tmunro@postgresql.or 1029 :CBC 7139 : LockRelationForExtension(bmr.rel, ExclusiveLock);
1030 : :
1031 : : /* recheck, fork might have been created concurrently */
57 alvherre@kurilemu.de 1032 [ + - + + ]:GNC 7139 : if (!smgrexists(BMR_GET_SMGR(bmr), fork))
1033 [ + - ]: 7137 : smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
1034 : :
847 tmunro@postgresql.or 1035 :CBC 7139 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1036 : : }
1037 : :
1038 : : /*
1039 : : * If requested, invalidate size cache, so that smgrnblocks asks the
1040 : : * kernel.
1041 : : */
987 andres@anarazel.de 1042 [ + + ]: 50749 : if (flags & EB_CLEAR_SIZE_CACHE)
57 alvherre@kurilemu.de 1043 [ + - ]:GNC 7178 : BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1044 : :
1045 : : /*
1046 : : * Estimate how many pages we'll need to extend by. This avoids acquiring
1047 : : * unnecessarily many victim buffers.
1048 : : */
1049 [ + + ]: 50749 : current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
1050 : :
1051 : : /*
1052 : : * Since no-one else can be looking at the page contents yet, there is no
1053 : : * difference between an exclusive lock and a cleanup-strength lock. Note
1054 : : * that we pass the original mode to ReadBuffer_common() below, when
1055 : : * falling back to reading the buffer to a concurrent relation extension.
1056 : : */
978 andres@anarazel.de 1057 [ + + - + ]:CBC 50749 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
987 1058 : 43198 : flags |= EB_LOCK_TARGET;
1059 : :
1060 [ + + ]: 103624 : while (current_size < extend_to)
1061 : : {
1062 : 52875 : uint32 num_pages = lengthof(buffers);
1063 : : BlockNumber first_block;
1064 : :
1065 [ + + ]: 52875 : if ((uint64) current_size + num_pages > extend_to)
1066 : 52809 : num_pages = extend_to - current_size;
1067 : :
847 tmunro@postgresql.or 1068 : 52875 : first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1069 : : num_pages, extend_to,
1070 : : buffers, &extended_by);
1071 : :
987 andres@anarazel.de 1072 : 52875 : current_size = first_block + extended_by;
1073 [ - + - - ]: 52875 : Assert(num_pages != 0 || current_size >= extend_to);
1074 : :
820 peter@eisentraut.org 1075 [ + + ]: 113105 : for (uint32 i = 0; i < extended_by; i++)
1076 : : {
987 andres@anarazel.de 1077 [ + + ]: 60230 : if (first_block + i != extend_to - 1)
1078 : 9488 : ReleaseBuffer(buffers[i]);
1079 : : else
1080 : 50742 : buffer = buffers[i];
1081 : : }
1082 : : }
1083 : :
1084 : : /*
1085 : : * It's possible that another backend concurrently extended the relation.
1086 : : * In that case read the buffer.
1087 : : *
1088 : : * XXX: Should we control this via a flag?
1089 : : */
1090 [ + + ]: 50749 : if (buffer == InvalidBuffer)
1091 : : {
1092 [ - + ]: 7 : Assert(extended_by == 0);
57 alvherre@kurilemu.de 1093 [ + - ]:GNC 7 : buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1094 : : fork, extend_to - 1, mode, strategy);
1095 : : }
1096 : :
987 andres@anarazel.de 1097 :CBC 50749 : return buffer;
1098 : : }
1099 : :
1100 : : /*
1101 : : * Lock and optionally zero a buffer, as part of the implementation of
1102 : : * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1103 : : * pinned. If the buffer is not already valid, it is zeroed and made valid.
1104 : : */
1105 : : static void
555 tmunro@postgresql.or 1106 : 316618 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
1107 : : {
1108 : : BufferDesc *bufHdr;
1109 : : bool need_to_zero;
1110 : 316618 : bool isLocalBuf = BufferIsLocal(buffer);
1111 : :
623 1112 [ + + - + ]: 316618 : Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1113 : :
555 1114 [ + + ]: 316618 : if (already_valid)
1115 : : {
1116 : : /*
1117 : : * If the caller already knew the buffer was valid, we can skip some
1118 : : * header interaction. The caller just wants to lock the buffer.
1119 : : */
1120 : 37355 : need_to_zero = false;
1121 : : }
1122 [ + + ]: 279263 : else if (isLocalBuf)
1123 : : {
1124 : : /* Simple case for non-shared buffers. */
623 1125 : 36 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
262 andres@anarazel.de 1126 : 36 : need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1127 : : }
1128 : : else
1129 : : {
1130 : : /*
1131 : : * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1132 : : * concurrently. Even though we aren't doing I/O, that ensures that
1133 : : * we don't zero a page that someone else has pinned. An exclusive
1134 : : * content lock wouldn't be enough, because readers are allowed to
1135 : : * drop the content lock after determining that a tuple is visible
1136 : : * (see buffer access rules in README).
1137 : : */
623 tmunro@postgresql.or 1138 : 279227 : bufHdr = GetBufferDescriptor(buffer - 1);
555 1139 : 279227 : need_to_zero = StartBufferIO(bufHdr, true, false);
1140 : : }
1141 : :
1142 [ + + ]: 316618 : if (need_to_zero)
1143 : : {
1144 : 279263 : memset(BufferGetPage(buffer), 0, BLCKSZ);
1145 : :
1146 : : /*
1147 : : * Grab the buffer content lock before marking the page as valid, to
1148 : : * make sure that no other backend sees the zeroed page before the
1149 : : * caller has had a chance to initialize it.
1150 : : *
1151 : : * Since no-one else can be looking at the page contents yet, there is
1152 : : * no difference between an exclusive lock and a cleanup-strength
1153 : : * lock. (Note that we cannot use LockBuffer() or
1154 : : * LockBufferForCleanup() here, because they assert that the buffer is
1155 : : * already valid.)
1156 : : */
1157 [ + + ]: 279263 : if (!isLocalBuf)
70 andres@anarazel.de 1158 :GNC 279227 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1159 : :
1160 : : /* Set BM_VALID, terminate IO, and wake up any waiters */
555 tmunro@postgresql.or 1161 [ + + ]:CBC 279263 : if (isLocalBuf)
262 andres@anarazel.de 1162 : 36 : TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1163 : : else
1164 : 279227 : TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1165 : : }
555 tmunro@postgresql.or 1166 [ + + ]: 37355 : else if (!isLocalBuf)
1167 : : {
1168 : : /*
1169 : : * The buffer is valid, so we can't zero it. The caller still expects
1170 : : * the page to be locked on return.
1171 : : */
1172 [ + + ]: 37325 : if (mode == RBM_ZERO_AND_LOCK)
1173 : 37258 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1174 : : else
1175 : 67 : LockBufferForCleanup(buffer);
1176 : : }
623 1177 : 316618 : }
1178 : :
1179 : : /*
1180 : : * Pin a buffer for a given block. *foundPtr is set to true if the block was
1181 : : * already present, or false if more work is required to either read it in or
1182 : : * zero it.
1183 : : */
1184 : : static pg_attribute_always_inline Buffer
1185 : 58558613 : PinBufferForBlock(Relation rel,
1186 : : SMgrRelation smgr,
1187 : : char persistence,
1188 : : ForkNumber forkNum,
1189 : : BlockNumber blockNum,
1190 : : BufferAccessStrategy strategy,
1191 : : bool *foundPtr)
1192 : : {
1193 : : BufferDesc *bufHdr;
1194 : : IOContext io_context;
1195 : : IOObject io_object;
1196 : :
1197 [ - + ]: 58558613 : Assert(blockNum != P_NEW);
1198 : :
1199 : : /* Persistence should be set before */
515 noah@leadboat.com 1200 [ + + + + : 58558613 : Assert((persistence == RELPERSISTENCE_TEMP ||
- + ]
1201 : : persistence == RELPERSISTENCE_PERMANENT ||
1202 : : persistence == RELPERSISTENCE_UNLOGGED));
1203 : :
623 tmunro@postgresql.or 1204 [ + + ]: 58558613 : if (persistence == RELPERSISTENCE_TEMP)
1205 : : {
1206 : 1535415 : io_context = IOCONTEXT_NORMAL;
1207 : 1535415 : io_object = IOOBJECT_TEMP_RELATION;
1208 : : }
1209 : : else
1210 : : {
1211 : 57023198 : io_context = IOContextForStrategy(strategy);
1212 : 57023198 : io_object = IOOBJECT_RELATION;
1213 : : }
1214 : :
1215 : : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1216 : : smgr->smgr_rlocator.locator.spcOid,
1217 : : smgr->smgr_rlocator.locator.dbOid,
1218 : : smgr->smgr_rlocator.locator.relNumber,
1219 : : smgr->smgr_rlocator.backend);
1220 : :
1221 [ + + ]: 58558613 : if (persistence == RELPERSISTENCE_TEMP)
1222 : : {
1223 : 1535415 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1224 [ + + ]: 1535409 : if (*foundPtr)
5846 rhaas@postgresql.org 1225 : 1526983 : pgBufferUsage.local_blks_hit++;
1226 : : }
1227 : : else
1228 : : {
623 tmunro@postgresql.or 1229 : 57023198 : bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1230 : : strategy, foundPtr, io_context);
1231 [ + + ]: 57023198 : if (*foundPtr)
1232 : 55263042 : pgBufferUsage.shared_blks_hit++;
1233 : : }
1234 [ + + ]: 58558607 : if (rel)
1235 : : {
1236 : : /*
1237 : : * While pgBufferUsage's "read" counter isn't bumped unless we reach
1238 : : * WaitReadBuffers() (so, not for hits, and not for buffers that are
1239 : : * zeroed instead), the per-relation stats always count them.
1240 : : */
1241 [ + + + + : 52554913 : pgstat_count_buffer_read(rel);
+ + ]
1242 [ + + ]: 52554913 : if (*foundPtr)
1243 [ + + - + : 51246090 : pgstat_count_buffer_hit(rel);
+ + ]
1244 : : }
1245 [ + + ]: 58558607 : if (*foundPtr)
1246 : : {
337 michael@paquier.xyz 1247 : 56790025 : pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
987 andres@anarazel.de 1248 [ + + ]: 56790025 : if (VacuumCostActive)
1249 : 109804 : VacuumCostBalance += VacuumCostPageHit;
1250 : :
1251 : : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1252 : : smgr->smgr_rlocator.locator.spcOid,
1253 : : smgr->smgr_rlocator.locator.dbOid,
1254 : : smgr->smgr_rlocator.locator.relNumber,
1255 : : smgr->smgr_rlocator.backend,
1256 : : true);
1257 : : }
1258 : :
623 tmunro@postgresql.or 1259 : 58558607 : return BufferDescriptorGetBuffer(bufHdr);
1260 : : }
1261 : :
1262 : : /*
1263 : : * ReadBuffer_common -- common logic for all ReadBuffer variants
1264 : : *
1265 : : * smgr is required, rel is optional unless using P_NEW.
1266 : : */
1267 : : static pg_attribute_always_inline Buffer
1268 : 55038862 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1269 : : ForkNumber forkNum,
1270 : : BlockNumber blockNum, ReadBufferMode mode,
1271 : : BufferAccessStrategy strategy)
1272 : : {
1273 : : ReadBuffersOperation operation;
1274 : : Buffer buffer;
1275 : : int flags;
1276 : : char persistence;
1277 : :
1278 : : /*
1279 : : * Backward compatibility path, most code should use ExtendBufferedRel()
1280 : : * instead, as acquiring the extension lock inside ExtendBufferedRel()
1281 : : * scales a lot better.
1282 : : */
1283 [ + + ]: 55038862 : if (unlikely(blockNum == P_NEW))
1284 : : {
1285 : 261 : uint32 flags = EB_SKIP_EXTENSION_LOCK;
1286 : :
1287 : : /*
1288 : : * Since no-one else can be looking at the page contents yet, there is
1289 : : * no difference between an exclusive lock and a cleanup-strength
1290 : : * lock.
1291 : : */
1292 [ + - - + ]: 261 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
623 tmunro@postgresql.or 1293 :UBC 0 : flags |= EB_LOCK_FIRST;
1294 : :
623 tmunro@postgresql.or 1295 :CBC 261 : return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1296 : : }
1297 : :
515 noah@leadboat.com 1298 [ + + ]: 55038601 : if (rel)
1299 : 49268557 : persistence = rel->rd_rel->relpersistence;
1300 : : else
1301 : 5770044 : persistence = smgr_persistence;
1302 : :
623 tmunro@postgresql.or 1303 [ + + + + : 55038601 : if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
+ + ]
1304 : : mode == RBM_ZERO_AND_LOCK))
1305 : : {
1306 : : bool found;
1307 : :
515 noah@leadboat.com 1308 : 316618 : buffer = PinBufferForBlock(rel, smgr, persistence,
1309 : : forkNum, blockNum, strategy, &found);
555 tmunro@postgresql.or 1310 : 316618 : ZeroAndLockBuffer(buffer, mode, found);
623 1311 : 316618 : return buffer;
1312 : : }
1313 : :
1314 : : /*
1315 : : * Signal that we are going to immediately wait. If we're immediately
1316 : : * waiting, there is no benefit in actually executing the IO
1318 : : * asynchronously; it would just add dispatch overhead.
1318 : : */
262 andres@anarazel.de 1319 : 54721983 : flags = READ_BUFFERS_SYNCHRONOUSLY;
623 tmunro@postgresql.or 1320 [ + + ]: 54721983 : if (mode == RBM_ZERO_ON_ERROR)
262 andres@anarazel.de 1321 : 778839 : flags |= READ_BUFFERS_ZERO_ON_ERROR;
623 tmunro@postgresql.or 1322 : 54721983 : operation.smgr = smgr;
1323 : 54721983 : operation.rel = rel;
515 noah@leadboat.com 1324 : 54721983 : operation.persistence = persistence;
623 tmunro@postgresql.or 1325 : 54721983 : operation.forknum = forkNum;
1326 : 54721983 : operation.strategy = strategy;
1327 [ + + ]: 54721983 : if (StartReadBuffer(&operation,
1328 : : &buffer,
1329 : : blockNum,
1330 : : flags))
1331 : 716029 : WaitReadBuffers(&operation);
1332 : :
1333 : 54721959 : return buffer;
1334 : : }
1335 : :
1336 : : static pg_attribute_always_inline bool
1337 : 58076069 : StartReadBuffersImpl(ReadBuffersOperation *operation,
1338 : : Buffer *buffers,
1339 : : BlockNumber blockNum,
1340 : : int *nblocks,
1341 : : int flags,
1342 : : bool allow_forwarding)
1343 : : {
1344 : 58076069 : int actual_nblocks = *nblocks;
435 andres@anarazel.de 1345 : 58076069 : int maxcombine = 0;
1346 : : bool did_start_io;
1347 : :
271 tmunro@postgresql.or 1348 [ + + - + ]: 58076069 : Assert(*nblocks == 1 || allow_forwarding);
623 1349 [ - + ]: 58076069 : Assert(*nblocks > 0);
1350 [ - + ]: 58076069 : Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1351 : :
1352 [ + + ]: 59565389 : for (int i = 0; i < actual_nblocks; ++i)
1353 : : {
1354 : : bool found;
1355 : :
271 1356 [ + + + + ]: 58243443 : if (allow_forwarding && buffers[i] != InvalidBuffer)
1357 : 1448 : {
1358 : : BufferDesc *bufHdr;
1359 : :
1360 : : /*
1361 : : * This is a buffer that was pinned by an earlier call to
1362 : : * StartReadBuffers(), but couldn't be handled in one operation at
1363 : : * that time. The operation was split, and the caller has passed
1364 : : * an already pinned buffer back to us to handle the rest of the
1365 : : * operation. It must continue at the expected block number.
1366 : : */
1367 [ - + ]: 1448 : Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1368 : :
1369 : : /*
1370 : : * It might be an already valid buffer (a hit) that followed the
1371 : : * final contiguous block of an earlier I/O (a miss) marking the
1372 : : * end of it, or a buffer that some other backend has since made
1373 : : * valid by performing the I/O for us, in which case we can handle
1374 : : * it as a hit now. It is safe to check for a BM_VALID flag with
1375 : : * a relaxed load, because we got a fresh view of it while pinning
1376 : : * it in the previous call.
1377 : : *
1378 : : * On the other hand if we don't see BM_VALID yet, it must be an
1379 : : * I/O that was split by the previous call and we need to try to
1380 : : * start a new I/O from this block. We're also racing against any
1381 : : * other backend that might start the I/O or even manage to mark
1382 : : * it BM_VALID after this check, but StartBufferIO() will handle
1383 : : * those cases.
1384 : : */
1385 [ + + ]: 1448 : if (BufferIsLocal(buffers[i]))
1386 : 3 : bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1387 : : else
1388 : 1445 : bufHdr = GetBufferDescriptor(buffers[i] - 1);
1389 [ - + ]: 1448 : Assert(pg_atomic_read_u32(&bufHdr->state) & BM_TAG_VALID);
1390 : 1448 : found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1391 : : }
1392 : : else
1393 : : {
1394 : 58241989 : buffers[i] = PinBufferForBlock(operation->rel,
271 tmunro@postgresql.or 1395 :ECB (56487034) : operation->smgr,
271 tmunro@postgresql.or 1396 :CBC 58241995 : operation->persistence,
1397 : : operation->forknum,
1398 : : blockNum + i,
1399 : : operation->strategy,
1400 : : &found);
1401 : : }
1402 : :
623 1403 [ + + ]: 58243437 : if (found)
1404 : : {
1405 : : /*
1406 : : * We have a hit. If it's the first block in the requested range,
1407 : : * we can return it immediately and report that WaitReadBuffers()
1408 : : * does not need to be called. If the initial value of *nblocks
1409 : : * was larger, the caller will have to call again for the rest.
1410 : : */
271 1411 [ + + ]: 56754117 : if (i == 0)
1412 : : {
1413 : 56752667 : *nblocks = 1;
1414 : :
1415 : : #ifdef USE_ASSERT_CHECKING
1416 : :
1417 : : /*
1418 : : * Initialize enough of ReadBuffersOperation to make
1419 : : * CheckReadBuffersOperation() work. Outside of assertions
1420 : : * that's not necessary when no IO is issued.
1421 : : */
262 andres@anarazel.de 1422 : 56752667 : operation->buffers = buffers;
1423 : 56752667 : operation->blocknum = blockNum;
1424 : 56752667 : operation->nblocks = 1;
1425 : 56752667 : operation->nblocks_done = 1;
1426 : 56752667 : CheckReadBuffersOperation(operation, true);
1427 : : #endif
271 tmunro@postgresql.or 1428 : 56752667 : return false;
1429 : : }
1430 : :
1431 : : /*
1432 : : * Otherwise we already have an I/O to perform, but this block
1433 : : * can't be included as it is already valid. Split the I/O here.
1434 : : * There may or may not be more blocks requiring I/O after this
1435 : : * one (we haven't checked), but they couldn't be combined into this
1436 : : * I/O anyway, because this already-valid block is in the way. We'll
1437 : : * leave this buffer pinned, forwarding it to the next call, avoiding
1438 : : * the need to unpin it here and re-pin it in the next call.
1439 : : */
1440 : 1450 : actual_nblocks = i;
623 1441 : 1450 : break;
1442 : : }
1443 : : else
1444 : : {
1445 : : /*
1446 : : * Check how many blocks we can cover with the same IO. The smgr
1447 : : * implementation might e.g. be limited due to a segment boundary.
1448 : : */
435 andres@anarazel.de 1449 [ + + + + ]: 1489320 : if (i == 0 && actual_nblocks > 1)
1450 : : {
1451 : 33195 : maxcombine = smgrmaxcombine(operation->smgr,
1452 : : operation->forknum,
1453 : : blockNum);
1454 [ - + ]: 33195 : if (unlikely(maxcombine < actual_nblocks))
1455 : : {
435 andres@anarazel.de 1456 [ # # ]:UBC 0 : elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1457 : : blockNum, actual_nblocks, maxcombine);
1458 : 0 : actual_nblocks = maxcombine;
1459 : : }
1460 : : }
1461 : : }
1462 : : }
623 tmunro@postgresql.or 1463 :CBC 1323396 : *nblocks = actual_nblocks;
1464 : :
1465 : : /* Populate information needed for I/O. */
1466 : 1323396 : operation->buffers = buffers;
1467 : 1323396 : operation->blocknum = blockNum;
1468 : 1323396 : operation->flags = flags;
1469 : 1323396 : operation->nblocks = actual_nblocks;
262 andres@anarazel.de 1470 : 1323396 : operation->nblocks_done = 0;
1471 : 1323396 : pgaio_wref_clear(&operation->io_wref);
1472 : :
1473 : : /*
1474 : : * When using AIO, start the IO in the background. If not, issue prefetch
1475 : : * requests if desired by the caller.
1476 : : *
1477 : : * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1478 : : * de-risk the introduction of AIO somewhat. It's a large architectural
1479 : : * change, with lots of chances for unanticipated performance effects.
1480 : : *
1481 : : * Use of IOMETHOD_SYNC already means that IO is not actually performed
1482 : : * asynchronously, but without the check here we'd execute IO earlier than
1483 : : * we used to. Eventually this IOMETHOD_SYNC-specific path should go away.
1484 : : */
1485 [ + + ]: 1323396 : if (io_method != IOMETHOD_SYNC)
1486 : : {
1487 : : /*
1488 : : * Try to start IO asynchronously. It's possible that no IO needs to
1489 : : * be started, if another backend already performed the IO.
1490 : : *
1491 : : * Note that if an IO is started, it might not cover the entire
1492 : : * requested range, e.g. because an intermediary block has been read
1493 : : * in by another backend. In that case any "trailing" buffers we
1494 : : * already pinned above will be "forwarded" by read_stream.c to the
1495 : : * next call to StartReadBuffers().
1496 : : *
1497 : : * This is signalled to the caller by decrementing *nblocks *and*
1498 : : * reducing operation->nblocks. The latter is done here, but not in
1499 : : * WaitReadBuffers() below, as there we can't "shorten" the overall
1500 : : * read size anymore; we need to retry until the read is done in its
1501 : : * entirety or has failed.
1502 : : */
1503 : 1322337 : did_start_io = AsyncReadBuffers(operation, nblocks);
1504 : :
1505 : 1322322 : operation->nblocks = *nblocks;
1506 : : }
1507 : : else
1508 : : {
1509 : 1059 : operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1510 : :
1511 [ + + ]: 1059 : if (flags & READ_BUFFERS_ISSUE_ADVICE)
1512 : : {
1513 : : /*
1514 : : * In theory we should only do this if PinBufferForBlock() had to
1515 : : * allocate new buffers above. That way, if two calls to
1516 : : * StartReadBuffers() were made for the same blocks before
1517 : : * WaitReadBuffers(), only the first would issue the advice.
1518 : : * That'd be a better simulation of true asynchronous I/O, which
1519 : : * would only start the I/O once, but isn't done here for
1520 : : * simplicity.
1521 : : */
1522 : 2 : smgrprefetch(operation->smgr,
1523 : : operation->forknum,
1524 : : blockNum,
1525 : : actual_nblocks);
1526 : : }
1527 : :
1528 : : /*
1529 : : * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1530 : : * will initiate the necessary IO.
1531 : : */
1532 : 1059 : did_start_io = true;
1533 : : }
1534 : :
1535 : 1323381 : CheckReadBuffersOperation(operation, !did_start_io);
1536 : :
1537 : 1323381 : return did_start_io;
1538 : : }
1539 : :
1540 : : /*
1541 : : * Begin reading a range of blocks beginning at blockNum and extending for
1542 : : * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1543 : : * the elements of the buffers array covered by *nblocks must hold either InvalidBuffer or
1544 : : * buffers forwarded by an earlier call to StartReadBuffers() that was split
1545 : : * and is now being continued. On return, *nblocks holds the number of blocks
1546 : : * accepted by this operation. If it is less than the original number then
1547 : : * this operation has been split, but buffer elements up to the original
1548 : : * requested size may hold forwarded buffers to be used for a continuing
1549 : : * operation. The caller must either start a new I/O beginning at the block
1550 : : * immediately following the blocks accepted by this call and pass those
1551 : : * buffers back in, or release them if it chooses not to. It shouldn't make
1552 : : * any other use of or assumptions about forwarded buffers.
1553 : : *
1554 : : * If false is returned, no I/O is necessary and the buffers covered by
1555 : : * *nblocks on exit are valid and ready to be accessed. If true is returned,
1556 : : * an I/O has been started, and WaitReadBuffers() must be called with the same
1557 : : * operation object before the buffers covered by *nblocks on exit can be
1558 : : * accessed. Along with the operation object, the caller-supplied array of
1559 : : * buffers must remain valid until WaitReadBuffers() is called, and any
1560 : : * forwarded buffers must also be preserved for a continuing call unless
1561 : : * they are explicitly released.
1562 : : */
1563 : : bool
623 tmunro@postgresql.or 1564 : 1489852 : StartReadBuffers(ReadBuffersOperation *operation,
1565 : : Buffer *buffers,
1566 : : BlockNumber blockNum,
1567 : : int *nblocks,
1568 : : int flags)
1569 : : {
271 1570 : 1489852 : return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1571 : : true /* expect forwarded buffers */ );
1572 : : }
1573 : :
1574 : : /*
1575 : : * Single-block version of StartReadBuffers(). This might save a few
1576 : : * instructions when called from another translation unit, because it is
1577 : : * specialized for nblocks == 1.
1578 : : *
1579 : : * This version does not support "forwarded" buffers: they cannot be created
1580 : : * by reading only one block and *buffer is ignored on entry.
1581 : : */
1582 : : bool
623 1583 : 56586217 : StartReadBuffer(ReadBuffersOperation *operation,
1584 : : Buffer *buffer,
1585 : : BlockNumber blocknum,
1586 : : int flags)
1587 : : {
1588 : 56586217 : int nblocks = 1;
1589 : : bool result;
1590 : :
271 1591 : 56586217 : result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1592 : : false /* single block, no forwarding */ );
623 1593 [ - + ]: 56586202 : Assert(nblocks == 1); /* single block can't be short */
1594 : :
1595 : 56586202 : return result;
1596 : : }
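 : :
 : : /*
 : :  * A minimal caller-side sketch of the two-step API above, assuming a
 : :  * relation "rel" and a block number "blkno" (real callers, such as the
 : :  * read stream code in read_stream.c, additionally deal with forwarded
 : :  * buffers and per-backend pin limits):
 : :  *
 : :  *     ReadBuffersOperation op;
 : :  *     Buffer      buf;
 : :  *
 : :  *     op.smgr = RelationGetSmgr(rel);
 : :  *     op.rel = rel;
 : :  *     op.persistence = rel->rd_rel->relpersistence;
 : :  *     op.forknum = MAIN_FORKNUM;
 : :  *     op.strategy = NULL;
 : :  *
 : :  *     if (StartReadBuffer(&op, &buf, blkno, 0))
 : :  *         WaitReadBuffers(&op);
 : :  *     ... buf is now valid and pinned; release it with ReleaseBuffer() ...
 : :  */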
1597 : :
1598 : : /*
1599 : : * Perform sanity checks on the ReadBuffersOperation.
1600 : : */
1601 : : static void
262 andres@anarazel.de 1602 : 60717251 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
1603 : : {
1604 : : #ifdef USE_ASSERT_CHECKING
1605 [ - + ]: 60717251 : Assert(operation->nblocks_done <= operation->nblocks);
1606 [ + + - + ]: 60717251 : Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1607 : :
1608 [ + + ]: 121933096 : for (int i = 0; i < operation->nblocks; i++)
1609 : : {
1610 : 61215845 : Buffer buffer = operation->buffers[i];
1611 : 61215845 : BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1612 [ + + ]: 61215845 : GetLocalBufferDescriptor(-buffer - 1) :
1613 : 59664308 : GetBufferDescriptor(buffer - 1);
1614 : :
1615 [ - + ]: 61215845 : Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1616 [ - + ]: 61215845 : Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_TAG_VALID);
1617 : :
1618 [ + + ]: 61215845 : if (i < operation->nblocks_done)
1619 [ - + ]: 58241633 : Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID);
1620 : : }
1621 : : #endif
1622 : 60717251 : }
1623 : :
1624 : : /* helper for ReadBuffersCanStartIO(), to avoid repetition */
1625 : : static inline bool
1626 : 1489319 : ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
1627 : : {
623 tmunro@postgresql.or 1628 [ + + ]: 1489319 : if (BufferIsLocal(buffer))
262 andres@anarazel.de 1629 : 8390 : return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1630 : : true, nowait);
1631 : : else
623 tmunro@postgresql.or 1632 : 1480929 : return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1633 : : }
1634 : :
1635 : : /*
1636 : : * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1637 : : */
1638 : : static inline bool
262 andres@anarazel.de 1639 : 1489319 : ReadBuffersCanStartIO(Buffer buffer, bool nowait)
1640 : : {
1641 : : /*
1642 : : * If this backend currently has staged IO, we need to submit the pending
1643 : : * IO before waiting for the right to issue IO, to avoid the potential for
1644 : : * deadlocks (and, more commonly, unnecessary delays for other backends).
1645 : : */
1646 [ + + + + ]: 1489319 : if (!nowait && pgaio_have_staged())
1647 : : {
1648 [ + - ]: 586 : if (ReadBuffersCanStartIOOnce(buffer, true))
1649 : 586 : return true;
1650 : :
1651 : : /*
1652 : : * Unfortunately StartBufferIO() returning false doesn't allow us to
1653 : : * distinguish between the buffer already being valid and IO already
1654 : : * being in progress. Since IO already being in progress is quite
1655 : : * rare, this approach seems fine.
1656 : : */
262 andres@anarazel.de 1657 :UBC 0 : pgaio_submit_staged();
1658 : : }
1659 : :
262 andres@anarazel.de 1660 :CBC 1488733 : return ReadBuffersCanStartIOOnce(buffer, nowait);
1661 : : }
1662 : :
1663 : : /*
1664 : : * Helper for WaitReadBuffers() that processes the results of a readv
1665 : : * operation, raising an error if necessary.
1666 : : */
1667 : : static void
1668 : 1320085 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
1669 : : {
1670 : 1320085 : PgAioReturn *aio_ret = &operation->io_return;
1671 : 1320085 : PgAioResultStatus rs = aio_ret->result.status;
1672 : 1320085 : int newly_read_blocks = 0;
1673 : :
1674 [ - + ]: 1320085 : Assert(pgaio_wref_valid(&operation->io_wref));
1675 [ - + ]: 1320085 : Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1676 : :
1677 : : /*
1678 : : * SMGR reports the number of blocks successfully read as the result of
1679 : : * the IO operation. Thus we can simply add that to ->nblocks_done.
1680 : : */
1681 : :
1682 [ + + ]: 1320085 : if (likely(rs != PGAIO_RS_ERROR))
1683 : 1320058 : newly_read_blocks = aio_ret->result.result;
1684 : :
1685 [ + + + + ]: 1320085 : if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1686 [ + + ]: 51 : pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1687 : : rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1688 [ - + ]: 1320034 : else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1689 : : {
1690 : : /*
1691 : : * We'll retry, so we just emit a debug message to the server log (or
1692 : : * not even that in prod scenarios).
1693 : : */
262 andres@anarazel.de 1694 :UBC 0 : pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1695 [ # # ]: 0 : elog(DEBUG3, "partial read, will retry");
1696 : : }
1697 : :
262 andres@anarazel.de 1698 [ - + ]:CBC 1320058 : Assert(newly_read_blocks > 0);
1699 [ - + ]: 1320058 : Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1700 : :
1701 : 1320058 : operation->nblocks_done += newly_read_blocks;
1702 : :
1703 [ - + ]: 1320058 : Assert(operation->nblocks_done <= operation->nblocks);
1704 : 1320058 : }
1705 : :
1706 : : void
623 tmunro@postgresql.or 1707 : 1320086 : WaitReadBuffers(ReadBuffersOperation *operation)
1708 : : {
262 andres@anarazel.de 1709 : 1320086 : PgAioReturn *aio_ret = &operation->io_return;
1710 : : IOContext io_context;
1711 : : IOObject io_object;
1712 : :
1713 [ + + ]: 1320086 : if (operation->persistence == RELPERSISTENCE_TEMP)
1714 : : {
623 tmunro@postgresql.or 1715 : 1502 : io_context = IOCONTEXT_NORMAL;
1716 : 1502 : io_object = IOOBJECT_TEMP_RELATION;
1717 : : }
1718 : : else
1719 : : {
1720 : 1318584 : io_context = IOContextForStrategy(operation->strategy);
1721 : 1318584 : io_object = IOOBJECT_RELATION;
1722 : : }
1723 : :
1724 : : /*
1725 : : * If we get here without an IO operation having been issued, the
1726 : : * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1727 : : * caller should not have called WaitReadBuffers().
1728 : : *
1729 : : * In the case of IOMETHOD_SYNC, we start the IO in WaitReadBuffers() -
1730 : : * as we used to before the introduction of AIO. This is done as part
1731 : : * of the retry logic below; no extra code is required.
1732 : : *
1733 : : * This path is expected to eventually go away.
1734 : : */
262 andres@anarazel.de 1735 [ + + + - ]: 1320086 : if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
262 andres@anarazel.de 1736 [ # # ]:UBC 0 : elog(ERROR, "waiting for read operation that didn't read");
1737 : :
1738 : : /*
1739 : : * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1740 : : * done. We may need multiple retries, not just because we could get
1741 : : * multiple partial reads, but also because some of the remaining
1742 : : * to-be-read buffers may have been read in by other backends, limiting
1743 : : * the IO size.
1744 : : */
1745 : : while (true)
623 tmunro@postgresql.or 1746 :CBC 1059 : {
1747 : : int ignored_nblocks_progress;
1748 : :
262 andres@anarazel.de 1749 : 1321145 : CheckReadBuffersOperation(operation, false);
1750 : :
1751 : : /*
1752 : : * If there is an IO associated with the operation, we may need to
1753 : : * wait for it.
1754 : : */
1755 [ + + ]: 1321145 : if (pgaio_wref_valid(&operation->io_wref))
1756 : : {
1757 : : /*
1758 : : * Track the time spent waiting for the IO to complete. As
1759 : : * tracking a wait even if we don't actually need to wait
1760 : : *
1761 : : * a) is not cheap, due to the timestamping overhead
1762 : : *
1763 : : * b) reports some time as waiting, even if we never waited
1764 : : *
1765 : : * we first check if we already know the IO is complete.
1766 : : */
1767 [ + + ]: 1320086 : if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1768 [ + + ]: 595946 : !pgaio_wref_check_done(&operation->io_wref))
1769 : 308492 : {
1770 : 308493 : instr_time io_start = pgstat_prepare_io_time(track_io_timing);
1771 : :
1772 : 308493 : pgaio_wref_wait(&operation->io_wref);
1773 : :
1774 : : /*
1775 : : * The IO operation itself was already counted earlier, in
1776 : : * AsyncReadBuffers(), this just accounts for the wait time.
1777 : : */
1778 : 308492 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1779 : : io_start, 0, 0);
1780 : : }
1781 : : else
1782 : : {
1783 [ - + ]: 1011593 : Assert(pgaio_wref_check_done(&operation->io_wref));
1784 : : }
1785 : :
1786 : : /*
1787 : : * We now are sure the IO completed. Check the results. This
1788 : : * includes reporting on errors if there were any.
1789 : : */
1790 : 1320085 : ProcessReadBuffersResult(operation);
1791 : : }
1792 : :
1793 : : /*
1794 : : * Most of the time, the one IO we already started, will read in
1795 : : * everything. But we need to deal with partial reads and buffers not
1796 : : * needing IO anymore.
1797 : : */
1798 [ + + ]: 1321117 : if (operation->nblocks_done == operation->nblocks)
1799 : 1320058 : break;
1800 : :
1801 [ - + ]: 1059 : CHECK_FOR_INTERRUPTS();
1802 : :
1803 : : /*
1804 : : * This may only complete the IO partially, either because some
1805 : : * buffers were already valid, or because of a partial read.
1806 : : *
1807 : : * NB: In contrast to after the AsyncReadBuffers() call in
1808 : : * StartReadBuffers(), we do *not* reduce
1809 : : * ReadBuffersOperation->nblocks here, callers expect the full
1810 : : * operation to be completed at this point (as more operations may
1811 : : * have been queued).
1812 : : */
1813 : 1059 : AsyncReadBuffers(operation, &ignored_nblocks_progress);
1814 : : }
1815 : :
1816 : 1320058 : CheckReadBuffersOperation(operation, true);
1817 : :
1818 : : /* NB: READ_DONE tracepoint was already executed in completion callback */
1819 : 1320058 : }
1820 : :
1821 : : /*
1822 : : * Initiate IO for the ReadBuffersOperation
1823 : : *
1824 : : * This function only starts a single IO at a time. The size of the IO may be
1825 : : * limited to below the to-be-read blocks, if one of the buffers has
1826 : : * concurrently been read in. If the first to-be-read buffer is already valid,
1827 : : * no IO will be issued.
1828 : : *
1829 : : * To support retries after partial reads, the first operation->nblocks_done
1830 : : * buffers are skipped.
1831 : : *
1832 : : * On return *nblocks_progress is updated to reflect the number of buffers
1833 : : * affected by the call. If the first buffer is valid, *nblocks_progress is
1834 : : * set to 1 and operation->nblocks_done is incremented.
1835 : : *
1836 : : * Returns true if IO was initiated, false if no IO was necessary.
1837 : : */
1838 : : static bool
1839 : 1323396 : AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1840 : : {
1841 : 1323396 : Buffer *buffers = &operation->buffers[0];
1842 : 1323396 : int flags = operation->flags;
1843 : 1323396 : BlockNumber blocknum = operation->blocknum;
1844 : 1323396 : ForkNumber forknum = operation->forknum;
1845 : 1323396 : char persistence = operation->persistence;
1846 : 1323396 : int16 nblocks_done = operation->nblocks_done;
1847 : 1323396 : Buffer *io_buffers = &operation->buffers[nblocks_done];
1848 : 1323396 : int io_buffers_len = 0;
1849 : : PgAioHandle *ioh;
1850 : 1323396 : uint32 ioh_flags = 0;
1851 : : void *io_pages[MAX_IO_COMBINE_LIMIT];
1852 : : IOContext io_context;
1853 : : IOObject io_object;
1854 : : bool did_start_io;
1855 : :
1856 : : /*
1857 : : * When this IO is executed synchronously, either because the caller will
1858 : : * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1859 : : * the AIO subsystem needs to know.
1860 : : */
1861 [ + + ]: 1323396 : if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1862 : 717424 : ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1863 : :
1864 [ + + ]: 1323396 : if (persistence == RELPERSISTENCE_TEMP)
1865 : : {
1866 : 1796 : io_context = IOCONTEXT_NORMAL;
1867 : 1796 : io_object = IOOBJECT_TEMP_RELATION;
1868 : 1796 : ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1869 : : }
1870 : : else
1871 : : {
1872 : 1321600 : io_context = IOContextForStrategy(operation->strategy);
1873 : 1321600 : io_object = IOOBJECT_RELATION;
1874 : : }
1875 : :
1876 : : /*
1877 : : * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1878 : : * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1879 : : * set globally, but on a per-session basis. The completion callback,
1880 : : * which may be run in other processes, e.g. in IO workers, may have a
1881 : : * different value of the zero_damaged_pages GUC.
1882 : : *
1883 : : * XXX: We probably should eventually use a different flag for
1884 : : * zero_damaged_pages, so we can report different log levels / error codes
1885 : : * for zero_damaged_pages and ZERO_ON_ERROR.
1886 : : */
1887 [ + + ]: 1323396 : if (zero_damaged_pages)
1888 : 24 : flags |= READ_BUFFERS_ZERO_ON_ERROR;
1889 : :
1890 : : /*
1891 : : * For the same reason as with zero_damaged_pages we need to use this
1892 : : * backend's ignore_checksum_failure value.
1893 : : */
1894 [ + + ]: 1323396 : if (ignore_checksum_failure)
1895 : 12 : flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
1896 : :
1897 : :
1898 : : /*
1899 : : * To be allowed to report stats in the local completion callback we need
1900 : : * to prepare to report stats now. This ensures we can safely report the
1901 : : * checksum failure even in a critical section.
1902 : : */
1903 : 1323396 : pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
1904 : :
1905 : : /*
1906 : : * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1907 : : * might block, which we don't want after setting IO_IN_PROGRESS.
1908 : : *
1909 : : * If we need to wait for IO before we can get a handle, submit
1910 : : * already-staged IO first, so that other backends don't need to wait.
1911 : : * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1912 : : * wait for already submitted IO, which doesn't require additional locks,
1913 : : * but it could still cause undesirable waits.
1914 : : *
1915 : : * A secondary benefit is that this would allow us to measure the time in
1916 : : * pgaio_io_acquire() without causing undue timer overhead in the common,
1917 : : * non-blocking, case. However, currently the pgstats infrastructure
1918 : : * doesn't really allow that, as it a) asserts that an operation can't
1919 : : * have time without operations and b) doesn't have an API to report
1920 : : * "accumulated" time.
1921 : : */
1922 : 1323396 : ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
1923 [ + + ]: 1323396 : if (unlikely(!ioh))
1924 : : {
1925 : 3247 : pgaio_submit_staged();
1926 : :
1927 : 3247 : ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
1928 : : }
1929 : :
1930 : : /*
1931 : : * Check if we can start IO on the first to-be-read buffer.
1932 : : *
1933 : : * If an I/O is already in progress in another backend, we want to wait
1934 : : * for the outcome: either done, or something went wrong and we will
1935 : : * retry.
1936 : : */
1937 [ + + ]: 1323396 : if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1938 : : {
1939 : : /*
1940 : : * Someone else has already completed this block, we're done.
1941 : : *
1942 : : * When IO is necessary, ->nblocks_done is updated in
1943 : : * ProcessReadBuffersResult(), but that is not called if no IO is
1944 : : * necessary. Thus update here.
1945 : : */
1946 : 2997 : operation->nblocks_done += 1;
1947 : 2997 : *nblocks_progress = 1;
1948 : :
1949 : 2997 : pgaio_io_release(ioh);
1950 : 2997 : pgaio_wref_clear(&operation->io_wref);
1951 : 2997 : did_start_io = false;
1952 : :
1953 : : /*
1954 : : * Report and track this as a 'hit' for this backend, even though it
1955 : : * must have started out as a miss in PinBufferForBlock(). The other
1956 : : * backend will track this as a 'read'.
1957 : : */
1958 : : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1959 : : operation->smgr->smgr_rlocator.locator.spcOid,
1960 : : operation->smgr->smgr_rlocator.locator.dbOid,
1961 : : operation->smgr->smgr_rlocator.locator.relNumber,
1962 : : operation->smgr->smgr_rlocator.backend,
1963 : : true);
1964 : :
1965 [ - + ]: 2997 : if (persistence == RELPERSISTENCE_TEMP)
262 andres@anarazel.de 1966 :UBC 0 : pgBufferUsage.local_blks_hit += 1;
1967 : : else
262 andres@anarazel.de 1968 :CBC 2997 : pgBufferUsage.shared_blks_hit += 1;
1969 : :
1970 [ + - ]: 2997 : if (operation->rel)
1971 [ - + - - : 2997 : pgstat_count_buffer_hit(operation->rel);
+ - ]
1972 : :
1973 : 2997 : pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1974 : :
1975 [ + + ]: 2997 : if (VacuumCostActive)
1976 : 17 : VacuumCostBalance += VacuumCostPageHit;
1977 : : }
1978 : : else
1979 : : {
1980 : : instr_time io_start;
1981 : :
1982 : : /* We found a buffer that we need to read in. */
1983 [ - + ]: 1320399 : Assert(io_buffers[0] == buffers[nblocks_done]);
1984 : 1320399 : io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
623 tmunro@postgresql.or 1985 : 1320399 : io_buffers_len = 1;
1986 : :
1987 : : /*
1988 : : * How many neighboring-on-disk blocks can we scatter-read into other
1989 : : * buffers at the same time? In this case we don't wait if we see an
1990 : : * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1991 : : * head block, so we should get on with that I/O as soon as possible.
1992 : : */
262 andres@anarazel.de 1993 [ + + ]: 1486322 : for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1994 : : {
1995 [ - + ]: 165923 : if (!ReadBuffersCanStartIO(buffers[i], true))
262 andres@anarazel.de 1996 :UBC 0 : break;
1997 : : /* Must be consecutive block numbers. */
262 andres@anarazel.de 1998 [ - + ]:CBC 165923 : Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1999 : : BufferGetBlockNumber(buffers[i]) - 1);
2000 [ - + ]: 165923 : Assert(io_buffers[io_buffers_len] == buffers[i]);
2001 : :
623 tmunro@postgresql.or 2002 : 165923 : io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2003 : : }
2004 : :
2005 : : /* get a reference to wait for in WaitReadBuffers() */
262 andres@anarazel.de 2006 : 1320399 : pgaio_io_get_wref(ioh, &operation->io_wref);
2007 : :
2008 : : /* provide the list of buffers to the completion callbacks */
2009 : 1320399 : pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
2010 : :
2011 [ + + ]: 1320399 : pgaio_io_register_callbacks(ioh,
2012 : : persistence == RELPERSISTENCE_TEMP ?
2013 : : PGAIO_HCB_LOCAL_BUFFER_READV :
2014 : : PGAIO_HCB_SHARED_BUFFER_READV,
2015 : : flags);
2016 : :
2017 : 1320399 : pgaio_io_set_flag(ioh, ioh_flags);
2018 : :
2019 : : /* ---
2020 : : * Even though we're trying to issue IO asynchronously, track the time
2021 : : * in smgrstartreadv():
2022 : : * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2023 : : * immediately
2024 : : * - the io method might not support the IO (e.g. worker IO for a temp
2025 : : * table)
2026 : : * ---
2027 : : */
2028 : 1320399 : io_start = pgstat_prepare_io_time(track_io_timing);
2029 : 1320399 : smgrstartreadv(ioh, operation->smgr, forknum,
2030 : : blocknum + nblocks_done,
2031 : : io_pages, io_buffers_len);
2032 : 1320384 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
2033 : 1320384 : io_start, 1, io_buffers_len * BLCKSZ);
2034 : :
272 2035 [ + + ]: 1320384 : if (persistence == RELPERSISTENCE_TEMP)
2036 : 1796 : pgBufferUsage.local_blks_read += io_buffers_len;
2037 : : else
2038 : 1318588 : pgBufferUsage.shared_blks_read += io_buffers_len;
2039 : :
2040 : : /*
2041 : : * Track vacuum cost when issuing IO, not after waiting for it.
2042 : : * Otherwise we could end up issuing a lot of IO in a short timespan,
2043 : : * despite a low cost limit.
2044 : : */
623 tmunro@postgresql.or 2045 [ + + ]: 1320384 : if (VacuumCostActive)
2046 : 16239 : VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
2047 : :
262 andres@anarazel.de 2048 : 1320384 : *nblocks_progress = io_buffers_len;
2049 : 1320384 : did_start_io = true;
2050 : : }
2051 : :
2052 : 1323381 : return did_start_io;
2053 : : }
2054 : :
2055 : : /*
2056 : : * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
2057 : : * buffer. If no buffer exists already, selects a replacement victim and
2058 : : * evicts the old page, but does NOT read in the new page.
2059 : : *
2060 : : * "strategy" can be a buffer replacement strategy object, or NULL for
2061 : : * the default strategy. The selected buffer's usage_count is advanced when
2062 : : * using the default strategy, but otherwise possibly not (see PinBuffer).
2063 : : *
2064 : : * The returned buffer is pinned and is already marked as holding the
2065 : : * desired page. If it already did have the desired page, *foundPtr is
2066 : : * set true. Otherwise, *foundPtr is set false.
2067 : : *
2068 : : * io_context is passed as an output parameter to avoid calling
2069 : : * IOContextForStrategy() when there is a shared buffers hit and no IO
2070 : : * statistics need be captured.
2071 : : *
2072 : : * No locks are held either at entry or exit.
2073 : : */
2074 : : static pg_attribute_always_inline BufferDesc *
5467 rhaas@postgresql.org 2075 : 57023198 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2076 : : BlockNumber blockNum,
2077 : : BufferAccessStrategy strategy,
2078 : : bool *foundPtr, IOContext io_context)
2079 : : {
2080 : : BufferTag newTag; /* identity of requested block */
2081 : : uint32 newHash; /* hash value for newTag */
2082 : : LWLock *newPartitionLock; /* buffer partition lock for it */
2083 : : int existing_buf_id;
2084 : : Buffer victim_buffer;
2085 : : BufferDesc *victim_buf_hdr;
2086 : : uint32 victim_buf_state;
41 andres@anarazel.de 2087 :GNC 57023198 : uint32 set_bits = 0;
2088 : :
2089 : : /* Make sure we will have room to remember the buffer pin */
770 heikki.linnakangas@i 2090 :CBC 57023198 : ResourceOwnerEnlarge(CurrentResourceOwner);
2091 : 57023198 : ReservePrivateRefCountEntry();
2092 : :
2093 : : /* create a tag so we can lookup the buffer */
1239 rhaas@postgresql.org 2094 : 57023198 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2095 : :
2096 : : /* determine its hash code and partition lock ID */
7087 tgl@sss.pgh.pa.us 2097 : 57023198 : newHash = BufTableHashCode(&newTag);
2098 : 57023198 : newPartitionLock = BufMappingPartitionLock(newHash);
2099 : :
2100 : : /* see if the block is in the buffer pool already */
2101 : 57023198 : LWLockAcquire(newPartitionLock, LW_SHARED);
987 andres@anarazel.de 2102 : 57023198 : existing_buf_id = BufTableLookup(&newTag, newHash);
2103 [ + + ]: 57023198 : if (existing_buf_id >= 0)
2104 : : {
2105 : : BufferDesc *buf;
2106 : : bool valid;
2107 : :
2108 : : /*
2109 : : * Found it. Now, pin the buffer so no one can steal it from the
2110 : : * buffer pool, and check to see if the correct data has been loaded
2111 : : * into the buffer.
2112 : : */
2113 : 55265438 : buf = GetBufferDescriptor(existing_buf_id);
2114 : :
70 andres@anarazel.de 2115 :GNC 55265438 : valid = PinBuffer(buf, strategy, false);
2116 : :
2117 : : /* Can release the mapping lock as soon as we've pinned it */
7087 tgl@sss.pgh.pa.us 2118 :CBC 55265438 : LWLockRelease(newPartitionLock);
2119 : :
3045 peter_e@gmx.net 2120 : 55265438 : *foundPtr = true;
2121 : :
7593 tgl@sss.pgh.pa.us 2122 [ + + ]: 55265438 : if (!valid)
2123 : : {
2124 : : /*
2125 : : * We can only get here if (a) someone else is still reading in
2126 : : * the page, (b) a previous read attempt failed, or (c) someone
2127 : : * called StartReadBuffers() but not yet WaitReadBuffers().
2128 : : */
623 tmunro@postgresql.or 2129 : 2732 : *foundPtr = false;
2130 : : }
2131 : :
9969 bruce@momjian.us 2132 : 55265438 : return buf;
2133 : : }
2134 : :
2135 : : /*
2136 : : * Didn't find it in the buffer pool. We'll have to initialize a new
2137 : : * buffer. Remember to unlock the mapping lock while doing the work.
2138 : : */
7087 tgl@sss.pgh.pa.us 2139 : 1757760 : LWLockRelease(newPartitionLock);
2140 : :
2141 : : /*
2142 : : * Acquire a victim buffer. Somebody else might try to do the same, we
2143 : : * don't hold any conflicting locks. If so we'll have to undo our work
2144 : : * later.
2145 : : */
987 andres@anarazel.de 2146 : 1757760 : victim_buffer = GetVictimBuffer(strategy, io_context);
2147 : 1757760 : victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2148 : :
2149 : : /*
2150 : : * Try to make a hashtable entry for the buffer under its new tag. If
2151 : : * somebody else inserted another buffer for the tag, we'll release the
2152 : : * victim buffer we acquired and use the already inserted one.
2153 : : */
2154 : 1757760 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2155 : 1757760 : existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2156 [ + + ]: 1757760 : if (existing_buf_id >= 0)
2157 : : {
2158 : : BufferDesc *existing_buf_hdr;
2159 : : bool valid;
2160 : :
2161 : : /*
2162 : : * Got a collision. Someone has already done what we were about to do.
2163 : : * We'll just handle this as if it were found in the buffer pool in
2164 : : * the first place. First, give up the buffer we were planning to
2165 : : * use.
2166 : : *
2167 : : * We could do this after releasing the partition lock, but then we'd
2168 : : * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2169 : : * before acquiring the lock, for the rare case of such a collision.
2170 : : */
2171 : 652 : UnpinBuffer(victim_buf_hdr);
2172 : :
2173 : : /* remaining code should match code at top of routine */
2174 : :
2175 : 652 : existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2176 : :
70 andres@anarazel.de 2177 :GNC 652 : valid = PinBuffer(existing_buf_hdr, strategy, false);
2178 : :
2179 : : /* Can release the mapping lock as soon as we've pinned it */
987 andres@anarazel.de 2180 :CBC 652 : LWLockRelease(newPartitionLock);
2181 : :
2182 : 652 : *foundPtr = true;
2183 : :
2184 [ + + ]: 652 : if (!valid)
2185 : : {
2186 : : /*
2187 : : * We can only get here if (a) someone else is still reading in
2188 : : * the page, (b) a previous read attempt failed, or (c) someone
2189 : : * called StartReadBuffers() but not yet WaitReadBuffers().
2190 : : */
623 tmunro@postgresql.or 2191 : 316 : *foundPtr = false;
2192 : : }
2193 : :
987 andres@anarazel.de 2194 : 652 : return existing_buf_hdr;
2195 : : }
2196 : :
2197 : : /*
2198 : : * Need to lock the buffer header too in order to change its tag.
2199 : : */
2200 : 1757108 : victim_buf_state = LockBufHdr(victim_buf_hdr);
2201 : :
2202 : : /* some sanity checks while we hold the buffer header lock */
2203 [ - + ]: 1757108 : Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2204 [ - + ]: 1757108 : Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2205 : :
2206 : 1757108 : victim_buf_hdr->tag = newTag;
2207 : :
2208 : : /*
2209 : : * Make sure BM_PERMANENT is set for buffers that must be written at every
2210 : : * checkpoint. Unlogged buffers only need to be written at shutdown
2211 : : * checkpoints, except for their "init" forks, which need to be treated
2212 : : * just like permanent relations.
2213 : : */
41 andres@anarazel.de 2214 :GNC 1757108 : set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
3200 rhaas@postgresql.org 2215 [ + + - + ]:CBC 1757108 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
41 andres@anarazel.de 2216 :GNC 1756769 : set_bits |= BM_PERMANENT;
2217 : :
2218 : 1757108 : UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
2219 : : set_bits, 0, 0);
2220 : :
7087 tgl@sss.pgh.pa.us 2221 :CBC 1757108 : LWLockRelease(newPartitionLock);
2222 : :
2223 : : /*
2224 : : * Buffer contents are currently invalid.
2225 : : */
623 tmunro@postgresql.or 2226 : 1757108 : *foundPtr = false;
2227 : :
987 andres@anarazel.de 2228 : 1757108 : return victim_buf_hdr;
2229 : : }
2230 : :
2231 : : /*
2232 : : * InvalidateBuffer -- mark a shared buffer invalid.
2233 : : *
2234 : : * The buffer header spinlock must be held at entry. We drop it before
2235 : : * returning. (This is sane because the caller must have locked the
2236 : : * buffer in order to be sure it should be dropped.)
2237 : : *
2238 : : * This is used only in contexts such as dropping a relation. We assume
2239 : : * that no other backend could possibly be interested in using the page,
2240 : : * so the only reason the buffer might be pinned is if someone else is
2241 : : * trying to write it out. We have to let them finish before we can
2242 : : * reclaim the buffer.
2243 : : *
2244 : : * The buffer could get reclaimed by someone else while we are waiting
2245 : : * to acquire the necessary locks; if so, don't mess it up.
2246 : : */
2247 : : static void
3684 rhaas@postgresql.org 2248 : 108514 : InvalidateBuffer(BufferDesc *buf)
2249 : : {
2250 : : BufferTag oldTag;
2251 : : uint32 oldHash; /* hash value for oldTag */
2252 : : LWLock *oldPartitionLock; /* buffer partition lock for it */
2253 : : uint32 oldFlags;
2254 : : uint32 buf_state;
2255 : :
2256 : : /* Save the original buffer tag before dropping the spinlock */
7593 tgl@sss.pgh.pa.us 2257 : 108514 : oldTag = buf->tag;
2258 : :
41 andres@anarazel.de 2259 :GNC 108514 : UnlockBufHdr(buf);
2260 : :
2261 : : /*
2262 : : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2263 : : * worth storing the hashcode in BufferDesc so we need not recompute it
2264 : : * here? Probably not.
2265 : : */
7087 tgl@sss.pgh.pa.us 2266 :CBC 108514 : oldHash = BufTableHashCode(&oldTag);
2267 : 108514 : oldPartitionLock = BufMappingPartitionLock(oldHash);
2268 : :
7593 2269 : 119951 : retry:
2270 : :
2271 : : /*
2272 : : * Acquire exclusive mapping lock in preparation for changing the buffer's
2273 : : * association.
2274 : : */
7087 2275 : 119951 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2276 : :
2277 : : /* Re-lock the buffer header */
3538 andres@anarazel.de 2278 : 119951 : buf_state = LockBufHdr(buf);
2279 : :
2280 : : /* If it's changed while we were waiting for lock, do nothing */
1239 rhaas@postgresql.org 2281 [ + + ]: 119951 : if (!BufferTagsEqual(&buf->tag, &oldTag))
2282 : : {
41 andres@anarazel.de 2283 :GNC 2 : UnlockBufHdr(buf);
7087 tgl@sss.pgh.pa.us 2284 :GBC 2 : LWLockRelease(oldPartitionLock);
7593 2285 : 2 : return;
2286 : : }
2287 : :
2288 : : /*
2289 : : * We assume the reason for it to be pinned is that either we were
2290 : : * asynchronously reading the page in before erroring out or someone else
2291 : : * is flushing the page out. Wait for the IO to finish. (This could be
2292 : : * an infinite loop if the refcount is messed up... it would be nice to
2293 : : * time out after awhile, but there seems no way to be sure how many loops
2294 : : * may be needed. Note that if the other guy has pinned the buffer but
2295 : : * not yet done StartBufferIO, WaitIO will fall through and we'll
2296 : : * effectively be busy-looping here.)
2297 : : */
3538 andres@anarazel.de 2298 [ + + ]:CBC 119949 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2299 : : {
41 andres@anarazel.de 2300 :GNC 11437 : UnlockBufHdr(buf);
7087 tgl@sss.pgh.pa.us 2301 :GBC 11437 : LWLockRelease(oldPartitionLock);
2302 : : /* safety check: should definitely not be our *own* pin */
3780 andres@anarazel.de 2303 [ - + ]: 11437 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
7579 tgl@sss.pgh.pa.us 2304 [ # # ]:UBC 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
7593 tgl@sss.pgh.pa.us 2305 :GBC 11437 : WaitIO(buf);
2306 : 11437 : goto retry;
2307 : : }
2308 : :
2309 : : /*
2310 : : * Clear out the buffer's tag and flags. We must do this to ensure that
2311 : : * linear scans of the buffer array don't think the buffer is valid.
2312 : : */
3538 andres@anarazel.de 2313 :CBC 108512 : oldFlags = buf_state & BUF_FLAG_MASK;
1239 rhaas@postgresql.org 2314 : 108512 : ClearBufferTag(&buf->tag);
2315 : :
41 andres@anarazel.de 2316 :GNC 108512 : UnlockBufHdrExt(buf, buf_state,
2317 : : 0,
2318 : : BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
2319 : : 0);
2320 : :
2321 : : /*
2322 : : * Remove the buffer from the lookup hashtable, if it was in there.
2323 : : */
7593 tgl@sss.pgh.pa.us 2324 [ + - ]:CBC 108512 : if (oldFlags & BM_TAG_VALID)
7087 2325 : 108512 : BufTableDelete(&oldTag, oldHash);
2326 : :
2327 : : /*
2328 : : * Done with mapping lock.
2329 : : */
2330 : 108512 : LWLockRelease(oldPartitionLock);
2331 : : }
2332 : :
2333 : : /*
2334 : : * Helper routine for GetVictimBuffer()
2335 : : *
2336 : : * Needs to be called on a buffer with a valid tag, pinned, but without the
2337 : : * buffer header spinlock held.
2338 : : *
2339 : : * Returns true if the buffer can be reused, in which case the buffer is only
2340 : : * pinned by this backend and marked as invalid, false otherwise.
2341 : : */
2342 : : static bool
987 andres@anarazel.de 2343 : 1260568 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
2344 : : {
2345 : : uint32 buf_state;
2346 : : uint32 hash;
2347 : : LWLock *partition_lock;
2348 : : BufferTag tag;
2349 : :
2350 [ - + ]: 1260568 : Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
2351 : :
2352 : : /* have buffer pinned, so it's safe to read tag without lock */
2353 : 1260568 : tag = buf_hdr->tag;
2354 : :
2355 : 1260568 : hash = BufTableHashCode(&tag);
2356 : 1260568 : partition_lock = BufMappingPartitionLock(hash);
2357 : :
2358 : 1260568 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2359 : :
2360 : : /* lock the buffer header */
2361 : 1260568 : buf_state = LockBufHdr(buf_hdr);
2362 : :
2363 : : /*
2364 : : * We have the buffer pinned, so nobody else should have been able to unset
2365 : : * this concurrently.
2366 : : */
2367 [ - + ]: 1260568 : Assert(buf_state & BM_TAG_VALID);
2368 [ - + ]: 1260568 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2369 [ - + ]: 1260568 : Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2370 : :
2371 : : /*
2372 : : * If somebody else pinned the buffer since, or even worse, dirtied it,
2373 : : * give up on this buffer: It's clearly in use.
2374 : : */
2375 [ + + + + ]: 1260568 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2376 : : {
2377 [ - + ]: 578 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2378 : :
41 andres@anarazel.de 2379 :GNC 578 : UnlockBufHdr(buf_hdr);
987 andres@anarazel.de 2380 :CBC 578 : LWLockRelease(partition_lock);
2381 : :
2382 : 578 : return false;
2383 : : }
2384 : :
2385 : : /*
2386 : : * Clear out the buffer's tag and flags and usagecount. This is not
2387 : : * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2388 : : * doing anything with the buffer. But currently it's beneficial, as
2389 : : * several linear scans of shared buffers use the tag as a cheaper
2390 : : * pre-check (see e.g. FlushDatabaseBuffers()).
2391 : : */
2392 : 1259990 : ClearBufferTag(&buf_hdr->tag);
41 andres@anarazel.de 2393 :GNC 1259990 : UnlockBufHdrExt(buf_hdr, buf_state,
2394 : : 0,
2395 : : BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
2396 : : 0);
2397 : :
987 andres@anarazel.de 2398 [ - + ]:CBC 1259990 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2399 : :
2400 : : /* finally delete buffer from the buffer mapping table */
2401 : 1259990 : BufTableDelete(&tag, hash);
2402 : :
2403 : 1259990 : LWLockRelease(partition_lock);
2404 : :
41 andres@anarazel.de 2405 :GNC 1259990 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
987 andres@anarazel.de 2406 [ - + ]:CBC 1259990 : Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2407 [ - + ]: 1259990 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2408 [ - + ]: 1259990 : Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
2409 : :
2410 : 1259990 : return true;
2411 : : }
2412 : :
2413 : : static Buffer
2414 : 1979644 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
2415 : : {
2416 : : BufferDesc *buf_hdr;
2417 : : Buffer buf;
2418 : : uint32 buf_state;
2419 : : bool from_ring;
2420 : :
2421 : : /*
2422 : : * Ensure, before we pin a victim buffer, that there's a free refcount
2423 : : * entry and resource owner slot for the pin.
2424 : : */
2425 : 1979644 : ReservePrivateRefCountEntry();
770 heikki.linnakangas@i 2426 : 1979644 : ResourceOwnerEnlarge(CurrentResourceOwner);
2427 : :
2428 : : /* we return here if a prospective victim buffer gets used concurrently */
987 andres@anarazel.de 2429 : 6763 : again:
2430 : :
2431 : : /*
2432 : : * Select a victim buffer. The buffer is returned pinned and owned by
2433 : : * this backend.
2434 : : */
2435 : 1986407 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2436 : 1986407 : buf = BufferDescriptorGetBuffer(buf_hdr);
2437 : :
2438 : : /*
2439 : : * We shouldn't have any other pins for this buffer.
2440 : : */
2441 : 1986407 : CheckBufferIsPinnedOnce(buf);
2442 : :
2443 : : /*
2444 : : * If the buffer was dirty, try to write it out. There is a race
2445 : : * condition here, in that someone might dirty it after we released the
2446 : : * buffer header lock above, or even while we are writing it out (since
2447 : : * our share-lock won't prevent hint-bit updates). We will recheck the
2448 : : * dirty bit after re-locking the buffer header.
2449 : : */
2450 [ + + ]: 1986407 : if (buf_state & BM_DIRTY)
2451 : : {
2452 : : LWLock *content_lock;
2453 : :
2454 [ - + ]: 272890 : Assert(buf_state & BM_TAG_VALID);
2455 [ - + ]: 272890 : Assert(buf_state & BM_VALID);
2456 : :
2457 : : /*
2458 : : * We need a share-lock on the buffer contents to write it out (else
2459 : : * we might write invalid data, eg because someone else is compacting
2460 : : * the page contents while we write). We must use a conditional lock
2461 : : * acquisition here to avoid deadlock. Even though the buffer was not
2462 : : * pinned (and therefore surely not locked) when StrategyGetBuffer
2463 : : * returned it, someone else could have pinned and exclusive-locked it
2464 : : * by the time we get here. If we try to get the lock unconditionally,
2465 : : * we'd block waiting for them; if they later block waiting for us,
2466 : : * deadlock ensues. (This has been observed to happen when two
2467 : : * backends are both trying to split btree index pages, and the second
2468 : : * one just happens to be trying to split the page the first one got
2469 : : * from StrategyGetBuffer.)
2470 : : */
2471 : 272890 : content_lock = BufferDescriptorGetContentLock(buf_hdr);
2472 [ - + ]: 272890 : if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2473 : : {
2474 : : /*
2475 : : * Someone else has locked the buffer, so give it up and loop back
2476 : : * to get another one.
2477 : : */
987 andres@anarazel.de 2478 :UBC 0 : UnpinBuffer(buf_hdr);
2479 : 0 : goto again;
2480 : : }
2481 : :
2482 : : /*
2483 : : * If using a nondefault strategy, and writing the buffer would
2484 : : * require a WAL flush, let the strategy decide whether to go ahead
2485 : : * and write/reuse the buffer or to choose another victim. We need a
2486 : : * lock to inspect the page LSN, so this can't be done inside
2487 : : * StrategyGetBuffer.
2488 : : */
987 andres@anarazel.de 2489 [ + + ]:CBC 272890 : if (strategy != NULL)
2490 : : {
2491 : : XLogRecPtr lsn;
2492 : :
2493 : : /* Read the LSN while holding buffer header lock */
2494 : 78214 : buf_state = LockBufHdr(buf_hdr);
2495 : 78214 : lsn = BufferGetLSN(buf_hdr);
41 andres@anarazel.de 2496 :GNC 78214 : UnlockBufHdr(buf_hdr);
2497 : :
987 andres@anarazel.de 2498 [ + + ]:CBC 78214 : if (XLogNeedsFlush(lsn)
2499 [ + + ]: 9586 : && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2500 : : {
2501 : 6185 : LWLockRelease(content_lock);
2502 : 6185 : UnpinBuffer(buf_hdr);
2503 : 6185 : goto again;
2504 : : }
2505 : : }
2506 : :
2507 : : /* OK, do the I/O */
2508 : 266705 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2509 : 266705 : LWLockRelease(content_lock);
2510 : :
945 2511 : 266705 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2512 : : &buf_hdr->tag);
2513 : : }
2514 : :
2515 : :
987 2516 [ + + ]: 1980222 : if (buf_state & BM_VALID)
2517 : : {
2518 : : /*
2519 : : * When a BufferAccessStrategy is in use, blocks evicted from shared
2520 : : * buffers are counted as IOOP_EVICT in the corresponding context
2521 : : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2522 : : * strategy in two cases: 1) while initially claiming buffers for the
2523 : : * strategy ring 2) to replace an existing strategy ring buffer
2524 : : * because it is pinned or in use and cannot be reused.
2525 : : *
2526 : : * Blocks evicted from buffers already in the strategy ring are
2527 : : * counted as IOOP_REUSE in the corresponding strategy context.
2528 : : *
2529 : : * At this point, we can accurately count evictions and reuses,
2530 : : * because we have successfully claimed the valid buffer. Previously,
2531 : : * we may have been forced to release the buffer due to concurrent
2532 : : * pinners or erroring out.
2533 : : */
2534 : 1258380 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
337 michael@paquier.xyz 2535 [ + + ]: 1258380 : from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2536 : : }
2537 : :
2538 : : /*
2539 : : * If the buffer has an entry in the buffer mapping table, delete it. This
2540 : : * can fail because another backend could have pinned or dirtied the
2541 : : * buffer.
2542 : : */
987 andres@anarazel.de 2543 [ + + + + ]: 1980222 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2544 : : {
2545 : 578 : UnpinBuffer(buf_hdr);
2546 : 578 : goto again;
2547 : : }
2548 : :
2549 : : /* a final set of sanity checks */
2550 : : #ifdef USE_ASSERT_CHECKING
2551 : 1979644 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
2552 : :
2553 [ - + ]: 1979644 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2554 [ - + ]: 1979644 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2555 : :
2556 : 1979644 : CheckBufferIsPinnedOnce(buf);
2557 : : #endif
2558 : :
2559 : 1979644 : return buf;
2560 : : }
2561 : :
2562 : : /*
2563 : : * Return the maximum number of buffers that a backend should try to pin once,
2564 : : * to avoid exceeding its fair share. This is the highest value that
2565 : : * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2566 : : * system with a very small buffer pool relative to max_connections.
2567 : : */
2568 : : uint32
278 tmunro@postgresql.or 2569 : 528628 : GetPinLimit(void)
2570 : : {
2571 : 528628 : return MaxProportionalPins;
2572 : : }
2573 : :
2574 : : /*
2575 : : * Return the maximum number of additional buffers that this backend should
2576 : : * pin if it wants to stay under the per-backend limit, considering the number
2577 : : * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2578 : : * returned by this function can be zero.
2579 : : */
2580 : : uint32
2581 : 2967902 : GetAdditionalPinLimit(void)
2582 : : {
2583 : : uint32 estimated_pins_held;
2584 : :
2585 : : /*
2586 : : * We get the number of "overflowed" pins for free, but don't know the
2587 : : * number of pins in PrivateRefCountArray. The cost of calculating that
2588 : : * exactly doesn't seem worth it, so just assume the max.
2589 : : */
2590 : 2967902 : estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2591 : :
2592 : : /* Is this backend already holding more than its fair share? */
2593 [ + + ]: 2967902 : if (estimated_pins_held > MaxProportionalPins)
2594 : 1300132 : return 0;
2595 : :
2596 : 1667770 : return MaxProportionalPins - estimated_pins_held;
2597 : : }
2598 : :
2599 : : /*
2600 : : * Limit the number of pins a batch operation may additionally acquire, to
2601 : : * avoid running out of pinnable buffers.
2602 : : *
2603 : : * One additional pin is always allowed, on the assumption that the operation
2604 : : * requires at least one to make progress.
2605 : : */
2606 : : void
987 andres@anarazel.de 2607 : 200460 : LimitAdditionalPins(uint32 *additional_pins)
2608 : : {
2609 : : uint32 limit;
2610 : :
2611 [ + + ]: 200460 : if (*additional_pins <= 1)
2612 : 190386 : return;
2613 : :
278 tmunro@postgresql.or 2614 : 10074 : limit = GetAdditionalPinLimit();
2615 : 10074 : limit = Max(limit, 1);
2616 [ + + ]: 10074 : if (limit < *additional_pins)
2617 : 5488 : *additional_pins = limit;
2618 : : }
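 : :
 : : /*
 : :  * A minimal sketch of how a batch operation might use the pin limit
 : :  * helpers above (the variable name and the value 64 are illustrative
 : :  * only).  LimitAdditionalPins() clamps the request to this backend's
 : :  * fair share while always allowing at least one pin;
 : :  * GetAdditionalPinLimit() can later be consulted before pinning more:
 : :  *
 : :  *     uint32      npins = 64;
 : :  *
 : :  *     LimitAdditionalPins(&npins);
 : :  *     ... pin and process npins buffers, then release them ...
 : :  *
 : :  *     if (GetAdditionalPinLimit() > 0)
 : :  *         ... at least one more pin fits under the fair share ...
 : :  */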
2619 : :
2620 : : /*
2621 : : * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2622 : : * avoid duplicating the tracing and relpersistence related logic.
2623 : : */
2624 : : static BlockNumber
847 2625 : 212932 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
2626 : : ForkNumber fork,
2627 : : BufferAccessStrategy strategy,
2628 : : uint32 flags,
2629 : : uint32 extend_by,
2630 : : BlockNumber extend_upto,
2631 : : Buffer *buffers,
2632 : : uint32 *extended_by)
2633 : : {
2634 : : BlockNumber first_block;
2635 : :
2636 : : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2637 : : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2638 : : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2639 : : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2640 : : BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2641 : : extend_by);
2642 : :
2643 [ + + ]: 212932 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2644 : 12472 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2645 : : extend_by, extend_upto,
2646 : : buffers, &extend_by);
2647 : : else
2648 : 200460 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2649 : : extend_by, extend_upto,
2650 : : buffers, &extend_by);
987 andres@anarazel.de 2651 : 212932 : *extended_by = extend_by;
2652 : :
2653 : : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2654 : : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2655 : : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2656 : : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2657 : : BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2658 : : *extended_by,
2659 : : first_block);
2660 : :
2661 : 212932 : return first_block;
2662 : : }
2663 : :
2664 : : /*
2665 : : * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2666 : : * shared buffers.
2667 : : */
2668 : : static BlockNumber
847 tmunro@postgresql.or 2669 : 200460 : ExtendBufferedRelShared(BufferManagerRelation bmr,
2670 : : ForkNumber fork,
2671 : : BufferAccessStrategy strategy,
2672 : : uint32 flags,
2673 : : uint32 extend_by,
2674 : : BlockNumber extend_upto,
2675 : : Buffer *buffers,
2676 : : uint32 *extended_by)
2677 : : {
2678 : : BlockNumber first_block;
987 andres@anarazel.de 2679 : 200460 : IOContext io_context = IOContextForStrategy(strategy);
2680 : : instr_time io_start;
2681 : :
2682 : 200460 : LimitAdditionalPins(&extend_by);
2683 : :
2684 : : /*
2685 : : * Acquire victim buffers for extension without holding the extension lock.
2686 : : * Writing out victim buffers is the most expensive part of extending the
2687 : : * relation, particularly when doing so requires WAL flushes. Zeroing out
2688 : : * the buffers is also quite expensive, so do that before holding the
2689 : : * extension lock as well.
2690 : : *
2691 : : * These pages are pinned by us and not valid. While we hold the pin they
2692 : : * can't be acquired as victim buffers by another backend.
2693 : : */
2694 [ + + ]: 422341 : for (uint32 i = 0; i < extend_by; i++)
2695 : : {
2696 : : Block buf_block;
2697 : :
2698 : 221881 : buffers[i] = GetVictimBuffer(strategy, io_context);
2699 : 221881 : buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2700 : :
2701 : : /* new buffers are zero-filled */
308 peter@eisentraut.org 2702 [ + - + - : 221881 : MemSet(buf_block, 0, BLCKSZ);
+ - - + -
- ]
2703 : : }
2704 : :
2705 : : /*
2706 : : * Lock relation against concurrent extensions, unless requested not to.
2707 : : *
2708 : : * We use the same extension lock for all forks. That's unnecessarily
2709 : : * restrictive, but currently extensions for forks don't happen often
2710 : : * enough to make it worth locking more granularly.
2711 : : *
2712 : : * Note that another backend might have extended the relation by the time
2713 : : * we get the lock.
2714 : : */
987 andres@anarazel.de 2715 [ + + ]: 200460 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
847 tmunro@postgresql.or 2716 : 149565 : LockRelationForExtension(bmr.rel, ExclusiveLock);
2717 : :
2718 : : /*
2719 : : * If requested, invalidate size cache, so that smgrnblocks asks the
2720 : : * kernel.
2721 : : */
987 andres@anarazel.de 2722 [ + + ]: 200460 : if (flags & EB_CLEAR_SIZE_CACHE)
57 alvherre@kurilemu.de 2723 [ + - ]:GNC 7853 : BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2724 : :
2725 [ + + ]: 200460 : first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
2726 : :
2727 : : /*
2728 : : * Now that we have the accurate relation size, check if the caller wants
2729 : : * us to extend only up to a specific size. If there were concurrent
2730 : : * extensions, we might have acquired too many buffers and need to release
2731 : : * them.
2732 : : */
987 andres@anarazel.de 2733 [ + + ]:CBC 200460 : if (extend_upto != InvalidBlockNumber)
2734 : : {
2735 : 52702 : uint32 orig_extend_by = extend_by;
2736 : :
2737 [ - + ]: 52702 : if (first_block > extend_upto)
987 andres@anarazel.de 2738 :UBC 0 : extend_by = 0;
987 andres@anarazel.de 2739 [ + + ]:CBC 52702 : else if ((uint64) first_block + extend_by > extend_upto)
2740 : 7 : extend_by = extend_upto - first_block;
2741 : :
2742 [ + + ]: 52723 : for (uint32 i = extend_by; i < orig_extend_by; i++)
2743 : : {
2744 : 21 : BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2745 : :
2746 : 21 : UnpinBuffer(buf_hdr);
2747 : : }
2748 : :
2749 [ + + ]: 52702 : if (extend_by == 0)
2750 : : {
2751 [ + - ]: 7 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
847 tmunro@postgresql.or 2752 : 7 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
987 andres@anarazel.de 2753 : 7 : *extended_by = extend_by;
2754 : 7 : return first_block;
2755 : : }
2756 : : }
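/*
 * Editorial worked example for the clamp above (invented numbers): suppose we
 * pre-acquired extend_by = 8 victim buffers, but after taking the extension
 * lock smgrnblocks() reports first_block = 95 while extend_upto = 100.  Then
 * first_block + extend_by = 103 > 100, so extend_by is reduced to
 * 100 - 95 = 5 and the three surplus victim buffers (i = 5..7) are unpinned
 * before the file is actually extended.
 */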
2757 : :
2758 : : /* Fail if relation is already at maximum possible length */
2759 [ - + ]: 200453 : if ((uint64) first_block + extend_by >= MaxBlockNumber)
987 andres@anarazel.de 2760 [ # # # # :UBC 0 : ereport(ERROR,
# # # # #
# ]
2761 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2762 : : errmsg("cannot extend relation %s beyond %u blocks",
2763 : : relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2764 : : MaxBlockNumber)));
2765 : :
2766 : : /*
2767 : : * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2768 : : *
2769 : : * This needs to happen before we extend the relation, because as soon as
2770 : : * we do, other backends can start to read in those pages.
2771 : : */
820 peter@eisentraut.org 2772 [ + + ]:CBC 422313 : for (uint32 i = 0; i < extend_by; i++)
2773 : : {
987 andres@anarazel.de 2774 : 221860 : Buffer victim_buf = buffers[i];
2775 : 221860 : BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2776 : : BufferTag tag;
2777 : : uint32 hash;
2778 : : LWLock *partition_lock;
2779 : : int existing_id;
2780 : :
2781 : : /* in case we need to pin an existing buffer below */
770 heikki.linnakangas@i 2782 : 221860 : ResourceOwnerEnlarge(CurrentResourceOwner);
2783 : 221860 : ReservePrivateRefCountEntry();
2784 : :
57 alvherre@kurilemu.de 2785 [ + + ]:GNC 221860 : InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2786 : : first_block + i);
987 andres@anarazel.de 2787 :CBC 221860 : hash = BufTableHashCode(&tag);
2788 : 221860 : partition_lock = BufMappingPartitionLock(hash);
2789 : :
2790 : 221860 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2791 : :
2792 : 221860 : existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2793 : :
2794 : : /*
2795 : : * We get here only in the corner case where we are trying to extend
2796 : : * the relation but we found a pre-existing buffer. This can happen
2797 : : * because a prior attempt at extending the relation failed, and
2798 : : * because mdread doesn't complain about reads beyond EOF (when
2799 : : * zero_damaged_pages is ON) and so a previous attempt to read a block
2800 : : * beyond EOF could have left a "valid" zero-filled buffer.
2801 : : *
2802 : : * This has also been observed when the relation was overwritten by an
2803 : : * external process. Since the legitimate cases should always have
2804 : : * left a zero-filled buffer, complain if the page is not new (PageIsNew).
2805 : : */
2806 [ - + ]: 221860 : if (existing_id >= 0)
2807 : : {
987 andres@anarazel.de 2808 :UBC 0 : BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2809 : : Block buf_block;
2810 : : bool valid;
2811 : :
2812 : : /*
2813 : : * Pin the existing buffer before releasing the partition lock,
2814 : : * preventing it from being evicted.
2815 : : */
70 andres@anarazel.de 2816 :UNC 0 : valid = PinBuffer(existing_hdr, strategy, false);
2817 : :
987 andres@anarazel.de 2818 :UBC 0 : LWLockRelease(partition_lock);
2819 : 0 : UnpinBuffer(victim_buf_hdr);
2820 : :
2821 : 0 : buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2822 : 0 : buf_block = BufHdrGetBlock(existing_hdr);
2823 : :
2824 [ # # # # ]: 0 : if (valid && !PageIsNew((Page) buf_block))
2825 [ # # # # : 0 : ereport(ERROR,
# # # # #
# ]
2826 : : (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2827 : : existing_hdr->tag.blockNum,
2828 : : relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2829 : :
2830 : : /*
2831 : : * We *must* do smgr[zero]extend before succeeding, else the page
2832 : : * will not be reserved by the kernel, and the next P_NEW call
2833 : : * will decide to return the same page. Clear the BM_VALID bit,
2834 : : * do StartBufferIO() and proceed.
2835 : : *
2836 : : * Loop to handle the very small possibility that someone re-sets
2837 : : * BM_VALID between our clearing it and StartBufferIO inspecting
2838 : : * it.
2839 : : */
2840 : : do
2841 : : {
41 andres@anarazel.de 2842 :UNC 0 : pg_atomic_fetch_and_u32(&existing_hdr->state, ~BM_VALID);
623 tmunro@postgresql.or 2843 [ # # ]:UBC 0 : } while (!StartBufferIO(existing_hdr, true, false));
2844 : : }
2845 : : else
2846 : : {
2847 : : uint32 buf_state;
41 andres@anarazel.de 2848 :GNC 221860 : uint32 set_bits = 0;
2849 : :
987 andres@anarazel.de 2850 :CBC 221860 : buf_state = LockBufHdr(victim_buf_hdr);
2851 : :
2852 : : /* some sanity checks while we hold the buffer header lock */
2853 [ - + ]: 221860 : Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2854 [ - + ]: 221860 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2855 : :
2856 : 221860 : victim_buf_hdr->tag = tag;
2857 : :
41 andres@anarazel.de 2858 :GNC 221860 : set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
847 tmunro@postgresql.or 2859 [ + + + + ]:CBC 221860 : if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
41 andres@anarazel.de 2860 :GNC 215960 : set_bits |= BM_PERMANENT;
2861 : :
2862 : 221860 : UnlockBufHdrExt(victim_buf_hdr, buf_state,
2863 : : set_bits, 0,
2864 : : 0);
2865 : :
987 andres@anarazel.de 2866 :CBC 221860 : LWLockRelease(partition_lock);
2867 : :
2868 : : /* XXX: could combine the locked operations in it with the above */
623 tmunro@postgresql.or 2869 : 221860 : StartBufferIO(victim_buf_hdr, true, false);
2870 : : }
2871 : : }
2872 : :
294 michael@paquier.xyz 2873 : 200453 : io_start = pgstat_prepare_io_time(track_io_timing);
2874 : :
2875 : : /*
2876 : : * Note: if smgrzeroextend fails, we will end up with buffers that are
2877 : : * allocated but not marked BM_VALID. The next relation extension will
2878 : : * still select the same block number (because the relation didn't get any
2879 : : * longer on disk) and so future attempts to extend the relation will find
2880 : : * the same buffers (if they have not been recycled) but come right back
2881 : : * here to try smgrzeroextend again.
2882 : : *
2883 : : * We don't need to set checksum for all-zero pages.
2884 : : */
57 alvherre@kurilemu.de 2885 [ + + ]:GNC 200453 : smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
2886 : :
2887 : : /*
2888 : : * Release the file-extension lock; it's now OK for someone else to extend
2889 : : * the relation some more.
2890 : : *
2891 : : * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2892 : : * take noticeable time.
2893 : : */
987 andres@anarazel.de 2894 [ + + ]:CBC 200453 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
847 tmunro@postgresql.or 2895 : 149558 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2896 : :
985 andres@anarazel.de 2897 : 200453 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
337 michael@paquier.xyz 2898 : 200453 : io_start, 1, extend_by * BLCKSZ);
2899 : :
2900 : : /* Set BM_VALID, terminate IO, and wake up any waiters */
820 peter@eisentraut.org 2901 [ + + ]: 422313 : for (uint32 i = 0; i < extend_by; i++)
2902 : : {
987 andres@anarazel.de 2903 : 221860 : Buffer buf = buffers[i];
2904 : 221860 : BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2905 : 221860 : bool lock = false;
2906 : :
2907 [ + + + + ]: 221860 : if (flags & EB_LOCK_FIRST && i == 0)
2908 : 147488 : lock = true;
2909 [ + + ]: 74372 : else if (flags & EB_LOCK_TARGET)
2910 : : {
2911 [ - + ]: 43766 : Assert(extend_upto != InvalidBlockNumber);
2912 [ + + ]: 43766 : if (first_block + i + 1 == extend_upto)
2913 : 43198 : lock = true;
2914 : : }
2915 : :
2916 [ + + ]: 221860 : if (lock)
70 andres@anarazel.de 2917 :GNC 190686 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2918 : :
262 andres@anarazel.de 2919 :CBC 221860 : TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2920 : : }
2921 : :
987 2922 : 200453 : pgBufferUsage.shared_blks_written += extend_by;
2923 : :
2924 : 200453 : *extended_by = extend_by;
2925 : :
2926 : 200453 : return first_block;
2927 : : }
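/*
 * Editorial summary of ExtendBufferedRelShared() above (no new behavior, just
 * the ordering restated for readers of this report):
 *
 *	1. clamp extend_by to the backend's pin budget (LimitAdditionalPins);
 *	2. acquire and zero victim buffers while *not* holding the extension lock,
 *	   since evicting dirty victims may require WAL flushes;
 *	3. take the relation extension lock, re-read the relation size, and drop
 *	   any surplus victim buffers if another backend already extended far
 *	   enough;
 *	4. insert the buffer-table entries and mark the buffers IO_IN_PROGRESS
 *	   before the file grows, so concurrent readers wait instead of racing;
 *	5. smgrzeroextend() the file, release the extension lock, then set
 *	   BM_VALID, terminate the I/O, and lock the pages the caller asked for.
 */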
2928 : :
2929 : : /*
2930 : : * BufferIsLockedByMe
2931 : : *
2932 : : * Checks if this backend has the buffer locked in any mode.
2933 : : *
2934 : : * Buffer must be pinned.
2935 : : */
2936 : : bool
70 andres@anarazel.de 2937 :GNC 9313806 : BufferIsLockedByMe(Buffer buffer)
2938 : : {
2939 : : BufferDesc *bufHdr;
2940 : :
70 andres@anarazel.de 2941 [ - + - + :CBC 9313806 : Assert(BufferIsPinned(buffer));
- + ]
2942 : :
2943 [ - + ]: 9313806 : if (BufferIsLocal(buffer))
2944 : : {
2945 : : /* Content locks are not maintained for local buffers. */
70 andres@anarazel.de 2946 :UBC 0 : return true;
2947 : : }
2948 : : else
2949 : : {
70 andres@anarazel.de 2950 :GNC 9313806 : bufHdr = GetBufferDescriptor(buffer - 1);
2951 : 9313806 : return LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr));
2952 : : }
2953 : : }
2954 : :
2955 : : /*
2956 : : * BufferIsLockedByMeInMode
2957 : : *
2958 : : * Checks if this backend has the buffer locked in the specified mode.
2959 : : *
2960 : : * Buffer must be pinned.
2961 : : */
2962 : : bool
14 2963 : 50137985 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
2964 : : {
2965 : : BufferDesc *bufHdr;
2966 : :
322 tgl@sss.pgh.pa.us 2967 [ - + + + : 50137985 : Assert(BufferIsPinned(buffer));
- + ]
2968 : :
786 jdavis@postgresql.or 2969 [ + + ]: 50137985 : if (BufferIsLocal(buffer))
2970 : : {
2971 : : /* Content locks are not maintained for local buffers. */
322 tgl@sss.pgh.pa.us 2972 : 764 : return true;
2973 : : }
2974 : : else
2975 : : {
2976 : : LWLockMode lw_mode;
2977 : :
70 andres@anarazel.de 2978 [ + - - ]: 50137221 : switch (mode)
2979 : : {
2980 : 50137221 : case BUFFER_LOCK_EXCLUSIVE:
2981 : 50137221 : lw_mode = LW_EXCLUSIVE;
2982 : 50137221 : break;
70 andres@anarazel.de 2983 :UNC 0 : case BUFFER_LOCK_SHARE:
2984 : 0 : lw_mode = LW_SHARED;
2985 : 0 : break;
2986 : 0 : default:
2987 : 0 : pg_unreachable();
2988 : : }
2989 : :
786 jdavis@postgresql.or 2990 :CBC 50137221 : bufHdr = GetBufferDescriptor(buffer - 1);
322 tgl@sss.pgh.pa.us 2991 : 50137221 : return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2992 : : lw_mode);
2993 : : }
2994 : : }
2995 : :
2996 : : /*
2997 : : * BufferIsDirty
2998 : : *
2999 : : * Checks if buffer is already dirty.
3000 : : *
3001 : : * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
3002 : : * the result may be stale before it's returned.)
3003 : : */
3004 : : bool
786 jdavis@postgresql.or 3005 : 14946374 : BufferIsDirty(Buffer buffer)
3006 : : {
3007 : : BufferDesc *bufHdr;
3008 : :
322 tgl@sss.pgh.pa.us 3009 [ - + - + : 14946374 : Assert(BufferIsPinned(buffer));
- + ]
3010 : :
786 jdavis@postgresql.or 3011 [ - + ]: 14946374 : if (BufferIsLocal(buffer))
3012 : : {
786 jdavis@postgresql.or 3013 :UBC 0 : int bufid = -buffer - 1;
3014 : :
3015 : 0 : bufHdr = GetLocalBufferDescriptor(bufid);
3016 : : /* Content locks are not maintained for local buffers. */
3017 : : }
3018 : : else
3019 : : {
786 jdavis@postgresql.or 3020 :CBC 14946374 : bufHdr = GetBufferDescriptor(buffer - 1);
70 andres@anarazel.de 3021 [ - + ]:GNC 14946374 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
3022 : : }
3023 : :
786 jdavis@postgresql.or 3024 :CBC 14946374 : return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
3025 : : }
3026 : :
3027 : : /*
3028 : : * MarkBufferDirty
3029 : : *
3030 : : * Marks buffer contents as dirty (actual write happens later).
3031 : : *
3032 : : * Buffer must be pinned and exclusive-locked. (If caller does not hold
3033 : : * an exclusive lock, then somebody could be in the process of writing the
3034 : : * buffer, leading to a risk of bad data being written to disk.)
3035 : : */
3036 : : void
7201 tgl@sss.pgh.pa.us 3037 : 21669445 : MarkBufferDirty(Buffer buffer)
3038 : : {
3039 : : BufferDesc *bufHdr;
3040 : : uint32 buf_state;
3041 : : uint32 old_buf_state;
3042 : :
7733 3043 [ - + ]: 21669445 : if (!BufferIsValid(buffer))
5295 peter_e@gmx.net 3044 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
3045 : :
9392 tgl@sss.pgh.pa.us 3046 [ + + ]:CBC 21669445 : if (BufferIsLocal(buffer))
3047 : : {
7201 3048 : 1379852 : MarkLocalBufferDirty(buffer);
8586 bruce@momjian.us 3049 : 1379852 : return;
3050 : : }
3051 : :
3975 andres@anarazel.de 3052 : 20289593 : bufHdr = GetBufferDescriptor(buffer - 1);
3053 : :
4127 3054 [ - + - + : 20289593 : Assert(BufferIsPinned(buffer));
- + ]
70 andres@anarazel.de 3055 [ - + ]:GNC 20289593 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
3056 : :
3057 : : /*
3058 : : * NB: We have to wait for the buffer header spinlock to be not held, as
3059 : : * TerminateBufferIO() relies on the spinlock.
3060 : : */
3538 andres@anarazel.de 3061 :CBC 20289593 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
3062 : : for (;;)
3063 : : {
3064 [ + + ]: 20289634 : if (old_buf_state & BM_LOCKED)
3065 : 1 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
3066 : :
3067 : 20289634 : buf_state = old_buf_state;
3068 : :
3069 [ - + ]: 20289634 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3070 : 20289634 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3071 : :
3072 [ + + ]: 20289634 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
3073 : : buf_state))
3074 : 20289593 : break;
3075 : : }
3076 : :
3077 : : /*
3078 : : * If the buffer was not dirty already, do vacuum accounting.
3079 : : */
3080 [ + + ]: 20289593 : if (!(old_buf_state & BM_DIRTY))
3081 : : {
5047 rhaas@postgresql.org 3082 : 662694 : pgBufferUsage.shared_blks_dirtied++;
5136 alvherre@alvh.no-ip. 3083 [ + + ]: 662694 : if (VacuumCostActive)
3084 : 9996 : VacuumCostBalance += VacuumCostPageDirty;
3085 : : }
3086 : : }
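/*
 * Editorial sketch (not bufmgr.c code): the lock-free update in
 * MarkBufferDirty() above is the classic compare-and-swap retry loop.  The
 * standalone C11 version below uses invented names (sketch_mark_dirty,
 * SKETCH_DIRTY, SKETCH_JUST_DIRTIED) and plain <stdatomic.h> instead of
 * PostgreSQL's pg_atomic_* wrappers, and it omits the wait for the buffer
 * header spinlock (BM_LOCKED) that the real code performs first; it only
 * illustrates the CAS pattern.
 */
#include <stdatomic.h>
#include <stdint.h>

#define SKETCH_DIRTY			((uint32_t) 1 << 0)
#define SKETCH_JUST_DIRTIED		((uint32_t) 1 << 1)

/*
 * Returns the state observed before the update, mirroring how the code above
 * uses old_buf_state to decide whether to do the "newly dirtied" accounting.
 */
static uint32_t
sketch_mark_dirty(_Atomic uint32_t *state)
{
	uint32_t	old = atomic_load(state);

	for (;;)
	{
		uint32_t	desired = old | SKETCH_DIRTY | SKETCH_JUST_DIRTIED;

		/* On failure, 'old' is refreshed with the current value; retry. */
		if (atomic_compare_exchange_weak(state, &old, desired))
			return old;
	}
}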
3087 : :
3088 : : /*
3089 : : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3090 : : *
3091 : : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3092 : : * compared to calling the two routines separately. Now it's mainly just
3093 : : * a convenience function. However, if the passed buffer is valid and
3094 : : * already contains the desired block, we just return it as-is; and that
3095 : : * does save considerable work compared to a full release and reacquire.
3096 : : *
3097 : : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3098 : : * buffer actually needs to be released. This case is the same as ReadBuffer,
3099 : : * but can save some tests in the caller.
3100 : : */
3101 : : Buffer
10753 scrappy@hub.org 3102 : 27313599 : ReleaseAndReadBuffer(Buffer buffer,
3103 : : Relation relation,
3104 : : BlockNumber blockNum)
3105 : : {
6033 bruce@momjian.us 3106 : 27313599 : ForkNumber forkNum = MAIN_FORKNUM;
3107 : : BufferDesc *bufHdr;
3108 : :
8985 tgl@sss.pgh.pa.us 3109 [ + + ]: 27313599 : if (BufferIsValid(buffer))
3110 : : {
4127 andres@anarazel.de 3111 [ - + + + : 16579892 : Assert(BufferIsPinned(buffer));
- + ]
8985 tgl@sss.pgh.pa.us 3112 [ + + ]: 16579892 : if (BufferIsLocal(buffer))
3113 : : {
3975 andres@anarazel.de 3114 : 136457 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
8957 tgl@sss.pgh.pa.us 3115 [ + + + - ]: 139979 : if (bufHdr->tag.blockNum == blockNum &&
1211 rhaas@postgresql.org 3116 [ + - ]: 7044 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3117 : 3522 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
8957 tgl@sss.pgh.pa.us 3118 : 3522 : return buffer;
987 andres@anarazel.de 3119 : 132935 : UnpinLocalBuffer(buffer);
3120 : : }
3121 : : else
3122 : : {
3975 3123 : 16443435 : bufHdr = GetBufferDescriptor(buffer - 1);
3124 : : /* we have pin, so it's ok to examine tag without spinlock */
8957 tgl@sss.pgh.pa.us 3125 [ + + + - ]: 22061802 : if (bufHdr->tag.blockNum == blockNum &&
1211 rhaas@postgresql.org 3126 [ + - ]: 11236734 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3127 : 5618367 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
8957 tgl@sss.pgh.pa.us 3128 : 5618367 : return buffer;
1174 michael@paquier.xyz 3129 : 10825068 : UnpinBuffer(bufHdr);
3130 : : }
3131 : : }
3132 : :
7593 tgl@sss.pgh.pa.us 3133 : 21691710 : return ReadBuffer(relation, blockNum);
3134 : : }
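/*
 * Editorial usage sketch (hypothetical caller, not part of bufmgr.c): code
 * that walks a relation block by block typically keeps one pinned buffer and
 * swaps it for the next block in a single ReleaseAndReadBuffer() call, which
 * simply returns the same buffer when it already holds the wanted block:
 *
 *		Buffer		buf = InvalidBuffer;
 *
 *		for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
 *		{
 *			buf = ReleaseAndReadBuffer(buf, relation, blkno);
 *			... inspect the page while the pin is held ...
 *		}
 *		if (BufferIsValid(buf))
 *			ReleaseBuffer(buf);
 */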
3135 : :
3136 : : /*
3137 : : * PinBuffer -- make buffer unavailable for replacement.
3138 : : *
3139 : : * For the default access strategy, the buffer's usage_count is incremented
3140 : : * when we first pin it; for other strategies we just make sure the usage_count
3141 : : * isn't zero. (The idea of the latter is that we don't want synchronized
3142 : : * heap scans to inflate the count, but we need it to not be zero to discourage
3143 : : * other backends from stealing buffers from our ring. As long as we cycle
3144 : : * through the ring faster than the global clock-sweep cycles, buffers in
3145 : : * our ring won't be chosen as victims for replacement by other backends.)
3146 : : *
3147 : : * This should be applied only to shared buffers, never local ones.
3148 : : *
3149 : : * Since buffers are pinned/unpinned very frequently, pin buffers without
3150 : : * taking the buffer header lock; instead update the state variable in a loop
3151 : : * of CAS operations. Hopefully it's just a single CAS.
3152 : : *
3153 : : * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3154 : : * must have been done already.
3155 : : *
3156 : : * Returns true if buffer is BM_VALID, else false. This provision allows
3157 : : * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3158 : : * true, then a false return value also indicates that the buffer was
3159 : : * (recently) invalid and has not been pinned.
3160 : : */
3161 : : static bool
70 andres@anarazel.de 3162 :GNC 55270420 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
3163 : : bool skip_if_not_valid)
3164 : : {
3780 andres@anarazel.de 3165 :CBC 55270420 : Buffer b = BufferDescriptorGetBuffer(buf);
3166 : : bool result;
3167 : : PrivateRefCountEntry *ref;
3168 : :
987 3169 [ - + ]: 55270420 : Assert(!BufferIsLocal(b));
3 andres@anarazel.de 3170 [ - + ]:GNC 55270420 : Assert(ReservedRefCountSlot != -1);
3171 : :
3780 andres@anarazel.de 3172 :CBC 55270420 : ref = GetPrivateRefCountEntry(b, true);
3173 : :
3985 3174 [ + + ]: 55270420 : if (ref == NULL)
3175 : : {
3176 : : uint32 buf_state;
3177 : : uint32 old_buf_state;
3178 : :
3538 3179 : 53101956 : old_buf_state = pg_atomic_read_u32(&buf->state);
3180 : : for (;;)
3181 : : {
70 andres@anarazel.de 3182 [ + + + + :GNC 53109932 : if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
+ + ]
3183 : 3 : return false;
3184 : :
3185 : : /*
3186 : : * We're not allowed to increase the refcount while the buffer
3187 : : * header spinlock is held. Wait for the lock to be released.
3188 : : */
3538 andres@anarazel.de 3189 [ + + ]:CBC 53109929 : if (old_buf_state & BM_LOCKED)
3190 : 264 : old_buf_state = WaitBufHdrUnlocked(buf);
3191 : :
3192 : 53109929 : buf_state = old_buf_state;
3193 : :
3194 : : /* increase refcount */
3195 : 53109929 : buf_state += BUF_REFCOUNT_ONE;
3196 : :
3194 teodor@sigaev.ru 3197 [ + + ]: 53109929 : if (strategy == NULL)
3198 : : {
3199 : : /* Default case: increase usagecount unless already max. */
3200 [ + + ]: 52635769 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
3201 : 3371315 : buf_state += BUF_USAGECOUNT_ONE;
3202 : : }
3203 : : else
3204 : : {
3205 : : /*
3206 : : * Ring buffers shouldn't evict others from pool. Thus we
3207 : : * don't make usagecount more than 1.
3208 : : */
3209 [ + + ]: 474160 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3210 : 34463 : buf_state += BUF_USAGECOUNT_ONE;
3211 : : }
3212 : :
3538 andres@anarazel.de 3213 [ + + ]: 53109929 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3214 : : buf_state))
3215 : : {
3216 : 53101953 : result = (buf_state & BM_VALID) != 0;
3217 : :
70 andres@anarazel.de 3218 :GNC 53101953 : TrackNewBufferPin(b);
3538 andres@anarazel.de 3219 :CBC 53101953 : break;
3220 : : }
3221 : : }
3222 : : }
3223 : : else
3224 : : {
3225 : : /*
3226 : : * If we previously pinned the buffer, it is likely to be valid, but
3227 : : * it may not be if StartReadBuffers() was called and
3228 : : * WaitReadBuffers() hasn't been called yet. We'll check by loading
3229 : : * the flags without locking. This is racy, but it's OK to return
3230 : : * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3231 : : * it'll see that it's now valid.
3232 : : *
3233 : : * Note: We deliberately avoid a Valgrind client request here.
3234 : : * Individual access methods can optionally superimpose buffer page
3235 : : * client requests on top of our client requests to enforce that
3236 : : * buffers are only accessed while locked (and pinned). It's possible
3237 : : * that the buffer page is legitimately non-accessible here. We
3238 : : * cannot meddle with that.
3239 : : */
623 tmunro@postgresql.or 3240 : 2168464 : result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3241 : :
3 andres@anarazel.de 3242 [ - + ]:GNC 2168464 : Assert(ref->data.refcount > 0);
3243 : 2168464 : ref->data.refcount++;
70 3244 : 2168464 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3245 : : }
3246 : :
7593 tgl@sss.pgh.pa.us 3247 :CBC 55270417 : return result;
3248 : : }
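/*
 * Editorial sketch (not part of bufmgr.c): the usage-count policy PinBuffer()
 * applies above, extracted into a standalone helper.  The helper name and its
 * arguments are invented; the cap of 5 matches BM_MAX_USAGE_COUNT in current
 * sources, but treat that constant as an assumption of this sketch.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
sketch_bump_usage_count(uint32_t usage_count, bool using_strategy_ring)
{
	if (!using_strategy_ring)
	{
		/* default strategy: reward repeated use, capped at the maximum */
		if (usage_count < 5)
			usage_count++;
	}
	else
	{
		/*
		 * ring strategy (e.g. bulk reads): never push the count above 1, so
		 * ring buffers remain easy victims for everyone else while still not
		 * being stolen out from under the ring immediately.
		 */
		if (usage_count == 0)
			usage_count = 1;
	}
	return usage_count;
}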
3249 : :
3250 : : /*
3251 : : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3252 : : * The spinlock is released before return.
3253 : : *
3254 : : * As this function is called with the spinlock held, the caller has to
3255 : : * previously call ReservePrivateRefCountEntry() and
3256 : : * ResourceOwnerEnlarge(CurrentResourceOwner);
3257 : : *
3258 : : * Currently, no callers of this function want to modify the buffer's
3259 : : * usage_count at all, so there's no need for a strategy parameter.
3260 : : * Also we don't bother with a BM_VALID test (the caller could check that for
3261 : : * itself).
3262 : : *
3263 : : * Also all callers only ever use this function when it's known that the
3264 : : * buffer can't have a preexisting pin by this backend. That allows us to skip
3265 : : * searching the private refcount array & hash, which is a boon, because the
3266 : : * spinlock is still held.
3267 : : *
3268 : : * Note: use of this routine is frequently mandatory, not just an optimization
3269 : : * to save a spin lock/unlock cycle, because we need to pin a buffer before
3270 : : * its state can change under us.
3271 : : */
3272 : : static void
3684 rhaas@postgresql.org 3273 : 307471 : PinBuffer_Locked(BufferDesc *buf)
3274 : : {
3275 : : uint32 old_buf_state;
3276 : :
3277 : : /*
3278 : : * As explained, we don't expect any preexisting pins. That allows us to
3279 : : * manipulate the PrivateRefCount after releasing the spinlock.
3280 : : */
3780 andres@anarazel.de 3281 [ - + ]: 307471 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
3282 : :
3283 : : /*
3284 : : * Since we hold the buffer spinlock, we can update the buffer state and
3285 : : * release the lock in one operation.
3286 : : */
41 andres@anarazel.de 3287 :GNC 307471 : old_buf_state = pg_atomic_read_u32(&buf->state);
3288 : :
3289 : 307471 : UnlockBufHdrExt(buf, old_buf_state,
3290 : : 0, 0, 1);
3291 : :
70 3292 : 307471 : TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
7912 tgl@sss.pgh.pa.us 3293 :CBC 307471 : }
3294 : :
3295 : : /*
3296 : : * Support for waking up another backend that is waiting for the cleanup lock
3297 : : * to be released using BM_PIN_COUNT_WAITER.
3298 : : *
3299 : : * See LockBufferForCleanup().
3300 : : *
3301 : : * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3302 : : * not just reducing the backend-local pincount for the buffer).
3303 : : */
3304 : : static void
262 andres@anarazel.de 3305 : 7 : WakePinCountWaiter(BufferDesc *buf)
3306 : : {
3307 : : /*
3308 : : * Acquire the buffer header lock, re-check that there's a waiter. Another
3309 : : * backend could have unpinned this buffer, and already woken up the
3310 : : * waiter.
3311 : : *
3312 : : * There's no danger of the buffer being replaced after we unpinned it
3313 : : * above, as it's pinned by the waiter. The waiter removes
3314 : : * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3315 : : * backend waking it up.
3316 : : */
3317 : 7 : uint32 buf_state = LockBufHdr(buf);
3318 : :
3319 [ + - ]: 7 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
3320 [ + + ]: 7 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3321 : 6 : {
3322 : : /* we just released the last pin other than the waiter's */
3323 : 6 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3324 : :
41 andres@anarazel.de 3325 :GNC 6 : UnlockBufHdrExt(buf, buf_state,
3326 : : 0, BM_PIN_COUNT_WAITER,
3327 : : 0);
262 andres@anarazel.de 3328 :CBC 6 : ProcSendSignal(wait_backend_pgprocno);
3329 : : }
3330 : : else
41 andres@anarazel.de 3331 :GNC 1 : UnlockBufHdr(buf);
262 andres@anarazel.de 3332 :CBC 7 : }
3333 : :
3334 : : /*
3335 : : * UnpinBuffer -- make buffer available for replacement.
3336 : : *
3337 : : * This should be applied only to shared buffers, never local ones. This
3338 : : * always adjusts CurrentResourceOwner.
3339 : : */
3340 : : static void
1174 michael@paquier.xyz 3341 : 68014993 : UnpinBuffer(BufferDesc *buf)
3342 : : {
770 heikki.linnakangas@i 3343 : 68014993 : Buffer b = BufferDescriptorGetBuffer(buf);
3344 : :
3345 : 68014993 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
3346 : 68014993 : UnpinBufferNoOwner(buf);
3347 : 68014993 : }
3348 : :
3349 : : static void
3350 : 68019557 : UnpinBufferNoOwner(BufferDesc *buf)
3351 : : {
3352 : : PrivateRefCountEntry *ref;
3780 andres@anarazel.de 3353 : 68019557 : Buffer b = BufferDescriptorGetBuffer(buf);
3354 : :
987 3355 [ - + ]: 68019557 : Assert(!BufferIsLocal(b));
3356 : :
3357 : : /* not moving as we're likely deleting it soon anyway */
3780 3358 : 68019557 : ref = GetPrivateRefCountEntry(b, false);
4127 3359 [ - + ]: 68019557 : Assert(ref != NULL);
3 andres@anarazel.de 3360 [ - + ]:GNC 68019557 : Assert(ref->data.refcount > 0);
3361 : 68019557 : ref->data.refcount--;
3362 [ + + ]: 68019557 : if (ref->data.refcount == 0)
3363 : : {
3364 : : uint32 old_buf_state;
3365 : :
3366 : : /*
3367 : : * Mark buffer non-accessible to Valgrind.
3368 : : *
3369 : : * Note that the buffer may have already been marked non-accessible
3370 : : * within access method code that enforces that buffers are only
3371 : : * accessed while a buffer lock is held.
3372 : : */
3373 : : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
3374 : :
3375 : : /*
3376 : : * I'd better not still hold the buffer content lock. Can't use
3377 : : * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3378 : : */
3655 rhaas@postgresql.org 3379 [ - + ]: 55395899 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
3380 : :
3381 : : /* decrement the shared reference count */
41 andres@anarazel.de 3382 : 55395899 : old_buf_state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
3383 : :
3384 : : /* Support LockBufferForCleanup() */
3385 [ + + ]: 55395899 : if (old_buf_state & BM_PIN_COUNT_WAITER)
262 andres@anarazel.de 3386 :CBC 7 : WakePinCountWaiter(buf);
3387 : :
4127 3388 : 55395899 : ForgetPrivateRefCountEntry(ref);
3389 : : }
7912 tgl@sss.pgh.pa.us 3390 : 68019557 : }
3391 : :
3392 : : /*
3393 : : * Set up backend-local tracking of a buffer pinned the first time by this
3394 : : * backend.
3395 : : */
3396 : : inline void
70 andres@anarazel.de 3397 :GNC 55395899 : TrackNewBufferPin(Buffer buf)
3398 : : {
3399 : : PrivateRefCountEntry *ref;
3400 : :
3401 : 55395899 : ref = NewPrivateRefCountEntry(buf);
3 3402 : 55395899 : ref->data.refcount++;
3403 : :
70 3404 : 55395899 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
3405 : :
3406 : : /*
3407 : : * This is the first pin for this page by this backend, mark its page as
3408 : : * defined to valgrind. While the page contents might not actually be
3409 : : * valid yet, we don't currently guarantee that such pages are marked
3410 : : * undefined or non-accessible.
3411 : : *
3412 : : * It's not necessarily the prettiest to do this here, but otherwise we'd
3413 : : * need this block of code in multiple places.
3414 : : */
3415 : : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
3416 : : BLCKSZ);
3417 : 55395899 : }
3418 : :
3419 : : #define ST_SORT sort_checkpoint_bufferids
3420 : : #define ST_ELEMENT_TYPE CkptSortItem
3421 : : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3422 : : #define ST_SCOPE static
3423 : : #define ST_DEFINE
3424 : : #include "lib/sort_template.h"
3425 : :
3426 : : /*
3427 : : * BufferSync -- Write out all dirty buffers in the pool.
3428 : : *
3429 : : * This is called at checkpoint time to write out all dirty shared buffers.
3430 : : * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3431 : : * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3432 : : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3433 : : * even unlogged buffers, which are otherwise skipped. The remaining flags
3434 : : * currently have no effect here.
3435 : : */
3436 : : static void
6747 tgl@sss.pgh.pa.us 3437 :CBC 1729 : BufferSync(int flags)
3438 : : {
3439 : : uint32 buf_state;
3440 : : int buf_id;
3441 : : int num_to_scan;
3442 : : int num_spaces;
3443 : : int num_processed;
3444 : : int num_written;
3589 andres@anarazel.de 3445 : 1729 : CkptTsStatus *per_ts_stat = NULL;
3446 : : Oid last_tsid;
3447 : : binaryheap *ts_heap;
3448 : : int i;
70 andres@anarazel.de 3449 :GNC 1729 : uint32 mask = BM_DIRTY;
3450 : : WritebackContext wb_context;
3451 : :
3452 : : /*
3453 : : * Unless this is a shutdown checkpoint or we have been explicitly told,
3454 : : * we write only permanent, dirty buffers. But at shutdown or end of
3455 : : * recovery, we write all dirty buffers.
3456 : : */
4076 andres@anarazel.de 3457 [ + + ]:CBC 1729 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
3458 : : CHECKPOINT_FLUSH_UNLOGGED))))
5094 rhaas@postgresql.org 3459 : 984 : mask |= BM_PERMANENT;
3460 : :
3461 : : /*
3462 : : * Loop over all buffers, and mark the ones that need to be written with
3463 : : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3464 : : * can estimate how much work needs to be done.
3465 : : *
3466 : : * This allows us to write only those pages that were dirty when the
3467 : : * checkpoint began, and not those that get dirtied while it proceeds.
3468 : : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3469 : : * later in this function, or by normal backends or the bgwriter cleaning
3470 : : * scan, the flag is cleared. Any buffer dirtied after this point won't
3471 : : * have the flag set.
3472 : : *
3473 : : * Note that if we fail to write some buffer, we may leave buffers with
3474 : : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3475 : : * certainly need to be written for the next checkpoint attempt, too.
3476 : : */
3589 andres@anarazel.de 3477 : 1729 : num_to_scan = 0;
6747 tgl@sss.pgh.pa.us 3478 [ + + ]: 11943313 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
3479 : : {
3684 rhaas@postgresql.org 3480 : 11941584 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
41 andres@anarazel.de 3481 :GNC 11941584 : uint32 set_bits = 0;
3482 : :
3483 : : /*
3484 : : * Header spinlock is enough to examine BM_DIRTY, see comment in
3485 : : * SyncOneBuffer.
3486 : : */
3538 andres@anarazel.de 3487 :CBC 11941584 : buf_state = LockBufHdr(bufHdr);
3488 : :
3489 [ + + ]: 11941584 : if ((buf_state & mask) == mask)
3490 : : {
3491 : : CkptSortItem *item;
3492 : :
41 andres@anarazel.de 3493 :GNC 293683 : set_bits = BM_CHECKPOINT_NEEDED;
3494 : :
3589 andres@anarazel.de 3495 :CBC 293683 : item = &CkptBufferIds[num_to_scan++];
3496 : 293683 : item->buf_id = buf_id;
1211 rhaas@postgresql.org 3497 : 293683 : item->tsId = bufHdr->tag.spcOid;
3498 : 293683 : item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3499 : 293683 : item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3589 andres@anarazel.de 3500 : 293683 : item->blockNum = bufHdr->tag.blockNum;
3501 : : }
3502 : :
41 andres@anarazel.de 3503 :GNC 11941584 : UnlockBufHdrExt(bufHdr, buf_state,
3504 : : set_bits, 0,
3505 : : 0);
3506 : :
3507 : : /* Check for barrier events in case NBuffers is large. */
2190 rhaas@postgresql.org 3508 [ - + ]:CBC 11941584 : if (ProcSignalBarrierPending)
2190 rhaas@postgresql.org 3509 :UBC 0 : ProcessProcSignalBarrier();
3510 : : }
3511 : :
3589 andres@anarazel.de 3512 [ + + ]:CBC 1729 : if (num_to_scan == 0)
6747 tgl@sss.pgh.pa.us 3513 : 661 : return; /* nothing to do */
3514 : :
3589 andres@anarazel.de 3515 : 1068 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
3516 : :
3517 : : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3518 : :
3519 : : /*
3520 : : * Sort buffers that need to be written to reduce the likelihood of random
3521 : : * IO. The sorting is also important for the implementation of balancing
3522 : : * writes between tablespaces. Without balancing writes we'd potentially
3523 : : * end up writing to the tablespaces one-by-one, possibly overloading the
3524 : : * underlying system.
3525 : : */
1741 tmunro@postgresql.or 3526 : 1068 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3527 : :
3589 andres@anarazel.de 3528 : 1068 : num_spaces = 0;
3529 : :
3530 : : /*
3531 : : * Allocate progress status for each tablespace with buffers that need to
3532 : : * be flushed. This requires the to-be-flushed array to be sorted.
3533 : : */
3534 : 1068 : last_tsid = InvalidOid;
3535 [ + + ]: 294751 : for (i = 0; i < num_to_scan; i++)
3536 : : {
3537 : : CkptTsStatus *s;
3538 : : Oid cur_tsid;
3539 : :
3540 : 293683 : cur_tsid = CkptBufferIds[i].tsId;
3541 : :
3542 : : /*
3543 : : * Grow array of per-tablespace status structs, every time a new
3544 : : * tablespace is found.
3545 : : */
3546 [ + + + + ]: 293683 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3547 : 1622 : {
3548 : : Size sz;
3549 : :
3550 : 1622 : num_spaces++;
3551 : :
3552 : : /*
3553 : : * Not worth adding grow-by-power-of-2 logic here - even with a
3554 : : * few hundred tablespaces this should be fine.
3555 : : */
3556 : 1622 : sz = sizeof(CkptTsStatus) * num_spaces;
3557 : :
3558 [ + + ]: 1622 : if (per_ts_stat == NULL)
3559 : 1068 : per_ts_stat = (CkptTsStatus *) palloc(sz);
3560 : : else
3561 : 554 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3562 : :
3563 : 1622 : s = &per_ts_stat[num_spaces - 1];
3564 : 1622 : memset(s, 0, sizeof(*s));
3565 : 1622 : s->tsId = cur_tsid;
3566 : :
3567 : : /*
3568 : : * The first buffer in this tablespace. As CkptBufferIds is sorted
3569 : : * by tablespace all (s->num_to_scan) buffers in this tablespace
3570 : : * will follow afterwards.
3571 : : */
3572 : 1622 : s->index = i;
3573 : :
3574 : : /*
3575 : : * progress_slice will be determined once we know how many buffers
3576 : : * are in each tablespace, i.e. after this loop.
3577 : : */
3578 : :
3579 : 1622 : last_tsid = cur_tsid;
3580 : : }
3581 : : else
3582 : : {
3583 : 292061 : s = &per_ts_stat[num_spaces - 1];
3584 : : }
3585 : :
3586 : 293683 : s->num_to_scan++;
3587 : :
3588 : : /* Check for barrier events. */
2190 rhaas@postgresql.org 3589 [ - + ]: 293683 : if (ProcSignalBarrierPending)
2190 rhaas@postgresql.org 3590 :UBC 0 : ProcessProcSignalBarrier();
3591 : : }
3592 : :
3589 andres@anarazel.de 3593 [ - + ]:CBC 1068 : Assert(num_spaces > 0);
3594 : :
3595 : : /*
3596 : : * Build a min-heap over the write-progress in the individual tablespaces,
3597 : : * and compute how large a portion of the total progress a single
3598 : : * processed buffer is.
3599 : : */
3600 : 1068 : ts_heap = binaryheap_allocate(num_spaces,
3601 : : ts_ckpt_progress_comparator,
3602 : : NULL);
3603 : :
3604 [ + + ]: 2690 : for (i = 0; i < num_spaces; i++)
3605 : : {
3606 : 1622 : CkptTsStatus *ts_stat = &per_ts_stat[i];
3607 : :
3608 : 1622 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3609 : :
3610 : 1622 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3611 : : }
3612 : :
3613 : 1068 : binaryheap_build(ts_heap);
3614 : :
3615 : : /*
3616 : : * Iterate through to-be-checkpointed buffers and write the ones (still)
3617 : : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3618 : : * tablespaces; otherwise the sorting would lead to only one tablespace
3619 : : * receiving writes at a time, making inefficient use of the hardware.
3620 : : */
3621 : 1068 : num_processed = 0;
6747 tgl@sss.pgh.pa.us 3622 : 1068 : num_written = 0;
3589 andres@anarazel.de 3623 [ + + ]: 294751 : while (!binaryheap_empty(ts_heap))
3624 : : {
3625 : 293683 : BufferDesc *bufHdr = NULL;
3626 : : CkptTsStatus *ts_stat = (CkptTsStatus *)
943 tgl@sss.pgh.pa.us 3627 : 293683 : DatumGetPointer(binaryheap_first(ts_heap));
3628 : :
3589 andres@anarazel.de 3629 : 293683 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
3630 [ - + ]: 293683 : Assert(buf_id != -1);
3631 : :
3632 : 293683 : bufHdr = GetBufferDescriptor(buf_id);
3633 : :
3634 : 293683 : num_processed++;
3635 : :
3636 : : /*
3637 : : * We don't need to acquire the lock here, because we're only looking
3638 : : * at a single bit. It's possible that someone else writes the buffer
3639 : : * and clears the flag right after we check, but that doesn't matter
3640 : : * since SyncOneBuffer will then do nothing. However, there is a
3641 : : * further race condition: it's conceivable that between the time we
3642 : : * examine the bit here and the time SyncOneBuffer acquires the lock,
3643 : : * someone else not only wrote the buffer but replaced it with another
3644 : : * page and dirtied it. In that improbable case, SyncOneBuffer will
3645 : : * write the buffer even though we didn't need to. It doesn't seem worth
3646 : : * guarding against this, though.
3647 : : */
3538 3648 [ + + ]: 293683 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3649 : : {
3589 3650 [ + - ]: 274072 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3651 : : {
3652 : : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
779 michael@paquier.xyz 3653 : 274072 : PendingCheckpointerStats.buffers_written++;
6747 tgl@sss.pgh.pa.us 3654 : 274072 : num_written++;
3655 : : }
3656 : : }
3657 : :
3658 : : /*
3659 : : * Measure progress independently of whether we actually had to flush the
3660 : : * buffer - otherwise writing becomes unbalanced.
3661 : : */
3589 andres@anarazel.de 3662 : 293683 : ts_stat->progress += ts_stat->progress_slice;
3663 : 293683 : ts_stat->num_scanned++;
3664 : 293683 : ts_stat->index++;
3665 : :
3666 : : /* Have all the buffers from the tablespace been processed? */
3667 [ + + ]: 293683 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
3668 : : {
3669 : 1622 : binaryheap_remove_first(ts_heap);
3670 : : }
3671 : : else
3672 : : {
3673 : : /* update heap with the new progress */
3674 : 292061 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3675 : : }
3676 : :
3677 : : /*
3678 : : * Sleep to throttle our I/O rate.
3679 : : *
3680 : : * (This will check for barrier events even if it doesn't sleep.)
3681 : : */
3682 : 293683 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3683 : : }
3684 : :
3685 : : /*
3686 : : * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3687 : : * IOContext will always be IOCONTEXT_NORMAL.
3688 : : */
945 3689 : 1068 : IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3690 : :
3589 3691 : 1068 : pfree(per_ts_stat);
3692 : 1068 : per_ts_stat = NULL;
3693 : 1068 : binaryheap_free(ts_heap);
3694 : :
3695 : : /*
3696 : : * Update checkpoint statistics. As noted above, this doesn't include
3697 : : * buffers written by other backends or bgwriter scan.
3698 : : */
6745 tgl@sss.pgh.pa.us 3699 : 1068 : CheckpointStats.ckpt_bufs_written += num_written;
3700 : :
3701 : : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3702 : : }
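/*
 * Editorial worked example for the tablespace balancing above (invented
 * numbers): with num_to_scan = 300 buffers total, split 200/100 between two
 * tablespaces, progress_slice is 300/200 = 1.5 for the first and
 * 300/100 = 3.0 for the second.  Writing one buffer advances a tablespace's
 * progress by its slice, and the min-heap always picks the tablespace with
 * the least progress, so writes interleave roughly 2:1 and both tablespaces
 * reach progress 300 at about the same time.
 */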
3703 : :
3704 : : /*
3705 : : * BgBufferSync -- Write out some dirty buffers in the pool.
3706 : : *
3707 : : * This is called periodically by the background writer process.
3708 : : *
3709 : : * Returns true if it's appropriate for the bgwriter process to go into
3710 : : * low-power hibernation mode. (This happens if the strategy clock-sweep
3711 : : * has been "lapped" and no buffer allocations have occurred recently,
3712 : : * or if the bgwriter has been effectively disabled by setting
3713 : : * bgwriter_lru_maxpages to 0.)
3714 : : */
3715 : : bool
3589 andres@anarazel.de 3716 : 11791 : BgBufferSync(WritebackContext *wb_context)
3717 : : {
3718 : : /* info obtained from freelist.c */
3719 : : int strategy_buf_id;
3720 : : uint32 strategy_passes;
3721 : : uint32 recent_alloc;
3722 : :
3723 : : /*
3724 : : * Information saved between calls so we can determine the strategy
3725 : : * point's advance rate and avoid scanning already-cleaned buffers.
3726 : : */
3727 : : static bool saved_info_valid = false;
3728 : : static int prev_strategy_buf_id;
3729 : : static uint32 prev_strategy_passes;
3730 : : static int next_to_clean;
3731 : : static uint32 next_passes;
3732 : :
3733 : : /* Moving averages of allocation rate and clean-buffer density */
3734 : : static float smoothed_alloc = 0;
3735 : : static float smoothed_density = 10.0;
3736 : :
3737 : : /* Potentially these could be tunables, but for now, not */
6658 tgl@sss.pgh.pa.us 3738 : 11791 : float smoothing_samples = 16;
3739 : 11791 : float scan_whole_pool_milliseconds = 120000.0;
3740 : :
3741 : : /* Used to compute how far we scan ahead */
3742 : : long strategy_delta;
3743 : : int bufs_to_lap;
3744 : : int bufs_ahead;
3745 : : float scans_per_alloc;
3746 : : int reusable_buffers_est;
3747 : : int upcoming_alloc_est;
3748 : : int min_scan_buffers;
3749 : :
3750 : : /* Variables for the scanning loop proper */
3751 : : int num_to_scan;
3752 : : int num_written;
3753 : : int reusable_buffers;
3754 : :
3755 : : /* Variables for final smoothed_density update */
3756 : : long new_strategy_delta;
3757 : : uint32 new_recent_alloc;
3758 : :
3759 : : /*
3760 : : * Find out where the clock-sweep currently is, and how many buffer
3761 : : * allocations have happened since our last call.
3762 : : */
3763 : 11791 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3764 : :
3765 : : /* Report buffer alloc counts to pgstat */
1351 andres@anarazel.de 3766 : 11791 : PendingBgWriterStats.buf_alloc += recent_alloc;
3767 : :
3768 : : /*
3769 : : * If we're not running the LRU scan, just stop after doing the stats
3770 : : * stuff. We mark the saved state invalid so that we can recover sanely
3771 : : * if LRU scan is turned back on later.
3772 : : */
6658 tgl@sss.pgh.pa.us 3773 [ + + ]: 11791 : if (bgwriter_lru_maxpages <= 0)
3774 : : {
3775 : 40 : saved_info_valid = false;
5074 heikki.linnakangas@i 3776 : 40 : return true;
3777 : : }
3778 : :
3779 : : /*
3780 : : * Compute strategy_delta = how many buffers have been scanned by the
3781 : : * clock-sweep since last time. If first time through, assume none. Then
3782 : : * see if we are still ahead of the clock-sweep, and if so, how many
3783 : : * buffers we could scan before we'd catch up with it and "lap" it. Note:
3784 : : * weird-looking coding of xxx_passes comparisons are to avoid bogus
3785 : : * behavior when the passes counts wrap around.
3786 : : */
6658 tgl@sss.pgh.pa.us 3787 [ + + ]: 11751 : if (saved_info_valid)
3788 : : {
6607 bruce@momjian.us 3789 : 11236 : int32 passes_delta = strategy_passes - prev_strategy_passes;
3790 : :
6658 tgl@sss.pgh.pa.us 3791 : 11236 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3101 3792 : 11236 : strategy_delta += (long) passes_delta * NBuffers;
3793 : :
6658 3794 [ - + ]: 11236 : Assert(strategy_delta >= 0);
3795 : :
3796 [ + + ]: 11236 : if ((int32) (next_passes - strategy_passes) > 0)
3797 : : {
3798 : : /* we're one pass ahead of the strategy point */
3799 : 2413 : bufs_to_lap = strategy_buf_id - next_to_clean;
3800 : : #ifdef BGW_DEBUG
3801 : : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3802 : : next_passes, next_to_clean,
3803 : : strategy_passes, strategy_buf_id,
3804 : : strategy_delta, bufs_to_lap);
3805 : : #endif
3806 : : }
3807 [ + + ]: 8823 : else if (next_passes == strategy_passes &&
3808 [ + + ]: 6186 : next_to_clean >= strategy_buf_id)
3809 : : {
3810 : : /* on same pass, but ahead or at least not behind */
3811 : 5374 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3812 : : #ifdef BGW_DEBUG
3813 : : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3814 : : next_passes, next_to_clean,
3815 : : strategy_passes, strategy_buf_id,
3816 : : strategy_delta, bufs_to_lap);
3817 : : #endif
3818 : : }
3819 : : else
3820 : : {
3821 : : /*
3822 : : * We're behind, so skip forward to the strategy point and start
3823 : : * cleaning from there.
3824 : : */
3825 : : #ifdef BGW_DEBUG
3826 : : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3827 : : next_passes, next_to_clean,
3828 : : strategy_passes, strategy_buf_id,
3829 : : strategy_delta);
3830 : : #endif
3831 : 3449 : next_to_clean = strategy_buf_id;
3832 : 3449 : next_passes = strategy_passes;
3833 : 3449 : bufs_to_lap = NBuffers;
3834 : : }
3835 : : }
3836 : : else
3837 : : {
3838 : : /*
3839 : : * Initializing at startup or after LRU scanning had been off. Always
3840 : : * start at the strategy point.
3841 : : */
3842 : : #ifdef BGW_DEBUG
3843 : : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3844 : : strategy_passes, strategy_buf_id);
3845 : : #endif
3846 : 515 : strategy_delta = 0;
3847 : 515 : next_to_clean = strategy_buf_id;
3848 : 515 : next_passes = strategy_passes;
3849 : 515 : bufs_to_lap = NBuffers;
3850 : : }
3851 : :
3852 : : /* Update saved info for next time */
3853 : 11751 : prev_strategy_buf_id = strategy_buf_id;
3854 : 11751 : prev_strategy_passes = strategy_passes;
3855 : 11751 : saved_info_valid = true;
3856 : :
3857 : : /*
3858 : : * Compute how many buffers had to be scanned for each new allocation, ie,
3859 : : * 1/density of reusable buffers, and track a moving average of that.
3860 : : *
3861 : : * If the strategy point didn't move, we don't update the density estimate.
3862 : : */
3863 [ + + + - ]: 11751 : if (strategy_delta > 0 && recent_alloc > 0)
3864 : : {
3865 : 8072 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3866 : 8072 : smoothed_density += (scans_per_alloc - smoothed_density) /
3867 : : smoothing_samples;
3868 : : }
3869 : :
3870 : : /*
3871 : : * Estimate how many reusable buffers there are between the current
3872 : : * strategy point and where we've scanned ahead to, based on the smoothed
3873 : : * density estimate.
3874 : : */
3875 : 11751 : bufs_ahead = NBuffers - bufs_to_lap;
3876 : 11751 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3877 : :
3878 : : /*
3879 : : * Track a moving average of recent buffer allocations. Here, rather than
3880 : : * a true average we want a fast-attack, slow-decline behavior: we
3881 : : * immediately follow any increase.
3882 : : */
3883 [ + + ]: 11751 : if (smoothed_alloc <= (float) recent_alloc)
3884 : 2220 : smoothed_alloc = recent_alloc;
3885 : : else
3886 : 9531 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3887 : : smoothing_samples;
3888 : :
3889 : : /* Scale the estimate by a GUC to allow more aggressive tuning. */
5142 3890 : 11751 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3891 : :
3892 : : /*
3893 : : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3894 : : * eventually underflow to zero, and the underflows produce annoying
3895 : : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3896 : : * zero, there's no point in tracking smaller and smaller values of
3897 : : * smoothed_alloc, so just reset it to exactly zero to avoid this
3898 : : * syndrome. It will pop back up as soon as recent_alloc increases.
3899 : : */
3900 [ + + ]: 11751 : if (upcoming_alloc_est == 0)
3901 : 716 : smoothed_alloc = 0;
3902 : :
3903 : : /*
3904 : : * Even in cases where there's been little or no buffer allocation
3905 : : * activity, we want to make a small amount of progress through the buffer
3906 : : * cache so that as many reusable buffers as possible are clean after an
3907 : : * idle period.
3908 : : *
3909 : : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3910 : : * the BGW will be called during the scan_whole_pool time; slice the
3911 : : * buffer pool into that many sections.
3912 : : */
6658 3913 : 11751 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3914 : :
3915 [ + + ]: 11751 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3916 : : {
3917 : : #ifdef BGW_DEBUG
3918 : : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3919 : : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3920 : : #endif
3921 : 5432 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3922 : : }
3923 : :
3924 : : /*
3925 : : * Now write out dirty reusable buffers, working forward from the
3926 : : * next_to_clean point, until we have lapped the strategy scan, or cleaned
3927 : : * enough buffers to match our estimate of the next cycle's allocation
3928 : : * requirements, or hit the bgwriter_lru_maxpages limit.
3929 : : */
3930 : :
3931 : 11751 : num_to_scan = bufs_to_lap;
3932 : 11751 : num_written = 0;
3933 : 11751 : reusable_buffers = reusable_buffers_est;
3934 : :
3935 : : /* Execute the LRU scan */
3936 [ + + + + ]: 1819356 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3937 : : {
3538 andres@anarazel.de 3938 : 1807605 : int sync_state = SyncOneBuffer(next_to_clean, true,
3939 : : wb_context);
3940 : :
6658 tgl@sss.pgh.pa.us 3941 [ + + ]: 1807605 : if (++next_to_clean >= NBuffers)
3942 : : {
3943 : 3290 : next_to_clean = 0;
3944 : 3290 : next_passes++;
3945 : : }
3946 : 1807605 : num_to_scan--;
3947 : :
3538 andres@anarazel.de 3948 [ + + ]: 1807605 : if (sync_state & BUF_WRITTEN)
3949 : : {
6658 tgl@sss.pgh.pa.us 3950 : 26708 : reusable_buffers++;
3951 [ - + ]: 26708 : if (++num_written >= bgwriter_lru_maxpages)
3952 : : {
1351 andres@anarazel.de 3953 :LBC (1) : PendingBgWriterStats.maxwritten_clean++;
6658 tgl@sss.pgh.pa.us 3954 : (1) : break;
3955 : : }
3956 : : }
3538 andres@anarazel.de 3957 [ + + ]:CBC 1780897 : else if (sync_state & BUF_REUSABLE)
6658 tgl@sss.pgh.pa.us 3958 : 1338994 : reusable_buffers++;
3959 : : }
3960 : :
1351 andres@anarazel.de 3961 : 11751 : PendingBgWriterStats.buf_written_clean += num_written;
3962 : :
3963 : : #ifdef BGW_DEBUG
3964 : : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3965 : : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3966 : : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3967 : : bufs_to_lap - num_to_scan,
3968 : : num_written,
3969 : : reusable_buffers - reusable_buffers_est);
3970 : : #endif
3971 : :
3972 : : /*
3973 : : * Consider the above scan as being like a new allocation scan.
3974 : : * Characterize its density and update the smoothed one based on it. This
3975 : : * effectively halves the moving average period in cases where both the
3976 : : * strategy and the background writer are doing some useful scanning,
3977 : : * which is helpful because a long memory isn't as desirable on the
3978 : : * density estimates.
3979 : : */
4970 tgl@sss.pgh.pa.us 3980 : 11751 : new_strategy_delta = bufs_to_lap - num_to_scan;
3981 : 11751 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
3982 [ + + + + ]: 11751 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
3983 : : {
3984 : 10398 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
6658 3985 : 10398 : smoothed_density += (scans_per_alloc - smoothed_density) /
3986 : : smoothing_samples;
3987 : :
3988 : : #ifdef BGW_DEBUG
3989 : : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3990 : : new_recent_alloc, new_strategy_delta,
3991 : : scans_per_alloc, smoothed_density);
3992 : : #endif
3993 : : }
3994 : :
3995 : : /* Return true if OK to hibernate */
4970 3996 [ + + + - ]: 11751 : return (bufs_to_lap == 0 && recent_alloc == 0);
3997 : : }
3998 : :
3999 : : /*
4000 : : * SyncOneBuffer -- process a single buffer during syncing.
4001 : : *
4002 : : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4003 : : * buffers marked recently used, as these are not replacement candidates.
4004 : : *
4005 : : * Returns a bitmask containing the following flag bits:
4006 : : * BUF_WRITTEN: we wrote the buffer.
4007 : : * BUF_REUSABLE: buffer is available for replacement, ie, it has
4008 : : * pin count 0 and usage count 0.
4009 : : *
4010 : : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4011 : : * after locking it, but we don't care all that much.)
4012 : : */
4013 : : static int
3589 andres@anarazel.de 4014 : 2081677 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
4015 : : {
3684 rhaas@postgresql.org 4016 : 2081677 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
6607 bruce@momjian.us 4017 : 2081677 : int result = 0;
4018 : : uint32 buf_state;
4019 : : BufferTag tag;
4020 : :
4021 : : /* Make sure we can handle the pin */
3985 andres@anarazel.de 4022 : 2081677 : ReservePrivateRefCountEntry();
770 heikki.linnakangas@i 4023 : 2081677 : ResourceOwnerEnlarge(CurrentResourceOwner);
4024 : :
4025 : : /*
4026 : : * Check whether buffer needs writing.
4027 : : *
4028 : : * We can make this check without taking the buffer content lock so long
4029 : : * as we mark pages dirty in access methods *before* logging changes with
4030 : : * XLogInsert(): if someone marks the buffer dirty just after our check we
4031 : : * don't worry, because our checkpoint.redo points before the log record for
4032 : : * upcoming changes, and so we are not required to write such a dirty buffer.
4033 : : */
3538 andres@anarazel.de 4034 : 2081677 : buf_state = LockBufHdr(bufHdr);
4035 : :
4036 [ + + ]: 2081677 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
4037 [ + + ]: 2076194 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
4038 : : {
6658 tgl@sss.pgh.pa.us 4039 : 1367684 : result |= BUF_REUSABLE;
4040 : : }
4041 [ + + ]: 713993 : else if (skip_recently_used)
4042 : : {
4043 : : /* Caller told us not to write recently-used buffers */
41 andres@anarazel.de 4044 :GNC 441903 : UnlockBufHdr(bufHdr);
6658 tgl@sss.pgh.pa.us 4045 :CBC 441903 : return result;
4046 : : }
4047 : :
3538 andres@anarazel.de 4048 [ + + + + ]: 1639774 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4049 : : {
4050 : : /* It's clean, so nothing to do */
41 andres@anarazel.de 4051 :GNC 1338994 : UnlockBufHdr(bufHdr);
6658 tgl@sss.pgh.pa.us 4052 :CBC 1338994 : return result;
4053 : : }
4054 : :
4055 : : /*
4056 : : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
4057 : : * buffer is clean by the time we've locked it.)
4058 : : */
7593 4059 : 300780 : PinBuffer_Locked(bufHdr);
4060 : :
70 andres@anarazel.de 4061 :GNC 300780 : FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4062 : :
3589 andres@anarazel.de 4063 :CBC 300780 : tag = bufHdr->tag;
4064 : :
1174 michael@paquier.xyz 4065 : 300780 : UnpinBuffer(bufHdr);
4066 : :
4067 : : /*
4068 : : * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4069 : : * IOContext will always be IOCONTEXT_NORMAL.
4070 : : */
945 andres@anarazel.de 4071 : 300780 : ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
4072 : :
6658 tgl@sss.pgh.pa.us 4073 : 300780 : return result | BUF_WRITTEN;
4074 : : }
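
/*
 * Illustrative sketch (not part of bufmgr.c): how a caller can interpret the
 * bitmask returned by SyncOneBuffer().  The counters here are hypothetical;
 * the bgwriter's real accounting lives in BgBufferSync().
 */
static void
tally_sync_result(int sync_state, int *num_written, int *num_reusable)
{
	if (sync_state & BUF_WRITTEN)
		(*num_written)++;		/* buffer contents were handed to the kernel */
	if (sync_state & BUF_REUSABLE)
		(*num_reusable)++;		/* pin count 0 and usage count 0 */
}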
4075 : :
4076 : : /*
4077 : : * AtEOXact_Buffers - clean up at end of transaction.
4078 : : *
4079 : : * As of PostgreSQL 8.0, buffer pins should get released by the
4080 : : * ResourceOwner mechanism. This routine is just a debugging
4081 : : * cross-check that no pins remain.
4082 : : */
4083 : : void
8534 4084 : 331851 : AtEOXact_Buffers(bool isCommit)
4085 : : {
4198 andres@anarazel.de 4086 : 331851 : CheckForBufferLeaks();
4087 : :
7732 tgl@sss.pgh.pa.us 4088 : 331851 : AtEOXact_LocalBuffers(isCommit);
4089 : :
4127 andres@anarazel.de 4090 [ - + ]: 331851 : Assert(PrivateRefCountOverflowed == 0);
4091 : 331851 : }
4092 : :
4093 : : /*
4094 : : * Initialize access to shared buffer pool
4095 : : *
4096 : : * This is called during backend startup (whether standalone or under the
4097 : : * postmaster). It sets up for this backend's access to the already-existing
4098 : : * buffer pool.
4099 : : */
4100 : : void
475 heikki.linnakangas@i 4101 : 19577 : InitBufferManagerAccess(void)
4102 : : {
4103 : : HASHCTL hash_ctl;
4104 : :
4105 : : /*
4106 : : * An advisory limit on the number of pins each backend should hold, based
4107 : : * on shared_buffers and the maximum number of connections possible.
4108 : : * That's very pessimistic, but outside toy-sized shared_buffers it should
4109 : : * allow plenty of pins. LimitAdditionalPins() and
4110 : : * GetAdditionalPinLimit() can be used to check the remaining balance.
4111 : : */
278 tmunro@postgresql.or 4112 : 19577 : MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
4113 : :
4127 andres@anarazel.de 4114 : 19577 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3 andres@anarazel.de 4115 :GNC 19577 : memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
4116 : :
13 nathan@postgresql.or 4117 : 19577 : hash_ctl.keysize = sizeof(Buffer);
3587 andres@anarazel.de 4118 :CBC 19577 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4119 : :
4127 4120 : 19577 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4121 : : HASH_ELEM | HASH_BLOBS);
4122 : :
4123 : : /*
4124 : : * AtProcExit_Buffers needs LWLock access, and therefore has to be called at
4125 : : * the corresponding phase of backend shutdown.
4126 : : */
1595 4127 [ - + ]: 19577 : Assert(MyProc != NULL);
7436 tgl@sss.pgh.pa.us 4128 : 19577 : on_shmem_exit(AtProcExit_Buffers, 0);
4129 : 19577 : }
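
/*
 * Illustrative sketch (not part of bufmgr.c): the advisory pin limit above is
 * simple integer division of the pool size by the worst-case process count.
 * The figures are assumptions for illustration: with shared_buffers = 128MB,
 * i.e. 16384 pages of BLCKSZ = 8kB, and roughly 128 possible backend plus
 * auxiliary processes, each process gets 16384 / 128 = 128 advisory pins.
 */
static unsigned int
advisory_pin_limit(unsigned int nbuffers, unsigned int max_procs)
{
	/* mirrors MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS) */
	return nbuffers / max_procs;
}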
4130 : :
4131 : : /*
4132 : : * During backend exit, ensure that we released all shared-buffer locks and
4133 : : * assert that we have no remaining pins.
4134 : : */
4135 : : static void
4136 : 19577 : AtProcExit_Buffers(int code, Datum arg)
4137 : : {
7732 4138 : 19577 : UnlockBuffers();
4139 : :
4198 andres@anarazel.de 4140 : 19577 : CheckForBufferLeaks();
4141 : :
4142 : : /* localbuf.c needs a chance too */
4143 : 19577 : AtProcExit_LocalBuffers();
4144 : 19577 : }
4145 : :
4146 : : /*
4147 : : * CheckForBufferLeaks - ensure this backend holds no buffer pins
4148 : : *
4149 : : * As of PostgreSQL 8.0, buffer pins should get released by the
4150 : : * ResourceOwner mechanism. This routine is just a debugging
4151 : : * cross-check that no pins remain.
4152 : : */
4153 : : static void
4154 : 351428 : CheckForBufferLeaks(void)
4155 : : {
4156 : : #ifdef USE_ASSERT_CHECKING
4157 : 351428 : int RefCountErrors = 0;
4158 : : PrivateRefCountEntry *res;
4159 : : int i;
4160 : : char *s;
4161 : :
4162 : : /* check the array */
4127 4163 [ + + ]: 3162852 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4164 : : {
3 andres@anarazel.de 4165 [ - + ]:GNC 2811424 : if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
4166 : : {
3 andres@anarazel.de 4167 :UNC 0 : res = &PrivateRefCountArray[i];
4168 : :
770 heikki.linnakangas@i 4169 :UBC 0 : s = DebugPrintBufferRefcount(res->buffer);
4170 [ # # ]: 0 : elog(WARNING, "buffer refcount leak: %s", s);
4171 : 0 : pfree(s);
4172 : :
4127 andres@anarazel.de 4173 : 0 : RefCountErrors++;
4174 : : }
4175 : : }
4176 : :
4177 : : /* if necessary search the hash */
4127 andres@anarazel.de 4178 [ - + ]:CBC 351428 : if (PrivateRefCountOverflowed)
4179 : : {
4180 : : HASH_SEQ_STATUS hstat;
4181 : :
4127 andres@anarazel.de 4182 :UBC 0 : hash_seq_init(&hstat, PrivateRefCountHash);
4183 [ # # ]: 0 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4184 : : {
770 heikki.linnakangas@i 4185 : 0 : s = DebugPrintBufferRefcount(res->buffer);
4186 [ # # ]: 0 : elog(WARNING, "buffer refcount leak: %s", s);
4187 : 0 : pfree(s);
4198 andres@anarazel.de 4188 : 0 : RefCountErrors++;
4189 : : }
4190 : : }
4191 : :
4198 andres@anarazel.de 4192 [ - + ]:CBC 351428 : Assert(RefCountErrors == 0);
4193 : : #endif
7732 tgl@sss.pgh.pa.us 4194 : 351428 : }
4195 : :
4196 : : #ifdef USE_ASSERT_CHECKING
4197 : : /*
4198 : : * Check for exclusive-locked catalog buffers. This is the core of
4199 : : * AssertCouldGetRelation().
4200 : : *
4201 : : * A backend would self-deadlock on LWLocks if the catalog scan read the
4202 : : * exclusive-locked buffer. The main threat is exclusive-locked buffers of
4203 : : * catalogs used in relcache, because a catcache search on any catalog may
4204 : : * build that catalog's relcache entry. We don't have an inventory of
4205 : : * catalogs relcache uses, so just check buffers of most catalogs.
4206 : : *
4207 : : * It's better to minimize waits while holding an exclusive buffer lock, so it
4208 : : * would be nice to broaden this check not to be catalog-specific. However,
4209 : : * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4210 : : * read tables. That is deadlock-free as long as there's no loop in the
4211 : : * dependency graph: modifying table A may cause an opclass to read table B,
4212 : : * but it must not cause a read of table A.
4213 : : */
4214 : : void
244 noah@leadboat.com 4215 : 105413251 : AssertBufferLocksPermitCatalogRead(void)
4216 : : {
4217 : 105413251 : ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
4218 : 105413251 : }
4219 : :
4220 : : static void
4221 : 53807 : AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
4222 : : void *unused_context)
4223 : : {
4224 : : BufferDesc *bufHdr;
4225 : : BufferTag tag;
4226 : : Oid relid;
4227 : :
4228 [ + + ]: 53807 : if (mode != LW_EXCLUSIVE)
4229 : 43648 : return;
4230 : :
4231 [ + + ]: 13526 : if (!((BufferDescPadded *) lock > BufferDescriptors &&
4232 [ - + ]: 10159 : (BufferDescPadded *) lock < BufferDescriptors + NBuffers))
4233 : 3367 : return; /* not a buffer lock */
4234 : :
4235 : 10159 : bufHdr = (BufferDesc *)
4236 : : ((char *) lock - offsetof(BufferDesc, content_lock));
4237 : 10159 : tag = bufHdr->tag;
4238 : :
4239 : : /*
4240 : : * This relNumber==relid assumption holds until a catalog experiences
4241 : : * VACUUM FULL or similar. After a command like that, relNumber will be
4242 : : * in the normal (non-catalog) range, and we lose the ability to detect
4243 : : * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4244 : : * close that gap, but RelidByRelfilenumber() might then deadlock with a
4245 : : * held lock.
4246 : : */
4247 : 10159 : relid = tag.relNumber;
4248 : :
4249 [ - + ]: 10159 : if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
244 noah@leadboat.com 4250 :UBC 0 : return;
4251 : :
244 noah@leadboat.com 4252 [ - + ]:CBC 10159 : Assert(!IsCatalogRelationOid(relid));
4253 : : }
4254 : : #endif
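
/*
 * Illustrative sketch (not part of bufmgr.c): AssertNotCatalogBufferLock()
 * above recovers the enclosing BufferDesc from a pointer to its embedded
 * content_lock using offsetof() -- the classic "container_of" idiom.  The
 * struct and helper below are hypothetical stand-ins; offsetof() itself
 * comes from <stddef.h>.
 */
struct demo_outer
{
	int			id;
	int			member;			/* stands in for the embedded content_lock */
};

static struct demo_outer *
demo_outer_from_member(int *member_ptr)
{
	return (struct demo_outer *)
		((char *) member_ptr - offsetof(struct demo_outer, member));
}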
4255 : :
4256 : :
4257 : : /*
4258 : : * Helper routine to issue warnings when a buffer is unexpectedly pinned
4259 : : */
4260 : : char *
770 heikki.linnakangas@i 4261 : 60 : DebugPrintBufferRefcount(Buffer buffer)
4262 : : {
4263 : : BufferDesc *buf;
4264 : : int32 loccount;
4265 : : char *result;
4266 : : ProcNumber backend;
4267 : : uint32 buf_state;
4268 : :
7732 tgl@sss.pgh.pa.us 4269 [ - + ]: 60 : Assert(BufferIsValid(buffer));
4270 [ + + ]: 60 : if (BufferIsLocal(buffer))
4271 : : {
3975 andres@anarazel.de 4272 : 24 : buf = GetLocalBufferDescriptor(-buffer - 1);
7732 tgl@sss.pgh.pa.us 4273 : 24 : loccount = LocalRefCount[-buffer - 1];
654 heikki.linnakangas@i 4274 : 24 : backend = MyProcNumber;
4275 : : }
4276 : : else
4277 : : {
3975 andres@anarazel.de 4278 : 36 : buf = GetBufferDescriptor(buffer - 1);
4127 4279 : 36 : loccount = GetPrivateRefCount(buffer);
654 heikki.linnakangas@i 4280 : 36 : backend = INVALID_PROC_NUMBER;
4281 : : }
4282 : :
4283 : : /* theoretically we should lock the bufhdr here */
3538 andres@anarazel.de 4284 : 60 : buf_state = pg_atomic_read_u32(&buf->state);
4285 : :
770 heikki.linnakangas@i 4286 : 60 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4287 : : buffer,
295 andres@anarazel.de 4288 : 60 : relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
4289 : : BufTagGetForkNum(&buf->tag)).str,
4290 : : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4291 : : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
770 heikki.linnakangas@i 4292 : 60 : return result;
4293 : : }
4294 : :
4295 : : /*
4296 : : * CheckPointBuffers
4297 : : *
4298 : : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4299 : : *
4300 : : * Note: temporary relations do not participate in checkpoints, so they don't
4301 : : * need to be flushed.
4302 : : */
4303 : : void
6747 tgl@sss.pgh.pa.us 4304 : 1729 : CheckPointBuffers(int flags)
4305 : : {
4306 : 1729 : BufferSync(flags);
9148 vadim4o@yahoo.com 4307 : 1729 : }
4308 : :
4309 : : /*
4310 : : * BufferGetBlockNumber
4311 : : * Returns the block number associated with a buffer.
4312 : : *
4313 : : * Note:
4314 : : * Assumes that the buffer is valid and pinned, else the
4315 : : * value may be obsolete immediately...
4316 : : */
4317 : : BlockNumber
10753 scrappy@hub.org 4318 : 143256517 : BufferGetBlockNumber(Buffer buffer)
4319 : : {
4320 : : BufferDesc *bufHdr;
4321 : :
8647 bruce@momjian.us 4322 [ - + + + : 143256517 : Assert(BufferIsPinned(buffer));
- + ]
4323 : :
10328 4324 [ + + ]: 143256517 : if (BufferIsLocal(buffer))
3975 andres@anarazel.de 4325 : 4242955 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4326 : : else
4327 : 139013562 : bufHdr = GetBufferDescriptor(buffer - 1);
4328 : :
4329 : : /* pinned, so OK to read tag without spinlock */
7593 tgl@sss.pgh.pa.us 4330 : 143256517 : return bufHdr->tag.blockNum;
4331 : : }
4332 : :
4333 : : /*
4334 : : * BufferGetTag
4335 : : * Returns the relfilelocator, fork number and block number associated with
4336 : : * a buffer.
4337 : : */
4338 : : void
1260 rhaas@postgresql.org 4339 : 15062211 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
4340 : : BlockNumber *blknum)
4341 : : {
4342 : : BufferDesc *bufHdr;
4343 : :
4344 : : /* Do the same checks as BufferGetBlockNumber. */
6337 heikki.linnakangas@i 4345 [ - + - + : 15062211 : Assert(BufferIsPinned(buffer));
- + ]
4346 : :
7912 tgl@sss.pgh.pa.us 4347 [ - + ]: 15062211 : if (BufferIsLocal(buffer))
3975 andres@anarazel.de 4348 :UBC 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4349 : : else
3975 andres@anarazel.de 4350 :CBC 15062211 : bufHdr = GetBufferDescriptor(buffer - 1);
4351 : :
4352 : : /* pinned, so OK to read tag without spinlock */
1211 rhaas@postgresql.org 4353 : 15062211 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4354 : 15062211 : *forknum = BufTagGetForkNum(&bufHdr->tag);
6337 heikki.linnakangas@i 4355 : 15062211 : *blknum = bufHdr->tag.blockNum;
7912 tgl@sss.pgh.pa.us 4356 : 15062211 : }
4357 : :
4358 : : /*
4359 : : * FlushBuffer
4360 : : * Physically write out a shared buffer.
4361 : : *
4362 : : * NOTE: this actually just passes the buffer contents to the kernel; the
4363 : : * real write to disk won't happen until the kernel feels like it. This
4364 : : * is okay from our point of view since we can redo the changes from WAL.
4365 : : * However, we will need to force the changes to disk via fsync before
4366 : : * we can checkpoint WAL.
4367 : : *
4368 : : * The caller must hold a pin on the buffer and have share-locked the
4369 : : * buffer contents. (Note: a share-lock does not prevent updates of
4370 : : * hint bits in the buffer, so the page could change while the write
4371 : : * is in progress, but we assume that that will not invalidate the data
4372 : : * written.)
4373 : : *
4374 : : * If the caller has an smgr reference for the buffer's relation, pass it
4375 : : * as the second parameter. If not, pass NULL.
4376 : : */
4377 : : static void
1042 andres@anarazel.de 4378 : 573029 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
4379 : : IOContext io_context)
4380 : : {
4381 : : XLogRecPtr recptr;
4382 : : ErrorContextCallback errcallback;
4383 : : instr_time io_start;
4384 : : Block bufBlock;
4385 : : char *bufToWrite;
4386 : : uint32 buf_state;
4387 : :
4388 : : /*
4389 : : * Try to start an I/O operation. If StartBufferIO returns false, then
4390 : : * someone else flushed the buffer before we could, so we need not do
4391 : : * anything.
4392 : : */
623 tmunro@postgresql.or 4393 [ + + ]: 573029 : if (!StartBufferIO(buf, false, false))
7593 tgl@sss.pgh.pa.us 4394 : 9 : return;
4395 : :
4396 : : /* Setup error traceback support for ereport() */
4783 heikki.linnakangas@i 4397 : 573020 : errcallback.callback = shared_buffer_write_error_callback;
384 peter@eisentraut.org 4398 : 573020 : errcallback.arg = buf;
4783 heikki.linnakangas@i 4399 : 573020 : errcallback.previous = error_context_stack;
4400 : 573020 : error_context_stack = &errcallback;
4401 : :
4402 : : /* Find smgr relation for buffer */
7910 tgl@sss.pgh.pa.us 4403 [ + + ]: 573020 : if (reln == NULL)
654 heikki.linnakangas@i 4404 : 568588 : reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
4405 : :
4406 : : TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4407 : : buf->tag.blockNum,
4408 : : reln->smgr_rlocator.locator.spcOid,
4409 : : reln->smgr_rlocator.locator.dbOid,
4410 : : reln->smgr_rlocator.locator.relNumber);
4411 : :
3538 andres@anarazel.de 4412 : 573020 : buf_state = LockBufHdr(buf);
4413 : :
4414 : : /*
4415 : : * Run PageGetLSN while holding header lock, since we don't have the
4416 : : * buffer locked exclusively in all cases.
4417 : : */
4653 simon@2ndQuadrant.co 4418 : 573020 : recptr = BufferGetLSN(buf);
4419 : :
4420 : : /* To check if block content changes while flushing. - vadim 01/17/97 */
41 andres@anarazel.de 4421 :GNC 573020 : UnlockBufHdrExt(buf, buf_state,
4422 : : 0, BM_JUST_DIRTIED,
4423 : : 0);
4424 : :
4425 : : /*
4426 : : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4427 : : * rule that log updates must hit disk before any of the data-file changes
4428 : : * they describe do.
4429 : : *
4430 : : * However, this rule does not apply to unlogged relations, which will be
4431 : : * lost after a crash anyway. Most unlogged relation pages do not bear
4432 : : * LSNs since we never emit WAL records for them, and therefore flushing
4433 : : * up through the buffer LSN would be useless, but harmless. However,
4434 : : * GiST indexes use LSNs internally to track page-splits, and therefore
4435 : : * unlogged GiST pages bear "fake" LSNs generated by
4436 : : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4437 : : * LSN counter could advance past the WAL insertion point; and if it did
4438 : : * happen, attempting to flush WAL through that location would fail, with
4439 : : * disastrous system-wide consequences. To make sure that can't happen,
4440 : : * skip the flush if the buffer isn't permanent.
4441 : : */
3538 andres@anarazel.de 4442 [ + + ]:CBC 573020 : if (buf_state & BM_PERMANENT)
4692 heikki.linnakangas@i 4443 : 571218 : XLogFlush(recptr);
4444 : :
4445 : : /*
4446 : : * Now it's safe to write the buffer to disk. Note that no one else should
4447 : : * have been able to write it, while we were busy with log flushing,
4448 : : * because we got the exclusive right to perform I/O by setting the
4449 : : * BM_IO_IN_PROGRESS bit.
4450 : : */
4653 simon@2ndQuadrant.co 4451 : 573020 : bufBlock = BufHdrGetBlock(buf);
4452 : :
4453 : : /*
4454 : : * Update page checksum if desired. Since we have only shared lock on the
4455 : : * buffer, other processes might be updating hint bits in it, so we must
4456 : : * copy the page to private storage if we do checksumming.
4457 : : */
4458 : 573020 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4459 : :
294 michael@paquier.xyz 4460 : 573020 : io_start = pgstat_prepare_io_time(track_io_timing);
4461 : :
4462 : : /*
4463 : : * bufToWrite is either the shared buffer or a copy, as appropriate.
4464 : : */
7981 tgl@sss.pgh.pa.us 4465 : 573020 : smgrwrite(reln,
1211 rhaas@postgresql.org 4466 : 573020 : BufTagGetForkNum(&buf->tag),
4467 : : buf->tag.blockNum,
4468 : : bufToWrite,
4469 : : false);
4470 : :
4471 : : /*
4472 : : * When a strategy is in use, only flushes of dirty buffers already in the
4473 : : * strategy ring are counted as strategy writes (IOCONTEXT
4474 : : * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4475 : : * statistics tracking.
4476 : : *
4477 : : * If a shared buffer initially added to the ring must be flushed before
4478 : : * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4479 : : *
4480 : : * If a shared buffer which was added to the ring later because the
4481 : : * current strategy buffer is pinned or in use or because all strategy
4482 : : * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4483 : : * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4484 : : * (from_ring will be false).
4485 : : *
4486 : : * When a strategy is not in use, the write can only be a "regular" write
4487 : : * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4488 : : */
985 andres@anarazel.de 4489 : 573020 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
4490 : : IOOP_WRITE, io_start, 1, BLCKSZ);
4491 : :
5846 rhaas@postgresql.org 4492 : 573020 : pgBufferUsage.shared_blks_written++;
4493 : :
4494 : : /*
4495 : : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4496 : : * end the BM_IO_IN_PROGRESS state.
4497 : : */
262 andres@anarazel.de 4498 : 573020 : TerminateBufferIO(buf, true, 0, true, false);
4499 : :
4500 : : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4501 : : buf->tag.blockNum,
4502 : : reln->smgr_rlocator.locator.spcOid,
4503 : : reln->smgr_rlocator.locator.dbOid,
4504 : : reln->smgr_rlocator.locator.relNumber);
4505 : :
4506 : : /* Pop the error context stack */
4783 heikki.linnakangas@i 4507 : 573020 : error_context_stack = errcallback.previous;
4508 : : }
4509 : :
4510 : : /*
4511 : : * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4512 : : * before/after calling FlushBuffer().
4513 : : */
4514 : : static void
70 andres@anarazel.de 4515 :GNC 306250 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
4516 : : IOObject io_object, IOContext io_context)
4517 : : {
4518 : 306250 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4519 : 306250 : FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4520 : 306250 : LWLockRelease(BufferDescriptorGetContentLock(buf));
4521 : 306250 : }
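
/*
 * Illustrative sketch (not part of bufmgr.c): the essential ordering that
 * FlushBuffer() enforces for BM_PERMANENT buffers, reduced to pseudocode
 * with placeholder arguments:
 *
 *     lsn = PageGetLSN(page);     -- read while holding the buffer header lock
 *     XLogFlush(lsn);             -- WAL describing the page reaches disk first
 *     smgrwrite(..., page, ...);  -- only then hand the page to the kernel
 *
 * Reversing the last two steps could leave an on-disk page whose changes are
 * not yet covered by durable WAL, which would break crash recovery.
 */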
4522 : :
4523 : : /*
4524 : : * RelationGetNumberOfBlocksInFork
4525 : : * Determines the current number of pages in the specified relation fork.
4526 : : *
4527 : : * Note that the accuracy of the result will depend on the details of the
4528 : : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4529 : : * it might not be.
4530 : : */
4531 : : BlockNumber
5467 rhaas@postgresql.org 4532 :CBC 1795764 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
4533 : : {
1475 peter@eisentraut.org 4534 [ + + + + : 1795764 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
+ + ]
4535 : : {
4536 : : /*
4537 : : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4538 : : * tableam returns the size in bytes - but for the purpose of this
4539 : : * routine, we want the number of blocks. Therefore divide, rounding
4540 : : * up.
4541 : : */
4542 : : uint64 szbytes;
4543 : :
4544 : 1334045 : szbytes = table_relation_size(relation, forkNum);
4545 : :
4546 : 1334026 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4547 : : }
4548 [ + - + + : 461719 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
- + - - -
- ]
4549 : : {
1315 tgl@sss.pgh.pa.us 4550 : 461719 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
4551 : : }
4552 : : else
1475 peter@eisentraut.org 4553 :UBC 0 : Assert(false);
4554 : :
4555 : : return 0; /* keep compiler quiet */
4556 : : }
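
/*
 * Illustrative sketch (not part of bufmgr.c): the (szbytes + (BLCKSZ - 1)) /
 * BLCKSZ expression above is ordinary round-up integer division.  With
 * BLCKSZ = 8192, a 10000-byte relation gives (10000 + 8191) / 8192 = 2
 * blocks, and an exact 16384 bytes gives (16384 + 8191) / 8192 = 2 blocks as
 * well, i.e. exact multiples never gain a spurious extra block.
 */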
4557 : :
4558 : : /*
4559 : : * BufferIsPermanent
4560 : : * Determines whether a buffer will potentially still be around after
4561 : : * a crash. Caller must hold a buffer pin.
4562 : : */
4563 : : bool
5164 rhaas@postgresql.org 4564 :CBC 9583724 : BufferIsPermanent(Buffer buffer)
4565 : : {
4566 : : BufferDesc *bufHdr;
4567 : :
4568 : : /* Local buffers are used only for temp relations. */
4569 [ + + ]: 9583724 : if (BufferIsLocal(buffer))
4570 : 736075 : return false;
4571 : :
4572 : : /* Make sure we've got a real buffer, and that we hold a pin on it. */
4573 [ - + ]: 8847649 : Assert(BufferIsValid(buffer));
4574 [ - + - + : 8847649 : Assert(BufferIsPinned(buffer));
- + ]
4575 : :
4576 : : /*
4577 : : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4578 : : * need not bother with the buffer header spinlock. Even if someone else
4579 : : * changes the buffer header state while we're doing this, the state is
4580 : : * changed atomically, so we'll read the old value or the new value, but
4581 : : * not random garbage.
4582 : : */
3975 andres@anarazel.de 4583 : 8847649 : bufHdr = GetBufferDescriptor(buffer - 1);
3538 4584 : 8847649 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4585 : : }
4586 : :
4587 : : /*
4588 : : * BufferGetLSNAtomic
4589 : : * Retrieves the LSN of the buffer atomically using a buffer header lock.
4590 : : * This is necessary for some callers who may not have an exclusive lock
4591 : : * on the buffer.
4592 : : */
4593 : : XLogRecPtr
4653 simon@2ndQuadrant.co 4594 : 6230759 : BufferGetLSNAtomic(Buffer buffer)
4595 : : {
3528 kgrittn@postgresql.o 4596 : 6230759 : char *page = BufferGetPage(buffer);
4597 : : BufferDesc *bufHdr;
4598 : : XLogRecPtr lsn;
4599 : :
4600 : : /*
4601 : : * If we don't need locking for correctness, fastpath out.
4602 : : */
3827 heikki.linnakangas@i 4603 [ + + - + : 6230759 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
+ + ]
4653 simon@2ndQuadrant.co 4604 : 239116 : return PageGetLSN(page);
4605 : :
4606 : : /* Make sure we've got a real buffer, and that we hold a pin on it. */
4607 [ - + ]: 5991643 : Assert(BufferIsValid(buffer));
4608 [ - + - + : 5991643 : Assert(BufferIsPinned(buffer));
- + ]
4609 : :
301 rguo@postgresql.org 4610 : 5991643 : bufHdr = GetBufferDescriptor(buffer - 1);
41 andres@anarazel.de 4611 :GNC 5991643 : LockBufHdr(bufHdr);
4653 simon@2ndQuadrant.co 4612 :CBC 5991643 : lsn = PageGetLSN(page);
41 andres@anarazel.de 4613 :GNC 5991643 : UnlockBufHdr(bufHdr);
4614 : :
4653 simon@2ndQuadrant.co 4615 :CBC 5991643 : return lsn;
4616 : : }
4617 : :
4618 : : /* ---------------------------------------------------------------------
4619 : : * DropRelationBuffers
4620 : : *
4621 : : * This function removes from the buffer pool all the pages of the
4622 : : * specified relation forks that have block numbers >= firstDelBlock.
4623 : : * (In particular, with firstDelBlock = 0, all pages are removed.)
4624 : : * Dirty pages are simply dropped, without bothering to write them
4625 : : * out first. Therefore, this is NOT rollback-able, and so should be
4626 : : * used only with extreme caution!
4627 : : *
4628 : : * Currently, this is called only from smgr.c when the underlying file
4629 : : * is about to be deleted or truncated (firstDelBlock is needed for
4630 : : * the truncation case). The data in the affected pages would therefore
4631 : : * be deleted momentarily anyway, and there is no point in writing it.
4632 : : * It is the responsibility of higher-level code to ensure that the
4633 : : * deletion or truncation does not lose any data that could be needed
4634 : : * later. It is also the responsibility of higher-level code to ensure
4635 : : * that no other process could be trying to load more pages of the
4636 : : * relation into buffers.
4637 : : * --------------------------------------------------------------------
4638 : : */
4639 : : void
1254 rhaas@postgresql.org 4640 : 631 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
4641 : : int nforks, BlockNumber *firstDelBlock)
4642 : : {
4643 : : int i;
4644 : : int j;
4645 : : RelFileLocatorBackend rlocator;
4646 : : BlockNumber nForkBlock[MAX_FORKNUM];
1680 tgl@sss.pgh.pa.us 4647 : 631 : uint64 nBlocksToInvalidate = 0;
4648 : :
1260 rhaas@postgresql.org 4649 : 631 : rlocator = smgr_reln->smgr_rlocator;
4650 : :
4651 : : /* If it's a local relation, it's localbuf.c's problem. */
4652 [ + + ]: 631 : if (RelFileLocatorBackendIsTemp(rlocator))
4653 : : {
654 heikki.linnakangas@i 4654 [ + - ]: 374 : if (rlocator.backend == MyProcNumber)
166 fujii@postgresql.org 4655 :GNC 374 : DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4656 : : firstDelBlock);
4657 : :
8534 tgl@sss.pgh.pa.us 4658 :CBC 418 : return;
4659 : : }
4660 : :
4661 : : /*
4662 : : * To remove all the pages of the specified relation forks from the buffer
4663 : : * pool, we would need to scan the entire buffer pool, but we can optimize this
4664 : : * by looking the buffers up in the BufMapping table, provided we know the
4665 : : * exact size of each fork of the relation. The exact size is required to
4666 : : * ensure that we don't leave behind any buffers for the relation being
4667 : : * dropped, as otherwise the background writer or checkpointer could hit a
4668 : : * PANIC while flushing buffers corresponding to files that no longer exist.
4669 : : *
4670 : : * To know the exact size, we rely on the size we cached for each fork
4671 : : * during recovery, which limits the optimization to recovery and to
4672 : : * standbys, but we can easily extend it once we have a shared cache for
4673 : : * relation sizes.
4674 : : *
4675 : : * In recovery, we cache the value returned by the first lseek(SEEK_END)
4676 : : * and subsequent writes keep the cached value up-to-date. See
4677 : : * smgrextend. It is possible that the value of the first lseek is smaller
4678 : : * than the actual number of existing blocks in the file due to buggy
4679 : : * Linux kernels that might not have accounted for the recent write. But
4680 : : * that should be fine because there must not be any buffers after that
4681 : : * file size.
4682 : : */
1800 akapila@postgresql.o 4683 [ + + ]: 362 : for (i = 0; i < nforks; i++)
4684 : : {
4685 : : /* Get the number of blocks for a relation's fork */
4686 : 307 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4687 : :
4688 [ + + ]: 307 : if (nForkBlock[i] == InvalidBlockNumber)
4689 : : {
4690 : 202 : nBlocksToInvalidate = InvalidBlockNumber;
4691 : 202 : break;
4692 : : }
4693 : :
4694 : : /* calculate the number of blocks to be invalidated */
4695 : 105 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4696 : : }
4697 : :
4698 : : /*
4699 : : * We apply the optimization iff the total number of blocks to invalidate
4700 : : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4701 : : */
4702 [ + + ]: 257 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
4703 [ + + ]: 55 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4704 : : {
4705 [ + + ]: 124 : for (j = 0; j < nforks; j++)
1254 rhaas@postgresql.org 4706 : 80 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4707 : 80 : nForkBlock[j], firstDelBlock[j]);
1800 akapila@postgresql.o 4708 : 44 : return;
4709 : : }
4710 : :
7593 tgl@sss.pgh.pa.us 4711 [ + + ]: 2725973 : for (i = 0; i < NBuffers; i++)
4712 : : {
3684 rhaas@postgresql.org 4713 : 2725760 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4714 : :
4715 : : /*
4716 : : * We can make this a tad faster by prechecking the buffer tag before
4717 : : * we attempt to lock the buffer; this saves a lot of lock
4718 : : * acquisitions in typical cases. It should be safe because the
4719 : : * caller must have AccessExclusiveLock on the relation, or some other
4720 : : * reason to be certain that no one is loading new pages of the rel
4721 : : * into the buffer pool. (Otherwise we might well miss such pages
4722 : : * entirely.) Therefore, while the tag might be changing while we
4723 : : * look at it, it can't be changing *to* a value we care about, only
4724 : : * *away* from such a value. So false negatives are impossible, and
4725 : : * false positives are safe because we'll recheck after getting the
4726 : : * buffer lock.
4727 : : *
4728 : : * We could check forkNum and blockNum as well as the rlocator, but
4729 : : * the incremental win from doing so seems small.
4730 : : */
1211 4731 [ + + ]: 2725760 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4941 tgl@sss.pgh.pa.us 4732 : 2716618 : continue;
4733 : :
41 andres@anarazel.de 4734 :GNC 9142 : LockBufHdr(bufHdr);
4735 : :
2276 fujii@postgresql.org 4736 [ + + ]:CBC 22869 : for (j = 0; j < nforks; j++)
4737 : : {
1211 rhaas@postgresql.org 4738 [ + - ]: 16150 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4739 [ + + ]: 16150 : BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
2276 fujii@postgresql.org 4740 [ + + ]: 9025 : bufHdr->tag.blockNum >= firstDelBlock[j])
4741 : : {
2043 tgl@sss.pgh.pa.us 4742 : 2423 : InvalidateBuffer(bufHdr); /* releases spinlock */
2276 fujii@postgresql.org 4743 : 2423 : break;
4744 : : }
4745 : : }
4746 [ + + ]: 9142 : if (j >= nforks)
41 andres@anarazel.de 4747 :GNC 6719 : UnlockBufHdr(bufHdr);
4748 : : }
4749 : : }
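
/*
 * Illustrative sketch (not part of bufmgr.c): the decision made above in
 * condensed form.  cached_nblocks[] holds what smgrnblocks_cached() returned
 * per fork, with InvalidBlockNumber meaning "size unknown".  The helper and
 * its names are hypothetical.
 */
static bool
can_use_targeted_drop(const BlockNumber *cached_nblocks,
					  const BlockNumber *first_del_block,
					  int nforks, uint64 full_scan_threshold)
{
	uint64		to_invalidate = 0;

	for (int i = 0; i < nforks; i++)
	{
		if (cached_nblocks[i] == InvalidBlockNumber)
			return false;		/* unknown size: must scan the whole pool */
		to_invalidate += cached_nblocks[i] - first_del_block[i];
	}
	return to_invalidate < full_scan_threshold;
}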
4750 : :
4751 : : /* ---------------------------------------------------------------------
4752 : : * DropRelationsAllBuffers
4753 : : *
4754 : : * This function removes from the buffer pool all the pages of all
4755 : : * forks of the specified relations. It's equivalent to calling
4756 : : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4757 : : * --------------------------------------------------------------------
4758 : : */
4759 : : void
1254 rhaas@postgresql.org 4760 :CBC 14168 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4761 : : {
4762 : : int i;
1799 akapila@postgresql.o 4763 : 14168 : int n = 0;
4764 : : SMgrRelation *rels;
4765 : : BlockNumber (*block)[MAX_FORKNUM + 1];
1680 tgl@sss.pgh.pa.us 4766 : 14168 : uint64 nBlocksToInvalidate = 0;
4767 : : RelFileLocator *locators;
1799 akapila@postgresql.o 4768 : 14168 : bool cached = true;
4769 : : bool use_bsearch;
4770 : :
1260 rhaas@postgresql.org 4771 [ - + ]: 14168 : if (nlocators == 0)
4717 alvherre@alvh.no-ip. 4772 :UBC 0 : return;
4773 : :
6 michael@paquier.xyz 4774 :GNC 14168 : rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4775 : :
4776 : : /* If it's a local relation, it's localbuf.c's problem. */
1260 rhaas@postgresql.org 4777 [ + + ]:CBC 62033 : for (i = 0; i < nlocators; i++)
4778 : : {
4779 [ + + ]: 47865 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4780 : : {
654 heikki.linnakangas@i 4781 [ + - ]: 3264 : if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
1254 rhaas@postgresql.org 4782 : 3264 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4783 : : }
4784 : : else
1799 akapila@postgresql.o 4785 : 44601 : rels[n++] = smgr_reln[i];
4786 : : }
4787 : :
4788 : : /*
4789 : : * If there are no non-local relations, then we're done. Release the
4790 : : * memory and return.
4791 : : */
4717 alvherre@alvh.no-ip. 4792 [ + + ]: 14168 : if (n == 0)
4793 : : {
1799 akapila@postgresql.o 4794 : 858 : pfree(rels);
4941 tgl@sss.pgh.pa.us 4795 : 858 : return;
4796 : : }
4797 : :
4798 : : /*
4799 : : * This is used to remember the number of blocks for all the relations
4800 : : * forks.
4801 : : */
4802 : : block = (BlockNumber (*)[MAX_FORKNUM + 1])
1799 akapila@postgresql.o 4803 : 13310 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4804 : :
4805 : : /*
4806 : : * We can avoid scanning the entire buffer pool if we know the exact size
4807 : : * of each of the given relation forks. See DropRelationBuffers.
4808 : : */
4809 [ + + + + ]: 27893 : for (i = 0; i < n && cached; i++)
4810 : : {
1211 drowley@postgresql.o 4811 [ + + ]: 22975 : for (int j = 0; j <= MAX_FORKNUM; j++)
4812 : : {
4813 : : /* Get the number of blocks for a relation's fork. */
1799 akapila@postgresql.o 4814 : 20891 : block[i][j] = smgrnblocks_cached(rels[i], j);
4815 : :
4816 : : /* We need to only consider the relation forks that exists. */
4817 [ + + ]: 20891 : if (block[i][j] == InvalidBlockNumber)
4818 : : {
4819 [ + + ]: 18646 : if (!smgrexists(rels[i], j))
4820 : 6147 : continue;
4821 : 12499 : cached = false;
4822 : 12499 : break;
4823 : : }
4824 : :
4825 : : /* calculate the total number of blocks to be invalidated */
4826 : 2245 : nBlocksToInvalidate += block[i][j];
4827 : : }
4828 : : }
4829 : :
4830 : : /*
4831 : : * We apply the optimization iff the total number of blocks to invalidate
4832 : : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4833 : : */
4834 [ + + + + ]: 13310 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4835 : : {
4836 [ + + ]: 1369 : for (i = 0; i < n; i++)
4837 : : {
1211 drowley@postgresql.o 4838 [ + + ]: 3780 : for (int j = 0; j <= MAX_FORKNUM; j++)
4839 : : {
4840 : : /* ignore relation forks that don't exist */
1799 akapila@postgresql.o 4841 [ + + ]: 3024 : if (!BlockNumberIsValid(block[i][j]))
4842 : 2257 : continue;
4843 : :
4844 : : /* drop all the buffers for a particular relation fork */
1254 rhaas@postgresql.org 4845 : 767 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4846 : 767 : j, block[i][j], 0);
4847 : : }
4848 : : }
4849 : :
1799 akapila@postgresql.o 4850 : 613 : pfree(block);
4851 : 613 : pfree(rels);
4852 : 613 : return;
4853 : : }
4854 : :
4855 : 12697 : pfree(block);
6 michael@paquier.xyz 4856 :GNC 12697 : locators = palloc_array(RelFileLocator, n); /* non-local relations */
1799 akapila@postgresql.o 4857 [ + + ]:CBC 56542 : for (i = 0; i < n; i++)
1260 rhaas@postgresql.org 4858 : 43845 : locators[i] = rels[i]->smgr_rlocator.locator;
4859 : :
4860 : : /*
4861 : : * For low number of relations to drop just use a simple walk through, to
4862 : : * save the bsearch overhead. The threshold to use is rather a guess than
4863 : : * an exactly determined value, as it depends on many factors (CPU and RAM
4864 : : * speeds, amount of shared buffers etc.).
4865 : : */
2083 noah@leadboat.com 4866 : 12697 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4867 : :
4868 : : /* sort the list of rlocators if necessary */
4717 alvherre@alvh.no-ip. 4869 [ + + ]: 12697 : if (use_bsearch)
670 nathan@postgresql.or 4870 : 173 : qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4871 : :
4941 tgl@sss.pgh.pa.us 4872 [ + + ]: 136530201 : for (i = 0; i < NBuffers; i++)
4873 : : {
1260 rhaas@postgresql.org 4874 : 136517504 : RelFileLocator *rlocator = NULL;
3684 4875 : 136517504 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4876 : :
4877 : : /*
4878 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
4879 : : * saves some cycles.
4880 : : */
4881 : :
4717 alvherre@alvh.no-ip. 4882 [ + + ]: 136517504 : if (!use_bsearch)
4883 : : {
4884 : : int j;
4885 : :
4886 [ + + ]: 548164073 : for (j = 0; j < n; j++)
4887 : : {
1211 rhaas@postgresql.org 4888 [ + + ]: 413595780 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4889 : : {
1260 4890 : 90139 : rlocator = &locators[j];
4717 alvherre@alvh.no-ip. 4891 : 90139 : break;
4892 : : }
4893 : : }
4894 : : }
4895 : : else
4896 : : {
4897 : : RelFileLocator locator;
4898 : :
1211 rhaas@postgresql.org 4899 : 1859072 : locator = BufTagGetRelFileLocator(&bufHdr->tag);
376 peter@eisentraut.org 4900 : 1859072 : rlocator = bsearch(&locator,
4901 : : locators, n, sizeof(RelFileLocator),
4902 : : rlocator_comparator);
4903 : : }
4904 : :
4905 : : /* buffer doesn't belong to any of the given relfilelocators; skip it */
1260 rhaas@postgresql.org 4906 [ + + ]: 136517504 : if (rlocator == NULL)
4941 tgl@sss.pgh.pa.us 4907 : 136425583 : continue;
4908 : :
41 andres@anarazel.de 4909 :GNC 91921 : LockBufHdr(bufHdr);
1211 rhaas@postgresql.org 4910 [ + - ]:CBC 91921 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4941 tgl@sss.pgh.pa.us 4911 : 91921 : InvalidateBuffer(bufHdr); /* releases spinlock */
4912 : : else
41 andres@anarazel.de 4913 :UNC 0 : UnlockBufHdr(bufHdr);
4914 : : }
4915 : :
1260 rhaas@postgresql.org 4916 :CBC 12697 : pfree(locators);
1799 akapila@postgresql.o 4917 : 12697 : pfree(rels);
4918 : : }
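
/*
 * Illustrative sketch (not part of bufmgr.c): cost intuition behind the
 * RELS_BSEARCH_THRESHOLD cutoff above.  Every buffer is visited once, so the
 * per-buffer membership test dominates: a linear probe costs up to n
 * comparisons per buffer, while binary search costs about log2(n) plus a
 * one-time qsort.  With, say, NBuffers = 16384 and n = 4 relations, linear
 * probing does at most 16384 * 4 = 65536 comparisons; with n = 100, binary
 * search needs only about 7 comparisons per buffer.  The exact crossover
 * depends on hardware, hence the modest, hand-tuned threshold.
 */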
4919 : :
4920 : : /* ---------------------------------------------------------------------
4921 : : * FindAndDropRelationBuffers
4922 : : *
4923 : : * This function performs lookups in the BufMapping table and removes from
4924 : : * the buffer pool all the pages of the specified relation fork that have
4925 : : * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4926 : : * pages are removed.)
4927 : : * --------------------------------------------------------------------
4928 : : */
4929 : : static void
1254 rhaas@postgresql.org 4930 : 847 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
4931 : : BlockNumber nForkBlock,
4932 : : BlockNumber firstDelBlock)
4933 : : {
4934 : : BlockNumber curBlock;
4935 : :
1800 akapila@postgresql.o 4936 [ + + ]: 2032 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4937 : : {
4938 : : uint32 bufHash; /* hash value for tag */
4939 : : BufferTag bufTag; /* identity of requested block */
4940 : : LWLock *bufPartitionLock; /* buffer partition lock for it */
4941 : : int buf_id;
4942 : : BufferDesc *bufHdr;
4943 : :
4944 : : /* create a tag so we can lookup the buffer */
1239 rhaas@postgresql.org 4945 : 1185 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4946 : :
4947 : : /* determine its hash code and partition lock ID */
1800 akapila@postgresql.o 4948 : 1185 : bufHash = BufTableHashCode(&bufTag);
4949 : 1185 : bufPartitionLock = BufMappingPartitionLock(bufHash);
4950 : :
4951 : : /* Check that it is in the buffer pool. If not, do nothing. */
4952 : 1185 : LWLockAcquire(bufPartitionLock, LW_SHARED);
4953 : 1185 : buf_id = BufTableLookup(&bufTag, bufHash);
4954 : 1185 : LWLockRelease(bufPartitionLock);
4955 : :
4956 [ + + ]: 1185 : if (buf_id < 0)
4957 : 143 : continue;
4958 : :
4959 : 1042 : bufHdr = GetBufferDescriptor(buf_id);
4960 : :
4961 : : /*
4962 : : * We need to lock the buffer header and recheck if the buffer is
4963 : : * still associated with the same block because the buffer could be
4964 : : * evicted by some other backend loading blocks for a different
4965 : : * relation after we release lock on the BufMapping table.
4966 : : */
41 andres@anarazel.de 4967 :GNC 1042 : LockBufHdr(bufHdr);
4968 : :
1211 rhaas@postgresql.org 4969 [ + - + - ]:CBC 2084 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4970 : 1042 : BufTagGetForkNum(&bufHdr->tag) == forkNum &&
1800 akapila@postgresql.o 4971 [ + - ]: 1042 : bufHdr->tag.blockNum >= firstDelBlock)
4972 : 1042 : InvalidateBuffer(bufHdr); /* releases spinlock */
4973 : : else
41 andres@anarazel.de 4974 :UNC 0 : UnlockBufHdr(bufHdr);
4975 : : }
1800 akapila@postgresql.o 4976 :CBC 847 : }
4977 : :
4978 : : /* ---------------------------------------------------------------------
4979 : : * DropDatabaseBuffers
4980 : : *
4981 : : * This function removes all the buffers in the buffer cache for a
4982 : : * particular database. Dirty pages are simply dropped, without
4983 : : * bothering to write them out first. This is used when we destroy a
4984 : : * database, to avoid trying to flush data to disk when the directory
4985 : : * tree no longer exists. Implementation is pretty similar to
4986 : : * DropRelationBuffers() which is for destroying just one relation.
4987 : : * --------------------------------------------------------------------
4988 : : */
4989 : : void
7203 tgl@sss.pgh.pa.us 4990 : 73 : DropDatabaseBuffers(Oid dbid)
4991 : : {
4992 : : int i;
4993 : :
4994 : : /*
4995 : : * We needn't consider local buffers, since by assumption the target
4996 : : * database isn't our own.
4997 : : */
4998 : :
7593 4999 [ + + ]: 529609 : for (i = 0; i < NBuffers; i++)
5000 : : {
3684 rhaas@postgresql.org 5001 : 529536 : BufferDesc *bufHdr = GetBufferDescriptor(i);
5002 : :
5003 : : /*
5004 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
5005 : : * saves some cycles.
5006 : : */
1211 5007 [ + + ]: 529536 : if (bufHdr->tag.dbOid != dbid)
4941 tgl@sss.pgh.pa.us 5008 : 516408 : continue;
5009 : :
41 andres@anarazel.de 5010 :GNC 13128 : LockBufHdr(bufHdr);
1211 rhaas@postgresql.org 5011 [ + - ]:CBC 13128 : if (bufHdr->tag.dbOid == dbid)
7368 bruce@momjian.us 5012 : 13128 : InvalidateBuffer(bufHdr); /* releases spinlock */
5013 : : else
41 andres@anarazel.de 5014 :UNC 0 : UnlockBufHdr(bufHdr);
5015 : : }
10753 scrappy@hub.org 5016 :CBC 73 : }
5017 : :
5018 : : /* ---------------------------------------------------------------------
5019 : : * FlushRelationBuffers
5020 : : *
5021 : : * This function writes all dirty pages of a relation out to disk
5022 : : * (or more accurately, out to kernel disk buffers), ensuring that the
5023 : : * kernel has an up-to-date view of the relation.
5024 : : *
5025 : : * Generally, the caller should be holding AccessExclusiveLock on the
5026 : : * target relation to ensure that no other backend is busy dirtying
5027 : : * more blocks of the relation; the effects can't be expected to last
5028 : : * after the lock is released.
5029 : : *
5030 : : * XXX currently it sequentially searches the buffer pool; this should be
5031 : : * changed to a more clever way of searching. This routine is not
5032 : : * used in any performance-critical code paths, so it's not worth
5033 : : * adding additional overhead to normal paths to make it go faster.
5034 : : * --------------------------------------------------------------------
5035 : : */
5036 : : void
7577 tgl@sss.pgh.pa.us 5037 : 138 : FlushRelationBuffers(Relation rel)
5038 : : {
5039 : : int i;
5040 : : BufferDesc *bufHdr;
686 heikki.linnakangas@i 5041 : 138 : SMgrRelation srel = RelationGetSmgr(rel);
5042 : :
5483 rhaas@postgresql.org 5043 [ + + ]: 138 : if (RelationUsesLocalBuffers(rel))
5044 : : {
10313 vadim4o@yahoo.com 5045 [ + + ]: 909 : for (i = 0; i < NLocBuffer; i++)
5046 : : {
5047 : : uint32 buf_state;
5048 : :
3975 andres@anarazel.de 5049 : 900 : bufHdr = GetLocalBufferDescriptor(i);
1211 rhaas@postgresql.org 5050 [ + + ]: 900 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
3538 andres@anarazel.de 5051 [ + - ]: 300 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
5052 : : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5053 : : {
5054 : : ErrorContextCallback errcallback;
5055 : :
5056 : : /* Setup error traceback support for ereport() */
4783 heikki.linnakangas@i 5057 : 300 : errcallback.callback = local_buffer_write_error_callback;
384 peter@eisentraut.org 5058 : 300 : errcallback.arg = bufHdr;
4783 heikki.linnakangas@i 5059 : 300 : errcallback.previous = error_context_stack;
5060 : 300 : error_context_stack = &errcallback;
5061 : :
5062 : : /* Make sure we can handle the pin */
254 andres@anarazel.de 5063 : 300 : ReservePrivateRefCountEntry();
5064 : 300 : ResourceOwnerEnlarge(CurrentResourceOwner);
5065 : :
5066 : : /*
5067 : : * Pin/unpin mostly to make valgrind work, but it also seems
5068 : : * like the right thing to do.
5069 : : */
5070 : 300 : PinLocalBuffer(bufHdr, false);
5071 : :
5072 : :
277 5073 : 300 : FlushLocalBuffer(bufHdr, srel);
5074 : :
254 5075 : 300 : UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
5076 : :
5077 : : /* Pop the error context stack */
4783 heikki.linnakangas@i 5078 : 300 : error_context_stack = errcallback.previous;
5079 : : }
5080 : : }
5081 : :
7870 tgl@sss.pgh.pa.us 5082 : 9 : return;
5083 : : }
5084 : :
10313 vadim4o@yahoo.com 5085 [ + + ]: 1512193 : for (i = 0; i < NBuffers; i++)
5086 : : {
5087 : : uint32 buf_state;
5088 : :
3975 andres@anarazel.de 5089 : 1512064 : bufHdr = GetBufferDescriptor(i);
5090 : :
5091 : : /*
5092 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
5093 : : * saves some cycles.
5094 : : */
1211 rhaas@postgresql.org 5095 [ + + ]: 1512064 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4941 tgl@sss.pgh.pa.us 5096 : 1511841 : continue;
5097 : :
5098 : : /* Make sure we can handle the pin */
3985 andres@anarazel.de 5099 : 223 : ReservePrivateRefCountEntry();
770 heikki.linnakangas@i 5100 : 223 : ResourceOwnerEnlarge(CurrentResourceOwner);
5101 : :
3538 andres@anarazel.de 5102 : 223 : buf_state = LockBufHdr(bufHdr);
1211 rhaas@postgresql.org 5103 [ + - ]: 223 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
3538 andres@anarazel.de 5104 [ + + ]: 223 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5105 : : {
7577 tgl@sss.pgh.pa.us 5106 : 180 : PinBuffer_Locked(bufHdr);
70 andres@anarazel.de 5107 :GNC 180 : FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
1174 michael@paquier.xyz 5108 :CBC 180 : UnpinBuffer(bufHdr);
5109 : : }
5110 : : else
41 andres@anarazel.de 5111 :GNC 43 : UnlockBufHdr(bufHdr);
5112 : : }
5113 : : }
5114 : :
5115 : : /* ---------------------------------------------------------------------
5116 : : * FlushRelationsAllBuffers
5117 : : *
5118 : : * This function flushes out of the buffer pool all the pages of all
5119 : : * forks of the specified smgr relations. It's equivalent to calling
5120 : : * FlushRelationBuffers once per relation. The relations are assumed not
5121 : : * to use local buffers.
5122 : : * --------------------------------------------------------------------
5123 : : */
5124 : : void
2083 noah@leadboat.com 5125 :CBC 13 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
5126 : : {
5127 : : int i;
5128 : : SMgrSortArray *srels;
5129 : : bool use_bsearch;
5130 : :
5131 [ - + ]: 13 : if (nrels == 0)
2083 noah@leadboat.com 5132 :UBC 0 : return;
5133 : :
5134 : : /* fill-in array for qsort */
6 michael@paquier.xyz 5135 :GNC 13 : srels = palloc_array(SMgrSortArray, nrels);
5136 : :
2083 noah@leadboat.com 5137 [ + + ]:CBC 26 : for (i = 0; i < nrels; i++)
5138 : : {
1260 rhaas@postgresql.org 5139 [ - + ]: 13 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5140 : :
5141 : 13 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
2083 noah@leadboat.com 5142 : 13 : srels[i].srel = smgrs[i];
5143 : : }
5144 : :
5145 : : /*
5146 : : * Save the bsearch overhead for a low number of relations to sync. See
5147 : : * DropRelationsAllBuffers for details.
5148 : : */
5149 : 13 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5150 : :
5151 : : /* sort the list of SMgrRelations if necessary */
5152 [ - + ]: 13 : if (use_bsearch)
670 nathan@postgresql.or 5153 :UBC 0 : qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5154 : :
2083 noah@leadboat.com 5155 [ + + ]:CBC 213005 : for (i = 0; i < NBuffers; i++)
5156 : : {
5157 : 212992 : SMgrSortArray *srelent = NULL;
5158 : 212992 : BufferDesc *bufHdr = GetBufferDescriptor(i);
5159 : : uint32 buf_state;
5160 : :
5161 : : /*
5162 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
5163 : : * saves some cycles.
5164 : : */
5165 : :
5166 [ + - ]: 212992 : if (!use_bsearch)
5167 : : {
5168 : : int j;
5169 : :
5170 [ + + ]: 421272 : for (j = 0; j < nrels; j++)
5171 : : {
1211 rhaas@postgresql.org 5172 [ + + ]: 212992 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5173 : : {
2083 noah@leadboat.com 5174 : 4712 : srelent = &srels[j];
5175 : 4712 : break;
5176 : : }
5177 : : }
5178 : : }
5179 : : else
5180 : : {
5181 : : RelFileLocator rlocator;
5182 : :
1211 rhaas@postgresql.org 5183 :UBC 0 : rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
376 peter@eisentraut.org 5184 : 0 : srelent = bsearch(&rlocator,
5185 : : srels, nrels, sizeof(SMgrSortArray),
5186 : : rlocator_comparator);
5187 : : }
5188 : :
5189 : : /* buffer doesn't belong to any of the given relfilelocators; skip it */
2083 noah@leadboat.com 5190 [ + + ]:CBC 212992 : if (srelent == NULL)
5191 : 208280 : continue;
5192 : :
5193 : : /* Make sure we can handle the pin */
5194 : 4712 : ReservePrivateRefCountEntry();
770 heikki.linnakangas@i 5195 : 4712 : ResourceOwnerEnlarge(CurrentResourceOwner);
5196 : :
2083 noah@leadboat.com 5197 : 4712 : buf_state = LockBufHdr(bufHdr);
1211 rhaas@postgresql.org 5198 [ + - ]: 4712 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
2083 noah@leadboat.com 5199 [ + + ]: 4712 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5200 : : {
5201 : 4252 : PinBuffer_Locked(bufHdr);
70 andres@anarazel.de 5202 :GNC 4252 : FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
1174 michael@paquier.xyz 5203 :CBC 4252 : UnpinBuffer(bufHdr);
5204 : : }
5205 : : else
41 andres@anarazel.de 5206 :GNC 460 : UnlockBufHdr(bufHdr);
5207 : : }
5208 : :
2083 noah@leadboat.com 5209 :CBC 13 : pfree(srels);
5210 : : }
5211 : :
5212 : : /* ---------------------------------------------------------------------
5213 : : * RelationCopyStorageUsingBuffer
5214 : : *
5215 : : * Copy a fork's data using bufmgr. Same as RelationCopyStorage, but
5216 : : * instead of using smgrread and smgrextend, this copies using bufmgr APIs.
5217 : : *
5218 : : * Refer to the comments atop CreateAndCopyRelationData() for details about
5219 : : * the 'permanent' parameter.
5220 : : * --------------------------------------------------------------------
5221 : : */
5222 : : static void
1223 rhaas@postgresql.org 5223 : 72441 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
5224 : : RelFileLocator dstlocator,
5225 : : ForkNumber forkNum, bool permanent)
5226 : : {
5227 : : Buffer srcBuf;
5228 : : Buffer dstBuf;
5229 : : Page srcPage;
5230 : : Page dstPage;
5231 : : bool use_wal;
5232 : : BlockNumber nblocks;
5233 : : BlockNumber blkno;
5234 : : PGIOAlignedBlock buf;
5235 : : BufferAccessStrategy bstrategy_src;
5236 : : BufferAccessStrategy bstrategy_dst;
5237 : : BlockRangeReadStreamPrivate p;
5238 : : ReadStream *src_stream;
5239 : : SMgrRelation src_smgr;
5240 : :
5241 : : /*
5242 : : * In general, we want to write WAL whenever wal_level > 'minimal', but we
5243 : : * can skip it when copying any fork of an unlogged relation other than
5244 : : * the init fork.
5245 : : */
1359 5246 [ + + - + : 72441 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
- - ]
5247 : :
5248 : : /* Get number of blocks in the source relation. */
654 heikki.linnakangas@i 5249 : 72441 : nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5250 : : forkNum);
5251 : :
5252 : : /* Nothing to copy; just return. */
1359 rhaas@postgresql.org 5253 [ + + ]: 72441 : if (nblocks == 0)
5254 : 12635 : return;
5255 : :
5256 : : /*
5257 : : * Bulk extend the destination relation to the same size as the source
5258 : : * relation before starting to copy block by block.
5259 : : */
1217 5260 : 59806 : memset(buf.data, 0, BLCKSZ);
654 heikki.linnakangas@i 5261 : 59806 : smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5262 : : buf.data, true);
5263 : :
5264 : : /* This is a bulk operation, so use buffer access strategies. */
1359 rhaas@postgresql.org 5265 : 59806 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5266 : 59806 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5267 : :
5268 : : /* Initialize streaming read */
470 noah@leadboat.com 5269 : 59806 : p.current_blocknum = 0;
5270 : 59806 : p.last_exclusive = nblocks;
515 5271 : 59806 : src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5272 : :
5273 : : /*
5274 : : * It is safe to use batchmode as block_range_read_stream_cb takes no
5275 : : * locks.
5276 : : */
262 andres@anarazel.de 5277 [ + - ]: 59806 : src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
5278 : : READ_STREAM_USE_BATCHING,
5279 : : bstrategy_src,
5280 : : src_smgr,
5281 : : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5282 : : forkNum,
5283 : : block_range_read_stream_cb,
5284 : : &p,
5285 : : 0);
5286 : :
5287 : : /* Iterate over each block of the source relation file. */
1359 rhaas@postgresql.org 5288 [ + + ]: 293450 : for (blkno = 0; blkno < nblocks; blkno++)
5289 : : {
5290 [ - + ]: 233647 : CHECK_FOR_INTERRUPTS();
5291 : :
5292 : : /* Read block from source relation. */
515 noah@leadboat.com 5293 : 233647 : srcBuf = read_stream_next_buffer(src_stream, NULL);
1229 tgl@sss.pgh.pa.us 5294 : 233644 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
1359 rhaas@postgresql.org 5295 : 233644 : srcPage = BufferGetPage(srcBuf);
5296 : :
515 noah@leadboat.com 5297 : 233644 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5298 : : BufferGetBlockNumber(srcBuf),
5299 : : RBM_ZERO_AND_LOCK, bstrategy_dst,
5300 : : permanent);
1229 tgl@sss.pgh.pa.us 5301 : 233644 : dstPage = BufferGetPage(dstBuf);
5302 : :
1359 rhaas@postgresql.org 5303 : 233644 : START_CRIT_SECTION();
5304 : :
5305 : : /* Copy page data from the source to the destination. */
5306 : 233644 : memcpy(dstPage, srcPage, BLCKSZ);
5307 : 233644 : MarkBufferDirty(dstBuf);
5308 : :
5309 : : /* WAL-log the copied page. */
5310 [ + + ]: 233644 : if (use_wal)
5311 : 124059 : log_newpage_buffer(dstBuf, true);
5312 : :
5313 [ - + ]: 233644 : END_CRIT_SECTION();
5314 : :
5315 : 233644 : UnlockReleaseBuffer(dstBuf);
1229 tgl@sss.pgh.pa.us 5316 : 233644 : UnlockReleaseBuffer(srcBuf);
5317 : : }
515 noah@leadboat.com 5318 [ - + ]: 59803 : Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5319 : 59803 : read_stream_end(src_stream);
5320 : :
1003 andres@anarazel.de 5321 : 59803 : FreeAccessStrategy(bstrategy_src);
5322 : 59803 : FreeAccessStrategy(bstrategy_dst);
5323 : : }
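
/*
 * Illustrative sketch (not part of bufmgr.c): when the copy loop above
 * WAL-logs pages.  use_wal = XLogIsNeeded() && (permanent || init fork), so:
 *
 *     wal_level > minimal, permanent relation, any fork    -> logged
 *     wal_level > minimal, unlogged relation, init fork    -> logged
 *     wal_level > minimal, unlogged relation, other forks  -> not logged
 *     wal_level = minimal                                  -> not logged
 */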
5324 : :
5325 : : /* ---------------------------------------------------------------------
5326 : : * CreateAndCopyRelationData
5327 : : *
5328 : : * Create destination relation storage and copy all forks from the
5329 : : * source relation to the destination.
5330 : : *
5331 : : * Pass permanent as true for permanent relations and false for
5332 : : * unlogged relations. Currently this API is not supported for
5333 : : * temporary relations.
5334 : : * --------------------------------------------------------------------
5335 : : */
5336 : : void
1260 rhaas@postgresql.org 5337 : 54454 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
5338 : : RelFileLocator dst_rlocator, bool permanent)
5339 : : {
5340 : : char relpersistence;
5341 : : SMgrRelation src_rel;
5342 : : SMgrRelation dst_rel;
5343 : :
5344 : : /* Set the relpersistence. */
1359 5345 [ + - ]: 54454 : relpersistence = permanent ?
5346 : : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5347 : :
654 heikki.linnakangas@i 5348 : 54454 : src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5349 : 54454 : dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5350 : :
5351 : : /*
5352 : : * Create and copy all forks of the relation. During create database we
5353 : : * have a separate cleanup mechanism which deletes the complete database
5354 : : * directory. Therefore, each individual relation doesn't need to be
5355 : : * registered for cleanup.
5356 : : */
1260 rhaas@postgresql.org 5357 : 54454 : RelationCreateStorage(dst_rlocator, relpersistence, false);
5358 : :
5359 : : /* copy main fork. */
1223 5360 : 54454 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5361 : : permanent);
5362 : :
5363 : : /* copy those extra forks that exist */
1359 5364 : 54451 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5365 [ + + ]: 217804 : forkNum <= MAX_FORKNUM; forkNum++)
5366 : : {
686 heikki.linnakangas@i 5367 [ + + ]: 163353 : if (smgrexists(src_rel, forkNum))
5368 : : {
5369 : 17987 : smgrcreate(dst_rel, forkNum, false);
5370 : :
5371 : : /*
5372 : : * WAL log creation if the relation is persistent, or this is the
5373 : : * init fork of an unlogged relation.
5374 : : */
1359 rhaas@postgresql.org 5375 [ - + - - ]: 17987 : if (permanent || forkNum == INIT_FORKNUM)
1260 5376 : 17987 : log_smgrcreate(&dst_rlocator, forkNum);
5377 : :
5378 : : /* Copy a fork's data, block by block. */
1223 5379 : 17987 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5380 : : permanent);
5381 : : }
5382 : : }
1359 5383 : 54451 : }
5384 : :
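/*
 * A minimal usage sketch of CreateAndCopyRelationData(), assuming the caller
 * already knows both relfilelocators (as the WAL_LOG strategy of CREATE
 * DATABASE does); the wrapper function name is illustrative only.
 */
static void
copy_relation_storage_example(RelFileLocator src, RelFileLocator dst,
							  bool permanent)
{
	/* Creates dst's storage, WAL-logs it as needed, and copies every fork. */
	CreateAndCopyRelationData(src, dst, permanent);
}
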
5385 : : /* ---------------------------------------------------------------------
5386 : : * FlushDatabaseBuffers
5387 : : *
5388 : : * This function writes all dirty pages of a database out to disk
5389 : : * (or more accurately, out to kernel disk buffers), ensuring that the
5390 : : * kernel has an up-to-date view of the database.
5391 : : *
5392 : : * Generally, the caller should be holding an appropriate lock to ensure
5393 : : * no other backend is active in the target database; otherwise more
5394 : : * pages could get dirtied.
5395 : : *
5396 : : * Note we don't worry about flushing any pages of temporary relations.
5397 : : * It's assumed these wouldn't be interesting.
5398 : : * --------------------------------------------------------------------
5399 : : */
5400 : : void
6747 tgl@sss.pgh.pa.us 5401 : 5 : FlushDatabaseBuffers(Oid dbid)
5402 : : {
5403 : : int i;
5404 : : BufferDesc *bufHdr;
5405 : :
5406 [ + + ]: 645 : for (i = 0; i < NBuffers; i++)
5407 : : {
5408 : : uint32 buf_state;
5409 : :
3975 andres@anarazel.de 5410 : 640 : bufHdr = GetBufferDescriptor(i);
5411 : :
5412 : : /*
5413 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
5414 : : * saves some cycles.
5415 : : */
1211 rhaas@postgresql.org 5416 [ + + ]: 640 : if (bufHdr->tag.dbOid != dbid)
4941 tgl@sss.pgh.pa.us 5417 : 452 : continue;
5418 : :
5419 : : /* Make sure we can handle the pin */
3985 andres@anarazel.de 5420 : 188 : ReservePrivateRefCountEntry();
770 heikki.linnakangas@i 5421 : 188 : ResourceOwnerEnlarge(CurrentResourceOwner);
5422 : :
3538 andres@anarazel.de 5423 : 188 : buf_state = LockBufHdr(bufHdr);
1211 rhaas@postgresql.org 5424 [ + - ]: 188 : if (bufHdr->tag.dbOid == dbid &&
3538 andres@anarazel.de 5425 [ + + ]: 188 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5426 : : {
6747 tgl@sss.pgh.pa.us 5427 : 36 : PinBuffer_Locked(bufHdr);
70 andres@anarazel.de 5428 :GNC 36 : FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
1174 michael@paquier.xyz 5429 :CBC 36 : UnpinBuffer(bufHdr);
5430 : : }
5431 : : else
41 andres@anarazel.de 5432 :GNC 152 : UnlockBufHdr(bufHdr);
5433 : : }
6747 tgl@sss.pgh.pa.us 5434 :CBC 5 : }
5435 : :
5436 : : /*
5437 : : * Flush a previously pinned buffer, locked in shared or exclusive mode, to
5438 : : * the OS.
5439 : : */
5440 : : void
3660 andres@anarazel.de 5441 : 74 : FlushOneBuffer(Buffer buffer)
5442 : : {
5443 : : BufferDesc *bufHdr;
5444 : :
5445 : : /* currently not needed, but no fundamental reason not to support */
5446 [ - + ]: 74 : Assert(!BufferIsLocal(buffer));
5447 : :
5448 [ - + - + : 74 : Assert(BufferIsPinned(buffer));
- + ]
5449 : :
5450 : 74 : bufHdr = GetBufferDescriptor(buffer - 1);
5451 : :
70 andres@anarazel.de 5452 [ - + ]:GNC 74 : Assert(BufferIsLockedByMe(buffer));
5453 : :
1042 andres@anarazel.de 5454 :CBC 74 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3660 5455 : 74 : }
5456 : :
5457 : : /*
5458 : : * ReleaseBuffer -- release the pin on a buffer
5459 : : */
5460 : : void
8534 tgl@sss.pgh.pa.us 5461 : 58646894 : ReleaseBuffer(Buffer buffer)
5462 : : {
7733 5463 [ - + ]: 58646894 : if (!BufferIsValid(buffer))
5295 peter_e@gmx.net 5464 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5465 : :
10328 bruce@momjian.us 5466 [ + + ]:CBC 58646894 : if (BufferIsLocal(buffer))
987 andres@anarazel.de 5467 : 1771876 : UnpinLocalBuffer(buffer);
5468 : : else
5469 : 56875018 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
10753 scrappy@hub.org 5470 : 58646894 : }
5471 : :
5472 : : /*
5473 : : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5474 : : *
5475 : : * This is just a shorthand for a common combination.
5476 : : */
5477 : : void
7201 tgl@sss.pgh.pa.us 5478 : 18138313 : UnlockReleaseBuffer(Buffer buffer)
5479 : : {
5480 : 18138313 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5481 : 18138313 : ReleaseBuffer(buffer);
5482 : 18138313 : }
5483 : :
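/*
 * A minimal usage sketch of the pin/lock/release pattern served by
 * ReleaseBuffer(), UnlockReleaseBuffer() and LockBuffer(); "rel" and "blkno"
 * are assumed to be supplied by the caller, and the function name is
 * illustrative only.
 */
static void
read_one_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... examine BufferGetPage(buf) while the content lock is held ... */
	UnlockReleaseBuffer(buf);	/* drop the content lock, then the pin */
}
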
5484 : : /*
5485 : : * IncrBufferRefCount
5486 : : * Increment the pin count on a buffer that we have *already* pinned
5487 : : * at least once.
5488 : : *
5489 : : * This function cannot be used on a buffer we do not have pinned,
5490 : : * because it doesn't change the shared buffer state.
5491 : : */
5492 : : void
7823 5493 : 10811357 : IncrBufferRefCount(Buffer buffer)
5494 : : {
7693 neilc@samurai.com 5495 [ - + + + : 10811357 : Assert(BufferIsPinned(buffer));
- + ]
770 heikki.linnakangas@i 5496 : 10811357 : ResourceOwnerEnlarge(CurrentResourceOwner);
7823 tgl@sss.pgh.pa.us 5497 [ + + ]: 10811357 : if (BufferIsLocal(buffer))
5498 : 356163 : LocalRefCount[-buffer - 1]++;
5499 : : else
5500 : : {
5501 : : PrivateRefCountEntry *ref;
5502 : :
3985 andres@anarazel.de 5503 : 10455194 : ref = GetPrivateRefCountEntry(buffer, true);
4127 5504 [ - + ]: 10455194 : Assert(ref != NULL);
3 andres@anarazel.de 5505 :GNC 10455194 : ref->data.refcount++;
5506 : : }
2961 tgl@sss.pgh.pa.us 5507 :CBC 10811357 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
7823 5508 : 10811357 : }
5509 : :
5510 : : /*
5511 : : * MarkBufferDirtyHint
5512 : : *
5513 : : * Mark a buffer dirty for non-critical changes.
5514 : : *
5515 : : * This is essentially the same as MarkBufferDirty, except:
5516 : : *
5517 : : * 1. The caller does not write WAL; so if checksums are enabled, we may need
5518 : : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5519 : : * 2. The caller might have only share-lock instead of exclusive-lock on the
5520 : : * buffer's content lock.
5521 : : * 3. This function does not guarantee that the buffer is always marked dirty
5522 : : * (due to a race condition), so it cannot be used for important changes.
5523 : : */
5524 : : void
4566 jdavis@postgresql.or 5525 : 10059431 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
5526 : : {
5527 : : BufferDesc *bufHdr;
3528 kgrittn@postgresql.o 5528 : 10059431 : Page page = BufferGetPage(buffer);
5529 : :
7733 tgl@sss.pgh.pa.us 5530 [ - + ]: 10059431 : if (!BufferIsValid(buffer))
5295 peter_e@gmx.net 5531 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5532 : :
9383 tgl@sss.pgh.pa.us 5533 [ + + ]:CBC 10059431 : if (BufferIsLocal(buffer))
5534 : : {
7201 5535 : 745696 : MarkLocalBufferDirty(buffer);
9383 5536 : 745696 : return;
5537 : : }
5538 : :
3975 andres@anarazel.de 5539 : 9313735 : bufHdr = GetBufferDescriptor(buffer - 1);
5540 : :
4127 5541 [ - + ]: 9313735 : Assert(GetPrivateRefCount(buffer) > 0);
5542 : : /* here, either share or exclusive lock is OK */
70 andres@anarazel.de 5543 [ - + ]:GNC 9313735 : Assert(BufferIsLockedByMe(buffer));
5544 : :
5545 : : /*
5546 : : * This routine might get called many times on the same page, if we are
5547 : : * making the first scan after commit of an xact that added/deleted many
5548 : : * tuples. So, be as quick as we can if the buffer is already dirty. We
5549 : : * do this by not acquiring spinlock if it looks like the status bits are
5550 : : * already set. Since we make this test unlocked, there's a chance we
5551 : : * might fail to notice that the flags have just been cleared, and failed
5552 : : * might fail to notice that the flags have just been cleared, and fail
5553 : : * is only intended to be used in cases where failing to write out the
5554 : : * data would be harmless anyway, it doesn't really matter.
5555 : : */
3538 andres@anarazel.de 5556 [ + + ]:CBC 9313735 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5557 : : (BM_DIRTY | BM_JUST_DIRTIED))
5558 : : {
4653 simon@2ndQuadrant.co 5559 : 867412 : XLogRecPtr lsn = InvalidXLogRecPtr;
5560 : 867412 : bool dirtied = false;
1349 rhaas@postgresql.org 5561 : 867412 : bool delayChkptFlags = false;
5562 : : uint32 buf_state;
5563 : :
5564 : : /*
5565 : : * If we need to protect hint bit updates from torn writes, WAL-log a
5566 : : * full page image of the page. This full page image is only necessary
5567 : : * if the hint bit update is the first change to the page since the
5568 : : * last checkpoint.
5569 : : *
5570 : : * We don't check full_page_writes here because that logic is included
5571 : : * when we call XLogInsert() since the value changes dynamically.
5572 : : */
3538 andres@anarazel.de 5573 [ + + - + : 1733723 : if (XLogHintBitIsNeeded() &&
+ + ]
5574 : 866311 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5575 : : {
5576 : : /*
5577 : : * If we must not write WAL, due to a relfilelocator-specific
5578 : : * condition or being in recovery, don't dirty the page. We can
5579 : : * set the hint, just not dirty the page as a result so the hint
5580 : : * is lost when we evict the page or shutdown.
5581 : : *
5582 : : * See src/backend/storage/page/README for longer discussion.
5583 : : */
2083 noah@leadboat.com 5584 [ + + + + ]: 931916 : if (RecoveryInProgress() ||
1211 rhaas@postgresql.org 5585 : 65639 : RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4653 simon@2ndQuadrant.co 5586 : 804112 : return;
5587 : :
5588 : : /*
5589 : : * If the block is already dirty because we either made a change
5590 : : * or set a hint already, then we don't need to write a full page
5591 : : * image. Note that aggressive cleaning of blocks dirtied by hint
5592 : : * bit setting would increase the call rate. Bulk setting of hint
5593 : : * bits would reduce the call rate...
5594 : : *
5595 : : * We must issue the WAL record before we mark the buffer dirty.
5596 : : * Otherwise we might write the page before we write the WAL. That
5597 : : * causes a race condition, since a checkpoint might occur between
5598 : : * writing the WAL record and marking the buffer dirty. We solve
5599 : : * that with a kluge, but one that is already in use during
5600 : : * transaction commit to prevent race conditions. Basically, we
5601 : : * simply prevent the checkpoint WAL record from being written
5602 : : * until we have marked the buffer dirty. We don't start the
5603 : : * checkpoint flush until we have marked dirty, so our checkpoint
5604 : : * must flush the change to disk successfully or the checkpoint
5605 : : * never gets written, so crash recovery will fix.
5606 : : *
5607 : : * It's possible we may enter here without an xid, so it is
5608 : : * essential that CreateCheckPoint waits for virtual transactions
5609 : : * rather than full transactionids.
5610 : : */
1349 rhaas@postgresql.org 5611 [ - + ]: 62165 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5612 : 62165 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5613 : 62165 : delayChkptFlags = true;
4566 jdavis@postgresql.or 5614 : 62165 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
5615 : : }
5616 : :
3538 andres@anarazel.de 5617 : 63300 : buf_state = LockBufHdr(bufHdr);
5618 : :
5619 [ - + ]: 63300 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5620 : :
5621 [ + + ]: 63300 : if (!(buf_state & BM_DIRTY))
5622 : : {
4653 simon@2ndQuadrant.co 5623 : 63248 : dirtied = true; /* Means "will be dirtied by this action" */
5624 : :
5625 : : /*
5626 : : * Set the page LSN if we wrote a backup block. We aren't supposed
5627 : : * to set this when only holding a share lock but as long as we
5628 : : * serialise it somehow we're OK. We choose to set LSN while
5629 : : * holding the buffer header lock, which causes any reader of an
5630 : : * LSN who holds only a share lock to also obtain a buffer header
5631 : : * lock before using PageGetLSN(), which is enforced in
5632 : : * BufferGetLSNAtomic().
5633 : : *
5634 : : * If checksums are enabled, you might think we should reset the
5635 : : * checksum here. That will happen when the page is written
5636 : : * sometime later in this checkpoint cycle.
5637 : : */
41 alvherre@kurilemu.de 5638 [ + + ]:GNC 63248 : if (XLogRecPtrIsValid(lsn))
4653 simon@2ndQuadrant.co 5639 :CBC 31207 : PageSetLSN(page, lsn);
5640 : : }
5641 : :
41 andres@anarazel.de 5642 :GNC 63300 : UnlockBufHdrExt(bufHdr, buf_state,
5643 : : BM_DIRTY | BM_JUST_DIRTIED,
5644 : : 0, 0);
5645 : :
1349 rhaas@postgresql.org 5646 [ + + ]:CBC 63300 : if (delayChkptFlags)
5647 : 62165 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5648 : :
4653 simon@2ndQuadrant.co 5649 [ + + ]: 63300 : if (dirtied)
5650 : : {
4279 rhaas@postgresql.org 5651 : 63248 : pgBufferUsage.shared_blks_dirtied++;
5136 alvherre@alvh.no-ip. 5652 [ + + ]: 63248 : if (VacuumCostActive)
5653 : 1985 : VacuumCostBalance += VacuumCostPageDirty;
5654 : : }
5655 : : }
5656 : : }
5657 : :
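/*
 * A minimal usage sketch for MarkBufferDirtyHint(), assuming the caller
 * already holds a pin and at least a share lock on "buffer"; the actual
 * non-critical change is left abstract, and the function name is
 * illustrative only.
 */
static void
set_hint_bits_example(Buffer buffer)
{
	/* ... update a hint bit on BufferGetPage(buffer) ... */
	MarkBufferDirtyHint(buffer, true);	/* true: page has a standard layout */
}
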
5658 : : /*
5659 : : * Release buffer content locks for shared buffers.
5660 : : *
5661 : : * Used to clean up after errors.
5662 : : *
5663 : : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5664 : : * of releasing buffer content locks per se; the only thing we need to deal
5665 : : * with here is clearing any PIN_COUNT request that was in progress.
5666 : : */
5667 : : void
9105 tgl@sss.pgh.pa.us 5668 : 50340 : UnlockBuffers(void)
5669 : : {
3684 rhaas@postgresql.org 5670 : 50340 : BufferDesc *buf = PinCountWaitBuf;
5671 : :
7732 tgl@sss.pgh.pa.us 5672 [ - + ]: 50340 : if (buf)
5673 : : {
5674 : : uint32 buf_state;
41 andres@anarazel.de 5675 :UNC 0 : uint32 unset_bits = 0;
5676 : :
3538 andres@anarazel.de 5677 :UBC 0 : buf_state = LockBufHdr(buf);
5678 : :
5679 : : /*
5680 : : * Don't complain if flag bit not set; it could have been reset but we
5681 : : * got a cancel/die interrupt before getting the signal.
5682 : : */
5683 [ # # ]: 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
664 heikki.linnakangas@i 5684 [ # # ]: 0 : buf->wait_backend_pgprocno == MyProcNumber)
41 andres@anarazel.de 5685 :UNC 0 : unset_bits = BM_PIN_COUNT_WAITER;
5686 : :
5687 : 0 : UnlockBufHdrExt(buf, buf_state,
5688 : : 0, unset_bits,
5689 : : 0);
5690 : :
7593 tgl@sss.pgh.pa.us 5691 :UBC 0 : PinCountWaitBuf = NULL;
5692 : : }
9864 vadim4o@yahoo.com 5693 :CBC 50340 : }
5694 : :
5695 : : /*
5696 : : * Acquire or release the content_lock for the buffer.
5697 : : */
5698 : : void
14 andres@anarazel.de 5699 :GNC 164806290 : LockBuffer(Buffer buffer, BufferLockMode mode)
5700 : : {
5701 : : BufferDesc *buf;
5702 : :
1976 pg@bowt.ie 5703 [ - + + + :CBC 164806290 : Assert(BufferIsPinned(buffer));
- + ]
9864 vadim4o@yahoo.com 5704 [ + + ]: 164806290 : if (BufferIsLocal(buffer))
7201 tgl@sss.pgh.pa.us 5705 : 10419702 : return; /* local buffers need no lock */
5706 : :
3975 andres@anarazel.de 5707 : 154386588 : buf = GetBufferDescriptor(buffer - 1);
5708 : :
9864 vadim4o@yahoo.com 5709 [ + + ]: 154386588 : if (mode == BUFFER_LOCK_UNLOCK)
3655 rhaas@postgresql.org 5710 : 77690800 : LWLockRelease(BufferDescriptorGetContentLock(buf));
9864 vadim4o@yahoo.com 5711 [ + + ]: 76695788 : else if (mode == BUFFER_LOCK_SHARE)
3655 rhaas@postgresql.org 5712 : 53464254 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
9864 vadim4o@yahoo.com 5713 [ + - ]: 23231534 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
3655 rhaas@postgresql.org 5714 : 23231534 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5715 : : else
8182 tgl@sss.pgh.pa.us 5716 [ # # ]:UBC 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5717 : : }
5718 : :
5719 : : /*
5720 : : * Acquire the content_lock for the buffer, but only if we don't have to wait.
5721 : : *
5722 : : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5723 : : */
5724 : : bool
8165 tgl@sss.pgh.pa.us 5725 :CBC 1060884 : ConditionalLockBuffer(Buffer buffer)
5726 : : {
5727 : : BufferDesc *buf;
5728 : :
1976 pg@bowt.ie 5729 [ - + + + : 1060884 : Assert(BufferIsPinned(buffer));
- + ]
8165 tgl@sss.pgh.pa.us 5730 [ + + ]: 1060884 : if (BufferIsLocal(buffer))
5731 : 64682 : return true; /* act as though we got it */
5732 : :
3975 andres@anarazel.de 5733 : 996202 : buf = GetBufferDescriptor(buffer - 1);
5734 : :
3655 rhaas@postgresql.org 5735 : 996202 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
5736 : : LW_EXCLUSIVE);
5737 : : }
5738 : :
5739 : : /*
5740 : : * Verify that this backend is pinning the buffer exactly once.
5741 : : *
5742 : : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5743 : : * holds a pin on the buffer. We do not care whether some other backend does.
5744 : : */
5745 : : void
987 andres@anarazel.de 5746 : 4063498 : CheckBufferIsPinnedOnce(Buffer buffer)
5747 : : {
5748 [ + + ]: 4063498 : if (BufferIsLocal(buffer))
5749 : : {
5750 [ - + ]: 789 : if (LocalRefCount[-buffer - 1] != 1)
987 andres@anarazel.de 5751 [ # # ]:UBC 0 : elog(ERROR, "incorrect local pin count: %d",
5752 : : LocalRefCount[-buffer - 1]);
5753 : : }
5754 : : else
5755 : : {
987 andres@anarazel.de 5756 [ - + ]:CBC 4062709 : if (GetPrivateRefCount(buffer) != 1)
987 andres@anarazel.de 5757 [ # # ]:UBC 0 : elog(ERROR, "incorrect local pin count: %d",
5758 : : GetPrivateRefCount(buffer));
5759 : : }
987 andres@anarazel.de 5760 :CBC 4063498 : }
5761 : :
5762 : : /*
5763 : : * LockBufferForCleanup - lock a buffer in preparation for deleting items
5764 : : *
5765 : : * Items may be deleted from a disk page only when the caller (a) holds an
5766 : : * exclusive lock on the buffer and (b) has observed that no other backend
5767 : : * holds a pin on the buffer. If there is a pin, then the other backend
5768 : : * might have a pointer into the buffer (for example, a heapscan reference
5769 : : * to an item --- see README for more details). It's OK if a pin is added
5770 : : * after the cleanup starts, however; the newly-arrived backend will be
5771 : : * unable to look at the page until we release the exclusive lock.
5772 : : *
5773 : : * To implement this protocol, a would-be deleter must pin the buffer and
5774 : : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5775 : : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5776 : : * it has successfully observed pin count = 1.
5777 : : */
5778 : : void
8930 tgl@sss.pgh.pa.us 5779 : 23987 : LockBufferForCleanup(Buffer buffer)
5780 : : {
5781 : : BufferDesc *bufHdr;
1804 fujii@postgresql.org 5782 : 23987 : TimestampTz waitStart = 0;
1031 drowley@postgresql.o 5783 : 23987 : bool waiting = false;
1804 fujii@postgresql.org 5784 : 23987 : bool logged_recovery_conflict = false;
5785 : :
1976 pg@bowt.ie 5786 [ - + + + : 23987 : Assert(BufferIsPinned(buffer));
- + ]
7732 tgl@sss.pgh.pa.us 5787 [ - + ]: 23987 : Assert(PinCountWaitBuf == NULL);
5788 : :
987 andres@anarazel.de 5789 : 23987 : CheckBufferIsPinnedOnce(buffer);
5790 : :
5791 : : /*
5792 : : * We do not yet need to be worried about in-progress AIOs holding a pin,
5793 : : * as we, so far, only support doing reads via AIO and this function can
5794 : : * only be called once the buffer is valid (i.e. no read can be in
5795 : : * flight).
5796 : : */
5797 : :
5798 : : /* Nobody else to wait for */
8930 tgl@sss.pgh.pa.us 5799 [ + + ]: 23987 : if (BufferIsLocal(buffer))
5800 : 16 : return;
5801 : :
3975 andres@anarazel.de 5802 : 23971 : bufHdr = GetBufferDescriptor(buffer - 1);
5803 : :
5804 : : for (;;)
8930 tgl@sss.pgh.pa.us 5805 : 12 : {
5806 : : uint32 buf_state;
41 andres@anarazel.de 5807 :GNC 23983 : uint32 unset_bits = 0;
5808 : :
5809 : : /* Try to acquire lock */
8930 tgl@sss.pgh.pa.us 5810 :CBC 23983 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3538 andres@anarazel.de 5811 : 23983 : buf_state = LockBufHdr(bufHdr);
5812 : :
5813 [ - + ]: 23983 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5814 [ + + ]: 23983 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5815 : : {
5816 : : /* Successfully acquired exclusive lock with pincount 1 */
41 andres@anarazel.de 5817 :GNC 23971 : UnlockBufHdr(bufHdr);
5818 : :
5819 : : /*
5820 : : * Emit the log message if recovery conflict on buffer pin was
5821 : : * resolved but the startup process waited longer than
5822 : : * deadlock_timeout for it.
5823 : : */
1799 fujii@postgresql.org 5824 [ + + ]:CBC 23971 : if (logged_recovery_conflict)
5825 : 2 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5826 : : waitStart, GetCurrentTimestamp(),
5827 : : NULL, false);
5828 : :
1031 drowley@postgresql.o 5829 [ + + ]: 23971 : if (waiting)
5830 : : {
5831 : : /* reset ps display to remove the suffix if we added one */
5832 : 2 : set_ps_display_remove_suffix();
5833 : 2 : waiting = false;
5834 : : }
8930 tgl@sss.pgh.pa.us 5835 : 23971 : return;
5836 : : }
5837 : : /* Failed, so mark myself as waiting for pincount 1 */
3538 andres@anarazel.de 5838 [ - + ]: 12 : if (buf_state & BM_PIN_COUNT_WAITER)
5839 : : {
41 andres@anarazel.de 5840 :UNC 0 : UnlockBufHdr(bufHdr);
8930 tgl@sss.pgh.pa.us 5841 :UBC 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
8182 5842 [ # # ]: 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
5843 : : }
664 heikki.linnakangas@i 5844 :CBC 12 : bufHdr->wait_backend_pgprocno = MyProcNumber;
7732 tgl@sss.pgh.pa.us 5845 : 12 : PinCountWaitBuf = bufHdr;
41 andres@anarazel.de 5846 :GNC 12 : UnlockBufHdrExt(bufHdr, buf_state,
5847 : : BM_PIN_COUNT_WAITER, 0,
5848 : : 0);
8930 tgl@sss.pgh.pa.us 5849 :CBC 12 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5850 : :
5851 : : /* Wait to be signaled by UnpinBuffer() */
5807 simon@2ndQuadrant.co 5852 [ + + ]: 12 : if (InHotStandby)
5853 : : {
1031 drowley@postgresql.o 5854 [ + + ]: 8 : if (!waiting)
5855 : : {
5856 : : /* adjust the process title to indicate that it's waiting */
5857 : 2 : set_ps_display_suffix("waiting");
5858 : 2 : waiting = true;
5859 : : }
5860 : :
5861 : : /*
5862 : : * Emit the log message if the startup process is waiting longer
5863 : : * than deadlock_timeout for recovery conflict on buffer pin.
5864 : : *
5865 : : * Skip this if first time through because the startup process has
5866 : : * not started waiting yet in this case. So, the wait start
5867 : : * timestamp is set after this logic.
5868 : : */
1804 fujii@postgresql.org 5869 [ + + + + ]: 8 : if (waitStart != 0 && !logged_recovery_conflict)
5870 : : {
5871 : 3 : TimestampTz now = GetCurrentTimestamp();
5872 : :
5873 [ + + ]: 3 : if (TimestampDifferenceExceeds(waitStart, now,
5874 : : DeadlockTimeout))
5875 : : {
5876 : 2 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5877 : : waitStart, now, NULL, true);
5878 : 2 : logged_recovery_conflict = true;
5879 : : }
5880 : : }
5881 : :
5882 : : /*
5883 : : * Set the wait start timestamp if logging is enabled and first
5884 : : * time through.
5885 : : */
5886 [ + - + + ]: 8 : if (log_recovery_conflict_waits && waitStart == 0)
5887 : 2 : waitStart = GetCurrentTimestamp();
5888 : :
5889 : : /* Publish the bufid that Startup process waits on */
5807 simon@2ndQuadrant.co 5890 : 8 : SetStartupBufferPinWaitBufId(buffer - 1);
5891 : : /* Set alarm and then wait to be signaled by UnpinBuffer() */
5892 : 8 : ResolveRecoveryConflictWithBufferPin();
5893 : : /* Reset the published bufid */
5894 : 8 : SetStartupBufferPinWaitBufId(-1);
5895 : : }
5896 : : else
14 andres@anarazel.de 5897 :GNC 4 : ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
5898 : :
5899 : : /*
5900 : : * Remove flag marking us as waiter. Normally this will not be set
5901 : : * anymore, but ProcWaitForSignal() can return for other signals as
5902 : : * well. We take care to only reset the flag if we're the waiter, as
5903 : : * theoretically another backend could have started waiting. That's
5904 : : * impossible with the current usages due to table level locking, but
5905 : : * better be safe.
5906 : : */
3538 andres@anarazel.de 5907 :CBC 12 : buf_state = LockBufHdr(bufHdr);
5908 [ + + ]: 12 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
664 heikki.linnakangas@i 5909 [ + - ]: 6 : bufHdr->wait_backend_pgprocno == MyProcNumber)
41 andres@anarazel.de 5910 :GNC 6 : unset_bits |= BM_PIN_COUNT_WAITER;
5911 : :
5912 : 12 : UnlockBufHdrExt(bufHdr, buf_state,
5913 : : 0, unset_bits,
5914 : : 0);
5915 : :
7732 tgl@sss.pgh.pa.us 5916 :CBC 12 : PinCountWaitBuf = NULL;
5917 : : /* Loop back and try again */
5918 : : }
5919 : : }
5920 : :
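/*
 * A minimal sketch of the cleanup-lock protocol described above, with "rel"
 * and "blkno" assumed to come from the caller (as in vacuum-style code); the
 * function name is illustrative only.
 */
static void
cleanup_lock_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* takes a single pin */

	LockBufferForCleanup(buf);	/* waits until no other backend holds a pin */
	/* ... now safe to remove or compact items on the page ... */
	UnlockReleaseBuffer(buf);
}
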
5921 : : /*
5922 : : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5923 : : * requests cancellation of all pin holders that are blocking it.
5924 : : */
5925 : : bool
5807 simon@2ndQuadrant.co 5926 : 3 : HoldingBufferPinThatDelaysRecovery(void)
5927 : : {
5773 bruce@momjian.us 5928 : 3 : int bufid = GetStartupBufferPinWaitBufId();
5929 : :
5930 : : /*
5931 : : * If we get woken slowly then it's possible that the Startup process was
5932 : : * already woken by other backends before we got here. It's also possible
5933 : : * that we get here due to multiple interrupts or interrupts at inappropriate
5934 : : * times, so make sure we do nothing if the bufid is not set.
5935 : : */
5807 simon@2ndQuadrant.co 5936 [ + + ]: 3 : if (bufid < 0)
5937 : 1 : return false;
5938 : :
4127 andres@anarazel.de 5939 [ + - ]: 2 : if (GetPrivateRefCount(bufid + 1) > 0)
5807 simon@2ndQuadrant.co 5940 : 2 : return true;
5941 : :
5807 simon@2ndQuadrant.co 5942 :UBC 0 : return false;
5943 : : }
5944 : :
5945 : : /*
5946 : : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5947 : : *
5948 : : * We won't loop, but just check once to see if the pin count is OK. If
5949 : : * not, return false with no lock held.
5950 : : */
5951 : : bool
6663 tgl@sss.pgh.pa.us 5952 :CBC 117736 : ConditionalLockBufferForCleanup(Buffer buffer)
5953 : : {
5954 : : BufferDesc *bufHdr;
5955 : : uint32 buf_state,
5956 : : refcount;
5957 : :
5958 [ - + ]: 117736 : Assert(BufferIsValid(buffer));
5959 : :
5960 : : /* see AIO related comment in LockBufferForCleanup() */
5961 : :
5962 [ + + ]: 117736 : if (BufferIsLocal(buffer))
5963 : : {
3538 andres@anarazel.de 5964 : 804 : refcount = LocalRefCount[-buffer - 1];
5965 : : /* There should be exactly one pin */
5966 [ - + ]: 804 : Assert(refcount > 0);
5967 [ + + ]: 804 : if (refcount != 1)
6663 tgl@sss.pgh.pa.us 5968 : 21 : return false;
5969 : : /* Nobody else to wait for */
5970 : 783 : return true;
5971 : : }
5972 : :
5973 : : /* There should be exactly one local pin */
3538 andres@anarazel.de 5974 : 116932 : refcount = GetPrivateRefCount(buffer);
5975 [ - + ]: 116932 : Assert(refcount);
5976 [ + + ]: 116932 : if (refcount != 1)
6663 tgl@sss.pgh.pa.us 5977 : 280 : return false;
5978 : :
5979 : : /* Try to acquire lock */
5980 [ + + ]: 116652 : if (!ConditionalLockBuffer(buffer))
5981 : 65 : return false;
5982 : :
3975 andres@anarazel.de 5983 : 116587 : bufHdr = GetBufferDescriptor(buffer - 1);
3538 5984 : 116587 : buf_state = LockBufHdr(bufHdr);
5985 : 116587 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5986 : :
5987 [ - + ]: 116587 : Assert(refcount > 0);
5988 [ + + ]: 116587 : if (refcount == 1)
5989 : : {
5990 : : /* Successfully acquired exclusive lock with pincount 1 */
41 andres@anarazel.de 5991 :GNC 116351 : UnlockBufHdr(bufHdr);
6663 tgl@sss.pgh.pa.us 5992 :CBC 116351 : return true;
5993 : : }
5994 : :
5995 : : /* Failed, so release the lock */
41 andres@anarazel.de 5996 :GNC 236 : UnlockBufHdr(bufHdr);
6663 tgl@sss.pgh.pa.us 5997 :CBC 236 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5998 : 236 : return false;
5999 : : }
6000 : :
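/*
 * A minimal sketch of the opportunistic variant: try the cleanup lock and
 * skip the page instead of waiting. "buf" is assumed to be pinned exactly
 * once by the caller; the function name is illustrative only.
 */
static void
try_cleanup_example(Buffer buf)
{
	if (ConditionalLockBufferForCleanup(buf))
	{
		/* ... prune or defragment the page ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	/* else: leave the page for a later pass */
}
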
6001 : : /*
6002 : : * IsBufferCleanupOK - as above, but we already have the lock
6003 : : *
6004 : : * Check whether it's OK to perform cleanup on a buffer we've already
6005 : : * locked. If we observe that the pin count is 1, our exclusive lock
6006 : : * happens to be a cleanup lock, and we can proceed with anything that
6007 : : * would have been allowable had we sought a cleanup lock originally.
6008 : : */
6009 : : bool
3330 rhaas@postgresql.org 6010 : 2030 : IsBufferCleanupOK(Buffer buffer)
6011 : : {
6012 : : BufferDesc *bufHdr;
6013 : : uint32 buf_state;
6014 : :
6015 [ - + ]: 2030 : Assert(BufferIsValid(buffer));
6016 : :
6017 : : /* see AIO related comment in LockBufferForCleanup() */
6018 : :
6019 [ - + ]: 2030 : if (BufferIsLocal(buffer))
6020 : : {
6021 : : /* There should be exactly one pin */
3330 rhaas@postgresql.org 6022 [ # # ]:UBC 0 : if (LocalRefCount[-buffer - 1] != 1)
6023 : 0 : return false;
6024 : : /* Nobody else to wait for */
6025 : 0 : return true;
6026 : : }
6027 : :
6028 : : /* There should be exactly one local pin */
3330 rhaas@postgresql.org 6029 [ - + ]:CBC 2030 : if (GetPrivateRefCount(buffer) != 1)
3330 rhaas@postgresql.org 6030 :UBC 0 : return false;
6031 : :
3330 rhaas@postgresql.org 6032 :CBC 2030 : bufHdr = GetBufferDescriptor(buffer - 1);
6033 : :
6034 : : /* caller must hold exclusive lock on buffer */
70 andres@anarazel.de 6035 [ - + ]:GNC 2030 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
6036 : :
3330 rhaas@postgresql.org 6037 :CBC 2030 : buf_state = LockBufHdr(bufHdr);
6038 : :
6039 [ - + ]: 2030 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6040 [ + - ]: 2030 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
6041 : : {
6042 : : /* pincount is OK. */
41 andres@anarazel.de 6043 :GNC 2030 : UnlockBufHdr(bufHdr);
3330 rhaas@postgresql.org 6044 :CBC 2030 : return true;
6045 : : }
6046 : :
41 andres@anarazel.de 6047 :UNC 0 : UnlockBufHdr(bufHdr);
3330 rhaas@postgresql.org 6048 :UBC 0 : return false;
6049 : : }
6050 : :
6051 : :
6052 : : /*
6053 : : * Functions for buffer I/O handling
6054 : : *
6055 : : * Also note that these are used only for shared buffers, not local ones.
6056 : : */
6057 : :
6058 : : /*
6059 : : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
6060 : : */
6061 : : static void
3684 rhaas@postgresql.org 6062 :CBC 14251 : WaitIO(BufferDesc *buf)
6063 : : {
1742 tmunro@postgresql.or 6064 : 14251 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
6065 : :
6066 : 14251 : ConditionVariablePrepareToSleep(cv);
6067 : : for (;;)
7593 tgl@sss.pgh.pa.us 6068 : 2798 : {
6069 : : uint32 buf_state;
6070 : : PgAioWaitRef iow;
6071 : :
6072 : : /*
6073 : : * It may not be necessary to acquire the spinlock to check the flag
6074 : : * here, but since this test is essential for correctness, we'd better
6075 : : * play it safe.
6076 : : */
3538 andres@anarazel.de 6077 : 17049 : buf_state = LockBufHdr(buf);
6078 : :
6079 : : /*
6080 : : * Copy the wait reference while holding the spinlock. This protects
6081 : : * against a concurrent TerminateBufferIO() in another backend from
6082 : : * clearing the wref while it's being read.
6083 : : */
262 6084 : 17049 : iow = buf->io_wref;
41 andres@anarazel.de 6085 :GNC 17049 : UnlockBufHdr(buf);
6086 : :
6087 : : /* no IO in progress, we don't need to wait */
3538 andres@anarazel.de 6088 [ + + ]:CBC 17049 : if (!(buf_state & BM_IO_IN_PROGRESS))
7593 tgl@sss.pgh.pa.us 6089 : 14251 : break;
6090 : :
6091 : : /*
6092 : : * The buffer has asynchronous IO in progress, wait for it to
6093 : : * complete.
6094 : : */
262 andres@anarazel.de 6095 [ + + ]: 2798 : if (pgaio_wref_valid(&iow))
6096 : : {
6097 : 2532 : pgaio_wref_wait(&iow);
6098 : :
6099 : : /*
6100 : : * The AIO subsystem internally uses condition variables and thus
6101 : : * might remove this backend from the BufferDesc's CV. While that
6102 : : * wouldn't cause a correctness issue (the first CV sleep just
6103 : : * immediately returns if not already registered), it seems worth
6104 : : * avoiding unnecessary loop iterations, given that we take care
6105 : : * to do so at the start of the function.
6106 : : */
6107 : 2532 : ConditionVariablePrepareToSleep(cv);
6108 : 2532 : continue;
6109 : : }
6110 : :
6111 : : /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
1742 tmunro@postgresql.or 6112 : 266 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6113 : : }
6114 : 14251 : ConditionVariableCancelSleep();
7593 tgl@sss.pgh.pa.us 6115 : 14251 : }
6116 : :
6117 : : /*
6118 : : * StartBufferIO: begin I/O on this buffer
6119 : : * (Assumptions)
6120 : : * My process is executing no IO on this buffer
6121 : : * The buffer is Pinned
6122 : : *
6123 : : * In some scenarios multiple backends could attempt the same I/O operation
6124 : : * concurrently. If someone else has already started I/O on this buffer then
6125 : : * we will wait for completion of the IO using WaitIO().
6126 : : *
6127 : : * Input operations are only attempted on buffers that are not BM_VALID,
6128 : : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6129 : : * so we can always tell if the work is already done.
6130 : : *
6131 : : * Returns true if we successfully marked the buffer as I/O busy,
6132 : : * false if someone else already did the work.
6133 : : *
6134 : : * If nowait is true, then we don't wait for an I/O to be finished by another
6135 : : * backend. In that case, false indicates either that the I/O was already
6136 : : * finished or that it is still in progress. This is useful for callers that
6137 : : * want to find out if they can perform the I/O as part of a larger operation,
6138 : : * without waiting for the answer or distinguishing the reasons why not.
6139 : : */
6140 : : bool
623 tmunro@postgresql.or 6141 : 2555257 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6142 : : {
6143 : : uint32 buf_state;
6144 : :
770 heikki.linnakangas@i 6145 : 2555257 : ResourceOwnerEnlarge(CurrentResourceOwner);
6146 : :
6147 : : for (;;)
6148 : : {
3538 andres@anarazel.de 6149 : 2558071 : buf_state = LockBufHdr(buf);
6150 : :
6151 [ + + ]: 2558071 : if (!(buf_state & BM_IO_IN_PROGRESS))
7593 tgl@sss.pgh.pa.us 6152 : 2555251 : break;
41 andres@anarazel.de 6153 :GNC 2820 : UnlockBufHdr(buf);
623 tmunro@postgresql.or 6154 [ + + ]:CBC 2820 : if (nowait)
6155 : 6 : return false;
7593 tgl@sss.pgh.pa.us 6156 : 2814 : WaitIO(buf);
6157 : : }
6158 : :
6159 : : /* Once we get here, there is definitely no I/O active on this buffer */
6160 : :
6161 : : /* Check if someone else already did the I/O */
3538 andres@anarazel.de 6162 [ + + + + ]: 2555251 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6163 : : {
41 andres@anarazel.de 6164 :GNC 3009 : UnlockBufHdr(buf);
7593 tgl@sss.pgh.pa.us 6165 :CBC 3009 : return false;
6166 : : }
6167 : :
41 andres@anarazel.de 6168 :GNC 2552242 : UnlockBufHdrExt(buf, buf_state,
6169 : : BM_IO_IN_PROGRESS, 0,
6170 : : 0);
6171 : :
987 andres@anarazel.de 6172 :CBC 2552242 : ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6173 : : BufferDescriptorGetBuffer(buf));
6174 : :
7593 tgl@sss.pgh.pa.us 6175 : 2552242 : return true;
6176 : : }
6177 : :
6178 : : /*
6179 : : * TerminateBufferIO: release a buffer we were doing I/O on
6180 : : * (Assumptions)
6181 : : * My process is executing IO for the buffer
6182 : : * BM_IO_IN_PROGRESS bit is set for the buffer
6183 : : * The buffer is Pinned
6184 : : *
6185 : : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6186 : : * buffer's BM_DIRTY flag. This is appropriate when terminating a
6187 : : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6188 : : * marking the buffer clean if it was re-dirtied while we were writing.
6189 : : *
6190 : : * set_flag_bits gets ORed into the buffer's flags. It must include
6191 : : * BM_IO_ERROR in a failure case. For successful completion it could
6192 : : * be 0, or BM_VALID if we just finished reading in the page.
6193 : : *
6194 : : * If forget_owner is true, we release the buffer I/O from the current
6195 : : * resource owner. (forget_owner=false is used when the resource owner itself
6196 : : * is being released)
6197 : : */
6198 : : void
770 heikki.linnakangas@i 6199 : 2415302 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6200 : : bool forget_owner, bool release_aio)
6201 : : {
6202 : : uint32 buf_state;
41 andres@anarazel.de 6203 :GNC 2415302 : uint32 unset_flag_bits = 0;
6204 : 2415302 : int refcount_change = 0;
6205 : :
3538 andres@anarazel.de 6206 :CBC 2415302 : buf_state = LockBufHdr(buf);
6207 : :
6208 [ - + ]: 2415302 : Assert(buf_state & BM_IO_IN_PROGRESS);
41 andres@anarazel.de 6209 :GNC 2415302 : unset_flag_bits |= BM_IO_IN_PROGRESS;
6210 : :
6211 : : /* Clear earlier errors, if this IO failed, it'll be marked again */
6212 : 2415302 : unset_flag_bits |= BM_IO_ERROR;
6213 : :
3538 andres@anarazel.de 6214 [ + + + + ]:CBC 2415302 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
41 andres@anarazel.de 6215 :GNC 572973 : unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
6216 : :
262 andres@anarazel.de 6217 [ + + ]:CBC 2415302 : if (release_aio)
6218 : : {
6219 : : /* release ownership by the AIO subsystem */
6220 [ - + ]: 1341171 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
41 andres@anarazel.de 6221 :GNC 1341171 : refcount_change = -1;
262 andres@anarazel.de 6222 :CBC 1341171 : pgaio_wref_clear(&buf->io_wref);
6223 : : }
6224 : :
41 andres@anarazel.de 6225 :GNC 2415302 : buf_state = UnlockBufHdrExt(buf, buf_state,
6226 : : set_flag_bits, unset_flag_bits,
6227 : : refcount_change);
6228 : :
770 heikki.linnakangas@i 6229 [ + + ]:CBC 2415302 : if (forget_owner)
6230 : 1074107 : ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6231 : : BufferDescriptorGetBuffer(buf));
6232 : :
1742 tmunro@postgresql.or 6233 : 2415302 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6234 : :
6235 : : /*
6236 : : * Support LockBufferForCleanup()
6237 : : *
6238 : : * We may have just released the last pin other than the waiter's. In most
6239 : : * cases, this backend holds another pin on the buffer. But, if, for
6240 : : * example, this backend is completing an IO issued by another backend, it
6241 : : * may be time to wake the waiter.
6242 : : */
262 andres@anarazel.de 6243 [ + + - + ]: 2415302 : if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
262 andres@anarazel.de 6244 :UBC 0 : WakePinCountWaiter(buf);
9466 inoue@tpf.co.jp 6245 :CBC 2415302 : }
6246 : :
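/*
 * A minimal sketch of the I/O handshake implemented by StartBufferIO() and
 * TerminateBufferIO() for a synchronous write; the actual smgr write is left
 * abstract, error handling is omitted, and the function name is illustrative
 * only.
 */
static void
write_buffer_example(BufferDesc *buf)
{
	if (!StartBufferIO(buf, false, false))
		return;					/* someone else already wrote it out */

	/* ... write the buffer's page out via the smgr layer here ... */

	/* success: clear BM_DIRTY (unless re-dirtied) and end the I/O */
	TerminateBufferIO(buf, true, 0, true, false);
}
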
6247 : : /*
6248 : : * AbortBufferIO: Clean up active buffer I/O after an error.
6249 : : *
6250 : : * All LWLocks we might have held have been released,
6251 : : * but we haven't yet released buffer pins, so the buffer is still pinned.
6252 : : *
6253 : : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6254 : : * possible the error condition wasn't related to the I/O.
6255 : : *
6256 : : * Note: this does not remove the buffer I/O from the resource owner.
6257 : : * That's correct when we're releasing the whole resource owner, but
6258 : : * beware if you use this in other contexts.
6259 : : */
6260 : : static void
979 pg@bowt.ie 6261 : 15 : AbortBufferIO(Buffer buffer)
6262 : : {
6263 : 15 : BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6264 : : uint32 buf_state;
6265 : :
987 andres@anarazel.de 6266 : 15 : buf_state = LockBufHdr(buf_hdr);
6267 [ - + ]: 15 : Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6268 : :
6269 [ + - ]: 15 : if (!(buf_state & BM_VALID))
6270 : : {
6271 [ - + ]: 15 : Assert(!(buf_state & BM_DIRTY));
41 andres@anarazel.de 6272 :GNC 15 : UnlockBufHdr(buf_hdr);
6273 : : }
6274 : : else
6275 : : {
985 andres@anarazel.de 6276 [ # # ]:UBC 0 : Assert(buf_state & BM_DIRTY);
41 andres@anarazel.de 6277 :UNC 0 : UnlockBufHdr(buf_hdr);
6278 : :
6279 : : /* Issue notice if this is not the first failure... */
987 andres@anarazel.de 6280 [ # # ]:UBC 0 : if (buf_state & BM_IO_ERROR)
6281 : : {
6282 : : /* Buffer is pinned, so we can read tag without spinlock */
6283 [ # # ]: 0 : ereport(WARNING,
6284 : : (errcode(ERRCODE_IO_ERROR),
6285 : : errmsg("could not write block %u of %s",
6286 : : buf_hdr->tag.blockNum,
6287 : : relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
6288 : : BufTagGetForkNum(&buf_hdr->tag)).str),
6289 : : errdetail("Multiple failures --- write error might be permanent.")));
6290 : : }
6291 : : }
6292 : :
262 andres@anarazel.de 6293 :CBC 15 : TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
9466 inoue@tpf.co.jp 6294 : 15 : }
6295 : :
6296 : : /*
6297 : : * Error context callback for errors occurring during shared buffer writes.
6298 : : */
6299 : : static void
5605 rhaas@postgresql.org 6300 : 35 : shared_buffer_write_error_callback(void *arg)
6301 : : {
3684 6302 : 35 : BufferDesc *bufHdr = (BufferDesc *) arg;
6303 : :
6304 : : /* Buffer is pinned, so we can read the tag without locking the spinlock */
8257 tgl@sss.pgh.pa.us 6305 [ + - ]: 35 : if (bufHdr != NULL)
113 peter@eisentraut.org 6306 : 70 : errcontext("writing block %u of relation \"%s\"",
6307 : : bufHdr->tag.blockNum,
295 andres@anarazel.de 6308 : 35 : relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6309 : : BufTagGetForkNum(&bufHdr->tag)).str);
5605 rhaas@postgresql.org 6310 : 35 : }
6311 : :
6312 : : /*
6313 : : * Error context callback for errors occurring during local buffer writes.
6314 : : */
6315 : : static void
5605 rhaas@postgresql.org 6316 :UBC 0 : local_buffer_write_error_callback(void *arg)
6317 : : {
3684 6318 : 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
6319 : :
5605 6320 [ # # ]: 0 : if (bufHdr != NULL)
113 peter@eisentraut.org 6321 : 0 : errcontext("writing block %u of relation \"%s\"",
6322 : : bufHdr->tag.blockNum,
295 andres@anarazel.de 6323 : 0 : relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
6324 : : MyProcNumber,
6325 : : BufTagGetForkNum(&bufHdr->tag)).str);
8257 tgl@sss.pgh.pa.us 6326 : 0 : }
6327 : :
6328 : : /*
6329 : : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6330 : : */
6331 : : static int
1260 rhaas@postgresql.org 6332 :CBC 9708751 : rlocator_comparator(const void *p1, const void *p2)
6333 : : {
6334 : 9708751 : RelFileLocator n1 = *(const RelFileLocator *) p1;
6335 : 9708751 : RelFileLocator n2 = *(const RelFileLocator *) p2;
6336 : :
6337 [ + + ]: 9708751 : if (n1.relNumber < n2.relNumber)
4717 alvherre@alvh.no-ip. 6338 : 9670537 : return -1;
1260 rhaas@postgresql.org 6339 [ + + ]: 38214 : else if (n1.relNumber > n2.relNumber)
4717 alvherre@alvh.no-ip. 6340 : 36432 : return 1;
6341 : :
1260 rhaas@postgresql.org 6342 [ - + ]: 1782 : if (n1.dbOid < n2.dbOid)
4717 alvherre@alvh.no-ip. 6343 :UBC 0 : return -1;
1260 rhaas@postgresql.org 6344 [ - + ]:CBC 1782 : else if (n1.dbOid > n2.dbOid)
4717 alvherre@alvh.no-ip. 6345 :UBC 0 : return 1;
6346 : :
1260 rhaas@postgresql.org 6347 [ - + ]:CBC 1782 : if (n1.spcOid < n2.spcOid)
4717 alvherre@alvh.no-ip. 6348 :UBC 0 : return -1;
1260 rhaas@postgresql.org 6349 [ - + ]:CBC 1782 : else if (n1.spcOid > n2.spcOid)
4717 alvherre@alvh.no-ip. 6350 :UBC 0 : return 1;
6351 : : else
4717 alvherre@alvh.no-ip. 6352 :CBC 1782 : return 0;
6353 : : }
6354 : :
6355 : : /*
6356 : : * Lock buffer header - set BM_LOCKED in buffer state.
6357 : : */
6358 : : uint32
3538 andres@anarazel.de 6359 : 30970328 : LockBufHdr(BufferDesc *desc)
6360 : : {
6361 : : SpinDelayStatus delayStatus;
6362 : : uint32 old_buf_state;
6363 : :
987 6364 [ - + ]: 30970328 : Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6365 : :
3534 6366 : 30970328 : init_local_spin_delay(&delayStatus);
6367 : :
6368 : : while (true)
6369 : : {
6370 : : /* set BM_LOCKED flag */
3538 6371 : 30987665 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6372 : : /* if it wasn't set before we're OK */
6373 [ + + ]: 30987665 : if (!(old_buf_state & BM_LOCKED))
6374 : 30970328 : break;
6375 : 17337 : perform_spin_delay(&delayStatus);
6376 : : }
6377 : 30970328 : finish_spin_delay(&delayStatus);
6378 : 30970328 : return old_buf_state | BM_LOCKED;
6379 : : }
6380 : :
6381 : : /*
6382 : : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6383 : : * state at that point.
6384 : : *
6385 : : * Obviously the buffer could be locked by the time the value is returned, so
6386 : : * this is primarily useful in CAS style loops.
6387 : : */
6388 : : pg_noinline uint32
6389 : 278 : WaitBufHdrUnlocked(BufferDesc *buf)
6390 : : {
6391 : : SpinDelayStatus delayStatus;
6392 : : uint32 buf_state;
6393 : :
3534 6394 : 278 : init_local_spin_delay(&delayStatus);
6395 : :
3538 6396 : 278 : buf_state = pg_atomic_read_u32(&buf->state);
6397 : :
6398 [ + + ]: 52616 : while (buf_state & BM_LOCKED)
6399 : : {
6400 : 52338 : perform_spin_delay(&delayStatus);
6401 : 52338 : buf_state = pg_atomic_read_u32(&buf->state);
6402 : : }
6403 : :
6404 : 278 : finish_spin_delay(&delayStatus);
6405 : :
6406 : 278 : return buf_state;
6407 : : }
6408 : :
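/*
 * A minimal sketch of the CAS-style loop WaitBufHdrUnlocked() is meant for;
 * the flag being set (BM_JUST_DIRTIED) and the function name are
 * illustrative only.
 */
static void
set_flag_cas_example(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		uint32		new_buf_state;

		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		new_buf_state = old_buf_state | BM_JUST_DIRTIED;

		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   new_buf_state))
			break;				/* old_buf_state is refreshed on failure */
	}
}
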
6409 : : /*
6410 : : * BufferTag comparator.
6411 : : */
6412 : : static inline int
1741 tmunro@postgresql.or 6413 :UBC 0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
6414 : : {
6415 : : int ret;
6416 : : RelFileLocator rlocatora;
6417 : : RelFileLocator rlocatorb;
6418 : :
1211 rhaas@postgresql.org 6419 : 0 : rlocatora = BufTagGetRelFileLocator(ba);
6420 : 0 : rlocatorb = BufTagGetRelFileLocator(bb);
6421 : :
6422 : 0 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
6423 : :
3589 andres@anarazel.de 6424 [ # # ]: 0 : if (ret != 0)
6425 : 0 : return ret;
6426 : :
1211 rhaas@postgresql.org 6427 [ # # ]: 0 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
3589 andres@anarazel.de 6428 : 0 : return -1;
1211 rhaas@postgresql.org 6429 [ # # ]: 0 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
3589 andres@anarazel.de 6430 : 0 : return 1;
6431 : :
6432 [ # # ]: 0 : if (ba->blockNum < bb->blockNum)
6433 : 0 : return -1;
6434 [ # # ]: 0 : if (ba->blockNum > bb->blockNum)
6435 : 0 : return 1;
6436 : :
6437 : 0 : return 0;
6438 : : }
6439 : :
6440 : : /*
6441 : : * Comparator determining the writeout order in a checkpoint.
6442 : : *
6443 : : * It is important that tablespaces are compared first; the logic balancing
6444 : : * writes between tablespaces relies on it.
6445 : : */
6446 : : static inline int
1741 tmunro@postgresql.or 6447 :CBC 3018221 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
6448 : : {
6449 : : /* compare tablespace */
3589 andres@anarazel.de 6450 [ + + ]: 3018221 : if (a->tsId < b->tsId)
6451 : 7131 : return -1;
6452 [ + + ]: 3011090 : else if (a->tsId > b->tsId)
6453 : 26762 : return 1;
6454 : : /* compare relation */
1260 rhaas@postgresql.org 6455 [ + + ]: 2984328 : if (a->relNumber < b->relNumber)
3589 andres@anarazel.de 6456 : 854867 : return -1;
1260 rhaas@postgresql.org 6457 [ + + ]: 2129461 : else if (a->relNumber > b->relNumber)
3589 andres@anarazel.de 6458 : 821716 : return 1;
6459 : : /* compare fork */
6460 [ + + ]: 1307745 : else if (a->forkNum < b->forkNum)
6461 : 58276 : return -1;
6462 [ + + ]: 1249469 : else if (a->forkNum > b->forkNum)
6463 : 59511 : return 1;
6464 : : /* compare block number */
6465 [ + + ]: 1189958 : else if (a->blockNum < b->blockNum)
6466 : 580303 : return -1;
2898 tgl@sss.pgh.pa.us 6467 [ + + ]: 609655 : else if (a->blockNum > b->blockNum)
3589 andres@anarazel.de 6468 : 572289 : return 1;
6469 : : /* equal page IDs are unlikely, but not impossible */
2898 tgl@sss.pgh.pa.us 6470 : 37366 : return 0;
6471 : : }
6472 : :
6473 : : /*
6474 : : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6475 : : * progress.
6476 : : */
6477 : : static int
3589 andres@anarazel.de 6478 : 245276 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
6479 : : {
131 peter@eisentraut.org 6480 :GNC 245276 : CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
6481 : 245276 : CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
6482 : :
6483 : : /* we want a min-heap, so return 1 when a < b */
3589 andres@anarazel.de 6484 [ + + ]:CBC 245276 : if (sa->progress < sb->progress)
6485 : 218911 : return 1;
6486 [ + + ]: 26365 : else if (sa->progress == sb->progress)
6487 : 1006 : return 0;
6488 : : else
6489 : 25359 : return -1;
6490 : : }
6491 : :
6492 : : /*
6493 : : * Initialize a writeback context, discarding potential previous state.
6494 : : *
6495 : : * *max_pending is a pointer instead of an immediate value, so the coalesce
6496 : : * limits can easily be changed by the GUC mechanism, and so calling code does
6497 : : * not have to check the current configuration. A value of 0 means that no
6498 : : * writeback control will be performed.
6499 : : */
6500 : : void
6501 : 2658 : WritebackContextInit(WritebackContext *context, int *max_pending)
6502 : : {
6503 [ - + ]: 2658 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6504 : :
6505 : 2658 : context->max_pending = max_pending;
6506 : 2658 : context->nr_pending = 0;
6507 : 2658 : }
6508 : :
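/*
 * A minimal sketch of initializing writeback control the way checkpoint code
 * does, pointing max_pending at the checkpoint_flush_after GUC so later
 * configuration changes take effect without re-initialization; the caller is
 * assumed to provide the WritebackContext storage, and the function name is
 * illustrative only.
 */
static void
writeback_init_example(WritebackContext *wb_context)
{
	WritebackContextInit(wb_context, &checkpoint_flush_after);
}
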
6509 : : /*
6510 : : * Add buffer to list of pending writeback requests.
6511 : : */
6512 : : void
945 6513 : 567485 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
6514 : : BufferTag *tag)
6515 : : {
6516 : : PendingWriteback *pending;
6517 : :
6518 : : /*
6519 : : * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6520 : : * point in tracking in that case.
6521 : : */
435 6522 [ + + ]: 567485 : if (io_direct_flags & IO_DIRECT_DATA ||
6523 [ + + ]: 566967 : !enableFsync)
984 tmunro@postgresql.or 6524 : 567481 : return;
6525 : :
6526 : : /*
6527 : : * Add buffer to the pending writeback array, unless writeback control is
6528 : : * disabled.
6529 : : */
945 andres@anarazel.de 6530 [ - + ]: 4 : if (*wb_context->max_pending > 0)
6531 : : {
945 andres@anarazel.de 6532 [ # # ]:UBC 0 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6533 : :
6534 : 0 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6535 : :
3589 6536 : 0 : pending->tag = *tag;
6537 : : }
6538 : :
6539 : : /*
6540 : : * Perform pending flushes if the writeback limit is exceeded. This
6541 : : * includes the case where previously an item has been added, but control
6542 : : * is now disabled.
6543 : : */
945 andres@anarazel.de 6544 [ + - ]:CBC 4 : if (wb_context->nr_pending >= *wb_context->max_pending)
6545 : 4 : IssuePendingWritebacks(wb_context, io_context);
6546 : : }
6547 : :
6548 : : #define ST_SORT sort_pending_writebacks
6549 : : #define ST_ELEMENT_TYPE PendingWriteback
6550 : : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6551 : : #define ST_SCOPE static
6552 : : #define ST_DEFINE
6553 : : #include "lib/sort_template.h"
6554 : :
6555 : : /*
6556 : : * Issue all pending writeback requests, previously scheduled with
6557 : : * ScheduleBufferTagForWriteback, to the OS.
6558 : : *
6559 : : * Because this is only used to improve the OS's IO scheduling, we try to never
6560 : : * error out - it's just a hint.
6561 : : */
6562 : : void
6563 : 1072 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
6564 : : {
6565 : : instr_time io_start;
6566 : : int i;
6567 : :
6568 [ + - ]: 1072 : if (wb_context->nr_pending == 0)
3589 6569 : 1072 : return;
6570 : :
6571 : : /*
6572 : : * Executing the writes in-order can make them a lot faster, and allows
6573 : : * merging writeback requests to consecutive blocks into larger writebacks.
6574 : : */
945 andres@anarazel.de 6575 :UBC 0 : sort_pending_writebacks(wb_context->pending_writebacks,
6576 : 0 : wb_context->nr_pending);
6577 : :
294 michael@paquier.xyz 6578 : 0 : io_start = pgstat_prepare_io_time(track_io_timing);
6579 : :
6580 : : /*
6581 : : * Coalesce neighbouring writes, but nothing else. For that we iterate
6582 : : * through the now-sorted array of pending flushes, and look forward to
6583 : : * find all neighbouring (or identical) writes.
6584 : : */
945 andres@anarazel.de 6585 [ # # ]: 0 : for (i = 0; i < wb_context->nr_pending; i++)
6586 : : {
6587 : : PendingWriteback *cur;
6588 : : PendingWriteback *next;
6589 : : SMgrRelation reln;
6590 : : int ahead;
6591 : : BufferTag tag;
6592 : : RelFileLocator currlocator;
3589 6593 : 0 : Size nblocks = 1;
6594 : :
945 6595 : 0 : cur = &wb_context->pending_writebacks[i];
3589 6596 : 0 : tag = cur->tag;
1211 rhaas@postgresql.org 6597 : 0 : currlocator = BufTagGetRelFileLocator(&tag);
6598 : :
6599 : : /*
6600 : : * Peek ahead, into following writeback requests, to see if they can
6601 : : * be combined with the current one.
6602 : : */
945 andres@anarazel.de 6603 [ # # ]: 0 : for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6604 : : {
6605 : :
6606 : 0 : next = &wb_context->pending_writebacks[i + ahead + 1];
6607 : :
6608 : : /* different file, stop */
1211 rhaas@postgresql.org 6609 [ # # # # : 0 : if (!RelFileLocatorEquals(currlocator,
# # ]
6610 [ # # ]: 0 : BufTagGetRelFileLocator(&next->tag)) ||
6611 : 0 : BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6612 : : break;
6613 : :
6614 : : /* ok, block queued twice, skip */
3589 andres@anarazel.de 6615 [ # # ]: 0 : if (cur->tag.blockNum == next->tag.blockNum)
6616 : 0 : continue;
6617 : :
6618 : : /* only merge consecutive writes */
6619 [ # # ]: 0 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
6620 : 0 : break;
6621 : :
6622 : 0 : nblocks++;
6623 : 0 : cur = next;
6624 : : }
6625 : :
6626 : 0 : i += ahead;
6627 : :
6628 : : /* and finally tell the kernel to write the data to storage */
654 heikki.linnakangas@i 6629 : 0 : reln = smgropen(currlocator, INVALID_PROC_NUMBER);
1211 rhaas@postgresql.org 6630 : 0 : smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6631 : : }
6632 : :
6633 : : /*
6634 : : * Assume that writeback requests are only issued for buffers containing
6635 : : * blocks of permanent relations.
6636 : : */
945 andres@anarazel.de 6637 : 0 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
337 michael@paquier.xyz 6638 : 0 : IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6639 : :
945 andres@anarazel.de 6640 : 0 : wb_context->nr_pending = 0;
6641 : : }
6642 : :
6643 : : /* ResourceOwner callbacks */
6644 : :
6645 : : static void
770 heikki.linnakangas@i 6646 :CBC 15 : ResOwnerReleaseBufferIO(Datum res)
6647 : : {
6648 : 15 : Buffer buffer = DatumGetInt32(res);
6649 : :
6650 : 15 : AbortBufferIO(buffer);
6651 : 15 : }
6652 : :
6653 : : static char *
770 heikki.linnakangas@i 6654 :UBC 0 : ResOwnerPrintBufferIO(Datum res)
6655 : : {
6656 : 0 : Buffer buffer = DatumGetInt32(res);
6657 : :
6658 : 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
6659 : : }
6660 : :
6661 : : static void
770 heikki.linnakangas@i 6662 :CBC 7602 : ResOwnerReleaseBufferPin(Datum res)
6663 : : {
6664 : 7602 : Buffer buffer = DatumGetInt32(res);
6665 : :
6666 : : /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6667 [ - + ]: 7602 : if (!BufferIsValid(buffer))
770 heikki.linnakangas@i 6668 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
6669 : :
770 heikki.linnakangas@i 6670 [ + + ]:CBC 7602 : if (BufferIsLocal(buffer))
6671 : 3038 : UnpinLocalBufferNoOwner(buffer);
6672 : : else
6673 : 4564 : UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6674 : 7602 : }
6675 : :
6676 : : static char *
770 heikki.linnakangas@i 6677 :UBC 0 : ResOwnerPrintBufferPin(Datum res)
6678 : : {
6679 : 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
6680 : : }
6681 : :
6682 : : /*
6683 : : * Helper function to evict an unpinned buffer whose buffer header lock is
6684 : : * already acquired.
6685 : : */
6686 : : static bool
253 andres@anarazel.de 6687 :CBC 2188 : EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6688 : : {
6689 : : uint32 buf_state;
6690 : : bool result;
6691 : :
6692 : 2188 : *buffer_flushed = false;
6693 : :
6694 : 2188 : buf_state = pg_atomic_read_u32(&(desc->state));
6695 [ - + ]: 2188 : Assert(buf_state & BM_LOCKED);
6696 : :
619 tmunro@postgresql.or 6697 [ - + ]: 2188 : if ((buf_state & BM_VALID) == 0)
6698 : : {
41 andres@anarazel.de 6699 :UNC 0 : UnlockBufHdr(desc);
619 tmunro@postgresql.or 6700 :UBC 0 : return false;
6701 : : }
6702 : :
6703 : : /* Check that it's not pinned already. */
619 tmunro@postgresql.or 6704 [ - + ]:CBC 2188 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6705 : : {
41 andres@anarazel.de 6706 :UNC 0 : UnlockBufHdr(desc);
619 tmunro@postgresql.or 6707 :UBC 0 : return false;
6708 : : }
6709 : :
619 tmunro@postgresql.or 6710 :CBC 2188 : PinBuffer_Locked(desc); /* releases spinlock */
6711 : :
6712 : : /* If it was dirty, try to clean it once. */
6713 [ + + ]: 2188 : if (buf_state & BM_DIRTY)
6714 : : {
70 andres@anarazel.de 6715 :GNC 1002 : FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
253 andres@anarazel.de 6716 :CBC 1002 : *buffer_flushed = true;
6717 : : }
6718 : :
6719 : : /* This will return false if it becomes dirty or someone else pins it. */
619 tmunro@postgresql.or 6720 : 2188 : result = InvalidateVictimBuffer(desc);
6721 : :
6722 : 2188 : UnpinBuffer(desc);
6723 : :
6724 : 2188 : return result;
6725 : : }
6726 : :
6727 : : /*
6728 : : * Try to evict the current block in a shared buffer.
6729 : : *
6730 : : * This function is intended for testing/development use only!
6731 : : *
6732 : : * To succeed, the buffer must not be pinned on entry, so if the caller had a
6733 : : * particular block in mind, it might already have been replaced by some other
6734 : : * block by the time this function runs. It's also unpinned on return, so the
6735 : : * buffer might be occupied again by the time control is returned, potentially
6736 : : * even by the same block. This inherent raciness without other interlocking
6737 : : * makes the function unsuitable for non-testing usage.
6738 : : *
6739 : : * *buffer_flushed is set to true if the buffer was dirty and has been
6740 : : * flushed, false otherwise. However, *buffer_flushed=true does not
6741 : : * necessarily mean that we flushed the buffer; it could have been flushed by
6742 : : * someone else.
6743 : : *
6744 : : * Returns true if the buffer was valid and it has now been made invalid.
6745 : : * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6746 : : * or if the buffer becomes dirty again while we're trying to write it out.
6747 : : */
6748 : : bool
253 andres@anarazel.de 6749 : 145 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6750 : : {
6751 : : BufferDesc *desc;
6752 : :
6753 [ + - - + ]: 145 : Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
6754 : :
6755 : : /* Make sure we can pin the buffer. */
6756 : 145 : ResourceOwnerEnlarge(CurrentResourceOwner);
6757 : 145 : ReservePrivateRefCountEntry();
6758 : :
6759 : 145 : desc = GetBufferDescriptor(buf - 1);
6760 : 145 : LockBufHdr(desc);
6761 : :
6762 : 145 : return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6763 : : }
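/*
 * Illustrative sketch (not part of bufmgr.c): a minimal SQL-callable wrapper
 * around EvictUnpinnedBuffer(), assuming it is built as an extension module.
 * The function name evict_buffer_demo and its CREATE FUNCTION declaration
 * (taking an int, returning bool) are hypothetical; only the range check and
 * the forwarded call are shown, since EvictUnpinnedBuffer() is intended for
 * testing/development use only.
 */
#include "postgres.h"

#include "fmgr.h"
#include "storage/bufmgr.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(evict_buffer_demo);

Datum
evict_buffer_demo(PG_FUNCTION_ARGS)
{
	Buffer		buf = PG_GETARG_INT32(0);
	bool		flushed;

	/* local (negative) and out-of-range buffer IDs are rejected here */
	if (buf < 1 || buf > NBuffers)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("bad shared buffer ID: %d", buf)));

	/* true if the buffer was valid and has now been evicted */
	PG_RETURN_BOOL(EvictUnpinnedBuffer(buf, &flushed));
}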
6764 : :
6765 : : /*
6766 : : * Try to evict all the shared buffers.
6767 : : *
6768 : : * This function is intended for testing/development use only! See
6769 : : * EvictUnpinnedBuffer().
6770 : : *
6771 : : * The buffers_* parameters are mandatory and indicate the total count of
6772 : : * buffers that:
6773 : : * - buffers_evicted - were evicted
6774 : : * - buffers_flushed - were flushed
6775 : : * - buffers_skipped - could not be evicted
6776 : : */
6777 : : void
6778 : 1 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6779 : : int32 *buffers_skipped)
6780 : : {
6781 : 1 : *buffers_evicted = 0;
6782 : 1 : *buffers_skipped = 0;
6783 : 1 : *buffers_flushed = 0;
6784 : :
6785 [ + + ]: 16385 : for (int buf = 1; buf <= NBuffers; buf++)
6786 : : {
6787 : 16384 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
6788 : : uint32 buf_state;
6789 : : bool buffer_flushed;
6790 : :
43 msawada@postgresql.o 6791 [ - + ]: 16384 : CHECK_FOR_INTERRUPTS();
6792 : :
253 andres@anarazel.de 6793 : 16384 : buf_state = pg_atomic_read_u32(&desc->state);
6794 [ + + ]: 16384 : if (!(buf_state & BM_VALID))
6795 : 14341 : continue;
6796 : :
6797 : 2043 : ResourceOwnerEnlarge(CurrentResourceOwner);
6798 : 2043 : ReservePrivateRefCountEntry();
6799 : :
6800 : 2043 : LockBufHdr(desc);
6801 : :
6802 [ + - ]: 2043 : if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6803 : 2043 : (*buffers_evicted)++;
6804 : : else
253 andres@anarazel.de 6805 :UBC 0 : (*buffers_skipped)++;
6806 : :
253 andres@anarazel.de 6807 [ + + ]:CBC 2043 : if (buffer_flushed)
6808 : 974 : (*buffers_flushed)++;
6809 : : }
6810 : 1 : }
6811 : :
6812 : : /*
6813 : : * Try to evict all the shared buffers containing provided relation's pages.
6814 : : *
6815 : : * This function is intended for testing/development use only! See
6816 : : * EvictUnpinnedBuffer().
6817 : : *
6818 : : * The caller must hold at least AccessShareLock on the relation to prevent
6819 : : * the relation from being dropped.
6820 : : *
6821 : : * The buffers_* parameters are mandatory and indicate the total count of
6822 : : * buffers that:
6823 : : * - buffers_evicted - were evicted
6824 : : * - buffers_flushed - were flushed
6825 : : * - buffers_skipped - could not be evicted
6826 : : */
6827 : : void
6828 : 1 : EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
6829 : : int32 *buffers_flushed, int32 *buffers_skipped)
6830 : : {
6831 [ - + ]: 1 : Assert(!RelationUsesLocalBuffers(rel));
6832 : :
6833 : 1 : *buffers_skipped = 0;
6834 : 1 : *buffers_evicted = 0;
6835 : 1 : *buffers_flushed = 0;
6836 : :
6837 [ + + ]: 16385 : for (int buf = 1; buf <= NBuffers; buf++)
6838 : : {
6839 : 16384 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
6840 : 16384 : uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6841 : : bool buffer_flushed;
6842 : :
43 msawada@postgresql.o 6843 [ - + ]: 16384 : CHECK_FOR_INTERRUPTS();
6844 : :
6845 : : /* An unlocked precheck should be safe and saves some cycles. */
253 andres@anarazel.de 6846 [ + + ]: 16384 : if ((buf_state & BM_VALID) == 0 ||
6847 [ + - ]: 27 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6848 : 16384 : continue;
6849 : :
6850 : : /* Make sure we can pin the buffer. */
253 andres@anarazel.de 6851 :UBC 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
6852 : 0 : ReservePrivateRefCountEntry();
6853 : :
6854 : 0 : buf_state = LockBufHdr(desc);
6855 : :
6856 : : /* recheck, could have changed without the lock */
6857 [ # # ]: 0 : if ((buf_state & BM_VALID) == 0 ||
6858 [ # # ]: 0 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6859 : : {
41 andres@anarazel.de 6860 :UNC 0 : UnlockBufHdr(desc);
253 andres@anarazel.de 6861 :UBC 0 : continue;
6862 : : }
6863 : :
6864 [ # # ]: 0 : if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6865 : 0 : (*buffers_evicted)++;
6866 : : else
6867 : 0 : (*buffers_skipped)++;
6868 : :
6869 [ # # ]: 0 : if (buffer_flushed)
6870 : 0 : (*buffers_flushed)++;
6871 : : }
253 andres@anarazel.de 6872 :CBC 1 : }
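/*
 * Illustrative sketch (not part of bufmgr.c): calling
 * EvictRelUnpinnedBuffers() for a relation identified by OID. As required by
 * the comment above, the relation is kept open with at least AccessShareLock
 * for the duration of the call. The helper name evict_rel_buffers_demo is
 * hypothetical, and the DEBUG1 log line is only for demonstration.
 */
#include "postgres.h"

#include "access/relation.h"
#include "storage/bufmgr.h"
#include "storage/lockdefs.h"
#include "utils/rel.h"

static void
evict_rel_buffers_demo(Oid relid)
{
	Relation	rel = relation_open(relid, AccessShareLock);
	int32		evicted = 0,
				flushed = 0,
				skipped = 0;

	/* only shared-buffer relations are supported (no temp tables) */
	if (!RelationUsesLocalBuffers(rel))
		EvictRelUnpinnedBuffers(rel, &evicted, &flushed, &skipped);

	relation_close(rel, AccessShareLock);

	elog(DEBUG1, "evicted=%d flushed=%d skipped=%d",
		 evicted, flushed, skipped);
}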
6873 : :
6874 : : /*
6875 : : * Helper function to mark an unpinned buffer dirty when its buffer header
6876 : : * lock has already been acquired.
6877 : : */
6878 : : static bool
19 michael@paquier.xyz 6879 :GNC 36 : MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
6880 : : bool *buffer_already_dirty)
6881 : : {
6882 : : uint32 buf_state;
6883 : 36 : bool result = false;
6884 : :
6885 : 36 : *buffer_already_dirty = false;
6886 : :
6887 : 36 : buf_state = pg_atomic_read_u32(&(desc->state));
6888 [ - + ]: 36 : Assert(buf_state & BM_LOCKED);
6889 : :
6890 [ + + ]: 36 : if ((buf_state & BM_VALID) == 0)
6891 : : {
6892 : 1 : UnlockBufHdr(desc);
6893 : 1 : return false;
6894 : : }
6895 : :
6896 : : /* Check that it's not pinned already. */
6897 [ - + ]: 35 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6898 : : {
19 michael@paquier.xyz 6899 :UNC 0 : UnlockBufHdr(desc);
6900 : 0 : return false;
6901 : : }
6902 : :
6903 : : /* Pin the buffer and then release the buffer spinlock */
19 michael@paquier.xyz 6904 :GNC 35 : PinBuffer_Locked(desc);
6905 : :
6906 : : /* If it was not already dirty, mark it as dirty. */
6907 [ + + ]: 35 : if (!(buf_state & BM_DIRTY))
6908 : : {
6909 : 17 : LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE);
6910 : 17 : MarkBufferDirty(buf);
6911 : 17 : result = true;
6912 : 17 : LWLockRelease(BufferDescriptorGetContentLock(desc));
6913 : : }
6914 : : else
6915 : 18 : *buffer_already_dirty = true;
6916 : :
6917 : 35 : UnpinBuffer(desc);
6918 : :
6919 : 35 : return result;
6920 : : }
6921 : :
6922 : : /*
6923 : : * Try to mark the provided shared buffer as dirty.
6924 : : *
6925 : : * This function is intended for testing/development use only!
6926 : : *
6927 : : * Works like EvictUnpinnedBuffer(), but calls MarkBufferDirty() instead of evicting.
6928 : : *
6929 : : * The buffer_already_dirty parameter is mandatory and indicates whether the
6930 : : * buffer could not be dirtied because it is already dirty.
6931 : : *
6932 : : * Returns true if the buffer has successfully been marked as dirty.
6933 : : */
6934 : : bool
6935 : 1 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
6936 : : {
6937 : : BufferDesc *desc;
6938 : 1 : bool buffer_dirtied = false;
6939 : :
6940 [ - + ]: 1 : Assert(!BufferIsLocal(buf));
6941 : :
6942 : : /* Make sure we can pin the buffer. */
6943 : 1 : ResourceOwnerEnlarge(CurrentResourceOwner);
6944 : 1 : ReservePrivateRefCountEntry();
6945 : :
6946 : 1 : desc = GetBufferDescriptor(buf - 1);
6947 : 1 : LockBufHdr(desc);
6948 : :
6949 : 1 : buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
6950 : : /* Both cannot be true at the same time */
6951 [ - + - - ]: 1 : Assert(!(buffer_dirtied && *buffer_already_dirty));
6952 : :
6953 : 1 : return buffer_dirtied;
6954 : : }
6955 : :
6956 : : /*
6957 : : * Try to mark all the shared buffers containing provided relation's pages as
6958 : : * dirty.
6959 : : *
6960 : : * This function is intended for testing/development use only! See
6961 : : * MarkDirtyUnpinnedBuffer().
6962 : : *
6963 : : * The buffers_* parameters are mandatory and indicate the total count of
6964 : : * buffers that:
6965 : : * - buffers_dirtied - were dirtied
6966 : : * - buffers_already_dirty - were already dirty
6967 : : * - buffers_skipped - could not be dirtied for a reason other than
6968 : : * already being dirty
6969 : : */
6970 : : void
6971 : 1 : MarkDirtyRelUnpinnedBuffers(Relation rel,
6972 : : int32 *buffers_dirtied,
6973 : : int32 *buffers_already_dirty,
6974 : : int32 *buffers_skipped)
6975 : : {
6976 [ - + ]: 1 : Assert(!RelationUsesLocalBuffers(rel));
6977 : :
6978 : 1 : *buffers_dirtied = 0;
6979 : 1 : *buffers_already_dirty = 0;
6980 : 1 : *buffers_skipped = 0;
6981 : :
6982 [ + + ]: 16385 : for (int buf = 1; buf <= NBuffers; buf++)
6983 : : {
6984 : 16384 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
6985 : 16384 : uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6986 : : bool buffer_already_dirty;
6987 : :
6988 [ - + ]: 16384 : CHECK_FOR_INTERRUPTS();
6989 : :
6990 : : /* An unlocked precheck should be safe and saves some cycles. */
6991 [ + + ]: 16384 : if ((buf_state & BM_VALID) == 0 ||
6992 [ + - ]: 27 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6993 : 16384 : continue;
6994 : :
6995 : : /* Make sure we can pin the buffer. */
19 michael@paquier.xyz 6996 :UNC 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
6997 : 0 : ReservePrivateRefCountEntry();
6998 : :
6999 : 0 : buf_state = LockBufHdr(desc);
7000 : :
7001 : : /* recheck, could have changed without the lock */
7002 [ # # ]: 0 : if ((buf_state & BM_VALID) == 0 ||
7003 [ # # ]: 0 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7004 : : {
7005 : 0 : UnlockBufHdr(desc);
7006 : 0 : continue;
7007 : : }
7008 : :
7009 [ # # ]: 0 : if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
7010 : 0 : (*buffers_dirtied)++;
7011 [ # # ]: 0 : else if (buffer_already_dirty)
7012 : 0 : (*buffers_already_dirty)++;
7013 : : else
7014 : 0 : (*buffers_skipped)++;
7015 : : }
19 michael@paquier.xyz 7016 :GNC 1 : }
7017 : :
7018 : : /*
7019 : : * Try to mark all the shared buffers as dirty.
7020 : : *
7021 : : * This function is intended for testing/development use only! See
7022 : : * MarkDirtyUnpinnedBuffer().
7023 : : *
7024 : : * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
7025 : : * parameters.
7026 : : */
7027 : : void
7028 : 1 : MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
7029 : : int32 *buffers_already_dirty,
7030 : : int32 *buffers_skipped)
7031 : : {
7032 : 1 : *buffers_dirtied = 0;
7033 : 1 : *buffers_already_dirty = 0;
7034 : 1 : *buffers_skipped = 0;
7035 : :
7036 [ + + ]: 16385 : for (int buf = 1; buf <= NBuffers; buf++)
7037 : : {
7038 : 16384 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
7039 : : uint32 buf_state;
7040 : : bool buffer_already_dirty;
7041 : :
7042 [ - + ]: 16384 : CHECK_FOR_INTERRUPTS();
7043 : :
7044 : 16384 : buf_state = pg_atomic_read_u32(&desc->state);
7045 [ + + ]: 16384 : if (!(buf_state & BM_VALID))
7046 : 16349 : continue;
7047 : :
7048 : 35 : ResourceOwnerEnlarge(CurrentResourceOwner);
7049 : 35 : ReservePrivateRefCountEntry();
7050 : :
7051 : 35 : LockBufHdr(desc);
7052 : :
7053 [ + + ]: 35 : if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
7054 : 17 : (*buffers_dirtied)++;
7055 [ + - ]: 18 : else if (buffer_already_dirty)
7056 : 18 : (*buffers_already_dirty)++;
7057 : : else
19 michael@paquier.xyz 7058 :UNC 0 : (*buffers_skipped)++;
7059 : : }
19 michael@paquier.xyz 7060 :GNC 1 : }
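/*
 * Illustrative sketch (not part of bufmgr.c): the MarkDirty*UnpinnedBuffers()
 * entry points report their work through mandatory out parameters. A minimal,
 * hypothetical SQL-callable wrapper for the "all buffers" variant could look
 * like this; the function name mark_all_dirty_demo is an assumption, and the
 * declaration of MarkDirtyAllUnpinnedBuffers() is assumed to come from
 * storage/bufmgr.h.
 */
#include "postgres.h"

#include "fmgr.h"
#include "storage/bufmgr.h"

PG_FUNCTION_INFO_V1(mark_all_dirty_demo);

Datum
mark_all_dirty_demo(PG_FUNCTION_ARGS)
{
	int32		dirtied;
	int32		already_dirty;
	int32		skipped;

	MarkDirtyAllUnpinnedBuffers(&dirtied, &already_dirty, &skipped);

	ereport(NOTICE,
			(errmsg("dirtied=%d, already dirty=%d, skipped=%d",
					dirtied, already_dirty, skipped)));

	/* return the number of buffers newly marked dirty */
	PG_RETURN_INT32(dirtied);
}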
7061 : :
7062 : : /*
7063 : : * Generic implementation of the AIO handle staging callback for readv/writev
7064 : : * on local/shared buffers.
7065 : : *
7066 : : * Each readv/writev can target multiple buffers. The buffers have already
7067 : : * been registered with the IO handle.
7068 : : *
7069 : : * To make the IO ready for execution ("staging"), we need to ensure that the
7070 : : * targeted buffers are in an appropriate state while the IO is ongoing. For
7071 : : * that the AIO subsystem needs to have its own buffer pin, otherwise an error
7072 : : * in this backend could lead to this backend's buffer pin being released as
7073 : : * part of error handling, which in turn could lead to the buffer being
7074 : : * replaced while IO is ongoing.
7075 : : */
7076 : : static pg_attribute_always_inline void
262 andres@anarazel.de 7077 :CBC 1320563 : buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
7078 : : {
7079 : : uint64 *io_data;
7080 : : uint8 handle_data_len;
7081 : : PgAioWaitRef io_ref;
7082 : 1320563 : BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
7083 : :
7084 : 1320563 : io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7085 : :
7086 : 1320563 : pgaio_io_get_wref(ioh, &io_ref);
7087 : :
7088 : : /* iterate over all buffers affected by the vectored readv/writev */
7089 [ + + ]: 2807127 : for (int i = 0; i < handle_data_len; i++)
7090 : : {
7091 : 1486564 : Buffer buffer = (Buffer) io_data[i];
7092 : 1486564 : BufferDesc *buf_hdr = is_temp ?
7093 : 8453 : GetLocalBufferDescriptor(-buffer - 1)
7094 [ + + ]: 1486564 : : GetBufferDescriptor(buffer - 1);
7095 : : uint32 buf_state;
7096 : :
7097 : : /*
7098 : : * Check that all the buffers are actually ones that could conceivably
7099 : : * be done in one IO, i.e. are sequential. This is the last
7100 : : * buffer-aware code before IO is actually executed and confusion
7101 : : * about which buffers are targeted by IO can be hard to debug, making
7102 : : * it worth doing extra-paranoid checks.
7103 : : */
7104 [ + + ]: 1486564 : if (i == 0)
7105 : 1320563 : first = buf_hdr->tag;
7106 : : else
7107 : : {
7108 [ - + ]: 166001 : Assert(buf_hdr->tag.relNumber == first.relNumber);
7109 [ - + ]: 166001 : Assert(buf_hdr->tag.blockNum == first.blockNum + i);
7110 : : }
7111 : :
7112 [ + + ]: 1486564 : if (is_temp)
7113 : 8453 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
7114 : : else
7115 : 1478111 : buf_state = LockBufHdr(buf_hdr);
7116 : :
7117 : : /* verify the buffer is in the expected state */
7118 [ - + ]: 1486564 : Assert(buf_state & BM_TAG_VALID);
7119 [ - + ]: 1486564 : if (is_write)
7120 : : {
262 andres@anarazel.de 7121 [ # # ]:UBC 0 : Assert(buf_state & BM_VALID);
7122 [ # # ]: 0 : Assert(buf_state & BM_DIRTY);
7123 : : }
7124 : : else
7125 : : {
262 andres@anarazel.de 7126 [ - + ]:CBC 1486564 : Assert(!(buf_state & BM_VALID));
7127 [ - + ]: 1486564 : Assert(!(buf_state & BM_DIRTY));
7128 : : }
7129 : :
7130 : : /* temp buffers don't use BM_IO_IN_PROGRESS */
7131 [ + + ]: 1486564 : if (!is_temp)
7132 [ - + ]: 1478111 : Assert(buf_state & BM_IO_IN_PROGRESS);
7133 : :
7134 [ - + ]: 1486564 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
7135 : :
7136 : : /*
7137 : : * Reflect that the buffer is now owned by the AIO subsystem.
7138 : : *
7139 : : * For local buffers: This can't be done just via LocalRefCount, as
7140 : : * one might initially think, as this backend could error out while
7141 : : * AIO is still in progress, releasing all the pins by the backend
7142 : : * itself.
7143 : : *
7144 : : * This pin is released again in TerminateBufferIO().
7145 : : */
7146 : 1486564 : buf_hdr->io_wref = io_ref;
7147 : :
7148 [ + + ]: 1486564 : if (is_temp)
7149 : : {
41 andres@anarazel.de 7150 :GNC 8453 : buf_state += BUF_REFCOUNT_ONE;
262 andres@anarazel.de 7151 :CBC 8453 : pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
7152 : : }
7153 : : else
41 andres@anarazel.de 7154 :GNC 1478111 : UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
7155 : :
7156 : : /*
7157 : : * Ensure the content lock that prevents buffer modifications while
7158 : : * the buffer is being written out is not released early due to an
7159 : : * error.
7160 : : */
262 andres@anarazel.de 7161 [ - + - - ]:CBC 1486564 : if (is_write && !is_temp)
7162 : : {
7163 : : LWLock *content_lock;
7164 : :
262 andres@anarazel.de 7165 :UBC 0 : content_lock = BufferDescriptorGetContentLock(buf_hdr);
7166 : :
7167 [ # # ]: 0 : Assert(LWLockHeldByMe(content_lock));
7168 : :
7169 : : /*
7170 : : * Lock is now owned by AIO subsystem.
7171 : : */
7172 : 0 : LWLockDisown(content_lock);
7173 : : }
7174 : :
7175 : : /*
7176 : : * Stop tracking this buffer via the resowner - the AIO system now
7177 : : * keeps track.
7178 : : */
262 andres@anarazel.de 7179 [ + + ]:CBC 1486564 : if (!is_temp)
7180 : 1478111 : ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
7181 : : }
7182 : 1320563 : }
7183 : :
7184 : : /*
7185 : : * Decode readv errors as encoded by buffer_readv_encode_error().
7186 : : */
7187 : : static inline void
7188 : 765 : buffer_readv_decode_error(PgAioResult result,
7189 : : bool *zeroed_any,
7190 : : bool *ignored_any,
7191 : : uint8 *zeroed_or_error_count,
7192 : : uint8 *checkfail_count,
7193 : : uint8 *first_off)
7194 : : {
7195 : 765 : uint32 rem_error = result.error_data;
7196 : :
7197 : : /* see static asserts in buffer_readv_encode_error */
7198 : : #define READV_COUNT_BITS 7
7199 : : #define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
7200 : :
7201 : 765 : *zeroed_any = rem_error & 1;
7202 : 765 : rem_error >>= 1;
7203 : :
7204 : 765 : *ignored_any = rem_error & 1;
7205 : 765 : rem_error >>= 1;
7206 : :
7207 : 765 : *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
7208 : 765 : rem_error >>= READV_COUNT_BITS;
7209 : :
7210 : 765 : *checkfail_count = rem_error & READV_COUNT_MASK;
7211 : 765 : rem_error >>= READV_COUNT_BITS;
7212 : :
7213 : 765 : *first_off = rem_error & READV_COUNT_MASK;
7214 : 765 : rem_error >>= READV_COUNT_BITS;
7215 : 765 : }
7216 : :
7217 : : /*
7218 : : * Helper to encode errors for buffer_readv_complete()
7219 : : *
7220 : : * Errors are encoded as follows:
7221 : : * - bit 0 indicates whether any page was zeroed (1) or not (0)
7222 : : * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
7223 : : * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
7224 : : * - next READV_COUNT_BITS bits indicate the number of checksum failures
7225 : : * - next READV_COUNT_BITS bits indicate the offset of the first page that
7226 : : * was errored or zeroed or, if there were no errors/zeroes, of the first
7227 : : * ignored checksum failure
7228 : : */
7229 : : static inline void
7230 : 282 : buffer_readv_encode_error(PgAioResult *result,
7231 : : bool is_temp,
7232 : : bool zeroed_any,
7233 : : bool ignored_any,
7234 : : uint8 error_count,
7235 : : uint8 zeroed_count,
7236 : : uint8 checkfail_count,
7237 : : uint8 first_error_off,
7238 : : uint8 first_zeroed_off,
7239 : : uint8 first_ignored_off)
7240 : : {
7241 : :
7242 : 282 : uint8 shift = 0;
7243 [ + + ]: 282 : uint8 zeroed_or_error_count =
7244 : : error_count > 0 ? error_count : zeroed_count;
7245 : : uint8 first_off;
7246 : :
7247 : : StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
7248 : : "PG_IOV_MAX is bigger than reserved space for error data");
7249 : : StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
7250 : : "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
7251 : :
7252 : : /*
7253 : : * We only have space to encode one offset - but luckily that's good
7254 : : * enough. If there is an error, the error's offset is the interesting
7255 : : * one; the same goes for a zeroed buffer vs an ignored one.
7256 : : */
7257 [ + + ]: 282 : if (error_count > 0)
7258 : 135 : first_off = first_error_off;
7259 [ + + ]: 147 : else if (zeroed_count > 0)
7260 : 120 : first_off = first_zeroed_off;
7261 : : else
7262 : 27 : first_off = first_ignored_off;
7263 : :
7264 [ + + - + ]: 282 : Assert(!zeroed_any || error_count == 0);
7265 : :
7266 : 282 : result->error_data = 0;
7267 : :
7268 : 282 : result->error_data |= zeroed_any << shift;
7269 : 282 : shift += 1;
7270 : :
7271 : 282 : result->error_data |= ignored_any << shift;
7272 : 282 : shift += 1;
7273 : :
7274 : 282 : result->error_data |= ((uint32) zeroed_or_error_count) << shift;
7275 : 282 : shift += READV_COUNT_BITS;
7276 : :
7277 : 282 : result->error_data |= ((uint32) checkfail_count) << shift;
7278 : 282 : shift += READV_COUNT_BITS;
7279 : :
7280 : 282 : result->error_data |= ((uint32) first_off) << shift;
7281 : 282 : shift += READV_COUNT_BITS;
7282 : :
7283 [ + + ]: 282 : result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
7284 : : PGAIO_HCB_SHARED_BUFFER_READV;
7285 : :
7286 [ + + ]: 282 : if (error_count > 0)
7287 : 135 : result->status = PGAIO_RS_ERROR;
7288 : : else
7289 : 147 : result->status = PGAIO_RS_WARNING;
7290 : :
7291 : : /*
7292 : : * The encoding is complicated enough to warrant cross-checking it against
7293 : : * the decode function.
7294 : : */
7295 : : #ifdef USE_ASSERT_CHECKING
7296 : : {
7297 : : bool zeroed_any_2,
7298 : : ignored_any_2;
7299 : : uint8 zeroed_or_error_count_2,
7300 : : checkfail_count_2,
7301 : : first_off_2;
7302 : :
7303 : 282 : buffer_readv_decode_error(*result,
7304 : : &zeroed_any_2, &ignored_any_2,
7305 : : &zeroed_or_error_count_2,
7306 : : &checkfail_count_2,
7307 : : &first_off_2);
7308 [ - + ]: 282 : Assert(zeroed_any == zeroed_any_2);
7309 [ - + ]: 282 : Assert(ignored_any == ignored_any_2);
7310 [ - + ]: 282 : Assert(zeroed_or_error_count == zeroed_or_error_count_2);
7311 [ - + ]: 282 : Assert(checkfail_count == checkfail_count_2);
7312 [ - + ]: 282 : Assert(first_off == first_off_2);
7313 : : }
7314 : : #endif
7315 : :
7316 : : #undef READV_COUNT_BITS
7317 : : #undef READV_COUNT_MASK
7318 : 282 : }
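/*
 * Illustrative, standalone sketch (not part of bufmgr.c): the error_data bit
 * layout produced by buffer_readv_encode_error() and consumed by
 * buffer_readv_decode_error(). With READV_COUNT_BITS = 7 the layout is:
 *
 *   bit  0       zeroed_any
 *   bit  1       ignored_any
 *   bits 2..8    zeroed_or_error_count
 *   bits 9..15   checkfail_count
 *   bits 16..22  first_off
 *
 * The sample values below (two zeroed pages plus one ignored checksum
 * failure, first affected page at offset 3) are hypothetical.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define READV_COUNT_BITS 7
#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)

int
main(void)
{
	uint32_t	zeroed_any = 1;
	uint32_t	ignored_any = 1;
	uint32_t	zeroed_or_error_count = 2;
	uint32_t	checkfail_count = 1;
	uint32_t	first_off = 3;
	uint32_t	error_data = 0;
	uint32_t	shift = 0;

	/* encode, mirroring buffer_readv_encode_error() */
	error_data |= zeroed_any << shift;
	shift += 1;
	error_data |= ignored_any << shift;
	shift += 1;
	error_data |= zeroed_or_error_count << shift;
	shift += READV_COUNT_BITS;
	error_data |= checkfail_count << shift;
	shift += READV_COUNT_BITS;
	error_data |= first_off << shift;

	printf("encoded error_data = 0x%x\n", error_data);

	/* decode again, mirroring buffer_readv_decode_error() */
	assert((error_data & 1) == zeroed_any);
	assert(((error_data >> 1) & 1) == ignored_any);
	assert(((error_data >> 2) & READV_COUNT_MASK) == zeroed_or_error_count);
	assert(((error_data >> (2 + READV_COUNT_BITS)) & READV_COUNT_MASK) ==
		   checkfail_count);
	assert(((error_data >> (2 + 2 * READV_COUNT_BITS)) & READV_COUNT_MASK) ==
		   first_off);

	return 0;
}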
7319 : :
7320 : : /*
7321 : : * Helper for AIO readv completion callbacks, supporting both shared and temp
7322 : : * buffers. Gets called once for each buffer in a multi-page read.
7323 : : */
7324 : : static pg_attribute_always_inline void
7325 : 1349624 : buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
7326 : : uint8 flags, bool failed, bool is_temp,
7327 : : bool *buffer_invalid,
7328 : : bool *failed_checksum,
7329 : : bool *ignored_checksum,
7330 : : bool *zeroed_buffer)
7331 : : {
7332 : 1349624 : BufferDesc *buf_hdr = is_temp ?
7333 : 8453 : GetLocalBufferDescriptor(-buffer - 1)
7334 [ + + ]: 1349624 : : GetBufferDescriptor(buffer - 1);
7335 : 1349624 : BufferTag tag = buf_hdr->tag;
7336 : 1349624 : char *bufdata = BufferGetBlock(buffer);
7337 : : uint32 set_flag_bits;
7338 : : int piv_flags;
7339 : :
7340 : : /* check that the buffer is in the expected state for a read */
7341 : : #ifdef USE_ASSERT_CHECKING
7342 : : {
7343 : 1349624 : uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7344 : :
7345 [ - + ]: 1349624 : Assert(buf_state & BM_TAG_VALID);
7346 [ - + ]: 1349624 : Assert(!(buf_state & BM_VALID));
7347 : : /* temp buffers don't use BM_IO_IN_PROGRESS */
7348 [ + + ]: 1349624 : if (!is_temp)
7349 [ - + ]: 1341171 : Assert(buf_state & BM_IO_IN_PROGRESS);
7350 [ - + ]: 1349624 : Assert(!(buf_state & BM_DIRTY));
7351 : : }
7352 : : #endif
7353 : :
7354 : 1349624 : *buffer_invalid = false;
7355 : 1349624 : *failed_checksum = false;
7356 : 1349624 : *ignored_checksum = false;
7357 : 1349624 : *zeroed_buffer = false;
7358 : :
7359 : : /*
7360 : : * We ask PageIsVerified() to only log the message about checksum errors,
7361 : : * as the completion might be run in any backend (or IO workers). We will
7362 : : * report checksum errors in buffer_readv_report().
7363 : : */
7364 : 1349624 : piv_flags = PIV_LOG_LOG;
7365 : :
7366 : : /* the local zero_damaged_pages may differ from the definer's */
7367 [ + + ]: 1349624 : if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
7368 : 57 : piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7369 : :
7370 : : /* Check for garbage data. */
7371 [ + - ]: 1349624 : if (!failed)
7372 : : {
7373 : : /*
7374 : : * If the buffer is not currently pinned by this backend, e.g. because
7375 : : * we're completing this IO after an error, the buffer data will have
7376 : : * been marked as inaccessible when the buffer was unpinned. The AIO
7377 : : * subsystem holds a pin, but that doesn't prevent the buffer from
7378 : : * having been marked as inaccessible. The completion might also be
7379 : : * executed in a different process.
7380 : : */
7381 : : #ifdef USE_VALGRIND
7382 : : if (!BufferIsPinned(buffer))
7383 : : VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7384 : : #endif
7385 : :
7386 [ + + ]: 1349624 : if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7387 : : failed_checksum))
7388 : : {
7389 [ + + ]: 141 : if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7390 : : {
7391 : 69 : memset(bufdata, 0, BLCKSZ);
7392 : 69 : *zeroed_buffer = true;
7393 : : }
7394 : : else
7395 : : {
7396 : 72 : *buffer_invalid = true;
7397 : : /* mark buffer as having failed */
7398 : 72 : failed = true;
7399 : : }
7400 : : }
7401 [ + + ]: 1349483 : else if (*failed_checksum)
7402 : 18 : *ignored_checksum = true;
7403 : :
7404 : : /* undo what we did above */
7405 : : #ifdef USE_VALGRIND
7406 : : if (!BufferIsPinned(buffer))
7407 : : VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7408 : : #endif
7409 : :
7410 : : /*
7411 : : * Immediately log a message about the invalid page, but only to the
7412 : : * server log. The reason to do so immediately is that this may be
7413 : : * executed in a different backend than the one that originated the
7414 : : * request; in addition, the originator might not process the query
7415 : : * result promptly (because it is busy doing another part of query
7416 : : * processing) or might not process it at all (e.g. if it was
7417 : : * cancelled or errored out due to another IO also failing). The
7418 : : * definer of the IO will emit an ERROR or WARNING when processing
7419 : : * the IO's results.
7420 : : *
7421 : : * To avoid duplicating the code to emit these log messages, we reuse
7422 : : * buffer_readv_report().
7423 : : */
7424 [ + + + + : 1349624 : if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
+ + ]
7425 : : {
257 7426 : 159 : PgAioResult result_one = {0};
7427 : :
262 7428 : 159 : buffer_readv_encode_error(&result_one, is_temp,
7429 : 159 : *zeroed_buffer,
7430 : 159 : *ignored_checksum,
7431 : 159 : *buffer_invalid,
7432 : 159 : *zeroed_buffer ? 1 : 0,
7433 : 159 : *failed_checksum ? 1 : 0,
7434 : : buf_off, buf_off, buf_off);
7435 : 159 : pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7436 : : }
7437 : : }
7438 : :
7439 : : /* Terminate I/O and set BM_VALID. */
7440 [ + + ]: 1349624 : set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7441 [ + + ]: 1349624 : if (is_temp)
7442 : 8453 : TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7443 : : else
7444 : 1341171 : TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7445 : :
7446 : : /*
7447 : : * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7448 : : * callback may not be executed in the same backend that called
7449 : : * BUFFER_READ_START. The alternative would be to defer calling the
7450 : : * tracepoint to a later point (e.g. the local completion callback for
7451 : : * shared buffer reads), which seems even less helpful.
7452 : : */
7453 : : TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7454 : : tag.blockNum,
7455 : : tag.spcOid,
7456 : : tag.dbOid,
7457 : : tag.relNumber,
7458 : : is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
7459 : : false);
7460 : 1349624 : }
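/*
 * Illustrative, standalone sketch (not part of bufmgr.c): the per-buffer
 * outcome classification performed above, reduced to a small truth table.
 * "verified" stands in for PageIsVerified()'s return value and
 * "checksum_failed" for its *failed_checksum output; a verified page can
 * report a checksum failure only when PIV_IGNORE_CHECKSUM_FAILURE was passed.
 * READ_BUFFERS_ZERO_ON_ERROR decides between zeroing the page and failing
 * the read. The classify() helper is hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static const char *
classify(bool verified, bool checksum_failed, bool zero_on_error)
{
	if (!verified)
		return zero_on_error ? "zeroed buffer" : "buffer invalid (IO fails)";
	if (checksum_failed)
		return "checksum failure ignored";
	return "valid";
}

int
main(void)
{
	printf("%s\n", classify(true, false, false));	/* valid */
	printf("%s\n", classify(true, true, false));	/* checksum failure ignored */
	printf("%s\n", classify(false, false, false));	/* buffer invalid (IO fails) */
	printf("%s\n", classify(false, false, true));	/* zeroed buffer */
	return 0;
}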
7461 : :
7462 : : /*
7463 : : * Perform completion handling of a single AIO read. This read may cover
7464 : : * multiple blocks / buffers.
7465 : : *
7466 : : * Shared between shared and local buffers, to reduce code duplication.
7467 : : */
7468 : : static pg_attribute_always_inline PgAioResult
7469 : 1216889 : buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7470 : : uint8 cb_data, bool is_temp)
7471 : : {
7472 : 1216889 : PgAioResult result = prior_result;
7473 : 1216889 : PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7474 : 1216889 : uint8 first_error_off = 0;
7475 : 1216889 : uint8 first_zeroed_off = 0;
7476 : 1216889 : uint8 first_ignored_off = 0;
7477 : 1216889 : uint8 error_count = 0;
7478 : 1216889 : uint8 zeroed_count = 0;
7479 : 1216889 : uint8 ignored_count = 0;
7480 : 1216889 : uint8 checkfail_count = 0;
7481 : : uint64 *io_data;
7482 : : uint8 handle_data_len;
7483 : :
7484 [ + + ]: 1216889 : if (is_temp)
7485 : : {
7486 [ - + ]: 1829 : Assert(td->smgr.is_temp);
7487 [ - + ]: 1829 : Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
7488 : : }
7489 : : else
7490 [ - + ]: 1215060 : Assert(!td->smgr.is_temp);
7491 : :
7492 : : /*
7493 : : * Iterate over all the buffers affected by this IO and call the
7494 : : * per-buffer completion function for each buffer.
7495 : : */
7496 : 1216889 : io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7497 [ + + ]: 2566513 : for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7498 : : {
7499 : 1349624 : Buffer buf = io_data[buf_off];
7500 : : bool failed;
7501 : 1349624 : bool failed_verification = false;
7502 : 1349624 : bool failed_checksum = false;
7503 : 1349624 : bool zeroed_buffer = false;
7504 : 1349624 : bool ignored_checksum = false;
7505 : :
7506 [ - + ]: 1349624 : Assert(BufferIsValid(buf));
7507 : :
7508 : : /*
7509 : : * If the entire I/O failed at a lower level, each buffer needs to be
7510 : : * marked as failed. In case of a partial read, the first few buffers
7511 : : * may be ok.
7512 : : */
7513 : 1349624 : failed =
7514 : 1349624 : prior_result.status == PGAIO_RS_ERROR
7515 [ + - - + ]: 1349624 : || prior_result.result <= buf_off;
7516 : :
7517 : 1349624 : buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7518 : : &failed_verification,
7519 : : &failed_checksum,
7520 : : &ignored_checksum,
7521 : : &zeroed_buffer);
7522 : :
7523 : : /*
7524 : : * Track information about the number of different kinds of error
7525 : : * conditions across all pages, as there can be multiple pages failing
7526 : : * verification as part of one IO.
7527 : : */
7528 [ + + + - : 1349624 : if (failed_verification && !zeroed_buffer && error_count++ == 0)
+ + ]
7529 : 63 : first_error_off = buf_off;
7530 [ + + + + ]: 1349624 : if (zeroed_buffer && zeroed_count++ == 0)
7531 : 51 : first_zeroed_off = buf_off;
7532 [ + + + + ]: 1349624 : if (ignored_checksum && ignored_count++ == 0)
7533 : 15 : first_ignored_off = buf_off;
7534 [ + + ]: 1349624 : if (failed_checksum)
7535 : 48 : checkfail_count++;
7536 : : }
7537 : :
7538 : : /*
7539 : : * If the smgr read succeeded [partially] and page verification failed for
7540 : : * some of the pages, adjust the IO's result state appropriately.
7541 : : */
7542 [ + - + + ]: 1216889 : if (prior_result.status != PGAIO_RS_ERROR &&
7543 [ + + + + ]: 1216826 : (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7544 : : {
7545 : 123 : buffer_readv_encode_error(&result, is_temp,
7546 : : zeroed_count > 0, ignored_count > 0,
7547 : : error_count, zeroed_count, checkfail_count,
7548 : : first_error_off, first_zeroed_off,
7549 : : first_ignored_off);
7550 : 123 : pgaio_result_report(result, td, DEBUG1);
7551 : : }
7552 : :
7553 : : /*
7554 : : * For shared relations this reporting is done in
7555 : : * shared_buffer_readv_complete_local().
7556 : : */
7557 [ + + + + ]: 1216889 : if (is_temp && checkfail_count > 0)
7558 : 3 : pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7559 : : checkfail_count);
7560 : :
7561 : 1216889 : return result;
7562 : : }
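/*
 * Illustrative, standalone sketch (not part of bufmgr.c): how
 * buffer_readv_complete() decides which buffers of a multi-block read are
 * marked as failed. A short read reports the number of blocks actually read
 * in prior_result.result, so buffers at or past that offset fail, while a
 * hard error fails every buffer. The enum and values below are simplified
 * stand-ins for PgAioResult.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum {RS_OK, RS_PARTIAL, RS_ERROR} demo_status;

int
main(void)
{
	int			handle_data_len = 4;	/* a 4-block readv */
	demo_status	status = RS_PARTIAL;
	int			result = 2;				/* only 2 blocks were read */

	for (int buf_off = 0; buf_off < handle_data_len; buf_off++)
	{
		bool		failed = (status == RS_ERROR) || (result <= buf_off);

		printf("buffer offset %d: %s\n", buf_off, failed ? "failed" : "ok");
	}
	return 0;
}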
7563 : :
7564 : : /*
7565 : : * AIO error reporting callback for aio_shared_buffer_readv_cb and
7566 : : * aio_local_buffer_readv_cb.
7567 : : *
7568 : : * The error is encoded / decoded in buffer_readv_encode_error() /
7569 : : * buffer_readv_decode_error().
7570 : : */
7571 : : static void
7572 : 399 : buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
7573 : : int elevel)
7574 : : {
7575 : 399 : int nblocks = td->smgr.nblocks;
7576 : 399 : BlockNumber first = td->smgr.blockNum;
7577 : 399 : BlockNumber last = first + nblocks - 1;
7578 : 399 : ProcNumber errProc =
7579 [ + + ]: 399 : td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7580 : : RelPathStr rpath =
7581 : 399 : relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7582 : : bool zeroed_any,
7583 : : ignored_any;
7584 : : uint8 zeroed_or_error_count,
7585 : : checkfail_count,
7586 : : first_off;
7587 : : uint8 affected_count;
7588 : : const char *msg_one,
7589 : : *msg_mult,
7590 : : *det_mult,
7591 : : *hint_mult;
7592 : :
7593 : 399 : buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7594 : : &zeroed_or_error_count,
7595 : : &checkfail_count,
7596 : : &first_off);
7597 : :
7598 : : /*
7599 : : * Treat a read that had both zeroed buffers *and* ignored checksums as a
7600 : : * special case; it's too irregular to be emitted the same way as the
7601 : : * other cases.
7602 : : */
7603 [ + + + + ]: 399 : if (zeroed_any && ignored_any)
7604 : : {
7605 [ + - - + ]: 6 : Assert(zeroed_any && ignored_any);
7606 [ - + ]: 6 : Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7607 [ - + ]: 6 : Assert(result.status != PGAIO_RS_ERROR);
7608 : 6 : affected_count = zeroed_or_error_count;
7609 : :
7610 [ + - + - ]: 6 : ereport(elevel,
7611 : : errcode(ERRCODE_DATA_CORRUPTED),
7612 : : errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
7613 : : affected_count, checkfail_count, first, last, rpath.str),
7614 : : affected_count > 1 ?
7615 : : errdetail("Block %u held the first zeroed page.",
7616 : : first + first_off) : 0,
7617 : : errhint_plural("See server log for details about the other %d invalid block.",
7618 : : "See server log for details about the other %d invalid blocks.",
7619 : : affected_count + checkfail_count - 1,
7620 : : affected_count + checkfail_count - 1));
7621 : 6 : return;
7622 : : }
7623 : :
7624 : : /*
7625 : : * The other messages are highly repetitive. To avoid duplicating a long
7626 : : * and complicated ereport(), gather the translated format strings
7627 : : * separately and then do one common ereport.
7628 : : */
7629 [ + + ]: 393 : if (result.status == PGAIO_RS_ERROR)
7630 : : {
7631 [ - + ]: 195 : Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7632 : 195 : affected_count = zeroed_or_error_count;
113 peter@eisentraut.org 7633 : 195 : msg_one = _("invalid page in block %u of relation \"%s\"");
7634 : 195 : msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
7635 : 195 : det_mult = _("Block %u held the first invalid page.");
262 andres@anarazel.de 7636 : 195 : hint_mult = _("See server log for the other %u invalid block(s).");
7637 : : }
7638 [ + + + - ]: 198 : else if (zeroed_any && !ignored_any)
7639 : : {
7640 : 162 : affected_count = zeroed_or_error_count;
113 peter@eisentraut.org 7641 : 162 : msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
7642 : 162 : msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
7643 : 162 : det_mult = _("Block %u held the first zeroed page.");
262 andres@anarazel.de 7644 : 162 : hint_mult = _("See server log for the other %u zeroed block(s).");
7645 : : }
7646 [ + - + - ]: 36 : else if (!zeroed_any && ignored_any)
7647 : : {
7648 : 36 : affected_count = checkfail_count;
113 peter@eisentraut.org 7649 : 36 : msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
7650 : 36 : msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
7651 : 36 : det_mult = _("Block %u held the first ignored page.");
262 andres@anarazel.de 7652 : 36 : hint_mult = _("See server log for the other %u ignored block(s).");
7653 : : }
7654 : : else
262 andres@anarazel.de 7655 :UBC 0 : pg_unreachable();
7656 : :
262 andres@anarazel.de 7657 [ + - + + :CBC 393 : ereport(elevel,
+ + + + ]
7658 : : errcode(ERRCODE_DATA_CORRUPTED),
7659 : : affected_count == 1 ?
7660 : : errmsg_internal(msg_one, first + first_off, rpath.str) :
7661 : : errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7662 : : affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7663 : : affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7664 : : }
7665 : :
7666 : : static void
7667 : 1318734 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7668 : : {
7669 : 1318734 : buffer_stage_common(ioh, false, false);
7670 : 1318734 : }
7671 : :
7672 : : static PgAioResult
7673 : 1215060 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7674 : : uint8 cb_data)
7675 : : {
7676 : 1215060 : return buffer_readv_complete(ioh, prior_result, cb_data, false);
7677 : : }
7678 : :
7679 : : /*
7680 : : * We need a backend-local completion callback for shared buffers, to be able
7681 : : * to report checksum errors correctly. Unfortunately that can only safely
7682 : : * happen if the reporting backend has previously called
7683 : : * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7684 : : * the backend that started the IO. Hence this callback.
7685 : : */
7686 : : static PgAioResult
7687 : 1318734 : shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
7688 : : uint8 cb_data)
7689 : : {
7690 : : bool zeroed_any,
7691 : : ignored_any;
7692 : : uint8 zeroed_or_error_count,
7693 : : checkfail_count,
7694 : : first_off;
7695 : :
7696 [ + + ]: 1318734 : if (prior_result.status == PGAIO_RS_OK)
7697 : 1318650 : return prior_result;
7698 : :
7699 : 84 : buffer_readv_decode_error(prior_result,
7700 : : &zeroed_any,
7701 : : &ignored_any,
7702 : : &zeroed_or_error_count,
7703 : : &checkfail_count,
7704 : : &first_off);
7705 : :
7706 [ + + ]: 84 : if (checkfail_count)
7707 : : {
7708 : 36 : PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7709 : :
7710 : 36 : pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7711 : : checkfail_count);
7712 : : }
7713 : :
7714 : 84 : return prior_result;
7715 : : }
7716 : :
7717 : : static void
7718 : 1829 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7719 : : {
7720 : 1829 : buffer_stage_common(ioh, false, true);
7721 : 1829 : }
7722 : :
7723 : : static PgAioResult
7724 : 1829 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7725 : : uint8 cb_data)
7726 : : {
7727 : 1829 : return buffer_readv_complete(ioh, prior_result, cb_data, true);
7728 : : }
7729 : :
7730 : : /* readv callback is passed READ_BUFFERS_* flags as callback data */
7731 : : const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
7732 : : .stage = shared_buffer_readv_stage,
7733 : : .complete_shared = shared_buffer_readv_complete,
7734 : : /* need a local callback to report checksum failures */
7735 : : .complete_local = shared_buffer_readv_complete_local,
7736 : : .report = buffer_readv_report,
7737 : : };
7738 : :
7739 : : /* readv callback is passed READ_BUFFERS_* flags as callback data */
7740 : : const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
7741 : : .stage = local_buffer_readv_stage,
7742 : :
7743 : : /*
7744 : : * Note that this, in contrast to the shared_buffers case, uses
7745 : : * complete_local, as only the issuing backend has access to the required
7746 : : * data structures. This matters because the IO completion might otherwise
7747 : : * be consumed incidentally by another backend.
7748 : : */
7749 : : .complete_local = local_buffer_readv_complete,
7750 : : .report = buffer_readv_report,
7751 : : };
|
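/*
 * Illustrative sketch (not part of bufmgr.c): how the read paths in this file
 * attach the callback structs above to an AIO handle before staging the IO.
 * The registration call shown (pgaio_io_register_callbacks) and the includes
 * are assumptions based on the AIO subsystem's usage elsewhere; as noted in
 * the comments above, READ_BUFFERS_* flags are passed as the callback data,
 * so they must fit in a uint8.
 */
#include "postgres.h"

#include "storage/aio.h"
#include "storage/bufmgr.h"

static void
attach_readv_callbacks_demo(PgAioHandle *ioh, bool is_temp, int flags)
{
	/* the flags travel in the uint8 callback-data slot */
	Assert((flags & 0xff) == flags);

	if (is_temp)
		pgaio_io_register_callbacks(ioh, PGAIO_HCB_LOCAL_BUFFER_READV,
									(uint8) flags);
	else
		pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV,
									(uint8) flags);
}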