Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * freespace.c
4 : : * POSTGRES free space map for quickly finding free space in relations
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/freespace/freespace.c
12 : : *
13 : : *
14 : : * NOTES:
15 : : *
16 : : * Free Space Map keeps track of the amount of free space on pages, and
17 : : * allows quickly searching for a page with enough free space. The FSM is
18 : : * stored in a dedicated relation fork of all heap relations, and those
19 : : * index access methods that need it (see also indexfsm.c). See README for
20 : : * more information.
21 : : *
22 : : *-------------------------------------------------------------------------
23 : : */
24 : : #include "postgres.h"
25 : :
26 : : #include "access/htup_details.h"
27 : : #include "access/xloginsert.h"
28 : : #include "access/xlogutils.h"
29 : : #include "miscadmin.h"
30 : : #include "storage/freespace.h"
31 : : #include "storage/fsm_internals.h"
32 : : #include "storage/smgr.h"
33 : : #include "utils/rel.h"
34 : :
35 : :
36 : : /*
37 : : * We use just one byte to store the amount of free space on a page, so we
38 : : * divide the amount of free space a page can have into 256 different
39 : : * categories. The highest category, 255, represents a page with at least
40 : : * MaxFSMRequestSize bytes of free space, and the second highest category
41 : : * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42 : : * MaxFSMRequestSize, exclusive.
43 : : *
44 : : * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45 : : * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46 : : * categories look like this:
47 : : *
48 : : *
49 : : * Range Category
50 : : * 0 - 31 0
51 : : * 32 - 63 1
52 : : * ... ... ...
53 : : * 8096 - 8127 253
54 : : * 8128 - 8163 254
55 : : * 8164 - 8192 255
56 : : *
57 : : * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58 : : * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59 : : * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60 : : * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61 : : * completely empty page, that would mean that we could never satisfy a
62 : : * request of exactly MaxFSMRequestSize bytes.
63 : : */
64 : : #define FSM_CATEGORIES 256
65 : : #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
66 : : #define MaxFSMRequestSize MaxHeapTupleSize
67 : :
68 : : /*
69 : : * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
70 : : * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
71 : : * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
72 : : * this means that 4096 bytes is the smallest BLCKSZ that we can get away
73 : : * with a 3-level tree, and 512 is the smallest we support.
74 : : */
75 : : #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
76 : :
77 : : #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
78 : : #define FSM_BOTTOM_LEVEL 0
79 : :
80 : : /*
81 : : * The internal FSM routines work on a logical addressing scheme. Each
82 : : * level of the tree can be thought of as a separately addressable file.
83 : : */
84 : : typedef struct
85 : : {
86 : : int level; /* level */
87 : : int logpageno; /* page number within the level */
88 : : } FSMAddress;
89 : :
90 : : /* Address of the root page. */
91 : : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
92 : :
93 : : /* functions to navigate the tree */
94 : : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
95 : : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
96 : : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
97 : : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
98 : : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
99 : :
100 : : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
101 : : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
102 : :
103 : : /* functions to convert amount of free space to a FSM category */
104 : : static uint8 fsm_space_avail_to_cat(Size avail);
105 : : static uint8 fsm_space_needed_to_cat(Size needed);
106 : : static Size fsm_space_cat_to_avail(uint8 cat);
107 : :
108 : : /* workhorse functions for various operations */
109 : : static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
110 : : uint8 newValue, uint8 minValue);
111 : : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
112 : : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
113 : : BlockNumber start, BlockNumber end,
114 : : bool *eof_p);
115 : : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
116 : :
117 : :
118 : : /******** Public API ********/
119 : :
120 : : /*
121 : : * GetPageWithFreeSpace - try to find a page in the given relation with
122 : : * at least the specified amount of free space.
123 : : *
124 : : * If successful, return the block number; if not, return InvalidBlockNumber.
125 : : *
126 : : * The caller must be prepared for the possibility that the returned page
127 : : * will turn out to have too little space available by the time the caller
128 : : * gets a lock on it. In that case, the caller should report the actual
129 : : * amount of free space available on that page and then try again (see
130 : : * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131 : : * extend the relation.
132 : : *
133 : : * This can trigger FSM updates if any FSM entry is found to point to a block
134 : : * past the end of the relation.
135 : : */
136 : : BlockNumber
2314 akapila@postgresql.o 137 :CBC 85358 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
138 : : {
5931 bruce@momjian.us 139 : 85358 : uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
140 : :
2314 akapila@postgresql.o 141 : 85358 : return fsm_search(rel, min_cat);
142 : : }
143 : :
144 : : /*
145 : : * RecordAndGetPageWithFreeSpace - update info about a page and try again.
146 : : *
147 : : * We provide this combo form to save some locking overhead, compared to
148 : : * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
149 : : * also some effort to return a page close to the old page; if there's a
150 : : * page with enough free space on the same FSM page where the old one page
151 : : * is located, it is preferred.
152 : : */
153 : : BlockNumber
6185 heikki.linnakangas@i 154 : 109942 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
155 : : Size oldSpaceAvail, Size spaceNeeded)
156 : : {
2314 akapila@postgresql.o 157 : 109942 : int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
158 : 109942 : int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159 : : FSMAddress addr;
160 : : uint16 slot;
161 : : int search_slot;
162 : :
163 : : /* Get the location of the FSM byte representing the heap block */
6185 heikki.linnakangas@i 164 : 109942 : addr = fsm_get_location(oldPage, &slot);
165 : :
166 : 109942 : search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
167 : :
168 : : /*
169 : : * If fsm_set_and_search found a suitable new block, return that.
170 : : * Otherwise, search as usual.
171 : : */
172 [ + + ]: 109942 : if (search_slot != -1)
173 : : {
511 noah@leadboat.com 174 : 15797 : BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
175 : :
176 : : /*
177 : : * Check that the blknum is actually in the relation. Don't try to
178 : : * update the FSM in that case, just fall back to the other case
179 : : */
180 [ + - ]: 15797 : if (fsm_does_block_exist(rel, blknum))
181 : 15797 : return blknum;
182 : : }
183 : 94145 : return fsm_search(rel, search_cat);
184 : : }
185 : :
186 : : /*
187 : : * RecordPageWithFreeSpace - update info about a page.
188 : : *
189 : : * Note that if the new spaceAvail value is higher than the old value stored
190 : : * in the FSM, the space might not become visible to searchers until the next
191 : : * FreeSpaceMapVacuum call, which updates the upper level pages.
192 : : */
193 : : void
2314 akapila@postgresql.o 194 : 86245 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
195 : : {
196 : 86245 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
197 : : FSMAddress addr;
198 : : uint16 slot;
199 : :
200 : : /* Get the location of the FSM byte representing the heap block */
6185 heikki.linnakangas@i 201 : 86245 : addr = fsm_get_location(heapBlk, &slot);
202 : :
203 : 86245 : fsm_set_and_search(rel, addr, slot, new_cat, 0);
8222 tgl@sss.pgh.pa.us 204 : 86245 : }
205 : :
206 : : /*
207 : : * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
208 : : * WAL replay
209 : : */
210 : : void
1158 rhaas@postgresql.org 211 : 300363 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
212 : : Size spaceAvail)
213 : : {
6154 heikki.linnakangas@i 214 : 300363 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
215 : : FSMAddress addr;
216 : : uint16 slot;
217 : : BlockNumber blkno;
218 : : Buffer buf;
219 : : Page page;
220 : :
221 : : /* Get the location of the FSM byte representing the heap block */
222 : 300363 : addr = fsm_get_location(heapBlk, &slot);
223 : 300363 : blkno = fsm_logical_to_physical(addr);
224 : :
225 : : /* If the page doesn't exist already, extend */
1158 rhaas@postgresql.org 226 : 300363 : buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
227 : : RBM_ZERO_ON_ERROR, InvalidBuffer);
6073 heikki.linnakangas@i 228 : 300363 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
229 : :
3426 kgrittn@postgresql.o 230 : 300363 : page = BufferGetPage(buf);
6154 heikki.linnakangas@i 231 [ + + ]: 300363 : if (PageIsNew(page))
232 : 485 : PageInit(page, BLCKSZ, 0);
233 : :
234 [ + + ]: 300363 : if (fsm_set_avail(page, slot, new_cat))
4464 jdavis@postgresql.or 235 : 291858 : MarkBufferDirtyHint(buf, false);
6154 heikki.linnakangas@i 236 : 300363 : UnlockReleaseBuffer(buf);
237 : 300363 : }
238 : :
239 : : /*
240 : : * GetRecordedFreeSpace - return the amount of free space on a particular page,
241 : : * according to the FSM.
242 : : */
243 : : Size
6185 244 : 1366 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
245 : : {
246 : : FSMAddress addr;
247 : : uint16 slot;
248 : : Buffer buf;
249 : : uint8 cat;
250 : :
251 : : /* Get the location of the FSM byte representing the heap block */
252 : 1366 : addr = fsm_get_location(heapBlk, &slot);
253 : :
254 : 1366 : buf = fsm_readbuf(rel, addr, false);
255 [ + + ]: 1366 : if (!BufferIsValid(buf))
256 : 36 : return 0;
3426 kgrittn@postgresql.o 257 : 1330 : cat = fsm_get_avail(BufferGetPage(buf), slot);
6185 heikki.linnakangas@i 258 : 1330 : ReleaseBuffer(buf);
259 : :
260 : 1330 : return fsm_space_cat_to_avail(cat);
261 : : }
262 : :
263 : : /*
264 : : * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
265 : : *
266 : : * nblocks is the new size of the heap.
267 : : *
268 : : * Return the number of blocks of new FSM.
269 : : * If it's InvalidBlockNumber, there is nothing to truncate;
270 : : * otherwise the caller is responsible for calling smgrtruncate()
271 : : * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
272 : : * to update upper-level pages in the FSM.
273 : : */
274 : : BlockNumber
2174 fujii@postgresql.org 275 : 207 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
276 : : {
277 : : BlockNumber new_nfsmblocks;
278 : : FSMAddress first_removed_address;
279 : : uint16 first_removed_slot;
280 : : Buffer buf;
281 : :
282 : : /*
283 : : * If no FSM has been created yet for this relation, there's nothing to
284 : : * truncate.
285 : : */
1517 tgl@sss.pgh.pa.us 286 [ - + ]: 207 : if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
2174 fujii@postgresql.org 287 :UBC 0 : return InvalidBlockNumber;
288 : :
289 : : /* Get the location in the FSM of the first removed heap block */
6185 heikki.linnakangas@i 290 :CBC 207 : first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
291 : :
292 : : /*
293 : : * Zero out the tail of the last remaining FSM page. If the slot
294 : : * representing the first removed heap block is at a page boundary, as the
295 : : * first slot on the FSM page that first_removed_address points to, we can
296 : : * just truncate that page altogether.
297 : : */
298 [ + + ]: 207 : if (first_removed_slot > 0)
299 : : {
300 : 129 : buf = fsm_readbuf(rel, first_removed_address, false);
301 [ - + ]: 129 : if (!BufferIsValid(buf))
1941 tgl@sss.pgh.pa.us 302 :UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
303 : : * smaller */
6185 heikki.linnakangas@i 304 :CBC 129 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
305 : :
306 : : /* NO EREPORT(ERROR) from here till changes are logged */
3244 307 : 129 : START_CRIT_SECTION();
308 : :
3426 kgrittn@postgresql.o 309 : 129 : fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
310 : :
311 : : /*
312 : : * This change is non-critical, because fsm_does_block_exist() would
313 : : * stop us from returning a truncated-away block. However, since this
314 : : * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
315 : : * of that many fsm_does_block_exist() rejections. Use a full
316 : : * MarkBufferDirty(), not MarkBufferDirtyHint().
317 : : */
3244 heikki.linnakangas@i 318 : 129 : MarkBufferDirty(buf);
319 : :
320 : : /*
321 : : * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
322 : : * differing from the rest of the file in this respect. This is
323 : : * optional; see README mention of full page images. XXX consider
324 : : * XLogSaveBufferForHint() for even closer similarity.
325 : : *
326 : : * A higher-level operation calls us at WAL replay. If we crash
327 : : * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
328 : : * not changed, and our fork remains valid. If we crash after that
329 : : * flush, redo will return here.
330 : : */
331 [ + + + + : 129 : if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+ + + - +
- - + -
- ]
332 : 108 : log_newpage_buffer(buf, false);
333 : :
334 [ - + ]: 129 : END_CRIT_SECTION();
335 : :
6185 336 : 129 : UnlockReleaseBuffer(buf);
337 : :
338 : 129 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
339 : : }
340 : : else
341 : : {
342 : 78 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
1517 tgl@sss.pgh.pa.us 343 [ - + ]: 78 : if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
1941 tgl@sss.pgh.pa.us 344 :UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
345 : : * smaller */
346 : : }
347 : :
2174 fujii@postgresql.org 348 :CBC 207 : return new_nfsmblocks;
349 : : }
350 : :
351 : : /*
352 : : * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
353 : : *
354 : : * We assume that the bottom-level pages have already been updated with
355 : : * new free-space information.
356 : : */
357 : : void
6185 heikki.linnakangas@i 358 : 200 : FreeSpaceMapVacuum(Relation rel)
359 : : {
360 : : bool dummy;
361 : :
362 : : /* Recursively scan the tree, starting at the root */
2718 tgl@sss.pgh.pa.us 363 : 200 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
364 : : (BlockNumber) 0, InvalidBlockNumber,
365 : : &dummy);
366 : 200 : }
367 : :
368 : : /*
369 : : * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
370 : : *
371 : : * As above, but assume that only heap pages between start and end-1 inclusive
372 : : * have new free-space information, so update only the upper-level slots
373 : : * covering that block range. end == InvalidBlockNumber is equivalent to
374 : : * "all the rest of the relation".
375 : : */
376 : : void
377 : 6630 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
378 : : {
379 : : bool dummy;
380 : :
381 : : /* Recursively scan the tree, starting at the root */
382 [ + + ]: 6630 : if (end > start)
383 : 6490 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
8832 384 : 6630 : }
385 : :
386 : : /******** Internal routines ********/
387 : :
388 : : /*
389 : : * Return category corresponding x bytes of free space
390 : : */
391 : : static uint8
6185 heikki.linnakangas@i 392 : 496550 : fsm_space_avail_to_cat(Size avail)
393 : : {
394 : : int cat;
395 : :
396 [ - + ]: 496550 : Assert(avail < BLCKSZ);
397 : :
398 [ + + ]: 496550 : if (avail >= MaxFSMRequestSize)
399 : 23226 : return 255;
400 : :
401 : 473324 : cat = avail / FSM_CAT_STEP;
402 : :
403 : : /*
404 : : * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
405 : : * more.
406 : : */
407 [ - + ]: 473324 : if (cat > 254)
6185 heikki.linnakangas@i 408 :UBC 0 : cat = 254;
409 : :
6185 heikki.linnakangas@i 410 :CBC 473324 : return (uint8) cat;
411 : : }
412 : :
413 : : /*
414 : : * Return the lower bound of the range of free space represented by given
415 : : * category.
416 : : */
417 : : static Size
418 : 1330 : fsm_space_cat_to_avail(uint8 cat)
419 : : {
420 : : /* The highest category represents exactly MaxFSMRequestSize bytes. */
421 [ + + ]: 1330 : if (cat == 255)
422 : 800 : return MaxFSMRequestSize;
423 : : else
424 : 530 : return cat * FSM_CAT_STEP;
425 : : }
426 : :
427 : : /*
428 : : * Which category does a page need to have, to accommodate x bytes of data?
429 : : * While fsm_space_avail_to_cat() rounds down, this needs to round up.
430 : : */
431 : : static uint8
432 : 195300 : fsm_space_needed_to_cat(Size needed)
433 : : {
434 : : int cat;
435 : :
436 : : /* Can't ask for more space than the highest category represents */
437 [ - + ]: 195300 : if (needed > MaxFSMRequestSize)
4244 tgl@sss.pgh.pa.us 438 [ # # ]:UBC 0 : elog(ERROR, "invalid FSM request size %zu", needed);
439 : :
6185 heikki.linnakangas@i 440 [ - + ]:CBC 195300 : if (needed == 0)
6185 heikki.linnakangas@i 441 :UBC 0 : return 1;
442 : :
6185 heikki.linnakangas@i 443 :CBC 195300 : cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
444 : :
445 [ - + ]: 195300 : if (cat > 255)
6185 heikki.linnakangas@i 446 :UBC 0 : cat = 255;
447 : :
6185 heikki.linnakangas@i 448 :CBC 195300 : return (uint8) cat;
449 : : }
450 : :
451 : : /*
452 : : * Returns the physical block number of a FSM page
453 : : */
454 : : static BlockNumber
455 : 728807 : fsm_logical_to_physical(FSMAddress addr)
456 : : {
457 : : BlockNumber pages;
458 : : int leafno;
459 : : int l;
460 : :
461 : : /*
462 : : * Calculate the logical page number of the first leaf page below the
463 : : * given page.
464 : : */
465 : 728807 : leafno = addr.logpageno;
466 [ + + ]: 1128805 : for (l = 0; l < addr.level; l++)
467 : 399998 : leafno *= SlotsPerFSMPage;
468 : :
469 : : /* Count upper level nodes required to address the leaf page */
470 : 728807 : pages = 0;
471 [ + + ]: 2915228 : for (l = 0; l < FSM_TREE_DEPTH; l++)
472 : : {
473 : 2186421 : pages += leafno + 1;
474 : 2186421 : leafno /= SlotsPerFSMPage;
475 : : }
476 : :
477 : : /*
478 : : * If the page we were asked for wasn't at the bottom level, subtract the
479 : : * additional lower level pages we counted above.
480 : : */
481 : 728807 : pages -= addr.level;
482 : :
483 : : /* Turn the page count into 0-based block number */
484 : 728807 : return pages - 1;
485 : : }
486 : :
487 : : /*
488 : : * Return the FSM location corresponding to given heap block.
489 : : */
490 : : static FSMAddress
491 : 524807 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
492 : : {
493 : : FSMAddress addr;
494 : :
495 : 524807 : addr.level = FSM_BOTTOM_LEVEL;
496 : 524807 : addr.logpageno = heapblk / SlotsPerFSMPage;
497 : 524807 : *slot = heapblk % SlotsPerFSMPage;
498 : :
499 : 524807 : return addr;
500 : : }
501 : :
502 : : /*
503 : : * Return the heap block number corresponding to given location in the FSM.
504 : : */
505 : : static BlockNumber
506 : 27526 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
507 : : {
508 [ - + ]: 27526 : Assert(addr.level == FSM_BOTTOM_LEVEL);
509 : 27526 : return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
510 : : }
511 : :
512 : : /*
513 : : * Given a logical address of a child page, get the logical page number of
514 : : * the parent, and the slot within the parent corresponding to the child.
515 : : */
516 : : static FSMAddress
517 : 41987 : fsm_get_parent(FSMAddress child, uint16 *slot)
518 : : {
519 : : FSMAddress parent;
520 : :
521 [ - + ]: 41987 : Assert(child.level < FSM_ROOT_LEVEL);
522 : :
523 : 41987 : parent.level = child.level + 1;
524 : 41987 : parent.logpageno = child.logpageno / SlotsPerFSMPage;
525 : 41987 : *slot = child.logpageno % SlotsPerFSMPage;
526 : :
527 : 41987 : return parent;
528 : : }
529 : :
530 : : /*
531 : : * Given a logical address of a parent page and a slot number, get the
532 : : * logical address of the corresponding child page.
533 : : */
534 : : static FSMAddress
535 : 40440 : fsm_get_child(FSMAddress parent, uint16 slot)
536 : : {
537 : : FSMAddress child;
538 : :
539 [ - + ]: 40440 : Assert(parent.level > FSM_BOTTOM_LEVEL);
540 : :
541 : 40440 : child.level = parent.level - 1;
542 : 40440 : child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
543 : :
544 : 40440 : return child;
545 : : }
546 : :
547 : : /*
548 : : * Read a FSM page.
549 : : *
550 : : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
551 : : * true, the FSM file is extended.
552 : : */
553 : : static Buffer
554 : 428237 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
555 : : {
556 : 428237 : BlockNumber blkno = fsm_logical_to_physical(addr);
557 : : Buffer buf;
584 558 : 428237 : SMgrRelation reln = RelationGetSmgr(rel);
559 : :
560 : : /*
561 : : * If we haven't cached the size of the FSM yet, check it first. Also
562 : : * recheck if the requested block seems to be past end, since our cached
563 : : * value might be stale. (We send smgr inval messages on truncation, but
564 : : * not on extension.)
565 : : */
1517 tgl@sss.pgh.pa.us 566 [ + + ]: 428237 : if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
567 [ + + ]: 366338 : blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
568 : : {
569 : : /* Invalidate the cache so smgrnblocks asks the kernel. */
570 : 93437 : reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
571 [ + + ]: 93437 : if (smgrexists(reln, FSM_FORKNUM))
572 : 34359 : smgrnblocks(reln, FSM_FORKNUM);
573 : : else
574 : 59078 : reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
575 : : }
576 : :
577 : : /*
578 : : * For reading we use ZERO_ON_ERROR mode, and initialize the page if
579 : : * necessary. The FSM information is not accurate anyway, so it's better
580 : : * to clear corrupt pages than error out. Since the FSM changes are not
581 : : * WAL-logged, the so-called torn page problem on crash can lead to pages
582 : : * with corrupt headers, for example.
583 : : *
584 : : * We use the same path below to initialize pages when extending the
585 : : * relation, as a concurrent extension can end up with vm_extend()
586 : : * returning an already-initialized page.
587 : : */
588 [ + + ]: 428237 : if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
589 : : {
6185 heikki.linnakangas@i 590 [ + + ]: 59876 : if (extend)
885 andres@anarazel.de 591 : 4050 : buf = fsm_extend(rel, blkno + 1);
592 : : else
6185 heikki.linnakangas@i 593 : 55826 : return InvalidBuffer;
594 : : }
595 : : else
885 andres@anarazel.de 596 : 368361 : buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
597 : :
598 : : /*
599 : : * Initializing the page when needed is trickier than it looks, because of
600 : : * the possibility of multiple backends doing this concurrently, and our
601 : : * desire to not uselessly take the buffer lock in the normal path where
602 : : * the page is OK. We must take the lock to initialize the page, so
603 : : * recheck page newness after we have the lock, in case someone else
604 : : * already did it. Also, because we initially check PageIsNew with no
605 : : * lock, it's possible to fall through and return the buffer while someone
606 : : * else is still initializing the page (i.e., we might see pd_upper as set
607 : : * but other page header fields are still zeroes). This is harmless for
608 : : * callers that will take a buffer lock themselves, but some callers
609 : : * inspect the page without any lock at all. The latter is OK only so
610 : : * long as it doesn't depend on the page header having correct contents.
611 : : * Current usage is safe because PageGetContents() does not require that.
612 : : */
3426 kgrittn@postgresql.o 613 [ + + ]: 372411 : if (PageIsNew(BufferGetPage(buf)))
614 : : {
2612 tgl@sss.pgh.pa.us 615 : 13895 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
616 [ + - ]: 13895 : if (PageIsNew(BufferGetPage(buf)))
617 : 13895 : PageInit(BufferGetPage(buf), BLCKSZ, 0);
618 : 13895 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
619 : : }
6154 heikki.linnakangas@i 620 : 372411 : return buf;
621 : : }
622 : :
623 : : /*
624 : : * Ensure that the FSM fork is at least fsm_nblocks long, extending
625 : : * it if necessary with empty pages. And by empty, I mean pages filled
626 : : * with zeros, meaning there's no free space.
627 : : */
628 : : static Buffer
6128 629 : 4050 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
630 : : {
745 tmunro@postgresql.or 631 : 4050 : return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
632 : : EB_CREATE_FORK_IF_NEEDED |
633 : : EB_CLEAR_SIZE_CACHE,
634 : : fsm_nblocks,
635 : : RBM_ZERO_ON_ERROR);
636 : : }
637 : :
638 : : /*
639 : : * Set value in given FSM page and slot.
640 : : *
641 : : * If minValue > 0, the updated page is also searched for a page with at
642 : : * least minValue of free space. If one is found, its slot number is
643 : : * returned, -1 otherwise.
644 : : */
645 : : static int
6185 heikki.linnakangas@i 646 : 198148 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
647 : : uint8 newValue, uint8 minValue)
648 : : {
649 : : Buffer buf;
650 : : Page page;
651 : 198148 : int newslot = -1;
652 : :
653 : 198148 : buf = fsm_readbuf(rel, addr, true);
654 : 198148 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
655 : :
3426 kgrittn@postgresql.o 656 : 198148 : page = BufferGetPage(buf);
657 : :
6185 heikki.linnakangas@i 658 [ + + ]: 198148 : if (fsm_set_avail(page, slot, newValue))
4464 jdavis@postgresql.or 659 : 111316 : MarkBufferDirtyHint(buf, false);
660 : :
6185 heikki.linnakangas@i 661 [ + + ]: 198148 : if (minValue != 0)
662 : : {
663 : : /* Search while we still hold the lock */
664 : 109942 : newslot = fsm_search_avail(buf, minValue,
665 : 109942 : addr.level == FSM_BOTTOM_LEVEL,
666 : : true);
667 : : }
668 : :
669 : 198148 : UnlockReleaseBuffer(buf);
670 : :
671 : 198148 : return newslot;
672 : : }
673 : :
674 : : /*
675 : : * Search the tree for a heap page with at least min_cat of free space
676 : : */
677 : : static BlockNumber
678 : 179503 : fsm_search(Relation rel, uint8 min_cat)
679 : : {
5931 bruce@momjian.us 680 : 179503 : int restarts = 0;
681 : 179503 : FSMAddress addr = FSM_ROOT_ADDRESS;
682 : :
683 : : for (;;)
8832 tgl@sss.pgh.pa.us 684 : 28361 : {
685 : : int slot;
686 : : Buffer buf;
5931 bruce@momjian.us 687 : 207864 : uint8 max_avail = 0;
688 : :
689 : : /* Read the FSM page. */
6127 heikki.linnakangas@i 690 : 207864 : buf = fsm_readbuf(rel, addr, false);
691 : :
692 : : /* Search within the page */
6185 693 [ + + ]: 207864 : if (BufferIsValid(buf))
694 : : {
695 : 152869 : LockBuffer(buf, BUFFER_LOCK_SHARE);
696 : 152869 : slot = fsm_search_avail(buf, min_cat,
697 : 152869 : (addr.level == FSM_BOTTOM_LEVEL),
698 : : false);
699 [ + + ]: 152869 : if (slot == -1)
700 : : {
3426 kgrittn@postgresql.o 701 : 114740 : max_avail = fsm_get_max_avail(BufferGetPage(buf));
511 noah@leadboat.com 702 : 114740 : UnlockReleaseBuffer(buf);
703 : : }
704 : : else
705 : : {
706 : : /* Keep the pin for possible update below */
707 : 38129 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
708 : : }
709 : : }
710 : : else
6185 heikki.linnakangas@i 711 : 54995 : slot = -1;
712 : :
713 [ + + ]: 207864 : if (slot != -1)
714 : : {
715 : : /*
716 : : * Descend the tree, or return the found block if we're at the
717 : : * bottom.
718 : : */
719 [ + + ]: 38129 : if (addr.level == FSM_BOTTOM_LEVEL)
720 : : {
511 noah@leadboat.com 721 : 11729 : BlockNumber blkno = fsm_get_heap_blk(addr, slot);
722 : : Page page;
723 : :
724 [ + - ]: 11729 : if (fsm_does_block_exist(rel, blkno))
725 : : {
726 : 11729 : ReleaseBuffer(buf);
727 : 11729 : return blkno;
728 : : }
729 : :
730 : : /*
731 : : * Block is past the end of the relation. Update FSM, and
732 : : * restart from root. The usual "advancenext" behavior is
733 : : * pessimal for this rare scenario, since every later slot is
734 : : * unusable in the same way. We could zero all affected slots
735 : : * on the same FSM page, but don't bet on the benefits of that
736 : : * optimization justifying its compiled code bulk.
737 : : */
511 noah@leadboat.com 738 :UBC 0 : page = BufferGetPage(buf);
739 : 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
740 : 0 : fsm_set_avail(page, slot, 0);
741 : 0 : MarkBufferDirtyHint(buf, false);
742 : 0 : UnlockReleaseBuffer(buf);
743 [ # # ]: 0 : if (restarts++ > 10000) /* same rationale as below */
744 : 0 : return InvalidBlockNumber;
745 : 0 : addr = FSM_ROOT_ADDRESS;
746 : : }
747 : : else
748 : : {
511 noah@leadboat.com 749 :CBC 26400 : ReleaseBuffer(buf);
750 : : }
6185 heikki.linnakangas@i 751 : 26400 : addr = fsm_get_child(addr, slot);
752 : : }
753 [ + + ]: 169735 : else if (addr.level == FSM_ROOT_LEVEL)
754 : : {
755 : : /*
756 : : * At the root, failure means there's no page with enough free
757 : : * space in the FSM. Give up.
758 : : */
759 : 167774 : return InvalidBlockNumber;
760 : : }
761 : : else
762 : : {
763 : : uint16 parentslot;
764 : : FSMAddress parent;
765 : :
766 : : /*
767 : : * At lower level, failure can happen if the value in the upper-
768 : : * level node didn't reflect the value on the lower page. Update
769 : : * the upper node, to avoid falling into the same trap again, and
770 : : * start over.
771 : : *
772 : : * There's a race condition here, if another backend updates this
773 : : * page right after we release it, and gets the lock on the parent
774 : : * page before us. We'll then update the parent page with the now
775 : : * stale information we had. It's OK, because it should happen
776 : : * rarely, and will be fixed by the next vacuum.
777 : : */
778 : 1961 : parent = fsm_get_parent(addr, &parentslot);
779 : 1961 : fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
780 : :
781 : : /*
782 : : * If the upper pages are badly out of date, we might need to loop
783 : : * quite a few times, updating them as we go. Any inconsistencies
784 : : * should eventually be corrected and the loop should end. Looping
785 : : * indefinitely is nevertheless scary, so provide an emergency
786 : : * valve.
787 : : */
788 [ - + ]: 1961 : if (restarts++ > 10000)
6185 heikki.linnakangas@i 789 :UBC 0 : return InvalidBlockNumber;
790 : :
791 : : /* Start search all over from the root */
6185 heikki.linnakangas@i 792 :CBC 1961 : addr = FSM_ROOT_ADDRESS;
793 : : }
794 : : }
795 : : }
796 : :
797 : :
798 : : /*
799 : : * Recursive guts of FreeSpaceMapVacuum
800 : : *
801 : : * Examine the FSM page indicated by addr, as well as its children, updating
802 : : * upper-level nodes that cover the heap block range from start to end-1.
803 : : * (It's okay if end is beyond the actual end of the map.)
804 : : * Return the maximum freespace value on this page.
805 : : *
806 : : * If addr is past the end of the FSM, set *eof_p to true and return 0.
807 : : *
808 : : * This traverses the tree in depth-first order. The tree is stored
809 : : * physically in depth-first order, so this should be pretty I/O efficient.
810 : : */
811 : : static uint8
2718 tgl@sss.pgh.pa.us 812 : 20730 : fsm_vacuum_page(Relation rel, FSMAddress addr,
813 : : BlockNumber start, BlockNumber end,
814 : : bool *eof_p)
815 : : {
816 : : Buffer buf;
817 : : Page page;
818 : : uint8 max_avail;
819 : :
820 : : /* Read the page if it exists, or return EOF */
6185 heikki.linnakangas@i 821 : 20730 : buf = fsm_readbuf(rel, addr, false);
822 [ + + ]: 20730 : if (!BufferIsValid(buf))
823 : : {
824 : 795 : *eof_p = true;
825 : 795 : return 0;
826 : : }
827 : : else
828 : 19935 : *eof_p = false;
829 : :
3426 kgrittn@postgresql.o 830 : 19935 : page = BufferGetPage(buf);
831 : :
832 : : /*
833 : : * If we're above the bottom level, recurse into children, and fix the
834 : : * information stored about them at this level.
835 : : */
6185 heikki.linnakangas@i 836 [ + + ]: 19935 : if (addr.level > FSM_BOTTOM_LEVEL)
837 : : {
838 : : FSMAddress fsm_start,
839 : : fsm_end;
840 : : uint16 fsm_start_slot,
841 : : fsm_end_slot;
842 : : int slot,
843 : : start_slot,
844 : : end_slot;
5931 bruce@momjian.us 845 : 13342 : bool eof = false;
846 : :
847 : : /*
848 : : * Compute the range of slots we need to update on this page, given
849 : : * the requested range of heap blocks to consider. The first slot to
850 : : * update is the one covering the "start" block, and the last slot is
851 : : * the one covering "end - 1". (Some of this work will be duplicated
852 : : * in each recursive call, but it's cheap enough to not worry about.)
853 : : */
2718 tgl@sss.pgh.pa.us 854 : 13342 : fsm_start = fsm_get_location(start, &fsm_start_slot);
855 : 13342 : fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
856 : :
857 [ + + ]: 33355 : while (fsm_start.level < addr.level)
858 : : {
859 : 20013 : fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
860 : 20013 : fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
861 : : }
862 [ - + ]: 13342 : Assert(fsm_start.level == addr.level);
863 : :
864 [ + - ]: 13342 : if (fsm_start.logpageno == addr.logpageno)
865 : 13342 : start_slot = fsm_start_slot;
2718 tgl@sss.pgh.pa.us 866 [ # # ]:UBC 0 : else if (fsm_start.logpageno > addr.logpageno)
867 : 0 : start_slot = SlotsPerFSMPage; /* shouldn't get here... */
868 : : else
869 : 0 : start_slot = 0;
870 : :
2718 tgl@sss.pgh.pa.us 871 [ + + ]:CBC 13342 : if (fsm_end.logpageno == addr.logpageno)
872 : 12954 : end_slot = fsm_end_slot;
873 [ + - ]: 388 : else if (fsm_end.logpageno > addr.logpageno)
874 : 388 : end_slot = SlotsPerFSMPage - 1;
875 : : else
2718 tgl@sss.pgh.pa.us 876 :UBC 0 : end_slot = -1; /* shouldn't get here... */
877 : :
2718 tgl@sss.pgh.pa.us 878 [ + + ]:CBC 1705560 : for (slot = start_slot; slot <= end_slot; slot++)
879 : : {
880 : : int child_avail;
881 : :
5688 882 [ + + ]: 1692218 : CHECK_FOR_INTERRUPTS();
883 : :
884 : : /* After we hit end-of-file, just clear the rest of the slots */
6185 heikki.linnakangas@i 885 [ + + ]: 1692218 : if (!eof)
2718 tgl@sss.pgh.pa.us 886 : 14040 : child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
887 : : start, end,
888 : : &eof);
889 : : else
6185 heikki.linnakangas@i 890 : 1678178 : child_avail = 0;
891 : :
892 : : /* Update information about the child */
893 [ + + ]: 1692218 : if (fsm_get_avail(page, slot) != child_avail)
894 : : {
895 : 7826 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2718 tgl@sss.pgh.pa.us 896 : 7826 : fsm_set_avail(page, slot, child_avail);
4464 jdavis@postgresql.or 897 : 7826 : MarkBufferDirtyHint(buf, false);
6185 heikki.linnakangas@i 898 : 7826 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
899 : : }
900 : : }
901 : : }
902 : :
903 : : /* Now get the maximum value on the page, to return to caller */
2718 tgl@sss.pgh.pa.us 904 : 19935 : max_avail = fsm_get_max_avail(page);
905 : :
906 : : /*
907 : : * Reset the next slot pointer. This encourages the use of low-numbered
908 : : * pages, increasing the chances that a later vacuum can truncate the
909 : : * relation. We don't bother with a lock here, nor with marking the page
910 : : * dirty if it wasn't already, since this is just a hint.
911 : : */
6185 heikki.linnakangas@i 912 : 19935 : ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
913 : :
914 : 19935 : ReleaseBuffer(buf);
915 : :
916 : 19935 : return max_avail;
917 : : }
918 : :
919 : :
920 : : /*
921 : : * Check whether a block number is past the end of the relation. This can
922 : : * happen after WAL replay, if the FSM reached disk but newly-extended pages
923 : : * it refers to did not.
924 : : */
925 : : static bool
511 noah@leadboat.com 926 : 27526 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
927 : : {
928 : 27526 : SMgrRelation smgr = RelationGetSmgr(rel);
929 : :
930 : : /*
931 : : * If below the cached nblocks, the block surely exists. Otherwise, we
932 : : * face a trade-off. We opt to compare to a fresh nblocks, incurring
933 : : * lseek() overhead. The alternative would be to assume the block does
934 : : * not exist, but that would cause FSM to set zero space available for
935 : : * blocks that main fork extension just recorded.
936 : : */
937 : 27526 : return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
938 [ + + + + : 37520 : blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+ - ]
939 : 9994 : blknumber < RelationGetNumberOfBlocks(rel));
940 : : }
|