Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * freespace.c
4 : : * POSTGRES free space map for quickly finding free space in relations
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/freespace/freespace.c
12 : : *
13 : : *
14 : : * NOTES:
15 : : *
16 : : * Free Space Map keeps track of the amount of free space on pages, and
17 : : * allows quickly searching for a page with enough free space. The FSM is
18 : : * stored in a dedicated relation fork of all heap relations, and those
19 : : * index access methods that need it (see also indexfsm.c). See README for
20 : : * more information.
21 : : *
22 : : *-------------------------------------------------------------------------
23 : : */
24 : : #include "postgres.h"
25 : :
26 : : #include "access/htup_details.h"
27 : : #include "access/xloginsert.h"
28 : : #include "access/xlogutils.h"
29 : : #include "miscadmin.h"
30 : : #include "storage/freespace.h"
31 : : #include "storage/fsm_internals.h"
32 : : #include "storage/smgr.h"
33 : : #include "utils/rel.h"
34 : :
35 : :
36 : : /*
37 : : * We use just one byte to store the amount of free space on a page, so we
38 : : * divide the amount of free space a page can have into 256 different
39 : : * categories. The highest category, 255, represents a page with at least
40 : : * MaxFSMRequestSize bytes of free space, and the second highest category
41 : : * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42 : : * MaxFSMRequestSize, exclusive.
43 : : *
44 : : * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45 : : * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46 : : * categories look like this:
47 : : *
48 : : *
49 : : * Range Category
50 : : * 0 - 31 0
51 : : * 32 - 63 1
52 : : * ... ... ...
53 : : * 8096 - 8127 253
54 : : * 8128 - 8163 254
55 : : * 8164 - 8192 255
56 : : *
57 : : * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58 : : * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59 : : * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60 : : * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61 : : * completely empty page, that would mean that we could never satisfy a
62 : : * request of exactly MaxFSMRequestSize bytes.
63 : : */
64 : : #define FSM_CATEGORIES 256
65 : : #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
66 : : #define MaxFSMRequestSize MaxHeapTupleSize
67 : :
68 : : /*
69 : : * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
70 : : * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
71 : : * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
72 : : * this means that 4096 bytes is the smallest BLCKSZ that we can get away
73 : : * with a 3-level tree, and 512 is the smallest we support.
74 : : */
75 : : #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
76 : :
77 : : #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
78 : : #define FSM_BOTTOM_LEVEL 0
79 : :
80 : : /*
81 : : * The internal FSM routines work on a logical addressing scheme. Each
82 : : * level of the tree can be thought of as a separately addressable file.
83 : : */
84 : : typedef struct
85 : : {
86 : : int level; /* level */
87 : : int logpageno; /* page number within the level */
88 : : } FSMAddress;
89 : :
90 : : /* Address of the root page. */
91 : : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
92 : :
93 : : /* functions to navigate the tree */
94 : : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
95 : : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
96 : : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
97 : : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
98 : : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
99 : :
100 : : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
101 : : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
102 : :
103 : : /* functions to convert amount of free space to a FSM category */
104 : : static uint8 fsm_space_avail_to_cat(Size avail);
105 : : static uint8 fsm_space_needed_to_cat(Size needed);
106 : : static Size fsm_space_cat_to_avail(uint8 cat);
107 : :
108 : : /* workhorse functions for various operations */
109 : : static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
110 : : uint8 newValue, uint8 minValue);
111 : : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
112 : : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
113 : : BlockNumber start, BlockNumber end,
114 : : bool *eof_p);
115 : : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
116 : :
117 : :
118 : : /******** Public API ********/
119 : :
120 : : /*
121 : : * GetPageWithFreeSpace - try to find a page in the given relation with
122 : : * at least the specified amount of free space.
123 : : *
124 : : * If successful, return the block number; if not, return InvalidBlockNumber.
125 : : *
126 : : * The caller must be prepared for the possibility that the returned page
127 : : * will turn out to have too little space available by the time the caller
128 : : * gets a lock on it. In that case, the caller should report the actual
129 : : * amount of free space available on that page and then try again (see
130 : : * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131 : : * extend the relation.
132 : : *
133 : : * This can trigger FSM updates if any FSM entry is found to point to a block
134 : : * past the end of the relation.
135 : : */
136 : : BlockNumber
2555 akapila@postgresql.o 137 :CBC 109994 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
138 : : {
6172 bruce@momjian.us 139 : 109994 : uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
140 : :
2555 akapila@postgresql.o 141 : 109994 : return fsm_search(rel, min_cat);
142 : : }
143 : :
144 : : /*
145 : : * RecordAndGetPageWithFreeSpace - update info about a page and try again.
146 : : *
147 : : * We provide this combo form to save some locking overhead, compared to
148 : : * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
149 : : * also some effort to return a page close to the old page; if there's a
150 : : * page with enough free space on the same FSM page where the old one page
151 : : * is located, it is preferred.
152 : : */
153 : : BlockNumber
6426 heikki.linnakangas@i 154 : 144921 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
155 : : Size oldSpaceAvail, Size spaceNeeded)
156 : : {
2555 akapila@postgresql.o 157 : 144921 : int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
158 : 144921 : int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159 : : FSMAddress addr;
160 : : uint16 slot;
161 : : int search_slot;
162 : :
163 : : /* Get the location of the FSM byte representing the heap block */
6426 heikki.linnakangas@i 164 : 144921 : addr = fsm_get_location(oldPage, &slot);
165 : :
166 : 144921 : search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
167 : :
168 : : /*
169 : : * If fsm_set_and_search found a suitable new block, return that.
170 : : * Otherwise, search as usual.
171 : : */
172 [ + + ]: 144921 : if (search_slot != -1)
173 : : {
752 noah@leadboat.com 174 : 19504 : BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
175 : :
176 : : /*
177 : : * Check that the blknum is actually in the relation. Don't try to
178 : : * update the FSM in that case, just fall back to the other case
179 : : */
180 [ + - ]: 19504 : if (fsm_does_block_exist(rel, blknum))
181 : 19504 : return blknum;
182 : : }
183 : 125417 : return fsm_search(rel, search_cat);
184 : : }
185 : :
186 : : /*
187 : : * RecordPageWithFreeSpace - update info about a page.
188 : : *
189 : : * Note that if the new spaceAvail value is higher than the old value stored
190 : : * in the FSM, the space might not become visible to searchers until the next
191 : : * FreeSpaceMapVacuum call, which updates the upper level pages.
192 : : */
193 : : void
2555 akapila@postgresql.o 194 : 111255 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
195 : : {
196 : 111255 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
197 : : FSMAddress addr;
198 : : uint16 slot;
199 : :
200 : : /* Get the location of the FSM byte representing the heap block */
6426 heikki.linnakangas@i 201 : 111255 : addr = fsm_get_location(heapBlk, &slot);
202 : :
203 : 111255 : fsm_set_and_search(rel, addr, slot, new_cat, 0);
8463 tgl@sss.pgh.pa.us 204 : 111255 : }
205 : :
206 : : /*
207 : : * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
208 : : * WAL replay
209 : : */
210 : : void
1399 rhaas@postgresql.org 211 : 308989 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
212 : : Size spaceAvail)
213 : : {
6395 heikki.linnakangas@i 214 : 308989 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
215 : : FSMAddress addr;
216 : : uint16 slot;
217 : : BlockNumber blkno;
218 : : Buffer buf;
219 : : Page page;
220 : :
221 : : /* Get the location of the FSM byte representing the heap block */
222 : 308989 : addr = fsm_get_location(heapBlk, &slot);
223 : 308989 : blkno = fsm_logical_to_physical(addr);
224 : :
225 : : /* If the page doesn't exist already, extend */
1399 rhaas@postgresql.org 226 : 308989 : buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
227 : : RBM_ZERO_ON_ERROR, InvalidBuffer);
6314 heikki.linnakangas@i 228 : 308989 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
229 : :
3667 kgrittn@postgresql.o 230 : 308989 : page = BufferGetPage(buf);
6395 heikki.linnakangas@i 231 [ + + ]: 308989 : if (PageIsNew(page))
232 : 402 : PageInit(page, BLCKSZ, 0);
233 : :
234 : : /*
235 : : * Changes to FSM are usually marked as changed using MarkBufferDirtyHint;
236 : : * however, during recovery, it does nothing if checksums are enabled. It
237 : : * is assumed that the page should not be dirtied during recovery while
238 : : * modifying hints to prevent torn pages, since no new WAL data can be
239 : : * generated at this point to store FPI. This is not relevant to the FSM
240 : : * case, as its blocks are zeroed when a checksum mismatch occurs. So, we
241 : : * need to use regular MarkBufferDirty here to mark the FSM block as
242 : : * modified during recovery, otherwise changes to the FSM may be lost.
243 : : */
244 [ + + ]: 308989 : if (fsm_set_avail(page, slot, new_cat))
2 akorotkov@postgresql 245 : 302541 : MarkBufferDirty(buf);
6395 heikki.linnakangas@i 246 : 308989 : UnlockReleaseBuffer(buf);
247 : 308989 : }
248 : :
249 : : /*
250 : : * GetRecordedFreeSpace - return the amount of free space on a particular page,
251 : : * according to the FSM.
252 : : */
253 : : Size
6426 254 : 2003 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
255 : : {
256 : : FSMAddress addr;
257 : : uint16 slot;
258 : : Buffer buf;
259 : : uint8 cat;
260 : :
261 : : /* Get the location of the FSM byte representing the heap block */
262 : 2003 : addr = fsm_get_location(heapBlk, &slot);
263 : :
264 : 2003 : buf = fsm_readbuf(rel, addr, false);
265 [ + + ]: 2003 : if (!BufferIsValid(buf))
266 : 36 : return 0;
3667 kgrittn@postgresql.o 267 : 1967 : cat = fsm_get_avail(BufferGetPage(buf), slot);
6426 heikki.linnakangas@i 268 : 1967 : ReleaseBuffer(buf);
269 : :
270 : 1967 : return fsm_space_cat_to_avail(cat);
271 : : }
272 : :
273 : : /*
274 : : * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
275 : : *
276 : : * nblocks is the new size of the heap.
277 : : *
278 : : * Return the number of blocks of new FSM.
279 : : * If it's InvalidBlockNumber, there is nothing to truncate;
280 : : * otherwise the caller is responsible for calling smgrtruncate()
281 : : * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
282 : : * to update upper-level pages in the FSM.
283 : : */
284 : : BlockNumber
2415 fujii@postgresql.org 285 : 261 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
286 : : {
287 : : BlockNumber new_nfsmblocks;
288 : : FSMAddress first_removed_address;
289 : : uint16 first_removed_slot;
290 : : Buffer buf;
291 : :
292 : : /*
293 : : * If no FSM has been created yet for this relation, there's nothing to
294 : : * truncate.
295 : : */
1758 tgl@sss.pgh.pa.us 296 [ - + ]: 261 : if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
2415 fujii@postgresql.org 297 :UBC 0 : return InvalidBlockNumber;
298 : :
299 : : /* Get the location in the FSM of the first removed heap block */
6426 heikki.linnakangas@i 300 :CBC 261 : first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
301 : :
302 : : /*
303 : : * Zero out the tail of the last remaining FSM page. If the slot
304 : : * representing the first removed heap block is at a page boundary, as the
305 : : * first slot on the FSM page that first_removed_address points to, we can
306 : : * just truncate that page altogether.
307 : : */
308 [ + + ]: 261 : if (first_removed_slot > 0)
309 : : {
310 : 154 : buf = fsm_readbuf(rel, first_removed_address, false);
311 [ - + ]: 154 : if (!BufferIsValid(buf))
2182 tgl@sss.pgh.pa.us 312 :UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
313 : : * smaller */
6426 heikki.linnakangas@i 314 :CBC 154 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
315 : :
316 : : /* NO EREPORT(ERROR) from here till changes are logged */
3485 317 : 154 : START_CRIT_SECTION();
318 : :
3667 kgrittn@postgresql.o 319 : 154 : fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
320 : :
321 : : /*
322 : : * This change is non-critical, because fsm_does_block_exist() would
323 : : * stop us from returning a truncated-away block. However, since this
324 : : * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
325 : : * of that many fsm_does_block_exist() rejections. Use a full
326 : : * MarkBufferDirty(), not MarkBufferDirtyHint().
327 : : */
3485 heikki.linnakangas@i 328 : 154 : MarkBufferDirty(buf);
329 : :
330 : : /*
331 : : * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
332 : : * differing from the rest of the file in this respect. This is
333 : : * optional; see README mention of full page images. XXX consider
334 : : * XLogSaveBufferForHint() for even closer similarity.
335 : : *
336 : : * A higher-level operation calls us at WAL replay. If we crash
337 : : * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
338 : : * not changed, and our fork remains valid. If we crash after that
339 : : * flush, redo will return here.
340 : : */
341 [ + + + + : 154 : if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+ + + - +
- + + +
- ]
342 : 133 : log_newpage_buffer(buf, false);
343 : :
344 [ - + ]: 154 : END_CRIT_SECTION();
345 : :
6426 346 : 154 : UnlockReleaseBuffer(buf);
347 : :
348 : 154 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
349 : : }
350 : : else
351 : : {
352 : 107 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
1758 tgl@sss.pgh.pa.us 353 [ - + ]: 107 : if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
2182 tgl@sss.pgh.pa.us 354 :UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
355 : : * smaller */
356 : : }
357 : :
2415 fujii@postgresql.org 358 :CBC 261 : return new_nfsmblocks;
359 : : }
360 : :
361 : : /*
362 : : * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
363 : : *
364 : : * We assume that the bottom-level pages have already been updated with
365 : : * new free-space information.
366 : : */
367 : : void
6426 heikki.linnakangas@i 368 : 252 : FreeSpaceMapVacuum(Relation rel)
369 : : {
370 : : bool dummy;
371 : :
372 : : /* Recursively scan the tree, starting at the root */
2959 tgl@sss.pgh.pa.us 373 : 252 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
374 : : (BlockNumber) 0, InvalidBlockNumber,
375 : : &dummy);
376 : 252 : }
377 : :
378 : : /*
379 : : * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
380 : : *
381 : : * As above, but assume that only heap pages between start and end-1 inclusive
382 : : * have new free-space information, so update only the upper-level slots
383 : : * covering that block range. end == InvalidBlockNumber is equivalent to
384 : : * "all the rest of the relation".
385 : : */
386 : : void
387 : 7941 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
388 : : {
389 : : bool dummy;
390 : :
391 : : /* Recursively scan the tree, starting at the root */
392 [ + + ]: 7941 : if (end > start)
393 : 7794 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
9073 394 : 7941 : }
395 : :
396 : : /******** Internal routines ********/
397 : :
398 : : /*
399 : : * Return category corresponding x bytes of free space
400 : : */
401 : : static uint8
6426 heikki.linnakangas@i 402 : 565165 : fsm_space_avail_to_cat(Size avail)
403 : : {
404 : : int cat;
405 : :
406 [ - + ]: 565165 : Assert(avail < BLCKSZ);
407 : :
408 [ + + ]: 565165 : if (avail >= MaxFSMRequestSize)
409 : 26349 : return 255;
410 : :
411 : 538816 : cat = avail / FSM_CAT_STEP;
412 : :
413 : : /*
414 : : * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
415 : : * more.
416 : : */
417 [ - + ]: 538816 : if (cat > 254)
6426 heikki.linnakangas@i 418 :UBC 0 : cat = 254;
419 : :
6426 heikki.linnakangas@i 420 :CBC 538816 : return (uint8) cat;
421 : : }
422 : :
423 : : /*
424 : : * Return the lower bound of the range of free space represented by given
425 : : * category.
426 : : */
427 : : static Size
428 : 1967 : fsm_space_cat_to_avail(uint8 cat)
429 : : {
430 : : /* The highest category represents exactly MaxFSMRequestSize bytes. */
431 [ + + ]: 1967 : if (cat == 255)
432 : 1200 : return MaxFSMRequestSize;
433 : : else
434 : 767 : return cat * FSM_CAT_STEP;
435 : : }
436 : :
437 : : /*
438 : : * Which category does a page need to have, to accommodate x bytes of data?
439 : : * While fsm_space_avail_to_cat() rounds down, this needs to round up.
440 : : */
441 : : static uint8
442 : 254915 : fsm_space_needed_to_cat(Size needed)
443 : : {
444 : : int cat;
445 : :
446 : : /* Can't ask for more space than the highest category represents */
447 [ - + ]: 254915 : if (needed > MaxFSMRequestSize)
4485 tgl@sss.pgh.pa.us 448 [ # # ]:UBC 0 : elog(ERROR, "invalid FSM request size %zu", needed);
449 : :
6426 heikki.linnakangas@i 450 [ - + ]:CBC 254915 : if (needed == 0)
6426 heikki.linnakangas@i 451 :UBC 0 : return 1;
452 : :
6426 heikki.linnakangas@i 453 :CBC 254915 : cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
454 : :
455 [ - + ]: 254915 : if (cat > 255)
6426 heikki.linnakangas@i 456 :UBC 0 : cat = 255;
457 : :
6426 heikki.linnakangas@i 458 :CBC 254915 : return (uint8) cat;
459 : : }
460 : :
461 : : /*
462 : : * Returns the physical block number of a FSM page
463 : : */
464 : : static BlockNumber
465 : 867461 : fsm_logical_to_physical(FSMAddress addr)
466 : : {
467 : : BlockNumber pages;
468 : : int leafno;
469 : : int l;
470 : :
471 : : /*
472 : : * Calculate the logical page number of the first leaf page below the
473 : : * given page.
474 : : */
475 : 867461 : leafno = addr.logpageno;
476 [ + + ]: 1390053 : for (l = 0; l < addr.level; l++)
477 : 522592 : leafno *= SlotsPerFSMPage;
478 : :
479 : : /* Count upper level nodes required to address the leaf page */
480 : 867461 : pages = 0;
481 [ + + ]: 3469844 : for (l = 0; l < FSM_TREE_DEPTH; l++)
482 : : {
483 : 2602383 : pages += leafno + 1;
484 : 2602383 : leafno /= SlotsPerFSMPage;
485 : : }
486 : :
487 : : /*
488 : : * If the page we were asked for wasn't at the bottom level, subtract the
489 : : * additional lower level pages we counted above.
490 : : */
491 : 867461 : pages -= addr.level;
492 : :
493 : : /* Turn the page count into 0-based block number */
494 : 867461 : return pages - 1;
495 : : }
496 : :
497 : : /*
498 : : * Return the FSM location corresponding to given heap block.
499 : : */
500 : : static FSMAddress
501 : 599525 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
502 : : {
503 : : FSMAddress addr;
504 : :
505 : 599525 : addr.level = FSM_BOTTOM_LEVEL;
506 : 599525 : addr.logpageno = heapblk / SlotsPerFSMPage;
507 : 599525 : *slot = heapblk % SlotsPerFSMPage;
508 : :
509 : 599525 : return addr;
510 : : }
511 : :
512 : : /*
513 : : * Return the heap block number corresponding to given location in the FSM.
514 : : */
515 : : static BlockNumber
516 : 34535 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
517 : : {
518 [ - + ]: 34535 : Assert(addr.level == FSM_BOTTOM_LEVEL);
519 : 34535 : return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
520 : : }
521 : :
522 : : /*
523 : : * Given a logical address of a child page, get the logical page number of
524 : : * the parent, and the slot within the parent corresponding to the child.
525 : : */
526 : : static FSMAddress
527 : 50840 : fsm_get_parent(FSMAddress child, uint16 *slot)
528 : : {
529 : : FSMAddress parent;
530 : :
531 [ - + ]: 50840 : Assert(child.level < FSM_ROOT_LEVEL);
532 : :
533 : 50840 : parent.level = child.level + 1;
534 : 50840 : parent.logpageno = child.logpageno / SlotsPerFSMPage;
535 : 50840 : *slot = child.logpageno % SlotsPerFSMPage;
536 : :
537 : 50840 : return parent;
538 : : }
539 : :
540 : : /*
541 : : * Given a logical address of a parent page and a slot number, get the
542 : : * logical address of the corresponding child page.
543 : : */
544 : : static FSMAddress
545 : 51029 : fsm_get_child(FSMAddress parent, uint16 slot)
546 : : {
547 : : FSMAddress child;
548 : :
549 [ - + ]: 51029 : Assert(parent.level > FSM_BOTTOM_LEVEL);
550 : :
551 : 51029 : child.level = parent.level - 1;
552 : 51029 : child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
553 : :
554 : 51029 : return child;
555 : : }
556 : :
557 : : /*
558 : : * Read a FSM page.
559 : : *
560 : : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
561 : : * true, the FSM file is extended.
562 : : */
563 : : static Buffer
564 : 558211 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
565 : : {
566 : 558211 : BlockNumber blkno = fsm_logical_to_physical(addr);
567 : : Buffer buf;
825 568 : 558211 : SMgrRelation reln = RelationGetSmgr(rel);
569 : :
570 : : /*
571 : : * If we haven't cached the size of the FSM yet, check it first. Also
572 : : * recheck if the requested block seems to be past end, since our cached
573 : : * value might be stale. (We send smgr inval messages on truncation, but
574 : : * not on extension.)
575 : : */
1758 tgl@sss.pgh.pa.us 576 [ + + ]: 558211 : if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
577 [ + + ]: 478948 : blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
578 : : {
579 : : /* Invalidate the cache so smgrnblocks asks the kernel. */
580 : 119484 : reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
581 [ + + ]: 119484 : if (smgrexists(reln, FSM_FORKNUM))
582 : 43800 : smgrnblocks(reln, FSM_FORKNUM);
583 : : else
584 : 75684 : reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
585 : : }
586 : :
587 : : /*
588 : : * For reading we use ZERO_ON_ERROR mode, and initialize the page if
589 : : * necessary. The FSM information is not accurate anyway, so it's better
590 : : * to clear corrupt pages than error out. Since the FSM changes are not
591 : : * WAL-logged, the so-called torn page problem on crash can lead to pages
592 : : * with corrupt headers, for example.
593 : : *
594 : : * We use the same path below to initialize pages when extending the
595 : : * relation, as a concurrent extension can end up with vm_extend()
596 : : * returning an already-initialized page.
597 : : */
598 [ + + ]: 558211 : if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
599 : : {
6426 heikki.linnakangas@i 600 [ + + ]: 76705 : if (extend)
1126 andres@anarazel.de 601 : 4898 : buf = fsm_extend(rel, blkno + 1);
602 : : else
6426 heikki.linnakangas@i 603 : 71807 : return InvalidBuffer;
604 : : }
605 : : else
1126 andres@anarazel.de 606 : 481506 : buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
607 : :
608 : : /*
609 : : * Initializing the page when needed is trickier than it looks, because of
610 : : * the possibility of multiple backends doing this concurrently, and our
611 : : * desire to not uselessly take the buffer lock in the normal path where
612 : : * the page is OK. We must take the lock to initialize the page, so
613 : : * recheck page newness after we have the lock, in case someone else
614 : : * already did it. Also, because we initially check PageIsNew with no
615 : : * lock, it's possible to fall through and return the buffer while someone
616 : : * else is still initializing the page (i.e., we might see pd_upper as set
617 : : * but other page header fields are still zeroes). This is harmless for
618 : : * callers that will take a buffer lock themselves, but some callers
619 : : * inspect the page without any lock at all. The latter is OK only so
620 : : * long as it doesn't depend on the page header having correct contents.
621 : : * Current usage is safe because PageGetContents() does not require that.
622 : : */
3667 kgrittn@postgresql.o 623 [ + + ]: 486404 : if (PageIsNew(BufferGetPage(buf)))
624 : : {
2853 tgl@sss.pgh.pa.us 625 : 16535 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
626 [ + - ]: 16535 : if (PageIsNew(BufferGetPage(buf)))
627 : 16535 : PageInit(BufferGetPage(buf), BLCKSZ, 0);
628 : 16535 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
629 : : }
6395 heikki.linnakangas@i 630 : 486404 : return buf;
631 : : }
632 : :
633 : : /*
634 : : * Ensure that the FSM fork is at least fsm_nblocks long, extending
635 : : * it if necessary with empty pages. And by empty, I mean pages filled
636 : : * with zeros, meaning there's no free space.
637 : : */
638 : : static Buffer
6369 639 : 4898 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
640 : : {
986 tmunro@postgresql.or 641 : 4898 : return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
642 : : EB_CREATE_FORK_IF_NEEDED |
643 : : EB_CLEAR_SIZE_CACHE,
644 : : fsm_nblocks,
645 : : RBM_ZERO_ON_ERROR);
646 : : }
647 : :
648 : : /*
649 : : * Set value in given FSM page and slot.
650 : : *
651 : : * If minValue > 0, the updated page is also searched for a page with at
652 : : * least minValue of free space. If one is found, its slot number is
653 : : * returned, -1 otherwise.
654 : : */
655 : : static int
6426 heikki.linnakangas@i 656 : 258872 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
657 : : uint8 newValue, uint8 minValue)
658 : : {
659 : : Buffer buf;
660 : : Page page;
661 : 258872 : int newslot = -1;
662 : :
663 : 258872 : buf = fsm_readbuf(rel, addr, true);
664 : 258872 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
665 : :
3667 kgrittn@postgresql.o 666 : 258872 : page = BufferGetPage(buf);
667 : :
6426 heikki.linnakangas@i 668 [ + + ]: 258872 : if (fsm_set_avail(page, slot, newValue))
4705 jdavis@postgresql.or 669 : 135488 : MarkBufferDirtyHint(buf, false);
670 : :
6426 heikki.linnakangas@i 671 [ + + ]: 258872 : if (minValue != 0)
672 : : {
673 : : /* Search while we still hold the lock */
674 : 144921 : newslot = fsm_search_avail(buf, minValue,
675 : 144921 : addr.level == FSM_BOTTOM_LEVEL,
676 : : true);
677 : : }
678 : :
679 : 258872 : UnlockReleaseBuffer(buf);
680 : :
681 : 258872 : return newslot;
682 : : }
683 : :
684 : : /*
685 : : * Search the tree for a heap page with at least min_cat of free space
686 : : */
687 : : static BlockNumber
688 : 235411 : fsm_search(Relation rel, uint8 min_cat)
689 : : {
6172 bruce@momjian.us 690 : 235411 : int restarts = 0;
691 : 235411 : FSMAddress addr = FSM_ROOT_ADDRESS;
692 : :
693 : : for (;;)
9073 tgl@sss.pgh.pa.us 694 : 36802 : {
695 : : int slot;
696 : : Buffer buf;
6172 bruce@momjian.us 697 : 272213 : uint8 max_avail = 0;
698 : :
699 : : /* Read the FSM page. */
6368 heikki.linnakangas@i 700 : 272213 : buf = fsm_readbuf(rel, addr, false);
701 : :
702 : : /* Search within the page */
6426 703 [ + + ]: 272213 : if (BufferIsValid(buf))
704 : : {
705 : 201446 : LockBuffer(buf, BUFFER_LOCK_SHARE);
706 : 201446 : slot = fsm_search_avail(buf, min_cat,
707 : 201446 : (addr.level == FSM_BOTTOM_LEVEL),
708 : : false);
709 [ + + ]: 201446 : if (slot == -1)
710 : : {
3667 kgrittn@postgresql.o 711 : 152309 : max_avail = fsm_get_max_avail(BufferGetPage(buf));
752 noah@leadboat.com 712 : 152309 : UnlockReleaseBuffer(buf);
713 : : }
714 : : else
715 : : {
716 : : /* Keep the pin for possible update below */
717 : 49137 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
718 : : }
719 : : }
720 : : else
6426 heikki.linnakangas@i 721 : 70767 : slot = -1;
722 : :
723 [ + + ]: 272213 : if (slot != -1)
724 : : {
725 : : /*
726 : : * Descend the tree, or return the found block if we're at the
727 : : * bottom.
728 : : */
729 [ + + ]: 49137 : if (addr.level == FSM_BOTTOM_LEVEL)
730 : : {
752 noah@leadboat.com 731 : 15031 : BlockNumber blkno = fsm_get_heap_blk(addr, slot);
732 : : Page page;
733 : :
734 [ + - ]: 15031 : if (fsm_does_block_exist(rel, blkno))
735 : : {
736 : 15031 : ReleaseBuffer(buf);
737 : 15031 : return blkno;
738 : : }
739 : :
740 : : /*
741 : : * Block is past the end of the relation. Update FSM, and
742 : : * restart from root. The usual "advancenext" behavior is
743 : : * pessimal for this rare scenario, since every later slot is
744 : : * unusable in the same way. We could zero all affected slots
745 : : * on the same FSM page, but don't bet on the benefits of that
746 : : * optimization justifying its compiled code bulk.
747 : : */
752 noah@leadboat.com 748 :UBC 0 : page = BufferGetPage(buf);
749 : 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
750 : 0 : fsm_set_avail(page, slot, 0);
751 : 0 : MarkBufferDirtyHint(buf, false);
752 : 0 : UnlockReleaseBuffer(buf);
753 [ # # ]: 0 : if (restarts++ > 10000) /* same rationale as below */
754 : 0 : return InvalidBlockNumber;
755 : 0 : addr = FSM_ROOT_ADDRESS;
756 : : }
757 : : else
758 : : {
752 noah@leadboat.com 759 :CBC 34106 : ReleaseBuffer(buf);
760 : : }
6426 heikki.linnakangas@i 761 : 34106 : addr = fsm_get_child(addr, slot);
762 : : }
763 [ + + ]: 223076 : else if (addr.level == FSM_ROOT_LEVEL)
764 : : {
765 : : /*
766 : : * At the root, failure means there's no page with enough free
767 : : * space in the FSM. Give up.
768 : : */
769 : 220380 : return InvalidBlockNumber;
770 : : }
771 : : else
772 : : {
773 : : uint16 parentslot;
774 : : FSMAddress parent;
775 : :
776 : : /*
777 : : * At lower level, failure can happen if the value in the upper-
778 : : * level node didn't reflect the value on the lower page. Update
779 : : * the upper node, to avoid falling into the same trap again, and
780 : : * start over.
781 : : *
782 : : * There's a race condition here, if another backend updates this
783 : : * page right after we release it, and gets the lock on the parent
784 : : * page before us. We'll then update the parent page with the now
785 : : * stale information we had. It's OK, because it should happen
786 : : * rarely, and will be fixed by the next vacuum.
787 : : */
788 : 2696 : parent = fsm_get_parent(addr, &parentslot);
789 : 2696 : fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
790 : :
791 : : /*
792 : : * If the upper pages are badly out of date, we might need to loop
793 : : * quite a few times, updating them as we go. Any inconsistencies
794 : : * should eventually be corrected and the loop should end. Looping
795 : : * indefinitely is nevertheless scary, so provide an emergency
796 : : * valve.
797 : : */
798 [ - + ]: 2696 : if (restarts++ > 10000)
6426 heikki.linnakangas@i 799 :UBC 0 : return InvalidBlockNumber;
800 : :
801 : : /* Start search all over from the root */
6426 heikki.linnakangas@i 802 :CBC 2696 : addr = FSM_ROOT_ADDRESS;
803 : : }
804 : : }
805 : : }
806 : :
807 : :
808 : : /*
809 : : * Recursive guts of FreeSpaceMapVacuum
810 : : *
811 : : * Examine the FSM page indicated by addr, as well as its children, updating
812 : : * upper-level nodes that cover the heap block range from start to end-1.
813 : : * (It's okay if end is beyond the actual end of the map.)
814 : : * Return the maximum freespace value on this page.
815 : : *
816 : : * If addr is past the end of the FSM, set *eof_p to true and return 0.
817 : : *
818 : : * This traverses the tree in depth-first order. The tree is stored
819 : : * physically in depth-first order, so this should be pretty I/O efficient.
820 : : */
821 : : static uint8
2959 tgl@sss.pgh.pa.us 822 : 24969 : fsm_vacuum_page(Relation rel, FSMAddress addr,
823 : : BlockNumber start, BlockNumber end,
824 : : bool *eof_p)
825 : : {
826 : : Buffer buf;
827 : : Page page;
828 : : uint8 max_avail;
829 : :
830 : : /* Read the page if it exists, or return EOF */
6426 heikki.linnakangas@i 831 : 24969 : buf = fsm_readbuf(rel, addr, false);
832 [ + + ]: 24969 : if (!BufferIsValid(buf))
833 : : {
834 : 1004 : *eof_p = true;
835 : 1004 : return 0;
836 : : }
837 : : else
838 : 23965 : *eof_p = false;
839 : :
3667 kgrittn@postgresql.o 840 : 23965 : page = BufferGetPage(buf);
841 : :
842 : : /*
843 : : * If we're above the bottom level, recurse into children, and fix the
844 : : * information stored about them at this level.
845 : : */
6426 heikki.linnakangas@i 846 [ + + ]: 23965 : if (addr.level > FSM_BOTTOM_LEVEL)
847 : : {
848 : : FSMAddress fsm_start,
849 : : fsm_end;
850 : : uint16 fsm_start_slot,
851 : : fsm_end_slot;
852 : : int slot,
853 : : start_slot,
854 : : end_slot;
6172 bruce@momjian.us 855 : 16048 : bool eof = false;
856 : :
857 : : /*
858 : : * Compute the range of slots we need to update on this page, given
859 : : * the requested range of heap blocks to consider. The first slot to
860 : : * update is the one covering the "start" block, and the last slot is
861 : : * the one covering "end - 1". (Some of this work will be duplicated
862 : : * in each recursive call, but it's cheap enough to not worry about.)
863 : : */
2959 tgl@sss.pgh.pa.us 864 : 16048 : fsm_start = fsm_get_location(start, &fsm_start_slot);
865 : 16048 : fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
866 : :
867 [ + + ]: 40120 : while (fsm_start.level < addr.level)
868 : : {
869 : 24072 : fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
870 : 24072 : fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
871 : : }
872 [ - + ]: 16048 : Assert(fsm_start.level == addr.level);
873 : :
874 [ + - ]: 16048 : if (fsm_start.logpageno == addr.logpageno)
875 : 16048 : start_slot = fsm_start_slot;
2959 tgl@sss.pgh.pa.us 876 [ # # ]:UBC 0 : else if (fsm_start.logpageno > addr.logpageno)
877 : 0 : start_slot = SlotsPerFSMPage; /* shouldn't get here... */
878 : : else
879 : 0 : start_slot = 0;
880 : :
2959 tgl@sss.pgh.pa.us 881 [ + + ]:CBC 16048 : if (fsm_end.logpageno == addr.logpageno)
882 : 15557 : end_slot = fsm_end_slot;
883 [ + - ]: 491 : else if (fsm_end.logpageno > addr.logpageno)
884 : 491 : end_slot = SlotsPerFSMPage - 1;
885 : : else
2959 tgl@sss.pgh.pa.us 886 :UBC 0 : end_slot = -1; /* shouldn't get here... */
887 : :
2959 tgl@sss.pgh.pa.us 888 [ + + ]:CBC 2156653 : for (slot = start_slot; slot <= end_slot; slot++)
889 : : {
890 : : int child_avail;
891 : :
5929 892 [ - + ]: 2140605 : CHECK_FOR_INTERRUPTS();
893 : :
894 : : /* After we hit end-of-file, just clear the rest of the slots */
6426 heikki.linnakangas@i 895 [ + + ]: 2140605 : if (!eof)
2959 tgl@sss.pgh.pa.us 896 : 16923 : child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
897 : : start, end,
898 : : &eof);
899 : : else
6426 heikki.linnakangas@i 900 : 2123682 : child_avail = 0;
901 : :
902 : : /* Update information about the child */
903 [ + + ]: 2140605 : if (fsm_get_avail(page, slot) != child_avail)
904 : : {
905 : 9544 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2959 tgl@sss.pgh.pa.us 906 : 9544 : fsm_set_avail(page, slot, child_avail);
4705 jdavis@postgresql.or 907 : 9544 : MarkBufferDirtyHint(buf, false);
6426 heikki.linnakangas@i 908 : 9544 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
909 : : }
910 : : }
911 : : }
912 : :
913 : : /* Now get the maximum value on the page, to return to caller */
2959 tgl@sss.pgh.pa.us 914 : 23965 : max_avail = fsm_get_max_avail(page);
915 : :
916 : : /*
917 : : * Try to reset the next slot pointer. This encourages the use of
918 : : * low-numbered pages, increasing the chances that a later vacuum can
919 : : * truncate the relation. We don't bother with marking the page dirty if
920 : : * it wasn't already, since this is just a hint.
921 : : */
113 andres@anarazel.de 922 :GNC 23965 : LockBuffer(buf, BUFFER_LOCK_SHARE);
56 923 [ + - ]: 23965 : if (BufferBeginSetHintBits(buf))
924 : : {
925 : 23965 : ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
926 : 23965 : BufferFinishSetHintBits(buf, false, false);
927 : : }
928 : :
39 929 : 23965 : UnlockReleaseBuffer(buf);
930 : :
6426 heikki.linnakangas@i 931 :CBC 23965 : return max_avail;
932 : : }
933 : :
934 : :
935 : : /*
936 : : * Check whether a block number is past the end of the relation. This can
937 : : * happen after WAL replay, if the FSM reached disk but newly-extended pages
938 : : * it refers to did not.
939 : : */
940 : : static bool
752 noah@leadboat.com 941 : 34535 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
942 : : {
943 : 34535 : SMgrRelation smgr = RelationGetSmgr(rel);
944 : :
945 : : /*
946 : : * If below the cached nblocks, the block surely exists. Otherwise, we
947 : : * face a trade-off. We opt to compare to a fresh nblocks, incurring
948 : : * lseek() overhead. The alternative would be to assume the block does
949 : : * not exist, but that would cause FSM to set zero space available for
950 : : * blocks that main fork extension just recorded.
951 : : */
952 : 34535 : return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
953 [ + + + + : 47215 : blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+ - ]
954 : 12680 : blknumber < RelationGetNumberOfBlocks(rel));
955 : : }
|