Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtpage.c
4 : : * BTree-specific page management code for the Postgres btree access
5 : : * method.
6 : : *
7 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : *
11 : : * IDENTIFICATION
12 : : * src/backend/access/nbtree/nbtpage.c
13 : : *
14 : : * NOTES
15 : : * Postgres btree pages look like ordinary relation pages. The opaque
16 : : * data at high addresses includes pointers to left and right siblings
17 : : * and flag data describing page state. The first page in a btree, page
18 : : * zero, is special -- it stores meta-information describing the tree.
19 : : * Pages one and higher store the actual tree data.
20 : : *
21 : : *-------------------------------------------------------------------------
22 : : */
23 : : #include "postgres.h"
24 : :
25 : : #include "access/nbtree.h"
26 : : #include "access/nbtxlog.h"
27 : : #include "access/tableam.h"
28 : : #include "access/transam.h"
29 : : #include "access/xlog.h"
30 : : #include "access/xloginsert.h"
31 : : #include "common/int.h"
32 : : #include "miscadmin.h"
33 : : #include "storage/indexfsm.h"
34 : : #include "storage/predicate.h"
35 : : #include "storage/procarray.h"
36 : : #include "utils/injection_point.h"
37 : : #include "utils/memdebug.h"
38 : : #include "utils/memutils.h"
39 : : #include "utils/snapmgr.h"
40 : :
41 : : static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
42 : : static void _bt_delitems_delete(Relation rel, Buffer buf,
43 : : TransactionId snapshotConflictHorizon,
44 : : bool isCatalogRel,
45 : : OffsetNumber *deletable, int ndeletable,
46 : : BTVacuumPosting *updatable, int nupdatable);
47 : : static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
48 : : OffsetNumber *updatedoffsets,
49 : : Size *updatedbuflen, bool needswal);
50 : : static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
51 : : Buffer leafbuf, BTStack stack);
52 : : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
53 : : BlockNumber scanblkno,
54 : : bool *rightsib_empty,
55 : : BTVacState *vstate);
56 : : static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
57 : : BlockNumber child, BTStack stack,
58 : : Buffer *subtreeparent, OffsetNumber *poffset,
59 : : BlockNumber *topparent,
60 : : BlockNumber *topparentrightsib);
61 : : static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
62 : : FullTransactionId safexid);
63 : :
64 : : /*
65 : : * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
66 : : */
67 : : void
2209 pg@bowt.ie 68 :CBC 26292 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
69 : : bool allequalimage)
70 : : {
71 : : BTMetaPageData *metad;
72 : : BTPageOpaque metaopaque;
73 : :
7956 tgl@sss.pgh.pa.us 74 : 26292 : _bt_pageinit(page, BLCKSZ);
75 : :
76 : 26292 : metad = BTPageGetMeta(page);
77 : 26292 : metad->btm_magic = BTREE_MAGIC;
78 : 26292 : metad->btm_version = BTREE_VERSION;
79 : 26292 : metad->btm_root = rootbknum;
80 : 26292 : metad->btm_level = level;
81 : 26292 : metad->btm_fastroot = rootbknum;
82 : 26292 : metad->btm_fastlevel = level;
1845 pg@bowt.ie 83 : 26292 : metad->btm_last_cleanup_num_delpages = 0;
2902 teodor@sigaev.ru 84 : 26292 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
2209 pg@bowt.ie 85 : 26292 : metad->btm_allequalimage = allequalimage;
86 : :
1444 michael@paquier.xyz 87 : 26292 : metaopaque = BTPageGetOpaque(page);
7956 tgl@sss.pgh.pa.us 88 : 26292 : metaopaque->btpo_flags = BTP_META;
89 : :
90 : : /*
91 : : * Set pd_lower just past the end of the metadata. This is essential,
92 : : * because without doing so, metadata will be lost if xlog.c compresses
93 : : * the page.
94 : : */
7591 95 : 26292 : ((PageHeader) page)->pd_lower =
96 : 26292 : ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
7956 97 : 26292 : }
98 : :
99 : : /*
100 : : * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
101 : : * 3, the last version that can be updated without broadly affecting
102 : : * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
103 : : *
104 : : * This routine does purely in-memory image upgrade. Caller is
105 : : * responsible for locking, WAL-logging etc.
106 : : */
107 : : void
2902 teodor@sigaev.ru 108 :UBC 0 : _bt_upgrademetapage(Page page)
109 : : {
110 : : BTMetaPageData *metad;
111 : : BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
112 : :
113 : 0 : metad = BTPageGetMeta(page);
1444 michael@paquier.xyz 114 : 0 : metaopaque = BTPageGetOpaque(page);
115 : :
116 : : /* It must be really a meta page of upgradable version */
2902 teodor@sigaev.ru 117 [ # # ]: 0 : Assert(metaopaque->btpo_flags & BTP_META);
2552 pg@bowt.ie 118 [ # # ]: 0 : Assert(metad->btm_version < BTREE_NOVAC_VERSION);
2902 teodor@sigaev.ru 119 [ # # ]: 0 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
120 : :
121 : : /* Set version number and fill extra fields added into version 3 */
2552 pg@bowt.ie 122 : 0 : metad->btm_version = BTREE_NOVAC_VERSION;
1845 123 : 0 : metad->btm_last_cleanup_num_delpages = 0;
2902 teodor@sigaev.ru 124 : 0 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
125 : : /* Only a REINDEX can set this field */
2209 pg@bowt.ie 126 [ # # ]: 0 : Assert(!metad->btm_allequalimage);
127 : 0 : metad->btm_allequalimage = false;
128 : :
129 : : /* Adjust pd_lower (see _bt_initmetapage() for details) */
2902 teodor@sigaev.ru 130 : 0 : ((PageHeader) page)->pd_lower =
131 : 0 : ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
132 : 0 : }
133 : :
134 : : /*
135 : : * Get metadata from share-locked buffer containing metapage, while performing
136 : : * standard sanity checks.
137 : : *
138 : : * Callers that cache data returned here in local cache should note that an
139 : : * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
140 : : * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
141 : : */
142 : : static BTMetaPageData *
2552 pg@bowt.ie 143 :CBC 1137284 : _bt_getmeta(Relation rel, Buffer metabuf)
144 : : {
145 : : Page metapg;
146 : : BTPageOpaque metaopaque;
147 : : BTMetaPageData *metad;
148 : :
149 : 1137284 : metapg = BufferGetPage(metabuf);
1444 michael@paquier.xyz 150 : 1137284 : metaopaque = BTPageGetOpaque(metapg);
2552 pg@bowt.ie 151 : 1137284 : metad = BTPageGetMeta(metapg);
152 : :
153 : : /* sanity-check the metapage */
154 [ + - ]: 1137284 : if (!P_ISMETA(metaopaque) ||
155 [ - + ]: 1137284 : metad->btm_magic != BTREE_MAGIC)
2552 pg@bowt.ie 156 [ # # ]:UBC 0 : ereport(ERROR,
157 : : (errcode(ERRCODE_INDEX_CORRUPTED),
158 : : errmsg("index \"%s\" is not a btree",
159 : : RelationGetRelationName(rel))));
160 : :
2552 pg@bowt.ie 161 [ + - ]:CBC 1137284 : if (metad->btm_version < BTREE_MIN_VERSION ||
162 [ - + ]: 1137284 : metad->btm_version > BTREE_VERSION)
2552 pg@bowt.ie 163 [ # # ]:UBC 0 : ereport(ERROR,
164 : : (errcode(ERRCODE_INDEX_CORRUPTED),
165 : : errmsg("version mismatch in index \"%s\": file version %d, "
166 : : "current version %d, minimal supported version %d",
167 : : RelationGetRelationName(rel),
168 : : metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
169 : :
2552 pg@bowt.ie 170 :CBC 1137284 : return metad;
171 : : }
172 : :
173 : : /*
174 : : * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
175 : : *
176 : : * Called by btvacuumcleanup when btbulkdelete was never called because no
177 : : * index tuples needed to be deleted.
178 : : */
179 : : bool
1009 180 : 17921 : _bt_vacuum_needs_cleanup(Relation rel)
181 : : {
182 : : Buffer metabuf;
183 : : Page metapg;
184 : : BTMetaPageData *metad;
185 : : uint32 btm_version;
186 : : BlockNumber prev_num_delpages;
187 : :
188 : : /*
189 : : * Copy details from metapage to local variables quickly.
190 : : *
191 : : * Note that we deliberately avoid using cached version of metapage here.
192 : : */
193 : 17921 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
1829 194 : 17921 : metapg = BufferGetPage(metabuf);
195 : 17921 : metad = BTPageGetMeta(metapg);
196 : 17921 : btm_version = metad->btm_version;
197 : :
198 [ - + ]: 17921 : if (btm_version < BTREE_NOVAC_VERSION)
199 : : {
200 : : /*
201 : : * Metapage needs to be dynamically upgraded to store fields that are
202 : : * only present when btm_version >= BTREE_NOVAC_VERSION
203 : : */
1829 pg@bowt.ie 204 :UBC 0 : _bt_relbuf(rel, metabuf);
205 : 0 : return true;
206 : : }
207 : :
1829 pg@bowt.ie 208 :CBC 17921 : prev_num_delpages = metad->btm_last_cleanup_num_delpages;
209 : 17921 : _bt_relbuf(rel, metabuf);
210 : :
211 : : /*
212 : : * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
213 : : * total size of the index. We can reasonably expect (though are not
214 : : * guaranteed) to be able to recycle this many pages if we decide to do a
215 : : * btvacuumscan call during the ongoing btvacuumcleanup. For further
216 : : * details see the nbtree/README section on placing deleted pages in the
217 : : * FSM.
218 : : */
219 [ + + ]: 17921 : if (prev_num_delpages > 0 &&
220 [ + - ]: 8 : prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
221 : 8 : return true;
222 : :
223 : 17913 : return false;
224 : : }
225 : :
226 : : /*
227 : : * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
228 : : *
229 : : * Called at the end of btvacuumcleanup, when num_delpages value has been
230 : : * finalized.
231 : : */
232 : : void
1009 233 : 1311 : _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
234 : : {
235 : : Buffer metabuf;
236 : : Page metapg;
237 : : BTMetaPageData *metad;
238 : : XLogRecPtr recptr;
239 : :
240 : : /*
241 : : * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
242 : : * field started out as a TransactionId field called btm_oldest_btpo_xact.
243 : : * Both "versions" are just uint32 fields. It was convenient to repurpose
244 : : * the field when we began to use 64-bit XIDs in deleted pages.
245 : : *
246 : : * It's possible that a pg_upgrade'd database will contain an XID value in
247 : : * what is now recognized as the metapage's btm_last_cleanup_num_delpages
248 : : * field. _bt_vacuum_needs_cleanup() may even believe that this value
249 : : * indicates that there are lots of pages that it needs to recycle, when
250 : : * in reality there are only one or two. The worst that can happen is
251 : : * that there will be a call to btvacuumscan a little earlier, which will
252 : : * set btm_last_cleanup_num_delpages to a sane value when we're called.
253 : : *
254 : : * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
255 : : * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
256 : : * to be consistent.
257 : : */
258 : 1311 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
2902 teodor@sigaev.ru 259 : 1311 : metapg = BufferGetPage(metabuf);
260 : 1311 : metad = BTPageGetMeta(metapg);
261 : :
262 : : /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
1831 pg@bowt.ie 263 [ + - ]: 1311 : if (metad->btm_version >= BTREE_NOVAC_VERSION &&
264 [ + + ]: 1311 : metad->btm_last_cleanup_num_delpages == num_delpages)
265 : : {
266 : : /* Usually means index continues to have num_delpages of 0 */
2902 teodor@sigaev.ru 267 : 1205 : _bt_relbuf(rel, metabuf);
268 : 1205 : return;
269 : : }
270 : :
271 : : /* trade in our read lock for a write lock */
2063 pg@bowt.ie 272 : 106 : _bt_unlockbuf(rel, metabuf);
273 : 106 : _bt_lockbuf(rel, metabuf, BT_WRITE);
274 : :
2902 teodor@sigaev.ru 275 : 106 : START_CRIT_SECTION();
276 : :
277 : : /* upgrade meta-page if needed */
2552 pg@bowt.ie 278 [ - + ]: 106 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2902 teodor@sigaev.ru 279 :UBC 0 : _bt_upgrademetapage(metapg);
280 : :
281 : : /* update cleanup-related information */
1845 pg@bowt.ie 282 :CBC 106 : metad->btm_last_cleanup_num_delpages = num_delpages;
1831 283 : 106 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
2902 teodor@sigaev.ru 284 : 106 : MarkBufferDirty(metabuf);
285 : :
286 : : /* write wal record if needed */
287 [ + - + + : 106 : if (RelationNeedsWAL(rel))
+ - + - ]
2902 teodor@sigaev.ru 288 :GIC 106 : {
289 : : xl_btree_metadata md;
290 : :
2902 teodor@sigaev.ru 291 :CBC 106 : XLogBeginInsert();
292 : 106 : XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
293 : :
2552 pg@bowt.ie 294 [ - + ]: 106 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
295 : 106 : md.version = metad->btm_version;
2902 teodor@sigaev.ru 296 : 106 : md.root = metad->btm_root;
297 : 106 : md.level = metad->btm_level;
298 : 106 : md.fastroot = metad->btm_fastroot;
299 : 106 : md.fastlevel = metad->btm_fastlevel;
1845 pg@bowt.ie 300 : 106 : md.last_cleanup_num_delpages = num_delpages;
2209 301 : 106 : md.allequalimage = metad->btm_allequalimage;
302 : :
397 peter@eisentraut.org 303 : 106 : XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
304 : :
2902 teodor@sigaev.ru 305 : 106 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
306 : : }
307 : : else
2 pg@bowt.ie 308 :UNC 0 : recptr = XLogGetFakeLSN(rel);
309 : :
2 pg@bowt.ie 310 :GNC 106 : PageSetLSN(metapg, recptr);
311 : :
2902 teodor@sigaev.ru 312 [ - + ]:CBC 106 : END_CRIT_SECTION();
313 : :
314 : 106 : _bt_relbuf(rel, metabuf);
315 : : }
316 : :
317 : : /*
318 : : * _bt_getroot() -- Get the root page of the btree.
319 : : *
320 : : * Since the root page can move around the btree file, we have to read
321 : : * its location from the metadata page, and then read the root page
322 : : * itself. If no root page exists yet, we have to create one.
323 : : *
324 : : * The access type parameter (BT_READ or BT_WRITE) controls whether
325 : : * a new root page will be created or not. If access = BT_READ,
326 : : * and no root page exists, we just return InvalidBuffer. For
327 : : * BT_WRITE, we try to create the root page if it doesn't exist.
328 : : * NOTE that the returned root page will have only a read lock set
329 : : * on it even if access = BT_WRITE!
330 : : *
331 : : * If access = BT_WRITE, heaprel must be set; otherwise caller can just
332 : : * pass NULL. See _bt_allocbuf for an explanation.
333 : : *
334 : : * The returned page is not necessarily the true root --- it could be
335 : : * a "fast root" (a page that is alone in its level due to deletions).
336 : : * Also, if the root page is split while we are "in flight" to it,
337 : : * what we will return is the old root, which is now just the leftmost
338 : : * page on a probably-not-very-wide level. For most purposes this is
339 : : * as good as or better than the true root, so we do not bother to
340 : : * insist on finding the true root. We do, however, guarantee to
341 : : * return a live (not deleted or half-dead) page.
342 : : *
343 : : * On successful return, the root page is pinned and read-locked.
344 : : * The metadata page is not locked or pinned on exit.
345 : : */
346 : : Buffer
1079 andres@anarazel.de 347 : 12935701 : _bt_getroot(Relation rel, Relation heaprel, int access)
348 : : {
349 : : Buffer metabuf;
350 : : Buffer rootbuf;
351 : : Page rootpage;
352 : : BTPageOpaque rootopaque;
353 : : BlockNumber rootblkno;
354 : : uint32 rootlevel;
355 : : BTMetaPageData *metad;
356 : : XLogRecPtr recptr;
357 : :
1009 pg@bowt.ie 358 [ + + - + ]: 12935701 : Assert(access == BT_READ || heaprel != NULL);
359 : :
360 : : /*
361 : : * Try to use previously-cached metapage data to find the root. This
362 : : * normally saves one buffer access per index search, which is a very
363 : : * helpful savings in bufmgr traffic and hence contention.
364 : : */
7264 tgl@sss.pgh.pa.us 365 [ + + ]: 12935701 : if (rel->rd_amcache != NULL)
366 : : {
367 : 12636863 : metad = (BTMetaPageData *) rel->rd_amcache;
368 : : /* We shouldn't have cached it if any of these fail */
369 [ - + ]: 12636863 : Assert(metad->btm_magic == BTREE_MAGIC);
2902 teodor@sigaev.ru 370 [ - + ]: 12636863 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
371 [ - + ]: 12636863 : Assert(metad->btm_version <= BTREE_VERSION);
2209 pg@bowt.ie 372 [ + + - + ]: 12636863 : Assert(!metad->btm_allequalimage ||
373 : : metad->btm_version > BTREE_NOVAC_VERSION);
7264 tgl@sss.pgh.pa.us 374 [ - + ]: 12636863 : Assert(metad->btm_root != P_NONE);
375 : :
376 : 12636863 : rootblkno = metad->btm_fastroot;
377 [ - + ]: 12636863 : Assert(rootblkno != P_NONE);
378 : 12636863 : rootlevel = metad->btm_fastlevel;
379 : :
1009 pg@bowt.ie 380 : 12636863 : rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
3616 kgrittn@postgresql.o 381 : 12636863 : rootpage = BufferGetPage(rootbuf);
1444 michael@paquier.xyz 382 : 12636863 : rootopaque = BTPageGetOpaque(rootpage);
383 : :
384 : : /*
385 : : * Since the cache might be stale, we check the page more carefully
386 : : * here than normal. We *must* check that it's not deleted. If it's
387 : : * not alone on its level, then we reject too --- this may be overly
388 : : * paranoid but better safe than sorry. Note we don't check P_ISROOT,
389 : : * because that's not set in a "fast root".
390 : : */
7264 tgl@sss.pgh.pa.us 391 [ + - ]: 12636863 : if (!P_IGNORE(rootopaque) &&
1845 pg@bowt.ie 392 [ + - ]: 12636863 : rootopaque->btpo_level == rootlevel &&
7264 tgl@sss.pgh.pa.us 393 [ + - ]: 12636863 : P_LEFTMOST(rootopaque) &&
394 [ + + ]: 12636863 : P_RIGHTMOST(rootopaque))
395 : : {
396 : : /* OK, accept cached page as the root */
397 : 12486674 : return rootbuf;
398 : : }
399 : 150189 : _bt_relbuf(rel, rootbuf);
400 : : /* Cache is stale, throw it away */
401 [ + - ]: 150189 : if (rel->rd_amcache)
402 : 150189 : pfree(rel->rd_amcache);
403 : 150189 : rel->rd_amcache = NULL;
404 : : }
405 : :
1009 pg@bowt.ie 406 : 449027 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
2432 407 : 449027 : metad = _bt_getmeta(rel, metabuf);
408 : :
409 : : /* if no root page initialized yet, do it */
10416 bruce@momjian.us 410 [ + + ]: 449027 : if (metad->btm_root == P_NONE)
411 : : {
412 : : Page metapg;
413 : :
414 : : /* If access = BT_READ, caller doesn't want us to create root yet */
9368 tgl@sss.pgh.pa.us 415 [ + + ]: 298567 : if (access == BT_READ)
416 : : {
9009 417 : 292188 : _bt_relbuf(rel, metabuf);
9368 418 : 292188 : return InvalidBuffer;
419 : : }
420 : :
421 : : /* trade in our read lock for a write lock */
2063 pg@bowt.ie 422 : 6379 : _bt_unlockbuf(rel, metabuf);
423 : 6379 : _bt_lockbuf(rel, metabuf, BT_WRITE);
424 : :
425 : : /*
426 : : * Race condition: if someone else initialized the metadata between
427 : : * the time we released the read lock and acquired the write lock, we
428 : : * must avoid doing it again.
429 : : */
8422 tgl@sss.pgh.pa.us 430 [ - + ]: 6379 : if (metad->btm_root != P_NONE)
431 : : {
432 : : /*
433 : : * Metadata initialized by someone else. In order to guarantee no
434 : : * deadlocks, we have to release the metadata page and start all
435 : : * over again. (Is that really true? But it's hardly worth trying
436 : : * to optimize this case.)
437 : : */
8422 tgl@sss.pgh.pa.us 438 :UBC 0 : _bt_relbuf(rel, metabuf);
1079 andres@anarazel.de 439 : 0 : return _bt_getroot(rel, heaprel, access);
440 : : }
441 : :
442 : : /*
443 : : * Get, initialize, write, and leave a lock of the appropriate type on
444 : : * the new root page. Since this is the first page in the tree, it's
445 : : * a leaf as well as the root.
446 : : */
1009 pg@bowt.ie 447 :CBC 6379 : rootbuf = _bt_allocbuf(rel, heaprel);
8422 tgl@sss.pgh.pa.us 448 : 6379 : rootblkno = BufferGetBlockNumber(rootbuf);
3616 kgrittn@postgresql.o 449 : 6379 : rootpage = BufferGetPage(rootbuf);
1444 michael@paquier.xyz 450 : 6379 : rootopaque = BTPageGetOpaque(rootpage);
8422 tgl@sss.pgh.pa.us 451 : 6379 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
452 : 6379 : rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
1845 pg@bowt.ie 453 : 6379 : rootopaque->btpo_level = 0;
7251 tgl@sss.pgh.pa.us 454 : 6379 : rootopaque->btpo_cycleid = 0;
455 : : /* Get raw page pointer for metapage */
2432 pg@bowt.ie 456 : 6379 : metapg = BufferGetPage(metabuf);
457 : :
458 : : /* NO ELOG(ERROR) till meta is updated */
8422 tgl@sss.pgh.pa.us 459 : 6379 : START_CRIT_SECTION();
460 : :
461 : : /* upgrade metapage if needed */
2552 pg@bowt.ie 462 [ - + ]: 6379 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2846 teodor@sigaev.ru 463 :UBC 0 : _bt_upgrademetapage(metapg);
464 : :
8422 tgl@sss.pgh.pa.us 465 :CBC 6379 : metad->btm_root = rootblkno;
466 : 6379 : metad->btm_level = 0;
467 : 6379 : metad->btm_fastroot = rootblkno;
468 : 6379 : metad->btm_fastlevel = 0;
1845 pg@bowt.ie 469 : 6379 : metad->btm_last_cleanup_num_delpages = 0;
2902 teodor@sigaev.ru 470 : 6379 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
471 : :
7289 tgl@sss.pgh.pa.us 472 : 6379 : MarkBufferDirty(rootbuf);
473 : 6379 : MarkBufferDirty(metabuf);
474 : :
475 : : /* XLOG stuff */
5571 rhaas@postgresql.org 476 [ + + + + : 6379 : if (RelationNeedsWAL(rel))
+ + + + ]
8422 tgl@sss.pgh.pa.us 477 :GIC 6142 : {
478 : : xl_btree_newroot xlrec;
479 : : xl_btree_metadata md;
480 : :
4133 heikki.linnakangas@i 481 :CBC 6142 : XLogBeginInsert();
482 : 6142 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
3054 tgl@sss.pgh.pa.us 483 : 6142 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
484 : :
2552 pg@bowt.ie 485 [ - + ]: 6142 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
486 : 6142 : md.version = metad->btm_version;
4133 heikki.linnakangas@i 487 : 6142 : md.root = rootblkno;
488 : 6142 : md.level = 0;
489 : 6142 : md.fastroot = rootblkno;
490 : 6142 : md.fastlevel = 0;
1845 pg@bowt.ie 491 : 6142 : md.last_cleanup_num_delpages = 0;
2209 492 : 6142 : md.allequalimage = metad->btm_allequalimage;
493 : :
397 peter@eisentraut.org 494 : 6142 : XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
495 : :
8422 tgl@sss.pgh.pa.us 496 : 6142 : xlrec.rootblk = rootblkno;
497 : 6142 : xlrec.level = 0;
498 : :
397 peter@eisentraut.org 499 : 6142 : XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
500 : :
4133 heikki.linnakangas@i 501 : 6142 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
502 : : }
503 : : else
2 pg@bowt.ie 504 :GNC 237 : recptr = XLogGetFakeLSN(rel);
505 : :
506 : 6379 : PageSetLSN(rootpage, recptr);
507 : 6379 : PageSetLSN(metapg, recptr);
508 : :
8422 tgl@sss.pgh.pa.us 509 [ - + ]:CBC 6379 : END_CRIT_SECTION();
510 : :
511 : : /*
512 : : * swap root write lock for read lock. There is no danger of anyone
513 : : * else accessing the new root page while it's unlocked, since no one
514 : : * else knows where it is yet.
515 : : */
2063 pg@bowt.ie 516 : 6379 : _bt_unlockbuf(rel, rootbuf);
517 : 6379 : _bt_lockbuf(rel, rootbuf, BT_READ);
518 : :
519 : : /* okay, metadata is correct, release lock on it without caching */
7289 tgl@sss.pgh.pa.us 520 : 6379 : _bt_relbuf(rel, metabuf);
521 : : }
522 : : else
523 : : {
8423 524 : 150460 : rootblkno = metad->btm_fastroot;
8422 525 [ - + ]: 150460 : Assert(rootblkno != P_NONE);
526 : 150460 : rootlevel = metad->btm_fastlevel;
527 : :
528 : : /*
529 : : * Cache the metapage data for next time
530 : : */
2432 pg@bowt.ie 531 : 150460 : rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
532 : : sizeof(BTMetaPageData));
533 : 150460 : memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
534 : :
535 : : /*
536 : : * We are done with the metapage; arrange to release it via first
537 : : * _bt_relandgetbuf call
538 : : */
7998 tgl@sss.pgh.pa.us 539 : 150460 : rootbuf = metabuf;
540 : :
541 : : for (;;)
542 : : {
543 : 150460 : rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
3616 kgrittn@postgresql.o 544 : 150460 : rootpage = BufferGetPage(rootbuf);
1444 michael@paquier.xyz 545 : 150460 : rootopaque = BTPageGetOpaque(rootpage);
546 : :
8422 tgl@sss.pgh.pa.us 547 [ + - ]: 150460 : if (!P_IGNORE(rootopaque))
548 : 150460 : break;
549 : :
550 : : /* it's dead, Jim. step right one page */
8422 tgl@sss.pgh.pa.us 551 [ # # ]:UBC 0 : if (P_RIGHTMOST(rootopaque))
6649 552 [ # # ]: 0 : elog(ERROR, "no live root page found in index \"%s\"",
553 : : RelationGetRelationName(rel));
8422 554 : 0 : rootblkno = rootopaque->btpo_next;
555 : : }
556 : :
1845 pg@bowt.ie 557 [ - + ]:CBC 150460 : if (rootopaque->btpo_level != rootlevel)
6649 tgl@sss.pgh.pa.us 558 [ # # ]:UBC 0 : elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
559 : : rootblkno, RelationGetRelationName(rel),
560 : : rootopaque->btpo_level, rootlevel);
561 : : }
562 : :
563 : : /*
564 : : * By here, we have a pin and read lock on the root page, and no lock set
565 : : * on the metadata page. Return the root page's buffer.
566 : : */
8423 tgl@sss.pgh.pa.us 567 :CBC 156839 : return rootbuf;
568 : : }
569 : :
570 : : /*
571 : : * _bt_gettrueroot() -- Get the true root page of the btree.
572 : : *
573 : : * This is the same as the BT_READ case of _bt_getroot(), except
574 : : * we follow the true-root link not the fast-root link.
575 : : *
576 : : * By the time we acquire lock on the root page, it might have been split and
577 : : * not be the true root anymore. This is okay for the present uses of this
578 : : * routine; we only really need to be able to move up at least one tree level
579 : : * from whatever non-root page we were at. If we ever do need to lock the
580 : : * one true root page, we could loop here, re-reading the metapage on each
581 : : * failure. (Note that it wouldn't do to hold the lock on the metapage while
582 : : * moving to the root --- that'd deadlock against any concurrent root split.)
583 : : */
584 : : Buffer
1009 pg@bowt.ie 585 : 9 : _bt_gettrueroot(Relation rel)
586 : : {
587 : : Buffer metabuf;
588 : : Page metapg;
589 : : BTPageOpaque metaopaque;
590 : : Buffer rootbuf;
591 : : Page rootpage;
592 : : BTPageOpaque rootopaque;
593 : : BlockNumber rootblkno;
594 : : uint32 rootlevel;
595 : : BTMetaPageData *metad;
596 : :
597 : : /*
598 : : * We don't try to use cached metapage data here, since (a) this path is
599 : : * not performance-critical, and (b) if we are here it suggests our cache
600 : : * is out-of-date anyway. In light of point (b), it's probably safest to
601 : : * actively flush any cached metapage info.
602 : : */
7264 tgl@sss.pgh.pa.us 603 [ + - ]: 9 : if (rel->rd_amcache)
604 : 9 : pfree(rel->rd_amcache);
605 : 9 : rel->rd_amcache = NULL;
606 : :
1009 pg@bowt.ie 607 : 9 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
3616 kgrittn@postgresql.o 608 : 9 : metapg = BufferGetPage(metabuf);
1444 michael@paquier.xyz 609 : 9 : metaopaque = BTPageGetOpaque(metapg);
8423 tgl@sss.pgh.pa.us 610 : 9 : metad = BTPageGetMeta(metapg);
611 : :
3100 612 [ + - ]: 9 : if (!P_ISMETA(metaopaque) ||
8423 613 [ - + ]: 9 : metad->btm_magic != BTREE_MAGIC)
8273 tgl@sss.pgh.pa.us 614 [ # # ]:UBC 0 : ereport(ERROR,
615 : : (errcode(ERRCODE_INDEX_CORRUPTED),
616 : : errmsg("index \"%s\" is not a btree",
617 : : RelationGetRelationName(rel))));
618 : :
2902 teodor@sigaev.ru 619 [ + - ]:CBC 9 : if (metad->btm_version < BTREE_MIN_VERSION ||
620 [ - + ]: 9 : metad->btm_version > BTREE_VERSION)
8273 tgl@sss.pgh.pa.us 621 [ # # ]:UBC 0 : ereport(ERROR,
622 : : (errcode(ERRCODE_INDEX_CORRUPTED),
623 : : errmsg("version mismatch in index \"%s\": file version %d, "
624 : : "current version %d, minimal supported version %d",
625 : : RelationGetRelationName(rel),
626 : : metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
627 : :
628 : : /* if no root page initialized yet, fail */
8423 tgl@sss.pgh.pa.us 629 [ - + ]:CBC 9 : if (metad->btm_root == P_NONE)
630 : : {
8423 tgl@sss.pgh.pa.us 631 :UBC 0 : _bt_relbuf(rel, metabuf);
632 : 0 : return InvalidBuffer;
633 : : }
634 : :
8423 tgl@sss.pgh.pa.us 635 :CBC 9 : rootblkno = metad->btm_root;
8422 636 : 9 : rootlevel = metad->btm_level;
637 : :
638 : : /*
639 : : * We are done with the metapage; arrange to release it via first
640 : : * _bt_relandgetbuf call
641 : : */
7998 642 : 9 : rootbuf = metabuf;
643 : :
644 : : for (;;)
645 : : {
646 : 9 : rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
3616 kgrittn@postgresql.o 647 : 9 : rootpage = BufferGetPage(rootbuf);
1444 michael@paquier.xyz 648 : 9 : rootopaque = BTPageGetOpaque(rootpage);
649 : :
8422 tgl@sss.pgh.pa.us 650 [ + - ]: 9 : if (!P_IGNORE(rootopaque))
651 : 9 : break;
652 : :
653 : : /* it's dead, Jim. step right one page */
8422 tgl@sss.pgh.pa.us 654 [ # # ]:UBC 0 : if (P_RIGHTMOST(rootopaque))
6649 655 [ # # ]: 0 : elog(ERROR, "no live root page found in index \"%s\"",
656 : : RelationGetRelationName(rel));
8422 657 : 0 : rootblkno = rootopaque->btpo_next;
658 : : }
659 : :
1845 pg@bowt.ie 660 [ - + ]:CBC 9 : if (rootopaque->btpo_level != rootlevel)
6649 tgl@sss.pgh.pa.us 661 [ # # ]:UBC 0 : elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
662 : : rootblkno, RelationGetRelationName(rel),
663 : : rootopaque->btpo_level, rootlevel);
664 : :
10057 bruce@momjian.us 665 :CBC 9 : return rootbuf;
666 : : }
667 : :
668 : : /*
669 : : * _bt_getrootheight() -- Get the height of the btree search tree.
670 : : *
671 : : * We return the level (counting from zero) of the current fast root.
672 : : * This represents the number of tree levels we'd have to descend through
673 : : * to start any btree index search.
674 : : *
675 : : * This is used by the planner for cost-estimation purposes. Since it's
676 : : * only an estimate, slightly-stale data is fine, hence we don't worry
677 : : * about updating previously cached data.
678 : : */
679 : : int
1009 pg@bowt.ie 680 : 2943593 : _bt_getrootheight(Relation rel)
681 : : {
682 : : BTMetaPageData *metad;
683 : :
4811 tgl@sss.pgh.pa.us 684 [ + + ]: 2943593 : if (rel->rd_amcache == NULL)
685 : : {
686 : : Buffer metabuf;
687 : :
1009 pg@bowt.ie 688 : 55100 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
2552 689 : 55100 : metad = _bt_getmeta(rel, metabuf);
690 : :
691 : : /*
692 : : * If there's no root page yet, _bt_getroot() doesn't expect a cache
693 : : * to be made, so just stop here and report the index height is zero.
694 : : * (XXX perhaps _bt_getroot() should be changed to allow this case.)
695 : : */
4811 tgl@sss.pgh.pa.us 696 [ + + ]: 55100 : if (metad->btm_root == P_NONE)
697 : : {
698 : 29149 : _bt_relbuf(rel, metabuf);
699 : 29149 : return 0;
700 : : }
701 : :
702 : : /*
703 : : * Cache the metapage data for next time
704 : : */
2432 pg@bowt.ie 705 : 25951 : rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
706 : : sizeof(BTMetaPageData));
707 : 25951 : memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
4811 tgl@sss.pgh.pa.us 708 : 25951 : _bt_relbuf(rel, metabuf);
709 : : }
710 : :
711 : : /* Get cached page */
712 : 2914444 : metad = (BTMetaPageData *) rel->rd_amcache;
713 : : /* We shouldn't have cached it if any of these fail */
2432 pg@bowt.ie 714 [ - + ]: 2914444 : Assert(metad->btm_magic == BTREE_MAGIC);
715 [ - + ]: 2914444 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
716 [ - + ]: 2914444 : Assert(metad->btm_version <= BTREE_VERSION);
2209 717 [ + + - + ]: 2914444 : Assert(!metad->btm_allequalimage ||
718 : : metad->btm_version > BTREE_NOVAC_VERSION);
2432 719 [ - + ]: 2914444 : Assert(metad->btm_fastroot != P_NONE);
720 : :
4811 tgl@sss.pgh.pa.us 721 : 2914444 : return metad->btm_fastlevel;
722 : : }
723 : :
724 : : /*
725 : : * _bt_metaversion() -- Get version/status info from metapage.
726 : : *
727 : : * Sets caller's *heapkeyspace and *allequalimage arguments using data
728 : : * from the B-Tree metapage (could be locally-cached version). This
729 : : * information needs to be stashed in insertion scankey, so we provide a
730 : : * single function that fetches both at once.
731 : : *
732 : : * This is used to determine the rules that must be used to descend a
733 : : * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
734 : : * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
735 : : * performance when inserting a new BTScanInsert-wise duplicate tuple
736 : : * among many leaf pages already full of such duplicates.
737 : : *
738 : : * Also sets allequalimage field, which indicates whether or not it is
739 : : * safe to apply deduplication. We rely on the assumption that
740 : : * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
741 : : * pg_upgrade'd from Postgres 12.
742 : : */
743 : : void
1009 pg@bowt.ie 744 : 15574178 : _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
745 : : {
746 : : BTMetaPageData *metad;
747 : :
2552 748 [ + + ]: 15574178 : if (rel->rd_amcache == NULL)
749 : : {
750 : : Buffer metabuf;
751 : :
1009 752 : 633157 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
2552 753 : 633157 : metad = _bt_getmeta(rel, metabuf);
754 : :
755 : : /*
756 : : * If there's no root page yet, _bt_getroot() doesn't expect a cache
757 : : * to be made, so just stop here. (XXX perhaps _bt_getroot() should
758 : : * be changed to allow this case.)
759 : : */
760 [ + + ]: 633157 : if (metad->btm_root == P_NONE)
761 : : {
2209 762 : 293880 : *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
763 : 293880 : *allequalimage = metad->btm_allequalimage;
764 : :
2552 765 : 293880 : _bt_relbuf(rel, metabuf);
2209 766 : 293880 : return;
767 : : }
768 : :
769 : : /*
770 : : * Cache the metapage data for next time
771 : : *
772 : : * An on-the-fly version upgrade performed by _bt_upgrademetapage()
773 : : * can change the nbtree version for an index without invalidating any
774 : : * local cache. This is okay because it can only happen when moving
775 : : * from version 2 to version 3, both of which are !heapkeyspace
776 : : * versions.
777 : : */
2432 778 : 339277 : rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
779 : : sizeof(BTMetaPageData));
780 : 339277 : memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
2552 781 : 339277 : _bt_relbuf(rel, metabuf);
782 : : }
783 : :
784 : : /* Get cached page */
785 : 15280298 : metad = (BTMetaPageData *) rel->rd_amcache;
786 : : /* We shouldn't have cached it if any of these fail */
2432 787 [ - + ]: 15280298 : Assert(metad->btm_magic == BTREE_MAGIC);
788 [ - + ]: 15280298 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
789 [ - + ]: 15280298 : Assert(metad->btm_version <= BTREE_VERSION);
2209 790 [ + + - + ]: 15280298 : Assert(!metad->btm_allequalimage ||
791 : : metad->btm_version > BTREE_NOVAC_VERSION);
2432 792 [ - + ]: 15280298 : Assert(metad->btm_fastroot != P_NONE);
793 : :
2209 794 : 15280298 : *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
795 : 15280298 : *allequalimage = metad->btm_allequalimage;
796 : : }
797 : :
798 : : /*
799 : : * _bt_checkpage() -- Verify that a freshly-read page looks sane.
800 : : */
801 : : void
7434 tgl@sss.pgh.pa.us 802 : 25239017 : _bt_checkpage(Relation rel, Buffer buf)
803 : : {
3616 kgrittn@postgresql.o 804 : 25239017 : Page page = BufferGetPage(buf);
805 : :
806 : : /*
807 : : * ReadBuffer verifies that every newly-read page passes
808 : : * PageHeaderIsValid, which means it either contains a reasonably sane
809 : : * page header or is all-zero. We have to defend against the all-zero
810 : : * case, however.
811 : : */
7434 tgl@sss.pgh.pa.us 812 [ - + ]: 25239017 : if (PageIsNew(page))
7434 tgl@sss.pgh.pa.us 813 [ # # ]:UBC 0 : ereport(ERROR,
814 : : (errcode(ERRCODE_INDEX_CORRUPTED),
815 : : errmsg("index \"%s\" contains unexpected zero page at block %u",
816 : : RelationGetRelationName(rel),
817 : : BufferGetBlockNumber(buf)),
818 : : errhint("Please REINDEX it.")));
819 : :
820 : : /*
821 : : * Additionally check that the special area looks sane.
822 : : */
6454 tgl@sss.pgh.pa.us 823 [ - + ]:CBC 25239017 : if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
7434 tgl@sss.pgh.pa.us 824 [ # # ]:UBC 0 : ereport(ERROR,
825 : : (errcode(ERRCODE_INDEX_CORRUPTED),
826 : : errmsg("index \"%s\" contains corrupted page at block %u",
827 : : RelationGetRelationName(rel),
828 : : BufferGetBlockNumber(buf)),
829 : : errhint("Please REINDEX it.")));
7434 tgl@sss.pgh.pa.us 830 :CBC 25239017 : }
831 : :
832 : : /*
833 : : * _bt_getbuf() -- Get an existing block in a buffer, for read or write.
834 : : *
835 : : * The general rule in nbtree is that it's never okay to access a
836 : : * page without holding both a buffer pin and a buffer lock on
837 : : * the page's buffer.
838 : : *
839 : : * When this routine returns, the appropriate lock is set on the
840 : : * requested buffer and its reference count has been incremented
841 : : * (ie, the buffer is "locked and pinned"). Also, we apply
842 : : * _bt_checkpage to sanity-check the page, and perform Valgrind
843 : : * client requests that help Valgrind detect unsafe page accesses.
844 : : *
845 : : * Note: raw LockBuffer() calls are disallowed in nbtree; all
846 : : * buffer lock requests need to go through wrapper functions such
847 : : * as _bt_lockbuf().
848 : : */
849 : : Buffer
1009 pg@bowt.ie 850 : 13909026 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
851 : : {
852 : : Buffer buf;
853 : :
854 [ - + ]: 13909026 : Assert(BlockNumberIsValid(blkno));
855 : :
856 : : /* Read an existing block of the relation */
857 : 13909026 : buf = ReadBuffer(rel, blkno);
858 : 13909026 : _bt_lockbuf(rel, buf, access);
859 : 13909026 : _bt_checkpage(rel, buf);
860 : :
861 : 13909026 : return buf;
862 : : }
863 : :
864 : : /*
865 : : * _bt_allocbuf() -- Allocate a new block/page.
866 : : *
867 : : * Returns a write-locked buffer containing an unallocated nbtree page.
868 : : *
869 : : * Callers are required to pass a valid heaprel. We need heaprel so that we
870 : : * can handle generating a snapshotConflictHorizon that makes reusing a page
871 : : * from the FSM safe for queries that may be running on standbys.
872 : : */
873 : : Buffer
874 : 20331 : _bt_allocbuf(Relation rel, Relation heaprel)
875 : : {
876 : : Buffer buf;
877 : : BlockNumber blkno;
878 : : Page page;
879 : :
880 [ + - ]: 20331 : Assert(heaprel != NULL);
881 : :
882 : : /*
883 : : * First see if the FSM knows of any free pages.
884 : : *
885 : : * We can't trust the FSM's report unreservedly; we have to check that the
886 : : * page is still free. (For example, an already-free page could have been
887 : : * re-used between the time the last VACUUM scanned it and the time the
888 : : * VACUUM made its FSM updates.)
889 : : *
890 : : * In fact, it's worse than that: we can't even assume that it's safe to
891 : : * take a lock on the reported page. If somebody else has a lock on it,
892 : : * or even worse our own caller does, we could deadlock. (The own-caller
893 : : * scenario is actually not improbable. Consider an index on a serial or
894 : : * timestamp column. Nearly all splits will be at the rightmost page, so
895 : : * it's entirely likely that _bt_split will call us while holding a lock
896 : : * on the page most recently acquired from FSM. A VACUUM running
897 : : * concurrently with the previous split could well have placed that page
898 : : * back in FSM.)
899 : : *
900 : : * To get around that, we ask for only a conditional lock on the reported
901 : : * page. If we fail, then someone else is using the page, and we may
902 : : * reasonably assume it's not free. (If we happen to be wrong, the worst
903 : : * consequence is the page will be lost to use till the next VACUUM, which
904 : : * is no big problem.)
905 : : */
906 : : for (;;)
907 : : {
908 : 20331 : blkno = GetFreeIndexPage(rel);
909 [ + + ]: 20331 : if (blkno == InvalidBlockNumber)
910 : 19966 : break;
911 : 365 : buf = ReadBuffer(rel, blkno);
912 [ + - ]: 365 : if (_bt_conditionallockbuf(rel, buf))
913 : : {
914 : 365 : page = BufferGetPage(buf);
915 : :
916 : : /*
917 : : * It's possible to find an all-zeroes page in an index. For
918 : : * example, a backend might successfully extend the relation one
919 : : * page and then crash before it is able to make a WAL entry for
920 : : * adding the page. If we find a zeroed page then reclaim it
921 : : * immediately.
922 : : */
923 [ - + ]: 365 : if (PageIsNew(page))
924 : : {
925 : : /* Okay to use page. Initialize and return it. */
1009 pg@bowt.ie 926 :UBC 0 : _bt_pageinit(page, BufferGetPageSize(buf));
927 : 0 : return buf;
928 : : }
929 : :
1009 pg@bowt.ie 930 [ + - ]:CBC 365 : if (BTPageIsRecyclable(page, heaprel))
931 : : {
932 : : /*
933 : : * If we are generating WAL for Hot Standby then create a WAL
934 : : * record that will allow us to conflict with queries running
935 : : * on standby, in case they have snapshots older than safexid
936 : : * value
937 : : */
938 [ + - - + : 365 : if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
- - - - +
- ]
939 : : {
940 : : xl_btree_reuse_page xlrec_reuse;
941 : :
942 : : /*
943 : : * Note that we don't register the buffer with the record,
944 : : * because this operation doesn't modify the page (that
945 : : * already happened, back when VACUUM deleted the page).
946 : : * This record only exists to provide a conflict point for
947 : : * Hot Standby. See record REDO routine comments.
948 : : */
949 : 365 : xlrec_reuse.locator = rel->rd_locator;
950 : 365 : xlrec_reuse.block = blkno;
951 : 365 : xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
952 : 365 : xlrec_reuse.isCatalogRel =
953 [ + - - + : 365 : RelationIsAccessibleInLogicalDecoding(heaprel);
- - - - -
- - - - -
- - - - -
- - - ]
954 : :
955 : 365 : XLogBeginInsert();
397 peter@eisentraut.org 956 : 365 : XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
957 : :
1009 pg@bowt.ie 958 : 365 : XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
959 : : }
960 : :
961 : : /* Okay to use page. Re-initialize and return it. */
962 : 365 : _bt_pageinit(page, BufferGetPageSize(buf));
963 : 365 : return buf;
964 : : }
1009 pg@bowt.ie 965 [ # # ]:UBC 0 : elog(DEBUG2, "FSM returned nonrecyclable page");
966 : 0 : _bt_relbuf(rel, buf);
967 : : }
968 : : else
969 : : {
970 [ # # ]: 0 : elog(DEBUG2, "FSM returned nonlockable page");
971 : : /* couldn't get lock, so just drop pin */
972 : 0 : ReleaseBuffer(buf);
973 : : }
974 : : }
975 : :
976 : : /*
977 : : * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
978 : : * risk a race condition against btvacuumscan --- see comments therein.
979 : : * This forces us to repeat the valgrind request that _bt_lockbuf()
980 : : * otherwise would make, as we can't use _bt_lockbuf() without introducing
981 : : * a race.
982 : : */
935 tmunro@postgresql.or 983 :CBC 19966 : buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
1009 pg@bowt.ie 984 : 19966 : if (!RelationUsesLocalBuffers(rel))
985 : : VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
986 : :
987 : : /* Initialize the new page before returning it */
988 : 19966 : page = BufferGetPage(buf);
989 [ - + ]: 19966 : Assert(PageIsNew(page));
990 : 19966 : _bt_pageinit(page, BufferGetPageSize(buf));
991 : :
10057 bruce@momjian.us 992 : 19966 : return buf;
993 : : }
994 : :
995 : : /*
996 : : * _bt_relandgetbuf() -- release a locked buffer and get another one.
997 : : *
998 : : * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is
999 : : * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
1000 : : * simplifies some callers.
1001 : : *
1002 : : * The original motivation for using this was to avoid two entries to the
1003 : : * bufmgr when one would do. However, now it's mainly just a notational
1004 : : * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
1005 : : * is when the target page is the same one already in the buffer.
1006 : : */
1007 : : Buffer
7998 tgl@sss.pgh.pa.us 1008 : 11251768 : _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
1009 : : {
1010 : : Buffer buf;
1011 : :
1009 pg@bowt.ie 1012 [ - + ]: 11251768 : Assert(BlockNumberIsValid(blkno));
7998 tgl@sss.pgh.pa.us 1013 [ + + ]: 11251768 : if (BufferIsValid(obuf))
2063 pg@bowt.ie 1014 : 11243866 : _bt_unlockbuf(rel, obuf);
7998 tgl@sss.pgh.pa.us 1015 : 11251768 : buf = ReleaseAndReadBuffer(obuf, rel, blkno);
2063 pg@bowt.ie 1016 : 11251768 : _bt_lockbuf(rel, buf, access);
1017 : :
7434 tgl@sss.pgh.pa.us 1018 : 11251768 : _bt_checkpage(rel, buf);
7998 1019 : 11251768 : return buf;
1020 : : }
1021 : :
1022 : : /*
1023 : : * _bt_relbuf() -- release a locked buffer.
1024 : : *
1025 : : * Lock and pin (refcount) are both dropped.
1026 : : */
1027 : : void
9009 1028 : 11540247 : _bt_relbuf(Relation rel, Buffer buf)
1029 : : {
2063 pg@bowt.ie 1030 : 11540247 : _bt_unlockbuf(rel, buf);
1031 : 11540247 : ReleaseBuffer(buf);
1032 : 11540247 : }
1033 : :
1034 : : /*
1035 : : * _bt_lockbuf() -- lock a pinned buffer.
1036 : : *
1037 : : * Lock is acquired without acquiring another pin. This is like a raw
1038 : : * LockBuffer() call, but performs extra steps needed by Valgrind.
1039 : : *
1040 : : * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1041 : : * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1042 : : */
1043 : : void
1044 : 25683910 : _bt_lockbuf(Relation rel, Buffer buf, int access)
1045 : : {
1046 : : /* LockBuffer() asserts that pin is held by this backend */
1047 : 25683910 : LockBuffer(buf, access);
1048 : :
1049 : : /*
1050 : : * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1051 : : * an nbtree error (e.g. a unique violation error). That won't cause
1052 : : * Valgrind false positives.
1053 : : *
1054 : : * The nbtree client requests are superimposed on top of the bufmgr.c
1055 : : * buffer pin client requests. In the event of an nbtree error the buffer
1056 : : * will certainly get marked as defined when the backend once again
1057 : : * acquires its first pin on the buffer. (Of course, if the backend never
1058 : : * touches the buffer again then it doesn't matter that it remains
1059 : : * non-accessible to Valgrind.)
1060 : : *
1061 : : * Note: When an IndexTuple C pointer gets computed using an ItemId read
1062 : : * from a page while a lock was held, the C pointer becomes unsafe to
1063 : : * dereference forever as soon as the lock is released. Valgrind can only
1064 : : * detect cases where the pointer gets dereferenced with no _current_
1065 : : * lock/pin held, though.
1066 : : */
1067 : 25683910 : if (!RelationUsesLocalBuffers(rel))
1068 : : VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1069 : 25683910 : }
1070 : :
1071 : : /*
1072 : : * _bt_unlockbuf() -- unlock a pinned buffer.
1073 : : */
1074 : : void
1075 : 25738017 : _bt_unlockbuf(Relation rel, Buffer buf)
1076 : : {
1077 : : /*
1078 : : * Buffer is pinned and locked, which means that it is expected to be
1079 : : * defined and addressable. Check that proactively.
1080 : : */
1081 : : VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1082 : :
1083 : : /* LockBuffer() asserts that pin is held by this backend */
1084 : 25738017 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1085 : :
1086 : 25738017 : if (!RelationUsesLocalBuffers(rel))
1087 : : VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1088 : 25738017 : }
1089 : :
1090 : : /*
1091 : : * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1092 : : * buffer.
1093 : : *
1094 : : * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1095 : : * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1096 : : */
1097 : : bool
1098 : 35021 : _bt_conditionallockbuf(Relation rel, Buffer buf)
1099 : : {
1100 : : /* ConditionalLockBuffer() asserts that pin is held by this backend */
1101 [ + + ]: 35021 : if (!ConditionalLockBuffer(buf))
1102 : 869 : return false;
1103 : :
1104 : 34152 : if (!RelationUsesLocalBuffers(rel))
1105 : : VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1106 : :
1107 : 34152 : return true;
1108 : : }
1109 : :
1110 : : /*
1111 : : * _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
1112 : : */
1113 : : void
1114 : 15027 : _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
1115 : : {
1116 : : /*
1117 : : * Buffer is pinned and locked, which means that it is expected to be
1118 : : * defined and addressable. Check that proactively.
1119 : : */
1120 : : VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1121 : :
1122 : : /* LockBuffer() asserts that pin is held by this backend */
1123 : 15027 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1124 : 15027 : LockBufferForCleanup(buf);
10841 scrappy@hub.org 1125 : 15027 : }
1126 : :
1127 : : /*
1128 : : * _bt_pageinit() -- Initialize a new page.
1129 : : *
1130 : : * On return, the page header is initialized; data space is empty;
1131 : : * special space is zeroed out.
1132 : : */
1133 : : void
1134 : 92030 : _bt_pageinit(Page page, Size size)
1135 : : {
10416 bruce@momjian.us 1136 : 92030 : PageInit(page, size, sizeof(BTPageOpaqueData));
10841 scrappy@hub.org 1137 : 92030 : }
1138 : :
1139 : : /*
1140 : : * Delete item(s) from a btree leaf page during VACUUM.
1141 : : *
1142 : : * This routine assumes that the caller already has a full cleanup lock on
1143 : : * the buffer. Also, the given deletable and updatable arrays *must* be
1144 : : * sorted in ascending order.
1145 : : *
1146 : : * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1147 : : * in an existing posting list item are to be removed. This works by
1148 : : * updating/overwriting an existing item with caller's new version of the item
1149 : : * (a version that lacks the TIDs that are to be deleted).
1150 : : *
1151 : : * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
1152 : : * generate their own snapshotConflictHorizon directly from the tableam,
1153 : : * whereas VACUUMs rely on the initial VACUUM table scan performing
1154 : : * WAL-logging that takes care of the issue for the table's indexes
1155 : : * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree
1156 : : * deletes don't do.
1157 : : */
1158 : : void
5831 simon@2ndQuadrant.co 1159 : 8484 : _bt_delitems_vacuum(Relation rel, Buffer buf,
1160 : : OffsetNumber *deletable, int ndeletable,
1161 : : BTVacuumPosting *updatable, int nupdatable)
1162 : : {
3616 kgrittn@postgresql.o 1163 : 8484 : Page page = BufferGetPage(buf);
1164 : : BTPageOpaque opaque;
1887 pg@bowt.ie 1165 [ + + + + : 8484 : bool needswal = RelationNeedsWAL(rel);
+ - + - ]
2209 1166 : 8484 : char *updatedbuf = NULL;
1167 : 8484 : Size updatedbuflen = 0;
1168 : : OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1169 : : XLogRecPtr recptr;
1170 : :
1171 : : /* Shouldn't be called unless there's something to do */
1172 [ + + - + ]: 8484 : Assert(ndeletable > 0 || nupdatable > 0);
1173 : :
1174 : : /* Generate new version of posting lists without deleted TIDs */
1887 1175 [ + + ]: 8484 : if (nupdatable > 0)
1176 : 970 : updatedbuf = _bt_delitems_update(updatable, nupdatable,
1177 : : updatedoffsets, &updatedbuflen,
1178 : : needswal);
1179 : :
1180 : : /* No ereport(ERROR) until changes are logged */
9193 tgl@sss.pgh.pa.us 1181 : 8484 : START_CRIT_SECTION();
1182 : :
1183 : : /*
1184 : : * Handle posting tuple updates.
1185 : : *
1186 : : * Deliberately do this before handling simple deletes. If we did it the
1187 : : * other way around (i.e. WAL record order -- simple deletes before
1188 : : * updates) then we'd have to make compensating changes to the 'updatable'
1189 : : * array of offset numbers.
1190 : : *
1191 : : * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1192 : : * happens to already be set. It's important that we not interfere with
1193 : : * any future simple index tuple deletion operations.
1194 : : */
2209 pg@bowt.ie 1195 [ + + ]: 32323 : for (int i = 0; i < nupdatable; i++)
1196 : : {
1197 : 23839 : OffsetNumber updatedoffset = updatedoffsets[i];
1198 : : IndexTuple itup;
1199 : : Size itemsz;
1200 : :
1201 : 23839 : itup = updatable[i]->itup;
1202 : 23839 : itemsz = MAXALIGN(IndexTupleSize(itup));
139 peter@eisentraut.org 1203 [ - + ]:GNC 23839 : if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
2209 pg@bowt.ie 1204 [ # # ]:UBC 0 : elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1205 : : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1206 : : }
1207 : :
1208 : : /* Now handle simple deletes of entire tuples */
2209 pg@bowt.ie 1209 [ + + ]:CBC 8484 : if (ndeletable > 0)
1210 : 8192 : PageIndexMultiDelete(page, deletable, ndeletable);
1211 : :
1212 : : /*
1213 : : * We can clear the vacuum cycle ID since this page has certainly been
1214 : : * processed by the current vacuum scan.
1215 : : */
1444 michael@paquier.xyz 1216 : 8484 : opaque = BTPageGetOpaque(page);
7251 tgl@sss.pgh.pa.us 1217 : 8484 : opaque->btpo_cycleid = 0;
1218 : :
1219 : : /*
1220 : : * Clear the BTP_HAS_GARBAGE page flag.
1221 : : *
1222 : : * This flag indicates the presence of LP_DEAD items on the page (though
1223 : : * not reliably). Note that we only rely on it with pg_upgrade'd
1224 : : * !heapkeyspace indexes. That's why clearing it here won't usually
1225 : : * interfere with simple index tuple deletion.
1226 : : */
7173 1227 : 8484 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1228 : :
5227 simon@2ndQuadrant.co 1229 : 8484 : MarkBufferDirty(buf);
1230 : :
1231 : : /* XLOG stuff */
1887 pg@bowt.ie 1232 [ + + ]: 8484 : if (needswal)
1233 : : {
1234 : : xl_btree_vacuum xlrec_vacuum;
1235 : :
2278 1236 : 8483 : xlrec_vacuum.ndeleted = ndeletable;
2209 1237 : 8483 : xlrec_vacuum.nupdated = nupdatable;
1238 : :
4133 heikki.linnakangas@i 1239 : 8483 : XLogBeginInsert();
1240 : 8483 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
397 peter@eisentraut.org 1241 : 8483 : XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1242 : :
2209 pg@bowt.ie 1243 [ + + ]: 8483 : if (ndeletable > 0)
397 peter@eisentraut.org 1244 : 8191 : XLogRegisterBufData(0, deletable,
1245 : : ndeletable * sizeof(OffsetNumber));
1246 : :
2209 pg@bowt.ie 1247 [ + + ]: 8483 : if (nupdatable > 0)
1248 : : {
397 peter@eisentraut.org 1249 : 970 : XLogRegisterBufData(0, updatedoffsets,
1250 : : nupdatable * sizeof(OffsetNumber));
2209 pg@bowt.ie 1251 : 970 : XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1252 : : }
1253 : :
4133 heikki.linnakangas@i 1254 : 8483 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1255 : : }
1256 : : else
2 pg@bowt.ie 1257 :GNC 1 : recptr = XLogGetFakeLSN(rel);
1258 : :
1259 : 8484 : PageSetLSN(page, recptr);
1260 : :
5831 simon@2ndQuadrant.co 1261 [ - + ]:CBC 8484 : END_CRIT_SECTION();
1262 : :
1263 : : /* can't leak memory here */
2209 pg@bowt.ie 1264 [ + + ]: 8484 : if (updatedbuf != NULL)
1265 : 970 : pfree(updatedbuf);
1266 : : /* free tuples allocated within _bt_delitems_update() */
1267 [ + + ]: 32323 : for (int i = 0; i < nupdatable; i++)
1268 : 23839 : pfree(updatable[i]->itup);
5831 simon@2ndQuadrant.co 1269 : 8484 : }
1270 : :
1271 : : /*
1272 : : * Delete item(s) from a btree leaf page during single-page cleanup.
1273 : : *
1274 : : * This routine assumes that the caller has pinned and write locked the
1275 : : * buffer. Also, the given deletable and updatable arrays *must* be sorted in
1276 : : * ascending order.
1277 : : *
1278 : : * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1279 : : * in an existing posting list item are to be removed. This works by
1280 : : * updating/overwriting an existing item with caller's new version of the item
1281 : : * (a version that lacks the TIDs that are to be deleted).
1282 : : *
1283 : : * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1284 : : * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
1285 : : * (from the tableam). This is used by the REDO routine to generate recovery
1286 : : * conflicts. The other difference is that only _bt_delitems_vacuum will
1287 : : * clear page's VACUUM cycle ID.
1288 : : */
1289 : : static void
1009 pg@bowt.ie 1290 : 4176 : _bt_delitems_delete(Relation rel, Buffer buf,
1291 : : TransactionId snapshotConflictHorizon, bool isCatalogRel,
1292 : : OffsetNumber *deletable, int ndeletable,
1293 : : BTVacuumPosting *updatable, int nupdatable)
1294 : : {
3616 kgrittn@postgresql.o 1295 : 4176 : Page page = BufferGetPage(buf);
1296 : : BTPageOpaque opaque;
1887 pg@bowt.ie 1297 [ + - + + : 4176 : bool needswal = RelationNeedsWAL(rel);
+ - + - ]
1298 : 4176 : char *updatedbuf = NULL;
1299 : 4176 : Size updatedbuflen = 0;
1300 : : OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1301 : : XLogRecPtr recptr;
1302 : :
1303 : : /* Shouldn't be called unless there's something to do */
1304 [ + + - + ]: 4176 : Assert(ndeletable > 0 || nupdatable > 0);
1305 : :
1306 : : /* Generate new versions of posting lists without deleted TIDs */
1307 [ + + ]: 4176 : if (nupdatable > 0)
1308 : 425 : updatedbuf = _bt_delitems_update(updatable, nupdatable,
1309 : : updatedoffsets, &updatedbuflen,
1310 : : needswal);
1311 : :
1312 : : /* No ereport(ERROR) until changes are logged */
5831 simon@2ndQuadrant.co 1313 : 4176 : START_CRIT_SECTION();
1314 : :
1315 : : /* Handle updates and deletes just like _bt_delitems_vacuum */
1887 pg@bowt.ie 1316 [ + + ]: 8956 : for (int i = 0; i < nupdatable; i++)
1317 : : {
1318 : 4780 : OffsetNumber updatedoffset = updatedoffsets[i];
1319 : : IndexTuple itup;
1320 : : Size itemsz;
1321 : :
1322 : 4780 : itup = updatable[i]->itup;
1323 : 4780 : itemsz = MAXALIGN(IndexTupleSize(itup));
139 peter@eisentraut.org 1324 [ - + ]:GNC 4780 : if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1887 pg@bowt.ie 1325 [ # # ]:UBC 0 : elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1326 : : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1327 : : }
1328 : :
1887 pg@bowt.ie 1329 [ + + ]:CBC 4176 : if (ndeletable > 0)
1330 : 4085 : PageIndexMultiDelete(page, deletable, ndeletable);
1331 : :
1332 : : /*
1333 : : * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1334 : : * this point. The VACUUM command alone controls vacuum cycle IDs.
1335 : : */
1444 michael@paquier.xyz 1336 : 4176 : opaque = BTPageGetOpaque(page);
1337 : :
1338 : : /*
1339 : : * Clear the BTP_HAS_GARBAGE page flag.
1340 : : *
1341 : : * This flag indicates the presence of LP_DEAD items on the page (though
1342 : : * not reliably). Note that we only rely on it with pg_upgrade'd
1343 : : * !heapkeyspace indexes.
1344 : : */
5831 simon@2ndQuadrant.co 1345 : 4176 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1346 : :
1347 : 4176 : MarkBufferDirty(buf);
1348 : :
1349 : : /* XLOG stuff */
1887 pg@bowt.ie 1350 [ + - ]: 4176 : if (needswal)
1351 : : {
1352 : : xl_btree_delete xlrec_delete;
1353 : :
1214 1354 : 4176 : xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
2263 1355 : 4176 : xlrec_delete.ndeleted = ndeletable;
1887 1356 : 4176 : xlrec_delete.nupdated = nupdatable;
1009 1357 : 4176 : xlrec_delete.isCatalogRel = isCatalogRel;
1358 : :
4133 heikki.linnakangas@i 1359 : 4176 : XLogBeginInsert();
1360 : 4176 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
397 peter@eisentraut.org 1361 : 4176 : XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
1362 : :
1887 pg@bowt.ie 1363 [ + + ]: 4176 : if (ndeletable > 0)
397 peter@eisentraut.org 1364 : 4085 : XLogRegisterBufData(0, deletable,
1365 : : ndeletable * sizeof(OffsetNumber));
1366 : :
1887 pg@bowt.ie 1367 [ + + ]: 4176 : if (nupdatable > 0)
1368 : : {
397 peter@eisentraut.org 1369 : 425 : XLogRegisterBufData(0, updatedoffsets,
1370 : : nupdatable * sizeof(OffsetNumber));
1887 pg@bowt.ie 1371 : 425 : XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1372 : : }
1373 : :
4133 heikki.linnakangas@i 1374 : 4176 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1375 : : }
1376 : : else
2 pg@bowt.ie 1377 :UNC 0 : recptr = XLogGetFakeLSN(rel);
1378 : :
2 pg@bowt.ie 1379 :GNC 4176 : PageSetLSN(page, recptr);
1380 : :
9009 tgl@sss.pgh.pa.us 1381 [ - + ]:CBC 4176 : END_CRIT_SECTION();
1382 : :
1383 : : /* can't leak memory here */
1887 pg@bowt.ie 1384 [ + + ]: 4176 : if (updatedbuf != NULL)
1385 : 425 : pfree(updatedbuf);
1386 : : /* free tuples allocated within _bt_delitems_update() */
1387 [ + + ]: 8956 : for (int i = 0; i < nupdatable; i++)
1388 : 4780 : pfree(updatable[i]->itup);
10841 scrappy@hub.org 1389 : 4176 : }
1390 : :
1391 : : /*
1392 : : * Set up state needed to delete TIDs from posting list tuples via "updating"
1393 : : * the tuple. Performs steps common to both _bt_delitems_vacuum and
1394 : : * _bt_delitems_delete. These steps must take place before each function's
1395 : : * critical section begins.
1396 : : *
1397 : : * updatable and nupdatable are inputs, though note that we will use
1398 : : * _bt_update_posting() to replace the original itup with a pointer to a final
1399 : : * version in palloc()'d memory. Caller should free the tuples when its done.
1400 : : *
1401 : : * The first nupdatable entries from updatedoffsets are set to the page offset
1402 : : * number for posting list tuples that caller updates. This is mostly useful
1403 : : * because caller may need to WAL-log the page offsets (though we always do
1404 : : * this for caller out of convenience).
1405 : : *
1406 : : * Returns buffer consisting of an array of xl_btree_update structs that
1407 : : * describe the steps we perform here for caller (though only when needswal is
1408 : : * true). Also sets *updatedbuflen to the final size of the buffer. This
1409 : : * buffer is used by caller when WAL logging is required.
1410 : : */
1411 : : static char *
1887 pg@bowt.ie 1412 : 1395 : _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
1413 : : OffsetNumber *updatedoffsets, Size *updatedbuflen,
1414 : : bool needswal)
1415 : : {
1416 : 1395 : char *updatedbuf = NULL;
1417 : 1395 : Size buflen = 0;
1418 : :
1419 : : /* Shouldn't be called unless there's something to do */
1420 [ - + ]: 1395 : Assert(nupdatable > 0);
1421 : :
1422 [ + + ]: 30014 : for (int i = 0; i < nupdatable; i++)
1423 : : {
1424 : 28619 : BTVacuumPosting vacposting = updatable[i];
1425 : : Size itemsz;
1426 : :
1427 : : /* Replace work area IndexTuple with updated version */
1428 : 28619 : _bt_update_posting(vacposting);
1429 : :
1430 : : /* Keep track of size of xl_btree_update for updatedbuf in passing */
1431 : 28619 : itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1432 : 28619 : buflen += itemsz;
1433 : :
1434 : : /* Build updatedoffsets buffer in passing */
1435 : 28619 : updatedoffsets[i] = vacposting->updatedoffset;
1436 : : }
1437 : :
1438 : : /* XLOG stuff */
1439 [ + - ]: 1395 : if (needswal)
1440 : : {
1441 : 1395 : Size offset = 0;
1442 : :
1443 : : /* Allocate, set final size for caller */
1444 : 1395 : updatedbuf = palloc(buflen);
1445 : 1395 : *updatedbuflen = buflen;
1446 [ + + ]: 30014 : for (int i = 0; i < nupdatable; i++)
1447 : : {
1448 : 28619 : BTVacuumPosting vacposting = updatable[i];
1449 : : Size itemsz;
1450 : : xl_btree_update update;
1451 : :
1452 : 28619 : update.ndeletedtids = vacposting->ndeletedtids;
1453 : 28619 : memcpy(updatedbuf + offset, &update.ndeletedtids,
1454 : : SizeOfBtreeUpdate);
1455 : 28619 : offset += SizeOfBtreeUpdate;
1456 : :
1457 : 28619 : itemsz = update.ndeletedtids * sizeof(uint16);
1458 : 28619 : memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1459 : 28619 : offset += itemsz;
1460 : : }
1461 : : }
1462 : :
1463 : 1395 : return updatedbuf;
1464 : : }
1465 : :
1466 : : /*
1467 : : * Comparator used by _bt_delitems_delete_check() to restore deltids array
1468 : : * back to its original leaf-page-wise sort order
1469 : : */
1470 : : static int
1471 : 2566980 : _bt_delitems_cmp(const void *a, const void *b)
1472 : : {
62 peter@eisentraut.org 1473 :GNC 2566980 : const TM_IndexDelete *indexdelete1 = a;
1474 : 2566980 : const TM_IndexDelete *indexdelete2 = b;
1475 : :
758 nathan@postgresql.or 1476 [ - + ]:CBC 2566980 : Assert(indexdelete1->id != indexdelete2->id);
1477 : :
1478 : 2566980 : return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
1479 : : }
1480 : :
1481 : : /*
1482 : : * Try to delete item(s) from a btree leaf page during single-page cleanup.
1483 : : *
1484 : : * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
1485 : : * tuples from caller's deltids array: those whose TIDs are found safe to
1486 : : * delete by the tableam (or already marked LP_DEAD in index, and so already
1487 : : * known to be deletable by our simple index deletion caller). We physically
1488 : : * delete index tuples from buf leaf page last of all (for index tuples where
1489 : : * that is known to be safe following our table_index_delete_tuples() call).
1490 : : *
1491 : : * Simple index deletion caller only includes TIDs from index tuples marked
1492 : : * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1493 : : * included without increasing the total number of distinct table blocks for
1494 : : * the deletion operation as a whole. This approach often allows us to delete
1495 : : * some extra index tuples that were practically free for tableam to check in
1496 : : * passing (when they actually turn out to be safe to delete). It probably
1497 : : * only makes sense for the tableam to go ahead with these extra checks when
1498 : : * it is block-oriented (otherwise the checks probably won't be practically
1499 : : * free, which we rely on). The tableam interface requires the tableam side
1500 : : * to handle the problem, though, so this is okay (we as an index AM are free
1501 : : * to make the simplifying assumption that all tableams must be block-based).
1502 : : *
1503 : : * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1504 : : * without expecting that tableam will check most of them. The tableam has
1505 : : * considerable discretion around which entries/blocks it checks. Our role in
1506 : : * costing the bottom-up deletion operation is strictly advisory.
1507 : : *
1508 : : * Note: Caller must have added deltids entries (i.e. entries that go in
1509 : : * delstate's main array) in leaf-page-wise order: page offset number order,
1510 : : * TID order among entries taken from the same posting list tuple (tiebreak on
1511 : : * TID). This order is convenient to work with here.
1512 : : *
1513 : : * Note: We also rely on the id field of each deltids element "capturing" this
1514 : : * original leaf-page-wise order. That is, we expect to be able to get back
1515 : : * to the original leaf-page-wise order just by sorting deltids on the id
1516 : : * field (tableam will sort deltids for its own reasons, so we'll need to put
1517 : : * it back in leaf-page-wise order afterwards).
1518 : : */
1519 : : void
1887 pg@bowt.ie 1520 : 5777 : _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
1521 : : TM_IndexDeleteOp *delstate)
1522 : : {
1523 : 5777 : Page page = BufferGetPage(buf);
1524 : : TransactionId snapshotConflictHorizon;
1525 : : bool isCatalogRel;
1526 : 5777 : OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1527 : 5777 : int ndeletable = 0,
1528 : 5777 : nupdatable = 0;
1529 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
1530 : : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1531 : :
1532 : : /* Use tableam interface to determine which tuples to delete first */
1214 1533 : 5777 : snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1009 1534 [ + + - + : 5777 : isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
+ - - + -
- - - + +
- + - - -
- - - ]
1535 : :
1536 : : /* Should not WAL-log snapshotConflictHorizon unless it's required */
1214 1537 [ + + ]: 5777 : if (!XLogStandbyInfoActive())
1538 : 706 : snapshotConflictHorizon = InvalidTransactionId;
1539 : :
1540 : : /*
1541 : : * Construct a leaf-page-wise description of what _bt_delitems_delete()
1542 : : * needs to do to physically delete index tuples from the page.
1543 : : *
1544 : : * Must sort deltids array to restore leaf-page-wise order (original order
1545 : : * before call to tableam). This is the order that the loop expects.
1546 : : *
1547 : : * Note that deltids array might be a lot smaller now. It might even have
1548 : : * no entries at all (with bottom-up deletion caller), in which case there
1549 : : * is nothing left to do.
1550 : : */
1887 1551 : 5777 : qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1552 : : _bt_delitems_cmp);
1553 [ + + ]: 5777 : if (delstate->ndeltids == 0)
1554 : : {
1555 [ - + ]: 1601 : Assert(delstate->bottomup);
1556 : 1601 : return;
1557 : : }
1558 : :
1559 : : /* We definitely have to delete at least one index tuple (or one TID) */
1560 [ + + ]: 377833 : for (int i = 0; i < delstate->ndeltids; i++)
1561 : : {
1562 : 373657 : TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1563 : 373657 : OffsetNumber idxoffnum = dstatus->idxoffnum;
1564 : 373657 : ItemId itemid = PageGetItemId(page, idxoffnum);
1565 : 373657 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1566 : : int nestedi,
1567 : : nitem;
1568 : : BTVacuumPosting vacposting;
1569 : :
1570 [ + - + - : 373657 : Assert(OffsetNumberIsValid(idxoffnum));
- + ]
1571 : :
1572 [ + + ]: 373657 : if (idxoffnum == postingidxoffnum)
1573 : : {
1574 : : /*
1575 : : * This deltid entry is a TID from a posting list tuple that has
1576 : : * already been completely processed
1577 : : */
1578 [ - + ]: 14902 : Assert(BTreeTupleIsPosting(itup));
1579 [ - + ]: 14902 : Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1580 : : &delstate->deltids[i].tid) < 0);
1581 [ - + ]: 14902 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1582 : : &delstate->deltids[i].tid) >= 0);
1583 : 14902 : continue;
1584 : : }
1585 : :
1586 [ + + ]: 358755 : if (!BTreeTupleIsPosting(itup))
1587 : : {
1588 : : /* Plain non-pivot tuple */
1589 [ - + ]: 346242 : Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1590 [ + + ]: 346242 : if (dstatus->knowndeletable)
1591 : 278577 : deletable[ndeletable++] = idxoffnum;
1592 : 346242 : continue;
1593 : : }
1594 : :
1595 : : /*
1596 : : * itup is a posting list tuple whose lowest deltids entry (which may
1597 : : * or may not be for the first TID from itup) is considered here now.
1598 : : * We should process all of the deltids entries for the posting list
1599 : : * together now, though (not just the lowest). Remember to skip over
1600 : : * later itup-related entries during later iterations of outermost
1601 : : * loop.
1602 : : */
1603 : 12513 : postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1604 : 12513 : nestedi = i; /* Initialize for first itup deltids entry */
1605 : 12513 : vacposting = NULL; /* Describes final action for itup */
1606 : 12513 : nitem = BTreeTupleGetNPosting(itup);
1607 [ + + ]: 58323 : for (int p = 0; p < nitem; p++)
1608 : : {
1609 : 45810 : ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1610 : 45810 : int ptidcmp = -1;
1611 : :
1612 : : /*
1613 : : * This nested loop reuses work across ptid TIDs taken from itup.
1614 : : * We take advantage of the fact that both itup's TIDs and deltids
1615 : : * entries (within a single itup/posting list grouping) must both
1616 : : * be in ascending TID order.
1617 : : */
1618 [ + + ]: 66097 : for (; nestedi < delstate->ndeltids; nestedi++)
1619 : : {
1620 : 63125 : TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1621 : 63125 : TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1622 : :
1623 : : /* Stop once we get past all itup related deltids entries */
1624 [ - + ]: 63125 : Assert(tdstatus->idxoffnum >= idxoffnum);
1625 [ + + ]: 63125 : if (tdstatus->idxoffnum != idxoffnum)
1626 : 11677 : break;
1627 : :
1628 : : /* Skip past non-deletable itup related entries up front */
1629 [ + + ]: 51448 : if (!tdstatus->knowndeletable)
1630 : 4018 : continue;
1631 : :
1632 : : /* Entry is first partial ptid match (or an exact match)? */
1633 : 47430 : ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1634 [ + + ]: 47430 : if (ptidcmp >= 0)
1635 : : {
1636 : : /* Greater than or equal (partial or exact) match... */
1637 : 31161 : break;
1638 : : }
1639 : : }
1640 : :
1641 : : /* ...exact ptid match to a deletable deltids entry? */
1642 [ + + ]: 45810 : if (ptidcmp != 0)
1643 : 22413 : continue;
1644 : :
1645 : : /* Exact match for deletable deltids entry -- ptid gets deleted */
1646 [ + + ]: 23397 : if (vacposting == NULL)
1647 : : {
1648 : 11415 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1649 : : nitem * sizeof(uint16));
1650 : 11415 : vacposting->itup = itup;
1651 : 11415 : vacposting->updatedoffset = idxoffnum;
1652 : 11415 : vacposting->ndeletedtids = 0;
1653 : : }
1654 : 23397 : vacposting->deletetids[vacposting->ndeletedtids++] = p;
1655 : : }
1656 : :
1657 : : /* Final decision on itup, a posting list tuple */
1658 : :
1659 [ + + ]: 12513 : if (vacposting == NULL)
1660 : : {
1661 : : /* No TIDs to delete from itup -- do nothing */
1662 : : }
1663 [ + + ]: 11415 : else if (vacposting->ndeletedtids == nitem)
1664 : : {
1665 : : /* Straight delete of itup (to delete all TIDs) */
1666 : 6635 : deletable[ndeletable++] = idxoffnum;
1667 : : /* Turns out we won't need granular information */
1668 : 6635 : pfree(vacposting);
1669 : : }
1670 : : else
1671 : : {
1672 : : /* Delete some (but not all) TIDs from itup */
1673 [ + - - + ]: 4780 : Assert(vacposting->ndeletedtids > 0 &&
1674 : : vacposting->ndeletedtids < nitem);
1675 : 4780 : updatable[nupdatable++] = vacposting;
1676 : : }
1677 : : }
1678 : :
1679 : : /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1009 1680 : 4176 : _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1681 : : deletable, ndeletable, updatable, nupdatable);
1682 : :
1683 : : /* be tidy */
1887 1684 [ + + ]: 8956 : for (int i = 0; i < nupdatable; i++)
1685 : 4780 : pfree(updatable[i]);
1686 : : }
1687 : :
1688 : : /*
1689 : : * Check that leftsib page (the btpo_prev of target page) is not marked with
1690 : : * INCOMPLETE_SPLIT flag. Used during page deletion.
1691 : : *
1692 : : * Returning true indicates that page flag is set in leftsib (which is
1693 : : * definitely still the left sibling of target). When that happens, the
1694 : : * target doesn't have a downlink in parent, and the page deletion algorithm
1695 : : * isn't prepared to handle that. Deletion of the target page (or the whole
1696 : : * subtree that contains the target page) cannot take place.
1697 : : *
1698 : : * Caller should not have a lock on the target page itself, since pages on the
1699 : : * same level must always be locked left to right to avoid deadlocks.
1700 : : */
1701 : : static bool
1009 1702 : 3224 : _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1703 : : {
1704 : : Buffer buf;
1705 : : Page page;
1706 : : BTPageOpaque opaque;
1707 : : bool result;
1708 : :
1709 : : /* Easy case: No left sibling */
2138 1710 [ + + ]: 3224 : if (leftsib == P_NONE)
1711 : 2251 : return false;
1712 : :
1009 1713 : 973 : buf = _bt_getbuf(rel, leftsib, BT_READ);
2138 1714 : 973 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 1715 : 973 : opaque = BTPageGetOpaque(page);
1716 : :
1717 : : /*
1718 : : * If the left sibling was concurrently split, so that its next-pointer
1719 : : * doesn't point to the current page anymore, the split that created
1720 : : * target must be completed. Caller can reasonably expect that there will
1721 : : * be a downlink to the target page that it can relocate using its stack.
1722 : : * (We don't allow splitting an incompletely split page again until the
1723 : : * previous split has been completed.)
1724 : : */
2138 pg@bowt.ie 1725 [ + - - + ]: 973 : result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1726 : 973 : _bt_relbuf(rel, buf);
1727 : :
1728 : 973 : return result;
1729 : : }
1730 : :
1731 : : /*
1732 : : * Check that leafrightsib page (the btpo_next of target leaf page) is not
1733 : : * marked with ISHALFDEAD flag. Used during page deletion.
1734 : : *
1735 : : * Returning true indicates that page flag is set in leafrightsib, so page
1736 : : * deletion cannot go ahead. Our caller is not prepared to deal with the case
1737 : : * where the parent page does not have a pivot tuples whose downlink points to
1738 : : * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1739 : : * seem worth going to the trouble of teaching our caller to deal with it.
1740 : : * The situation will be resolved after VACUUM finishes the deletion of the
1741 : : * half-dead page (when a future VACUUM operation reaches the target page
1742 : : * again).
1743 : : *
1744 : : * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1745 : : * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1746 : : * okay because of the restriction on deleting pages that are the rightmost
1747 : : * page of their parent (i.e. that such deletions can only take place when the
1748 : : * entire subtree must be deleted). The leaf level check made here will apply
1749 : : * to a right "cousin" leaf page rather than a simple right sibling leaf page
1750 : : * in cases where caller actually goes on to attempt deleting pages that are
1751 : : * above the leaf page. The right cousin leaf page is representative of the
1752 : : * left edge of the subtree to the right of the to-be-deleted subtree as a
1753 : : * whole, which is exactly the condition that our caller cares about.
1754 : : * (Besides, internal pages are never marked half-dead, so it isn't even
1755 : : * possible to _directly_ assess if an internal page is part of some other
1756 : : * to-be-deleted subtree.)
1757 : : */
1758 : : static bool
1009 1759 : 3076 : _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1760 : : {
1761 : : Buffer buf;
1762 : : Page page;
1763 : : BTPageOpaque opaque;
1764 : : bool result;
1765 : :
2138 1766 [ - + ]: 3076 : Assert(leafrightsib != P_NONE);
1767 : :
1009 1768 : 3076 : buf = _bt_getbuf(rel, leafrightsib, BT_READ);
3616 kgrittn@postgresql.o 1769 : 3076 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 1770 : 3076 : opaque = BTPageGetOpaque(page);
1771 : :
2138 pg@bowt.ie 1772 [ + - - + ]: 3076 : Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
4312 heikki.linnakangas@i 1773 : 3076 : result = P_ISHALFDEAD(opaque);
1774 : 3076 : _bt_relbuf(rel, buf);
1775 : :
1776 : 3076 : return result;
1777 : : }
1778 : :
1779 : : /*
1780 : : * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1781 : : *
1782 : : * This action unlinks the leaf page from the b-tree structure, removing all
1783 : : * pointers leading to it --- but not touching its own left and right links.
1784 : : * The page cannot be physically reclaimed right away, since other processes
1785 : : * may currently be trying to follow links leading to the page; they have to
1786 : : * be allowed to use its right-link to recover. See nbtree/README.
1787 : : *
1788 : : * On entry, the target buffer must be pinned and locked (either read or write
1789 : : * lock is OK). The page must be an empty leaf page, which may be half-dead
1790 : : * already (a half-dead page should only be passed to us when an earlier
1791 : : * VACUUM operation was interrupted, though). Note in particular that caller
1792 : : * should never pass a buffer containing an existing deleted page here. The
1793 : : * lock and pin on caller's buffer will be dropped before we return.
1794 : : *
1795 : : * Maintains bulk delete stats for caller, which are taken from vstate. We
1796 : : * need to cooperate closely with caller here so that whole VACUUM operation
1797 : : * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1798 : : * delete in passing. If such pages happen to be from a block number that is
1799 : : * ahead of the current scanblkno position, then caller is expected to count
1800 : : * them directly later on. It's simpler for us to understand caller's
1801 : : * requirements than it would be for caller to understand when or how a
1802 : : * deleted page became deleted after the fact.
1803 : : *
1804 : : * NOTE: this leaks memory. Rather than trying to clean up everything
1805 : : * carefully, it's better to run it in a temp context that can be reset
1806 : : * frequently.
1807 : : */
1808 : : void
1844 pg@bowt.ie 1809 : 3167 : _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
1810 : : {
1811 : : BlockNumber rightsib;
1812 : : bool rightsib_empty;
1813 : : Page page;
1814 : : BTPageOpaque opaque;
1815 : :
1816 : : /*
1817 : : * Save original leafbuf block number from caller. Only deleted blocks
1818 : : * that are <= scanblkno are added to bulk delete stat's pages_deleted
1819 : : * count.
1820 : : */
2144 1821 : 3167 : BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1822 : :
1823 : : /*
1824 : : * "stack" is a search stack leading (approximately) to the target page.
1825 : : * It is initially NULL, but when iterating, we keep it to avoid
1826 : : * duplicated search effort.
1827 : : *
1828 : : * Also, when "stack" is not NULL, we have already checked that the
1829 : : * current page is not the right half of an incomplete split, i.e. the
1830 : : * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1831 : : * when the current target page is to the right of caller's initial page
1832 : : * (the scanblkno page).
1833 : : */
4384 heikki.linnakangas@i 1834 : 3167 : BTStack stack = NULL;
1835 : :
1836 : : for (;;)
1837 : : {
2144 pg@bowt.ie 1838 : 6243 : page = BufferGetPage(leafbuf);
1444 michael@paquier.xyz 1839 : 6243 : opaque = BTPageGetOpaque(page);
1840 : :
1841 : : /*
1842 : : * Internal pages are never deleted directly, only as part of deleting
1843 : : * the whole subtree all the way down to leaf level.
1844 : : *
1845 : : * Also check for deleted pages here. Caller never passes us a fully
1846 : : * deleted page. Only VACUUM can delete pages, so there can't have
1847 : : * been a concurrent deletion. Assume that we reached any deleted
1848 : : * page encountered here by following a sibling link, and that the
1849 : : * index is corrupt.
1850 : : */
2144 pg@bowt.ie 1851 [ - + ]: 6243 : Assert(!P_ISDELETED(opaque));
1852 [ + - - + ]: 6243 : if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1853 : : {
1854 : : /*
1855 : : * Pre-9.4 page deletion only marked internal pages as half-dead,
1856 : : * but now we only use that flag on leaf pages. The old algorithm
1857 : : * was never supposed to leave half-dead pages in the tree, it was
1858 : : * just a transient state, but it was nevertheless possible in
1859 : : * error scenarios. We don't know how to deal with them here. They
1860 : : * are harmless as far as searches are considered, but inserts
1861 : : * into the deleted keyspace could add out-of-order downlinks in
1862 : : * the upper levels. Log a notice, hopefully the admin will notice
1863 : : * and reindex.
1864 : : */
4384 heikki.linnakangas@i 1865 [ # # ]:UBC 0 : if (P_ISHALFDEAD(opaque))
1866 [ # # ]: 0 : ereport(LOG,
1867 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1868 : : errmsg("index \"%s\" contains a half-dead internal page",
1869 : : RelationGetRelationName(rel)),
1870 : : errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1871 : :
2144 pg@bowt.ie 1872 [ # # ]: 0 : if (P_ISDELETED(opaque))
1873 [ # # ]: 0 : ereport(LOG,
1874 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1875 : : errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1876 : : BufferGetBlockNumber(leafbuf),
1877 : : scanblkno,
1878 : : RelationGetRelationName(rel))));
1879 : :
1880 : 0 : _bt_relbuf(rel, leafbuf);
1844 pg@bowt.ie 1881 :CBC 101 : return;
1882 : : }
1883 : :
1884 : : /*
1885 : : * We can never delete rightmost pages nor root pages. While at it,
1886 : : * check that page is empty, since it's possible that the leafbuf page
1887 : : * was empty a moment ago, but has since had some inserts.
1888 : : *
1889 : : * To keep the algorithm simple, we also never delete an incompletely
1890 : : * split page (they should be rare enough that this doesn't make any
1891 : : * meaningful difference to disk usage):
1892 : : *
1893 : : * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1894 : : * left half of an incomplete split, but ensuring that it's not the
1895 : : * right half is more complicated. For that, we have to check that
1896 : : * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1897 : : * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1898 : : * release the lock on scanblkno/leafbuf, check the left sibling, and
1899 : : * construct a search stack to scanblkno. On subsequent iterations,
1900 : : * we know we stepped right from a page that passed these tests, so
1901 : : * it's OK.
1902 : : */
2144 1903 [ + + + - ]: 6243 : if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
4380 heikki.linnakangas@i 1904 [ - + + - ]: 6148 : P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1905 [ - + ]: 6148 : P_INCOMPLETE_SPLIT(opaque))
1906 : : {
1907 : : /* Should never fail to delete a half-dead page */
4384 1908 [ - + ]: 95 : Assert(!P_ISHALFDEAD(opaque));
1909 : :
2144 pg@bowt.ie 1910 : 95 : _bt_relbuf(rel, leafbuf);
1844 1911 : 95 : return;
1912 : : }
1913 : :
1914 : : /*
1915 : : * First, remove downlink pointing to the page (or a parent of the
1916 : : * page, if we are going to delete a taller subtree), and mark the
1917 : : * leafbuf page half-dead
1918 : : */
4384 heikki.linnakangas@i 1919 [ + + ]: 6148 : if (!P_ISHALFDEAD(opaque))
1920 : : {
1921 : : /*
1922 : : * We need an approximate pointer to the page's parent page. We
1923 : : * use a variant of the standard search mechanism to search for
1924 : : * the page's high key; this will give us a link to either the
1925 : : * current parent or someplace to its left (if there are multiple
1926 : : * equal high keys, which is possible with !heapkeyspace indexes).
1927 : : *
1928 : : * Also check if this is the right-half of an incomplete split
1929 : : * (see comment above).
1930 : : */
1931 [ + + ]: 6147 : if (!stack)
1932 : 3071 : {
1933 : : BTScanInsert itup_key;
1934 : : ItemId itemid;
1935 : : IndexTuple targetkey;
1936 : : BlockNumber leftsib,
1937 : : leafblkno;
1938 : : Buffer sleafbuf;
1939 : :
1940 : 3071 : itemid = PageGetItemId(page, P_HIKEY);
1941 : 3071 : targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1942 : :
4380 1943 : 3071 : leftsib = opaque->btpo_prev;
2134 pg@bowt.ie 1944 : 3071 : leafblkno = BufferGetBlockNumber(leafbuf);
1945 : :
1946 : : /*
1947 : : * To avoid deadlocks, we'd better drop the leaf page lock
1948 : : * before going further.
1949 : : */
2063 1950 : 3071 : _bt_unlockbuf(rel, leafbuf);
1951 : :
1952 : : /*
1953 : : * Check that the left sibling of leafbuf (if any) is not
1954 : : * marked with INCOMPLETE_SPLIT flag before proceeding
1955 : : */
2134 1956 [ - + ]: 3071 : Assert(leafblkno == scanblkno);
1009 1957 [ - + ]: 3071 : if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1958 : : {
2138 pg@bowt.ie 1959 :UBC 0 : ReleaseBuffer(leafbuf);
1844 1960 : 0 : return;
1961 : : }
1962 : :
1963 : : /*
1964 : : * We need an insertion scan key, so build one.
1965 : : *
1966 : : * _bt_search searches for the leaf page that contains any
1967 : : * matching non-pivot tuples, but we need it to "search" for
1968 : : * the high key pivot from the page that we're set to delete.
1969 : : * Compensate for the mismatch by having _bt_search locate the
1970 : : * last position < equal-to-untruncated-prefix non-pivots.
1971 : : */
1009 pg@bowt.ie 1972 :CBC 3071 : itup_key = _bt_mkscankey(rel, targetkey);
1973 : :
1974 : : /* Set up a BTLessStrategyNumber-like insertion scan key */
828 1975 : 3071 : itup_key->nextkey = false;
1976 : 3071 : itup_key->backward = true;
3 pg@bowt.ie 1977 :GNC 3071 : stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, true);
1978 : : /* won't need a second lock or pin on leafbuf */
2134 pg@bowt.ie 1979 :CBC 3071 : _bt_relbuf(rel, sleafbuf);
1980 : :
1981 : : /*
1982 : : * Re-lock the leaf page, and start over to use our stack
1983 : : * within _bt_mark_page_halfdead. We must do it that way
1984 : : * because it's possible that leafbuf can no longer be
1985 : : * deleted. We need to recheck.
1986 : : *
1987 : : * Note: We can't simply hold on to the sleafbuf lock instead,
1988 : : * because it's barely possible that sleafbuf is not the same
1989 : : * page as leafbuf. This happens when leafbuf split after our
1990 : : * original lock was dropped, but before _bt_search finished
1991 : : * its descent. We rely on the assumption that we'll find
1992 : : * leafbuf isn't safe to delete anymore in this scenario.
1993 : : * (Page deletion can cope with the stack being to the left of
1994 : : * leafbuf, but not to the right of leafbuf.)
1995 : : */
2063 1996 : 3071 : _bt_lockbuf(rel, leafbuf, BT_WRITE);
4384 heikki.linnakangas@i 1997 : 3071 : continue;
1998 : : }
1999 : :
2000 : : /*
2001 : : * See if it's safe to delete the leaf page, and determine how
2002 : : * many parent/internal pages above the leaf level will be
2003 : : * deleted. If it's safe then _bt_mark_page_halfdead will also
2004 : : * perform the first phase of deletion, which includes marking the
2005 : : * leafbuf page half-dead.
2006 : : */
2144 pg@bowt.ie 2007 [ + - - + ]: 3076 : Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
1009 2008 [ + + ]: 3076 : if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2009 : : stack))
2010 : : {
2144 2011 : 6 : _bt_relbuf(rel, leafbuf);
1844 2012 : 6 : return;
2013 : : }
2014 : : }
2015 : : else
2016 : : {
103 heikki.linnakangas@i 2017 :GNC 1 : INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL);
2018 : : }
2019 : :
2020 : : /*
2021 : : * Then unlink it from its siblings. Each call to
2022 : : * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2023 : : * making it shallower. Iterate until the leafbuf page is deleted.
2024 : : */
4384 heikki.linnakangas@i 2025 :CBC 3071 : rightsib_empty = false;
2144 pg@bowt.ie 2026 [ + - - + ]: 3071 : Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
4384 heikki.linnakangas@i 2027 [ + + ]: 6283 : while (P_ISHALFDEAD(opaque))
2028 : : {
2029 : : /* Check for interrupts in _bt_unlink_halfdead_page */
2144 pg@bowt.ie 2030 [ - + ]: 3213 : if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2031 : : &rightsib_empty, vstate))
2032 : : {
2033 : : /*
2034 : : * _bt_unlink_halfdead_page should never fail, since we
2035 : : * established that deletion is generally safe in
2036 : : * _bt_mark_page_halfdead -- index must be corrupt.
2037 : : *
2038 : : * Note that _bt_unlink_halfdead_page already released the
2039 : : * lock and pin on leafbuf for us.
2040 : : */
1865 pg@bowt.ie 2041 :UBC 0 : Assert(false);
2042 : : return;
2043 : : }
2044 : : }
2045 : :
2144 pg@bowt.ie 2046 [ + - - + ]:CBC 3070 : Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2047 : :
4384 heikki.linnakangas@i 2048 : 3070 : rightsib = opaque->btpo_next;
2049 : :
2144 pg@bowt.ie 2050 : 3070 : _bt_relbuf(rel, leafbuf);
2051 : :
2052 : : /*
2053 : : * Check here, as calling loops will have locks held, preventing
2054 : : * interrupts from being processed.
2055 : : */
2811 andres@anarazel.de 2056 [ - + ]: 3070 : CHECK_FOR_INTERRUPTS();
2057 : :
2058 : : /*
2059 : : * The page has now been deleted. If its right sibling is completely
2060 : : * empty, it's possible that the reason we haven't deleted it earlier
2061 : : * is that it was the rightmost child of the parent. Now that we
2062 : : * removed the downlink for this page, the right sibling might now be
2063 : : * the only child of the parent, and could be removed. It would be
2064 : : * picked up by the next vacuum anyway, but might as well try to
2065 : : * remove it now, so loop back to process the right sibling.
2066 : : *
2067 : : * Note: This relies on the assumption that _bt_getstackbuf() will be
2068 : : * able to reuse our original descent stack with a different child
2069 : : * block (provided that the child block is to the right of the
2070 : : * original leaf page reached by _bt_search()). It will even update
2071 : : * the descent stack each time we loop around, avoiding repeated work.
2072 : : */
4384 heikki.linnakangas@i 2073 [ + + ]: 3070 : if (!rightsib_empty)
2074 : 3065 : break;
2075 : :
1009 pg@bowt.ie 2076 : 5 : leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2077 : : }
2078 : : }
2079 : :
2080 : : /*
2081 : : * First stage of page deletion.
2082 : : *
2083 : : * Establish the height of the to-be-deleted subtree with leafbuf at its
2084 : : * lowest level, remove the downlink to the subtree, and mark leafbuf
2085 : : * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
2086 : : * but may include additional internal pages (at most one per level of the
2087 : : * tree below the root).
2088 : : *
2089 : : * Caller must pass a valid heaprel, since it's just about possible that our
2090 : : * call to _bt_lock_subtree_parent will need to allocate a new index page to
2091 : : * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel.
2092 : : *
2093 : : * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2094 : : * the rightmost child of its parent (and parent has more than one downlink).
2095 : : * Returns 'true' when the first stage of page deletion completed
2096 : : * successfully.
2097 : : */
2098 : : static bool
1079 andres@anarazel.de 2099 : 3076 : _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
2100 : : BTStack stack)
2101 : : {
2102 : : BlockNumber leafblkno;
2103 : : BlockNumber leafrightsib;
2104 : : BlockNumber topparent;
2105 : : BlockNumber topparentrightsib;
2106 : : ItemId itemid;
2107 : : Page page;
2108 : : BTPageOpaque opaque;
2109 : : Buffer subtreeparent;
2110 : : OffsetNumber poffset;
2111 : : OffsetNumber nextoffset;
2112 : : IndexTuple itup;
2113 : : IndexTupleData trunctuple;
2114 : : XLogRecPtr recptr;
2115 : :
3616 kgrittn@postgresql.o 2116 : 3076 : page = BufferGetPage(leafbuf);
1444 michael@paquier.xyz 2117 : 3076 : opaque = BTPageGetOpaque(page);
2118 : :
2134 pg@bowt.ie 2119 [ + - + - : 3076 : Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
+ - + - -
+ - + ]
2120 : : P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2121 : : P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
1009 2122 [ - + ]: 3076 : Assert(heaprel != NULL);
2123 : :
2124 : : /*
2125 : : * Save info about the leaf page.
2126 : : */
4384 heikki.linnakangas@i 2127 : 3076 : leafblkno = BufferGetBlockNumber(leafbuf);
2128 : 3076 : leafrightsib = opaque->btpo_next;
2129 : :
2130 : : /*
2131 : : * Before attempting to lock the parent page, check that the right sibling
2132 : : * is not in half-dead state. A half-dead right sibling would have no
2133 : : * downlink in the parent, which would be highly confusing later when we
2134 : : * delete the downlink. It would fail the "right sibling of target page
2135 : : * is also the next child in parent page" cross-check below.
2136 : : */
1009 pg@bowt.ie 2137 [ - + ]: 3076 : if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2138 : : {
4312 heikki.linnakangas@i 2139 [ # # ]:UBC 0 : elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2140 : : leafblkno, leafrightsib);
2141 : 0 : return false;
2142 : : }
2143 : :
2144 : : /*
2145 : : * We cannot delete a page that is the rightmost child of its immediate
2146 : : * parent, unless it is the only child --- in which case the parent has to
2147 : : * be deleted too, and the same condition applies recursively to it. We
2148 : : * have to check this condition all the way up before trying to delete,
2149 : : * and lock the parent of the root of the to-be-deleted subtree (the
2150 : : * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
2151 : : * for us. We remove the downlink to the "top parent" page (subtree root
2152 : : * page) from the subtree parent page below.
2153 : : *
2154 : : * Initialize topparent to be leafbuf page now. The final to-be-deleted
2155 : : * subtree is often a degenerate one page subtree consisting only of the
2156 : : * leafbuf page. When that happens, the leafbuf page is the final subtree
2157 : : * root page/top parent page.
2158 : : */
2134 pg@bowt.ie 2159 :CBC 3076 : topparent = leafblkno;
2160 : 3076 : topparentrightsib = leafrightsib;
1079 andres@anarazel.de 2161 [ + + ]: 3076 : if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
2162 : : &subtreeparent, &poffset,
2163 : : &topparent, &topparentrightsib))
4384 heikki.linnakangas@i 2164 : 6 : return false;
2165 : :
2134 pg@bowt.ie 2166 : 3070 : page = BufferGetPage(subtreeparent);
1444 michael@paquier.xyz 2167 : 3070 : opaque = BTPageGetOpaque(page);
2168 : :
2169 : : #ifdef USE_ASSERT_CHECKING
2170 : :
2171 : : /*
2172 : : * This is just an assertion because _bt_lock_subtree_parent should have
2173 : : * guaranteed tuple has the expected contents
2174 : : */
2134 pg@bowt.ie 2175 : 3070 : itemid = PageGetItemId(page, poffset);
4384 heikki.linnakangas@i 2176 : 3070 : itup = (IndexTuple) PageGetItem(page, itemid);
2134 pg@bowt.ie 2177 [ - + ]: 3070 : Assert(BTreeTupleGetDownLink(itup) == topparent);
2178 : : #endif
2179 : :
2180 : 3070 : nextoffset = OffsetNumberNext(poffset);
4384 heikki.linnakangas@i 2181 : 3070 : itemid = PageGetItemId(page, nextoffset);
2182 : 3070 : itup = (IndexTuple) PageGetItem(page, itemid);
2183 : :
2184 : : /*
2185 : : * Check that the parent-page index items we're about to delete/overwrite
2186 : : * in subtree parent page contain what we expect. This can fail if the
2187 : : * index has become corrupt for some reason. When that happens we back
2188 : : * out of deletion of the leafbuf subtree. (This is just like the case
2189 : : * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
2190 : : */
2134 pg@bowt.ie 2191 [ - + ]: 3070 : if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2192 : : {
998 pg@bowt.ie 2193 [ # # ]:UBC 0 : ereport(LOG,
2194 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2195 : : errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2196 : : topparentrightsib, topparent,
2197 : : BTreeTupleGetDownLink(itup),
2198 : : BufferGetBlockNumber(subtreeparent),
2199 : : RelationGetRelationName(rel))));
2200 : :
2201 : 0 : _bt_relbuf(rel, subtreeparent);
2202 : 0 : Assert(false);
2203 : : return false;
2204 : : }
2205 : :
2206 : : /*
2207 : : * Any insert which would have gone on the leaf block will now go to its
2208 : : * right sibling. In other words, the key space moves right.
2209 : : */
4384 heikki.linnakangas@i 2210 :CBC 3070 : PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2211 : :
2212 : : /* No ereport(ERROR) until changes are logged */
2213 : 3070 : START_CRIT_SECTION();
2214 : :
2215 : : /*
2216 : : * Update parent of subtree. We want to delete the downlink to the top
2217 : : * parent page/root of the subtree, and the *following* key. Easiest way
2218 : : * is to copy the right sibling's downlink over the downlink that points
2219 : : * to top parent page, and then delete the right sibling's original pivot
2220 : : * tuple.
2221 : : *
2222 : : * Lanin and Shasha make the key space move left when deleting a page,
2223 : : * whereas the key space moves right here. That's why we cannot simply
2224 : : * delete the pivot tuple with the downlink to the top parent page. See
2225 : : * nbtree/README.
2226 : : */
2134 pg@bowt.ie 2227 : 3070 : page = BufferGetPage(subtreeparent);
1444 michael@paquier.xyz 2228 : 3070 : opaque = BTPageGetOpaque(page);
2229 : :
2134 pg@bowt.ie 2230 : 3070 : itemid = PageGetItemId(page, poffset);
4384 heikki.linnakangas@i 2231 : 3070 : itup = (IndexTuple) PageGetItem(page, itemid);
2134 pg@bowt.ie 2232 : 3070 : BTreeTupleSetDownLink(itup, topparentrightsib);
2233 : :
2234 : 3070 : nextoffset = OffsetNumberNext(poffset);
4384 heikki.linnakangas@i 2235 : 3070 : PageIndexTupleDelete(page, nextoffset);
2236 : :
2237 : : /*
2238 : : * Mark the leaf page as half-dead, and stamp it with a link to the top
2239 : : * parent page. When the leaf page is also the top parent page, the link
2240 : : * is set to InvalidBlockNumber.
2241 : : */
3616 kgrittn@postgresql.o 2242 : 3070 : page = BufferGetPage(leafbuf);
1444 michael@paquier.xyz 2243 : 3070 : opaque = BTPageGetOpaque(page);
4384 heikki.linnakangas@i 2244 : 3070 : opaque->btpo_flags |= BTP_HALF_DEAD;
2245 : :
2406 pg@bowt.ie 2246 [ - + ]: 3070 : Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
4344 heikki.linnakangas@i 2247 [ + - + - : 6140 : MemSet(&trunctuple, 0, sizeof(IndexTupleData));
+ - + - +
+ ]
2248 : 3070 : trunctuple.t_info = sizeof(IndexTupleData);
2134 pg@bowt.ie 2249 [ + + ]: 3070 : if (topparent != leafblkno)
2250 : 64 : BTreeTupleSetTopParent(&trunctuple, topparent);
2251 : : else
2883 teodor@sigaev.ru 2252 : 3006 : BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2253 : :
139 peter@eisentraut.org 2254 [ - + ]:GNC 3070 : if (!PageIndexTupleOverwrite(page, P_HIKEY, &trunctuple, IndexTupleSize(&trunctuple)))
2406 pg@bowt.ie 2255 [ # # ]:UBC 0 : elog(ERROR, "could not overwrite high key in half-dead page");
2256 : :
2257 : : /* Must mark buffers dirty before XLogInsert */
2134 pg@bowt.ie 2258 :CBC 3070 : MarkBufferDirty(subtreeparent);
4384 heikki.linnakangas@i 2259 : 3070 : MarkBufferDirty(leafbuf);
2260 : :
2261 : : /* XLOG stuff */
2262 [ + - + + : 3070 : if (RelationNeedsWAL(rel))
+ - + - ]
4384 heikki.linnakangas@i 2263 :GIC 3070 : {
2264 : : xl_btree_mark_page_halfdead xlrec;
2265 : :
2134 pg@bowt.ie 2266 :CBC 3070 : xlrec.poffset = poffset;
4384 heikki.linnakangas@i 2267 : 3070 : xlrec.leafblk = leafblkno;
2134 pg@bowt.ie 2268 [ + + ]: 3070 : if (topparent != leafblkno)
2269 : 64 : xlrec.topparent = topparent;
2270 : : else
4344 heikki.linnakangas@i 2271 : 3006 : xlrec.topparent = InvalidBlockNumber;
2272 : :
4133 2273 : 3070 : XLogBeginInsert();
2274 : 3070 : XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2134 pg@bowt.ie 2275 : 3070 : XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2276 : :
3616 kgrittn@postgresql.o 2277 : 3070 : page = BufferGetPage(leafbuf);
1444 michael@paquier.xyz 2278 : 3070 : opaque = BTPageGetOpaque(page);
4384 heikki.linnakangas@i 2279 : 3070 : xlrec.leftblk = opaque->btpo_prev;
2280 : 3070 : xlrec.rightblk = opaque->btpo_next;
2281 : :
397 peter@eisentraut.org 2282 : 3070 : XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);
2283 : :
4133 heikki.linnakangas@i 2284 : 3070 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2285 : : }
2286 : : else
2 pg@bowt.ie 2287 :UNC 0 : recptr = XLogGetFakeLSN(rel);
2288 : :
2 pg@bowt.ie 2289 :GNC 3070 : page = BufferGetPage(subtreeparent);
2290 : 3070 : PageSetLSN(page, recptr);
2291 : 3070 : page = BufferGetPage(leafbuf);
2292 : 3070 : PageSetLSN(page, recptr);
2293 : :
4384 heikki.linnakangas@i 2294 [ - + ]:CBC 3070 : END_CRIT_SECTION();
2295 : :
2134 pg@bowt.ie 2296 : 3070 : _bt_relbuf(rel, subtreeparent);
4384 heikki.linnakangas@i 2297 : 3070 : return true;
2298 : : }
2299 : :
2300 : : /*
2301 : : * Second stage of page deletion.
2302 : : *
2303 : : * Unlinks a single page (in the subtree undergoing deletion) from its
2304 : : * siblings. Also marks the page deleted.
2305 : : *
2306 : : * To get rid of the whole subtree, including the leaf page itself, call here
2307 : : * until the leaf page is deleted. The original "top parent" established in
2308 : : * the first stage of deletion is deleted in the first call here, while the
2309 : : * leaf page is deleted in the last call here. Note that the leaf page itself
2310 : : * is often the initial top parent page.
2311 : : *
2312 : : * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2313 : : * the right sibling of the current target page is empty, *rightsib_empty is
2314 : : * set to true, allowing caller to delete the target's right sibling page in
2315 : : * passing. Note that *rightsib_empty is only actually used by caller when
2316 : : * target page is leafbuf, following last call here for leafbuf/the subtree
2317 : : * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2318 : : * consistent.)
2319 : : *
2320 : : * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2321 : : * On success exit, we'll be holding pin and write lock. On failure exit,
2322 : : * we'll release both pin and lock before returning (we define it that way
2323 : : * to avoid having to reacquire a lock we already released).
2324 : : */
2325 : : static bool
2144 pg@bowt.ie 2326 : 3213 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2327 : : bool *rightsib_empty, BTVacState *vstate)
2328 : : {
4384 heikki.linnakangas@i 2329 : 3213 : BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
1844 pg@bowt.ie 2330 : 3213 : IndexBulkDeleteResult *stats = vstate->stats;
2331 : : BlockNumber leafleftsib;
2332 : : BlockNumber leafrightsib;
2333 : : BlockNumber target;
2334 : : BlockNumber leftsib;
2335 : : BlockNumber rightsib;
4384 heikki.linnakangas@i 2336 : 3213 : Buffer lbuf = InvalidBuffer;
2337 : : Buffer buf;
2338 : : Buffer rbuf;
2339 : 3213 : Buffer metabuf = InvalidBuffer;
2340 : 3213 : Page metapg = NULL;
2341 : 3213 : BTMetaPageData *metad = NULL;
2342 : : ItemId itemid;
2343 : : Page page;
2344 : : BTPageOpaque opaque;
2345 : : FullTransactionId safexid;
2346 : : bool rightsib_is_rightmost;
2347 : : uint32 targetlevel;
2348 : : IndexTuple leafhikey;
2349 : : BlockNumber leaftopparent;
2350 : : XLogRecPtr recptr;
2351 : :
3616 kgrittn@postgresql.o 2352 : 3213 : page = BufferGetPage(leafbuf);
1444 michael@paquier.xyz 2353 : 3213 : opaque = BTPageGetOpaque(page);
2354 : :
2134 pg@bowt.ie 2355 [ + - + - : 3213 : Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
- + ]
2356 : :
2357 : : /*
2358 : : * Remember some information about the leaf page.
2359 : : */
4384 heikki.linnakangas@i 2360 : 3213 : itemid = PageGetItemId(page, P_HIKEY);
2883 teodor@sigaev.ru 2361 : 3213 : leafhikey = (IndexTuple) PageGetItem(page, itemid);
2150 pg@bowt.ie 2362 : 3213 : target = BTreeTupleGetTopParent(leafhikey);
4384 heikki.linnakangas@i 2363 : 3213 : leafleftsib = opaque->btpo_prev;
2364 : 3213 : leafrightsib = opaque->btpo_next;
2365 : :
2063 pg@bowt.ie 2366 : 3213 : _bt_unlockbuf(rel, leafbuf);
2367 : :
103 heikki.linnakangas@i 2368 :GNC 3213 : INJECTION_POINT("nbtree-leave-page-half-dead", NULL);
2369 : :
2370 : : /*
2371 : : * Check here, as calling loops will have locks held, preventing
2372 : : * interrupts from being processed.
2373 : : */
2811 andres@anarazel.de 2374 [ + + ]:CBC 3212 : CHECK_FOR_INTERRUPTS();
2375 : :
2376 : : /* Unlink the current top parent of the subtree */
2150 pg@bowt.ie 2377 [ + + ]: 3212 : if (!BlockNumberIsValid(target))
2378 : : {
2379 : : /* Target is leaf page (or leaf page is top parent, if you prefer) */
2380 : 3070 : target = leafblkno;
2381 : :
2382 : 3070 : buf = leafbuf;
2383 : 3070 : leftsib = leafleftsib;
2384 : 3070 : targetlevel = 0;
2385 : : }
2386 : : else
2387 : : {
2388 : : /* Target is the internal page taken from leaf's top parent link */
3508 tgl@sss.pgh.pa.us 2389 [ - + ]: 142 : Assert(target != leafblkno);
2390 : :
2391 : : /* Fetch the block number of the target's left sibling */
1009 pg@bowt.ie 2392 : 142 : buf = _bt_getbuf(rel, target, BT_READ);
3616 kgrittn@postgresql.o 2393 : 142 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2394 : 142 : opaque = BTPageGetOpaque(page);
4384 heikki.linnakangas@i 2395 : 142 : leftsib = opaque->btpo_prev;
1845 pg@bowt.ie 2396 : 142 : targetlevel = opaque->btpo_level;
2150 2397 [ - + ]: 142 : Assert(targetlevel > 0);
2398 : :
2399 : : /*
2400 : : * To avoid deadlocks, we'd better drop the target page lock before
2401 : : * going further.
2402 : : */
2063 2403 : 142 : _bt_unlockbuf(rel, buf);
2404 : : }
2405 : :
2406 : : /*
2407 : : * We have to lock the pages we need to modify in the standard order:
2408 : : * moving right, then up. Else we will deadlock against other writers.
2409 : : *
2410 : : * So, first lock the leaf page, if it's not the target. Then find and
2411 : : * write-lock the current left sibling of the target page. The sibling
2412 : : * that was current a moment ago could have split, so we may have to move
2413 : : * right.
2414 : : */
4384 heikki.linnakangas@i 2415 [ + + ]: 3212 : if (target != leafblkno)
2063 pg@bowt.ie 2416 : 142 : _bt_lockbuf(rel, leafbuf, BT_WRITE);
8421 tgl@sss.pgh.pa.us 2417 [ + + ]: 3212 : if (leftsib != P_NONE)
2418 : : {
1009 pg@bowt.ie 2419 : 956 : lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
3616 kgrittn@postgresql.o 2420 : 956 : page = BufferGetPage(lbuf);
1444 michael@paquier.xyz 2421 : 956 : opaque = BTPageGetOpaque(page);
8421 tgl@sss.pgh.pa.us 2422 [ - + - + ]: 956 : while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2423 : : {
1768 tgl@sss.pgh.pa.us 2424 :UBC 0 : bool leftsibvalid = true;
2425 : :
2426 : : /*
2427 : : * Before we follow the link from the page that was the left
2428 : : * sibling mere moments ago, validate its right link. This
2429 : : * reduces the opportunities for loop to fail to ever make any
2430 : : * progress in the presence of index corruption.
2431 : : *
2432 : : * Note: we rely on the assumption that there can only be one
2433 : : * vacuum process running at a time (against the same index).
2434 : : */
1865 pg@bowt.ie 2435 [ # # # # ]: 0 : if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2436 [ # # ]: 0 : leftsib == opaque->btpo_next)
2437 : 0 : leftsibvalid = false;
2438 : :
2439 : 0 : leftsib = opaque->btpo_next;
2440 : 0 : _bt_relbuf(rel, lbuf);
2441 : :
2442 [ # # ]: 0 : if (!leftsibvalid)
2443 : : {
2444 : : /*
2445 : : * This is known to fail in the field; sibling link corruption
2446 : : * is relatively common. Press on with vacuuming rather than
2447 : : * just throwing an ERROR.
2448 : : */
2449 [ # # ]: 0 : ereport(LOG,
2450 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2451 : : errmsg_internal("valid left sibling for deletion target could not be located: "
2452 : : "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
2453 : : leftsib, target, leafblkno, scanblkno,
2454 : : targetlevel, RelationGetRelationName(rel))));
2455 : :
2456 : : /* Must release all pins and locks on failure exit */
1025 2457 : 0 : ReleaseBuffer(buf);
2458 [ # # ]: 0 : if (target != leafblkno)
2459 : 0 : _bt_relbuf(rel, leafbuf);
2460 : :
4384 heikki.linnakangas@i 2461 : 0 : return false;
2462 : : }
2463 : :
1865 pg@bowt.ie 2464 [ # # ]: 0 : CHECK_FOR_INTERRUPTS();
2465 : :
2466 : : /* step right one page */
1009 2467 : 0 : lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
3616 kgrittn@postgresql.o 2468 : 0 : page = BufferGetPage(lbuf);
1444 michael@paquier.xyz 2469 : 0 : opaque = BTPageGetOpaque(page);
2470 : : }
2471 : : }
2472 : : else
8421 tgl@sss.pgh.pa.us 2473 :CBC 2256 : lbuf = InvalidBuffer;
2474 : :
2475 : : /* Next write-lock the target page itself */
2063 pg@bowt.ie 2476 : 3212 : _bt_lockbuf(rel, buf, BT_WRITE);
3616 kgrittn@postgresql.o 2477 : 3212 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2478 : 3212 : opaque = BTPageGetOpaque(page);
2479 : :
2480 : : /*
2481 : : * Check page is still empty etc, else abandon deletion. This is just for
2482 : : * paranoia's sake; a half-dead page cannot resurrect because there can be
2483 : : * only one vacuum process running at a time.
2484 : : */
4384 heikki.linnakangas@i 2485 [ + - + - : 3212 : if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
- + ]
1839 pg@bowt.ie 2486 [ # # ]:UBC 0 : elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2487 : : target, RelationGetRelationName(rel));
2488 : :
8421 tgl@sss.pgh.pa.us 2489 [ - + ]:CBC 3212 : if (opaque->btpo_prev != leftsib)
2418 peter@eisentraut.org 2490 [ # # ]:UBC 0 : ereport(ERROR,
2491 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2492 : : errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2493 : : leftsib, opaque->btpo_prev, target,
2494 : : RelationGetRelationName(rel))));
2495 : :
4384 heikki.linnakangas@i 2496 [ + + ]:CBC 3212 : if (target == leafblkno)
2497 : : {
2498 [ - + + - ]: 3070 : if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2499 [ + - - + ]: 3070 : !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
1839 pg@bowt.ie 2500 [ # # ]:UBC 0 : elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2501 : : target, RelationGetRelationName(rel));
2502 : :
2503 : : /* Leaf page is also target page: don't set leaftopparent */
1845 pg@bowt.ie 2504 :CBC 3070 : leaftopparent = InvalidBlockNumber;
2505 : : }
2506 : : else
2507 : : {
2508 : : IndexTuple finaldataitem;
2509 : :
4384 heikki.linnakangas@i 2510 [ - + + - ]: 142 : if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2511 [ - + ]: 142 : P_ISLEAF(opaque))
1839 pg@bowt.ie 2512 [ # # ]:UBC 0 : elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2513 : : targetlevel, target, RelationGetRelationName(rel));
2514 : :
2515 : : /* Target is internal: set leaftopparent for next call here... */
4384 heikki.linnakangas@i 2516 [ - + ]:CBC 142 : itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
1845 pg@bowt.ie 2517 : 142 : finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2518 : 142 : leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2519 : : /* ...except when it would be a redundant pointer-to-self */
2520 [ + + ]: 142 : if (leaftopparent == leafblkno)
2521 : 64 : leaftopparent = InvalidBlockNumber;
2522 : : }
2523 : :
2524 : : /* No leaftopparent for level 0 (leaf page) or level 1 target */
1839 2525 [ + + - + ]: 3212 : Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2526 : :
2527 : : /*
2528 : : * And next write-lock the (current) right sibling.
2529 : : */
8421 tgl@sss.pgh.pa.us 2530 : 3212 : rightsib = opaque->btpo_next;
1009 pg@bowt.ie 2531 : 3212 : rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
3616 kgrittn@postgresql.o 2532 : 3212 : page = BufferGetPage(rbuf);
1444 michael@paquier.xyz 2533 : 3212 : opaque = BTPageGetOpaque(page);
2534 : :
2535 : : /*
2536 : : * Validate target's right sibling page. Its left link must point back to
2537 : : * the target page.
2538 : : */
5677 tgl@sss.pgh.pa.us 2539 [ - + ]: 3212 : if (opaque->btpo_prev != target)
2540 : : {
2541 : : /*
2542 : : * This is known to fail in the field; sibling link corruption is
2543 : : * relatively common. Press on with vacuuming rather than just
2544 : : * throwing an ERROR (same approach used for left-sibling's-right-link
2545 : : * validation check a moment ago).
2546 : : */
1025 pg@bowt.ie 2547 [ # # ]:UBC 0 : ereport(LOG,
2548 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2549 : : errmsg_internal("right sibling's left-link doesn't match: "
2550 : : "right sibling %u of target %u with leafblkno %u "
2551 : : "and scanblkno %u spuriously links to non-target %u "
2552 : : "on level %u of index \"%s\"",
2553 : : rightsib, target, leafblkno,
2554 : : scanblkno, opaque->btpo_prev,
2555 : : targetlevel, RelationGetRelationName(rel))));
2556 : :
2557 : : /* Must release all pins and locks on failure exit */
2558 [ # # ]: 0 : if (BufferIsValid(lbuf))
2559 : 0 : _bt_relbuf(rel, lbuf);
2560 : 0 : _bt_relbuf(rel, rbuf);
2561 : 0 : _bt_relbuf(rel, buf);
2562 [ # # ]: 0 : if (target != leafblkno)
2563 : 0 : _bt_relbuf(rel, leafbuf);
2564 : :
2565 : 0 : return false;
2566 : : }
2567 : :
4384 heikki.linnakangas@i 2568 :CBC 3212 : rightsib_is_rightmost = P_RIGHTMOST(opaque);
2569 [ + + ]: 3212 : *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2570 : :
2571 : : /*
2572 : : * If we are deleting the next-to-last page on the target's level, then
2573 : : * the rightsib is a candidate to become the new fast root. (In theory, it
2574 : : * might be possible to push the fast root even further down, but the odds
2575 : : * of doing so are slim, and the locking considerations daunting.)
2576 : : *
2577 : : * We can safely acquire a lock on the metapage here --- see comments for
2578 : : * _bt_newlevel().
2579 : : */
2580 [ + + + + ]: 3212 : if (leftsib == P_NONE && rightsib_is_rightmost)
2581 : : {
3616 kgrittn@postgresql.o 2582 : 29 : page = BufferGetPage(rbuf);
1444 michael@paquier.xyz 2583 : 29 : opaque = BTPageGetOpaque(page);
8421 tgl@sss.pgh.pa.us 2584 [ + - ]: 29 : if (P_RIGHTMOST(opaque))
2585 : : {
2586 : : /* rightsib will be the only one left on the level */
1009 pg@bowt.ie 2587 : 29 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3616 kgrittn@postgresql.o 2588 : 29 : metapg = BufferGetPage(metabuf);
8421 tgl@sss.pgh.pa.us 2589 : 29 : metad = BTPageGetMeta(metapg);
2590 : :
2591 : : /*
2592 : : * The expected case here is btm_fastlevel == targetlevel+1; if
2593 : : * the fastlevel is <= targetlevel, something is wrong, and we
2594 : : * choose to overwrite it to fix it.
2595 : : */
8259 bruce@momjian.us 2596 [ - + ]: 29 : if (metad->btm_fastlevel > targetlevel + 1)
2597 : : {
2598 : : /* no update wanted */
8421 tgl@sss.pgh.pa.us 2599 :UBC 0 : _bt_relbuf(rel, metabuf);
2600 : 0 : metabuf = InvalidBuffer;
2601 : : }
2602 : : }
2603 : : }
2604 : :
2605 : : /*
2606 : : * Here we begin doing the deletion.
2607 : : */
2608 : :
2609 : : /* No ereport(ERROR) until changes are logged */
8421 tgl@sss.pgh.pa.us 2610 :CBC 3212 : START_CRIT_SECTION();
2611 : :
2612 : : /*
2613 : : * Update siblings' side-links. Note the target page's side-links will
2614 : : * continue to point to the siblings. Asserts here are just rechecking
2615 : : * things we already verified above.
2616 : : */
2617 [ + + ]: 3212 : if (BufferIsValid(lbuf))
2618 : : {
3616 kgrittn@postgresql.o 2619 : 956 : page = BufferGetPage(lbuf);
1444 michael@paquier.xyz 2620 : 956 : opaque = BTPageGetOpaque(page);
8421 tgl@sss.pgh.pa.us 2621 [ - + ]: 956 : Assert(opaque->btpo_next == target);
2622 : 956 : opaque->btpo_next = rightsib;
2623 : : }
3616 kgrittn@postgresql.o 2624 : 3212 : page = BufferGetPage(rbuf);
1444 michael@paquier.xyz 2625 : 3212 : opaque = BTPageGetOpaque(page);
8421 tgl@sss.pgh.pa.us 2626 [ - + ]: 3212 : Assert(opaque->btpo_prev == target);
2627 : 3212 : opaque->btpo_prev = leftsib;
2628 : :
2629 : : /*
2630 : : * If we deleted a parent of the targeted leaf page, instead of the leaf
2631 : : * itself, update the leaf to point to the next remaining child in the
2632 : : * subtree.
2633 : : *
2634 : : * Note: We rely on the fact that a buffer pin on the leaf page has been
2635 : : * held since leafhikey was initialized. This is safe, though only
2636 : : * because the page was already half-dead at that point. The leaf page
2637 : : * cannot have been modified by any other backend during the period when
2638 : : * no lock was held.
2639 : : */
4384 heikki.linnakangas@i 2640 [ + + ]: 3212 : if (target != leafblkno)
1845 pg@bowt.ie 2641 : 142 : BTreeTupleSetTopParent(leafhikey, leaftopparent);
2642 : :
2643 : : /*
2644 : : * Mark the page itself deleted. It can be recycled when all current
2645 : : * transactions are gone. Storing GetTopTransactionId() would work, but
2646 : : * we're in VACUUM and would not otherwise have an XID. Having already
2647 : : * updated links to the target, ReadNextFullTransactionId() suffices as an
2648 : : * upper bound. Any scan having retained a now-stale link is advertising
2649 : : * in its PGPROC an xmin less than or equal to the value we read here. It
2650 : : * will continue to do so, holding back the xmin horizon, for the duration
2651 : : * of that scan.
2652 : : */
3616 kgrittn@postgresql.o 2653 : 3212 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2654 : 3212 : opaque = BTPageGetOpaque(page);
2144 pg@bowt.ie 2655 [ + + - + ]: 3212 : Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2656 : :
2657 : : /*
2658 : : * Store upper bound XID that's used to determine when deleted page is no
2659 : : * longer needed as a tombstone
2660 : : */
1845 2661 : 3212 : safexid = ReadNextFullTransactionId();
2662 : 3212 : BTPageSetDeleted(page, safexid);
2663 : 3212 : opaque->btpo_cycleid = 0;
2664 : :
2665 : : /* And update the metapage, if needed */
8421 tgl@sss.pgh.pa.us 2666 [ + + ]: 3212 : if (BufferIsValid(metabuf))
2667 : : {
2668 : : /* upgrade metapage if needed */
2552 pg@bowt.ie 2669 [ - + ]: 29 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2902 teodor@sigaev.ru 2670 :UBC 0 : _bt_upgrademetapage(metapg);
8421 tgl@sss.pgh.pa.us 2671 :CBC 29 : metad->btm_fastroot = rightsib;
2672 : 29 : metad->btm_fastlevel = targetlevel;
7289 2673 : 29 : MarkBufferDirty(metabuf);
2674 : : }
2675 : :
2676 : : /* Must mark buffers dirty before XLogInsert */
2677 : 3212 : MarkBufferDirty(rbuf);
2678 : 3212 : MarkBufferDirty(buf);
2679 [ + + ]: 3212 : if (BufferIsValid(lbuf))
2680 : 956 : MarkBufferDirty(lbuf);
4384 heikki.linnakangas@i 2681 [ + + ]: 3212 : if (target != leafblkno)
2682 : 142 : MarkBufferDirty(leafbuf);
2683 : :
2684 : : /* XLOG stuff */
5571 rhaas@postgresql.org 2685 [ + - + + : 3212 : if (RelationNeedsWAL(rel))
+ - + - ]
8421 tgl@sss.pgh.pa.us 2686 :GIC 3212 : {
2687 : : xl_btree_unlink_page xlrec;
2688 : : xl_btree_metadata xlmeta;
2689 : : uint8 xlinfo;
2690 : :
4133 heikki.linnakangas@i 2691 :CBC 3212 : XLogBeginInsert();
2692 : :
2693 : 3212 : XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2694 [ + + ]: 3212 : if (BufferIsValid(lbuf))
2695 : 956 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2696 : 3212 : XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2697 [ + + ]: 3212 : if (target != leafblkno)
2698 : 142 : XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2699 : :
2700 : : /* information stored on the target/to-be-unlinked block */
4384 2701 : 3212 : xlrec.leftsib = leftsib;
2702 : 3212 : xlrec.rightsib = rightsib;
1845 pg@bowt.ie 2703 : 3212 : xlrec.level = targetlevel;
2704 : 3212 : xlrec.safexid = safexid;
2705 : :
2706 : : /* information needed to recreate the leaf block (if not the target) */
4384 heikki.linnakangas@i 2707 : 3212 : xlrec.leafleftsib = leafleftsib;
2708 : 3212 : xlrec.leafrightsib = leafrightsib;
1845 pg@bowt.ie 2709 : 3212 : xlrec.leaftopparent = leaftopparent;
2710 : :
397 peter@eisentraut.org 2711 : 3212 : XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
2712 : :
8421 tgl@sss.pgh.pa.us 2713 [ + + ]: 3212 : if (BufferIsValid(metabuf))
2714 : : {
3054 2715 : 29 : XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2716 : :
2552 pg@bowt.ie 2717 [ - + ]: 29 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2718 : 29 : xlmeta.version = metad->btm_version;
8421 tgl@sss.pgh.pa.us 2719 : 29 : xlmeta.root = metad->btm_root;
2720 : 29 : xlmeta.level = metad->btm_level;
2721 : 29 : xlmeta.fastroot = metad->btm_fastroot;
2722 : 29 : xlmeta.fastlevel = metad->btm_fastlevel;
1845 pg@bowt.ie 2723 : 29 : xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2209 2724 : 29 : xlmeta.allequalimage = metad->btm_allequalimage;
2725 : :
397 peter@eisentraut.org 2726 : 29 : XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
4384 heikki.linnakangas@i 2727 : 29 : xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2728 : : }
2729 : : else
2730 : 3183 : xlinfo = XLOG_BTREE_UNLINK_PAGE;
2731 : :
4133 2732 : 3212 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2733 : : }
2734 : : else
2 pg@bowt.ie 2735 :UNC 0 : recptr = XLogGetFakeLSN(rel);
2736 : :
2 pg@bowt.ie 2737 [ + + ]:GNC 3212 : if (BufferIsValid(metabuf))
2738 : 29 : PageSetLSN(metapg, recptr);
2739 : 3212 : page = BufferGetPage(rbuf);
2740 : 3212 : PageSetLSN(page, recptr);
2741 : 3212 : page = BufferGetPage(buf);
2742 : 3212 : PageSetLSN(page, recptr);
2743 [ + + ]: 3212 : if (BufferIsValid(lbuf))
2744 : : {
2745 : 956 : page = BufferGetPage(lbuf);
8421 tgl@sss.pgh.pa.us 2746 :CBC 956 : PageSetLSN(page, recptr);
2747 : : }
2 pg@bowt.ie 2748 [ + + ]:GNC 3212 : if (target != leafblkno)
2749 : : {
2750 : 142 : page = BufferGetPage(leafbuf);
8421 tgl@sss.pgh.pa.us 2751 :CBC 142 : PageSetLSN(page, recptr);
2752 : : }
2753 : :
2754 [ - + ]: 3212 : END_CRIT_SECTION();
2755 : :
2756 : : /* release metapage */
2757 [ + + ]: 3212 : if (BufferIsValid(metabuf))
7289 2758 : 29 : _bt_relbuf(rel, metabuf);
2759 : :
2760 : : /* release siblings */
8421 2761 [ + + ]: 3212 : if (BufferIsValid(lbuf))
7289 2762 : 956 : _bt_relbuf(rel, lbuf);
4384 heikki.linnakangas@i 2763 : 3212 : _bt_relbuf(rel, rbuf);
2764 : :
2765 : : /* If the target is not leafbuf, we're done with it now -- release it */
1845 pg@bowt.ie 2766 [ + + ]: 3212 : if (target != leafblkno)
2767 : 142 : _bt_relbuf(rel, buf);
2768 : :
2769 : : /*
2770 : : * Maintain pages_newly_deleted, which is simply the number of pages
2771 : : * deleted by the ongoing VACUUM operation.
2772 : : *
2773 : : * Maintain pages_deleted in a way that takes into account how
2774 : : * btvacuumpage() will count deleted pages that have yet to become
2775 : : * scanblkno -- only count page when it's not going to get that treatment
2776 : : * later on.
2777 : : */
1844 2778 : 3212 : stats->pages_newly_deleted++;
2144 2779 [ + + ]: 3212 : if (target <= scanblkno)
1844 2780 : 3087 : stats->pages_deleted++;
2781 : :
2782 : : /*
2783 : : * Remember information about the target page (now a newly deleted page)
2784 : : * in dedicated vstate space for later. The page will be considered as a
2785 : : * candidate to place in the FSM at the end of the current btvacuumscan()
2786 : : * call.
2787 : : */
1820 2788 : 3212 : _bt_pendingfsm_add(vstate, target, safexid);
2789 : :
2790 : : /* Success - hold on to lock on leafbuf (might also have been target) */
4384 heikki.linnakangas@i 2791 : 3212 : return true;
2792 : : }
2793 : :
2794 : : /*
2795 : : * Establish how tall the to-be-deleted subtree will be during the first stage
2796 : : * of page deletion.
2797 : : *
2798 : : * Caller's child argument is the block number of the page caller wants to
2799 : : * delete (this is leafbuf's block number, except when we're called
2800 : : * recursively). stack is a search stack leading to it. Note that we will
2801 : : * update the stack entry(s) to reflect current downlink positions --- this is
2802 : : * similar to the corresponding point in page split handling.
2803 : : *
2804 : : * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2805 : : * false. Returns true on success, in which case caller can use certain
2806 : : * details established here to perform the first stage of deletion. This
2807 : : * function is the last point at which page deletion may be deemed unsafe
2808 : : * (barring index corruption, or unexpected concurrent page deletions).
2809 : : *
2810 : : * We write lock the parent of the root of the to-be-deleted subtree for
2811 : : * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2812 : : * caller). Caller will have to remove a downlink from *subtreeparent. We
2813 : : * also set a *subtreeparent offset number in *poffset, to indicate the
2814 : : * location of the pivot tuple that contains the relevant downlink.
2815 : : *
2816 : : * The root of the to-be-deleted subtree is called the "top parent". Note
2817 : : * that the leafbuf page is often the final "top parent" page (you can think
2818 : : * of the leafbuf page as a degenerate single page subtree when that happens).
2819 : : * Caller should initialize *topparent to the target leafbuf page block number
2820 : : * (while *topparentrightsib should be set to leafbuf's right sibling block
2821 : : * number). We will update *topparent (and *topparentrightsib) for caller
2822 : : * here, though only when it turns out that caller will delete at least one
2823 : : * internal page (i.e. only when caller needs to store a valid link to the top
2824 : : * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2825 : : */
2826 : : static bool
1079 andres@anarazel.de 2827 : 3229 : _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
2828 : : BTStack stack, Buffer *subtreeparent,
2829 : : OffsetNumber *poffset, BlockNumber *topparent,
2830 : : BlockNumber *topparentrightsib)
2831 : : {
2832 : : BlockNumber parent,
2833 : : leftsibparent;
2834 : : OffsetNumber parentoffset,
2835 : : maxoff;
2836 : : Buffer pbuf;
2837 : : Page page;
2838 : : BTPageOpaque opaque;
2839 : :
2840 : : /*
2841 : : * Locate the pivot tuple whose downlink points to "child". Write lock
2842 : : * the parent page itself.
2843 : : */
2844 : 3229 : pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
2134 pg@bowt.ie 2845 [ - + ]: 3229 : if (pbuf == InvalidBuffer)
2846 : : {
2847 : : /*
2848 : : * Failed to "re-find" a pivot tuple whose downlink matched our child
2849 : : * block number on the parent level -- the index must be corrupt.
2850 : : * Don't even try to delete the leafbuf subtree. Just report the
2851 : : * issue and press on with vacuuming the index.
2852 : : *
2853 : : * Note: _bt_getstackbuf() recovers from concurrent page splits that
2854 : : * take place on the parent level. Its approach is a near-exhaustive
2855 : : * linear search. This also gives it a surprisingly good chance of
2856 : : * recovering in the event of a buggy or inconsistent opclass. But we
2857 : : * don't rely on that here.
2858 : : */
1818 pg@bowt.ie 2859 [ # # ]:UBC 0 : ereport(LOG,
2860 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2861 : : errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2862 : : RelationGetRelationName(rel), child)));
998 2863 : 0 : Assert(false);
2864 : : return false;
2865 : : }
2866 : :
2134 pg@bowt.ie 2867 :CBC 3229 : parent = stack->bts_blkno;
2868 : 3229 : parentoffset = stack->bts_offset;
2869 : :
2870 : 3229 : page = BufferGetPage(pbuf);
1444 michael@paquier.xyz 2871 : 3229 : opaque = BTPageGetOpaque(page);
2134 pg@bowt.ie 2872 : 3229 : maxoff = PageGetMaxOffsetNumber(page);
2873 : 3229 : leftsibparent = opaque->btpo_prev;
2874 : :
2875 : : /*
2876 : : * _bt_getstackbuf() completes page splits on returned parent buffer when
2877 : : * required.
2878 : : *
2879 : : * In general it's a bad idea for VACUUM to use up more disk space, which
2880 : : * is why page deletion does not finish incomplete page splits most of the
2881 : : * time. We allow this limited exception because the risk is much lower,
2882 : : * and the potential downside of not proceeding is much higher: A single
2883 : : * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2884 : : * prevent us from deleting hundreds of empty leaf pages from one level
2885 : : * down.
2886 : : */
2887 [ - + ]: 3229 : Assert(!P_INCOMPLETE_SPLIT(opaque));
2888 : :
2889 [ + + ]: 3229 : if (parentoffset < maxoff)
2890 : : {
2891 : : /*
2892 : : * Child is not the rightmost child in parent, so it's safe to delete
2893 : : * the subtree whose root/topparent is child page
2894 : : */
2895 : 3070 : *subtreeparent = pbuf;
2896 : 3070 : *poffset = parentoffset;
2897 : 3070 : return true;
2898 : : }
2899 : :
2900 : : /*
2901 : : * Child is the rightmost child of parent.
2902 : : *
2903 : : * Since it's the rightmost child of parent, deleting the child (or
2904 : : * deleting the subtree whose root/topparent is the child page) is only
2905 : : * safe when it's also possible to delete the parent.
2906 : : */
2907 [ - + ]: 159 : Assert(parentoffset == maxoff);
2908 [ - + + + : 159 : if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
- + ]
2909 : : {
2910 : : /*
2911 : : * Child isn't parent's only child, or parent is rightmost on its
2912 : : * entire level. Definitely cannot delete any pages.
2913 : : */
2914 : 6 : _bt_relbuf(rel, pbuf);
2915 : 6 : return false;
2916 : : }
2917 : :
2918 : : /*
2919 : : * Now make sure that the parent deletion is itself safe by examining the
2920 : : * child's grandparent page. Recurse, passing the parent page as the
2921 : : * child page (child's grandparent is the parent on the next level up). If
2922 : : * parent deletion is unsafe, then child deletion must also be unsafe (in
2923 : : * which case caller cannot delete any pages at all).
2924 : : */
2925 : 153 : *topparent = parent;
2926 : 153 : *topparentrightsib = opaque->btpo_next;
2927 : :
2928 : : /*
2929 : : * Release lock on parent before recursing.
2930 : : *
2931 : : * It's OK to release page locks on parent before recursive call locks
2932 : : * grandparent. An internal page can only acquire an entry if the child
2933 : : * is split, but that cannot happen as long as we still hold a lock on the
2934 : : * leafbuf page.
2935 : : */
2936 : 153 : _bt_relbuf(rel, pbuf);
2937 : :
2938 : : /*
2939 : : * Before recursing, check that the left sibling of parent (if any) is not
2940 : : * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2941 : : * parent lock).
2942 : : *
2943 : : * Note: We deliberately avoid completing incomplete splits here.
2944 : : */
1009 2945 [ - + ]: 153 : if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2134 pg@bowt.ie 2946 :UBC 0 : return false;
2947 : :
2948 : : /* Recurse to examine child page's grandparent page */
1079 andres@anarazel.de 2949 :CBC 153 : return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
2950 : : subtreeparent, poffset,
2951 : : topparent, topparentrightsib);
2952 : : }
2953 : :
2954 : : /*
2955 : : * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
2956 : : * optimization.
2957 : : *
2958 : : * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
2959 : : * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
2960 : : *
2961 : : * We expect to allocate memory inside VACUUM's top-level memory context here.
2962 : : * The working buffer is subject to a limit based on work_mem. Our strategy
2963 : : * when the array can no longer grow within the bounds of that limit is to
2964 : : * stop saving additional newly deleted pages, while proceeding as usual with
2965 : : * the pages that we can fit.
2966 : : */
2967 : : void
1820 pg@bowt.ie 2968 : 1634 : _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
2969 : : {
2970 : : Size maxbufsize;
2971 : :
2972 : : /*
2973 : : * Don't bother with optimization in cleanup-only case -- we don't expect
2974 : : * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2975 : : * can only take place because this optimization didn't work out during
2976 : : * the last VACUUM.
2977 : : */
2978 [ + + ]: 1634 : if (cleanuponly)
2979 : 8 : return;
2980 : :
2981 : : /*
2982 : : * Cap maximum size of array so that we always respect work_mem. Avoid
2983 : : * int overflow here.
2984 : : */
2985 : 1626 : vstate->bufsize = 256;
408 tgl@sss.pgh.pa.us 2986 : 1626 : maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
1820 pg@bowt.ie 2987 : 1626 : maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2988 : : /* BTVacState.maxbufsize has type int */
408 tgl@sss.pgh.pa.us 2989 : 1626 : maxbufsize = Min(maxbufsize, INT_MAX);
2990 : : /* Stay sane with small work_mem */
1820 pg@bowt.ie 2991 : 1626 : maxbufsize = Max(maxbufsize, vstate->bufsize);
408 tgl@sss.pgh.pa.us 2992 : 1626 : vstate->maxbufsize = (int) maxbufsize;
2993 : :
2994 : : /* Allocate buffer, indicate that there are currently 0 pending pages */
95 michael@paquier.xyz 2995 :GNC 1626 : vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize);
1820 pg@bowt.ie 2996 :CBC 1626 : vstate->npendingpages = 0;
2997 : : }
2998 : :
2999 : : /*
3000 : : * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
3001 : : * the ongoing VACUUM operation) into the free space map -- though only when
3002 : : * it is actually safe to do so by now.
3003 : : *
3004 : : * Called at the end of a btvacuumscan(), just before free space map vacuuming
3005 : : * takes place.
3006 : : *
3007 : : * Frees memory allocated by _bt_pendingfsm_init(), if any.
3008 : : */
3009 : : void
3010 : 1633 : _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
3011 : : {
3012 : 1633 : IndexBulkDeleteResult *stats = vstate->stats;
1031 tgl@sss.pgh.pa.us 3013 : 1633 : Relation heaprel = vstate->info->heaprel;
3014 : :
1820 pg@bowt.ie 3015 [ - + ]: 1633 : Assert(stats->pages_newly_deleted >= vstate->npendingpages);
1009 3016 [ - + ]: 1633 : Assert(heaprel != NULL);
3017 : :
1820 3018 [ + + ]: 1633 : if (vstate->npendingpages == 0)
3019 : : {
3020 : : /* Just free memory when nothing to do */
3021 [ + + ]: 1536 : if (vstate->pendingpages)
3022 : 1528 : pfree(vstate->pendingpages);
3023 : :
3024 : 1536 : return;
3025 : : }
3026 : :
3027 : : #ifdef DEBUG_BTREE_PENDING_FSM
3028 : :
3029 : : /*
3030 : : * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3031 : : * placing pending pages in the FSM. Note that the optimization will
3032 : : * never be effective without some other backend concurrently consuming an
3033 : : * XID.
3034 : : */
3035 : : pg_usleep(5000000L);
3036 : : #endif
3037 : :
3038 : : /*
3039 : : * Recompute VACUUM XID boundaries.
3040 : : *
3041 : : * We don't actually care about the oldest non-removable XID. Computing
3042 : : * the oldest such XID has a useful side-effect that we rely on: it
3043 : : * forcibly updates the XID horizon state for this backend. This step is
3044 : : * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3045 : : * that it is now safe to recycle newly deleted pages without this step.
3046 : : */
1077 3047 : 97 : GetOldestNonRemovableTransactionId(heaprel);
3048 : :
1820 3049 [ + - ]: 97 : for (int i = 0; i < vstate->npendingpages; i++)
3050 : : {
3051 : 97 : BlockNumber target = vstate->pendingpages[i].target;
3052 : 97 : FullTransactionId safexid = vstate->pendingpages[i].safexid;
3053 : :
3054 : : /*
3055 : : * Do the equivalent of checking BTPageIsRecyclable(), but without
3056 : : * accessing the page again a second time.
3057 : : *
3058 : : * Give up on finding the first non-recyclable page -- all later pages
3059 : : * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3060 : : * to the array in safexid order.
3061 : : */
1077 3062 [ + - ]: 97 : if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
1820 3063 : 97 : break;
3064 : :
1820 pg@bowt.ie 3065 :LBC (16) : RecordFreeIndexPage(rel, target);
3066 : (16) : stats->pages_free++;
3067 : : }
3068 : :
1820 pg@bowt.ie 3069 :CBC 97 : pfree(vstate->pendingpages);
3070 : : }
3071 : :
3072 : : /*
3073 : : * Maintain array of pages that were deleted during current btvacuumscan()
3074 : : * call, for use in _bt_pendingfsm_finalize()
3075 : : */
3076 : : static void
3077 : 3212 : _bt_pendingfsm_add(BTVacState *vstate,
3078 : : BlockNumber target,
3079 : : FullTransactionId safexid)
3080 : : {
3081 [ - + ]: 3212 : Assert(vstate->npendingpages <= vstate->bufsize);
3082 [ - + ]: 3212 : Assert(vstate->bufsize <= vstate->maxbufsize);
3083 : :
3084 : : #ifdef USE_ASSERT_CHECKING
3085 : :
3086 : : /*
3087 : : * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3088 : : * array will always be in safexid order (since that is the order that we
3089 : : * save them in here)
3090 : : */
3091 [ + + ]: 3212 : if (vstate->npendingpages > 0)
3092 : : {
3093 : 3115 : FullTransactionId lastsafexid =
1031 tgl@sss.pgh.pa.us 3094 : 3115 : vstate->pendingpages[vstate->npendingpages - 1].safexid;
3095 : :
1820 pg@bowt.ie 3096 [ - + ]: 3115 : Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3097 : : }
3098 : : #endif
3099 : :
3100 : : /*
3101 : : * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3102 : : * information about this page.
3103 : : *
3104 : : * Note that this also covers the case where we opted to not use the
3105 : : * optimization in _bt_pendingfsm_init().
3106 : : */
3107 [ - + ]: 3212 : if (vstate->npendingpages == vstate->maxbufsize)
1820 pg@bowt.ie 3108 :UBC 0 : return;
3109 : :
3110 : : /* Consider enlarging buffer */
1820 pg@bowt.ie 3111 [ + + ]:CBC 3212 : if (vstate->npendingpages == vstate->bufsize)
3112 : : {
3113 : 4 : int newbufsize = vstate->bufsize * 2;
3114 : :
3115 : : /* Respect work_mem */
3116 [ - + ]: 4 : if (newbufsize > vstate->maxbufsize)
1820 pg@bowt.ie 3117 :UBC 0 : newbufsize = vstate->maxbufsize;
3118 : :
1820 pg@bowt.ie 3119 :CBC 4 : vstate->bufsize = newbufsize;
3120 : 4 : vstate->pendingpages =
3121 : 4 : repalloc(vstate->pendingpages,
3122 : 4 : sizeof(BTPendingFSM) * vstate->bufsize);
3123 : : }
3124 : :
3125 : : /* Save metadata for newly deleted page */
3126 : 3212 : vstate->pendingpages[vstate->npendingpages].target = target;
3127 : 3212 : vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3128 : 3212 : vstate->npendingpages++;
3129 : : }
|