Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtinsert.c
4 : : * Item insertion in Lehman and Yao btrees for Postgres.
5 : : *
6 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtinsert.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include "access/nbtree.h"
19 : : #include "access/nbtxlog.h"
20 : : #include "access/tableam.h"
21 : : #include "access/transam.h"
22 : : #include "access/xloginsert.h"
23 : : #include "common/int.h"
24 : : #include "common/pg_prng.h"
25 : : #include "lib/qunique.h"
26 : : #include "miscadmin.h"
27 : : #include "storage/lmgr.h"
28 : : #include "storage/predicate.h"
29 : : #include "utils/injection_point.h"
30 : :
31 : : /* Minimum tree height for application of fastpath optimization */
32 : : #define BTREE_FASTPATH_MIN_LEVEL 2
33 : :
34 : :
35 : : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
36 : : BTInsertState insertstate);
37 : : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
38 : : Relation heapRel,
39 : : IndexUniqueCheck checkUnique, bool *is_unique,
40 : : uint32 *speculativeToken);
41 : : static OffsetNumber _bt_findinsertloc(Relation rel,
42 : : BTInsertState insertstate,
43 : : bool checkingunique,
44 : : bool indexUnchanged,
45 : : BTStack stack,
46 : : Relation heapRel);
47 : : static void _bt_stepright(Relation rel, Relation heaprel,
48 : : BTInsertState insertstate, BTStack stack);
49 : : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
50 : : Buffer buf,
51 : : Buffer cbuf,
52 : : BTStack stack,
53 : : IndexTuple itup,
54 : : Size itemsz,
55 : : OffsetNumber newitemoff,
56 : : int postingoff,
57 : : bool split_only_page);
58 : : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
59 : : Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
60 : : Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
61 : : IndexTuple nposting, uint16 postingoff);
62 : : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
63 : : Buffer rbuf, BTStack stack, bool isroot, bool isonly);
64 : : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
65 : : static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
66 : : OffsetNumber itup_off, bool newfirstdataitem);
67 : : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
68 : : BTInsertState insertstate,
69 : : bool simpleonly, bool checkingunique,
70 : : bool uniquedup, bool indexUnchanged);
71 : : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
72 : : OffsetNumber *deletable, int ndeletable,
73 : : IndexTuple newitem, OffsetNumber minoff,
74 : : OffsetNumber maxoff);
75 : : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
76 : : int ndeletable, IndexTuple newitem,
77 : : int *nblocks);
78 : : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
79 : :
80 : : /*
81 : : * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
82 : : *
83 : : * This routine is called by the public interface routine, btinsert.
84 : : * By here, itup is filled in, including the TID.
85 : : *
86 : : * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
87 : : * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
88 : : * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
89 : : * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
90 : : * don't actually insert.
91 : : *
92 : : * indexUnchanged executor hint indicates if itup is from an
93 : : * UPDATE that didn't logically change the indexed value, but
94 : : * must nevertheless have a new entry to point to a successor
95 : : * version.
96 : : *
97 : : * The result value is only significant for UNIQUE_CHECK_PARTIAL:
98 : : * it must be true if the entry is known unique, else false.
99 : : * (In the current implementation we'll also return true after a
100 : : * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
101 : : * that's just a coding artifact.)
102 : : */
103 : : bool
7265 tgl@sss.pgh.pa.us 104 :CBC 3757192 : _bt_doinsert(Relation rel, IndexTuple itup,
105 : : IndexUniqueCheck checkUnique, bool indexUnchanged,
106 : : Relation heapRel)
107 : : {
5984 108 : 3757192 : bool is_unique = false;
109 : : BTInsertStateData insertstate;
110 : : BTScanInsert itup_key;
111 : : BTStack stack;
2463 pg@bowt.ie 112 : 3757192 : bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
113 : :
114 : : /* we need an insertion scan key to do our search, so build one */
920 115 : 3757192 : itup_key = _bt_mkscankey(rel, itup);
116 : :
2429 117 [ + + ]: 3757192 : if (checkingunique)
118 : : {
119 [ + + ]: 2716084 : if (!itup_key->anynullkeys)
120 : : {
121 : : /* No (heapkeyspace) scantid until uniqueness established */
122 : 2705997 : itup_key->scantid = NULL;
123 : : }
124 : : else
125 : : {
126 : : /*
127 : : * Scan key for new tuple contains NULL key values. Bypass
128 : : * checkingunique steps. They are unnecessary because core code
129 : : * considers NULL unequal to every value, including NULL.
130 : : *
131 : : * This optimization avoids O(N^2) behavior within the
132 : : * _bt_findinsertloc() heapkeyspace path when a unique index has a
133 : : * large number of "duplicates" with NULL key values.
134 : : */
135 : 10087 : checkingunique = false;
136 : : /* Tuple is unique in the sense that core code cares about */
137 [ - + ]: 10087 : Assert(checkUnique != UNIQUE_CHECK_EXISTING);
138 : 10087 : is_unique = true;
139 : : }
140 : : }
141 : :
142 : : /*
143 : : * Fill in the BTInsertState working area, to track the current page and
144 : : * position within the page to insert on.
145 : : *
146 : : * Note that itemsz is passed down to lower level code that deals with
147 : : * inserting the item. It must be MAXALIGN()'d. This ensures that space
148 : : * accounting code consistently considers the alignment overhead that we
149 : : * expect PageAddItem() will add later. (Actually, index_form_tuple() is
150 : : * already conservative about alignment, but we don't rely on that from
151 : : * this distance. Besides, preserving the "true" tuple size in index
152 : : * tuple headers for the benefit of nbtsplitloc.c might happen someday.
153 : : * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
154 : : */
2463 155 : 3757192 : insertstate.itup = itup;
156 : 3757192 : insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
157 : 3757192 : insertstate.itup_key = itup_key;
158 : 3757192 : insertstate.bounds_valid = false;
159 : 3757192 : insertstate.buf = InvalidBuffer;
2120 160 : 3757192 : insertstate.postingoff = 0;
161 : :
2099 162 : 3757204 : search:
163 : :
164 : : /*
165 : : * Find and lock the leaf page that the tuple should be added to by
166 : : * searching from the root page. insertstate.buf will hold a buffer that
167 : : * is locked in exclusive mode afterwards.
168 : : */
990 andres@anarazel.de 169 : 3757204 : stack = _bt_search_insert(rel, heapRel, &insertstate);
170 : :
171 : : /*
172 : : * checkingunique inserts are not allowed to go ahead when two tuples with
173 : : * equal key attribute values would be visible to new MVCC snapshots once
174 : : * the xact commits. Check for conflicts in the locked page/buffer (if
175 : : * needed) here.
176 : : *
177 : : * It might be necessary to check a page to the right in _bt_check_unique,
178 : : * though that should be very rare. In practice the first page the value
179 : : * could be on (with scantid omitted) is almost always also the only page
180 : : * that a matching tuple might be found on. This is due to the behavior
181 : : * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
182 : : * only be allowed to cross a page boundary when there is no candidate
183 : : * leaf page split point that avoids it. Also, _bt_check_unique can use
184 : : * the leaf page high key to determine that there will be no duplicates on
185 : : * the right sibling without actually visiting it (it uses the high key in
186 : : * cases where the new item happens to belong at the far right of the leaf
187 : : * page).
188 : : *
189 : : * NOTE: obviously, _bt_check_unique can only detect keys that are already
190 : : * in the index; so it cannot defend against concurrent insertions of the
191 : : * same key. We protect against that by means of holding a write lock on
192 : : * the first page the value could be on, with omitted/-inf value for the
193 : : * implicit heap TID tiebreaker attribute. Any other would-be inserter of
194 : : * the same key must acquire a write lock on the same page, so only one
195 : : * would-be inserter can be making the check at one time. Furthermore,
196 : : * once we are past the check we hold write locks continuously until we
197 : : * have performed our insertion, so no later inserter can fail to see our
198 : : * insertion. (This requires some care in _bt_findinsertloc.)
199 : : *
200 : : * If we must wait for another xact, we release the lock while waiting,
201 : : * and then must perform a new search.
202 : : *
203 : : * For a partial uniqueness check, we don't wait for the other xact. Just
204 : : * let the tuple in and return false for possibly non-unique, or true for
205 : : * definitely unique.
206 : : */
2463 pg@bowt.ie 207 [ + + ]: 3757204 : if (checkingunique)
208 : : {
209 : : TransactionId xwait;
210 : : uint32 speculativeToken;
211 : :
212 : 2706009 : xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
213 : : &is_unique, &speculativeToken);
214 : :
2099 215 [ + + ]: 2705746 : if (unlikely(TransactionIdIsValid(xwait)))
216 : : {
217 : : /* Have to wait for the other guy ... */
2463 218 : 12 : _bt_relbuf(rel, insertstate.buf);
219 : 12 : insertstate.buf = InvalidBuffer;
220 : :
221 : : /*
222 : : * If it's a speculative insertion, wait for it to finish (ie. to
223 : : * go ahead with the insertion, or kill the tuple). Otherwise
224 : : * wait for the transaction to finish as usual.
225 : : */
3875 andres@anarazel.de 226 [ - + ]: 12 : if (speculativeToken)
3875 andres@anarazel.de 227 :UBC 0 : SpeculativeInsertionWait(xwait, speculativeToken);
228 : : else
3875 andres@anarazel.de 229 :CBC 12 : XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
230 : :
231 : : /* start over... */
2822 andrew@dunslane.net 232 [ - + ]: 12 : if (stack)
2822 andrew@dunslane.net 233 :UBC 0 : _bt_freestack(stack);
2099 pg@bowt.ie 234 :CBC 12 : goto search;
235 : : }
236 : :
237 : : /* Uniqueness is established -- restore heap tid as scantid */
2463 238 [ + - ]: 2705734 : if (itup_key->heapkeyspace)
239 : 2705734 : itup_key->scantid = &itup->t_tid;
240 : : }
241 : :
5984 tgl@sss.pgh.pa.us 242 [ + + ]: 3756929 : if (checkUnique != UNIQUE_CHECK_EXISTING)
243 : : {
244 : : OffsetNumber newitemoff;
245 : :
246 : : /*
247 : : * The only conflict predicate locking cares about for indexes is when
248 : : * an index tuple insert conflicts with an existing lock. We don't
249 : : * know the actual page we're going to insert on for sure just yet in
250 : : * checkingunique and !heapkeyspace cases, but it's okay to use the
251 : : * first page the value could be on (with scantid omitted) instead.
252 : : */
2149 tmunro@postgresql.or 253 : 3756902 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
254 : :
255 : : /*
256 : : * Do the insertion. Note that insertstate contains cached binary
257 : : * search bounds established within _bt_check_unique when insertion is
258 : : * checkingunique.
259 : : */
2463 pg@bowt.ie 260 : 3756899 : newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
261 : : indexUnchanged, stack, heapRel);
990 andres@anarazel.de 262 : 3756899 : _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
263 : : stack, itup, insertstate.itemsz, newitemoff,
264 : : insertstate.postingoff, false);
265 : : }
266 : : else
267 : : {
268 : : /* just release the buffer */
2463 pg@bowt.ie 269 : 27 : _bt_relbuf(rel, insertstate.buf);
270 : : }
271 : :
272 : : /* be tidy */
2822 andrew@dunslane.net 273 [ + + ]: 3756926 : if (stack)
274 : 3271641 : _bt_freestack(stack);
2463 pg@bowt.ie 275 : 3756926 : pfree(itup_key);
276 : :
5984 tgl@sss.pgh.pa.us 277 : 3756926 : return is_unique;
278 : : }
279 : :
280 : : /*
281 : : * _bt_search_insert() -- _bt_search() wrapper for inserts
282 : : *
283 : : * Search the tree for a particular scankey, or more precisely for the first
284 : : * leaf page it could be on. Try to make use of the fastpath optimization's
285 : : * rightmost leaf page cache before actually searching the tree from the root
286 : : * page, though.
287 : : *
288 : : * Return value is a stack of parent-page pointers (though see notes about
289 : : * fastpath optimization and page splits below). insertstate->buf is set to
290 : : * the address of the leaf-page buffer, which is write-locked and pinned in
291 : : * all cases (if necessary by creating a new empty root page for caller).
292 : : *
293 : : * The fastpath optimization avoids most of the work of searching the tree
294 : : * repeatedly when a single backend inserts successive new tuples on the
295 : : * rightmost leaf page of an index. A backend cache of the rightmost leaf
296 : : * page is maintained within _bt_insertonpg(), and used here. The cache is
297 : : * invalidated here when an insert of a non-pivot tuple must take place on a
298 : : * non-rightmost leaf page.
299 : : *
300 : : * The optimization helps with indexes on an auto-incremented field. It also
301 : : * helps with indexes on datetime columns, as well as indexes with lots of
302 : : * NULL values. (NULLs usually get inserted in the rightmost page for single
303 : : * column indexes, since they usually get treated as coming after everything
304 : : * else in the key space. Individual NULL tuples will generally be placed on
305 : : * the rightmost leaf page due to the influence of the heap TID column.)
306 : : *
307 : : * Note that we avoid applying the optimization when there is insufficient
308 : : * space on the rightmost page to fit caller's new item. This is necessary
309 : : * because we'll need to return a real descent stack when a page split is
310 : : * expected (actually, caller can cope with a leaf page split that uses a NULL
311 : : * stack, but that's very slow and so must be avoided). Note also that the
312 : : * fastpath optimization acquires the lock on the page conditionally as a way
313 : : * of reducing extra contention when there are concurrent insertions into the
314 : : * rightmost page (we give up if we'd have to wait for the lock). We assume
315 : : * that it isn't useful to apply the optimization when there is contention,
316 : : * since each per-backend cache won't stay valid for long.
317 : : */
318 : : static BTStack
990 andres@anarazel.de 319 : 3757204 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
320 : : {
2099 pg@bowt.ie 321 [ - + ]: 3757204 : Assert(insertstate->buf == InvalidBuffer);
322 [ - + ]: 3757204 : Assert(!insertstate->bounds_valid);
323 [ - + ]: 3757204 : Assert(insertstate->postingoff == 0);
324 : :
325 [ + - + + ]: 3757204 : if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
326 : : {
327 : : /* Simulate a _bt_getbuf() call with conditional locking */
328 [ + - ]: 31185 : insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
1974 329 [ + + ]: 31185 : if (_bt_conditionallockbuf(rel, insertstate->buf))
330 : : {
331 : : Page page;
332 : : BTPageOpaque opaque;
333 : :
2099 334 : 30659 : _bt_checkpage(rel, insertstate->buf);
335 : 30659 : page = BufferGetPage(insertstate->buf);
1355 michael@paquier.xyz 336 : 30659 : opaque = BTPageGetOpaque(page);
337 : :
338 : : /*
339 : : * Check if the page is still the rightmost leaf page and has
340 : : * enough free space to accommodate the new tuple. Also check
341 : : * that the insertion scan key is strictly greater than the first
342 : : * non-pivot tuple on the page. (Note that we expect itup_key's
343 : : * scantid to be unset when our caller is a checkingunique
344 : : * inserter.)
345 : : */
1855 pg@bowt.ie 346 [ + + ]: 30659 : if (P_RIGHTMOST(opaque) &&
347 [ + - ]: 30634 : P_ISLEAF(opaque) &&
348 [ + - ]: 30634 : !P_IGNORE(opaque) &&
2099 349 [ + + + - ]: 61092 : PageGetFreeSpace(page) > insertstate->itemsz &&
350 [ + + ]: 60916 : PageGetMaxOffsetNumber(page) >= P_HIKEY &&
351 : 30458 : _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
352 : : {
353 : : /*
354 : : * Caller can use the fastpath optimization because cached
355 : : * block is still rightmost leaf page, which can fit caller's
356 : : * new tuple without splitting. Keep block in local cache for
357 : : * next insert, and have caller use NULL stack.
358 : : *
359 : : * Note that _bt_insert_parent() has an assertion that catches
360 : : * leaf page splits that somehow follow from a fastpath insert
361 : : * (it should only be passed a NULL stack when it must deal
362 : : * with a concurrent root page split, and never because a NULL
363 : : * stack was returned here).
364 : : */
365 : 30438 : return NULL;
366 : : }
367 : :
368 : : /* Page unsuitable for caller, drop lock and pin */
369 : 221 : _bt_relbuf(rel, insertstate->buf);
370 : : }
371 : : else
372 : : {
373 : : /* Lock unavailable, drop pin */
374 : 526 : ReleaseBuffer(insertstate->buf);
375 : : }
376 : :
377 : : /* Forget block, since cache doesn't appear to be useful */
378 : 747 : RelationSetTargetBlock(rel, InvalidBlockNumber);
379 : : }
380 : :
381 : : /* Cannot use optimization -- descend tree, return proper descent stack */
990 andres@anarazel.de 382 : 3726766 : return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
383 : : BT_WRITE);
384 : : }
385 : :
386 : : /*
387 : : * _bt_check_unique() -- Check for violation of unique index constraint
388 : : *
389 : : * Returns InvalidTransactionId if there is no conflict, else an xact ID
390 : : * we must wait for to see if it commits a conflicting tuple. If an actual
391 : : * conflict is detected, no return --- just ereport(). If an xact ID is
392 : : * returned, and the conflicting tuple still has a speculative insertion in
393 : : * progress, *speculativeToken is set to non-zero, and the caller can wait for
394 : : * the verdict on the insertion using SpeculativeInsertionWait().
395 : : *
396 : : * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
397 : : * InvalidTransactionId because we don't want to wait. In this case we
398 : : * set *is_unique to false if there is a potential conflict, and the
399 : : * core code must redo the uniqueness check later.
400 : : *
401 : : * As a side-effect, sets state in insertstate that can later be used by
402 : : * _bt_findinsertloc() to reuse most of the binary search work we do
403 : : * here.
404 : : *
405 : : * This code treats NULLs as equal, unlike the default semantics for unique
406 : : * indexes. So do not call here when there are NULL values in scan key and
407 : : * the index uses the default NULLS DISTINCT mode.
408 : : */
409 : : static TransactionId
2463 pg@bowt.ie 410 : 2706009 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
411 : : IndexUniqueCheck checkUnique, bool *is_unique,
412 : : uint32 *speculativeToken)
413 : : {
414 : 2706009 : IndexTuple itup = insertstate->itup;
2012 415 : 2706009 : IndexTuple curitup = NULL;
1713 416 : 2706009 : ItemId curitemid = NULL;
2463 417 : 2706009 : BTScanInsert itup_key = insertstate->itup_key;
418 : : SnapshotData SnapshotDirty;
419 : : OffsetNumber offset;
420 : : OffsetNumber maxoff;
421 : : Page page;
422 : : BTPageOpaque opaque;
9279 tgl@sss.pgh.pa.us 423 : 2706009 : Buffer nbuf = InvalidBuffer;
5984 424 : 2706009 : bool found = false;
2120 pg@bowt.ie 425 : 2706009 : bool inposting = false;
426 : 2706009 : bool prevalldead = true;
427 : 2706009 : int curposti = 0;
428 : :
429 : : /* Assume unique until we find a duplicate */
5984 tgl@sss.pgh.pa.us 430 : 2706009 : *is_unique = true;
431 : :
6841 432 : 2706009 : InitDirtySnapshot(SnapshotDirty);
433 : :
2463 pg@bowt.ie 434 : 2706009 : page = BufferGetPage(insertstate->buf);
1355 michael@paquier.xyz 435 : 2706009 : opaque = BTPageGetOpaque(page);
9279 tgl@sss.pgh.pa.us 436 : 2706009 : maxoff = PageGetMaxOffsetNumber(page);
437 : :
438 : : /*
439 : : * Find the first tuple with the same key.
440 : : *
441 : : * This also saves the binary search bounds in insertstate. We use them
442 : : * in the fastpath below, but also in the _bt_findinsertloc() call later.
443 : : */
2448 pg@bowt.ie 444 [ - + ]: 2706009 : Assert(!insertstate->bounds_valid);
2463 445 : 2706009 : offset = _bt_binsrch_insert(rel, insertstate);
446 : :
447 : : /*
448 : : * Scan over all equal tuples, looking for live conflicts.
449 : : */
450 [ + + - + ]: 2706009 : Assert(!insertstate->bounds_valid || insertstate->low == offset);
2429 451 [ - + ]: 2706009 : Assert(!itup_key->anynullkeys);
2463 452 [ + - ]: 2706009 : Assert(itup_key->scantid == NULL);
453 : : for (;;)
454 : : {
455 : : /*
456 : : * Each iteration of the loop processes one heap TID, not one index
457 : : * tuple. Current offset number for page isn't usually advanced on
458 : : * iterations that process heap TIDs from posting list tuples.
459 : : *
460 : : * "inposting" state is set when _inside_ a posting list --- not when
461 : : * we're at the start (or end) of a posting list. We advance curposti
462 : : * at the end of the iteration when inside a posting list tuple. In
463 : : * general, every loop iteration either advances the page offset or
464 : : * advances curposti --- an iteration that handles the rightmost/max
465 : : * heap TID in a posting list finally advances the page offset (and
466 : : * unsets "inposting").
467 : : *
468 : : * Make sure the offset points to an actual index tuple before trying
469 : : * to examine it...
470 : : */
9279 tgl@sss.pgh.pa.us 471 [ + + ]: 8722850 : if (offset <= maxoff)
472 : : {
473 : : /*
474 : : * Fastpath: In most cases, we can use cached search bounds to
475 : : * limit our consideration to items that are definitely
476 : : * duplicates. This fastpath doesn't apply when the original page
477 : : * is empty, or when initial offset is past the end of the
478 : : * original page, which may indicate that we need to examine a
479 : : * second or subsequent page.
480 : : *
481 : : * Note that this optimization allows us to avoid calling
482 : : * _bt_compare() directly when there are no duplicates, as long as
483 : : * the offset where the key will go is not at the end of the page.
484 : : */
2463 pg@bowt.ie 485 [ + + + + ]: 7221586 : if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
486 : : {
487 [ - + ]: 1085187 : Assert(insertstate->bounds_valid);
488 [ + + - + ]: 1085187 : Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
489 [ - + ]: 1085187 : Assert(insertstate->low <= insertstate->stricthigh);
2429 490 [ - + ]: 1085187 : Assert(_bt_compare(rel, itup_key, page, offset) < 0);
2463 491 : 1085187 : break;
492 : : }
493 : :
494 : : /*
495 : : * We can skip items that are already marked killed.
496 : : *
497 : : * In the presence of heavy update activity an index may contain
498 : : * many killed items with the same key; running _bt_compare() on
499 : : * each killed item gets expensive. Just advance over killed
500 : : * items as quickly as we can. We only apply _bt_compare() when
501 : : * we get to a non-killed item. We could reuse the bounds to
502 : : * avoid _bt_compare() calls for known equal tuples, but it
503 : : * doesn't seem worth it.
504 : : */
2120 505 [ + + ]: 6136399 : if (!inposting)
506 : 3878510 : curitemid = PageGetItemId(page, offset);
507 [ + + + + ]: 6136399 : if (inposting || !ItemIdIsDead(curitemid))
508 : : {
509 : : ItemPointerData htid;
510 : 5841577 : bool all_dead = false;
511 : :
512 [ + + ]: 5841577 : if (!inposting)
513 : : {
514 : : /* Plain tuple, or first TID in posting list tuple */
515 [ + + ]: 3583688 : if (_bt_compare(rel, itup_key, page, offset) != 0)
516 : 106512 : break; /* we're past all the equal tuples */
517 : :
518 : : /* Advanced curitup */
519 : 3477176 : curitup = (IndexTuple) PageGetItem(page, curitemid);
520 [ - + ]: 3477176 : Assert(!BTreeTupleIsPivot(curitup));
521 : : }
522 : :
523 : : /* okay, we gotta fetch the heap tuple using htid ... */
524 [ + + ]: 5735065 : if (!BTreeTupleIsPosting(curitup))
525 : : {
526 : : /* ... htid is from simple non-pivot tuple */
527 [ - + ]: 3453723 : Assert(!inposting);
528 : 3453723 : htid = curitup->t_tid;
529 : : }
530 [ + + ]: 2281342 : else if (!inposting)
531 : : {
532 : : /* ... htid is first TID in new posting list */
533 : 23453 : inposting = true;
534 : 23453 : prevalldead = true;
535 : 23453 : curposti = 0;
536 : 23453 : htid = *BTreeTupleGetPostingN(curitup, 0);
537 : : }
538 : : else
539 : : {
540 : : /* ... htid is second or subsequent TID in posting list */
541 [ - + ]: 2257889 : Assert(curposti > 0);
542 : 2257889 : htid = *BTreeTupleGetPostingN(curitup, curposti);
543 : : }
544 : :
545 : : /*
546 : : * If we are doing a recheck, we expect to find the tuple we
547 : : * are rechecking. It's not a duplicate, but we have to keep
548 : : * scanning.
549 : : */
5984 tgl@sss.pgh.pa.us 550 [ + + + + ]: 5735174 : if (checkUnique == UNIQUE_CHECK_EXISTING &&
551 : 109 : ItemPointerCompare(&htid, &itup->t_tid) == 0)
552 : : {
553 : 27 : found = true;
554 : : }
555 : :
556 : : /*
557 : : * Check if there's any table tuples for this index entry
558 : : * satisfying SnapshotDirty. This is necessary because for AMs
559 : : * with optimizations like heap's HOT, we have just a single
560 : : * index entry for the entire chain.
561 : : */
2458 andres@anarazel.de 562 [ + + ]: 5735038 : else if (table_index_fetch_tuple_check(heapRel, &htid,
563 : : &SnapshotDirty,
564 : : &all_dead))
565 : : {
566 : : TransactionId xwait;
567 : :
568 : : /*
569 : : * It is a duplicate. If we are only doing a partial
570 : : * check, then don't bother checking if the tuple is being
571 : : * updated in another transaction. Just return the fact
572 : : * that it is a potential conflict and leave the full
573 : : * check till later. Don't invalidate binary search
574 : : * bounds.
575 : : */
5984 tgl@sss.pgh.pa.us 576 [ + + ]: 381 : if (checkUnique == UNIQUE_CHECK_PARTIAL)
577 : : {
578 [ - + ]: 106 : if (nbuf != InvalidBuffer)
5984 tgl@sss.pgh.pa.us 579 :UBC 0 : _bt_relbuf(rel, nbuf);
5984 tgl@sss.pgh.pa.us 580 :CBC 106 : *is_unique = false;
581 : 118 : return InvalidTransactionId;
582 : : }
583 : :
584 : : /*
585 : : * If this tuple is being updated by other transaction
586 : : * then we have to wait for its commit/abort.
587 : : */
588 : 550 : xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
589 [ + + ]: 275 : SnapshotDirty.xmin : SnapshotDirty.xmax;
590 : :
8607 591 [ + + ]: 275 : if (TransactionIdIsValid(xwait))
592 : : {
593 [ - + ]: 12 : if (nbuf != InvalidBuffer)
8607 tgl@sss.pgh.pa.us 594 :UBC 0 : _bt_relbuf(rel, nbuf);
595 : : /* Tell _bt_doinsert to wait... */
3875 andres@anarazel.de 596 :CBC 12 : *speculativeToken = SnapshotDirty.speculativeToken;
597 : : /* Caller releases lock on buf immediately */
2448 pg@bowt.ie 598 : 12 : insertstate->bounds_valid = false;
8607 tgl@sss.pgh.pa.us 599 : 12 : return xwait;
600 : : }
601 : :
602 : : /*
603 : : * Otherwise we have a definite conflict. But before
604 : : * complaining, look to see if the tuple we want to insert
605 : : * is itself now committed dead --- if so, don't complain.
606 : : * This is a waste of time in normal scenarios but we must
607 : : * do it to support CREATE INDEX CONCURRENTLY.
608 : : *
609 : : * We must follow HOT-chains here because during
610 : : * concurrent index build, we insert the root TID though
611 : : * the actual tuple may be somewhere in the HOT-chain.
612 : : * While following the chain we might not stop at the
613 : : * exact tuple which triggered the insert, but that's OK
614 : : * because if we find a live tuple anywhere in this chain,
615 : : * we have a unique key conflict. The other live tuple is
616 : : * not part of this chain because it had a different index
617 : : * entry.
618 : : */
2000 pg@bowt.ie 619 : 263 : htid = itup->t_tid;
620 [ - + ]: 263 : if (table_index_fetch_tuple_check(heapRel, &htid,
621 : : SnapshotSelf, NULL))
622 : : {
623 : : /* Normal case --- it's still live */
624 : : }
625 : : else
626 : : {
627 : : /*
628 : : * It's been deleted, so no error, and no need to
629 : : * continue searching
630 : : */
7053 tgl@sss.pgh.pa.us 631 :UBC 0 : break;
632 : : }
633 : :
634 : : /*
635 : : * Check for a conflict-in as we would if we were going to
636 : : * write to this page. We aren't actually going to write,
637 : : * but we want a chance to report SSI conflicts that would
638 : : * otherwise be masked by this unique constraint
639 : : * violation.
640 : : */
2149 tmunro@postgresql.or 641 :CBC 263 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
642 : :
643 : : /*
644 : : * This is a definite conflict. Break the tuple down into
645 : : * datums and report the error. But first, make sure we
646 : : * release the buffer locks we're holding ---
647 : : * BuildIndexValueDescription could make catalog accesses,
648 : : * which in the worst case might touch this same index and
649 : : * cause deadlocks.
650 : : */
5981 tgl@sss.pgh.pa.us 651 [ - + ]: 259 : if (nbuf != InvalidBuffer)
5981 tgl@sss.pgh.pa.us 652 :UBC 0 : _bt_relbuf(rel, nbuf);
2463 pg@bowt.ie 653 :CBC 259 : _bt_relbuf(rel, insertstate->buf);
654 : 259 : insertstate->buf = InvalidBuffer;
2448 655 : 259 : insertstate->bounds_valid = false;
656 : :
657 : : {
658 : : Datum values[INDEX_MAX_KEYS];
659 : : bool isnull[INDEX_MAX_KEYS];
660 : : char *key_desc;
661 : :
5981 tgl@sss.pgh.pa.us 662 : 259 : index_deform_tuple(itup, RelationGetDescr(rel),
663 : : values, isnull);
664 : :
3991 sfrost@snowman.net 665 : 259 : key_desc = BuildIndexValueDescription(rel, values,
666 : : isnull);
667 : :
5981 tgl@sss.pgh.pa.us 668 [ + - + + ]: 259 : ereport(ERROR,
669 : : (errcode(ERRCODE_UNIQUE_VIOLATION),
670 : : errmsg("duplicate key value violates unique constraint \"%s\"",
671 : : RelationGetRelationName(rel)),
672 : : key_desc ? errdetail("Key %s already exists.",
673 : : key_desc) : 0,
674 : : errtableconstraint(heapRel,
675 : : RelationGetRelationName(rel))));
676 : : }
677 : : }
2120 pg@bowt.ie 678 [ + + + + : 5734657 : else if (all_dead && (!inposting ||
+ + ]
679 : 19129 : (prevalldead &&
680 [ + + ]: 19129 : curposti == BTreeTupleGetNPosting(curitup) - 1)))
681 : : {
682 : : /*
683 : : * The conflicting tuple (or all HOT chains pointed to by
684 : : * all posting list TIDs) is dead to everyone, so mark the
685 : : * index entry killed.
686 : : */
6662 tgl@sss.pgh.pa.us 687 : 53383 : ItemIdMarkDead(curitemid);
688 : 53383 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
689 : :
690 : : /*
691 : : * Mark buffer with a dirty hint, since state is not
692 : : * crucial. Be sure to mark the proper buffer dirty.
693 : : */
694 [ + + ]: 53383 : if (nbuf != InvalidBuffer)
4565 jdavis@postgresql.or 695 : 3 : MarkBufferDirtyHint(nbuf, true);
696 : : else
2463 pg@bowt.ie 697 : 53380 : MarkBufferDirtyHint(insertstate->buf, true);
698 : : }
699 : :
700 : : /*
701 : : * Remember if posting list tuple has even a single HOT chain
702 : : * whose members are not all dead
703 : : */
2120 704 [ + + + + ]: 5734684 : if (!all_dead && inposting)
705 : 2261750 : prevalldead = false;
706 : : }
707 : : }
708 : :
709 [ + + + + ]: 7530770 : if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
710 : : {
711 : : /* Advance to next TID in same posting list */
712 : 2257889 : curposti++;
713 : 2257889 : continue;
714 : : }
715 [ + + ]: 5272881 : else if (offset < maxoff)
716 : : {
717 : : /* Advance to next tuple */
718 : 3753930 : curposti = 0;
719 : 3753930 : inposting = false;
9279 tgl@sss.pgh.pa.us 720 : 3753930 : offset = OffsetNumberNext(offset);
721 : : }
722 : : else
723 : : {
724 : : int highkeycmp;
725 : :
726 : : /* If scankey == hikey we gotta check the next page too */
727 [ + + ]: 1518951 : if (P_RIGHTMOST(opaque))
728 : 1437685 : break;
2463 pg@bowt.ie 729 : 81266 : highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
730 [ - + ]: 81266 : Assert(highkeycmp <= 0);
731 [ + + ]: 81266 : if (highkeycmp != 0)
9279 tgl@sss.pgh.pa.us 732 : 76244 : break;
733 : : /* Advance to next non-dead page --- there must be one */
734 : : for (;;)
8333 tgl@sss.pgh.pa.us 735 :UBC 0 : {
2120 pg@bowt.ie 736 :CBC 5022 : BlockNumber nblkno = opaque->btpo_next;
737 : :
7909 tgl@sss.pgh.pa.us 738 : 5022 : nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
3527 kgrittn@postgresql.o 739 : 5022 : page = BufferGetPage(nbuf);
1355 michael@paquier.xyz 740 : 5022 : opaque = BTPageGetOpaque(page);
8333 tgl@sss.pgh.pa.us 741 [ + - ]: 5022 : if (!P_IGNORE(opaque))
742 : 5022 : break;
8333 tgl@sss.pgh.pa.us 743 [ # # ]:UBC 0 : if (P_RIGHTMOST(opaque))
6560 744 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
745 : : RelationGetRelationName(rel));
746 : : }
747 : : /* Will also advance to next tuple */
2120 pg@bowt.ie 748 :CBC 5022 : curposti = 0;
749 : 5022 : inposting = false;
9279 tgl@sss.pgh.pa.us 750 : 5022 : maxoff = PageGetMaxOffsetNumber(page);
751 [ + + ]: 5022 : offset = P_FIRSTDATAKEY(opaque);
752 : : /* Don't invalidate binary search bounds */
753 : : }
754 : : }
755 : :
756 : : /*
757 : : * If we are doing a recheck then we should have found the tuple we are
758 : : * checking. Otherwise there's something very wrong --- probably, the
759 : : * index is on a non-immutable expression.
760 : : */
5984 761 [ + + - + ]: 2705628 : if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
5984 tgl@sss.pgh.pa.us 762 [ # # ]:UBC 0 : ereport(ERROR,
763 : : (errcode(ERRCODE_INTERNAL_ERROR),
764 : : errmsg("failed to re-find tuple within index \"%s\"",
765 : : RelationGetRelationName(rel)),
766 : : errhint("This may be because of a non-immutable index expression."),
767 : : errtableconstraint(heapRel,
768 : : RelationGetRelationName(rel))));
769 : :
9279 tgl@sss.pgh.pa.us 770 [ + + ]:CBC 2705628 : if (nbuf != InvalidBuffer)
8920 771 : 2880 : _bt_relbuf(rel, nbuf);
772 : :
8881 773 : 2705628 : return InvalidTransactionId;
774 : : }
775 : :
776 : :
777 : : /*
778 : : * _bt_findinsertloc() -- Finds an insert location for a tuple
779 : : *
780 : : * On entry, insertstate buffer contains the page the new tuple belongs
781 : : * on. It is exclusive-locked and pinned by the caller.
782 : : *
783 : : * If 'checkingunique' is true, the buffer on entry is the first page
784 : : * that contains duplicates of the new key. If there are duplicates on
785 : : * multiple pages, the correct insertion position might be some page to
786 : : * the right, rather than the first page. In that case, this function
787 : : * moves right to the correct target page.
788 : : *
789 : : * (In a !heapkeyspace index, there can be multiple pages with the same
790 : : * high key, where the new tuple could legitimately be placed on. In
791 : : * that case, the caller passes the first page containing duplicates,
792 : : * just like when checkingunique=true. If that page doesn't have enough
793 : : * room for the new tuple, this function moves right, trying to find a
794 : : * legal page that does.)
795 : : *
796 : : * If 'indexUnchanged' is true, this is for an UPDATE that didn't
797 : : * logically change the indexed value, but must nevertheless have a new
798 : : * entry to point to a successor version. This hint from the executor
799 : : * will influence our behavior when the page might have to be split and
800 : : * we must consider our options. Bottom-up index deletion can avoid
801 : : * pathological version-driven page splits, but we only want to go to the
802 : : * trouble of trying it when we already have moderate confidence that
803 : : * it's appropriate. The hint should not significantly affect our
804 : : * behavior over time unless practically all inserts on to the leaf page
805 : : * get the hint.
806 : : *
807 : : * On exit, insertstate buffer contains the chosen insertion page, and
808 : : * the offset within that page is returned. If _bt_findinsertloc needed
809 : : * to move right, the lock and pin on the original page are released, and
810 : : * the new buffer is exclusively locked and pinned instead.
811 : : *
812 : : * If insertstate contains cached binary search bounds, we will take
813 : : * advantage of them. This avoids repeating comparisons that we made in
814 : : * _bt_check_unique() already.
815 : : */
816 : : static OffsetNumber
6863 bruce@momjian.us 817 : 3756899 : _bt_findinsertloc(Relation rel,
818 : : BTInsertState insertstate,
819 : : bool checkingunique,
820 : : bool indexUnchanged,
821 : : BTStack stack,
822 : : Relation heapRel)
823 : : {
2463 pg@bowt.ie 824 : 3756899 : BTScanInsert itup_key = insertstate->itup_key;
825 : 3756899 : Page page = BufferGetPage(insertstate->buf);
826 : : BTPageOpaque opaque;
827 : : OffsetNumber newitemoff;
828 : :
1355 michael@paquier.xyz 829 : 3756899 : opaque = BTPageGetOpaque(page);
830 : :
831 : : /* Check 1/3 of a page restriction */
280 pg@bowt.ie 832 [ - + ]: 3756899 : if (unlikely(insertstate->itemsz > BTMaxItemSize))
2463 pg@bowt.ie 833 :UBC 0 : _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
834 : : insertstate->itup);
835 : :
1855 pg@bowt.ie 836 [ + - - + ]:CBC 3756899 : Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
2463 837 [ + + - + ]: 3756899 : Assert(!insertstate->bounds_valid || checkingunique);
838 [ + - - + ]: 3756899 : Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
839 [ - + - - ]: 3756899 : Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
2120 840 [ + + - + ]: 3756899 : Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
841 : :
2463 842 [ + - ]: 3756899 : if (itup_key->heapkeyspace)
843 : : {
844 : : /* Keep track of whether checkingunique duplicate seen */
1798 845 : 3756899 : bool uniquedup = indexUnchanged;
846 : :
847 : : /*
848 : : * If we're inserting into a unique index, we may have to walk right
849 : : * through leaf pages to find the one leaf page that we must insert on
850 : : * to.
851 : : *
852 : : * This is needed for checkingunique callers because a scantid was not
853 : : * used when we called _bt_search(). scantid can only be set after
854 : : * _bt_check_unique() has checked for duplicates. The buffer
855 : : * initially stored in insertstate->buf has the page where the first
856 : : * duplicate key might be found, which isn't always the page that new
857 : : * tuple belongs on. The heap TID attribute for new tuple (scantid)
858 : : * could force us to insert on a sibling page, though that should be
859 : : * very rare in practice.
860 : : */
2463 861 [ + + ]: 3756899 : if (checkingunique)
862 : : {
2120 863 [ + + ]: 2705704 : if (insertstate->low < insertstate->stricthigh)
864 : : {
865 : : /* Encountered a duplicate in _bt_check_unique() */
866 [ - + ]: 229166 : Assert(insertstate->bounds_valid);
867 : 229166 : uniquedup = true;
868 : : }
869 : :
870 : : for (;;)
871 : : {
872 : : /*
873 : : * Does the new tuple belong on this page?
874 : : *
875 : : * The earlier _bt_check_unique() call may well have
876 : : * established a strict upper bound on the offset for the new
877 : : * item. If it's not the last item of the page (i.e. if there
878 : : * is at least one tuple on the page that goes after the tuple
879 : : * we're inserting) then we know that the tuple belongs on
880 : : * this page. We can skip the high key check.
881 : : */
2463 882 [ + + ]: 2710726 : if (insertstate->bounds_valid &&
883 [ + - + + ]: 5400438 : insertstate->low <= insertstate->stricthigh &&
884 : 2700219 : insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
885 : 1178057 : break;
886 : :
887 : : /* Test '<=', not '!=', since scantid is set now */
1855 888 [ + + + + ]: 1621642 : if (P_RIGHTMOST(opaque) ||
2463 889 : 88973 : _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
890 : : break;
891 : :
990 andres@anarazel.de 892 : 5022 : _bt_stepright(rel, heapRel, insertstate, stack);
893 : : /* Update local state after stepping right */
2463 pg@bowt.ie 894 : 5022 : page = BufferGetPage(insertstate->buf);
1355 michael@paquier.xyz 895 : 5022 : opaque = BTPageGetOpaque(page);
896 : : /* Assume duplicates (if checkingunique) */
2120 pg@bowt.ie 897 : 5022 : uniquedup = true;
898 : : }
899 : : }
900 : :
901 : : /*
902 : : * If the target page cannot fit newitem, try to avoid splitting the
903 : : * page on insert by performing deletion or deduplication now
904 : : */
905 [ + + ]: 3756899 : if (PageGetFreeSpace(page) < insertstate->itemsz)
1855 906 : 26199 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
907 : : checkingunique, uniquedup,
908 : : indexUnchanged);
909 : : }
910 : : else
911 : : {
912 : : /*----------
913 : : * This is a !heapkeyspace (version 2 or 3) index. The current page
914 : : * is the first page that we could insert the new tuple to, but there
915 : : * may be other pages to the right that we could opt to use instead.
916 : : *
917 : : * If the new key is equal to one or more existing keys, we can
918 : : * legitimately place it anywhere in the series of equal keys. In
919 : : * fact, if the new key is equal to the page's "high key" we can place
920 : : * it on the next page. If it is equal to the high key, and there's
921 : : * not room to insert the new tuple on the current page without
922 : : * splitting, then we move right hoping to find more free space and
923 : : * avoid a split.
924 : : *
925 : : * Keep scanning right until we
926 : : * (a) find a page with enough free space,
927 : : * (b) reach the last page where the tuple can legally go, or
928 : : * (c) get tired of searching.
929 : : * (c) is not flippant; it is important because if there are many
930 : : * pages' worth of equal keys, it's better to split one of the early
931 : : * pages than to scan all the way to the end of the run of equal keys
932 : : * on every insert. We implement "get tired" as a random choice,
933 : : * since stopping after scanning a fixed number of pages wouldn't work
934 : : * well (we'd never reach the right-hand side of previously split
935 : : * pages). The probability of moving right is set at 0.99, which may
936 : : * seem too high to change the behavior much, but it does an excellent
937 : : * job of preventing O(N^2) behavior with many equal keys.
938 : : *----------
939 : : */
2463 pg@bowt.ie 940 [ # # ]:UBC 0 : while (PageGetFreeSpace(page) < insertstate->itemsz)
941 : : {
942 : : /*
943 : : * Before considering moving right, see if we can obtain enough
944 : : * space by erasing LP_DEAD items
945 : : */
1855 946 [ # # ]: 0 : if (P_HAS_GARBAGE(opaque))
947 : : {
948 : : /* Perform simple deletion */
949 : 0 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
950 : : false, false, false);
951 : :
2463 952 [ # # ]: 0 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
953 : 0 : break; /* OK, now we have enough space */
954 : : }
955 : :
956 : : /*
957 : : * Nope, so check conditions (b) and (c) enumerated above
958 : : *
959 : : * The earlier _bt_check_unique() call may well have established a
960 : : * strict upper bound on the offset for the new item. If it's not
961 : : * the last item of the page (i.e. if there is at least one tuple
962 : : * on the page that's greater than the tuple we're inserting to)
963 : : * then we know that the tuple belongs on this page. We can skip
964 : : * the high key check.
965 : : */
966 [ # # ]: 0 : if (insertstate->bounds_valid &&
967 [ # # # # ]: 0 : insertstate->low <= insertstate->stricthigh &&
968 : 0 : insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
969 : 0 : break;
970 : :
1855 971 [ # # # # ]: 0 : if (P_RIGHTMOST(opaque) ||
2463 972 [ # # ]: 0 : _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
1479 tgl@sss.pgh.pa.us 973 : 0 : pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
974 : : break;
975 : :
990 andres@anarazel.de 976 : 0 : _bt_stepright(rel, heapRel, insertstate, stack);
977 : : /* Update local state after stepping right */
2463 pg@bowt.ie 978 : 0 : page = BufferGetPage(insertstate->buf);
1355 michael@paquier.xyz 979 : 0 : opaque = BTPageGetOpaque(page);
980 : : }
981 : : }
982 : :
983 : : /*
984 : : * We should now be on the correct page. Find the offset within the page
985 : : * for the new tuple. (Possibly reusing earlier search bounds.)
986 : : */
1855 pg@bowt.ie 987 [ + + - + ]:CBC 3756899 : Assert(P_RIGHTMOST(opaque) ||
988 : : _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
989 : :
2120 990 : 3756899 : newitemoff = _bt_binsrch_insert(rel, insertstate);
991 : :
992 [ + + ]: 3756899 : if (insertstate->postingoff == -1)
993 : : {
994 : : /*
995 : : * There is an overlapping posting list tuple with its LP_DEAD bit
996 : : * set. We don't want to unnecessarily unset its LP_DEAD bit while
997 : : * performing a posting list split, so perform simple index tuple
998 : : * deletion early.
999 : : */
1855 1000 : 3 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1001 : : false, false, false);
1002 : :
1003 : : /*
1004 : : * Do new binary search. New insert location cannot overlap with any
1005 : : * posting list now.
1006 : : */
1007 [ - + ]: 3 : Assert(!insertstate->bounds_valid);
2120 1008 : 3 : insertstate->postingoff = 0;
1009 : 3 : newitemoff = _bt_binsrch_insert(rel, insertstate);
1010 [ - + ]: 3 : Assert(insertstate->postingoff == 0);
1011 : : }
1012 : :
1013 : 3756899 : return newitemoff;
1014 : : }
1015 : :
1016 : : /*
1017 : : * Step right to next non-dead page, during insertion.
1018 : : *
1019 : : * This is a bit more complicated than moving right in a search. We must
1020 : : * write-lock the target page before releasing write lock on current page;
1021 : : * else someone else's _bt_check_unique scan could fail to see our insertion.
1022 : : * Write locks on intermediate dead pages won't do because we don't know when
1023 : : * they will get de-linked from the tree.
1024 : : *
1025 : : * This is more aggressive than it needs to be for non-unique !heapkeyspace
1026 : : * indexes.
1027 : : */
1028 : : static void
920 1029 : 5022 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1030 : : BTStack stack)
1031 : : {
1032 : : Page page;
1033 : : BTPageOpaque opaque;
1034 : : Buffer rbuf;
1035 : : BlockNumber rblkno;
1036 : :
1037 [ - + ]: 5022 : Assert(heaprel != NULL);
2463 1038 : 5022 : page = BufferGetPage(insertstate->buf);
1355 michael@paquier.xyz 1039 : 5022 : opaque = BTPageGetOpaque(page);
1040 : :
2463 pg@bowt.ie 1041 : 5022 : rbuf = InvalidBuffer;
1855 1042 : 5022 : rblkno = opaque->btpo_next;
1043 : : for (;;)
1044 : : {
2463 1045 : 5022 : rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1046 : 5022 : page = BufferGetPage(rbuf);
1355 michael@paquier.xyz 1047 : 5022 : opaque = BTPageGetOpaque(page);
1048 : :
1049 : : /*
1050 : : * If this page was incompletely split, finish the split now. We do
1051 : : * this while holding a lock on the left sibling, which is not good
1052 : : * because finishing the split could be a fairly lengthy operation.
1053 : : * But this should happen very seldom.
1054 : : */
1855 pg@bowt.ie 1055 [ - + ]: 5022 : if (P_INCOMPLETE_SPLIT(opaque))
1056 : : {
990 andres@anarazel.de 1057 :UBC 0 : _bt_finish_split(rel, heaprel, rbuf, stack);
2463 pg@bowt.ie 1058 : 0 : rbuf = InvalidBuffer;
1059 : 0 : continue;
1060 : : }
1061 : :
1855 pg@bowt.ie 1062 [ + - ]:CBC 5022 : if (!P_IGNORE(opaque))
2463 1063 : 5022 : break;
1855 pg@bowt.ie 1064 [ # # ]:UBC 0 : if (P_RIGHTMOST(opaque))
2463 1065 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
1066 : : RelationGetRelationName(rel));
1067 : :
1855 1068 : 0 : rblkno = opaque->btpo_next;
1069 : : }
1070 : : /* rbuf locked; unlock buf, update state for caller */
2463 pg@bowt.ie 1071 :CBC 5022 : _bt_relbuf(rel, insertstate->buf);
1072 : 5022 : insertstate->buf = rbuf;
1073 : 5022 : insertstate->bounds_valid = false;
6863 bruce@momjian.us 1074 : 5022 : }
1075 : :
1076 : : /*----------
1077 : : * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1078 : : *
1079 : : * This recursive procedure does the following things:
1080 : : *
1081 : : * + if postingoff != 0, splits existing posting list tuple
1082 : : * (since it overlaps with new 'itup' tuple).
1083 : : * + if necessary, splits the target page, using 'itup_key' for
1084 : : * suffix truncation on leaf pages (caller passes NULL for
1085 : : * non-leaf pages).
1086 : : * + inserts the new tuple (might be split from posting list).
1087 : : * + if the page was split, pops the parent stack, and finds the
1088 : : * right place to insert the new child pointer (by walking
1089 : : * right using information stored in the parent stack).
1090 : : * + invokes itself with the appropriate tuple for the right
1091 : : * child page on the parent.
1092 : : * + updates the metapage if a true root or fast root is split.
1093 : : *
1094 : : * On entry, we must have the correct buffer in which to do the
1095 : : * insertion, and the buffer must be pinned and write-locked. On return,
1096 : : * we will have dropped both the pin and the lock on the buffer.
1097 : : *
1098 : : * This routine only performs retail tuple insertions. 'itup' should
1099 : : * always be either a non-highkey leaf item, or a downlink (new high
1100 : : * key items are created indirectly, when a page is split). When
1101 : : * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1102 : : * we're inserting the downlink for. This function will clear the
1103 : : * INCOMPLETE_SPLIT flag on it, and release the buffer.
1104 : : *----------
1105 : : */
1106 : : static void
1107 : 3768011 : _bt_insertonpg(Relation rel,
1108 : : Relation heaprel,
1109 : : BTScanInsert itup_key,
1110 : : Buffer buf,
1111 : : Buffer cbuf,
1112 : : BTStack stack,
1113 : : IndexTuple itup,
1114 : : Size itemsz,
1115 : : OffsetNumber newitemoff,
1116 : : int postingoff,
1117 : : bool split_only_page)
1118 : : {
1119 : : Page page;
1120 : : BTPageOpaque opaque;
1121 : : bool isleaf,
1122 : : isroot,
1123 : : isrightmost,
1124 : : isonly;
2120 pg@bowt.ie 1125 : 3768011 : IndexTuple oposting = NULL;
1126 : 3768011 : IndexTuple origitup = NULL;
1127 : 3768011 : IndexTuple nposting = NULL;
1128 : :
3527 kgrittn@postgresql.o 1129 : 3768011 : page = BufferGetPage(buf);
1355 michael@paquier.xyz 1130 : 3768011 : opaque = BTPageGetOpaque(page);
1855 pg@bowt.ie 1131 : 3768011 : isleaf = P_ISLEAF(opaque);
1132 : 3768011 : isroot = P_ISROOT(opaque);
1133 : 3768011 : isrightmost = P_RIGHTMOST(opaque);
1134 [ + + + + ]: 3768011 : isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1135 : :
1136 : : /* child buffer must be given iff inserting on an internal page */
1137 [ - + ]: 3768011 : Assert(isleaf == !BufferIsValid(cbuf));
1138 : : /* tuple must have appropriate number of attributes */
1139 [ + + - + : 3768011 : Assert(!isleaf ||
- - ]
1140 : : BTreeTupleGetNAtts(itup, rel) ==
1141 : : IndexRelationGetNumberOfAttributes(rel));
1142 [ + + + - : 3768011 : Assert(isleaf ||
- + ]
1143 : : BTreeTupleGetNAtts(itup, rel) <=
1144 : : IndexRelationGetNumberOfKeyAttributes(rel));
2120 1145 [ - + ]: 3768011 : Assert(!BTreeTupleIsPosting(itup));
2101 1146 [ - + ]: 3768011 : Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1147 : : /* Caller must always finish incomplete split for us */
1855 1148 [ - + ]: 3768011 : Assert(!P_INCOMPLETE_SPLIT(opaque));
1149 : :
1150 : : /*
1151 : : * Every internal page should have exactly one negative infinity item at
1152 : : * all times. Only _bt_split() and _bt_newlevel() should add items that
1153 : : * become negative infinity items through truncation, since they're the
1154 : : * only routines that allocate new internal pages.
1155 : : */
1156 [ + + + + : 3768011 : Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
- + ]
1157 : :
1158 : : /*
1159 : : * Do we need to split an existing posting list item?
1160 : : */
2120 1161 [ + + ]: 3768011 : if (postingoff != 0)
1162 : : {
1163 : 15988 : ItemId itemid = PageGetItemId(page, newitemoff);
1164 : :
1165 : : /*
1166 : : * The new tuple is a duplicate with a heap TID that falls inside the
1167 : : * range of an existing posting list tuple on a leaf page. Prepare to
1168 : : * split an existing posting list. Overwriting the posting list with
1169 : : * its post-split version is treated as an extra step in either the
1170 : : * insert or page split critical section.
1171 : : */
1511 1172 [ + - + - : 15988 : Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
- + ]
2120 1173 : 15988 : oposting = (IndexTuple) PageGetItem(page, itemid);
1174 : :
1175 : : /*
1176 : : * postingoff value comes from earlier call to _bt_binsrch_posting().
1177 : : * Its binary search might think that a plain tuple must be a posting
1178 : : * list tuple that needs to be split. This can happen with corruption
1179 : : * involving an existing plain tuple that is a duplicate of the new
1180 : : * item, up to and including its table TID. Check for that here in
1181 : : * passing.
1182 : : *
1183 : : * Also verify that our caller has made sure that the existing posting
1184 : : * list tuple does not have its LP_DEAD bit set.
1185 : : */
1511 1186 [ + - - + ]: 15988 : if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1511 pg@bowt.ie 1187 [ # # ]:UBC 0 : ereport(ERROR,
1188 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1189 : : errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1190 : : ItemPointerGetBlockNumber(&itup->t_tid),
1191 : : ItemPointerGetOffsetNumber(&itup->t_tid),
1192 : : newitemoff, BufferGetBlockNumber(buf),
1193 : : RelationGetRelationName(rel))));
1194 : :
1195 : : /* use a mutable copy of itup as our itup from here on */
2120 pg@bowt.ie 1196 :CBC 15988 : origitup = itup;
1197 : 15988 : itup = CopyIndexTuple(origitup);
1198 : 15988 : nposting = _bt_swap_posting(itup, oposting, postingoff);
1199 : : /* itup now contains rightmost/max TID from oposting */
1200 : :
1201 : : /* Alter offset so that newitem goes after posting list */
1202 : 15988 : newitemoff = OffsetNumberNext(newitemoff);
1203 : : }
1204 : :
1205 : : /*
1206 : : * Do we need to split the page to fit the item on it?
1207 : : *
1208 : : * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1209 : : * so this comparison is correct even though we appear to be accounting
1210 : : * only for the item and not for its line pointer.
1211 : : */
9279 tgl@sss.pgh.pa.us 1212 [ + + ]: 3768011 : if (PageGetFreeSpace(page) < itemsz)
1213 : : {
1214 : : Buffer rbuf;
1215 : :
2073 pg@bowt.ie 1216 [ - + ]: 11803 : Assert(!split_only_page);
1217 : :
1218 : : /* split the buffer into left and right halves */
990 andres@anarazel.de 1219 : 11803 : rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1220 : : itup, origitup, nposting, postingoff);
5426 heikki.linnakangas@i 1221 : 11803 : PredicateLockPageSplit(rel,
1222 : : BufferGetBlockNumber(buf),
1223 : : BufferGetBlockNumber(rbuf));
1224 : :
1225 : : /*----------
1226 : : * By here,
1227 : : *
1228 : : * + our target page has been split;
1229 : : * + the original tuple has been inserted;
1230 : : * + we have write locks on both the old (left half)
1231 : : * and new (right half) buffers, after the split; and
1232 : : * + we know the key we want to insert into the parent
1233 : : * (it's the "high key" on the left child page).
1234 : : *
1235 : : * We're ready to do the parent insertion. We need to hold onto the
1236 : : * locks for the child pages until we locate the parent, but we can
1237 : : * at least release the lock on the right child before doing the
1238 : : * actual insertion. The lock on the left child will be released
1239 : : * last of all by parent insertion, where it is the 'cbuf' of parent
1240 : : * page.
1241 : : *----------
1242 : : */
1243 : : #ifdef USE_INJECTION_POINTS
1244 : : if (P_ISLEAF(opaque))
1245 : : INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
1246 : : else
1247 : : INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
1248 : : #endif
1249 : :
990 andres@anarazel.de 1250 : 11803 : _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1251 : : }
1252 : : else
1253 : : {
8334 tgl@sss.pgh.pa.us 1254 : 3756208 : Buffer metabuf = InvalidBuffer;
1255 : 3756208 : Page metapg = NULL;
1256 : 3756208 : BTMetaPageData *metad = NULL;
1257 : : BlockNumber blockcache;
1258 : :
1259 : : /*
1260 : : * If we are doing this insert because we split a page that was the
1261 : : * only one on its tree level, but was not the root, it may have been
1262 : : * the "fast root". We need to ensure that the fast root link points
1263 : : * at or above the current page. We can safely acquire a lock on the
1264 : : * metapage here --- see comments for _bt_newlevel().
1265 : : */
1855 pg@bowt.ie 1266 [ + + ]: 3756208 : if (unlikely(split_only_page))
1267 : : {
2099 1268 [ - + ]: 12 : Assert(!isleaf);
1269 [ - + ]: 12 : Assert(BufferIsValid(cbuf));
1270 : :
920 1271 : 12 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3527 kgrittn@postgresql.o 1272 : 12 : metapg = BufferGetPage(metabuf);
8334 tgl@sss.pgh.pa.us 1273 : 12 : metad = BTPageGetMeta(metapg);
1274 : :
1756 pg@bowt.ie 1275 [ - + ]: 12 : if (metad->btm_fastlevel >= opaque->btpo_level)
1276 : : {
1277 : : /* no update wanted */
8334 tgl@sss.pgh.pa.us 1278 :UBC 0 : _bt_relbuf(rel, metabuf);
1279 : 0 : metabuf = InvalidBuffer;
1280 : : }
1281 : : }
1282 : :
1283 : : /* Do the update. No ereport(ERROR) until changes are logged */
8334 tgl@sss.pgh.pa.us 1284 :CBC 3756208 : START_CRIT_SECTION();
1285 : :
2120 pg@bowt.ie 1286 [ + + ]: 3756208 : if (postingoff != 0)
1287 : 15961 : memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1288 : :
50 peter@eisentraut.org 1289 [ - + ]:GNC 3756208 : if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
5588 tgl@sss.pgh.pa.us 1290 [ # # ]:UBC 0 : elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1291 : : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1292 : :
7200 tgl@sss.pgh.pa.us 1293 :CBC 3756208 : MarkBufferDirty(buf);
1294 : :
8334 1295 [ + + ]: 3756208 : if (BufferIsValid(metabuf))
1296 : : {
1297 : : /* upgrade meta-page if needed */
2463 pg@bowt.ie 1298 [ - + ]: 12 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2813 teodor@sigaev.ru 1299 :UBC 0 : _bt_upgrademetapage(metapg);
2100 pg@bowt.ie 1300 :CBC 12 : metad->btm_fastroot = BufferGetBlockNumber(buf);
1756 1301 : 12 : metad->btm_fastlevel = opaque->btpo_level;
7200 tgl@sss.pgh.pa.us 1302 : 12 : MarkBufferDirty(metabuf);
1303 : : }
1304 : :
1305 : : /*
1306 : : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1307 : : * finishes a split
1308 : : */
2073 pg@bowt.ie 1309 [ + + ]: 3756208 : if (!isleaf)
1310 : : {
3527 kgrittn@postgresql.o 1311 : 10955 : Page cpage = BufferGetPage(cbuf);
1355 michael@paquier.xyz 1312 : 10955 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1313 : :
4291 heikki.linnakangas@i 1314 [ - + ]: 10955 : Assert(P_INCOMPLETE_SPLIT(cpageop));
1315 : 10955 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1316 : 10955 : MarkBufferDirty(cbuf);
1317 : : }
1318 : :
1319 : : /* XLOG stuff */
5482 rhaas@postgresql.org 1320 [ + + + + : 3756208 : if (RelationNeedsWAL(rel))
+ + + + ]
1321 : : {
1322 : : xl_btree_insert xlrec;
1323 : : xl_btree_metadata xlmeta;
1324 : : uint8 xlinfo;
1325 : : XLogRecPtr recptr;
1326 : : uint16 upostingoff;
1327 : :
2100 pg@bowt.ie 1328 : 3403196 : xlrec.offnum = newitemoff;
1329 : :
4044 heikki.linnakangas@i 1330 : 3403196 : XLogBeginInsert();
308 peter@eisentraut.org 1331 : 3403196 : XLogRegisterData(&xlrec, SizeOfBtreeInsert);
1332 : :
2099 pg@bowt.ie 1333 [ + + + + ]: 3403196 : if (isleaf && postingoff == 0)
1334 : : {
1335 : : /* Simple leaf insert */
7187 tgl@sss.pgh.pa.us 1336 : 3377179 : xlinfo = XLOG_BTREE_INSERT_LEAF;
1337 : : }
2120 pg@bowt.ie 1338 [ + + ]: 26017 : else if (postingoff != 0)
1339 : : {
1340 : : /*
1341 : : * Leaf insert with posting list split. Must include
1342 : : * postingoff field before newitem/orignewitem.
1343 : : */
2072 1344 [ - + ]: 15961 : Assert(isleaf);
2120 1345 : 15961 : xlinfo = XLOG_BTREE_INSERT_POST;
1346 : : }
1347 : : else
1348 : : {
1349 : : /* Internal page insert, which finishes a split on cbuf */
7187 tgl@sss.pgh.pa.us 1350 : 10056 : xlinfo = XLOG_BTREE_INSERT_UPPER;
2073 pg@bowt.ie 1351 : 10056 : XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1352 : :
2072 1353 [ + + ]: 10056 : if (BufferIsValid(metabuf))
1354 : : {
1355 : : /* Actually, it's an internal page insert + meta update */
1356 : 12 : xlinfo = XLOG_BTREE_INSERT_META;
1357 : :
1358 [ - + ]: 12 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1359 : 12 : xlmeta.version = metad->btm_version;
1360 : 12 : xlmeta.root = metad->btm_root;
1361 : 12 : xlmeta.level = metad->btm_level;
1362 : 12 : xlmeta.fastroot = metad->btm_fastroot;
1363 : 12 : xlmeta.fastlevel = metad->btm_fastlevel;
1756 1364 : 12 : xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2072 1365 : 12 : xlmeta.allequalimage = metad->btm_allequalimage;
1366 : :
1367 : 12 : XLogRegisterBuffer(2, metabuf,
1368 : : REGBUF_WILL_INIT | REGBUF_STANDARD);
308 peter@eisentraut.org 1369 : 12 : XLogRegisterBufData(2, &xlmeta,
1370 : : sizeof(xl_btree_metadata));
1371 : : }
1372 : : }
1373 : :
4044 heikki.linnakangas@i 1374 : 3403196 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
2120 pg@bowt.ie 1375 [ + + ]: 3403196 : if (postingoff == 0)
1376 : : {
1377 : : /* Just log itup from caller */
308 peter@eisentraut.org 1378 : 3387235 : XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1379 : : }
1380 : : else
1381 : : {
1382 : : /*
1383 : : * Insert with posting list split (XLOG_BTREE_INSERT_POST
1384 : : * record) case.
1385 : : *
1386 : : * Log postingoff. Also log origitup, not itup. REDO routine
1387 : : * must reconstruct final itup (as well as nposting) using
1388 : : * _bt_swap_posting().
1389 : : */
2056 pg@bowt.ie 1390 : 15961 : upostingoff = postingoff;
1391 : :
308 peter@eisentraut.org 1392 : 15961 : XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
1393 : 15961 : XLogRegisterBufData(0, origitup,
2120 pg@bowt.ie 1394 : 15961 : IndexTupleSize(origitup));
1395 : : }
1396 : :
4044 heikki.linnakangas@i 1397 : 3403196 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1398 : :
8334 tgl@sss.pgh.pa.us 1399 [ + + ]: 3403196 : if (BufferIsValid(metabuf))
1400 : 12 : PageSetLSN(metapg, recptr);
2073 pg@bowt.ie 1401 [ + + ]: 3403196 : if (!isleaf)
3527 kgrittn@postgresql.o 1402 : 10056 : PageSetLSN(BufferGetPage(cbuf), recptr);
1403 : :
8334 tgl@sss.pgh.pa.us 1404 : 3403196 : PageSetLSN(page, recptr);
1405 : : }
1406 : :
1407 [ - + ]: 3756208 : END_CRIT_SECTION();
1408 : :
1409 : : /* Release subsidiary buffers */
1410 [ + + ]: 3756208 : if (BufferIsValid(metabuf))
7200 1411 : 12 : _bt_relbuf(rel, metabuf);
2073 pg@bowt.ie 1412 [ + + ]: 3756208 : if (!isleaf)
4291 heikki.linnakangas@i 1413 : 10955 : _bt_relbuf(rel, cbuf);
1414 : :
1415 : : /*
1416 : : * Cache the block number if this is the rightmost leaf page. Cache
1417 : : * may be used by a future inserter within _bt_search_insert().
1418 : : */
2099 pg@bowt.ie 1419 : 3756208 : blockcache = InvalidBlockNumber;
1855 1420 [ + + + + : 3756208 : if (isrightmost && isleaf && !isroot)
+ + ]
2099 1421 : 2067683 : blockcache = BufferGetBlockNumber(buf);
1422 : :
1423 : : /* Release buffer for insertion target block */
7200 tgl@sss.pgh.pa.us 1424 : 3756208 : _bt_relbuf(rel, buf);
1425 : :
1426 : : /*
1427 : : * If we decided to cache the insertion target block before releasing
1428 : : * its buffer lock, then cache it now. Check the height of the tree
1429 : : * first, though. We don't go for the optimization with small
1430 : : * indexes. Defer final check to this point to ensure that we don't
1431 : : * call _bt_getrootheight while holding a buffer lock.
1432 : : */
2099 pg@bowt.ie 1433 [ + + + + ]: 5823891 : if (BlockNumberIsValid(blockcache) &&
920 1434 : 2067683 : _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
2099 1435 : 31197 : RelationSetTargetBlock(rel, blockcache);
1436 : : }
1437 : :
1438 : : /* be tidy */
2120 1439 [ + + ]: 3768011 : if (postingoff != 0)
1440 : : {
1441 : : /* itup is actually a modified copy of caller's original */
1442 : 15988 : pfree(nposting);
1443 : 15988 : pfree(itup);
1444 : : }
10752 scrappy@hub.org 1445 : 3768011 : }
1446 : :
1447 : : /*
1448 : : * _bt_split() -- split a page in the btree.
1449 : : *
1450 : : * On entry, buf is the page to split, and is pinned and write-locked.
1451 : : * newitemoff etc. tell us about the new item that must be inserted
1452 : : * along with the data from the original page.
1453 : : *
1454 : : * itup_key is used for suffix truncation on leaf pages (internal
1455 : : * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1456 : : * is the left-sibling of the page we're inserting the downlink for.
1457 : : * This function will clear the INCOMPLETE_SPLIT flag on it, and
1458 : : * release the buffer.
1459 : : *
1460 : : * orignewitem, nposting, and postingoff are needed when an insert of
1461 : : * orignewitem results in both a posting list split and a page split.
1462 : : * These extra posting list split details are used here in the same
1463 : : * way as they are used in the more common case where a posting list
1464 : : * split does not coincide with a page split. We need to deal with
1465 : : * posting list splits directly in order to ensure that everything
1466 : : * that follows from the insert of orignewitem is handled as a single
1467 : : * atomic operation (though caller's insert of a new pivot/downlink
1468 : : * into parent page will still be a separate operation). See
1469 : : * nbtree/README for details on the design of posting list splits.
1470 : : *
1471 : : * Returns the new right sibling of buf, pinned and write-locked.
1472 : : * The pin and lock on buf are maintained.
1473 : : */
1474 : : static Buffer
990 andres@anarazel.de 1475 : 11803 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1476 : : Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1477 : : IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1478 : : {
1479 : : Buffer rbuf;
1480 : : Page origpage;
1481 : : Page leftpage,
1482 : : rightpage;
1483 : : PGAlignedBlock leftpage_buf,
1484 : : rightpage_buf;
1485 : : BlockNumber origpagenumber,
1486 : : rightpagenumber;
1487 : : BTPageOpaque ropaque,
1488 : : lopaque,
1489 : : oopaque;
8333 tgl@sss.pgh.pa.us 1490 : 11803 : Buffer sbuf = InvalidBuffer;
1491 : 11803 : Page spage = NULL;
1492 : 11803 : BTPageOpaque sopaque = NULL;
1493 : : Size itemsz;
1494 : : ItemId itemid;
1495 : : IndexTuple firstright,
1496 : : lefthighkey;
1497 : : OffsetNumber firstrightoff;
1498 : : OffsetNumber afterleftoff,
1499 : : afterrightoff,
1500 : : minusinfoff;
1501 : : OffsetNumber origpagepostingoff;
1502 : : OffsetNumber maxoff;
1503 : : OffsetNumber i;
1504 : : bool newitemonleft,
1505 : : isleaf,
1506 : : isrightmost;
1507 : :
1508 : : /*
1509 : : * origpage is the original page to be split. leftpage is a temporary
1510 : : * buffer that receives the left-sibling data, which will be copied back
1511 : : * into origpage on success. rightpage is the new page that will receive
1512 : : * the right-sibling data.
1513 : : *
1514 : : * leftpage is allocated after choosing a split point. rightpage's new
1515 : : * buffer isn't acquired until after leftpage is initialized and has new
1516 : : * high key, the last point where splitting the page may fail (barring
1517 : : * corruption). Failing before acquiring new buffer won't have lasting
1518 : : * consequences, since origpage won't have been modified and leftpage is
1519 : : * only workspace.
1520 : : */
3527 kgrittn@postgresql.o 1521 : 11803 : origpage = BufferGetPage(buf);
1355 michael@paquier.xyz 1522 : 11803 : oopaque = BTPageGetOpaque(origpage);
2073 pg@bowt.ie 1523 : 11803 : isleaf = P_ISLEAF(oopaque);
1524 : 11803 : isrightmost = P_RIGHTMOST(oopaque);
1525 : 11803 : maxoff = PageGetMaxOffsetNumber(origpage);
5588 tgl@sss.pgh.pa.us 1526 : 11803 : origpagenumber = BufferGetBlockNumber(buf);
1527 : :
1528 : : /*
1529 : : * Choose a point to split origpage at.
1530 : : *
1531 : : * A split point can be thought of as a point _between_ two existing data
1532 : : * items on origpage (the lastleft and firstright tuples), provided you
1533 : : * pretend that the new item that didn't fit is already on origpage.
1534 : : *
1535 : : * Since origpage does not actually contain newitem, the representation of
1536 : : * split points needs to work with two boundary cases: splits where
1537 : : * newitem is lastleft, and splits where newitem is firstright.
1538 : : * newitemonleft resolves the ambiguity that would otherwise exist when
1539 : : * newitemoff == firstrightoff. In all other cases it's clear which side
1540 : : * of the split every tuple goes on from context. newitemonleft is
1541 : : * usually (but not always) redundant information.
1542 : : *
1543 : : * firstrightoff is supposed to be an origpage offset number, but it's
1544 : : * possible that its value will be maxoff+1, which is "past the end" of
1545 : : * origpage. This happens in the rare case where newitem goes after all
1546 : : * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1547 : : * origpage at the point that leaves newitem alone on new right page. Any
1548 : : * "!newitemonleft && newitemoff == firstrightoff" split point makes
1549 : : * newitem the firstright tuple, though, so this case isn't a special
1550 : : * case.
1551 : : */
2073 pg@bowt.ie 1552 : 11803 : firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1553 : : newitem, &newitemonleft);
1554 : :
1555 : : /* Use temporary buffer for leftpage */
81 michael@paquier.xyz 1556 :GNC 11803 : leftpage = leftpage_buf.data;
2409 pg@bowt.ie 1557 :CBC 11803 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
1355 michael@paquier.xyz 1558 : 11803 : lopaque = BTPageGetOpaque(leftpage);
1559 : :
1560 : : /*
1561 : : * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1562 : : * and HAS_GARBAGE flags.
1563 : : */
9204 vadim4o@yahoo.com 1564 : 11803 : lopaque->btpo_flags = oopaque->btpo_flags;
7084 tgl@sss.pgh.pa.us 1565 : 11803 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1566 : : /* set flag in leftpage indicating that rightpage has no downlink yet */
4291 heikki.linnakangas@i 1567 : 11803 : lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
10327 bruce@momjian.us 1568 : 11803 : lopaque->btpo_prev = oopaque->btpo_prev;
1569 : : /* handle btpo_next after rightpage buffer acquired */
1756 pg@bowt.ie 1570 : 11803 : lopaque->btpo_level = oopaque->btpo_level;
1571 : : /* handle btpo_cycleid after rightpage buffer acquired */
1572 : :
1573 : : /*
1574 : : * Copy the original page's LSN into leftpage, which will become the
1575 : : * updated version of the page. We need this because XLogInsert will
1576 : : * examine the LSN and possibly dump it in a page image.
1577 : : */
2409 1578 : 11803 : PageSetLSN(leftpage, PageGetLSN(origpage));
1579 : :
1580 : : /*
1581 : : * Determine page offset number of existing overlapped-with-orignewitem
1582 : : * posting list when it is necessary to perform a posting list split in
1583 : : * passing. Note that newitem was already changed by caller (newitem no
1584 : : * longer has the orignewitem TID).
1585 : : *
1586 : : * This page offset number (origpagepostingoff) will be used to pretend
1587 : : * that the posting split has already taken place, even though the
1588 : : * required modifications to origpage won't occur until we reach the
1589 : : * critical section. The lastleft and firstright tuples of our page split
1590 : : * point should, in effect, come from an imaginary version of origpage
1591 : : * that has the nposting tuple instead of the original posting list tuple.
1592 : : *
1593 : : * Note: _bt_findsplitloc() should have compensated for coinciding posting
1594 : : * list splits in just the same way, at least in theory. It doesn't
1595 : : * bother with that, though. In practice it won't affect its choice of
1596 : : * split point.
1597 : : */
2120 1598 : 11803 : origpagepostingoff = InvalidOffsetNumber;
1599 [ + + ]: 11803 : if (postingoff != 0)
1600 : : {
1601 [ - + ]: 27 : Assert(isleaf);
1602 [ - + ]: 27 : Assert(ItemPointerCompare(&orignewitem->t_tid,
1603 : : &newitem->t_tid) < 0);
1604 [ - + ]: 27 : Assert(BTreeTupleIsPosting(nposting));
1605 : 27 : origpagepostingoff = OffsetNumberPrev(newitemoff);
1606 : : }
1607 : :
1608 : : /*
1609 : : * The high key for the new left page is a possibly-truncated copy of
1610 : : * firstright on the leaf level (it's "firstright itself" on internal
1611 : : * pages; see !isleaf comments below). This may seem to be contrary to
1612 : : * Lehman & Yao's approach of using a copy of lastleft as the new high key
1613 : : * when splitting on the leaf level. It isn't, though.
1614 : : *
1615 : : * Suffix truncation will leave the left page's high key fully equal to
1616 : : * lastleft when lastleft and firstright are equal prior to heap TID (that
1617 : : * is, the tiebreaker TID value comes from lastleft). It isn't actually
1618 : : * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1619 : : * "subtree" invariant to hold. It's sufficient to make sure that the new
1620 : : * leaf high key is strictly less than firstright, and greater than or
1621 : : * equal to (not necessarily equal to) lastleft. In other words, when
1622 : : * suffix truncation isn't possible during a leaf page split, we take
1623 : : * L&Y's exact approach to generating a new high key for the left page.
1624 : : * (Actually, that is slightly inaccurate. We don't just use a copy of
1625 : : * lastleft. A tuple with all the keys from firstright but the max heap
1626 : : * TID from lastleft is used, to avoid introducing a special case.)
1627 : : */
2073 1628 [ + + + + ]: 11803 : if (!newitemonleft && newitemoff == firstrightoff)
1629 : : {
1630 : : /* incoming tuple becomes firstright */
9279 tgl@sss.pgh.pa.us 1631 : 32 : itemsz = newitemsz;
2073 pg@bowt.ie 1632 : 32 : firstright = newitem;
1633 : : }
1634 : : else
1635 : : {
1636 : : /* existing item at firstrightoff becomes firstright */
1637 : 11771 : itemid = PageGetItemId(origpage, firstrightoff);
9279 tgl@sss.pgh.pa.us 1638 : 11771 : itemsz = ItemIdGetLength(itemid);
2073 pg@bowt.ie 1639 : 11771 : firstright = (IndexTuple) PageGetItem(origpage, itemid);
1640 [ - + ]: 11771 : if (firstrightoff == origpagepostingoff)
2073 pg@bowt.ie 1641 :LBC (2) : firstright = nposting;
1642 : : }
1643 : :
2073 pg@bowt.ie 1644 [ + + ]:CBC 11803 : if (isleaf)
1645 : : {
1646 : : IndexTuple lastleft;
1647 : :
1648 : : /* Attempt suffix truncation for leaf page splits */
1649 [ + + + + ]: 11646 : if (newitemonleft && newitemoff == firstrightoff)
1650 : : {
1651 : : /* incoming tuple becomes lastleft */
2463 1652 : 245 : lastleft = newitem;
1653 : : }
1654 : : else
1655 : : {
1656 : : OffsetNumber lastleftoff;
1657 : :
1658 : : /* existing item before firstrightoff becomes lastleft */
2073 1659 : 11401 : lastleftoff = OffsetNumberPrev(firstrightoff);
2463 1660 [ + + - + ]: 11401 : Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1661 : 11401 : itemid = PageGetItemId(origpage, lastleftoff);
1662 : 11401 : lastleft = (IndexTuple) PageGetItem(origpage, itemid);
2120 1663 [ + + ]: 11401 : if (lastleftoff == origpagepostingoff)
1664 : 3 : lastleft = nposting;
1665 : : }
1666 : :
2073 1667 : 11646 : lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1668 : 11646 : itemsz = IndexTupleSize(lefthighkey);
1669 : : }
1670 : : else
1671 : : {
1672 : : /*
1673 : : * Don't perform suffix truncation on a copy of firstright to make
1674 : : * left page high key for internal page splits. Must use firstright
1675 : : * as new high key directly.
1676 : : *
1677 : : * Each distinct separator key value originates as a leaf level high
1678 : : * key; all other separator keys/pivot tuples are copied from one
1679 : : * level down. A separator key in a grandparent page must be
1680 : : * identical to high key in rightmost parent page of the subtree to
1681 : : * its left, which must itself be identical to high key in rightmost
1682 : : * child page of that same subtree (this even applies to separator
1683 : : * from grandparent's high key). There must always be an unbroken
1684 : : * "seam" of identical separator keys that guide index scans at every
1685 : : * level, starting from the grandparent. That's why suffix truncation
1686 : : * is unsafe here.
1687 : : *
1688 : : * Internal page splits will truncate firstright into a "negative
1689 : : * infinity" data item when it gets inserted on the new right page
1690 : : * below, though. This happens during the call to _bt_pgaddtup() for
1691 : : * the new first data item for right page. Do not confuse this
1692 : : * mechanism with suffix truncation. It is just a convenient way of
1693 : : * implementing page splits that split the internal page "inside"
1694 : : * firstright. The lefthighkey separator key cannot appear a second
1695 : : * time in the right page (only firstright's downlink goes in right
1696 : : * page).
1697 : : */
1698 : 157 : lefthighkey = firstright;
1699 : : }
1700 : :
1701 : : /*
1702 : : * Add new high key to leftpage
1703 : : */
1704 : 11803 : afterleftoff = P_HIKEY;
1705 : :
1706 [ + - - + ]: 11803 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1707 [ + - - + ]: 11803 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1708 : : IndexRelationGetNumberOfKeyAttributes(rel));
1709 [ - + ]: 11803 : Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
50 peter@eisentraut.org 1710 [ - + ]:GNC 11803 : if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
2073 pg@bowt.ie 1711 [ # # ]:UBC 0 : elog(ERROR, "failed to add high key to the left sibling"
1712 : : " while splitting block %u of index \"%s\"",
1713 : : origpagenumber, RelationGetRelationName(rel));
2073 pg@bowt.ie 1714 :CBC 11803 : afterleftoff = OffsetNumberNext(afterleftoff);
1715 : :
1716 : : /*
1717 : : * Acquire a new right page to split into, now that left page has a new
1718 : : * high key.
1719 : : *
1720 : : * To not confuse future VACUUM operations, we zero the right page and
1721 : : * work on an in-memory copy of it before writing WAL, then copy its
1722 : : * contents back to the actual page once we start the critical section
1723 : : * work. This simplifies the split work, so as there is no need to zero
1724 : : * the right page before throwing an error.
1725 : : */
920 1726 : 11803 : rbuf = _bt_allocbuf(rel, heaprel);
81 michael@paquier.xyz 1727 :GNC 11803 : rightpage = rightpage_buf.data;
1728 : :
1729 : : /*
1730 : : * Copy the contents of the right page into its temporary location, and
1731 : : * zero the original space.
1732 : : */
1733 : 11803 : memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1734 : 11803 : memset(BufferGetPage(rbuf), 0, BLCKSZ);
2409 pg@bowt.ie 1735 :CBC 11803 : rightpagenumber = BufferGetBlockNumber(rbuf);
1736 : : /* rightpage was initialized by _bt_allocbuf */
1355 michael@paquier.xyz 1737 : 11803 : ropaque = BTPageGetOpaque(rightpage);
1738 : :
1739 : : /*
1740 : : * Finish off remaining leftpage special area fields. They cannot be set
1741 : : * before both origpage (leftpage) and rightpage buffers are acquired and
1742 : : * locked.
1743 : : *
1744 : : * btpo_cycleid is only used with leaf pages, though we set it here in all
1745 : : * cases just to be consistent.
1746 : : */
2409 pg@bowt.ie 1747 : 11803 : lopaque->btpo_next = rightpagenumber;
1748 : 11803 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1749 : :
1750 : : /*
1751 : : * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1752 : : * and HAS_GARBAGE flags.
1753 : : */
1754 : 11803 : ropaque->btpo_flags = oopaque->btpo_flags;
1755 : 11803 : ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1756 : 11803 : ropaque->btpo_prev = origpagenumber;
1757 : 11803 : ropaque->btpo_next = oopaque->btpo_next;
1756 1758 : 11803 : ropaque->btpo_level = oopaque->btpo_level;
2409 1759 : 11803 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1760 : :
1761 : : /*
1762 : : * Add new high key to rightpage where necessary.
1763 : : *
1764 : : * If the page we're splitting is not the rightmost page at its level in
1765 : : * the tree, then the first entry on the page is the high key from
1766 : : * origpage.
1767 : : */
2073 1768 : 11803 : afterrightoff = P_HIKEY;
1769 : :
1770 [ + + ]: 11803 : if (!isrightmost)
1771 : : {
1772 : : IndexTuple righthighkey;
1773 : :
2409 1774 : 5292 : itemid = PageGetItemId(origpage, P_HIKEY);
1775 : 5292 : itemsz = ItemIdGetLength(itemid);
2073 1776 : 5292 : righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1777 [ + - - + ]: 5292 : Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1778 [ + - - + ]: 5292 : Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1779 : : IndexRelationGetNumberOfKeyAttributes(rel));
50 peter@eisentraut.org 1780 [ - + ]:GNC 5292 : if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
1781 : : {
2073 pg@bowt.ie 1782 [ # # ]:UBC 0 : elog(ERROR, "failed to add high key to the right sibling"
1783 : : " while splitting block %u of index \"%s\"",
1784 : : origpagenumber, RelationGetRelationName(rel));
1785 : : }
2073 pg@bowt.ie 1786 :CBC 5292 : afterrightoff = OffsetNumberNext(afterrightoff);
1787 : : }
1788 : :
1789 : : /*
1790 : : * Internal page splits truncate first data item on right page -- it
1791 : : * becomes "minus infinity" item for the page. Set this up here.
1792 : : */
1793 : 11803 : minusinfoff = InvalidOffsetNumber;
1794 [ + + ]: 11803 : if (!isleaf)
1795 : 157 : minusinfoff = afterrightoff;
1796 : :
1797 : : /*
1798 : : * Now transfer all the data items (non-pivot tuples in isleaf case, or
1799 : : * additional pivot tuples in !isleaf case) to the appropriate page.
1800 : : *
1801 : : * Note: we *must* insert at least the right page's items in item-number
1802 : : * order, for the benefit of _bt_restore_page().
1803 : : */
9279 tgl@sss.pgh.pa.us 1804 [ + + + + ]: 3577256 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1805 : : {
1806 : : IndexTuple dataitem;
1807 : :
10327 bruce@momjian.us 1808 : 3565453 : itemid = PageGetItemId(origpage, i);
1809 : 3565453 : itemsz = ItemIdGetLength(itemid);
2073 pg@bowt.ie 1810 : 3565453 : dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1811 : :
1812 : : /* replace original item with nposting due to posting split? */
2120 1813 [ + + ]: 3565453 : if (i == origpagepostingoff)
1814 : : {
2073 1815 [ - + ]: 27 : Assert(BTreeTupleIsPosting(dataitem));
2120 1816 [ - + ]: 27 : Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
2073 1817 : 27 : dataitem = nposting;
1818 : : }
1819 : :
1820 : : /* does new item belong before this one? */
2120 1821 [ + + ]: 3565426 : else if (i == newitemoff)
1822 : : {
9279 tgl@sss.pgh.pa.us 1823 [ + + ]: 6844 : if (newitemonleft)
1824 : : {
2073 pg@bowt.ie 1825 [ - + ]: 1887 : Assert(newitemoff <= firstrightoff);
1826 [ - + ]: 1887 : if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1827 : : false))
1828 : : {
5588 tgl@sss.pgh.pa.us 1829 [ # # ]:UBC 0 : elog(ERROR, "failed to add new item to the left sibling"
1830 : : " while splitting block %u of index \"%s\"",
1831 : : origpagenumber, RelationGetRelationName(rel));
1832 : : }
2073 pg@bowt.ie 1833 :CBC 1887 : afterleftoff = OffsetNumberNext(afterleftoff);
1834 : : }
1835 : : else
1836 : : {
1837 [ - + ]: 4957 : Assert(newitemoff >= firstrightoff);
1838 [ - + ]: 4957 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1839 : : afterrightoff == minusinfoff))
1840 : : {
5588 tgl@sss.pgh.pa.us 1841 [ # # ]:UBC 0 : elog(ERROR, "failed to add new item to the right sibling"
1842 : : " while splitting block %u of index \"%s\"",
1843 : : origpagenumber, RelationGetRelationName(rel));
1844 : : }
2073 pg@bowt.ie 1845 :CBC 4957 : afterrightoff = OffsetNumberNext(afterrightoff);
1846 : : }
1847 : : }
1848 : :
1849 : : /* decide which page to put it on */
1850 [ + + ]: 3565453 : if (i < firstrightoff)
1851 : : {
1852 [ - + ]: 2694850 : if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1853 : : {
5588 tgl@sss.pgh.pa.us 1854 [ # # ]:UBC 0 : elog(ERROR, "failed to add old item to the left sibling"
1855 : : " while splitting block %u of index \"%s\"",
1856 : : origpagenumber, RelationGetRelationName(rel));
1857 : : }
2073 pg@bowt.ie 1858 :CBC 2694850 : afterleftoff = OffsetNumberNext(afterleftoff);
1859 : : }
1860 : : else
1861 : : {
1862 [ - + ]: 870603 : if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1863 : : afterrightoff == minusinfoff))
1864 : : {
5588 tgl@sss.pgh.pa.us 1865 [ # # ]:UBC 0 : elog(ERROR, "failed to add old item to the right sibling"
1866 : : " while splitting block %u of index \"%s\"",
1867 : : origpagenumber, RelationGetRelationName(rel));
1868 : : }
2073 pg@bowt.ie 1869 :CBC 870603 : afterrightoff = OffsetNumberNext(afterrightoff);
1870 : : }
1871 : : }
1872 : :
1873 : : /* Handle case where newitem goes at the end of rightpage */
9279 tgl@sss.pgh.pa.us 1874 [ + + ]: 11803 : if (i <= newitemoff)
1875 : : {
1876 : : /*
1877 : : * Can't have newitemonleft here; that would imply we were told to put
1878 : : * *everything* on the left page, which cannot fit (if it could, we'd
1879 : : * not be splitting the page).
1880 : : */
2073 pg@bowt.ie 1881 [ + - - + ]: 4959 : Assert(!newitemonleft && newitemoff == maxoff + 1);
1882 [ - + ]: 4959 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1883 : : afterrightoff == minusinfoff))
1884 : : {
5588 tgl@sss.pgh.pa.us 1885 [ # # ]:UBC 0 : elog(ERROR, "failed to add new item to the right sibling"
1886 : : " while splitting block %u of index \"%s\"",
1887 : : origpagenumber, RelationGetRelationName(rel));
1888 : : }
2073 pg@bowt.ie 1889 :CBC 4959 : afterrightoff = OffsetNumberNext(afterrightoff);
1890 : : }
1891 : :
1892 : : /*
1893 : : * We have to grab the original right sibling (if any) and update its prev
1894 : : * link. We are guaranteed that this is deadlock-free, since we couple
1895 : : * the locks in the standard order: left to right.
1896 : : */
1897 [ + + ]: 11803 : if (!isrightmost)
1898 : : {
920 1899 : 5292 : sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
3527 kgrittn@postgresql.o 1900 : 5292 : spage = BufferGetPage(sbuf);
1355 michael@paquier.xyz 1901 : 5292 : sopaque = BTPageGetOpaque(spage);
5588 tgl@sss.pgh.pa.us 1902 [ - + ]: 5292 : if (sopaque->btpo_prev != origpagenumber)
1903 : : {
2329 peter@eisentraut.org 1904 [ # # ]:UBC 0 : ereport(ERROR,
1905 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1906 : : errmsg_internal("right sibling's left-link doesn't match: "
1907 : : "block %u links to %u instead of expected %u in index \"%s\"",
1908 : : oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1909 : : RelationGetRelationName(rel))));
1910 : : }
1911 : :
1912 : : /*
1913 : : * Check to see if we can set the SPLIT_END flag in the right-hand
1914 : : * split page; this can save some I/O for vacuum since it need not
1915 : : * proceed to the right sibling. We can set the flag if the right
1916 : : * sibling has a different cycleid: that means it could not be part of
1917 : : * a group of pages that were all split off from the same ancestor
1918 : : * page. If you're confused, imagine that page A splits to A B and
1919 : : * then again, yielding A C B, while vacuum is in progress. Tuples
1920 : : * originally in A could now be in either B or C, hence vacuum must
1921 : : * examine both pages. But if D, our right sibling, has a different
1922 : : * cycleid then it could not contain any tuples that were in A when
1923 : : * the vacuum started.
1924 : : */
7162 tgl@sss.pgh.pa.us 1925 [ - + ]:CBC 5292 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
7162 tgl@sss.pgh.pa.us 1926 :LBC (1) : ropaque->btpo_flags |= BTP_SPLIT_END;
1927 : : }
1928 : :
1929 : : /*
1930 : : * Right sibling is locked, new siblings are prepared, but original page
1931 : : * is not updated yet.
1932 : : *
1933 : : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1934 : : * not starting the critical section till here because we haven't been
1935 : : * scribbling on the original page yet; see comments above.
1936 : : */
9104 tgl@sss.pgh.pa.us 1937 :CBC 11803 : START_CRIT_SECTION();
1938 : :
1939 : : /*
1940 : : * By here, the original data page has been split into two new halves, and
1941 : : * these are correct. The algorithm requires that the left page never
1942 : : * move during a split, so we copy the new left page back on top of the
1943 : : * original. We need to do this before writing the WAL record, so that
1944 : : * XLogInsert can WAL log an image of the page if necessary.
1945 : : */
81 michael@paquier.xyz 1946 :GNC 11803 : memcpy(origpage, leftpage, BLCKSZ);
1947 : : /* leftpage, lopaque must not be used below here */
1948 : :
1949 : : /*
1950 : : * Move the contents of the right page from its temporary location to the
1951 : : * destination buffer, before writing the WAL record. Unlike the left
1952 : : * page, the right page and its opaque area are still needed to complete
1953 : : * the update of the page, so reinitialize them.
1954 : : */
1955 : 11803 : rightpage = BufferGetPage(rbuf);
1956 : 11803 : memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1957 : 11803 : ropaque = BTPageGetOpaque(rightpage);
1958 : :
6824 tgl@sss.pgh.pa.us 1959 :CBC 11803 : MarkBufferDirty(buf);
1960 : 11803 : MarkBufferDirty(rbuf);
1961 : :
2073 pg@bowt.ie 1962 [ + + ]: 11803 : if (!isrightmost)
1963 : : {
5588 tgl@sss.pgh.pa.us 1964 : 5292 : sopaque->btpo_prev = rightpagenumber;
6824 1965 : 5292 : MarkBufferDirty(sbuf);
1966 : : }
1967 : :
1968 : : /*
1969 : : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1970 : : * a split
1971 : : */
4291 heikki.linnakangas@i 1972 [ + + ]: 11803 : if (!isleaf)
1973 : : {
3527 kgrittn@postgresql.o 1974 : 157 : Page cpage = BufferGetPage(cbuf);
1355 michael@paquier.xyz 1975 : 157 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1976 : :
4291 heikki.linnakangas@i 1977 : 157 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1978 : 157 : MarkBufferDirty(cbuf);
1979 : : }
1980 : :
1981 : : /* XLOG stuff */
5482 rhaas@postgresql.org 1982 [ + + + + : 11803 : if (RelationNeedsWAL(rel))
+ + + - ]
1983 : : {
1984 : : xl_btree_split xlrec;
1985 : : uint8 xlinfo;
1986 : : XLogRecPtr recptr;
1987 : :
1756 pg@bowt.ie 1988 : 10885 : xlrec.level = ropaque->btpo_level;
1989 : : /* See comments below on newitem, orignewitem, and posting lists */
2073 1990 : 10885 : xlrec.firstrightoff = firstrightoff;
4044 heikki.linnakangas@i 1991 : 10885 : xlrec.newitemoff = newitemoff;
2120 pg@bowt.ie 1992 : 10885 : xlrec.postingoff = 0;
2073 1993 [ + + + + ]: 10885 : if (postingoff != 0 && origpagepostingoff < firstrightoff)
2120 1994 : 16 : xlrec.postingoff = postingoff;
1995 : :
4044 heikki.linnakangas@i 1996 : 10885 : XLogBeginInsert();
308 peter@eisentraut.org 1997 : 10885 : XLogRegisterData(&xlrec, SizeOfBtreeSplit);
1998 : :
4044 heikki.linnakangas@i 1999 : 10885 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
2000 : 10885 : XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
2001 : : /* Log original right sibling, since we've changed its prev-pointer */
2073 pg@bowt.ie 2002 [ + + ]: 10885 : if (!isrightmost)
4044 heikki.linnakangas@i 2003 : 5286 : XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
2073 pg@bowt.ie 2004 [ + + ]: 10885 : if (!isleaf)
4044 heikki.linnakangas@i 2005 : 157 : XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
2006 : :
2007 : : /*
2008 : : * Log the new item, if it was inserted on the left page. (If it was
2009 : : * put on the right page, we don't need to explicitly WAL log it
2010 : : * because it's included with all the other items on the right page.)
2011 : : * Show the new item as belonging to the left page buffer, so that it
2012 : : * is not stored if XLogInsert decides it needs a full-page image of
2013 : : * the left page. We always store newitemoff in the record, though.
2014 : : *
2015 : : * The details are sometimes slightly different for page splits that
2016 : : * coincide with a posting list split. If both the replacement
2017 : : * posting list and newitem go on the right page, then we don't need
2018 : : * to log anything extra, just like the simple !newitemonleft
2019 : : * no-posting-split case (postingoff is set to zero in the WAL record,
2020 : : * so recovery doesn't need to process a posting list split at all).
2021 : : * Otherwise, we set postingoff and log orignewitem instead of
2022 : : * newitem, despite having actually inserted newitem. REDO routine
2023 : : * must reconstruct nposting and newitem using _bt_swap_posting().
2024 : : *
2025 : : * Note: It's possible that our page split point is the point that
2026 : : * makes the posting list lastleft and newitem firstright. This is
2027 : : * the only case where we log orignewitem/newitem despite newitem
2028 : : * going on the right page. If XLogInsert decides that it can omit
2029 : : * orignewitem due to logging a full-page image of the left page,
2030 : : * everything still works out, since recovery only needs to log
2031 : : * orignewitem for items on the left page (just like the regular
2032 : : * newitem-logged case).
2033 : : */
2120 pg@bowt.ie 2034 [ + + + + ]: 10885 : if (newitemonleft && xlrec.postingoff == 0)
308 peter@eisentraut.org 2035 : 1871 : XLogRegisterBufData(0, newitem, newitemsz);
2120 pg@bowt.ie 2036 [ + + ]: 9014 : else if (xlrec.postingoff != 0)
2037 : : {
2073 2038 [ - + ]: 16 : Assert(isleaf);
2039 [ + + - + ]: 16 : Assert(newitemonleft || firstrightoff == newitemoff);
2040 [ - + ]: 16 : Assert(newitemsz == IndexTupleSize(orignewitem));
308 peter@eisentraut.org 2041 : 16 : XLogRegisterBufData(0, orignewitem, newitemsz);
2042 : : }
2043 : :
2044 : : /* Log the left page's new high key */
2073 pg@bowt.ie 2045 [ + + ]: 10885 : if (!isleaf)
2046 : : {
2047 : : /* lefthighkey isn't local copy, get current pointer */
2048 : 157 : itemid = PageGetItemId(origpage, P_HIKEY);
2049 : 157 : lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2050 : : }
308 peter@eisentraut.org 2051 : 10885 : XLogRegisterBufData(0, lefthighkey,
2073 pg@bowt.ie 2052 : 10885 : MAXALIGN(IndexTupleSize(lefthighkey)));
2053 : :
2054 : : /*
2055 : : * Log the contents of the right page in the format understood by
2056 : : * _bt_restore_page(). The whole right page will be recreated.
2057 : : *
2058 : : * Direct access to page is not good but faster - we should implement
2059 : : * some new func in page API. Note we only store the tuples
2060 : : * themselves, knowing that they were inserted in item-number order
2061 : : * and so the line pointers can be reconstructed. See comments for
2062 : : * _bt_restore_page().
2063 : : */
4044 heikki.linnakangas@i 2064 : 10885 : XLogRegisterBufData(1,
3100 tgl@sss.pgh.pa.us 2065 : 10885 : (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
4044 heikki.linnakangas@i 2066 : 10885 : ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2067 : :
2463 pg@bowt.ie 2068 [ + + ]: 10885 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
4044 heikki.linnakangas@i 2069 : 10885 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2070 : :
6886 alvherre@alvh.no-ip. 2071 : 10885 : PageSetLSN(origpage, recptr);
9204 vadim4o@yahoo.com 2072 : 10885 : PageSetLSN(rightpage, recptr);
2073 pg@bowt.ie 2073 [ + + ]: 10885 : if (!isrightmost)
9204 vadim4o@yahoo.com 2074 : 5286 : PageSetLSN(spage, recptr);
4277 heikki.linnakangas@i 2075 [ + + ]: 10885 : if (!isleaf)
3527 kgrittn@postgresql.o 2076 : 157 : PageSetLSN(BufferGetPage(cbuf), recptr);
2077 : : }
2078 : :
9093 tgl@sss.pgh.pa.us 2079 [ - + ]: 11803 : END_CRIT_SECTION();
2080 : :
2081 : : /* release the old right sibling */
2073 pg@bowt.ie 2082 [ + + ]: 11803 : if (!isrightmost)
7200 tgl@sss.pgh.pa.us 2083 : 5292 : _bt_relbuf(rel, sbuf);
2084 : :
2085 : : /* release the child */
4291 heikki.linnakangas@i 2086 [ + + ]: 11803 : if (!isleaf)
2087 : 157 : _bt_relbuf(rel, cbuf);
2088 : :
2089 : : /* be tidy */
2073 pg@bowt.ie 2090 [ + + ]: 11803 : if (isleaf)
2091 : 11646 : pfree(lefthighkey);
2092 : :
2093 : : /* split's done */
9968 bruce@momjian.us 2094 : 11803 : return rbuf;
2095 : : }
2096 : :
2097 : : /*
2098 : : * _bt_insert_parent() -- Insert downlink into parent, completing split.
2099 : : *
2100 : : * On entry, buf and rbuf are the left and right split pages, which we
2101 : : * still hold write locks on. Both locks will be released here. We
2102 : : * release the rbuf lock once we have a write lock on the page that we
2103 : : * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2104 : : * The lock on buf is released at the same point as the lock on the parent
2105 : : * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2106 : : * atomic operation that completes the split by inserting a new downlink.
2107 : : *
2108 : : * stack - stack showing how we got here. Will be NULL when splitting true
2109 : : * root, or during concurrent root split, where we can be inefficient
2110 : : * isroot - we split the true root
2111 : : * isonly - we split a page alone on its level (might have been fast root)
2112 : : */
2113 : : static void
8334 tgl@sss.pgh.pa.us 2114 : 11803 : _bt_insert_parent(Relation rel,
2115 : : Relation heaprel,
2116 : : Buffer buf,
2117 : : Buffer rbuf,
2118 : : BTStack stack,
2119 : : bool isroot,
2120 : : bool isonly)
2121 : : {
920 pg@bowt.ie 2122 [ - + ]: 11803 : Assert(heaprel != NULL);
2123 : :
2124 : : /*
2125 : : * Here we have to do something Lehman and Yao don't talk about: deal with
2126 : : * a root split and construction of a new root. If our stack is empty
2127 : : * then we have just split a node on what had been the root level when we
2128 : : * descended the tree. If it was still the root then we perform a
2129 : : * new-root construction. If it *wasn't* the root anymore, search to find
2130 : : * the next higher level that someone constructed meanwhile, and find the
2131 : : * right place to insert as for the normal case.
2132 : : *
2133 : : * If we have to search for the parent level, we do so by re-descending
2134 : : * from the root. This is not super-efficient, but it's rare enough not
2135 : : * to matter.
2136 : : */
1855 2137 [ + + ]: 11803 : if (isroot)
2138 : : {
2139 : : Buffer rootbuf;
2140 : :
8014 neilc@samurai.com 2141 [ - + ]: 691 : Assert(stack == NULL);
1855 pg@bowt.ie 2142 [ - + ]: 691 : Assert(isonly);
2143 : : /* create a new root node one level up and update the metapage */
920 2144 : 691 : rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2145 : : /* release the split buffers */
7200 tgl@sss.pgh.pa.us 2146 : 691 : _bt_relbuf(rel, rootbuf);
2147 : 691 : _bt_relbuf(rel, rbuf);
2148 : 691 : _bt_relbuf(rel, buf);
2149 : : }
2150 : : else
2151 : : {
8334 2152 : 11112 : BlockNumber bknum = BufferGetBlockNumber(buf);
2153 : 11112 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
3527 kgrittn@postgresql.o 2154 : 11112 : Page page = BufferGetPage(buf);
2155 : : IndexTuple new_item;
2156 : : BTStackData fakestack;
2157 : : IndexTuple ritem;
2158 : : Buffer pbuf;
2159 : :
8014 neilc@samurai.com 2160 [ + + ]: 11112 : if (stack == NULL)
2161 : : {
2162 : : BTPageOpaque opaque;
2163 : :
4114 heikki.linnakangas@i 2164 [ - + ]: 12 : elog(DEBUG2, "concurrent ROOT page split");
1355 michael@paquier.xyz 2165 : 12 : opaque = BTPageGetOpaque(page);
2166 : :
2167 : : /*
2168 : : * We should never reach here when a leaf page split takes place
2169 : : * despite the insert of newitem being able to apply the fastpath
2170 : : * optimization. Make sure of that with an assertion.
2171 : : *
2172 : : * This is more of a performance issue than a correctness issue.
2173 : : * The fastpath won't have a descent stack. Using a phony stack
2174 : : * here works, but never rely on that. The fastpath should be
2175 : : * rejected within _bt_search_insert() when the rightmost leaf
2176 : : * page will split, since it's faster to go through _bt_search()
2177 : : * and get a stack in the usual way.
2178 : : */
1855 pg@bowt.ie 2179 [ + - + - : 12 : Assert(!(P_ISLEAF(opaque) &&
- + ]
2180 : : BlockNumberIsValid(RelationGetTargetBlock(rel))));
2181 : :
2182 : : /* Find the leftmost page at the next level up */
830 tmunro@postgresql.or 2183 : 12 : pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2184 : : /* Set up a phony stack entry pointing there */
8334 tgl@sss.pgh.pa.us 2185 : 12 : stack = &fakestack;
2186 : 12 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
2187 : 12 : stack->bts_offset = InvalidOffsetNumber;
2188 : 12 : stack->bts_parent = NULL;
2189 : 12 : _bt_relbuf(rel, pbuf);
2190 : : }
2191 : :
2192 : : /* get high key from left, a strict lower bound for new right page */
7265 2193 : 11112 : ritem = (IndexTuple) PageGetItem(page,
2194 : 11112 : PageGetItemId(page, P_HIKEY));
2195 : :
2196 : : /* form an index tuple that points at the new right page */
2197 : 11112 : new_item = CopyIndexTuple(ritem);
2192 pg@bowt.ie 2198 : 11112 : BTreeTupleSetDownLink(new_item, rbknum);
2199 : :
2200 : : /*
2201 : : * Re-find and write lock the parent of buf.
2202 : : *
2203 : : * It's possible that the location of buf's downlink has changed since
2204 : : * our initial _bt_search() descent. _bt_getstackbuf() will detect
2205 : : * and recover from this, updating the stack, which ensures that the
2206 : : * new downlink will be inserted at the correct offset. Even buf's
2207 : : * parent may have changed.
2208 : : */
990 andres@anarazel.de 2209 : 11112 : pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2210 : :
2211 : : /*
2212 : : * Unlock the right child. The left child will be unlocked in
2213 : : * _bt_insertonpg().
2214 : : *
2215 : : * Unlocking the right child must be delayed until here to ensure that
2216 : : * no concurrent VACUUM operation can become confused. Page deletion
2217 : : * cannot be allowed to fail to re-find a downlink for the rbuf page.
2218 : : * (Actually, this is just a vestige of how things used to work. The
2219 : : * page deletion code is expected to check for the INCOMPLETE_SPLIT
2220 : : * flag on the left child. It won't attempt deletion of the right
2221 : : * child until the split is complete. Despite all this, we opt to
2222 : : * conservatively delay unlocking the right child until here.)
2223 : : */
7200 tgl@sss.pgh.pa.us 2224 : 11112 : _bt_relbuf(rel, rbuf);
2225 : :
8334 2226 [ - + ]: 11112 : if (pbuf == InvalidBuffer)
2329 peter@eisentraut.org 2227 [ # # ]:UBC 0 : ereport(ERROR,
2228 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2229 : : errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2230 : : RelationGetRelationName(rel), bknum, rbknum)));
2231 : :
2232 : : /* Recursively insert into the parent */
990 andres@anarazel.de 2233 :CBC 22224 : _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2101 pg@bowt.ie 2234 : 11112 : new_item, MAXALIGN(IndexTupleSize(new_item)),
1855 2235 : 11112 : stack->bts_offset + 1, 0, isonly);
2236 : :
2237 : : /* be tidy */
8334 tgl@sss.pgh.pa.us 2238 : 11112 : pfree(new_item);
2239 : : }
2240 : 11803 : }
2241 : :
2242 : : /*
2243 : : * _bt_finish_split() -- Finish an incomplete split
2244 : : *
2245 : : * A crash or other failure can leave a split incomplete. The insertion
2246 : : * routines won't allow to insert on a page that is incompletely split.
2247 : : * Before inserting on such a page, call _bt_finish_split().
2248 : : *
2249 : : * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2250 : : * and unpinned.
2251 : : *
2252 : : * Caller must provide a valid heaprel, since finishing a page split requires
2253 : : * allocating a new page if and when the parent page splits in turn.
2254 : : */
2255 : : void
990 andres@anarazel.de 2256 :UBC 0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2257 : : {
3527 kgrittn@postgresql.o 2258 : 0 : Page lpage = BufferGetPage(lbuf);
1355 michael@paquier.xyz 2259 : 0 : BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2260 : : Buffer rbuf;
2261 : : Page rpage;
2262 : : BTPageOpaque rpageop;
2263 : : bool wasroot;
2264 : : bool wasonly;
2265 : :
4291 heikki.linnakangas@i 2266 [ # # ]: 0 : Assert(P_INCOMPLETE_SPLIT(lpageop));
920 pg@bowt.ie 2267 [ # # ]: 0 : Assert(heaprel != NULL);
2268 : :
2269 : : /* Lock right sibling, the one missing the downlink */
2270 : 0 : rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
3527 kgrittn@postgresql.o 2271 : 0 : rpage = BufferGetPage(rbuf);
1355 michael@paquier.xyz 2272 : 0 : rpageop = BTPageGetOpaque(rpage);
2273 : :
2274 : : /* Could this be a root split? */
4291 heikki.linnakangas@i 2275 [ # # ]: 0 : if (!stack)
2276 : : {
2277 : : Buffer metabuf;
2278 : : Page metapg;
2279 : : BTMetaPageData *metad;
2280 : :
2281 : : /* acquire lock on the metapage */
920 pg@bowt.ie 2282 : 0 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3527 kgrittn@postgresql.o 2283 : 0 : metapg = BufferGetPage(metabuf);
4291 heikki.linnakangas@i 2284 : 0 : metad = BTPageGetMeta(metapg);
2285 : :
1855 pg@bowt.ie 2286 : 0 : wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2287 : :
4291 heikki.linnakangas@i 2288 : 0 : _bt_relbuf(rel, metabuf);
2289 : : }
2290 : : else
1855 pg@bowt.ie 2291 : 0 : wasroot = false;
2292 : :
2293 : : /* Was this the only page on the level before split? */
2294 [ # # # # ]: 0 : wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2295 : :
2296 : : INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
4291 heikki.linnakangas@i 2297 [ # # ]: 0 : elog(DEBUG1, "finishing incomplete split of %u/%u",
2298 : : BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2299 : :
990 andres@anarazel.de 2300 : 0 : _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
4291 heikki.linnakangas@i 2301 : 0 : }
2302 : :
2303 : : /*
2304 : : * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2305 : : * tuple whose downlink points to child page.
2306 : : *
2307 : : * Caller passes child's block number, which is used to identify
2308 : : * associated pivot tuple in parent page using a linear search that
2309 : : * matches on pivot's downlink/block number. The expected location of
2310 : : * the pivot tuple is taken from the stack one level above the child
2311 : : * page. This is used as a starting point. Insertions into the
2312 : : * parent level could cause the pivot tuple to move right; deletions
2313 : : * could cause it to move left, but not left of the page we previously
2314 : : * found it on.
2315 : : *
2316 : : * Caller can use its stack to relocate the pivot tuple/downlink for
2317 : : * any same-level page to the right of the page found by its initial
2318 : : * descent. This is necessary because of the possibility that caller
2319 : : * moved right to recover from a concurrent page split. It's also
2320 : : * convenient for certain callers to be able to step right when there
2321 : : * wasn't a concurrent page split, while still using their original
2322 : : * stack. For example, the checkingunique _bt_doinsert() case may
2323 : : * have to step right when there are many physical duplicates, and its
2324 : : * scantid forces an insertion to the right of the "first page the
2325 : : * value could be on". (This is also relied on by all of our callers
2326 : : * when dealing with !heapkeyspace indexes.)
2327 : : *
2328 : : * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2329 : : * tuple not found (should not happen). Adjusts bts_blkno &
2330 : : * bts_offset if changed. Page split caller should insert its new
2331 : : * pivot tuple for its new right sibling page on parent page, at the
2332 : : * offset number bts_offset + 1.
2333 : : */
2334 : : Buffer
990 andres@anarazel.de 2335 :CBC 14228 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2336 : : {
2337 : : BlockNumber blkno;
2338 : : OffsetNumber start;
2339 : :
9279 tgl@sss.pgh.pa.us 2340 : 14228 : blkno = stack->bts_blkno;
2341 : 14228 : start = stack->bts_offset;
2342 : :
2343 : : for (;;)
2344 : 7 : {
2345 : : Buffer buf;
2346 : : Page page;
2347 : : BTPageOpaque opaque;
2348 : :
920 pg@bowt.ie 2349 : 14235 : buf = _bt_getbuf(rel, blkno, BT_WRITE);
3527 kgrittn@postgresql.o 2350 : 14235 : page = BufferGetPage(buf);
1355 michael@paquier.xyz 2351 : 14235 : opaque = BTPageGetOpaque(page);
2352 : :
920 pg@bowt.ie 2353 [ - + ]: 14235 : Assert(heaprel != NULL);
2486 2354 [ - + ]: 14235 : if (P_INCOMPLETE_SPLIT(opaque))
2355 : : {
990 andres@anarazel.de 2356 :UBC 0 : _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
4291 heikki.linnakangas@i 2357 : 0 : continue;
2358 : : }
2359 : :
8333 tgl@sss.pgh.pa.us 2360 [ + + ]:CBC 14235 : if (!P_IGNORE(opaque))
2361 : : {
2362 : : OffsetNumber offnum,
2363 : : minoff,
2364 : : maxoff;
2365 : : ItemId itemid;
2366 : : IndexTuple item;
2367 : :
2368 [ + + ]: 14228 : minoff = P_FIRSTDATAKEY(opaque);
2369 : 14228 : maxoff = PageGetMaxOffsetNumber(page);
2370 : :
2371 : : /*
2372 : : * start = InvalidOffsetNumber means "search the whole page". We
2373 : : * need this test anyway due to possibility that page has a high
2374 : : * key now when it didn't before.
2375 : : */
2376 [ + + ]: 14228 : if (start < minoff)
2377 : 19 : start = minoff;
2378 : :
2379 : : /*
2380 : : * Need this check too, to guard against possibility that page
2381 : : * split since we visited it originally.
2382 : : */
7791 2383 [ - + ]: 14228 : if (start > maxoff)
7791 tgl@sss.pgh.pa.us 2384 :UBC 0 : start = OffsetNumberNext(maxoff);
2385 : :
2386 : : /*
2387 : : * These loops will check every item on the page --- but in an
2388 : : * order that's attuned to the probability of where it actually
2389 : : * is. Scan to the right first, then to the left.
2390 : : */
8333 tgl@sss.pgh.pa.us 2391 :CBC 14228 : for (offnum = start;
2392 [ + - ]: 14285 : offnum <= maxoff;
2393 : 57 : offnum = OffsetNumberNext(offnum))
2394 : : {
2395 : 14285 : itemid = PageGetItemId(page, offnum);
7265 2396 : 14285 : item = (IndexTuple) PageGetItem(page, itemid);
2397 : :
2192 pg@bowt.ie 2398 [ + + ]: 14285 : if (BTreeTupleGetDownLink(item) == child)
2399 : : {
2400 : : /* Return accurate pointer to where link is now */
8333 tgl@sss.pgh.pa.us 2401 : 14228 : stack->bts_blkno = blkno;
2402 : 14228 : stack->bts_offset = offnum;
2403 : 14228 : return buf;
2404 : : }
2405 : : }
2406 : :
8333 tgl@sss.pgh.pa.us 2407 :UBC 0 : for (offnum = OffsetNumberPrev(start);
2408 [ # # ]: 0 : offnum >= minoff;
2409 : 0 : offnum = OffsetNumberPrev(offnum))
2410 : : {
2411 : 0 : itemid = PageGetItemId(page, offnum);
7265 2412 : 0 : item = (IndexTuple) PageGetItem(page, itemid);
2413 : :
2192 pg@bowt.ie 2414 [ # # ]: 0 : if (BTreeTupleGetDownLink(item) == child)
2415 : : {
2416 : : /* Return accurate pointer to where link is now */
8333 tgl@sss.pgh.pa.us 2417 : 0 : stack->bts_blkno = blkno;
2418 : 0 : stack->bts_offset = offnum;
2419 : 0 : return buf;
2420 : : }
2421 : : }
2422 : : }
2423 : :
2424 : : /*
2425 : : * The item we're looking for moved right at least one page.
2426 : : *
2427 : : * Lehman and Yao couple/chain locks when moving right here, which we
2428 : : * can avoid. See nbtree/README.
2429 : : */
9279 tgl@sss.pgh.pa.us 2430 [ - + ]:CBC 7 : if (P_RIGHTMOST(opaque))
2431 : : {
8920 tgl@sss.pgh.pa.us 2432 :UBC 0 : _bt_relbuf(rel, buf);
7909 2433 : 0 : return InvalidBuffer;
2434 : : }
9279 tgl@sss.pgh.pa.us 2435 :CBC 7 : blkno = opaque->btpo_next;
8334 2436 : 7 : start = InvalidOffsetNumber;
8920 2437 : 7 : _bt_relbuf(rel, buf);
2438 : : }
2439 : : }
2440 : :
2441 : : /*
2442 : : * _bt_newlevel() -- Create a new level above root page.
2443 : : *
2444 : : * We've just split the old root page and need to create a new one.
2445 : : * In order to do this, we add a new root page to the file, then lock
2446 : : * the metadata page and update it. This is guaranteed to be deadlock-
2447 : : * free, because all readers release their locks on the metadata page
2448 : : * before trying to lock the root, and all writers lock the root before
2449 : : * trying to lock the metadata page. We have a write lock on the old
2450 : : * root page, so we have not introduced any cycles into the waits-for
2451 : : * graph.
2452 : : *
2453 : : * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2454 : : * locked. On exit, a new root page exists with entries for the
2455 : : * two new children, metapage is updated and unlocked/unpinned.
2456 : : * The new root buffer is returned to caller which has to unlock/unpin
2457 : : * lbuf, rbuf & rootbuf.
2458 : : */
2459 : : static Buffer
920 pg@bowt.ie 2460 : 691 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2461 : : {
2462 : : Buffer rootbuf;
2463 : : Page lpage,
2464 : : rootpage;
2465 : : BlockNumber lbkno,
2466 : : rbkno;
2467 : : BlockNumber rootblknum;
2468 : : BTPageOpaque rootopaque;
2469 : : BTPageOpaque lopaque;
2470 : : ItemId itemid;
2471 : : IndexTuple item;
2472 : : IndexTuple left_item;
2473 : : Size left_item_sz;
2474 : : IndexTuple right_item;
2475 : : Size right_item_sz;
2476 : : Buffer metabuf;
2477 : : Page metapg;
2478 : : BTMetaPageData *metad;
2479 : :
8334 tgl@sss.pgh.pa.us 2480 : 691 : lbkno = BufferGetBlockNumber(lbuf);
2481 : 691 : rbkno = BufferGetBlockNumber(rbuf);
3527 kgrittn@postgresql.o 2482 : 691 : lpage = BufferGetPage(lbuf);
1355 michael@paquier.xyz 2483 : 691 : lopaque = BTPageGetOpaque(lpage);
2484 : :
2485 : : /* get a new root page */
920 pg@bowt.ie 2486 : 691 : rootbuf = _bt_allocbuf(rel, heaprel);
3527 kgrittn@postgresql.o 2487 : 691 : rootpage = BufferGetPage(rootbuf);
9204 vadim4o@yahoo.com 2488 : 691 : rootblknum = BufferGetBlockNumber(rootbuf);
2489 : :
2490 : : /* acquire lock on the metapage */
920 pg@bowt.ie 2491 : 691 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3527 kgrittn@postgresql.o 2492 : 691 : metapg = BufferGetPage(metabuf);
9119 vadim4o@yahoo.com 2493 : 691 : metad = BTPageGetMeta(metapg);
2494 : :
2495 : : /*
2496 : : * Create downlink item for left page (old root). The key value used is
2497 : : * "minus infinity", a sentinel value that's reliably less than any real
2498 : : * key value that could appear in the left page.
2499 : : */
4274 heikki.linnakangas@i 2500 : 691 : left_item_sz = sizeof(IndexTupleData);
2501 : 691 : left_item = (IndexTuple) palloc(left_item_sz);
2502 : 691 : left_item->t_info = left_item_sz;
2192 pg@bowt.ie 2503 : 691 : BTreeTupleSetDownLink(left_item, lbkno);
2079 2504 : 691 : BTreeTupleSetNAtts(left_item, 0, false);
2505 : :
2506 : : /*
2507 : : * Create downlink item for right page. The key for it is obtained from
2508 : : * the "high key" position in the left page.
2509 : : */
4274 heikki.linnakangas@i 2510 : 691 : itemid = PageGetItemId(lpage, P_HIKEY);
2511 : 691 : right_item_sz = ItemIdGetLength(itemid);
2512 : 691 : item = (IndexTuple) PageGetItem(lpage, itemid);
2513 : 691 : right_item = CopyIndexTuple(item);
2192 pg@bowt.ie 2514 : 691 : BTreeTupleSetDownLink(right_item, rbkno);
2515 : :
2516 : : /* NO EREPORT(ERROR) from here till newroot op is logged */
9104 tgl@sss.pgh.pa.us 2517 : 691 : START_CRIT_SECTION();
2518 : :
2519 : : /* upgrade metapage if needed */
2463 pg@bowt.ie 2520 [ - + ]: 691 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2757 teodor@sigaev.ru 2521 :UBC 0 : _bt_upgrademetapage(metapg);
2522 : :
2523 : : /* set btree special data */
1355 michael@paquier.xyz 2524 :CBC 691 : rootopaque = BTPageGetOpaque(rootpage);
10327 bruce@momjian.us 2525 : 691 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
8334 tgl@sss.pgh.pa.us 2526 : 691 : rootopaque->btpo_flags = BTP_ROOT;
1756 pg@bowt.ie 2527 : 691 : rootopaque->btpo_level =
1355 michael@paquier.xyz 2528 : 691 : (BTPageGetOpaque(lpage))->btpo_level + 1;
7162 tgl@sss.pgh.pa.us 2529 : 691 : rootopaque->btpo_cycleid = 0;
2530 : :
2531 : : /* update metapage data */
8334 2532 : 691 : metad->btm_root = rootblknum;
1756 pg@bowt.ie 2533 : 691 : metad->btm_level = rootopaque->btpo_level;
8334 tgl@sss.pgh.pa.us 2534 : 691 : metad->btm_fastroot = rootblknum;
1756 pg@bowt.ie 2535 : 691 : metad->btm_fastlevel = rootopaque->btpo_level;
2536 : :
2537 : : /*
2538 : : * Insert the left page pointer into the new root page. The root page is
2539 : : * the rightmost page on its level so there is no "high key" in it; the
2540 : : * two items will go into positions P_HIKEY and P_FIRSTKEY.
2541 : : *
2542 : : * Note: we *must* insert the two items in item-number order, for the
2543 : : * benefit of _bt_restore_page().
2544 : : */
2798 teodor@sigaev.ru 2545 [ + - - + ]: 691 : Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
50 peter@eisentraut.org 2546 [ - + ]:GNC 691 : if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
6560 tgl@sss.pgh.pa.us 2547 [ # # ]:UBC 0 : elog(PANIC, "failed to add leftkey to new root page"
2548 : : " while splitting block %u of index \"%s\"",
2549 : : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2550 : :
2551 : : /*
2552 : : * insert the right page pointer into the new root page.
2553 : : */
2463 pg@bowt.ie 2554 [ + - - + ]:CBC 691 : Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2555 [ + - - + ]: 691 : Assert(BTreeTupleGetNAtts(right_item, rel) <=
2556 : : IndexRelationGetNumberOfKeyAttributes(rel));
50 peter@eisentraut.org 2557 [ - + ]:GNC 691 : if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
6560 tgl@sss.pgh.pa.us 2558 [ # # ]:UBC 0 : elog(PANIC, "failed to add rightkey to new root page"
2559 : : " while splitting block %u of index \"%s\"",
2560 : : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2561 : :
2562 : : /* Clear the incomplete-split flag in the left child */
4291 heikki.linnakangas@i 2563 [ - + ]:CBC 691 : Assert(P_INCOMPLETE_SPLIT(lopaque));
2564 : 691 : lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2565 : 691 : MarkBufferDirty(lbuf);
2566 : :
7200 tgl@sss.pgh.pa.us 2567 : 691 : MarkBufferDirty(rootbuf);
2568 : 691 : MarkBufferDirty(metabuf);
2569 : :
2570 : : /* XLOG stuff */
5482 rhaas@postgresql.org 2571 [ + + + + : 691 : if (RelationNeedsWAL(rel))
+ + + - ]
2572 : : {
2573 : : xl_btree_newroot xlrec;
2574 : : XLogRecPtr recptr;
2575 : : xl_btree_metadata md;
2576 : :
8334 tgl@sss.pgh.pa.us 2577 : 672 : xlrec.rootblk = rootblknum;
9119 vadim4o@yahoo.com 2578 : 672 : xlrec.level = metad->btm_level;
2579 : :
4044 heikki.linnakangas@i 2580 : 672 : XLogBeginInsert();
308 peter@eisentraut.org 2581 : 672 : XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
2582 : :
4044 heikki.linnakangas@i 2583 : 672 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2584 : 672 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2965 tgl@sss.pgh.pa.us 2585 : 672 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2586 : :
2463 pg@bowt.ie 2587 [ - + ]: 672 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2588 : 672 : md.version = metad->btm_version;
4044 heikki.linnakangas@i 2589 : 672 : md.root = rootblknum;
2590 : 672 : md.level = metad->btm_level;
2591 : 672 : md.fastroot = rootblknum;
2592 : 672 : md.fastlevel = metad->btm_level;
1756 pg@bowt.ie 2593 : 672 : md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2120 2594 : 672 : md.allequalimage = metad->btm_allequalimage;
2595 : :
308 peter@eisentraut.org 2596 : 672 : XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2597 : :
2598 : : /*
2599 : : * Direct access to page is not good but faster - we should implement
2600 : : * some new func in page API.
2601 : : */
4044 heikki.linnakangas@i 2602 : 672 : XLogRegisterBufData(0,
3100 tgl@sss.pgh.pa.us 2603 : 672 : (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
4044 heikki.linnakangas@i 2604 : 672 : ((PageHeader) rootpage)->pd_special -
2605 : 672 : ((PageHeader) rootpage)->pd_upper);
2606 : :
2607 : 672 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2608 : :
4256 2609 : 672 : PageSetLSN(lpage, recptr);
9204 vadim4o@yahoo.com 2610 : 672 : PageSetLSN(rootpage, recptr);
9195 2611 : 672 : PageSetLSN(metapg, recptr);
2612 : : }
2613 : :
9104 tgl@sss.pgh.pa.us 2614 [ - + ]: 691 : END_CRIT_SECTION();
2615 : :
2616 : : /* done with metapage */
7200 2617 : 691 : _bt_relbuf(rel, metabuf);
2618 : :
4274 heikki.linnakangas@i 2619 : 691 : pfree(left_item);
2620 : 691 : pfree(right_item);
2621 : :
7279 neilc@samurai.com 2622 : 691 : return rootbuf;
2623 : : }
2624 : :
2625 : : /*
2626 : : * _bt_pgaddtup() -- add a data item to a particular page during split.
2627 : : *
2628 : : * The difference between this routine and a bare PageAddItem call is
2629 : : * that this code can deal with the first data item on an internal btree
2630 : : * page in passing. This data item (which is called "firstright" within
2631 : : * _bt_split()) has a key that must be treated as minus infinity after
2632 : : * the split. Therefore, we truncate away all attributes when caller
2633 : : * specifies it's the first data item on page (downlink is not changed,
2634 : : * though). This extra step is only needed for the right page of an
2635 : : * internal page split. There is no need to do this for the first data
2636 : : * item on the existing/left page, since that will already have been
2637 : : * truncated during an earlier page split.
2638 : : *
2639 : : * See _bt_split() for a high level explanation of why we truncate here.
2640 : : * Note that this routine has nothing to do with suffix truncation,
2641 : : * despite using some of the same infrastructure.
2642 : : */
2643 : : static inline bool
5588 tgl@sss.pgh.pa.us 2644 : 3577256 : _bt_pgaddtup(Page page,
2645 : : Size itemsize,
2646 : : const IndexTupleData *itup,
2647 : : OffsetNumber itup_off,
2648 : : bool newfirstdataitem)
2649 : : {
2650 : : IndexTupleData trunctuple;
2651 : :
2073 pg@bowt.ie 2652 [ + + ]: 3577256 : if (newfirstdataitem)
2653 : : {
7265 tgl@sss.pgh.pa.us 2654 : 157 : trunctuple = *itup;
2655 : 157 : trunctuple.t_info = sizeof(IndexTupleData);
2079 pg@bowt.ie 2656 : 157 : BTreeTupleSetNAtts(&trunctuple, 0, false);
7265 tgl@sss.pgh.pa.us 2657 : 157 : itup = &trunctuple;
2658 : 157 : itemsize = sizeof(IndexTupleData);
2659 : : }
2660 : :
50 peter@eisentraut.org 2661 [ - + ]:GNC 3577256 : if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
5588 tgl@sss.pgh.pa.us 2662 :UBC 0 : return false;
2663 : :
5588 tgl@sss.pgh.pa.us 2664 :CBC 3577256 : return true;
2665 : : }
2666 : :
2667 : : /*
2668 : : * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2669 : : *
2670 : : * There are three operations performed here: simple index deletion, bottom-up
2671 : : * index deletion, and deduplication. If all three operations fail to free
2672 : : * enough space for the incoming item then caller will go on to split the
2673 : : * page. We always consider simple deletion first. If that doesn't work out
2674 : : * we consider alternatives. Callers that only want us to consider simple
2675 : : * deletion (without any fallback) ask for that using the 'simpleonly'
2676 : : * argument.
2677 : : *
2678 : : * We usually pick only one alternative "complex" operation when simple
2679 : : * deletion alone won't prevent a page split. The 'checkingunique',
2680 : : * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2681 : : *
2682 : : * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2683 : : * level flag was found set. The flag was useful back when there wasn't
2684 : : * necessarily one single page for a duplicate tuple to go on (before heap TID
2685 : : * became a part of the key space in version 4 indexes). But we don't
2686 : : * actually look at the flag anymore (it's not a gating condition for our
2687 : : * caller). That would cause us to miss tuples that are safe to delete,
2688 : : * without getting any benefit in return. We know that the alternative is to
2689 : : * split the page; scanning the line pointer array in passing won't have
2690 : : * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2691 : : * all this because !heapkeyspace indexes must still do a "getting tired"
2692 : : * linear search, and so are likely to get some benefit from using it as a
2693 : : * gating condition.)
2694 : : */
2695 : : static void
1855 pg@bowt.ie 2696 : 26202 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2697 : : BTInsertState insertstate,
2698 : : bool simpleonly, bool checkingunique,
2699 : : bool uniquedup, bool indexUnchanged)
2700 : : {
2701 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
7013 bruce@momjian.us 2702 : 26202 : int ndeletable = 0;
2703 : : OffsetNumber offnum,
2704 : : minoff,
2705 : : maxoff;
1855 pg@bowt.ie 2706 : 26202 : Buffer buffer = insertstate->buf;
2707 : 26202 : BTScanInsert itup_key = insertstate->itup_key;
3527 kgrittn@postgresql.o 2708 : 26202 : Page page = BufferGetPage(buffer);
1355 michael@paquier.xyz 2709 : 26202 : BTPageOpaque opaque = BTPageGetOpaque(page);
2710 : :
2463 pg@bowt.ie 2711 [ - + ]: 26202 : Assert(P_ISLEAF(opaque));
1798 2712 [ + + - + ]: 26202 : Assert(simpleonly || itup_key->heapkeyspace);
2713 [ + + + - : 26202 : Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
+ - - + ]
2714 : :
2715 : : /*
2716 : : * Scan over all items to see which ones need to be deleted according to
2717 : : * LP_DEAD flags. We'll usually manage to delete a few extra items that
2718 : : * are not marked LP_DEAD in passing. Often the extra items that actually
2719 : : * end up getting deleted are items that would have had their LP_DEAD bit
2720 : : * set before long anyway (if we opted not to include them as extras).
2721 : : */
2722 [ + + ]: 26202 : minoff = P_FIRSTDATAKEY(opaque);
7084 tgl@sss.pgh.pa.us 2723 : 26202 : maxoff = PageGetMaxOffsetNumber(page);
1798 pg@bowt.ie 2724 : 26202 : for (offnum = minoff;
7084 tgl@sss.pgh.pa.us 2725 [ + + ]: 7045817 : offnum <= maxoff;
2726 : 7019615 : offnum = OffsetNumberNext(offnum))
2727 : : {
7013 bruce@momjian.us 2728 : 7019615 : ItemId itemId = PageGetItemId(page, offnum);
2729 : :
6670 tgl@sss.pgh.pa.us 2730 [ + + ]: 7019615 : if (ItemIdIsDead(itemId))
7084 2731 : 130812 : deletable[ndeletable++] = offnum;
2732 : : }
2733 : :
2734 [ + + ]: 26202 : if (ndeletable > 0)
2735 : : {
1798 pg@bowt.ie 2736 : 3900 : _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2737 : : insertstate->itup, minoff, maxoff);
1855 2738 : 3900 : insertstate->bounds_valid = false;
2739 : :
2740 : : /* Return when a page split has already been avoided */
2741 [ + + ]: 3900 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
2742 : 12274 : return;
2743 : :
2744 : : /* Might as well assume duplicates (if checkingunique) */
2745 : 50 : uniquedup = true;
2746 : : }
2747 : :
2748 : : /*
2749 : : * We're done with simple deletion. Return early with callers that only
2750 : : * call here so that simple deletion can be considered. This includes
2751 : : * callers that explicitly ask for this and checkingunique callers that
2752 : : * probably don't have any version churn duplicates on the page.
2753 : : *
2754 : : * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2755 : : * return at this point (or when we go on the try either or both of our
2756 : : * other strategies and they also fail). We do not bother expending a
2757 : : * separate write to clear it, however. Caller will definitely clear it
2758 : : * when it goes on to split the page (note also that the deduplication
2759 : : * process will clear the flag in passing, just to keep things tidy).
2760 : : */
1798 2761 [ + - + + : 22352 : if (simpleonly || (checkingunique && !uniquedup))
+ + ]
2762 : : {
2763 [ - + ]: 8205 : Assert(!indexUnchanged);
1855 2764 : 8205 : return;
2765 : : }
2766 : :
2767 : : /* Assume bounds about to be invalidated (this is almost certain now) */
2768 : 14147 : insertstate->bounds_valid = false;
2769 : :
2770 : : /*
2771 : : * Perform bottom-up index deletion pass when executor hint indicated that
2772 : : * incoming item is logically unchanged, or for a unique index that is
2773 : : * known to have physical duplicates for some other reason. (There is a
2774 : : * large overlap between these two cases for a unique index. It's worth
2775 : : * having both triggering conditions in order to apply the optimization in
2776 : : * the event of successive related INSERT and DELETE statements.)
2777 : : *
2778 : : * We'll go on to do a deduplication pass when a bottom-up pass fails to
2779 : : * delete an acceptable amount of free space (a significant fraction of
2780 : : * the page, or space for the new item, whichever is greater).
2781 : : *
2782 : : * Note: Bottom-up index deletion uses the same equality/equivalence
2783 : : * routines as deduplication internally. However, it does not merge
2784 : : * together index tuples, so the same correctness considerations do not
2785 : : * apply. We deliberately omit an index-is-allequalimage test here.
2786 : : */
1798 2787 [ + + + + : 16165 : if ((indexUnchanged || uniquedup) &&
+ + ]
2788 : 2018 : _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2789 : 219 : return;
2790 : :
2791 : : /* Perform deduplication pass (when enabled and index-is-allequalimage) */
1855 2792 [ + - - + : 13928 : if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ + + + +
+ + - ]
973 2793 : 13919 : _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2794 [ + + + + ]: 13919 : (indexUnchanged || uniquedup));
2795 : : }
2796 : :
2797 : : /*
2798 : : * _bt_simpledel_pass - Simple index tuple deletion pass.
2799 : : *
2800 : : * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2801 : : * of all such tuples are determined by caller (caller passes these to us as
2802 : : * its 'deletable' argument).
2803 : : *
2804 : : * We might also delete extra index tuples that turn out to be safe to delete
2805 : : * in passing (though they must be cheap to check in passing to begin with).
2806 : : * There is no certainty that any extra tuples will be deleted, though. The
2807 : : * high level goal of the approach we take is to get the most out of each call
2808 : : * here (without noticeably increasing the per-call overhead compared to what
2809 : : * we need to do just to be able to delete the page's LP_DEAD-marked index
2810 : : * tuples).
2811 : : *
2812 : : * The number of extra index tuples that turn out to be deletable might
2813 : : * greatly exceed the number of LP_DEAD-marked index tuples due to various
2814 : : * locality related effects. For example, it's possible that the total number
2815 : : * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2816 : : * quite low, in which case we might end up checking if it's possible to
2817 : : * delete _most_ index tuples on the page (without the tableam needing to
2818 : : * access additional table blocks). The tableam will sometimes stumble upon
2819 : : * _many_ extra deletable index tuples in indexes where this pattern is
2820 : : * common.
2821 : : *
2822 : : * See nbtree/README for further details on simple index tuple deletion.
2823 : : */
2824 : : static void
1798 2825 : 3900 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2826 : : OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2827 : : OffsetNumber minoff, OffsetNumber maxoff)
2828 : : {
2829 : 3900 : Page page = BufferGetPage(buffer);
2830 : : BlockNumber *deadblocks;
2831 : : int ndeadblocks;
2832 : : TM_IndexDeleteOp delstate;
2833 : : OffsetNumber offnum;
2834 : :
2835 : : /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2836 : 3900 : deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2837 : : &ndeadblocks);
2838 : :
2839 : : /* Initialize tableam state that describes index deletion operation */
1503 2840 : 3900 : delstate.irel = rel;
2841 : 3900 : delstate.iblknum = BufferGetBlockNumber(buffer);
1798 2842 : 3900 : delstate.bottomup = false;
2843 : 3900 : delstate.bottomupfreespace = 0;
2844 : 3900 : delstate.ndeltids = 0;
2845 : 3900 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2846 : 3900 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2847 : :
2848 : 3900 : for (offnum = minoff;
2849 [ + + ]: 1105016 : offnum <= maxoff;
2850 : 1101116 : offnum = OffsetNumberNext(offnum))
2851 : : {
2852 : 1101116 : ItemId itemid = PageGetItemId(page, offnum);
2853 : 1101116 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2854 : 1101116 : TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2855 : 1101116 : TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2856 : : BlockNumber tidblock;
2857 : : void *match;
2858 : :
2859 [ + + ]: 1101116 : if (!BTreeTupleIsPosting(itup))
2860 : : {
2861 : 1048242 : tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2862 : 1048242 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2863 : : sizeof(BlockNumber), _bt_blk_cmp);
2864 : :
2865 [ + + ]: 1048242 : if (!match)
2866 : : {
2867 [ - + ]: 664764 : Assert(!ItemIdIsDead(itemid));
2868 : 664764 : continue;
2869 : : }
2870 : :
2871 : : /*
2872 : : * TID's table block is among those pointed to by the TIDs from
2873 : : * LP_DEAD-bit set tuples on page -- add TID to deltids
2874 : : */
2875 : 383478 : odeltid->tid = itup->t_tid;
2876 : 383478 : odeltid->id = delstate.ndeltids;
2877 : 383478 : ostatus->idxoffnum = offnum;
2878 : 383478 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2879 : 383478 : ostatus->promising = false; /* unused */
2880 : 383478 : ostatus->freespace = 0; /* unused */
2881 : :
2882 : 383478 : delstate.ndeltids++;
2883 : : }
2884 : : else
2885 : : {
2886 : 52874 : int nitem = BTreeTupleGetNPosting(itup);
2887 : :
2888 [ + + ]: 255495 : for (int p = 0; p < nitem; p++)
2889 : : {
2890 : 202621 : ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2891 : :
2892 : 202621 : tidblock = ItemPointerGetBlockNumber(tid);
2893 : 202621 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2894 : : sizeof(BlockNumber), _bt_blk_cmp);
2895 : :
2896 [ + + ]: 202621 : if (!match)
2897 : : {
2898 [ - + ]: 178178 : Assert(!ItemIdIsDead(itemid));
2899 : 178178 : continue;
2900 : : }
2901 : :
2902 : : /*
2903 : : * TID's table block is among those pointed to by the TIDs
2904 : : * from LP_DEAD-bit set tuples on page -- add TID to deltids
2905 : : */
2906 : 24443 : odeltid->tid = *tid;
2907 : 24443 : odeltid->id = delstate.ndeltids;
2908 : 24443 : ostatus->idxoffnum = offnum;
2909 : 24443 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2910 : 24443 : ostatus->promising = false; /* unused */
2911 : 24443 : ostatus->freespace = 0; /* unused */
2912 : :
2913 : 24443 : odeltid++;
2914 : 24443 : ostatus++;
2915 : 24443 : delstate.ndeltids++;
2916 : : }
2917 : : }
2918 : : }
2919 : :
2920 : 3900 : pfree(deadblocks);
2921 : :
2922 [ - + ]: 3900 : Assert(delstate.ndeltids >= ndeletable);
2923 : :
2924 : : /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2925 : 3900 : _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2926 : :
2927 : 3900 : pfree(delstate.deltids);
2928 : 3900 : pfree(delstate.status);
2929 : 3900 : }
2930 : :
2931 : : /*
2932 : : * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2933 : : *
2934 : : * Builds sorted and unique-ified array of table block numbers from index
2935 : : * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2936 : : * block from incoming newitem just in case it isn't among the LP_DEAD-related
2937 : : * table blocks.
2938 : : *
2939 : : * Always counting the newitem's table block as an LP_DEAD related block makes
2940 : : * sense because the cost is consistently low; it is practically certain that
2941 : : * the table block will not incur a buffer miss in tableam. On the other hand
2942 : : * the benefit is often quite high. There is a decent chance that there will
2943 : : * be some deletable items from this block, since in general most garbage
2944 : : * tuples became garbage in the recent past (in many cases this won't be the
2945 : : * first logical row that core code added to/modified in table block
2946 : : * recently).
2947 : : *
2948 : : * Returns final array, and sets *nblocks to its final size for caller.
2949 : : */
2950 : : static BlockNumber *
2951 : 3900 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2952 : : IndexTuple newitem, int *nblocks)
2953 : : {
2954 : : int spacentids,
2955 : : ntids;
2956 : : BlockNumber *tidblocks;
2957 : :
2958 : : /*
2959 : : * Accumulate each TID's block in array whose initial size has space for
2960 : : * one table block per LP_DEAD-set tuple (plus space for the newitem table
2961 : : * block). Array will only need to grow when there are LP_DEAD-marked
2962 : : * posting list tuples (which is not that common).
2963 : : */
2964 : 3900 : spacentids = ndeletable + 1;
2965 : 3900 : ntids = 0;
6 michael@paquier.xyz 2966 :GNC 3900 : tidblocks = palloc_array(BlockNumber, spacentids);
2967 : :
2968 : : /*
2969 : : * First add the table block for the incoming newitem. This is the one
2970 : : * case where simple deletion can visit a table block that doesn't have
2971 : : * any known deletable items.
2972 : : */
1798 pg@bowt.ie 2973 [ + - - + ]:CBC 3900 : Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2974 : 3900 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2975 : :
2976 [ + + ]: 134712 : for (int i = 0; i < ndeletable; i++)
2977 : : {
2978 : 130812 : ItemId itemid = PageGetItemId(page, deletable[i]);
2979 : 130812 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2980 : :
2981 [ - + ]: 130812 : Assert(ItemIdIsDead(itemid));
2982 : :
2983 [ + + ]: 130812 : if (!BTreeTupleIsPosting(itup))
2984 : : {
2985 [ + + ]: 126460 : if (ntids + 1 > spacentids)
2986 : : {
2987 : 108 : spacentids *= 2;
2988 : : tidblocks = (BlockNumber *)
2989 : 108 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2990 : : }
2991 : :
2992 : 126460 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2993 : : }
2994 : : else
2995 : : {
2996 : 4352 : int nposting = BTreeTupleGetNPosting(itup);
2997 : :
2998 [ + + ]: 4352 : if (ntids + nposting > spacentids)
2999 : : {
3000 : 124 : spacentids = Max(spacentids * 2, ntids + nposting);
3001 : : tidblocks = (BlockNumber *)
3002 : 124 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3003 : : }
3004 : :
3005 [ + + ]: 14817 : for (int j = 0; j < nposting; j++)
3006 : : {
3007 : 10465 : ItemPointer tid = BTreeTupleGetPostingN(itup, j);
3008 : :
3009 : 10465 : tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3010 : : }
3011 : : }
3012 : : }
3013 : :
3014 : 3900 : qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3015 : 3900 : *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3016 : :
3017 : 3900 : return tidblocks;
3018 : : }
3019 : :
3020 : : /*
3021 : : * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3022 : : */
3023 : : static inline int
3024 : 2796146 : _bt_blk_cmp(const void *arg1, const void *arg2)
3025 : : {
3026 : 2796146 : BlockNumber b1 = *((BlockNumber *) arg1);
3027 : 2796146 : BlockNumber b2 = *((BlockNumber *) arg2);
3028 : :
669 nathan@postgresql.or 3029 : 2796146 : return pg_cmp_u32(b1, b2);
3030 : : }
|