Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtinsert.c
4 : : * Item insertion in Lehman and Yao btrees for Postgres.
5 : : *
6 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtinsert.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include "access/nbtree.h"
19 : : #include "access/nbtxlog.h"
20 : : #include "access/tableam.h"
21 : : #include "access/transam.h"
22 : : #include "access/xloginsert.h"
23 : : #include "common/int.h"
24 : : #include "common/pg_prng.h"
25 : : #include "lib/qunique.h"
26 : : #include "miscadmin.h"
27 : : #include "storage/lmgr.h"
28 : : #include "storage/predicate.h"
29 : : #include "utils/injection_point.h"
30 : :
31 : : /* Minimum tree height for application of fastpath optimization */
32 : : #define BTREE_FASTPATH_MIN_LEVEL 2
33 : :
34 : :
35 : : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
36 : : BTInsertState insertstate);
37 : : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
38 : : Relation heapRel,
39 : : IndexUniqueCheck checkUnique, bool *is_unique,
40 : : uint32 *speculativeToken);
41 : : static OffsetNumber _bt_findinsertloc(Relation rel,
42 : : BTInsertState insertstate,
43 : : bool checkingunique,
44 : : bool indexUnchanged,
45 : : BTStack stack,
46 : : Relation heapRel);
47 : : static void _bt_stepright(Relation rel, Relation heaprel,
48 : : BTInsertState insertstate, BTStack stack);
49 : : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
50 : : Buffer buf,
51 : : Buffer cbuf,
52 : : BTStack stack,
53 : : IndexTuple itup,
54 : : Size itemsz,
55 : : OffsetNumber newitemoff,
56 : : int postingoff,
57 : : bool split_only_page);
58 : : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
59 : : Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
60 : : Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
61 : : IndexTuple nposting, uint16 postingoff);
62 : : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
63 : : Buffer rbuf, BTStack stack, bool isroot, bool isonly);
64 : : static void _bt_freestack(BTStack stack);
65 : : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
66 : : static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
67 : : OffsetNumber itup_off, bool newfirstdataitem);
68 : : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
69 : : BTInsertState insertstate,
70 : : bool simpleonly, bool checkingunique,
71 : : bool uniquedup, bool indexUnchanged);
72 : : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
73 : : OffsetNumber *deletable, int ndeletable,
74 : : IndexTuple newitem, OffsetNumber minoff,
75 : : OffsetNumber maxoff);
76 : : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
77 : : int ndeletable, IndexTuple newitem,
78 : : int *nblocks);
79 : : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
80 : :
81 : : /*
82 : : * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
83 : : *
84 : : * This routine is called by the public interface routine, btinsert.
85 : : * By here, itup is filled in, including the TID.
86 : : *
87 : : * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
88 : : * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
89 : : * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
90 : : * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
91 : : * don't actually insert.
92 : : *
93 : : * indexUnchanged executor hint indicates if itup is from an
94 : : * UPDATE that didn't logically change the indexed value, but
95 : : * must nevertheless have a new entry to point to a successor
96 : : * version.
97 : : *
98 : : * The result value is only significant for UNIQUE_CHECK_PARTIAL:
99 : : * it must be true if the entry is known unique, else false.
100 : : * (In the current implementation we'll also return true after a
101 : : * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
102 : : * that's just a coding artifact.)
103 : : */
104 : : bool
7354 tgl@sss.pgh.pa.us 105 :CBC 4273976 : _bt_doinsert(Relation rel, IndexTuple itup,
106 : : IndexUniqueCheck checkUnique, bool indexUnchanged,
107 : : Relation heapRel)
108 : : {
6073 109 : 4273976 : bool is_unique = false;
110 : : BTInsertStateData insertstate;
111 : : BTScanInsert itup_key;
112 : : BTStack stack;
2552 pg@bowt.ie 113 : 4273976 : bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
114 : :
115 : : /* we need an insertion scan key to do our search, so build one */
1009 116 : 4273976 : itup_key = _bt_mkscankey(rel, itup);
117 : :
2518 118 [ + + ]: 4273976 : if (checkingunique)
119 : : {
120 [ + + ]: 3056214 : if (!itup_key->anynullkeys)
121 : : {
122 : : /* No (heapkeyspace) scantid until uniqueness established */
123 : 3046127 : itup_key->scantid = NULL;
124 : : }
125 : : else
126 : : {
127 : : /*
128 : : * Scan key for new tuple contains NULL key values. Bypass
129 : : * checkingunique steps. They are unnecessary because core code
130 : : * considers NULL unequal to every value, including NULL.
131 : : *
132 : : * This optimization avoids O(N^2) behavior within the
133 : : * _bt_findinsertloc() heapkeyspace path when a unique index has a
134 : : * large number of "duplicates" with NULL key values.
135 : : */
136 : 10087 : checkingunique = false;
137 : : /* Tuple is unique in the sense that core code cares about */
138 [ - + ]: 10087 : Assert(checkUnique != UNIQUE_CHECK_EXISTING);
139 : 10087 : is_unique = true;
140 : : }
141 : : }
142 : :
143 : : /*
144 : : * Fill in the BTInsertState working area, to track the current page and
145 : : * position within the page to insert on.
146 : : *
147 : : * Note that itemsz is passed down to lower level code that deals with
148 : : * inserting the item. It must be MAXALIGN()'d. This ensures that space
149 : : * accounting code consistently considers the alignment overhead that we
150 : : * expect PageAddItem() will add later. (Actually, index_form_tuple() is
151 : : * already conservative about alignment, but we don't rely on that from
152 : : * this distance. Besides, preserving the "true" tuple size in index
153 : : * tuple headers for the benefit of nbtsplitloc.c might happen someday.
154 : : * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
155 : : */
2552 156 : 4273976 : insertstate.itup = itup;
157 : 4273976 : insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
158 : 4273976 : insertstate.itup_key = itup_key;
159 : 4273976 : insertstate.bounds_valid = false;
160 : 4273976 : insertstate.buf = InvalidBuffer;
2209 161 : 4273976 : insertstate.postingoff = 0;
162 : :
2188 163 : 4273988 : search:
164 : :
165 : : /*
166 : : * Find and lock the leaf page that the tuple should be added to by
167 : : * searching from the root page. insertstate.buf will hold a buffer that
168 : : * is locked in exclusive mode afterwards.
169 : : */
1079 andres@anarazel.de 170 : 4273988 : stack = _bt_search_insert(rel, heapRel, &insertstate);
171 : :
172 : : /*
173 : : * checkingunique inserts are not allowed to go ahead when two tuples with
174 : : * equal key attribute values would be visible to new MVCC snapshots once
175 : : * the xact commits. Check for conflicts in the locked page/buffer (if
176 : : * needed) here.
177 : : *
178 : : * It might be necessary to check a page to the right in _bt_check_unique,
179 : : * though that should be very rare. In practice the first page the value
180 : : * could be on (with scantid omitted) is almost always also the only page
181 : : * that a matching tuple might be found on. This is due to the behavior
182 : : * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
183 : : * only be allowed to cross a page boundary when there is no candidate
184 : : * leaf page split point that avoids it. Also, _bt_check_unique can use
185 : : * the leaf page high key to determine that there will be no duplicates on
186 : : * the right sibling without actually visiting it (it uses the high key in
187 : : * cases where the new item happens to belong at the far right of the leaf
188 : : * page).
189 : : *
190 : : * NOTE: obviously, _bt_check_unique can only detect keys that are already
191 : : * in the index; so it cannot defend against concurrent insertions of the
192 : : * same key. We protect against that by means of holding a write lock on
193 : : * the first page the value could be on, with omitted/-inf value for the
194 : : * implicit heap TID tiebreaker attribute. Any other would-be inserter of
195 : : * the same key must acquire a write lock on the same page, so only one
196 : : * would-be inserter can be making the check at one time. Furthermore,
197 : : * once we are past the check we hold write locks continuously until we
198 : : * have performed our insertion, so no later inserter can fail to see our
199 : : * insertion. (This requires some care in _bt_findinsertloc.)
200 : : *
201 : : * If we must wait for another xact, we release the lock while waiting,
202 : : * and then must perform a new search.
203 : : *
204 : : * For a partial uniqueness check, we don't wait for the other xact. Just
205 : : * let the tuple in and return false for possibly non-unique, or true for
206 : : * definitely unique.
207 : : */
2552 pg@bowt.ie 208 [ + + ]: 4273988 : if (checkingunique)
209 : : {
210 : : TransactionId xwait;
211 : : uint32 speculativeToken;
212 : :
213 : 3046139 : xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
214 : : &is_unique, &speculativeToken);
215 : :
2188 216 [ + + ]: 3045875 : if (unlikely(TransactionIdIsValid(xwait)))
217 : : {
218 : : /* Have to wait for the other guy ... */
2552 219 : 12 : _bt_relbuf(rel, insertstate.buf);
220 : 12 : insertstate.buf = InvalidBuffer;
221 : :
222 : : /*
223 : : * If it's a speculative insertion, wait for it to finish (ie. to
224 : : * go ahead with the insertion, or kill the tuple). Otherwise
225 : : * wait for the transaction to finish as usual.
226 : : */
3964 andres@anarazel.de 227 [ - + ]: 12 : if (speculativeToken)
3964 andres@anarazel.de 228 :UBC 0 : SpeculativeInsertionWait(xwait, speculativeToken);
229 : : else
3964 andres@anarazel.de 230 :CBC 12 : XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
231 : :
232 : : /* start over... */
2911 andrew@dunslane.net 233 [ - + ]: 12 : if (stack)
2911 andrew@dunslane.net 234 :UBC 0 : _bt_freestack(stack);
2188 pg@bowt.ie 235 :CBC 12 : goto search;
236 : : }
237 : :
238 : : /* Uniqueness is established -- restore heap tid as scantid */
2552 239 [ + - ]: 3045863 : if (itup_key->heapkeyspace)
240 : 3045863 : itup_key->scantid = &itup->t_tid;
241 : : }
242 : :
6073 tgl@sss.pgh.pa.us 243 [ + + ]: 4273712 : if (checkUnique != UNIQUE_CHECK_EXISTING)
244 : : {
245 : : OffsetNumber newitemoff;
246 : :
247 : : /*
248 : : * The only conflict predicate locking cares about for indexes is when
249 : : * an index tuple insert conflicts with an existing lock. We don't
250 : : * know the actual page we're going to insert on for sure just yet in
251 : : * checkingunique and !heapkeyspace cases, but it's okay to use the
252 : : * first page the value could be on (with scantid omitted) instead.
253 : : */
2238 tmunro@postgresql.or 254 : 4273685 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
255 : :
256 : : /*
257 : : * Do the insertion. Note that insertstate contains cached binary
258 : : * search bounds established within _bt_check_unique when insertion is
259 : : * checkingunique.
260 : : */
2552 pg@bowt.ie 261 : 4273682 : newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
262 : : indexUnchanged, stack, heapRel);
1079 andres@anarazel.de 263 : 4273682 : _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
264 : : stack, itup, insertstate.itemsz, newitemoff,
265 : : insertstate.postingoff, false);
266 : : }
267 : : else
268 : : {
269 : : /* just release the buffer */
2552 pg@bowt.ie 270 : 27 : _bt_relbuf(rel, insertstate.buf);
271 : : }
272 : :
273 : : /* be tidy */
2911 andrew@dunslane.net 274 [ + + ]: 4273707 : if (stack)
275 : 3773883 : _bt_freestack(stack);
2552 pg@bowt.ie 276 : 4273707 : pfree(itup_key);
277 : :
6073 tgl@sss.pgh.pa.us 278 : 4273707 : return is_unique;
279 : : }
280 : :
281 : : /*
282 : : * _bt_search_insert() -- _bt_search() wrapper for inserts
283 : : *
284 : : * Search the tree for a particular scankey, or more precisely for the first
285 : : * leaf page it could be on. Try to make use of the fastpath optimization's
286 : : * rightmost leaf page cache before actually searching the tree from the root
287 : : * page, though.
288 : : *
289 : : * Return value is a stack of parent-page pointers (though see notes about
290 : : * fastpath optimization and page splits below). insertstate->buf is set to
291 : : * the address of the leaf-page buffer, which is write-locked and pinned in
292 : : * all cases (if necessary by creating a new empty root page for caller).
293 : : *
294 : : * The fastpath optimization avoids most of the work of searching the tree
295 : : * repeatedly when a single backend inserts successive new tuples on the
296 : : * rightmost leaf page of an index. A backend cache of the rightmost leaf
297 : : * page is maintained within _bt_insertonpg(), and used here. The cache is
298 : : * invalidated here when an insert of a non-pivot tuple must take place on a
299 : : * non-rightmost leaf page.
300 : : *
301 : : * The optimization helps with indexes on an auto-incremented field. It also
302 : : * helps with indexes on datetime columns, as well as indexes with lots of
303 : : * NULL values. (NULLs usually get inserted in the rightmost page for single
304 : : * column indexes, since they usually get treated as coming after everything
305 : : * else in the key space. Individual NULL tuples will generally be placed on
306 : : * the rightmost leaf page due to the influence of the heap TID column.)
307 : : *
308 : : * Note that we avoid applying the optimization when there is insufficient
309 : : * space on the rightmost page to fit caller's new item. This is necessary
310 : : * because we'll need to return a real descent stack when a page split is
311 : : * expected (actually, caller can cope with a leaf page split that uses a NULL
312 : : * stack, but that's very slow and so must be avoided). Note also that the
313 : : * fastpath optimization acquires the lock on the page conditionally as a way
314 : : * of reducing extra contention when there are concurrent insertions into the
315 : : * rightmost page (we give up if we'd have to wait for the lock). We assume
316 : : * that it isn't useful to apply the optimization when there is contention,
317 : : * since each per-backend cache won't stay valid for long.
318 : : */
319 : : static BTStack
1079 andres@anarazel.de 320 : 4273988 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
321 : : {
2188 pg@bowt.ie 322 [ - + ]: 4273988 : Assert(insertstate->buf == InvalidBuffer);
323 [ - + ]: 4273988 : Assert(!insertstate->bounds_valid);
324 [ - + ]: 4273988 : Assert(insertstate->postingoff == 0);
325 : :
326 [ + - + + ]: 4273988 : if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
327 : : {
328 : : /* Simulate a _bt_getbuf() call with conditional locking */
329 [ + - ]: 34656 : insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
2063 330 [ + + ]: 34656 : if (_bt_conditionallockbuf(rel, insertstate->buf))
331 : : {
332 : : Page page;
333 : : BTPageOpaque opaque;
334 : :
2188 335 : 33787 : _bt_checkpage(rel, insertstate->buf);
336 : 33787 : page = BufferGetPage(insertstate->buf);
1444 michael@paquier.xyz 337 : 33787 : opaque = BTPageGetOpaque(page);
338 : :
339 : : /*
340 : : * Check if the page is still the rightmost leaf page and has
341 : : * enough free space to accommodate the new tuple. Also check
342 : : * that the insertion scan key is strictly greater than the first
343 : : * non-pivot tuple on the page. (Note that we expect itup_key's
344 : : * scantid to be unset when our caller is a checkingunique
345 : : * inserter.)
346 : : */
1944 pg@bowt.ie 347 [ + + ]: 33787 : if (P_RIGHTMOST(opaque) &&
348 [ + - ]: 33752 : P_ISLEAF(opaque) &&
349 [ + - ]: 33752 : !P_IGNORE(opaque) &&
2188 350 [ + + + - ]: 67314 : PageGetFreeSpace(page) > insertstate->itemsz &&
351 [ + + ]: 67124 : PageGetMaxOffsetNumber(page) >= P_HIKEY &&
352 : 33562 : _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
353 : : {
354 : : /*
355 : : * Caller can use the fastpath optimization because cached
356 : : * block is still rightmost leaf page, which can fit caller's
357 : : * new tuple without splitting. Keep block in local cache for
358 : : * next insert, and have caller use NULL stack.
359 : : *
360 : : * Note that _bt_insert_parent() has an assertion that catches
361 : : * leaf page splits that somehow follow from a fastpath insert
362 : : * (it should only be passed a NULL stack when it must deal
363 : : * with a concurrent root page split, and never because a NULL
364 : : * stack was returned here).
365 : : */
366 : 33545 : return NULL;
367 : : }
368 : :
369 : : /* Page unsuitable for caller, drop lock and pin */
370 : 242 : _bt_relbuf(rel, insertstate->buf);
371 : : }
372 : : else
373 : : {
374 : : /* Lock unavailable, drop pin */
375 : 869 : ReleaseBuffer(insertstate->buf);
376 : : }
377 : :
378 : : /* Forget block, since cache doesn't appear to be useful */
379 : 1111 : RelationSetTargetBlock(rel, InvalidBlockNumber);
380 : : }
381 : :
382 : : /* Cannot use optimization -- descend tree, return proper descent stack */
1079 andres@anarazel.de 383 : 4240443 : return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
384 : : BT_WRITE, true);
385 : : }
386 : :
387 : : /*
388 : : * _bt_check_unique() -- Check for violation of unique index constraint
389 : : *
390 : : * Returns InvalidTransactionId if there is no conflict, else an xact ID
391 : : * we must wait for to see if it commits a conflicting tuple. If an actual
392 : : * conflict is detected, no return --- just ereport(). If an xact ID is
393 : : * returned, and the conflicting tuple still has a speculative insertion in
394 : : * progress, *speculativeToken is set to non-zero, and the caller can wait for
395 : : * the verdict on the insertion using SpeculativeInsertionWait().
396 : : *
397 : : * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
398 : : * InvalidTransactionId because we don't want to wait. In this case we
399 : : * set *is_unique to false if there is a potential conflict, and the
400 : : * core code must redo the uniqueness check later.
401 : : *
402 : : * As a side-effect, sets state in insertstate that can later be used by
403 : : * _bt_findinsertloc() to reuse most of the binary search work we do
404 : : * here.
405 : : *
406 : : * This code treats NULLs as equal, unlike the default semantics for unique
407 : : * indexes. So do not call here when there are NULL values in scan key and
408 : : * the index uses the default NULLS DISTINCT mode.
409 : : */
410 : : static TransactionId
2552 pg@bowt.ie 411 : 3046139 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
412 : : IndexUniqueCheck checkUnique, bool *is_unique,
413 : : uint32 *speculativeToken)
414 : : {
415 : 3046139 : IndexTuple itup = insertstate->itup;
2101 416 : 3046139 : IndexTuple curitup = NULL;
1802 417 : 3046139 : ItemId curitemid = NULL;
2552 418 : 3046139 : BTScanInsert itup_key = insertstate->itup_key;
419 : : SnapshotData SnapshotDirty;
420 : : OffsetNumber offset;
421 : : OffsetNumber maxoff;
422 : : Page page;
423 : : BTPageOpaque opaque;
9368 tgl@sss.pgh.pa.us 424 : 3046139 : Buffer nbuf = InvalidBuffer;
6073 425 : 3046139 : bool found = false;
2209 pg@bowt.ie 426 : 3046139 : bool inposting = false;
427 : 3046139 : bool prevalldead = true;
428 : 3046139 : int curposti = 0;
429 : :
430 : : /* Assume unique until we find a duplicate */
6073 tgl@sss.pgh.pa.us 431 : 3046139 : *is_unique = true;
432 : :
6930 433 : 3046139 : InitDirtySnapshot(SnapshotDirty);
434 : :
2552 pg@bowt.ie 435 : 3046139 : page = BufferGetPage(insertstate->buf);
1444 michael@paquier.xyz 436 : 3046139 : opaque = BTPageGetOpaque(page);
9368 tgl@sss.pgh.pa.us 437 : 3046139 : maxoff = PageGetMaxOffsetNumber(page);
438 : :
439 : : /*
440 : : * Find the first tuple with the same key.
441 : : *
442 : : * This also saves the binary search bounds in insertstate. We use them
443 : : * in the fastpath below, but also in the _bt_findinsertloc() call later.
444 : : */
2537 pg@bowt.ie 445 [ - + ]: 3046139 : Assert(!insertstate->bounds_valid);
2552 446 : 3046139 : offset = _bt_binsrch_insert(rel, insertstate);
447 : :
448 : : /*
449 : : * Scan over all equal tuples, looking for live conflicts.
450 : : */
451 [ + + - + ]: 3046139 : Assert(!insertstate->bounds_valid || insertstate->low == offset);
2518 452 [ - + ]: 3046139 : Assert(!itup_key->anynullkeys);
2552 453 [ + - ]: 3046139 : Assert(itup_key->scantid == NULL);
454 : : for (;;)
455 : : {
456 : : /*
457 : : * Each iteration of the loop processes one heap TID, not one index
458 : : * tuple. Current offset number for page isn't usually advanced on
459 : : * iterations that process heap TIDs from posting list tuples.
460 : : *
461 : : * "inposting" state is set when _inside_ a posting list --- not when
462 : : * we're at the start (or end) of a posting list. We advance curposti
463 : : * at the end of the iteration when inside a posting list tuple. In
464 : : * general, every loop iteration either advances the page offset or
465 : : * advances curposti --- an iteration that handles the rightmost/max
466 : : * heap TID in a posting list finally advances the page offset (and
467 : : * unsets "inposting").
468 : : *
469 : : * Make sure the offset points to an actual index tuple before trying
470 : : * to examine it...
471 : : */
9368 tgl@sss.pgh.pa.us 472 [ + + ]: 9063481 : if (offset <= maxoff)
473 : : {
474 : : /*
475 : : * Fastpath: In most cases, we can use cached search bounds to
476 : : * limit our consideration to items that are definitely
477 : : * duplicates. This fastpath doesn't apply when the original page
478 : : * is empty, or when initial offset is past the end of the
479 : : * original page, which may indicate that we need to examine a
480 : : * second or subsequent page.
481 : : *
482 : : * Note that this optimization allows us to avoid calling
483 : : * _bt_compare() directly when there are no duplicates, as long as
484 : : * the offset where the key will go is not at the end of the page.
485 : : */
2552 pg@bowt.ie 486 [ + + + + ]: 7237575 : if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
487 : : {
488 [ - + ]: 1101905 : Assert(insertstate->bounds_valid);
489 [ + + - + ]: 1101905 : Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
490 [ - + ]: 1101905 : Assert(insertstate->low <= insertstate->stricthigh);
2518 491 [ - + ]: 1101905 : Assert(_bt_compare(rel, itup_key, page, offset) < 0);
2552 492 : 1101905 : break;
493 : : }
494 : :
495 : : /*
496 : : * We can skip items that are already marked killed.
497 : : *
498 : : * In the presence of heavy update activity an index may contain
499 : : * many killed items with the same key; running _bt_compare() on
500 : : * each killed item gets expensive. Just advance over killed
501 : : * items as quickly as we can. We only apply _bt_compare() when
502 : : * we get to a non-killed item. We could reuse the bounds to
503 : : * avoid _bt_compare() calls for known equal tuples, but it
504 : : * doesn't seem worth it.
505 : : */
2209 506 [ + + ]: 6135670 : if (!inposting)
507 : 3877058 : curitemid = PageGetItemId(page, offset);
508 [ + + + + ]: 6135670 : if (inposting || !ItemIdIsDead(curitemid))
509 : : {
510 : : ItemPointerData htid;
511 : 5838337 : bool all_dead = false;
512 : :
513 [ + + ]: 5838337 : if (!inposting)
514 : : {
515 : : /* Plain tuple, or first TID in posting list tuple */
516 [ + + ]: 3579725 : if (_bt_compare(rel, itup_key, page, offset) != 0)
517 : 105102 : break; /* we're past all the equal tuples */
518 : :
519 : : /* Advanced curitup */
520 : 3474623 : curitup = (IndexTuple) PageGetItem(page, curitemid);
521 [ - + ]: 3474623 : Assert(!BTreeTupleIsPivot(curitup));
522 : : }
523 : :
524 : : /* okay, we gotta fetch the heap tuple using htid ... */
525 [ + + ]: 5733235 : if (!BTreeTupleIsPosting(curitup))
526 : : {
527 : : /* ... htid is from simple non-pivot tuple */
528 [ - + ]: 3451055 : Assert(!inposting);
529 : 3451055 : htid = curitup->t_tid;
530 : : }
531 [ + + ]: 2282180 : else if (!inposting)
532 : : {
533 : : /* ... htid is first TID in new posting list */
534 : 23568 : inposting = true;
535 : 23568 : prevalldead = true;
536 : 23568 : curposti = 0;
537 : 23568 : htid = *BTreeTupleGetPostingN(curitup, 0);
538 : : }
539 : : else
540 : : {
541 : : /* ... htid is second or subsequent TID in posting list */
542 [ - + ]: 2258612 : Assert(curposti > 0);
543 : 2258612 : htid = *BTreeTupleGetPostingN(curitup, curposti);
544 : : }
545 : :
546 : : /*
547 : : * If we are doing a recheck, we expect to find the tuple we
548 : : * are rechecking. It's not a duplicate, but we have to keep
549 : : * scanning.
550 : : */
6073 tgl@sss.pgh.pa.us 551 [ + + + + ]: 5733353 : if (checkUnique == UNIQUE_CHECK_EXISTING &&
552 : 118 : ItemPointerCompare(&htid, &itup->t_tid) == 0)
553 : : {
554 : 27 : found = true;
555 : : }
556 : :
557 : : /*
558 : : * Check if there's any table tuples for this index entry
559 : : * satisfying SnapshotDirty. This is necessary because for AMs
560 : : * with optimizations like heap's HOT, we have just a single
561 : : * index entry for the entire chain.
562 : : */
2547 andres@anarazel.de 563 [ + + ]: 5733208 : else if (table_index_fetch_tuple_check(heapRel, &htid,
564 : : &SnapshotDirty,
565 : : &all_dead))
566 : : {
567 : : TransactionId xwait;
568 : :
569 : : /*
570 : : * It is a duplicate. If we are only doing a partial
571 : : * check, then don't bother checking if the tuple is being
572 : : * updated in another transaction. Just return the fact
573 : : * that it is a potential conflict and leave the full
574 : : * check till later. Don't invalidate binary search
575 : : * bounds.
576 : : */
6073 tgl@sss.pgh.pa.us 577 [ + + ]: 404 : if (checkUnique == UNIQUE_CHECK_PARTIAL)
578 : : {
579 [ - + ]: 128 : if (nbuf != InvalidBuffer)
6073 tgl@sss.pgh.pa.us 580 :UBC 0 : _bt_relbuf(rel, nbuf);
6073 tgl@sss.pgh.pa.us 581 :CBC 128 : *is_unique = false;
582 : 140 : return InvalidTransactionId;
583 : : }
584 : :
585 : : /*
586 : : * If this tuple is being updated by other transaction
587 : : * then we have to wait for its commit/abort.
588 : : */
589 : 552 : xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
590 [ + + ]: 276 : SnapshotDirty.xmin : SnapshotDirty.xmax;
591 : :
8696 592 [ + + ]: 276 : if (TransactionIdIsValid(xwait))
593 : : {
594 [ - + ]: 12 : if (nbuf != InvalidBuffer)
8696 tgl@sss.pgh.pa.us 595 :UBC 0 : _bt_relbuf(rel, nbuf);
596 : : /* Tell _bt_doinsert to wait... */
3964 andres@anarazel.de 597 :CBC 12 : *speculativeToken = SnapshotDirty.speculativeToken;
598 : : /* Caller releases lock on buf immediately */
2537 pg@bowt.ie 599 : 12 : insertstate->bounds_valid = false;
8696 tgl@sss.pgh.pa.us 600 : 12 : return xwait;
601 : : }
602 : :
603 : : /*
604 : : * Otherwise we have a definite conflict. But before
605 : : * complaining, look to see if the tuple we want to insert
606 : : * is itself now committed dead --- if so, don't complain.
607 : : * This is a waste of time in normal scenarios but we must
608 : : * do it to support CREATE INDEX CONCURRENTLY.
609 : : *
610 : : * We must follow HOT-chains here because during
611 : : * concurrent index build, we insert the root TID though
612 : : * the actual tuple may be somewhere in the HOT-chain.
613 : : * While following the chain we might not stop at the
614 : : * exact tuple which triggered the insert, but that's OK
615 : : * because if we find a live tuple anywhere in this chain,
616 : : * we have a unique key conflict. The other live tuple is
617 : : * not part of this chain because it had a different index
618 : : * entry.
619 : : */
2089 pg@bowt.ie 620 : 264 : htid = itup->t_tid;
621 [ - + ]: 264 : if (table_index_fetch_tuple_check(heapRel, &htid,
622 : : SnapshotSelf, NULL))
623 : : {
624 : : /* Normal case --- it's still live */
625 : : }
626 : : else
627 : : {
628 : : /*
629 : : * It's been deleted, so no error, and no need to
630 : : * continue searching
631 : : */
7142 tgl@sss.pgh.pa.us 632 :UBC 0 : break;
633 : : }
634 : :
635 : : /*
636 : : * Check for a conflict-in as we would if we were going to
637 : : * write to this page. We aren't actually going to write,
638 : : * but we want a chance to report SSI conflicts that would
639 : : * otherwise be masked by this unique constraint
640 : : * violation.
641 : : */
2238 tmunro@postgresql.or 642 :CBC 264 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
643 : :
644 : : /*
645 : : * This is a definite conflict. Break the tuple down into
646 : : * datums and report the error. But first, make sure we
647 : : * release the buffer locks we're holding ---
648 : : * BuildIndexValueDescription could make catalog accesses,
649 : : * which in the worst case might touch this same index and
650 : : * cause deadlocks.
651 : : */
6070 tgl@sss.pgh.pa.us 652 [ - + ]: 260 : if (nbuf != InvalidBuffer)
6070 tgl@sss.pgh.pa.us 653 :UBC 0 : _bt_relbuf(rel, nbuf);
2552 pg@bowt.ie 654 :CBC 260 : _bt_relbuf(rel, insertstate->buf);
655 : 260 : insertstate->buf = InvalidBuffer;
2537 656 : 260 : insertstate->bounds_valid = false;
657 : :
658 : : {
659 : : Datum values[INDEX_MAX_KEYS];
660 : : bool isnull[INDEX_MAX_KEYS];
661 : : char *key_desc;
662 : :
6070 tgl@sss.pgh.pa.us 663 : 260 : index_deform_tuple(itup, RelationGetDescr(rel),
664 : : values, isnull);
665 : :
4080 sfrost@snowman.net 666 : 260 : key_desc = BuildIndexValueDescription(rel, values,
667 : : isnull);
668 : :
6070 tgl@sss.pgh.pa.us 669 [ + - + + ]: 260 : ereport(ERROR,
670 : : (errcode(ERRCODE_UNIQUE_VIOLATION),
671 : : errmsg("duplicate key value violates unique constraint \"%s\"",
672 : : RelationGetRelationName(rel)),
673 : : key_desc ? errdetail("Key %s already exists.",
674 : : key_desc) : 0,
675 : : errtableconstraint(heapRel,
676 : : RelationGetRelationName(rel))));
677 : : }
678 : : }
2209 pg@bowt.ie 679 [ + + + + : 5732804 : else if (all_dead && (!inposting ||
+ + ]
680 : 18785 : (prevalldead &&
681 [ + + ]: 18785 : curposti == BTreeTupleGetNPosting(curitup) - 1)))
682 : : {
683 : : /*
684 : : * The conflicting tuple (or all HOT chains pointed to by
685 : : * all posting list TIDs) is dead to everyone, so try to
686 : : * mark the index entry killed. It's ok if we're not
687 : : * allowed to, this isn't required for correctness.
688 : : */
689 : : Buffer buf;
690 : :
691 : : /* Be sure to operate on the proper buffer */
6751 tgl@sss.pgh.pa.us 692 [ + + ]: 53350 : if (nbuf != InvalidBuffer)
5 andres@anarazel.de 693 :GNC 3 : buf = nbuf;
694 : : else
695 : 53347 : buf = insertstate->buf;
696 : :
697 : : /*
698 : : * Use the hint bit infrastructure to check if we can
699 : : * update the page while just holding a share lock.
700 : : *
701 : : * Can't use BufferSetHintBits16() here as we update two
702 : : * different locations.
703 : : */
704 [ + - ]: 53350 : if (BufferBeginSetHintBits(buf))
705 : : {
706 : 53350 : ItemIdMarkDead(curitemid);
707 : 53350 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
708 : 53350 : BufferFinishSetHintBits(buf, true, true);
709 : : }
710 : : }
711 : :
712 : : /*
713 : : * Remember if posting list tuple has even a single HOT chain
714 : : * whose members are not all dead
715 : : */
2209 pg@bowt.ie 716 [ + + + + ]:CBC 5732831 : if (!all_dead && inposting)
717 : 2263370 : prevalldead = false;
718 : : }
719 : : }
720 : :
721 [ + + + + ]: 7856070 : if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
722 : : {
723 : : /* Advance to next TID in same posting list */
724 : 2258612 : curposti++;
725 : 2258612 : continue;
726 : : }
727 [ + + ]: 5597458 : else if (offset < maxoff)
728 : : {
729 : : /* Advance to next tuple */
730 : 3753708 : curposti = 0;
731 : 3753708 : inposting = false;
9368 tgl@sss.pgh.pa.us 732 : 3753708 : offset = OffsetNumberNext(offset);
733 : : }
734 : : else
735 : : {
736 : : int highkeycmp;
737 : :
738 : : /* If scankey == hikey we gotta check the next page too */
739 [ + + ]: 1843750 : if (P_RIGHTMOST(opaque))
740 : 1764896 : break;
2552 pg@bowt.ie 741 : 78854 : highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
742 [ - + ]: 78854 : Assert(highkeycmp <= 0);
743 [ + + ]: 78854 : if (highkeycmp != 0)
9368 tgl@sss.pgh.pa.us 744 : 73832 : break;
745 : : /* Advance to next non-dead page --- there must be one */
746 : : for (;;)
8422 tgl@sss.pgh.pa.us 747 :UBC 0 : {
2209 pg@bowt.ie 748 :CBC 5022 : BlockNumber nblkno = opaque->btpo_next;
749 : :
7998 tgl@sss.pgh.pa.us 750 : 5022 : nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
3616 kgrittn@postgresql.o 751 : 5022 : page = BufferGetPage(nbuf);
1444 michael@paquier.xyz 752 : 5022 : opaque = BTPageGetOpaque(page);
8422 tgl@sss.pgh.pa.us 753 [ + - ]: 5022 : if (!P_IGNORE(opaque))
754 : 5022 : break;
8422 tgl@sss.pgh.pa.us 755 [ # # ]:UBC 0 : if (P_RIGHTMOST(opaque))
6649 756 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
757 : : RelationGetRelationName(rel));
758 : : }
759 : : /* Will also advance to next tuple */
2209 pg@bowt.ie 760 :CBC 5022 : curposti = 0;
761 : 5022 : inposting = false;
9368 tgl@sss.pgh.pa.us 762 : 5022 : maxoff = PageGetMaxOffsetNumber(page);
763 [ + + ]: 5022 : offset = P_FIRSTDATAKEY(opaque);
764 : : /* Don't invalidate binary search bounds */
765 : : }
766 : : }
767 : :
768 : : /*
769 : : * If we are doing a recheck then we should have found the tuple we are
770 : : * checking. Otherwise there's something very wrong --- probably, the
771 : : * index is on a non-immutable expression.
772 : : */
6073 773 [ + + - + ]: 3045735 : if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
6073 tgl@sss.pgh.pa.us 774 [ # # ]:UBC 0 : ereport(ERROR,
775 : : (errcode(ERRCODE_INTERNAL_ERROR),
776 : : errmsg("failed to re-find tuple within index \"%s\"",
777 : : RelationGetRelationName(rel)),
778 : : errhint("This may be because of a non-immutable index expression."),
779 : : errtableconstraint(heapRel,
780 : : RelationGetRelationName(rel))));
781 : :
9368 tgl@sss.pgh.pa.us 782 [ + + ]:CBC 3045735 : if (nbuf != InvalidBuffer)
9009 783 : 2880 : _bt_relbuf(rel, nbuf);
784 : :
8970 785 : 3045735 : return InvalidTransactionId;
786 : : }
787 : :
788 : :
789 : : /*
790 : : * _bt_findinsertloc() -- Finds an insert location for a tuple
791 : : *
792 : : * On entry, insertstate buffer contains the page the new tuple belongs
793 : : * on. It is exclusive-locked and pinned by the caller.
794 : : *
795 : : * If 'checkingunique' is true, the buffer on entry is the first page
796 : : * that contains duplicates of the new key. If there are duplicates on
797 : : * multiple pages, the correct insertion position might be some page to
798 : : * the right, rather than the first page. In that case, this function
799 : : * moves right to the correct target page.
800 : : *
801 : : * (In a !heapkeyspace index, there can be multiple pages with the same
802 : : * high key, where the new tuple could legitimately be placed on. In
803 : : * that case, the caller passes the first page containing duplicates,
804 : : * just like when checkingunique=true. If that page doesn't have enough
805 : : * room for the new tuple, this function moves right, trying to find a
806 : : * legal page that does.)
807 : : *
808 : : * If 'indexUnchanged' is true, this is for an UPDATE that didn't
809 : : * logically change the indexed value, but must nevertheless have a new
810 : : * entry to point to a successor version. This hint from the executor
811 : : * will influence our behavior when the page might have to be split and
812 : : * we must consider our options. Bottom-up index deletion can avoid
813 : : * pathological version-driven page splits, but we only want to go to the
814 : : * trouble of trying it when we already have moderate confidence that
815 : : * it's appropriate. The hint should not significantly affect our
816 : : * behavior over time unless practically all inserts on to the leaf page
817 : : * get the hint.
818 : : *
819 : : * On exit, insertstate buffer contains the chosen insertion page, and
820 : : * the offset within that page is returned. If _bt_findinsertloc needed
821 : : * to move right, the lock and pin on the original page are released, and
822 : : * the new buffer is exclusively locked and pinned instead.
823 : : *
824 : : * If insertstate contains cached binary search bounds, we will take
825 : : * advantage of them. This avoids repeating comparisons that we made in
826 : : * _bt_check_unique() already.
827 : : */
828 : : static OffsetNumber
6952 bruce@momjian.us 829 : 4273682 : _bt_findinsertloc(Relation rel,
830 : : BTInsertState insertstate,
831 : : bool checkingunique,
832 : : bool indexUnchanged,
833 : : BTStack stack,
834 : : Relation heapRel)
835 : : {
2552 pg@bowt.ie 836 : 4273682 : BTScanInsert itup_key = insertstate->itup_key;
837 : 4273682 : Page page = BufferGetPage(insertstate->buf);
838 : : BTPageOpaque opaque;
839 : : OffsetNumber newitemoff;
840 : :
1444 michael@paquier.xyz 841 : 4273682 : opaque = BTPageGetOpaque(page);
842 : :
843 : : /* Check 1/3 of a page restriction */
369 pg@bowt.ie 844 [ - + ]: 4273682 : if (unlikely(insertstate->itemsz > BTMaxItemSize))
2552 pg@bowt.ie 845 :UBC 0 : _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
846 : : insertstate->itup);
847 : :
1944 pg@bowt.ie 848 [ + - - + ]:CBC 4273682 : Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
2552 849 [ + + - + ]: 4273682 : Assert(!insertstate->bounds_valid || checkingunique);
850 [ + - - + ]: 4273682 : Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
851 [ - + - - ]: 4273682 : Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
2209 852 [ + + - + ]: 4273682 : Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
853 : :
2552 854 [ + - ]: 4273682 : if (itup_key->heapkeyspace)
855 : : {
856 : : /* Keep track of whether checkingunique duplicate seen */
1887 857 : 4273682 : bool uniquedup = indexUnchanged;
858 : :
859 : : /*
860 : : * If we're inserting into a unique index, we may have to walk right
861 : : * through leaf pages to find the one leaf page that we must insert on
862 : : * to.
863 : : *
864 : : * This is needed for checkingunique callers because a scantid was not
865 : : * used when we called _bt_search(). scantid can only be set after
866 : : * _bt_check_unique() has checked for duplicates. The buffer
867 : : * initially stored in insertstate->buf has the page where the first
868 : : * duplicate key might be found, which isn't always the page that new
869 : : * tuple belongs on. The heap TID attribute for new tuple (scantid)
870 : : * could force us to insert on a sibling page, though that should be
871 : : * very rare in practice.
872 : : */
2552 873 [ + + ]: 4273682 : if (checkingunique)
874 : : {
2209 875 [ + + ]: 3045833 : if (insertstate->low < insertstate->stricthigh)
876 : : {
877 : : /* Encountered a duplicate in _bt_check_unique() */
878 [ - + ]: 225450 : Assert(insertstate->bounds_valid);
879 : 225450 : uniquedup = true;
880 : : }
881 : :
882 : : for (;;)
883 : : {
884 : : /*
885 : : * Does the new tuple belong on this page?
886 : : *
887 : : * The earlier _bt_check_unique() call may well have
888 : : * established a strict upper bound on the offset for the new
889 : : * item. If it's not the last item of the page (i.e. if there
890 : : * is at least one tuple on the page that goes after the tuple
891 : : * we're inserting) then we know that the tuple belongs on
892 : : * this page. We can skip the high key check.
893 : : */
2552 894 [ + + ]: 3050855 : if (insertstate->bounds_valid &&
895 [ + - + + ]: 6080736 : insertstate->low <= insertstate->stricthigh &&
896 : 3040368 : insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
897 : 1193245 : break;
898 : :
899 : : /* Test '<=', not '!=', since scantid is set now */
1944 900 [ + + + + ]: 1944307 : if (P_RIGHTMOST(opaque) ||
2552 901 : 86697 : _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
902 : : break;
903 : :
1079 andres@anarazel.de 904 : 5022 : _bt_stepright(rel, heapRel, insertstate, stack);
905 : : /* Update local state after stepping right */
2552 pg@bowt.ie 906 : 5022 : page = BufferGetPage(insertstate->buf);
1444 michael@paquier.xyz 907 : 5022 : opaque = BTPageGetOpaque(page);
908 : : /* Assume duplicates (if checkingunique) */
2209 pg@bowt.ie 909 : 5022 : uniquedup = true;
910 : : }
911 : : }
912 : :
913 : : /*
914 : : * If the target page cannot fit newitem, try to avoid splitting the
915 : : * page on insert by performing deletion or deduplication now
916 : : */
917 [ + + ]: 4273682 : if (PageGetFreeSpace(page) < insertstate->itemsz)
1944 918 : 28227 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
919 : : checkingunique, uniquedup,
920 : : indexUnchanged);
921 : : }
922 : : else
923 : : {
924 : : /*----------
925 : : * This is a !heapkeyspace (version 2 or 3) index. The current page
926 : : * is the first page that we could insert the new tuple to, but there
927 : : * may be other pages to the right that we could opt to use instead.
928 : : *
929 : : * If the new key is equal to one or more existing keys, we can
930 : : * legitimately place it anywhere in the series of equal keys. In
931 : : * fact, if the new key is equal to the page's "high key" we can place
932 : : * it on the next page. If it is equal to the high key, and there's
933 : : * not room to insert the new tuple on the current page without
934 : : * splitting, then we move right hoping to find more free space and
935 : : * avoid a split.
936 : : *
937 : : * Keep scanning right until we
938 : : * (a) find a page with enough free space,
939 : : * (b) reach the last page where the tuple can legally go, or
940 : : * (c) get tired of searching.
941 : : * (c) is not flippant; it is important because if there are many
942 : : * pages' worth of equal keys, it's better to split one of the early
943 : : * pages than to scan all the way to the end of the run of equal keys
944 : : * on every insert. We implement "get tired" as a random choice,
945 : : * since stopping after scanning a fixed number of pages wouldn't work
946 : : * well (we'd never reach the right-hand side of previously split
947 : : * pages). The probability of moving right is set at 0.99, which may
948 : : * seem too high to change the behavior much, but it does an excellent
949 : : * job of preventing O(N^2) behavior with many equal keys.
950 : : *----------
951 : : */
2552 pg@bowt.ie 952 [ # # ]:UBC 0 : while (PageGetFreeSpace(page) < insertstate->itemsz)
953 : : {
954 : : /*
955 : : * Before considering moving right, see if we can obtain enough
956 : : * space by erasing LP_DEAD items
957 : : */
1944 958 [ # # ]: 0 : if (P_HAS_GARBAGE(opaque))
959 : : {
960 : : /* Perform simple deletion */
961 : 0 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
962 : : false, false, false);
963 : :
2552 964 [ # # ]: 0 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
965 : 0 : break; /* OK, now we have enough space */
966 : : }
967 : :
968 : : /*
969 : : * Nope, so check conditions (b) and (c) enumerated above
970 : : *
971 : : * The earlier _bt_check_unique() call may well have established a
972 : : * strict upper bound on the offset for the new item. If it's not
973 : : * the last item of the page (i.e. if there is at least one tuple
974 : : * on the page that's greater than the tuple we're inserting to)
975 : : * then we know that the tuple belongs on this page. We can skip
976 : : * the high key check.
977 : : */
978 [ # # ]: 0 : if (insertstate->bounds_valid &&
979 [ # # # # ]: 0 : insertstate->low <= insertstate->stricthigh &&
980 : 0 : insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
981 : 0 : break;
982 : :
1944 983 [ # # # # ]: 0 : if (P_RIGHTMOST(opaque) ||
2552 984 [ # # ]: 0 : _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
1568 tgl@sss.pgh.pa.us 985 : 0 : pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
986 : : break;
987 : :
1079 andres@anarazel.de 988 : 0 : _bt_stepright(rel, heapRel, insertstate, stack);
989 : : /* Update local state after stepping right */
2552 pg@bowt.ie 990 : 0 : page = BufferGetPage(insertstate->buf);
1444 michael@paquier.xyz 991 : 0 : opaque = BTPageGetOpaque(page);
992 : : }
993 : : }
994 : :
995 : : /*
996 : : * We should now be on the correct page. Find the offset within the page
997 : : * for the new tuple. (Possibly reusing earlier search bounds.)
998 : : */
1944 pg@bowt.ie 999 [ + + - + ]:CBC 4273682 : Assert(P_RIGHTMOST(opaque) ||
1000 : : _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
1001 : :
2209 1002 : 4273682 : newitemoff = _bt_binsrch_insert(rel, insertstate);
1003 : :
1004 [ + + ]: 4273682 : if (insertstate->postingoff == -1)
1005 : : {
1006 : : /*
1007 : : * There is an overlapping posting list tuple with its LP_DEAD bit
1008 : : * set. We don't want to unnecessarily unset its LP_DEAD bit while
1009 : : * performing a posting list split, so perform simple index tuple
1010 : : * deletion early.
1011 : : */
1944 pg@bowt.ie 1012 :GBC 5 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1013 : : false, false, false);
1014 : :
1015 : : /*
1016 : : * Do new binary search. New insert location cannot overlap with any
1017 : : * posting list now.
1018 : : */
1019 [ - + ]: 5 : Assert(!insertstate->bounds_valid);
2209 1020 : 5 : insertstate->postingoff = 0;
1021 : 5 : newitemoff = _bt_binsrch_insert(rel, insertstate);
1022 [ - + ]: 5 : Assert(insertstate->postingoff == 0);
1023 : : }
1024 : :
2209 pg@bowt.ie 1025 :CBC 4273682 : return newitemoff;
1026 : : }
1027 : :
1028 : : /*
1029 : : * Step right to next non-dead page, during insertion.
1030 : : *
1031 : : * This is a bit more complicated than moving right in a search. We must
1032 : : * write-lock the target page before releasing write lock on current page;
1033 : : * else someone else's _bt_check_unique scan could fail to see our insertion.
1034 : : * Write locks on intermediate dead pages won't do because we don't know when
1035 : : * they will get de-linked from the tree.
1036 : : *
1037 : : * This is more aggressive than it needs to be for non-unique !heapkeyspace
1038 : : * indexes.
1039 : : */
1040 : : static void
1009 1041 : 5022 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1042 : : BTStack stack)
1043 : : {
1044 : : Page page;
1045 : : BTPageOpaque opaque;
1046 : : Buffer rbuf;
1047 : : BlockNumber rblkno;
1048 : :
1049 [ - + ]: 5022 : Assert(heaprel != NULL);
2552 1050 : 5022 : page = BufferGetPage(insertstate->buf);
1444 michael@paquier.xyz 1051 : 5022 : opaque = BTPageGetOpaque(page);
1052 : :
2552 pg@bowt.ie 1053 : 5022 : rbuf = InvalidBuffer;
1944 1054 : 5022 : rblkno = opaque->btpo_next;
1055 : : for (;;)
1056 : : {
2552 1057 : 5022 : rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1058 : 5022 : page = BufferGetPage(rbuf);
1444 michael@paquier.xyz 1059 : 5022 : opaque = BTPageGetOpaque(page);
1060 : :
1061 : : /*
1062 : : * If this page was incompletely split, finish the split now. We do
1063 : : * this while holding a lock on the left sibling, which is not good
1064 : : * because finishing the split could be a fairly lengthy operation.
1065 : : * But this should happen very seldom.
1066 : : */
1944 pg@bowt.ie 1067 [ - + ]: 5022 : if (P_INCOMPLETE_SPLIT(opaque))
1068 : : {
1079 andres@anarazel.de 1069 :UBC 0 : _bt_finish_split(rel, heaprel, rbuf, stack);
2552 pg@bowt.ie 1070 : 0 : rbuf = InvalidBuffer;
1071 : 0 : continue;
1072 : : }
1073 : :
1944 pg@bowt.ie 1074 [ + - ]:CBC 5022 : if (!P_IGNORE(opaque))
2552 1075 : 5022 : break;
1944 pg@bowt.ie 1076 [ # # ]:UBC 0 : if (P_RIGHTMOST(opaque))
2552 1077 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
1078 : : RelationGetRelationName(rel));
1079 : :
1944 1080 : 0 : rblkno = opaque->btpo_next;
1081 : : }
1082 : : /* rbuf locked; unlock buf, update state for caller */
2552 pg@bowt.ie 1083 :CBC 5022 : _bt_relbuf(rel, insertstate->buf);
1084 : 5022 : insertstate->buf = rbuf;
1085 : 5022 : insertstate->bounds_valid = false;
6952 bruce@momjian.us 1086 : 5022 : }
1087 : :
1088 : : /*----------
1089 : : * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1090 : : *
1091 : : * This recursive procedure does the following things:
1092 : : *
1093 : : * + if postingoff != 0, splits existing posting list tuple
1094 : : * (since it overlaps with new 'itup' tuple).
1095 : : * + if necessary, splits the target page, using 'itup_key' for
1096 : : * suffix truncation on leaf pages (caller passes NULL for
1097 : : * non-leaf pages).
1098 : : * + inserts the new tuple (might be split from posting list).
1099 : : * + if the page was split, pops the parent stack, and finds the
1100 : : * right place to insert the new child pointer (by walking
1101 : : * right using information stored in the parent stack).
1102 : : * + invokes itself with the appropriate tuple for the right
1103 : : * child page on the parent.
1104 : : * + updates the metapage if a true root or fast root is split.
1105 : : *
1106 : : * On entry, we must have the correct buffer in which to do the
1107 : : * insertion, and the buffer must be pinned and write-locked. On return,
1108 : : * we will have dropped both the pin and the lock on the buffer.
1109 : : *
1110 : : * This routine only performs retail tuple insertions. 'itup' should
1111 : : * always be either a non-highkey leaf item, or a downlink (new high
1112 : : * key items are created indirectly, when a page is split). When
1113 : : * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1114 : : * we're inserting the downlink for. This function will clear the
1115 : : * INCOMPLETE_SPLIT flag on it, and release the buffer.
1116 : : *----------
1117 : : */
1118 : : static void
1119 : 4286212 : _bt_insertonpg(Relation rel,
1120 : : Relation heaprel,
1121 : : BTScanInsert itup_key,
1122 : : Buffer buf,
1123 : : Buffer cbuf,
1124 : : BTStack stack,
1125 : : IndexTuple itup,
1126 : : Size itemsz,
1127 : : OffsetNumber newitemoff,
1128 : : int postingoff,
1129 : : bool split_only_page)
1130 : : {
1131 : : Page page;
1132 : : BTPageOpaque opaque;
1133 : : bool isleaf,
1134 : : isroot,
1135 : : isrightmost,
1136 : : isonly;
2209 pg@bowt.ie 1137 : 4286212 : IndexTuple oposting = NULL;
1138 : 4286212 : IndexTuple origitup = NULL;
1139 : 4286212 : IndexTuple nposting = NULL;
1140 : : XLogRecPtr recptr;
1141 : :
3616 kgrittn@postgresql.o 1142 : 4286212 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 1143 : 4286212 : opaque = BTPageGetOpaque(page);
1944 pg@bowt.ie 1144 : 4286212 : isleaf = P_ISLEAF(opaque);
1145 : 4286212 : isroot = P_ISROOT(opaque);
1146 : 4286212 : isrightmost = P_RIGHTMOST(opaque);
1147 [ + + + + ]: 4286212 : isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1148 : :
1149 : : /* child buffer must be given iff inserting on an internal page */
1150 [ - + ]: 4286212 : Assert(isleaf == !BufferIsValid(cbuf));
1151 : : /* tuple must have appropriate number of attributes */
1152 [ + + - + : 4286212 : Assert(!isleaf ||
- - ]
1153 : : BTreeTupleGetNAtts(itup, rel) ==
1154 : : IndexRelationGetNumberOfAttributes(rel));
1155 [ + + + - : 4286212 : Assert(isleaf ||
- + ]
1156 : : BTreeTupleGetNAtts(itup, rel) <=
1157 : : IndexRelationGetNumberOfKeyAttributes(rel));
2209 1158 [ - + ]: 4286212 : Assert(!BTreeTupleIsPosting(itup));
2190 1159 [ - + ]: 4286212 : Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1160 : : /* Caller must always finish incomplete split for us */
1944 1161 [ - + ]: 4286212 : Assert(!P_INCOMPLETE_SPLIT(opaque));
1162 : :
1163 : : /*
1164 : : * Every internal page should have exactly one negative infinity item at
1165 : : * all times. Only _bt_split() and _bt_newlevel() should add items that
1166 : : * become negative infinity items through truncation, since they're the
1167 : : * only routines that allocate new internal pages.
1168 : : */
1169 [ + + + + : 4286212 : Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
- + ]
1170 : :
1171 : : /*
1172 : : * Do we need to split an existing posting list item?
1173 : : */
2209 1174 [ + + ]: 4286212 : if (postingoff != 0)
1175 : : {
1176 : 16406 : ItemId itemid = PageGetItemId(page, newitemoff);
1177 : :
1178 : : /*
1179 : : * The new tuple is a duplicate with a heap TID that falls inside the
1180 : : * range of an existing posting list tuple on a leaf page. Prepare to
1181 : : * split an existing posting list. Overwriting the posting list with
1182 : : * its post-split version is treated as an extra step in either the
1183 : : * insert or page split critical section.
1184 : : */
1600 1185 [ + - + - : 16406 : Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
- + ]
2209 1186 : 16406 : oposting = (IndexTuple) PageGetItem(page, itemid);
1187 : :
1188 : : /*
1189 : : * postingoff value comes from earlier call to _bt_binsrch_posting().
1190 : : * Its binary search might think that a plain tuple must be a posting
1191 : : * list tuple that needs to be split. This can happen with corruption
1192 : : * involving an existing plain tuple that is a duplicate of the new
1193 : : * item, up to and including its table TID. Check for that here in
1194 : : * passing.
1195 : : *
1196 : : * Also verify that our caller has made sure that the existing posting
1197 : : * list tuple does not have its LP_DEAD bit set.
1198 : : */
1600 1199 [ + - - + ]: 16406 : if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1600 pg@bowt.ie 1200 [ # # ]:UBC 0 : ereport(ERROR,
1201 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1202 : : errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1203 : : ItemPointerGetBlockNumber(&itup->t_tid),
1204 : : ItemPointerGetOffsetNumber(&itup->t_tid),
1205 : : newitemoff, BufferGetBlockNumber(buf),
1206 : : RelationGetRelationName(rel))));
1207 : :
1208 : : /* use a mutable copy of itup as our itup from here on */
2209 pg@bowt.ie 1209 :CBC 16406 : origitup = itup;
1210 : 16406 : itup = CopyIndexTuple(origitup);
1211 : 16406 : nposting = _bt_swap_posting(itup, oposting, postingoff);
1212 : : /* itup now contains rightmost/max TID from oposting */
1213 : :
1214 : : /* Alter offset so that newitem goes after posting list */
1215 : 16406 : newitemoff = OffsetNumberNext(newitemoff);
1216 : : }
1217 : :
1218 : : /*
1219 : : * Do we need to split the page to fit the item on it?
1220 : : *
1221 : : * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1222 : : * so this comparison is correct even though we appear to be accounting
1223 : : * only for the item and not for its line pointer.
1224 : : */
9368 tgl@sss.pgh.pa.us 1225 [ + + ]: 4286212 : if (PageGetFreeSpace(page) < itemsz)
1226 : : {
1227 : : Buffer rbuf;
1228 : :
2162 pg@bowt.ie 1229 [ - + ]: 13241 : Assert(!split_only_page);
1230 : :
1231 : : /* split the buffer into left and right halves */
1079 andres@anarazel.de 1232 : 13241 : rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1233 : : itup, origitup, nposting, postingoff);
5515 heikki.linnakangas@i 1234 : 13241 : PredicateLockPageSplit(rel,
1235 : : BufferGetBlockNumber(buf),
1236 : : BufferGetBlockNumber(rbuf));
1237 : :
1238 : : /*----------
1239 : : * By here,
1240 : : *
1241 : : * + our target page has been split;
1242 : : * + the original tuple has been inserted;
1243 : : * + we have write locks on both the old (left half)
1244 : : * and new (right half) buffers, after the split; and
1245 : : * + we know the key we want to insert into the parent
1246 : : * (it's the "high key" on the left child page).
1247 : : *
1248 : : * We're ready to do the parent insertion. We need to hold onto the
1249 : : * locks for the child pages until we locate the parent, but we can
1250 : : * at least release the lock on the right child before doing the
1251 : : * actual insertion. The lock on the left child will be released
1252 : : * last of all by parent insertion, where it is the 'cbuf' of parent
1253 : : * page.
1254 : : *----------
1255 : : */
1256 : : #ifdef USE_INJECTION_POINTS
103 heikki.linnakangas@i 1257 [ + + ]:GNC 13241 : if (P_ISLEAF(opaque))
1258 : 13083 : INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
1259 : : else
1260 : 158 : INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
1261 : : #endif
1262 : :
1079 andres@anarazel.de 1263 :CBC 13239 : _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1264 : : }
1265 : : else
1266 : : {
8423 tgl@sss.pgh.pa.us 1267 : 4272971 : Buffer metabuf = InvalidBuffer;
1268 : 4272971 : Page metapg = NULL;
1269 : 4272971 : BTMetaPageData *metad = NULL;
1270 : : BlockNumber blockcache;
1271 : :
1272 : : /*
1273 : : * If we are doing this insert because we split a page that was the
1274 : : * only one on its tree level, but was not the root, it may have been
1275 : : * the "fast root". We need to ensure that the fast root link points
1276 : : * at or above the current page. We can safely acquire a lock on the
1277 : : * metapage here --- see comments for _bt_newlevel().
1278 : : */
1944 pg@bowt.ie 1279 [ + + ]: 4272971 : if (unlikely(split_only_page))
1280 : : {
2188 1281 [ - + ]: 9 : Assert(!isleaf);
1282 [ - + ]: 9 : Assert(BufferIsValid(cbuf));
1283 : :
1009 1284 : 9 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3616 kgrittn@postgresql.o 1285 : 9 : metapg = BufferGetPage(metabuf);
8423 tgl@sss.pgh.pa.us 1286 : 9 : metad = BTPageGetMeta(metapg);
1287 : :
1845 pg@bowt.ie 1288 [ - + ]: 9 : if (metad->btm_fastlevel >= opaque->btpo_level)
1289 : : {
1290 : : /* no update wanted */
8423 tgl@sss.pgh.pa.us 1291 :UBC 0 : _bt_relbuf(rel, metabuf);
1292 : 0 : metabuf = InvalidBuffer;
1293 : : }
1294 : : }
1295 : :
1296 : : /* Do the update. No ereport(ERROR) until changes are logged */
8423 tgl@sss.pgh.pa.us 1297 :CBC 4272971 : START_CRIT_SECTION();
1298 : :
2209 pg@bowt.ie 1299 [ + + ]: 4272971 : if (postingoff != 0)
1300 : 16377 : memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1301 : :
139 peter@eisentraut.org 1302 [ - + ]:GNC 4272971 : if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
5677 tgl@sss.pgh.pa.us 1303 [ # # ]:UBC 0 : elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1304 : : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1305 : :
7289 tgl@sss.pgh.pa.us 1306 :CBC 4272971 : MarkBufferDirty(buf);
1307 : :
8423 1308 [ + + ]: 4272971 : if (BufferIsValid(metabuf))
1309 : : {
1310 : : /* upgrade meta-page if needed */
2552 pg@bowt.ie 1311 [ - + ]: 9 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2902 teodor@sigaev.ru 1312 :UBC 0 : _bt_upgrademetapage(metapg);
2189 pg@bowt.ie 1313 :CBC 9 : metad->btm_fastroot = BufferGetBlockNumber(buf);
1845 1314 : 9 : metad->btm_fastlevel = opaque->btpo_level;
7289 tgl@sss.pgh.pa.us 1315 : 9 : MarkBufferDirty(metabuf);
1316 : : }
1317 : :
1318 : : /*
1319 : : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1320 : : * finishes a split
1321 : : */
2162 pg@bowt.ie 1322 [ + + ]: 4272971 : if (!isleaf)
1323 : : {
3616 kgrittn@postgresql.o 1324 : 12372 : Page cpage = BufferGetPage(cbuf);
1444 michael@paquier.xyz 1325 : 12372 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1326 : :
4380 heikki.linnakangas@i 1327 [ - + ]: 12372 : Assert(P_INCOMPLETE_SPLIT(cpageop));
1328 : 12372 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1329 : 12372 : MarkBufferDirty(cbuf);
1330 : : }
1331 : :
1332 : : /* XLOG stuff */
5571 rhaas@postgresql.org 1333 [ + + + + : 4272971 : if (RelationNeedsWAL(rel))
+ + + + ]
8423 tgl@sss.pgh.pa.us 1334 :GIC 3932973 : {
1335 : : xl_btree_insert xlrec;
1336 : : xl_btree_metadata xlmeta;
1337 : : uint8 xlinfo;
1338 : : uint16 upostingoff;
1339 : :
2189 pg@bowt.ie 1340 :CBC 3932973 : xlrec.offnum = newitemoff;
1341 : :
4133 heikki.linnakangas@i 1342 : 3932973 : XLogBeginInsert();
397 peter@eisentraut.org 1343 : 3932973 : XLogRegisterData(&xlrec, SizeOfBtreeInsert);
1344 : :
2188 pg@bowt.ie 1345 [ + + + + ]: 3932973 : if (isleaf && postingoff == 0)
1346 : : {
1347 : : /* Simple leaf insert */
7276 tgl@sss.pgh.pa.us 1348 : 3905120 : xlinfo = XLOG_BTREE_INSERT_LEAF;
1349 : : }
2209 pg@bowt.ie 1350 [ + + ]: 27853 : else if (postingoff != 0)
1351 : : {
1352 : : /*
1353 : : * Leaf insert with posting list split. Must include
1354 : : * postingoff field before newitem/orignewitem.
1355 : : */
2161 1356 [ - + ]: 16377 : Assert(isleaf);
2209 1357 : 16377 : xlinfo = XLOG_BTREE_INSERT_POST;
1358 : : }
1359 : : else
1360 : : {
1361 : : /* Internal page insert, which finishes a split on cbuf */
7276 tgl@sss.pgh.pa.us 1362 : 11476 : xlinfo = XLOG_BTREE_INSERT_UPPER;
2162 pg@bowt.ie 1363 : 11476 : XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1364 : :
2161 1365 [ + + ]: 11476 : if (BufferIsValid(metabuf))
1366 : : {
1367 : : /* Actually, it's an internal page insert + meta update */
1368 : 9 : xlinfo = XLOG_BTREE_INSERT_META;
1369 : :
1370 [ - + ]: 9 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1371 : 9 : xlmeta.version = metad->btm_version;
1372 : 9 : xlmeta.root = metad->btm_root;
1373 : 9 : xlmeta.level = metad->btm_level;
1374 : 9 : xlmeta.fastroot = metad->btm_fastroot;
1375 : 9 : xlmeta.fastlevel = metad->btm_fastlevel;
1845 1376 : 9 : xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2161 1377 : 9 : xlmeta.allequalimage = metad->btm_allequalimage;
1378 : :
1379 : 9 : XLogRegisterBuffer(2, metabuf,
1380 : : REGBUF_WILL_INIT | REGBUF_STANDARD);
397 peter@eisentraut.org 1381 : 9 : XLogRegisterBufData(2, &xlmeta,
1382 : : sizeof(xl_btree_metadata));
1383 : : }
1384 : : }
1385 : :
4133 heikki.linnakangas@i 1386 : 3932973 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
2209 pg@bowt.ie 1387 [ + + ]: 3932973 : if (postingoff == 0)
1388 : : {
1389 : : /* Just log itup from caller */
397 peter@eisentraut.org 1390 : 3916596 : XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1391 : : }
1392 : : else
1393 : : {
1394 : : /*
1395 : : * Insert with posting list split (XLOG_BTREE_INSERT_POST
1396 : : * record) case.
1397 : : *
1398 : : * Log postingoff. Also log origitup, not itup. REDO routine
1399 : : * must reconstruct final itup (as well as nposting) using
1400 : : * _bt_swap_posting().
1401 : : */
2145 pg@bowt.ie 1402 : 16377 : upostingoff = postingoff;
1403 : :
397 peter@eisentraut.org 1404 : 16377 : XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
1405 : 16377 : XLogRegisterBufData(0, origitup,
2209 pg@bowt.ie 1406 : 16377 : IndexTupleSize(origitup));
1407 : : }
1408 : :
4133 heikki.linnakangas@i 1409 : 3932973 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1410 : : }
1411 : : else
2 pg@bowt.ie 1412 :GNC 339998 : recptr = XLogGetFakeLSN(rel);
1413 : :
1414 [ + + ]: 4272971 : if (BufferIsValid(metabuf))
1415 : 9 : PageSetLSN(metapg, recptr);
1416 [ + + ]: 4272971 : if (!isleaf)
1417 : 12372 : PageSetLSN(BufferGetPage(cbuf), recptr);
1418 : :
1419 : 4272971 : PageSetLSN(page, recptr);
1420 : :
8423 tgl@sss.pgh.pa.us 1421 [ - + ]:CBC 4272971 : END_CRIT_SECTION();
1422 : :
1423 : : /* Release subsidiary buffers */
1424 [ + + ]: 4272971 : if (BufferIsValid(metabuf))
7289 1425 : 9 : _bt_relbuf(rel, metabuf);
2162 pg@bowt.ie 1426 [ + + ]: 4272971 : if (!isleaf)
4380 heikki.linnakangas@i 1427 : 12372 : _bt_relbuf(rel, cbuf);
1428 : :
1429 : : /*
1430 : : * Cache the block number if this is the rightmost leaf page. Cache
1431 : : * may be used by a future inserter within _bt_search_insert().
1432 : : */
2188 pg@bowt.ie 1433 : 4272971 : blockcache = InvalidBlockNumber;
1944 1434 [ + + + + : 4272971 : if (isrightmost && isleaf && !isroot)
+ + ]
2188 1435 : 2532669 : blockcache = BufferGetBlockNumber(buf);
1436 : :
1437 : : /* Release buffer for insertion target block */
7289 tgl@sss.pgh.pa.us 1438 : 4272971 : _bt_relbuf(rel, buf);
1439 : :
1440 : : /*
1441 : : * If we decided to cache the insertion target block before releasing
1442 : : * its buffer lock, then cache it now. Check the height of the tree
1443 : : * first, though. We don't go for the optimization with small
1444 : : * indexes. Defer final check to this point to ensure that we don't
1445 : : * call _bt_getrootheight while holding a buffer lock.
1446 : : */
2188 pg@bowt.ie 1447 [ + + + + ]: 6805640 : if (BlockNumberIsValid(blockcache) &&
1009 1448 : 2532669 : _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
2188 1449 : 34672 : RelationSetTargetBlock(rel, blockcache);
1450 : : }
1451 : :
1452 : : /* be tidy */
2209 1453 [ + + ]: 4286209 : if (postingoff != 0)
1454 : : {
1455 : : /* itup is actually a modified copy of caller's original */
1456 : 16406 : pfree(nposting);
1457 : 16406 : pfree(itup);
1458 : : }
10841 scrappy@hub.org 1459 : 4286209 : }
1460 : :
1461 : : /*
1462 : : * _bt_split() -- split a page in the btree.
1463 : : *
1464 : : * On entry, buf is the page to split, and is pinned and write-locked.
1465 : : * newitemoff etc. tell us about the new item that must be inserted
1466 : : * along with the data from the original page.
1467 : : *
1468 : : * itup_key is used for suffix truncation on leaf pages (internal
1469 : : * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1470 : : * is the left-sibling of the page we're inserting the downlink for.
1471 : : * This function will clear the INCOMPLETE_SPLIT flag on it, and
1472 : : * release the buffer.
1473 : : *
1474 : : * orignewitem, nposting, and postingoff are needed when an insert of
1475 : : * orignewitem results in both a posting list split and a page split.
1476 : : * These extra posting list split details are used here in the same
1477 : : * way as they are used in the more common case where a posting list
1478 : : * split does not coincide with a page split. We need to deal with
1479 : : * posting list splits directly in order to ensure that everything
1480 : : * that follows from the insert of orignewitem is handled as a single
1481 : : * atomic operation (though caller's insert of a new pivot/downlink
1482 : : * into parent page will still be a separate operation). See
1483 : : * nbtree/README for details on the design of posting list splits.
1484 : : *
1485 : : * Returns the new right sibling of buf, pinned and write-locked.
1486 : : * The pin and lock on buf are maintained.
1487 : : */
1488 : : static Buffer
1079 andres@anarazel.de 1489 : 13241 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1490 : : Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1491 : : IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1492 : : {
1493 : : Buffer rbuf;
1494 : : Page origpage;
1495 : : Page leftpage,
1496 : : rightpage;
1497 : : PGAlignedBlock leftpage_buf,
1498 : : rightpage_buf;
1499 : : BlockNumber origpagenumber,
1500 : : rightpagenumber;
1501 : : BTPageOpaque ropaque,
1502 : : lopaque,
1503 : : oopaque;
8422 tgl@sss.pgh.pa.us 1504 : 13241 : Buffer sbuf = InvalidBuffer;
1505 : 13241 : Page spage = NULL;
1506 : 13241 : BTPageOpaque sopaque = NULL;
1507 : : Size itemsz;
1508 : : ItemId itemid;
1509 : : IndexTuple firstright,
1510 : : lefthighkey;
1511 : : OffsetNumber firstrightoff;
1512 : : OffsetNumber afterleftoff,
1513 : : afterrightoff,
1514 : : minusinfoff;
1515 : : OffsetNumber origpagepostingoff;
1516 : : OffsetNumber maxoff;
1517 : : OffsetNumber i;
1518 : : bool newitemonleft,
1519 : : isleaf,
1520 : : isrightmost;
1521 : : XLogRecPtr recptr;
1522 : :
1523 : : /*
1524 : : * origpage is the original page to be split. leftpage is a temporary
1525 : : * buffer that receives the left-sibling data, which will be copied back
1526 : : * into origpage on success. rightpage is the new page that will receive
1527 : : * the right-sibling data.
1528 : : *
1529 : : * leftpage is allocated after choosing a split point. rightpage's new
1530 : : * buffer isn't acquired until after leftpage is initialized and has new
1531 : : * high key, the last point where splitting the page may fail (barring
1532 : : * corruption). Failing before acquiring new buffer won't have lasting
1533 : : * consequences, since origpage won't have been modified and leftpage is
1534 : : * only workspace.
1535 : : */
3616 kgrittn@postgresql.o 1536 : 13241 : origpage = BufferGetPage(buf);
1444 michael@paquier.xyz 1537 : 13241 : oopaque = BTPageGetOpaque(origpage);
2162 pg@bowt.ie 1538 : 13241 : isleaf = P_ISLEAF(oopaque);
1539 : 13241 : isrightmost = P_RIGHTMOST(oopaque);
1540 : 13241 : maxoff = PageGetMaxOffsetNumber(origpage);
5677 tgl@sss.pgh.pa.us 1541 : 13241 : origpagenumber = BufferGetBlockNumber(buf);
1542 : :
1543 : : /*
1544 : : * Choose a point to split origpage at.
1545 : : *
1546 : : * A split point can be thought of as a point _between_ two existing data
1547 : : * items on origpage (the lastleft and firstright tuples), provided you
1548 : : * pretend that the new item that didn't fit is already on origpage.
1549 : : *
1550 : : * Since origpage does not actually contain newitem, the representation of
1551 : : * split points needs to work with two boundary cases: splits where
1552 : : * newitem is lastleft, and splits where newitem is firstright.
1553 : : * newitemonleft resolves the ambiguity that would otherwise exist when
1554 : : * newitemoff == firstrightoff. In all other cases it's clear which side
1555 : : * of the split every tuple goes on from context. newitemonleft is
1556 : : * usually (but not always) redundant information.
1557 : : *
1558 : : * firstrightoff is supposed to be an origpage offset number, but it's
1559 : : * possible that its value will be maxoff+1, which is "past the end" of
1560 : : * origpage. This happens in the rare case where newitem goes after all
1561 : : * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1562 : : * origpage at the point that leaves newitem alone on new right page. Any
1563 : : * "!newitemonleft && newitemoff == firstrightoff" split point makes
1564 : : * newitem the firstright tuple, though, so this case isn't a special
1565 : : * case.
1566 : : */
2162 pg@bowt.ie 1567 : 13241 : firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1568 : : newitem, &newitemonleft);
1569 : :
1570 : : /* Use temporary buffer for leftpage */
170 michael@paquier.xyz 1571 :GNC 13241 : leftpage = leftpage_buf.data;
2498 pg@bowt.ie 1572 :CBC 13241 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
1444 michael@paquier.xyz 1573 : 13241 : lopaque = BTPageGetOpaque(leftpage);
1574 : :
1575 : : /*
1576 : : * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1577 : : * and HAS_GARBAGE flags.
1578 : : */
9293 vadim4o@yahoo.com 1579 : 13241 : lopaque->btpo_flags = oopaque->btpo_flags;
7173 tgl@sss.pgh.pa.us 1580 : 13241 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1581 : : /* set flag in leftpage indicating that rightpage has no downlink yet */
4380 heikki.linnakangas@i 1582 : 13241 : lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
10416 bruce@momjian.us 1583 : 13241 : lopaque->btpo_prev = oopaque->btpo_prev;
1584 : : /* handle btpo_next after rightpage buffer acquired */
1845 pg@bowt.ie 1585 : 13241 : lopaque->btpo_level = oopaque->btpo_level;
1586 : : /* handle btpo_cycleid after rightpage buffer acquired */
1587 : :
1588 : : /*
1589 : : * Copy the original page's LSN into leftpage, which will become the
1590 : : * updated version of the page. We need this because XLogInsert will
1591 : : * examine the LSN and possibly dump it in a page image.
1592 : : */
2498 1593 : 13241 : PageSetLSN(leftpage, PageGetLSN(origpage));
1594 : :
1595 : : /*
1596 : : * Determine page offset number of existing overlapped-with-orignewitem
1597 : : * posting list when it is necessary to perform a posting list split in
1598 : : * passing. Note that newitem was already changed by caller (newitem no
1599 : : * longer has the orignewitem TID).
1600 : : *
1601 : : * This page offset number (origpagepostingoff) will be used to pretend
1602 : : * that the posting split has already taken place, even though the
1603 : : * required modifications to origpage won't occur until we reach the
1604 : : * critical section. The lastleft and firstright tuples of our page split
1605 : : * point should, in effect, come from an imaginary version of origpage
1606 : : * that has the nposting tuple instead of the original posting list tuple.
1607 : : *
1608 : : * Note: _bt_findsplitloc() should have compensated for coinciding posting
1609 : : * list splits in just the same way, at least in theory. It doesn't
1610 : : * bother with that, though. In practice it won't affect its choice of
1611 : : * split point.
1612 : : */
2209 1613 : 13241 : origpagepostingoff = InvalidOffsetNumber;
1614 [ + + ]: 13241 : if (postingoff != 0)
1615 : : {
1616 [ - + ]: 29 : Assert(isleaf);
1617 [ - + ]: 29 : Assert(ItemPointerCompare(&orignewitem->t_tid,
1618 : : &newitem->t_tid) < 0);
1619 [ - + ]: 29 : Assert(BTreeTupleIsPosting(nposting));
1620 : 29 : origpagepostingoff = OffsetNumberPrev(newitemoff);
1621 : : }
1622 : :
1623 : : /*
1624 : : * The high key for the new left page is a possibly-truncated copy of
1625 : : * firstright on the leaf level (it's "firstright itself" on internal
1626 : : * pages; see !isleaf comments below). This may seem to be contrary to
1627 : : * Lehman & Yao's approach of using a copy of lastleft as the new high key
1628 : : * when splitting on the leaf level. It isn't, though.
1629 : : *
1630 : : * Suffix truncation will leave the left page's high key fully equal to
1631 : : * lastleft when lastleft and firstright are equal prior to heap TID (that
1632 : : * is, the tiebreaker TID value comes from lastleft). It isn't actually
1633 : : * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1634 : : * "subtree" invariant to hold. It's sufficient to make sure that the new
1635 : : * leaf high key is strictly less than firstright, and greater than or
1636 : : * equal to (not necessarily equal to) lastleft. In other words, when
1637 : : * suffix truncation isn't possible during a leaf page split, we take
1638 : : * L&Y's exact approach to generating a new high key for the left page.
1639 : : * (Actually, that is slightly inaccurate. We don't just use a copy of
1640 : : * lastleft. A tuple with all the keys from firstright but the max heap
1641 : : * TID from lastleft is used, to avoid introducing a special case.)
1642 : : */
2162 1643 [ + + + + ]: 13241 : if (!newitemonleft && newitemoff == firstrightoff)
1644 : : {
1645 : : /* incoming tuple becomes firstright */
9368 tgl@sss.pgh.pa.us 1646 : 29 : itemsz = newitemsz;
2162 pg@bowt.ie 1647 : 29 : firstright = newitem;
1648 : : }
1649 : : else
1650 : : {
1651 : : /* existing item at firstrightoff becomes firstright */
1652 : 13212 : itemid = PageGetItemId(origpage, firstrightoff);
9368 tgl@sss.pgh.pa.us 1653 : 13212 : itemsz = ItemIdGetLength(itemid);
2162 pg@bowt.ie 1654 : 13212 : firstright = (IndexTuple) PageGetItem(origpage, itemid);
1655 [ + + ]: 13212 : if (firstrightoff == origpagepostingoff)
2162 pg@bowt.ie 1656 :GBC 1 : firstright = nposting;
1657 : : }
1658 : :
2162 pg@bowt.ie 1659 [ + + ]:CBC 13241 : if (isleaf)
1660 : : {
1661 : : IndexTuple lastleft;
1662 : :
1663 : : /* Attempt suffix truncation for leaf page splits */
1664 [ + + + + ]: 13083 : if (newitemonleft && newitemoff == firstrightoff)
1665 : : {
1666 : : /* incoming tuple becomes lastleft */
2552 1667 : 242 : lastleft = newitem;
1668 : : }
1669 : : else
1670 : : {
1671 : : OffsetNumber lastleftoff;
1672 : :
1673 : : /* existing item before firstrightoff becomes lastleft */
2162 1674 : 12841 : lastleftoff = OffsetNumberPrev(firstrightoff);
2552 1675 [ + + - + ]: 12841 : Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1676 : 12841 : itemid = PageGetItemId(origpage, lastleftoff);
1677 : 12841 : lastleft = (IndexTuple) PageGetItem(origpage, itemid);
2209 1678 [ + + ]: 12841 : if (lastleftoff == origpagepostingoff)
1679 : 1 : lastleft = nposting;
1680 : : }
1681 : :
2162 1682 : 13083 : lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1683 : 13083 : itemsz = IndexTupleSize(lefthighkey);
1684 : : }
1685 : : else
1686 : : {
1687 : : /*
1688 : : * Don't perform suffix truncation on a copy of firstright to make
1689 : : * left page high key for internal page splits. Must use firstright
1690 : : * as new high key directly.
1691 : : *
1692 : : * Each distinct separator key value originates as a leaf level high
1693 : : * key; all other separator keys/pivot tuples are copied from one
1694 : : * level down. A separator key in a grandparent page must be
1695 : : * identical to high key in rightmost parent page of the subtree to
1696 : : * its left, which must itself be identical to high key in rightmost
1697 : : * child page of that same subtree (this even applies to separator
1698 : : * from grandparent's high key). There must always be an unbroken
1699 : : * "seam" of identical separator keys that guide index scans at every
1700 : : * level, starting from the grandparent. That's why suffix truncation
1701 : : * is unsafe here.
1702 : : *
1703 : : * Internal page splits will truncate firstright into a "negative
1704 : : * infinity" data item when it gets inserted on the new right page
1705 : : * below, though. This happens during the call to _bt_pgaddtup() for
1706 : : * the new first data item for right page. Do not confuse this
1707 : : * mechanism with suffix truncation. It is just a convenient way of
1708 : : * implementing page splits that split the internal page "inside"
1709 : : * firstright. The lefthighkey separator key cannot appear a second
1710 : : * time in the right page (only firstright's downlink goes in right
1711 : : * page).
1712 : : */
1713 : 158 : lefthighkey = firstright;
1714 : : }
1715 : :
1716 : : /*
1717 : : * Add new high key to leftpage
1718 : : */
1719 : 13241 : afterleftoff = P_HIKEY;
1720 : :
1721 [ + - - + ]: 13241 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1722 [ + - - + ]: 13241 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1723 : : IndexRelationGetNumberOfKeyAttributes(rel));
1724 [ - + ]: 13241 : Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
139 peter@eisentraut.org 1725 [ - + ]:GNC 13241 : if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
2162 pg@bowt.ie 1726 [ # # ]:UBC 0 : elog(ERROR, "failed to add high key to the left sibling"
1727 : : " while splitting block %u of index \"%s\"",
1728 : : origpagenumber, RelationGetRelationName(rel));
2162 pg@bowt.ie 1729 :CBC 13241 : afterleftoff = OffsetNumberNext(afterleftoff);
1730 : :
1731 : : /*
1732 : : * Acquire a new right page to split into, now that left page has a new
1733 : : * high key.
1734 : : *
1735 : : * To not confuse future VACUUM operations, we zero the right page and
1736 : : * work on an in-memory copy of it before writing WAL, then copy its
1737 : : * contents back to the actual page once we start the critical section
1738 : : * work. This simplifies the split work, so as there is no need to zero
1739 : : * the right page before throwing an error.
1740 : : */
1009 1741 : 13241 : rbuf = _bt_allocbuf(rel, heaprel);
170 michael@paquier.xyz 1742 :GNC 13241 : rightpage = rightpage_buf.data;
1743 : :
1744 : : /*
1745 : : * Copy the contents of the right page into its temporary location, and
1746 : : * zero the original space.
1747 : : */
1748 : 13241 : memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1749 : 13241 : memset(BufferGetPage(rbuf), 0, BLCKSZ);
2498 pg@bowt.ie 1750 :CBC 13241 : rightpagenumber = BufferGetBlockNumber(rbuf);
1751 : : /* rightpage was initialized by _bt_allocbuf */
1444 michael@paquier.xyz 1752 : 13241 : ropaque = BTPageGetOpaque(rightpage);
1753 : :
1754 : : /*
1755 : : * Finish off remaining leftpage special area fields. They cannot be set
1756 : : * before both origpage (leftpage) and rightpage buffers are acquired and
1757 : : * locked.
1758 : : *
1759 : : * btpo_cycleid is only used with leaf pages, though we set it here in all
1760 : : * cases just to be consistent.
1761 : : */
2498 pg@bowt.ie 1762 : 13241 : lopaque->btpo_next = rightpagenumber;
1763 : 13241 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1764 : :
1765 : : /*
1766 : : * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1767 : : * and HAS_GARBAGE flags.
1768 : : */
1769 : 13241 : ropaque->btpo_flags = oopaque->btpo_flags;
1770 : 13241 : ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1771 : 13241 : ropaque->btpo_prev = origpagenumber;
1772 : 13241 : ropaque->btpo_next = oopaque->btpo_next;
1845 1773 : 13241 : ropaque->btpo_level = oopaque->btpo_level;
2498 1774 : 13241 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1775 : :
1776 : : /*
1777 : : * Add new high key to rightpage where necessary.
1778 : : *
1779 : : * If the page we're splitting is not the rightmost page at its level in
1780 : : * the tree, then the first entry on the page is the high key from
1781 : : * origpage.
1782 : : */
2162 1783 : 13241 : afterrightoff = P_HIKEY;
1784 : :
1785 [ + + ]: 13241 : if (!isrightmost)
1786 : : {
1787 : : IndexTuple righthighkey;
1788 : :
2498 1789 : 5436 : itemid = PageGetItemId(origpage, P_HIKEY);
1790 : 5436 : itemsz = ItemIdGetLength(itemid);
2162 1791 : 5436 : righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1792 [ + - - + ]: 5436 : Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1793 [ + - - + ]: 5436 : Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1794 : : IndexRelationGetNumberOfKeyAttributes(rel));
139 peter@eisentraut.org 1795 [ - + ]:GNC 5436 : if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
1796 : : {
2162 pg@bowt.ie 1797 [ # # ]:UBC 0 : elog(ERROR, "failed to add high key to the right sibling"
1798 : : " while splitting block %u of index \"%s\"",
1799 : : origpagenumber, RelationGetRelationName(rel));
1800 : : }
2162 pg@bowt.ie 1801 :CBC 5436 : afterrightoff = OffsetNumberNext(afterrightoff);
1802 : : }
1803 : :
1804 : : /*
1805 : : * Internal page splits truncate first data item on right page -- it
1806 : : * becomes "minus infinity" item for the page. Set this up here.
1807 : : */
1808 : 13241 : minusinfoff = InvalidOffsetNumber;
1809 [ + + ]: 13241 : if (!isleaf)
1810 : 158 : minusinfoff = afterrightoff;
1811 : :
1812 : : /*
1813 : : * Now transfer all the data items (non-pivot tuples in isleaf case, or
1814 : : * additional pivot tuples in !isleaf case) to the appropriate page.
1815 : : *
1816 : : * Note: we *must* insert at least the right page's items in item-number
1817 : : * order, for the benefit of _bt_restore_page().
1818 : : */
9368 tgl@sss.pgh.pa.us 1819 [ + + + + ]: 4141784 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1820 : : {
1821 : : IndexTuple dataitem;
1822 : :
10416 bruce@momjian.us 1823 : 4128543 : itemid = PageGetItemId(origpage, i);
1824 : 4128543 : itemsz = ItemIdGetLength(itemid);
2162 pg@bowt.ie 1825 : 4128543 : dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1826 : :
1827 : : /* replace original item with nposting due to posting split? */
2209 1828 [ + + ]: 4128543 : if (i == origpagepostingoff)
1829 : : {
2162 1830 [ - + ]: 29 : Assert(BTreeTupleIsPosting(dataitem));
2209 1831 [ - + ]: 29 : Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
2162 1832 : 29 : dataitem = nposting;
1833 : : }
1834 : :
1835 : : /* does new item belong before this one? */
2209 1836 [ + + ]: 4128514 : else if (i == newitemoff)
1837 : : {
9368 tgl@sss.pgh.pa.us 1838 [ + + ]: 7033 : if (newitemonleft)
1839 : : {
2162 pg@bowt.ie 1840 [ - + ]: 2059 : Assert(newitemoff <= firstrightoff);
1841 [ - + ]: 2059 : if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1842 : : false))
1843 : : {
5677 tgl@sss.pgh.pa.us 1844 [ # # ]:UBC 0 : elog(ERROR, "failed to add new item to the left sibling"
1845 : : " while splitting block %u of index \"%s\"",
1846 : : origpagenumber, RelationGetRelationName(rel));
1847 : : }
2162 pg@bowt.ie 1848 :CBC 2059 : afterleftoff = OffsetNumberNext(afterleftoff);
1849 : : }
1850 : : else
1851 : : {
1852 [ - + ]: 4974 : Assert(newitemoff >= firstrightoff);
1853 [ - + ]: 4974 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1854 : : afterrightoff == minusinfoff))
1855 : : {
5677 tgl@sss.pgh.pa.us 1856 [ # # ]:UBC 0 : elog(ERROR, "failed to add new item to the right sibling"
1857 : : " while splitting block %u of index \"%s\"",
1858 : : origpagenumber, RelationGetRelationName(rel));
1859 : : }
2162 pg@bowt.ie 1860 :CBC 4974 : afterrightoff = OffsetNumberNext(afterrightoff);
1861 : : }
1862 : : }
1863 : :
1864 : : /* decide which page to put it on */
1865 [ + + ]: 4128543 : if (i < firstrightoff)
1866 : : {
1867 [ - + ]: 3186039 : if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1868 : : {
5677 tgl@sss.pgh.pa.us 1869 [ # # ]:UBC 0 : elog(ERROR, "failed to add old item to the left sibling"
1870 : : " while splitting block %u of index \"%s\"",
1871 : : origpagenumber, RelationGetRelationName(rel));
1872 : : }
2162 pg@bowt.ie 1873 :CBC 3186039 : afterleftoff = OffsetNumberNext(afterleftoff);
1874 : : }
1875 : : else
1876 : : {
1877 [ - + ]: 942504 : if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1878 : : afterrightoff == minusinfoff))
1879 : : {
5677 tgl@sss.pgh.pa.us 1880 [ # # ]:UBC 0 : elog(ERROR, "failed to add old item to the right sibling"
1881 : : " while splitting block %u of index \"%s\"",
1882 : : origpagenumber, RelationGetRelationName(rel));
1883 : : }
2162 pg@bowt.ie 1884 :CBC 942504 : afterrightoff = OffsetNumberNext(afterrightoff);
1885 : : }
1886 : : }
1887 : :
1888 : : /* Handle case where newitem goes at the end of rightpage */
9368 tgl@sss.pgh.pa.us 1889 [ + + ]: 13241 : if (i <= newitemoff)
1890 : : {
1891 : : /*
1892 : : * Can't have newitemonleft here; that would imply we were told to put
1893 : : * *everything* on the left page, which cannot fit (if it could, we'd
1894 : : * not be splitting the page).
1895 : : */
2162 pg@bowt.ie 1896 [ + - - + ]: 6208 : Assert(!newitemonleft && newitemoff == maxoff + 1);
1897 [ - + ]: 6208 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1898 : : afterrightoff == minusinfoff))
1899 : : {
5677 tgl@sss.pgh.pa.us 1900 [ # # ]:UBC 0 : elog(ERROR, "failed to add new item to the right sibling"
1901 : : " while splitting block %u of index \"%s\"",
1902 : : origpagenumber, RelationGetRelationName(rel));
1903 : : }
2162 pg@bowt.ie 1904 :CBC 6208 : afterrightoff = OffsetNumberNext(afterrightoff);
1905 : : }
1906 : :
1907 : : /*
1908 : : * We have to grab the original right sibling (if any) and update its prev
1909 : : * link. We are guaranteed that this is deadlock-free, since we couple
1910 : : * the locks in the standard order: left to right.
1911 : : */
1912 [ + + ]: 13241 : if (!isrightmost)
1913 : : {
1009 1914 : 5436 : sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
3616 kgrittn@postgresql.o 1915 : 5436 : spage = BufferGetPage(sbuf);
1444 michael@paquier.xyz 1916 : 5436 : sopaque = BTPageGetOpaque(spage);
5677 tgl@sss.pgh.pa.us 1917 [ - + ]: 5436 : if (sopaque->btpo_prev != origpagenumber)
1918 : : {
2418 peter@eisentraut.org 1919 [ # # ]:UBC 0 : ereport(ERROR,
1920 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1921 : : errmsg_internal("right sibling's left-link doesn't match: "
1922 : : "block %u links to %u instead of expected %u in index \"%s\"",
1923 : : oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1924 : : RelationGetRelationName(rel))));
1925 : : }
1926 : :
1927 : : /*
1928 : : * Check to see if we can set the SPLIT_END flag in the right-hand
1929 : : * split page; this can save some I/O for vacuum since it need not
1930 : : * proceed to the right sibling. We can set the flag if the right
1931 : : * sibling has a different cycleid: that means it could not be part of
1932 : : * a group of pages that were all split off from the same ancestor
1933 : : * page. If you're confused, imagine that page A splits to A B and
1934 : : * then again, yielding A C B, while vacuum is in progress. Tuples
1935 : : * originally in A could now be in either B or C, hence vacuum must
1936 : : * examine both pages. But if D, our right sibling, has a different
1937 : : * cycleid then it could not contain any tuples that were in A when
1938 : : * the vacuum started.
1939 : : */
7251 tgl@sss.pgh.pa.us 1940 [ + + ]:CBC 5436 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
7251 tgl@sss.pgh.pa.us 1941 :GBC 1 : ropaque->btpo_flags |= BTP_SPLIT_END;
1942 : : }
1943 : :
1944 : : /*
1945 : : * Right sibling is locked, new siblings are prepared, but original page
1946 : : * is not updated yet.
1947 : : *
1948 : : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1949 : : * not starting the critical section till here because we haven't been
1950 : : * scribbling on the original page yet; see comments above.
1951 : : */
9193 tgl@sss.pgh.pa.us 1952 :CBC 13241 : START_CRIT_SECTION();
1953 : :
1954 : : /*
1955 : : * By here, the original data page has been split into two new halves, and
1956 : : * these are correct. The algorithm requires that the left page never
1957 : : * move during a split, so we copy the new left page back on top of the
1958 : : * original. We need to do this before writing the WAL record, so that
1959 : : * XLogInsert can WAL log an image of the page if necessary.
1960 : : */
170 michael@paquier.xyz 1961 :GNC 13241 : memcpy(origpage, leftpage, BLCKSZ);
1962 : : /* leftpage, lopaque must not be used below here */
1963 : :
1964 : : /*
1965 : : * Move the contents of the right page from its temporary location to the
1966 : : * destination buffer, before writing the WAL record. Unlike the left
1967 : : * page, the right page and its opaque area are still needed to complete
1968 : : * the update of the page, so reinitialize them.
1969 : : */
1970 : 13241 : rightpage = BufferGetPage(rbuf);
1971 : 13241 : memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1972 : 13241 : ropaque = BTPageGetOpaque(rightpage);
1973 : :
6913 tgl@sss.pgh.pa.us 1974 :CBC 13241 : MarkBufferDirty(buf);
1975 : 13241 : MarkBufferDirty(rbuf);
1976 : :
2162 pg@bowt.ie 1977 [ + + ]: 13241 : if (!isrightmost)
1978 : : {
5677 tgl@sss.pgh.pa.us 1979 : 5436 : sopaque->btpo_prev = rightpagenumber;
6913 1980 : 5436 : MarkBufferDirty(sbuf);
1981 : : }
1982 : :
1983 : : /*
1984 : : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1985 : : * a split
1986 : : */
4380 heikki.linnakangas@i 1987 [ + + ]: 13241 : if (!isleaf)
1988 : : {
3616 kgrittn@postgresql.o 1989 : 158 : Page cpage = BufferGetPage(cbuf);
1444 michael@paquier.xyz 1990 : 158 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1991 : :
4380 heikki.linnakangas@i 1992 : 158 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1993 : 158 : MarkBufferDirty(cbuf);
1994 : : }
1995 : :
1996 : : /* XLOG stuff */
5571 rhaas@postgresql.org 1997 [ + + + + : 13241 : if (RelationNeedsWAL(rel))
+ - + - ]
9293 vadim4o@yahoo.com 1998 :GIC 12329 : {
1999 : : xl_btree_split xlrec;
2000 : : uint8 xlinfo;
2001 : :
1845 pg@bowt.ie 2002 :CBC 12329 : xlrec.level = ropaque->btpo_level;
2003 : : /* See comments below on newitem, orignewitem, and posting lists */
2162 2004 : 12329 : xlrec.firstrightoff = firstrightoff;
4133 heikki.linnakangas@i 2005 : 12329 : xlrec.newitemoff = newitemoff;
2209 pg@bowt.ie 2006 : 12329 : xlrec.postingoff = 0;
2162 2007 [ + + + + ]: 12329 : if (postingoff != 0 && origpagepostingoff < firstrightoff)
2209 2008 : 16 : xlrec.postingoff = postingoff;
2009 : :
4133 heikki.linnakangas@i 2010 : 12329 : XLogBeginInsert();
397 peter@eisentraut.org 2011 : 12329 : XLogRegisterData(&xlrec, SizeOfBtreeSplit);
2012 : :
4133 heikki.linnakangas@i 2013 : 12329 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
2014 : 12329 : XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
2015 : : /* Log original right sibling, since we've changed its prev-pointer */
2162 pg@bowt.ie 2016 [ + + ]: 12329 : if (!isrightmost)
4133 heikki.linnakangas@i 2017 : 5430 : XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
2162 pg@bowt.ie 2018 [ + + ]: 12329 : if (!isleaf)
4133 heikki.linnakangas@i 2019 : 158 : XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
2020 : :
2021 : : /*
2022 : : * Log the new item, if it was inserted on the left page. (If it was
2023 : : * put on the right page, we don't need to explicitly WAL log it
2024 : : * because it's included with all the other items on the right page.)
2025 : : * Show the new item as belonging to the left page buffer, so that it
2026 : : * is not stored if XLogInsert decides it needs a full-page image of
2027 : : * the left page. We always store newitemoff in the record, though.
2028 : : *
2029 : : * The details are sometimes slightly different for page splits that
2030 : : * coincide with a posting list split. If both the replacement
2031 : : * posting list and newitem go on the right page, then we don't need
2032 : : * to log anything extra, just like the simple !newitemonleft
2033 : : * no-posting-split case (postingoff is set to zero in the WAL record,
2034 : : * so recovery doesn't need to process a posting list split at all).
2035 : : * Otherwise, we set postingoff and log orignewitem instead of
2036 : : * newitem, despite having actually inserted newitem. REDO routine
2037 : : * must reconstruct nposting and newitem using _bt_swap_posting().
2038 : : *
2039 : : * Note: It's possible that our page split point is the point that
2040 : : * makes the posting list lastleft and newitem firstright. This is
2041 : : * the only case where we log orignewitem/newitem despite newitem
2042 : : * going on the right page. If XLogInsert decides that it can omit
2043 : : * orignewitem due to logging a full-page image of the left page,
2044 : : * everything still works out, since recovery only needs to log
2045 : : * orignewitem for items on the left page (just like the regular
2046 : : * newitem-logged case).
2047 : : */
2209 pg@bowt.ie 2048 [ + + + + ]: 12329 : if (newitemonleft && xlrec.postingoff == 0)
397 peter@eisentraut.org 2049 : 2041 : XLogRegisterBufData(0, newitem, newitemsz);
2209 pg@bowt.ie 2050 [ + + ]: 10288 : else if (xlrec.postingoff != 0)
2051 : : {
2162 2052 [ - + ]: 16 : Assert(isleaf);
2053 [ + + - + ]: 16 : Assert(newitemonleft || firstrightoff == newitemoff);
2054 [ - + ]: 16 : Assert(newitemsz == IndexTupleSize(orignewitem));
397 peter@eisentraut.org 2055 : 16 : XLogRegisterBufData(0, orignewitem, newitemsz);
2056 : : }
2057 : :
2058 : : /* Log the left page's new high key */
2162 pg@bowt.ie 2059 [ + + ]: 12329 : if (!isleaf)
2060 : : {
2061 : : /* lefthighkey isn't local copy, get current pointer */
2062 : 158 : itemid = PageGetItemId(origpage, P_HIKEY);
2063 : 158 : lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2064 : : }
397 peter@eisentraut.org 2065 : 12329 : XLogRegisterBufData(0, lefthighkey,
2162 pg@bowt.ie 2066 : 12329 : MAXALIGN(IndexTupleSize(lefthighkey)));
2067 : :
2068 : : /*
2069 : : * Log the contents of the right page in the format understood by
2070 : : * _bt_restore_page(). The whole right page will be recreated.
2071 : : *
2072 : : * Direct access to page is not good but faster - we should implement
2073 : : * some new func in page API. Note we only store the tuples
2074 : : * themselves, knowing that they were inserted in item-number order
2075 : : * and so the line pointers can be reconstructed. See comments for
2076 : : * _bt_restore_page().
2077 : : */
4133 heikki.linnakangas@i 2078 : 12329 : XLogRegisterBufData(1,
3189 tgl@sss.pgh.pa.us 2079 : 12329 : (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
4133 heikki.linnakangas@i 2080 : 12329 : ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2081 : :
2552 pg@bowt.ie 2082 [ + + ]: 12329 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
4133 heikki.linnakangas@i 2083 : 12329 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2084 : : }
2085 : : else
2 pg@bowt.ie 2086 :GNC 912 : recptr = XLogGetFakeLSN(rel);
2087 : :
2088 : 13241 : PageSetLSN(origpage, recptr);
2089 : 13241 : PageSetLSN(rightpage, recptr);
2090 [ + + ]: 13241 : if (!isrightmost)
2091 : 5436 : PageSetLSN(spage, recptr);
2092 [ + + ]: 13241 : if (!isleaf)
2093 : 158 : PageSetLSN(BufferGetPage(cbuf), recptr);
2094 : :
9182 tgl@sss.pgh.pa.us 2095 [ - + ]:CBC 13241 : END_CRIT_SECTION();
2096 : :
2097 : : /* release the old right sibling */
2162 pg@bowt.ie 2098 [ + + ]: 13241 : if (!isrightmost)
7289 tgl@sss.pgh.pa.us 2099 : 5436 : _bt_relbuf(rel, sbuf);
2100 : :
2101 : : /* release the child */
4380 heikki.linnakangas@i 2102 [ + + ]: 13241 : if (!isleaf)
2103 : 158 : _bt_relbuf(rel, cbuf);
2104 : :
2105 : : /* be tidy */
2162 pg@bowt.ie 2106 [ + + ]: 13241 : if (isleaf)
2107 : 13083 : pfree(lefthighkey);
2108 : :
2109 : : /* split's done */
10057 bruce@momjian.us 2110 : 13241 : return rbuf;
2111 : : }
2112 : :
2113 : : /*
2114 : : * _bt_insert_parent() -- Insert downlink into parent, completing split.
2115 : : *
2116 : : * On entry, buf and rbuf are the left and right split pages, which we
2117 : : * still hold write locks on. Both locks will be released here. We
2118 : : * release the rbuf lock once we have a write lock on the page that we
2119 : : * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2120 : : * The lock on buf is released at the same point as the lock on the parent
2121 : : * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2122 : : * atomic operation that completes the split by inserting a new downlink.
2123 : : *
2124 : : * stack - stack showing how we got here. Will be NULL when splitting true
2125 : : * root, or during concurrent root split, where we can be inefficient
2126 : : * isroot - we split the true root
2127 : : * isonly - we split a page alone on its level (might have been fast root)
2128 : : */
2129 : : static void
8423 tgl@sss.pgh.pa.us 2130 : 13241 : _bt_insert_parent(Relation rel,
2131 : : Relation heaprel,
2132 : : Buffer buf,
2133 : : Buffer rbuf,
2134 : : BTStack stack,
2135 : : bool isroot,
2136 : : bool isonly)
2137 : : {
1009 pg@bowt.ie 2138 [ - + ]: 13241 : Assert(heaprel != NULL);
2139 : :
2140 : : /*
2141 : : * Here we have to do something Lehman and Yao don't talk about: deal with
2142 : : * a root split and construction of a new root. If our stack is empty
2143 : : * then we have just split a node on what had been the root level when we
2144 : : * descended the tree. If it was still the root then we perform a
2145 : : * new-root construction. If it *wasn't* the root anymore, search to find
2146 : : * the next higher level that someone constructed meanwhile, and find the
2147 : : * right place to insert as for the normal case.
2148 : : *
2149 : : * If we have to search for the parent level, we do so by re-descending
2150 : : * from the root. This is not super-efficient, but it's rare enough not
2151 : : * to matter.
2152 : : */
1944 2153 [ + + ]: 13241 : if (isroot)
2154 : : {
2155 : : Buffer rootbuf;
2156 : :
8103 neilc@samurai.com 2157 [ - + ]: 711 : Assert(stack == NULL);
1944 pg@bowt.ie 2158 [ - + ]: 711 : Assert(isonly);
2159 : : /* create a new root node one level up and update the metapage */
1009 2160 : 711 : rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2161 : : /* release the split buffers */
7289 tgl@sss.pgh.pa.us 2162 : 711 : _bt_relbuf(rel, rootbuf);
2163 : 711 : _bt_relbuf(rel, rbuf);
2164 : 711 : _bt_relbuf(rel, buf);
2165 : : }
2166 : : else
2167 : : {
8423 2168 : 12530 : BlockNumber bknum = BufferGetBlockNumber(buf);
2169 : 12530 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
3616 kgrittn@postgresql.o 2170 : 12530 : Page page = BufferGetPage(buf);
2171 : : IndexTuple new_item;
2172 : : BTStackData fakestack;
2173 : : IndexTuple ritem;
2174 : : Buffer pbuf;
2175 : :
8103 neilc@samurai.com 2176 [ + + ]: 12530 : if (stack == NULL)
2177 : : {
2178 : : BTPageOpaque opaque;
2179 : :
4203 heikki.linnakangas@i 2180 [ - + ]: 9 : elog(DEBUG2, "concurrent ROOT page split");
1444 michael@paquier.xyz 2181 : 9 : opaque = BTPageGetOpaque(page);
2182 : :
2183 : : /*
2184 : : * We should never reach here when a leaf page split takes place
2185 : : * despite the insert of newitem being able to apply the fastpath
2186 : : * optimization. Make sure of that with an assertion.
2187 : : *
2188 : : * This is more of a performance issue than a correctness issue.
2189 : : * The fastpath won't have a descent stack. Using a phony stack
2190 : : * here works, but never rely on that. The fastpath should be
2191 : : * rejected within _bt_search_insert() when the rightmost leaf
2192 : : * page will split, since it's faster to go through _bt_search()
2193 : : * and get a stack in the usual way.
2194 : : */
1944 pg@bowt.ie 2195 [ + - + - : 9 : Assert(!(P_ISLEAF(opaque) &&
- + ]
2196 : : BlockNumberIsValid(RelationGetTargetBlock(rel))));
2197 : :
2198 : : /* Find the leftmost page at the next level up */
919 tmunro@postgresql.or 2199 : 9 : pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2200 : : /* Set up a phony stack entry pointing there */
8423 tgl@sss.pgh.pa.us 2201 : 9 : stack = &fakestack;
2202 : 9 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
2203 : 9 : stack->bts_offset = InvalidOffsetNumber;
2204 : 9 : stack->bts_parent = NULL;
2205 : 9 : _bt_relbuf(rel, pbuf);
2206 : : }
2207 : :
2208 : : /* get high key from left, a strict lower bound for new right page */
7354 2209 : 12530 : ritem = (IndexTuple) PageGetItem(page,
2210 : 12530 : PageGetItemId(page, P_HIKEY));
2211 : :
2212 : : /* form an index tuple that points at the new right page */
2213 : 12530 : new_item = CopyIndexTuple(ritem);
2281 pg@bowt.ie 2214 : 12530 : BTreeTupleSetDownLink(new_item, rbknum);
2215 : :
2216 : : /*
2217 : : * Re-find and write lock the parent of buf.
2218 : : *
2219 : : * It's possible that the location of buf's downlink has changed since
2220 : : * our initial _bt_search() descent. _bt_getstackbuf() will detect
2221 : : * and recover from this, updating the stack, which ensures that the
2222 : : * new downlink will be inserted at the correct offset. Even buf's
2223 : : * parent may have changed.
2224 : : */
1079 andres@anarazel.de 2225 : 12530 : pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2226 : :
2227 : : /*
2228 : : * Unlock the right child. The left child will be unlocked in
2229 : : * _bt_insertonpg().
2230 : : *
2231 : : * Unlocking the right child must be delayed until here to ensure that
2232 : : * no concurrent VACUUM operation can become confused. Page deletion
2233 : : * cannot be allowed to fail to re-find a downlink for the rbuf page.
2234 : : * (Actually, this is just a vestige of how things used to work. The
2235 : : * page deletion code is expected to check for the INCOMPLETE_SPLIT
2236 : : * flag on the left child. It won't attempt deletion of the right
2237 : : * child until the split is complete. Despite all this, we opt to
2238 : : * conservatively delay unlocking the right child until here.)
2239 : : */
7289 tgl@sss.pgh.pa.us 2240 : 12530 : _bt_relbuf(rel, rbuf);
2241 : :
8423 2242 [ - + ]: 12530 : if (pbuf == InvalidBuffer)
2418 peter@eisentraut.org 2243 [ # # ]:UBC 0 : ereport(ERROR,
2244 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2245 : : errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2246 : : RelationGetRelationName(rel), bknum, rbknum)));
2247 : :
2248 : : /* Recursively insert into the parent */
1079 andres@anarazel.de 2249 :CBC 25060 : _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2190 pg@bowt.ie 2250 : 12530 : new_item, MAXALIGN(IndexTupleSize(new_item)),
1944 2251 : 12530 : stack->bts_offset + 1, 0, isonly);
2252 : :
2253 : : /* be tidy */
8423 tgl@sss.pgh.pa.us 2254 : 12529 : pfree(new_item);
2255 : : }
2256 : 13240 : }
2257 : :
2258 : : /*
2259 : : * _bt_finish_split() -- Finish an incomplete split
2260 : : *
2261 : : * A crash or other failure can leave a split incomplete. The insertion
2262 : : * routines won't allow to insert on a page that is incompletely split.
2263 : : * Before inserting on such a page, call _bt_finish_split().
2264 : : *
2265 : : * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2266 : : * and unpinned.
2267 : : *
2268 : : * Caller must provide a valid heaprel, since finishing a page split requires
2269 : : * allocating a new page if and when the parent page splits in turn.
2270 : : */
2271 : : void
1079 andres@anarazel.de 2272 :GBC 2 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2273 : : {
3616 kgrittn@postgresql.o 2274 : 2 : Page lpage = BufferGetPage(lbuf);
1444 michael@paquier.xyz 2275 : 2 : BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2276 : : Buffer rbuf;
2277 : : Page rpage;
2278 : : BTPageOpaque rpageop;
2279 : : bool wasroot;
2280 : : bool wasonly;
2281 : :
4380 heikki.linnakangas@i 2282 [ - + ]: 2 : Assert(P_INCOMPLETE_SPLIT(lpageop));
1009 pg@bowt.ie 2283 [ - + ]: 2 : Assert(heaprel != NULL);
2284 : :
2285 : : /* Lock right sibling, the one missing the downlink */
2286 : 2 : rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
3616 kgrittn@postgresql.o 2287 : 2 : rpage = BufferGetPage(rbuf);
1444 michael@paquier.xyz 2288 : 2 : rpageop = BTPageGetOpaque(rpage);
2289 : :
2290 : : /* Could this be a root split? */
4380 heikki.linnakangas@i 2291 [ + + ]: 2 : if (!stack)
2292 : : {
2293 : : Buffer metabuf;
2294 : : Page metapg;
2295 : : BTMetaPageData *metad;
2296 : :
2297 : : /* acquire lock on the metapage */
1009 pg@bowt.ie 2298 : 1 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3616 kgrittn@postgresql.o 2299 : 1 : metapg = BufferGetPage(metabuf);
4380 heikki.linnakangas@i 2300 : 1 : metad = BTPageGetMeta(metapg);
2301 : :
1944 pg@bowt.ie 2302 : 1 : wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2303 : :
4380 heikki.linnakangas@i 2304 : 1 : _bt_relbuf(rel, metabuf);
2305 : : }
2306 : : else
1944 pg@bowt.ie 2307 : 1 : wasroot = false;
2308 : :
2309 : : /* Was this the only page on the level before split? */
2310 [ + + + - ]: 2 : wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2311 : :
103 heikki.linnakangas@i 2312 :GNC 2 : INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
4380 heikki.linnakangas@i 2313 [ - + ]:GBC 2 : elog(DEBUG1, "finishing incomplete split of %u/%u",
2314 : : BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2315 : :
1079 andres@anarazel.de 2316 : 2 : _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
4380 heikki.linnakangas@i 2317 : 2 : }
2318 : :
2319 : : /*
2320 : : * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2321 : : * tuple whose downlink points to child page.
2322 : : *
2323 : : * Caller passes child's block number, which is used to identify
2324 : : * associated pivot tuple in parent page using a linear search that
2325 : : * matches on pivot's downlink/block number. The expected location of
2326 : : * the pivot tuple is taken from the stack one level above the child
2327 : : * page. This is used as a starting point. Insertions into the
2328 : : * parent level could cause the pivot tuple to move right; deletions
2329 : : * could cause it to move left, but not left of the page we previously
2330 : : * found it on.
2331 : : *
2332 : : * Caller can use its stack to relocate the pivot tuple/downlink for
2333 : : * any same-level page to the right of the page found by its initial
2334 : : * descent. This is necessary because of the possibility that caller
2335 : : * moved right to recover from a concurrent page split. It's also
2336 : : * convenient for certain callers to be able to step right when there
2337 : : * wasn't a concurrent page split, while still using their original
2338 : : * stack. For example, the checkingunique _bt_doinsert() case may
2339 : : * have to step right when there are many physical duplicates, and its
2340 : : * scantid forces an insertion to the right of the "first page the
2341 : : * value could be on". (This is also relied on by all of our callers
2342 : : * when dealing with !heapkeyspace indexes.)
2343 : : *
2344 : : * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2345 : : * tuple not found (should not happen). Adjusts bts_blkno &
2346 : : * bts_offset if changed. Page split caller should insert its new
2347 : : * pivot tuple for its new right sibling page on parent page, at the
2348 : : * offset number bts_offset + 1.
2349 : : */
2350 : : Buffer
1079 andres@anarazel.de 2351 :CBC 15759 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2352 : : {
2353 : : BlockNumber blkno;
2354 : : OffsetNumber start;
2355 : :
9368 tgl@sss.pgh.pa.us 2356 : 15759 : blkno = stack->bts_blkno;
2357 : 15759 : start = stack->bts_offset;
2358 : :
2359 : : for (;;)
2360 : 11 : {
2361 : : Buffer buf;
2362 : : Page page;
2363 : : BTPageOpaque opaque;
2364 : :
1009 pg@bowt.ie 2365 : 15770 : buf = _bt_getbuf(rel, blkno, BT_WRITE);
3616 kgrittn@postgresql.o 2366 : 15770 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2367 : 15770 : opaque = BTPageGetOpaque(page);
2368 : :
1009 pg@bowt.ie 2369 [ - + ]: 15770 : Assert(heaprel != NULL);
2575 2370 [ - + ]: 15770 : if (P_INCOMPLETE_SPLIT(opaque))
2371 : : {
1079 andres@anarazel.de 2372 :UBC 0 : _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
4380 heikki.linnakangas@i 2373 : 0 : continue;
2374 : : }
2375 : :
8422 tgl@sss.pgh.pa.us 2376 [ + + ]:CBC 15770 : if (!P_IGNORE(opaque))
2377 : : {
2378 : : OffsetNumber offnum,
2379 : : minoff,
2380 : : maxoff;
2381 : : ItemId itemid;
2382 : : IndexTuple item;
2383 : :
2384 [ + + ]: 15759 : minoff = P_FIRSTDATAKEY(opaque);
2385 : 15759 : maxoff = PageGetMaxOffsetNumber(page);
2386 : :
2387 : : /*
2388 : : * start = InvalidOffsetNumber means "search the whole page". We
2389 : : * need this test anyway due to possibility that page has a high
2390 : : * key now when it didn't before.
2391 : : */
2392 [ + + ]: 15759 : if (start < minoff)
2393 : 20 : start = minoff;
2394 : :
2395 : : /*
2396 : : * Need this check too, to guard against possibility that page
2397 : : * split since we visited it originally.
2398 : : */
7880 2399 [ - + ]: 15759 : if (start > maxoff)
7880 tgl@sss.pgh.pa.us 2400 :UBC 0 : start = OffsetNumberNext(maxoff);
2401 : :
2402 : : /*
2403 : : * These loops will check every item on the page --- but in an
2404 : : * order that's attuned to the probability of where it actually
2405 : : * is. Scan to the right first, then to the left.
2406 : : */
8422 tgl@sss.pgh.pa.us 2407 :CBC 15759 : for (offnum = start;
2408 [ + - ]: 15802 : offnum <= maxoff;
2409 : 43 : offnum = OffsetNumberNext(offnum))
2410 : : {
2411 : 15802 : itemid = PageGetItemId(page, offnum);
7354 2412 : 15802 : item = (IndexTuple) PageGetItem(page, itemid);
2413 : :
2281 pg@bowt.ie 2414 [ + + ]: 15802 : if (BTreeTupleGetDownLink(item) == child)
2415 : : {
2416 : : /* Return accurate pointer to where link is now */
8422 tgl@sss.pgh.pa.us 2417 : 15759 : stack->bts_blkno = blkno;
2418 : 15759 : stack->bts_offset = offnum;
2419 : 15759 : return buf;
2420 : : }
2421 : : }
2422 : :
8422 tgl@sss.pgh.pa.us 2423 :UBC 0 : for (offnum = OffsetNumberPrev(start);
2424 [ # # ]: 0 : offnum >= minoff;
2425 : 0 : offnum = OffsetNumberPrev(offnum))
2426 : : {
2427 : 0 : itemid = PageGetItemId(page, offnum);
7354 2428 : 0 : item = (IndexTuple) PageGetItem(page, itemid);
2429 : :
2281 pg@bowt.ie 2430 [ # # ]: 0 : if (BTreeTupleGetDownLink(item) == child)
2431 : : {
2432 : : /* Return accurate pointer to where link is now */
8422 tgl@sss.pgh.pa.us 2433 : 0 : stack->bts_blkno = blkno;
2434 : 0 : stack->bts_offset = offnum;
2435 : 0 : return buf;
2436 : : }
2437 : : }
2438 : : }
2439 : :
2440 : : /*
2441 : : * The item we're looking for moved right at least one page.
2442 : : *
2443 : : * Lehman and Yao couple/chain locks when moving right here, which we
2444 : : * can avoid. See nbtree/README.
2445 : : */
9368 tgl@sss.pgh.pa.us 2446 [ - + ]:CBC 11 : if (P_RIGHTMOST(opaque))
2447 : : {
9009 tgl@sss.pgh.pa.us 2448 :UBC 0 : _bt_relbuf(rel, buf);
7998 2449 : 0 : return InvalidBuffer;
2450 : : }
9368 tgl@sss.pgh.pa.us 2451 :CBC 11 : blkno = opaque->btpo_next;
8423 2452 : 11 : start = InvalidOffsetNumber;
9009 2453 : 11 : _bt_relbuf(rel, buf);
2454 : : }
2455 : : }
2456 : :
2457 : : /*
2458 : : * _bt_freestack() -- free a retracement stack made by _bt_search_insert.
2459 : : */
2460 : : static void
3 pg@bowt.ie 2461 :GNC 3773883 : _bt_freestack(BTStack stack)
2462 : : {
2463 : : BTStack ostack;
2464 : :
2465 [ + + ]: 7605926 : while (stack != NULL)
2466 : : {
2467 : 3832043 : ostack = stack;
2468 : 3832043 : stack = stack->bts_parent;
2469 : 3832043 : pfree(ostack);
2470 : : }
2471 : 3773883 : }
2472 : :
2473 : : /*
2474 : : * _bt_newlevel() -- Create a new level above root page.
2475 : : *
2476 : : * We've just split the old root page and need to create a new one.
2477 : : * In order to do this, we add a new root page to the file, then lock
2478 : : * the metadata page and update it. This is guaranteed to be deadlock-
2479 : : * free, because all readers release their locks on the metadata page
2480 : : * before trying to lock the root, and all writers lock the root before
2481 : : * trying to lock the metadata page. We have a write lock on the old
2482 : : * root page, so we have not introduced any cycles into the waits-for
2483 : : * graph.
2484 : : *
2485 : : * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2486 : : * locked. On exit, a new root page exists with entries for the
2487 : : * two new children, metapage is updated and unlocked/unpinned.
2488 : : * The new root buffer is returned to caller which has to unlock/unpin
2489 : : * lbuf, rbuf & rootbuf.
2490 : : */
2491 : : static Buffer
1009 pg@bowt.ie 2492 :CBC 711 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2493 : : {
2494 : : Buffer rootbuf;
2495 : : Page lpage,
2496 : : rootpage;
2497 : : BlockNumber lbkno,
2498 : : rbkno;
2499 : : BlockNumber rootblknum;
2500 : : BTPageOpaque rootopaque;
2501 : : BTPageOpaque lopaque;
2502 : : ItemId itemid;
2503 : : IndexTuple item;
2504 : : IndexTuple left_item;
2505 : : Size left_item_sz;
2506 : : IndexTuple right_item;
2507 : : Size right_item_sz;
2508 : : Buffer metabuf;
2509 : : Page metapg;
2510 : : BTMetaPageData *metad;
2511 : : XLogRecPtr recptr;
2512 : :
8423 tgl@sss.pgh.pa.us 2513 : 711 : lbkno = BufferGetBlockNumber(lbuf);
2514 : 711 : rbkno = BufferGetBlockNumber(rbuf);
3616 kgrittn@postgresql.o 2515 : 711 : lpage = BufferGetPage(lbuf);
1444 michael@paquier.xyz 2516 : 711 : lopaque = BTPageGetOpaque(lpage);
2517 : :
2518 : : /* get a new root page */
1009 pg@bowt.ie 2519 : 711 : rootbuf = _bt_allocbuf(rel, heaprel);
3616 kgrittn@postgresql.o 2520 : 711 : rootpage = BufferGetPage(rootbuf);
9293 vadim4o@yahoo.com 2521 : 711 : rootblknum = BufferGetBlockNumber(rootbuf);
2522 : :
2523 : : /* acquire lock on the metapage */
1009 pg@bowt.ie 2524 : 711 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
3616 kgrittn@postgresql.o 2525 : 711 : metapg = BufferGetPage(metabuf);
9208 vadim4o@yahoo.com 2526 : 711 : metad = BTPageGetMeta(metapg);
2527 : :
2528 : : /*
2529 : : * Create downlink item for left page (old root). The key value used is
2530 : : * "minus infinity", a sentinel value that's reliably less than any real
2531 : : * key value that could appear in the left page.
2532 : : */
4363 heikki.linnakangas@i 2533 : 711 : left_item_sz = sizeof(IndexTupleData);
2534 : 711 : left_item = (IndexTuple) palloc(left_item_sz);
2535 : 711 : left_item->t_info = left_item_sz;
2281 pg@bowt.ie 2536 : 711 : BTreeTupleSetDownLink(left_item, lbkno);
2168 2537 : 711 : BTreeTupleSetNAtts(left_item, 0, false);
2538 : :
2539 : : /*
2540 : : * Create downlink item for right page. The key for it is obtained from
2541 : : * the "high key" position in the left page.
2542 : : */
4363 heikki.linnakangas@i 2543 : 711 : itemid = PageGetItemId(lpage, P_HIKEY);
2544 : 711 : right_item_sz = ItemIdGetLength(itemid);
2545 : 711 : item = (IndexTuple) PageGetItem(lpage, itemid);
2546 : 711 : right_item = CopyIndexTuple(item);
2281 pg@bowt.ie 2547 : 711 : BTreeTupleSetDownLink(right_item, rbkno);
2548 : :
2549 : : /* NO EREPORT(ERROR) from here till newroot op is logged */
9193 tgl@sss.pgh.pa.us 2550 : 711 : START_CRIT_SECTION();
2551 : :
2552 : : /* upgrade metapage if needed */
2552 pg@bowt.ie 2553 [ - + ]: 711 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2846 teodor@sigaev.ru 2554 :UBC 0 : _bt_upgrademetapage(metapg);
2555 : :
2556 : : /* set btree special data */
1444 michael@paquier.xyz 2557 :CBC 711 : rootopaque = BTPageGetOpaque(rootpage);
10416 bruce@momjian.us 2558 : 711 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
8423 tgl@sss.pgh.pa.us 2559 : 711 : rootopaque->btpo_flags = BTP_ROOT;
1845 pg@bowt.ie 2560 : 711 : rootopaque->btpo_level =
1444 michael@paquier.xyz 2561 : 711 : (BTPageGetOpaque(lpage))->btpo_level + 1;
7251 tgl@sss.pgh.pa.us 2562 : 711 : rootopaque->btpo_cycleid = 0;
2563 : :
2564 : : /* update metapage data */
8423 2565 : 711 : metad->btm_root = rootblknum;
1845 pg@bowt.ie 2566 : 711 : metad->btm_level = rootopaque->btpo_level;
8423 tgl@sss.pgh.pa.us 2567 : 711 : metad->btm_fastroot = rootblknum;
1845 pg@bowt.ie 2568 : 711 : metad->btm_fastlevel = rootopaque->btpo_level;
2569 : :
2570 : : /*
2571 : : * Insert the left page pointer into the new root page. The root page is
2572 : : * the rightmost page on its level so there is no "high key" in it; the
2573 : : * two items will go into positions P_HIKEY and P_FIRSTKEY.
2574 : : *
2575 : : * Note: we *must* insert the two items in item-number order, for the
2576 : : * benefit of _bt_restore_page().
2577 : : */
2887 teodor@sigaev.ru 2578 [ + - - + ]: 711 : Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
139 peter@eisentraut.org 2579 [ - + ]:GNC 711 : if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
6649 tgl@sss.pgh.pa.us 2580 [ # # ]:UBC 0 : elog(PANIC, "failed to add leftkey to new root page"
2581 : : " while splitting block %u of index \"%s\"",
2582 : : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2583 : :
2584 : : /*
2585 : : * insert the right page pointer into the new root page.
2586 : : */
2552 pg@bowt.ie 2587 [ + - - + ]:CBC 711 : Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2588 [ + - - + ]: 711 : Assert(BTreeTupleGetNAtts(right_item, rel) <=
2589 : : IndexRelationGetNumberOfKeyAttributes(rel));
139 peter@eisentraut.org 2590 [ - + ]:GNC 711 : if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
6649 tgl@sss.pgh.pa.us 2591 [ # # ]:UBC 0 : elog(PANIC, "failed to add rightkey to new root page"
2592 : : " while splitting block %u of index \"%s\"",
2593 : : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2594 : :
2595 : : /* Clear the incomplete-split flag in the left child */
4380 heikki.linnakangas@i 2596 [ - + ]:CBC 711 : Assert(P_INCOMPLETE_SPLIT(lopaque));
2597 : 711 : lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2598 : 711 : MarkBufferDirty(lbuf);
2599 : :
7289 tgl@sss.pgh.pa.us 2600 : 711 : MarkBufferDirty(rootbuf);
2601 : 711 : MarkBufferDirty(metabuf);
2602 : :
2603 : : /* XLOG stuff */
5571 rhaas@postgresql.org 2604 [ + + + + : 711 : if (RelationNeedsWAL(rel))
+ - + - ]
9293 vadim4o@yahoo.com 2605 :GIC 695 : {
2606 : : xl_btree_newroot xlrec;
2607 : : xl_btree_metadata md;
2608 : :
8423 tgl@sss.pgh.pa.us 2609 :CBC 695 : xlrec.rootblk = rootblknum;
9208 vadim4o@yahoo.com 2610 : 695 : xlrec.level = metad->btm_level;
2611 : :
4133 heikki.linnakangas@i 2612 : 695 : XLogBeginInsert();
397 peter@eisentraut.org 2613 : 695 : XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
2614 : :
4133 heikki.linnakangas@i 2615 : 695 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2616 : 695 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
3054 tgl@sss.pgh.pa.us 2617 : 695 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2618 : :
2552 pg@bowt.ie 2619 [ - + ]: 695 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2620 : 695 : md.version = metad->btm_version;
4133 heikki.linnakangas@i 2621 : 695 : md.root = rootblknum;
2622 : 695 : md.level = metad->btm_level;
2623 : 695 : md.fastroot = rootblknum;
2624 : 695 : md.fastlevel = metad->btm_level;
1845 pg@bowt.ie 2625 : 695 : md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2209 2626 : 695 : md.allequalimage = metad->btm_allequalimage;
2627 : :
397 peter@eisentraut.org 2628 : 695 : XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2629 : :
2630 : : /*
2631 : : * Direct access to page is not good but faster - we should implement
2632 : : * some new func in page API.
2633 : : */
4133 heikki.linnakangas@i 2634 : 695 : XLogRegisterBufData(0,
3189 tgl@sss.pgh.pa.us 2635 : 695 : (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
4133 heikki.linnakangas@i 2636 : 695 : ((PageHeader) rootpage)->pd_special -
2637 : 695 : ((PageHeader) rootpage)->pd_upper);
2638 : :
2639 : 695 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2640 : : }
2641 : : else
2 pg@bowt.ie 2642 :GNC 16 : recptr = XLogGetFakeLSN(rel);
2643 : :
2644 : 711 : PageSetLSN(lpage, recptr);
2645 : 711 : PageSetLSN(rootpage, recptr);
2646 : 711 : PageSetLSN(metapg, recptr);
2647 : :
9193 tgl@sss.pgh.pa.us 2648 [ - + ]:CBC 711 : END_CRIT_SECTION();
2649 : :
2650 : : /* done with metapage */
7289 2651 : 711 : _bt_relbuf(rel, metabuf);
2652 : :
4363 heikki.linnakangas@i 2653 : 711 : pfree(left_item);
2654 : 711 : pfree(right_item);
2655 : :
7368 neilc@samurai.com 2656 : 711 : return rootbuf;
2657 : : }
2658 : :
2659 : : /*
2660 : : * _bt_pgaddtup() -- add a data item to a particular page during split.
2661 : : *
2662 : : * The difference between this routine and a bare PageAddItem call is
2663 : : * that this code can deal with the first data item on an internal btree
2664 : : * page in passing. This data item (which is called "firstright" within
2665 : : * _bt_split()) has a key that must be treated as minus infinity after
2666 : : * the split. Therefore, we truncate away all attributes when caller
2667 : : * specifies it's the first data item on page (downlink is not changed,
2668 : : * though). This extra step is only needed for the right page of an
2669 : : * internal page split. There is no need to do this for the first data
2670 : : * item on the existing/left page, since that will already have been
2671 : : * truncated during an earlier page split.
2672 : : *
2673 : : * See _bt_split() for a high level explanation of why we truncate here.
2674 : : * Note that this routine has nothing to do with suffix truncation,
2675 : : * despite using some of the same infrastructure.
2676 : : */
2677 : : static inline bool
5677 tgl@sss.pgh.pa.us 2678 : 4141784 : _bt_pgaddtup(Page page,
2679 : : Size itemsize,
2680 : : const IndexTupleData *itup,
2681 : : OffsetNumber itup_off,
2682 : : bool newfirstdataitem)
2683 : : {
2684 : : IndexTupleData trunctuple;
2685 : :
2162 pg@bowt.ie 2686 [ + + ]: 4141784 : if (newfirstdataitem)
2687 : : {
7354 tgl@sss.pgh.pa.us 2688 : 158 : trunctuple = *itup;
2689 : 158 : trunctuple.t_info = sizeof(IndexTupleData);
2168 pg@bowt.ie 2690 : 158 : BTreeTupleSetNAtts(&trunctuple, 0, false);
7354 tgl@sss.pgh.pa.us 2691 : 158 : itup = &trunctuple;
2692 : 158 : itemsize = sizeof(IndexTupleData);
2693 : : }
2694 : :
139 peter@eisentraut.org 2695 [ - + ]:GNC 4141784 : if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
5677 tgl@sss.pgh.pa.us 2696 :UBC 0 : return false;
2697 : :
5677 tgl@sss.pgh.pa.us 2698 :CBC 4141784 : return true;
2699 : : }
2700 : :
2701 : : /*
2702 : : * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2703 : : *
2704 : : * There are three operations performed here: simple index deletion, bottom-up
2705 : : * index deletion, and deduplication. If all three operations fail to free
2706 : : * enough space for the incoming item then caller will go on to split the
2707 : : * page. We always consider simple deletion first. If that doesn't work out
2708 : : * we consider alternatives. Callers that only want us to consider simple
2709 : : * deletion (without any fallback) ask for that using the 'simpleonly'
2710 : : * argument.
2711 : : *
2712 : : * We usually pick only one alternative "complex" operation when simple
2713 : : * deletion alone won't prevent a page split. The 'checkingunique',
2714 : : * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2715 : : *
2716 : : * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2717 : : * level flag was found set. The flag was useful back when there wasn't
2718 : : * necessarily one single page for a duplicate tuple to go on (before heap TID
2719 : : * became a part of the key space in version 4 indexes). But we don't
2720 : : * actually look at the flag anymore (it's not a gating condition for our
2721 : : * caller). That would cause us to miss tuples that are safe to delete,
2722 : : * without getting any benefit in return. We know that the alternative is to
2723 : : * split the page; scanning the line pointer array in passing won't have
2724 : : * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2725 : : * all this because !heapkeyspace indexes must still do a "getting tired"
2726 : : * linear search, and so are likely to get some benefit from using it as a
2727 : : * gating condition.)
2728 : : */
2729 : : static void
1944 pg@bowt.ie 2730 : 28232 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2731 : : BTInsertState insertstate,
2732 : : bool simpleonly, bool checkingunique,
2733 : : bool uniquedup, bool indexUnchanged)
2734 : : {
2735 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
7102 bruce@momjian.us 2736 : 28232 : int ndeletable = 0;
2737 : : OffsetNumber offnum,
2738 : : minoff,
2739 : : maxoff;
1944 pg@bowt.ie 2740 : 28232 : Buffer buffer = insertstate->buf;
2741 : 28232 : BTScanInsert itup_key = insertstate->itup_key;
3616 kgrittn@postgresql.o 2742 : 28232 : Page page = BufferGetPage(buffer);
1444 michael@paquier.xyz 2743 : 28232 : BTPageOpaque opaque = BTPageGetOpaque(page);
2744 : :
2552 pg@bowt.ie 2745 [ - + ]: 28232 : Assert(P_ISLEAF(opaque));
1887 2746 [ + + - + ]: 28232 : Assert(simpleonly || itup_key->heapkeyspace);
2747 [ + + + - : 28232 : Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
+ - - + ]
2748 : :
2749 : : /*
2750 : : * Scan over all items to see which ones need to be deleted according to
2751 : : * LP_DEAD flags. We'll usually manage to delete a few extra items that
2752 : : * are not marked LP_DEAD in passing. Often the extra items that actually
2753 : : * end up getting deleted are items that would have had their LP_DEAD bit
2754 : : * set before long anyway (if we opted not to include them as extras).
2755 : : */
2756 [ + + ]: 28232 : minoff = P_FIRSTDATAKEY(opaque);
7173 tgl@sss.pgh.pa.us 2757 : 28232 : maxoff = PageGetMaxOffsetNumber(page);
1887 pg@bowt.ie 2758 : 28232 : for (offnum = minoff;
7173 tgl@sss.pgh.pa.us 2759 [ + + ]: 7773867 : offnum <= maxoff;
2760 : 7745635 : offnum = OffsetNumberNext(offnum))
2761 : : {
7102 bruce@momjian.us 2762 : 7745635 : ItemId itemId = PageGetItemId(page, offnum);
2763 : :
6759 tgl@sss.pgh.pa.us 2764 [ + + ]: 7745635 : if (ItemIdIsDead(itemId))
7173 2765 : 130296 : deletable[ndeletable++] = offnum;
2766 : : }
2767 : :
2768 [ + + ]: 28232 : if (ndeletable > 0)
2769 : : {
1887 pg@bowt.ie 2770 : 3790 : _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2771 : : insertstate->itup, minoff, maxoff);
1944 2772 : 3790 : insertstate->bounds_valid = false;
2773 : :
2774 : : /* Return when a page split has already been avoided */
2775 [ + + ]: 3790 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
2776 : 13076 : return;
2777 : :
2778 : : /* Might as well assume duplicates (if checkingunique) */
2779 : 52 : uniquedup = true;
2780 : : }
2781 : :
2782 : : /*
2783 : : * We're done with simple deletion. Return early with callers that only
2784 : : * call here so that simple deletion can be considered. This includes
2785 : : * callers that explicitly ask for this and checkingunique callers that
2786 : : * probably don't have any version churn duplicates on the page.
2787 : : *
2788 : : * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2789 : : * return at this point (or when we go on the try either or both of our
2790 : : * other strategies and they also fail). We do not bother expending a
2791 : : * separate write to clear it, however. Caller will definitely clear it
2792 : : * when it goes on to split the page (note also that the deduplication
2793 : : * process will clear the flag in passing, just to keep things tidy).
2794 : : */
1887 2795 [ + - + + : 24494 : if (simpleonly || (checkingunique && !uniquedup))
+ + ]
2796 : : {
2797 [ - + ]: 9124 : Assert(!indexUnchanged);
1944 2798 : 9124 : return;
2799 : : }
2800 : :
2801 : : /* Assume bounds about to be invalidated (this is almost certain now) */
2802 : 15370 : insertstate->bounds_valid = false;
2803 : :
2804 : : /*
2805 : : * Perform bottom-up index deletion pass when executor hint indicated that
2806 : : * incoming item is logically unchanged, or for a unique index that is
2807 : : * known to have physical duplicates for some other reason. (There is a
2808 : : * large overlap between these two cases for a unique index. It's worth
2809 : : * having both triggering conditions in order to apply the optimization in
2810 : : * the event of successive related INSERT and DELETE statements.)
2811 : : *
2812 : : * We'll go on to do a deduplication pass when a bottom-up pass fails to
2813 : : * delete an acceptable amount of free space (a significant fraction of
2814 : : * the page, or space for the new item, whichever is greater).
2815 : : *
2816 : : * Note: Bottom-up index deletion uses the same equality/equivalence
2817 : : * routines as deduplication internally. However, it does not merge
2818 : : * together index tuples, so the same correctness considerations do not
2819 : : * apply. We deliberately omit an index-is-allequalimage test here.
2820 : : */
1887 2821 [ + + + + : 17357 : if ((indexUnchanged || uniquedup) &&
+ + ]
2822 : 1987 : _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2823 : 214 : return;
2824 : :
2825 : : /* Perform deduplication pass (when enabled and index-is-allequalimage) */
1944 2826 [ + - - + : 15156 : if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ + + + +
+ + - ]
1062 2827 : 15147 : _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2828 [ + + + + ]: 15147 : (indexUnchanged || uniquedup));
2829 : : }
2830 : :
2831 : : /*
2832 : : * _bt_simpledel_pass - Simple index tuple deletion pass.
2833 : : *
2834 : : * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2835 : : * of all such tuples are determined by caller (caller passes these to us as
2836 : : * its 'deletable' argument).
2837 : : *
2838 : : * We might also delete extra index tuples that turn out to be safe to delete
2839 : : * in passing (though they must be cheap to check in passing to begin with).
2840 : : * There is no certainty that any extra tuples will be deleted, though. The
2841 : : * high level goal of the approach we take is to get the most out of each call
2842 : : * here (without noticeably increasing the per-call overhead compared to what
2843 : : * we need to do just to be able to delete the page's LP_DEAD-marked index
2844 : : * tuples).
2845 : : *
2846 : : * The number of extra index tuples that turn out to be deletable might
2847 : : * greatly exceed the number of LP_DEAD-marked index tuples due to various
2848 : : * locality related effects. For example, it's possible that the total number
2849 : : * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2850 : : * quite low, in which case we might end up checking if it's possible to
2851 : : * delete _most_ index tuples on the page (without the tableam needing to
2852 : : * access additional table blocks). The tableam will sometimes stumble upon
2853 : : * _many_ extra deletable index tuples in indexes where this pattern is
2854 : : * common.
2855 : : *
2856 : : * See nbtree/README for further details on simple index tuple deletion.
2857 : : */
2858 : : static void
1887 2859 : 3790 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2860 : : OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2861 : : OffsetNumber minoff, OffsetNumber maxoff)
2862 : : {
2863 : 3790 : Page page = BufferGetPage(buffer);
2864 : : BlockNumber *deadblocks;
2865 : : int ndeadblocks;
2866 : : TM_IndexDeleteOp delstate;
2867 : : OffsetNumber offnum;
2868 : :
2869 : : /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2870 : 3790 : deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2871 : : &ndeadblocks);
2872 : :
2873 : : /* Initialize tableam state that describes index deletion operation */
1592 2874 : 3790 : delstate.irel = rel;
2875 : 3790 : delstate.iblknum = BufferGetBlockNumber(buffer);
1887 2876 : 3790 : delstate.bottomup = false;
2877 : 3790 : delstate.bottomupfreespace = 0;
2878 : 3790 : delstate.ndeltids = 0;
2879 : 3790 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2880 : 3790 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2881 : :
2882 : 3790 : for (offnum = minoff;
2883 [ + + ]: 1086104 : offnum <= maxoff;
2884 : 1082314 : offnum = OffsetNumberNext(offnum))
2885 : : {
2886 : 1082314 : ItemId itemid = PageGetItemId(page, offnum);
2887 : 1082314 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2888 : 1082314 : TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2889 : 1082314 : TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2890 : : BlockNumber tidblock;
2891 : : void *match;
2892 : :
2893 [ + + ]: 1082314 : if (!BTreeTupleIsPosting(itup))
2894 : : {
2895 : 1033103 : tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2896 : 1033103 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2897 : : sizeof(BlockNumber), _bt_blk_cmp);
2898 : :
2899 [ + + ]: 1033103 : if (!match)
2900 : : {
2901 [ - + ]: 650883 : Assert(!ItemIdIsDead(itemid));
2902 : 650883 : continue;
2903 : : }
2904 : :
2905 : : /*
2906 : : * TID's table block is among those pointed to by the TIDs from
2907 : : * LP_DEAD-bit set tuples on page -- add TID to deltids
2908 : : */
2909 : 382220 : odeltid->tid = itup->t_tid;
2910 : 382220 : odeltid->id = delstate.ndeltids;
2911 : 382220 : ostatus->idxoffnum = offnum;
2912 : 382220 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2913 : 382220 : ostatus->promising = false; /* unused */
2914 : 382220 : ostatus->freespace = 0; /* unused */
2915 : :
2916 : 382220 : delstate.ndeltids++;
2917 : : }
2918 : : else
2919 : : {
2920 : 49211 : int nitem = BTreeTupleGetNPosting(itup);
2921 : :
2922 [ + + ]: 245684 : for (int p = 0; p < nitem; p++)
2923 : : {
2924 : 196473 : ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2925 : :
2926 : 196473 : tidblock = ItemPointerGetBlockNumber(tid);
2927 : 196473 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2928 : : sizeof(BlockNumber), _bt_blk_cmp);
2929 : :
2930 [ + + ]: 196473 : if (!match)
2931 : : {
2932 [ - + ]: 172714 : Assert(!ItemIdIsDead(itemid));
2933 : 172714 : continue;
2934 : : }
2935 : :
2936 : : /*
2937 : : * TID's table block is among those pointed to by the TIDs
2938 : : * from LP_DEAD-bit set tuples on page -- add TID to deltids
2939 : : */
2940 : 23759 : odeltid->tid = *tid;
2941 : 23759 : odeltid->id = delstate.ndeltids;
2942 : 23759 : ostatus->idxoffnum = offnum;
2943 : 23759 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2944 : 23759 : ostatus->promising = false; /* unused */
2945 : 23759 : ostatus->freespace = 0; /* unused */
2946 : :
2947 : 23759 : odeltid++;
2948 : 23759 : ostatus++;
2949 : 23759 : delstate.ndeltids++;
2950 : : }
2951 : : }
2952 : : }
2953 : :
2954 : 3790 : pfree(deadblocks);
2955 : :
2956 [ - + ]: 3790 : Assert(delstate.ndeltids >= ndeletable);
2957 : :
2958 : : /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2959 : 3790 : _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2960 : :
2961 : 3790 : pfree(delstate.deltids);
2962 : 3790 : pfree(delstate.status);
2963 : 3790 : }
2964 : :
2965 : : /*
2966 : : * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2967 : : *
2968 : : * Builds sorted and unique-ified array of table block numbers from index
2969 : : * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2970 : : * block from incoming newitem just in case it isn't among the LP_DEAD-related
2971 : : * table blocks.
2972 : : *
2973 : : * Always counting the newitem's table block as an LP_DEAD related block makes
2974 : : * sense because the cost is consistently low; it is practically certain that
2975 : : * the table block will not incur a buffer miss in tableam. On the other hand
2976 : : * the benefit is often quite high. There is a decent chance that there will
2977 : : * be some deletable items from this block, since in general most garbage
2978 : : * tuples became garbage in the recent past (in many cases this won't be the
2979 : : * first logical row that core code added to/modified in table block
2980 : : * recently).
2981 : : *
2982 : : * Returns final array, and sets *nblocks to its final size for caller.
2983 : : */
2984 : : static BlockNumber *
2985 : 3790 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2986 : : IndexTuple newitem, int *nblocks)
2987 : : {
2988 : : int spacentids,
2989 : : ntids;
2990 : : BlockNumber *tidblocks;
2991 : :
2992 : : /*
2993 : : * Accumulate each TID's block in array whose initial size has space for
2994 : : * one table block per LP_DEAD-set tuple (plus space for the newitem table
2995 : : * block). Array will only need to grow when there are LP_DEAD-marked
2996 : : * posting list tuples (which is not that common).
2997 : : */
2998 : 3790 : spacentids = ndeletable + 1;
2999 : 3790 : ntids = 0;
95 michael@paquier.xyz 3000 :GNC 3790 : tidblocks = palloc_array(BlockNumber, spacentids);
3001 : :
3002 : : /*
3003 : : * First add the table block for the incoming newitem. This is the one
3004 : : * case where simple deletion can visit a table block that doesn't have
3005 : : * any known deletable items.
3006 : : */
1887 pg@bowt.ie 3007 [ + - - + ]:CBC 3790 : Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
3008 : 3790 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
3009 : :
3010 [ + + ]: 134086 : for (int i = 0; i < ndeletable; i++)
3011 : : {
3012 : 130296 : ItemId itemid = PageGetItemId(page, deletable[i]);
3013 : 130296 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
3014 : :
3015 [ - + ]: 130296 : Assert(ItemIdIsDead(itemid));
3016 : :
3017 [ + + ]: 130296 : if (!BTreeTupleIsPosting(itup))
3018 : : {
3019 [ + + ]: 126062 : if (ntids + 1 > spacentids)
3020 : : {
3021 : 107 : spacentids *= 2;
3022 : : tidblocks = (BlockNumber *)
3023 : 107 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3024 : : }
3025 : :
3026 : 126062 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
3027 : : }
3028 : : else
3029 : : {
3030 : 4234 : int nposting = BTreeTupleGetNPosting(itup);
3031 : :
3032 [ + + ]: 4234 : if (ntids + nposting > spacentids)
3033 : : {
3034 : 111 : spacentids = Max(spacentids * 2, ntids + nposting);
3035 : : tidblocks = (BlockNumber *)
3036 : 111 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3037 : : }
3038 : :
3039 [ + + ]: 14381 : for (int j = 0; j < nposting; j++)
3040 : : {
3041 : 10147 : ItemPointer tid = BTreeTupleGetPostingN(itup, j);
3042 : :
3043 : 10147 : tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3044 : : }
3045 : : }
3046 : : }
3047 : :
3048 : 3790 : qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3049 : 3790 : *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3050 : :
3051 : 3790 : return tidblocks;
3052 : : }
3053 : :
3054 : : /*
3055 : : * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3056 : : */
3057 : : static inline int
3058 : 2750511 : _bt_blk_cmp(const void *arg1, const void *arg2)
3059 : : {
48 peter@eisentraut.org 3060 :GNC 2750511 : BlockNumber b1 = *((const BlockNumber *) arg1);
3061 : 2750511 : BlockNumber b2 = *((const BlockNumber *) arg2);
3062 : :
758 nathan@postgresql.or 3063 :CBC 2750511 : return pg_cmp_u32(b1, b2);
3064 : : }
|