LCOV - differential code coverage report
Current view: top level - src/backend/access/nbtree - nbtinsert.c (source / functions) Coverage Total Hit UNC LBC UBC GBC GIC GNC CBC DUB DCB
Current: 0e5ff9b9b45a657aea12440478dc002e9b01f138 vs 0123ce131fca454009439dfa3b2266d1d40737d7 Lines: 92.6 % 860 796 64 26 3 50 717 7 30
Current Date: 2026-03-14 14:10:32 -0400 Functions: 100.0 % 17 17 9 8
Baseline: lcov-20260315-024220-baseline Branches: 69.3 % 700 485 7 2 206 18 19 448 6 14
Baseline Date: 2026-03-14 15:27:56 +0100 Line coverage date bins:
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
(1,7] days: 100.0 % 29 29 29
(30,360] days: 100.0 % 21 21 21
(360..) days: 92.1 % 810 746 64 26 3 717
Function coverage date bins:
(1,7] days: 100.0 % 1 1 1
(360..) days: 100.0 % 16 16 8 8
Branch coverage date bins:
(1,7] days: 91.7 % 12 11 1 11
(30,360] days: 57.1 % 14 8 6 8
(360..) days: 69.1 % 674 466 2 206 18 448

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * nbtinsert.c
                                  4                 :                :  *    Item insertion in Lehman and Yao btrees for Postgres.
                                  5                 :                :  *
                                  6                 :                :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
                                  7                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :                :  *
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/backend/access/nbtree/nbtinsert.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : 
                                 16                 :                : #include "postgres.h"
                                 17                 :                : 
                                 18                 :                : #include "access/nbtree.h"
                                 19                 :                : #include "access/nbtxlog.h"
                                 20                 :                : #include "access/tableam.h"
                                 21                 :                : #include "access/transam.h"
                                 22                 :                : #include "access/xloginsert.h"
                                 23                 :                : #include "common/int.h"
                                 24                 :                : #include "common/pg_prng.h"
                                 25                 :                : #include "lib/qunique.h"
                                 26                 :                : #include "miscadmin.h"
                                 27                 :                : #include "storage/lmgr.h"
                                 28                 :                : #include "storage/predicate.h"
                                 29                 :                : #include "utils/injection_point.h"
                                 30                 :                : 
                                 31                 :                : /* Minimum tree height for application of fastpath optimization */
                                 32                 :                : #define BTREE_FASTPATH_MIN_LEVEL    2
                                 33                 :                : 
                                 34                 :                : 
                                 35                 :                : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
                                 36                 :                :                                  BTInsertState insertstate);
                                 37                 :                : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
                                 38                 :                :                                       Relation heapRel,
                                 39                 :                :                                       IndexUniqueCheck checkUnique, bool *is_unique,
                                 40                 :                :                                       uint32 *speculativeToken);
                                 41                 :                : static OffsetNumber _bt_findinsertloc(Relation rel,
                                 42                 :                :                                       BTInsertState insertstate,
                                 43                 :                :                                       bool checkingunique,
                                 44                 :                :                                       bool indexUnchanged,
                                 45                 :                :                                       BTStack stack,
                                 46                 :                :                                       Relation heapRel);
                                 47                 :                : static void _bt_stepright(Relation rel, Relation heaprel,
                                 48                 :                :                           BTInsertState insertstate, BTStack stack);
                                 49                 :                : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
                                 50                 :                :                            Buffer buf,
                                 51                 :                :                            Buffer cbuf,
                                 52                 :                :                            BTStack stack,
                                 53                 :                :                            IndexTuple itup,
                                 54                 :                :                            Size itemsz,
                                 55                 :                :                            OffsetNumber newitemoff,
                                 56                 :                :                            int postingoff,
                                 57                 :                :                            bool split_only_page);
                                 58                 :                : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
                                 59                 :                :                         Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
                                 60                 :                :                         Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
                                 61                 :                :                         IndexTuple nposting, uint16 postingoff);
                                 62                 :                : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
                                 63                 :                :                               Buffer rbuf, BTStack stack, bool isroot, bool isonly);
                                 64                 :                : static void _bt_freestack(BTStack stack);
                                 65                 :                : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
                                 66                 :                : static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
                                 67                 :                :                                 OffsetNumber itup_off, bool newfirstdataitem);
                                 68                 :                : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
                                 69                 :                :                                          BTInsertState insertstate,
                                 70                 :                :                                          bool simpleonly, bool checkingunique,
                                 71                 :                :                                          bool uniquedup, bool indexUnchanged);
                                 72                 :                : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
                                 73                 :                :                                OffsetNumber *deletable, int ndeletable,
                                 74                 :                :                                IndexTuple newitem, OffsetNumber minoff,
                                 75                 :                :                                OffsetNumber maxoff);
                                 76                 :                : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
                                 77                 :                :                                    int ndeletable, IndexTuple newitem,
                                 78                 :                :                                    int *nblocks);
                                 79                 :                : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
                                 80                 :                : 
                                 81                 :                : /*
                                 82                 :                :  *  _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
                                 83                 :                :  *
                                 84                 :                :  *      This routine is called by the public interface routine, btinsert.
                                 85                 :                :  *      By here, itup is filled in, including the TID.
                                 86                 :                :  *
                                 87                 :                :  *      If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
                                 88                 :                :  *      will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
                                 89                 :                :  *      UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
                                 90                 :                :  *      For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
                                 91                 :                :  *      don't actually insert.
                                 92                 :                :  *
                                 93                 :                :  *      indexUnchanged executor hint indicates if itup is from an
                                 94                 :                :  *      UPDATE that didn't logically change the indexed value, but
                                 95                 :                :  *      must nevertheless have a new entry to point to a successor
                                 96                 :                :  *      version.
                                 97                 :                :  *
                                 98                 :                :  *      The result value is only significant for UNIQUE_CHECK_PARTIAL:
                                 99                 :                :  *      it must be true if the entry is known unique, else false.
                                100                 :                :  *      (In the current implementation we'll also return true after a
                                101                 :                :  *      successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
                                102                 :                :  *      that's just a coding artifact.)
                                103                 :                :  */
                                104                 :                : bool
 7354 tgl@sss.pgh.pa.us         105                 :CBC     4273976 : _bt_doinsert(Relation rel, IndexTuple itup,
                                106                 :                :              IndexUniqueCheck checkUnique, bool indexUnchanged,
                                107                 :                :              Relation heapRel)
                                108                 :                : {
 6073                           109                 :        4273976 :     bool        is_unique = false;
                                110                 :                :     BTInsertStateData insertstate;
                                111                 :                :     BTScanInsert itup_key;
                                112                 :                :     BTStack     stack;
 2552 pg@bowt.ie                113                 :        4273976 :     bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);
                                114                 :                : 
                                115                 :                :     /* we need an insertion scan key to do our search, so build one */
 1009                           116                 :        4273976 :     itup_key = _bt_mkscankey(rel, itup);
                                117                 :                : 
 2518                           118         [ +  + ]:        4273976 :     if (checkingunique)
                                119                 :                :     {
                                120         [ +  + ]:        3056214 :         if (!itup_key->anynullkeys)
                                121                 :                :         {
                                122                 :                :             /* No (heapkeyspace) scantid until uniqueness established */
                                123                 :        3046127 :             itup_key->scantid = NULL;
                                124                 :                :         }
                                125                 :                :         else
                                126                 :                :         {
                                127                 :                :             /*
                                128                 :                :              * Scan key for new tuple contains NULL key values.  Bypass
                                129                 :                :              * checkingunique steps.  They are unnecessary because core code
                                130                 :                :              * considers NULL unequal to every value, including NULL.
                                131                 :                :              *
                                132                 :                :              * This optimization avoids O(N^2) behavior within the
                                133                 :                :              * _bt_findinsertloc() heapkeyspace path when a unique index has a
                                134                 :                :              * large number of "duplicates" with NULL key values.
                                135                 :                :              */
                                136                 :          10087 :             checkingunique = false;
                                137                 :                :             /* Tuple is unique in the sense that core code cares about */
                                138         [ -  + ]:          10087 :             Assert(checkUnique != UNIQUE_CHECK_EXISTING);
                                139                 :          10087 :             is_unique = true;
                                140                 :                :         }
                                141                 :                :     }
                                142                 :                : 
                                143                 :                :     /*
                                144                 :                :      * Fill in the BTInsertState working area, to track the current page and
                                145                 :                :      * position within the page to insert on.
                                146                 :                :      *
                                147                 :                :      * Note that itemsz is passed down to lower level code that deals with
                                148                 :                :      * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
                                149                 :                :      * accounting code consistently considers the alignment overhead that we
                                150                 :                :      * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
                                151                 :                :      * already conservative about alignment, but we don't rely on that from
                                152                 :                :      * this distance.  Besides, preserving the "true" tuple size in index
                                153                 :                :      * tuple headers for the benefit of nbtsplitloc.c might happen someday.
                                154                 :                :      * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
                                155                 :                :      */
 2552                           156                 :        4273976 :     insertstate.itup = itup;
                                157                 :        4273976 :     insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
                                158                 :        4273976 :     insertstate.itup_key = itup_key;
                                159                 :        4273976 :     insertstate.bounds_valid = false;
                                160                 :        4273976 :     insertstate.buf = InvalidBuffer;
 2209                           161                 :        4273976 :     insertstate.postingoff = 0;
                                162                 :                : 
 2188                           163                 :        4273988 : search:
                                164                 :                : 
                                165                 :                :     /*
                                166                 :                :      * Find and lock the leaf page that the tuple should be added to by
                                167                 :                :      * searching from the root page.  insertstate.buf will hold a buffer that
                                168                 :                :      * is locked in exclusive mode afterwards.
                                169                 :                :      */
 1079 andres@anarazel.de        170                 :        4273988 :     stack = _bt_search_insert(rel, heapRel, &insertstate);
                                171                 :                : 
                                172                 :                :     /*
                                173                 :                :      * checkingunique inserts are not allowed to go ahead when two tuples with
                                174                 :                :      * equal key attribute values would be visible to new MVCC snapshots once
                                175                 :                :      * the xact commits.  Check for conflicts in the locked page/buffer (if
                                176                 :                :      * needed) here.
                                177                 :                :      *
                                178                 :                :      * It might be necessary to check a page to the right in _bt_check_unique,
                                179                 :                :      * though that should be very rare.  In practice the first page the value
                                180                 :                :      * could be on (with scantid omitted) is almost always also the only page
                                181                 :                :      * that a matching tuple might be found on.  This is due to the behavior
                                182                 :                :      * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
                                183                 :                :      * only be allowed to cross a page boundary when there is no candidate
                                184                 :                :      * leaf page split point that avoids it.  Also, _bt_check_unique can use
                                185                 :                :      * the leaf page high key to determine that there will be no duplicates on
                                186                 :                :      * the right sibling without actually visiting it (it uses the high key in
                                187                 :                :      * cases where the new item happens to belong at the far right of the leaf
                                188                 :                :      * page).
                                189                 :                :      *
                                190                 :                :      * NOTE: obviously, _bt_check_unique can only detect keys that are already
                                191                 :                :      * in the index; so it cannot defend against concurrent insertions of the
                                192                 :                :      * same key.  We protect against that by means of holding a write lock on
                                193                 :                :      * the first page the value could be on, with omitted/-inf value for the
                                194                 :                :      * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
                                195                 :                :      * the same key must acquire a write lock on the same page, so only one
                                196                 :                :      * would-be inserter can be making the check at one time.  Furthermore,
                                197                 :                :      * once we are past the check we hold write locks continuously until we
                                198                 :                :      * have performed our insertion, so no later inserter can fail to see our
                                199                 :                :      * insertion.  (This requires some care in _bt_findinsertloc.)
                                200                 :                :      *
                                201                 :                :      * If we must wait for another xact, we release the lock while waiting,
                                202                 :                :      * and then must perform a new search.
                                203                 :                :      *
                                204                 :                :      * For a partial uniqueness check, we don't wait for the other xact. Just
                                205                 :                :      * let the tuple in and return false for possibly non-unique, or true for
                                206                 :                :      * definitely unique.
                                207                 :                :      */
 2552 pg@bowt.ie                208         [ +  + ]:        4273988 :     if (checkingunique)
                                209                 :                :     {
                                210                 :                :         TransactionId xwait;
                                211                 :                :         uint32      speculativeToken;
                                212                 :                : 
                                213                 :        3046139 :         xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                214                 :                :                                  &is_unique, &speculativeToken);
                                215                 :                : 
 2188                           216         [ +  + ]:        3045875 :         if (unlikely(TransactionIdIsValid(xwait)))
                                217                 :                :         {
                                218                 :                :             /* Have to wait for the other guy ... */
 2552                           219                 :             12 :             _bt_relbuf(rel, insertstate.buf);
                                220                 :             12 :             insertstate.buf = InvalidBuffer;
                                221                 :                : 
                                222                 :                :             /*
                                223                 :                :              * If it's a speculative insertion, wait for it to finish (ie. to
                                224                 :                :              * go ahead with the insertion, or kill the tuple).  Otherwise
                                225                 :                :              * wait for the transaction to finish as usual.
                                226                 :                :              */
 3964 andres@anarazel.de        227         [ -  + ]:             12 :             if (speculativeToken)
 3964 andres@anarazel.de        228                 :UBC           0 :                 SpeculativeInsertionWait(xwait, speculativeToken);
                                229                 :                :             else
 3964 andres@anarazel.de        230                 :CBC          12 :                 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
                                231                 :                : 
                                232                 :                :             /* start over... */
 2911 andrew@dunslane.net       233         [ -  + ]:             12 :             if (stack)
 2911 andrew@dunslane.net       234                 :UBC           0 :                 _bt_freestack(stack);
 2188 pg@bowt.ie                235                 :CBC          12 :             goto search;
                                236                 :                :         }
                                237                 :                : 
                                238                 :                :         /* Uniqueness is established -- restore heap tid as scantid */
 2552                           239         [ +  - ]:        3045863 :         if (itup_key->heapkeyspace)
                                240                 :        3045863 :             itup_key->scantid = &itup->t_tid;
                                241                 :                :     }
                                242                 :                : 
 6073 tgl@sss.pgh.pa.us         243         [ +  + ]:        4273712 :     if (checkUnique != UNIQUE_CHECK_EXISTING)
                                244                 :                :     {
                                245                 :                :         OffsetNumber newitemoff;
                                246                 :                : 
                                247                 :                :         /*
                                248                 :                :          * The only conflict predicate locking cares about for indexes is when
                                249                 :                :          * an index tuple insert conflicts with an existing lock.  We don't
                                250                 :                :          * know the actual page we're going to insert on for sure just yet in
                                251                 :                :          * checkingunique and !heapkeyspace cases, but it's okay to use the
                                252                 :                :          * first page the value could be on (with scantid omitted) instead.
                                253                 :                :          */
 2238 tmunro@postgresql.or      254                 :        4273685 :         CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
                                255                 :                : 
                                256                 :                :         /*
                                257                 :                :          * Do the insertion.  Note that insertstate contains cached binary
                                258                 :                :          * search bounds established within _bt_check_unique when insertion is
                                259                 :                :          * checkingunique.
                                260                 :                :          */
 2552 pg@bowt.ie                261                 :        4273682 :         newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
                                262                 :                :                                        indexUnchanged, stack, heapRel);
 1079 andres@anarazel.de        263                 :        4273682 :         _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
                                264                 :                :                        stack, itup, insertstate.itemsz, newitemoff,
                                265                 :                :                        insertstate.postingoff, false);
                                266                 :                :     }
                                267                 :                :     else
                                268                 :                :     {
                                269                 :                :         /* just release the buffer */
 2552 pg@bowt.ie                270                 :             27 :         _bt_relbuf(rel, insertstate.buf);
                                271                 :                :     }
                                272                 :                : 
                                273                 :                :     /* be tidy */
 2911 andrew@dunslane.net       274         [ +  + ]:        4273707 :     if (stack)
                                275                 :        3773883 :         _bt_freestack(stack);
 2552 pg@bowt.ie                276                 :        4273707 :     pfree(itup_key);
                                277                 :                : 
 6073 tgl@sss.pgh.pa.us         278                 :        4273707 :     return is_unique;
                                279                 :                : }
                                280                 :                : 
                                281                 :                : /*
                                282                 :                :  *  _bt_search_insert() -- _bt_search() wrapper for inserts
                                283                 :                :  *
                                284                 :                :  * Search the tree for a particular scankey, or more precisely for the first
                                285                 :                :  * leaf page it could be on.  Try to make use of the fastpath optimization's
                                286                 :                :  * rightmost leaf page cache before actually searching the tree from the root
                                287                 :                :  * page, though.
                                288                 :                :  *
                                289                 :                :  * Return value is a stack of parent-page pointers (though see notes about
                                290                 :                :  * fastpath optimization and page splits below).  insertstate->buf is set to
                                291                 :                :  * the address of the leaf-page buffer, which is write-locked and pinned in
                                292                 :                :  * all cases (if necessary by creating a new empty root page for caller).
                                293                 :                :  *
                                294                 :                :  * The fastpath optimization avoids most of the work of searching the tree
                                295                 :                :  * repeatedly when a single backend inserts successive new tuples on the
                                296                 :                :  * rightmost leaf page of an index.  A backend cache of the rightmost leaf
                                297                 :                :  * page is maintained within _bt_insertonpg(), and used here.  The cache is
                                298                 :                :  * invalidated here when an insert of a non-pivot tuple must take place on a
                                299                 :                :  * non-rightmost leaf page.
                                300                 :                :  *
                                301                 :                :  * The optimization helps with indexes on an auto-incremented field.  It also
                                302                 :                :  * helps with indexes on datetime columns, as well as indexes with lots of
                                303                 :                :  * NULL values.  (NULLs usually get inserted in the rightmost page for single
                                304                 :                :  * column indexes, since they usually get treated as coming after everything
                                305                 :                :  * else in the key space.  Individual NULL tuples will generally be placed on
                                306                 :                :  * the rightmost leaf page due to the influence of the heap TID column.)
                                307                 :                :  *
                                308                 :                :  * Note that we avoid applying the optimization when there is insufficient
                                309                 :                :  * space on the rightmost page to fit caller's new item.  This is necessary
                                310                 :                :  * because we'll need to return a real descent stack when a page split is
                                311                 :                :  * expected (actually, caller can cope with a leaf page split that uses a NULL
                                312                 :                :  * stack, but that's very slow and so must be avoided).  Note also that the
                                313                 :                :  * fastpath optimization acquires the lock on the page conditionally as a way
                                314                 :                :  * of reducing extra contention when there are concurrent insertions into the
                                315                 :                :  * rightmost page (we give up if we'd have to wait for the lock).  We assume
                                316                 :                :  * that it isn't useful to apply the optimization when there is contention,
                                317                 :                :  * since each per-backend cache won't stay valid for long.
                                318                 :                :  */
                                319                 :                : static BTStack
 1079 andres@anarazel.de        320                 :        4273988 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
                                321                 :                : {
 2188 pg@bowt.ie                322         [ -  + ]:        4273988 :     Assert(insertstate->buf == InvalidBuffer);
                                323         [ -  + ]:        4273988 :     Assert(!insertstate->bounds_valid);
                                324         [ -  + ]:        4273988 :     Assert(insertstate->postingoff == 0);
                                325                 :                : 
                                326   [ +  -  +  + ]:        4273988 :     if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
                                327                 :                :     {
                                328                 :                :         /* Simulate a _bt_getbuf() call with conditional locking */
                                329         [ +  - ]:          34656 :         insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
 2063                           330         [ +  + ]:          34656 :         if (_bt_conditionallockbuf(rel, insertstate->buf))
                                331                 :                :         {
                                332                 :                :             Page        page;
                                333                 :                :             BTPageOpaque opaque;
                                334                 :                : 
 2188                           335                 :          33787 :             _bt_checkpage(rel, insertstate->buf);
                                336                 :          33787 :             page = BufferGetPage(insertstate->buf);
 1444 michael@paquier.xyz       337                 :          33787 :             opaque = BTPageGetOpaque(page);
                                338                 :                : 
                                339                 :                :             /*
                                340                 :                :              * Check if the page is still the rightmost leaf page and has
                                341                 :                :              * enough free space to accommodate the new tuple.  Also check
                                342                 :                :              * that the insertion scan key is strictly greater than the first
                                343                 :                :              * non-pivot tuple on the page.  (Note that we expect itup_key's
                                344                 :                :              * scantid to be unset when our caller is a checkingunique
                                345                 :                :              * inserter.)
                                346                 :                :              */
 1944 pg@bowt.ie                347         [ +  + ]:          33787 :             if (P_RIGHTMOST(opaque) &&
                                348         [ +  - ]:          33752 :                 P_ISLEAF(opaque) &&
                                349         [ +  - ]:          33752 :                 !P_IGNORE(opaque) &&
 2188                           350   [ +  +  +  - ]:          67314 :                 PageGetFreeSpace(page) > insertstate->itemsz &&
                                351         [ +  + ]:          67124 :                 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
                                352                 :          33562 :                 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
                                353                 :                :             {
                                354                 :                :                 /*
                                355                 :                :                  * Caller can use the fastpath optimization because cached
                                356                 :                :                  * block is still rightmost leaf page, which can fit caller's
                                357                 :                :                  * new tuple without splitting.  Keep block in local cache for
                                358                 :                :                  * next insert, and have caller use NULL stack.
                                359                 :                :                  *
                                360                 :                :                  * Note that _bt_insert_parent() has an assertion that catches
                                361                 :                :                  * leaf page splits that somehow follow from a fastpath insert
                                362                 :                :                  * (it should only be passed a NULL stack when it must deal
                                363                 :                :                  * with a concurrent root page split, and never because a NULL
                                364                 :                :                  * stack was returned here).
                                365                 :                :                  */
                                366                 :          33545 :                 return NULL;
                                367                 :                :             }
                                368                 :                : 
                                369                 :                :             /* Page unsuitable for caller, drop lock and pin */
                                370                 :            242 :             _bt_relbuf(rel, insertstate->buf);
                                371                 :                :         }
                                372                 :                :         else
                                373                 :                :         {
                                374                 :                :             /* Lock unavailable, drop pin */
                                375                 :            869 :             ReleaseBuffer(insertstate->buf);
                                376                 :                :         }
                                377                 :                : 
                                378                 :                :         /* Forget block, since cache doesn't appear to be useful */
                                379                 :           1111 :         RelationSetTargetBlock(rel, InvalidBlockNumber);
                                380                 :                :     }
                                381                 :                : 
                                382                 :                :     /* Cannot use optimization -- descend tree, return proper descent stack */
 1079 andres@anarazel.de        383                 :        4240443 :     return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
                                384                 :                :                       BT_WRITE, true);
                                385                 :                : }
                                386                 :                : 
                                387                 :                : /*
                                388                 :                :  *  _bt_check_unique() -- Check for violation of unique index constraint
                                389                 :                :  *
                                390                 :                :  * Returns InvalidTransactionId if there is no conflict, else an xact ID
                                391                 :                :  * we must wait for to see if it commits a conflicting tuple.   If an actual
                                392                 :                :  * conflict is detected, no return --- just ereport().  If an xact ID is
                                393                 :                :  * returned, and the conflicting tuple still has a speculative insertion in
                                394                 :                :  * progress, *speculativeToken is set to non-zero, and the caller can wait for
                                395                 :                :  * the verdict on the insertion using SpeculativeInsertionWait().
                                396                 :                :  *
                                397                 :                :  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
                                398                 :                :  * InvalidTransactionId because we don't want to wait.  In this case we
                                399                 :                :  * set *is_unique to false if there is a potential conflict, and the
                                400                 :                :  * core code must redo the uniqueness check later.
                                401                 :                :  *
                                402                 :                :  * As a side-effect, sets state in insertstate that can later be used by
                                403                 :                :  * _bt_findinsertloc() to reuse most of the binary search work we do
                                404                 :                :  * here.
                                405                 :                :  *
                                406                 :                :  * This code treats NULLs as equal, unlike the default semantics for unique
                                407                 :                :  * indexes.  So do not call here when there are NULL values in scan key and
                                408                 :                :  * the index uses the default NULLS DISTINCT mode.
                                409                 :                :  */
                                410                 :                : static TransactionId
 2552 pg@bowt.ie                411                 :        3046139 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
                                412                 :                :                  IndexUniqueCheck checkUnique, bool *is_unique,
                                413                 :                :                  uint32 *speculativeToken)
                                414                 :                : {
                                415                 :        3046139 :     IndexTuple  itup = insertstate->itup;
 2101                           416                 :        3046139 :     IndexTuple  curitup = NULL;
 1802                           417                 :        3046139 :     ItemId      curitemid = NULL;
 2552                           418                 :        3046139 :     BTScanInsert itup_key = insertstate->itup_key;
                                419                 :                :     SnapshotData SnapshotDirty;
                                420                 :                :     OffsetNumber offset;
                                421                 :                :     OffsetNumber maxoff;
                                422                 :                :     Page        page;
                                423                 :                :     BTPageOpaque opaque;
 9368 tgl@sss.pgh.pa.us         424                 :        3046139 :     Buffer      nbuf = InvalidBuffer;
 6073                           425                 :        3046139 :     bool        found = false;
 2209 pg@bowt.ie                426                 :        3046139 :     bool        inposting = false;
                                427                 :        3046139 :     bool        prevalldead = true;
                                428                 :        3046139 :     int         curposti = 0;
                                429                 :                : 
                                430                 :                :     /* Assume unique until we find a duplicate */
 6073 tgl@sss.pgh.pa.us         431                 :        3046139 :     *is_unique = true;
                                432                 :                : 
 6930                           433                 :        3046139 :     InitDirtySnapshot(SnapshotDirty);
                                434                 :                : 
 2552 pg@bowt.ie                435                 :        3046139 :     page = BufferGetPage(insertstate->buf);
 1444 michael@paquier.xyz       436                 :        3046139 :     opaque = BTPageGetOpaque(page);
 9368 tgl@sss.pgh.pa.us         437                 :        3046139 :     maxoff = PageGetMaxOffsetNumber(page);
                                438                 :                : 
                                439                 :                :     /*
                                440                 :                :      * Find the first tuple with the same key.
                                441                 :                :      *
                                442                 :                :      * This also saves the binary search bounds in insertstate.  We use them
                                443                 :                :      * in the fastpath below, but also in the _bt_findinsertloc() call later.
                                444                 :                :      */
 2537 pg@bowt.ie                445         [ -  + ]:        3046139 :     Assert(!insertstate->bounds_valid);
 2552                           446                 :        3046139 :     offset = _bt_binsrch_insert(rel, insertstate);
                                447                 :                : 
                                448                 :                :     /*
                                449                 :                :      * Scan over all equal tuples, looking for live conflicts.
                                450                 :                :      */
                                451   [ +  +  -  + ]:        3046139 :     Assert(!insertstate->bounds_valid || insertstate->low == offset);
 2518                           452         [ -  + ]:        3046139 :     Assert(!itup_key->anynullkeys);
 2552                           453         [ +  - ]:        3046139 :     Assert(itup_key->scantid == NULL);
                                454                 :                :     for (;;)
                                455                 :                :     {
                                456                 :                :         /*
                                457                 :                :          * Each iteration of the loop processes one heap TID, not one index
                                458                 :                :          * tuple.  Current offset number for page isn't usually advanced on
                                459                 :                :          * iterations that process heap TIDs from posting list tuples.
                                460                 :                :          *
                                461                 :                :          * "inposting" state is set when _inside_ a posting list --- not when
                                462                 :                :          * we're at the start (or end) of a posting list.  We advance curposti
                                463                 :                :          * at the end of the iteration when inside a posting list tuple.  In
                                464                 :                :          * general, every loop iteration either advances the page offset or
                                465                 :                :          * advances curposti --- an iteration that handles the rightmost/max
                                466                 :                :          * heap TID in a posting list finally advances the page offset (and
                                467                 :                :          * unsets "inposting").
                                468                 :                :          *
                                469                 :                :          * Make sure the offset points to an actual index tuple before trying
                                470                 :                :          * to examine it...
                                471                 :                :          */
 9368 tgl@sss.pgh.pa.us         472         [ +  + ]:        9063481 :         if (offset <= maxoff)
                                473                 :                :         {
                                474                 :                :             /*
                                475                 :                :              * Fastpath: In most cases, we can use cached search bounds to
                                476                 :                :              * limit our consideration to items that are definitely
                                477                 :                :              * duplicates.  This fastpath doesn't apply when the original page
                                478                 :                :              * is empty, or when initial offset is past the end of the
                                479                 :                :              * original page, which may indicate that we need to examine a
                                480                 :                :              * second or subsequent page.
                                481                 :                :              *
                                482                 :                :              * Note that this optimization allows us to avoid calling
                                483                 :                :              * _bt_compare() directly when there are no duplicates, as long as
                                484                 :                :              * the offset where the key will go is not at the end of the page.
                                485                 :                :              */
 2552 pg@bowt.ie                486   [ +  +  +  + ]:        7237575 :             if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
                                487                 :                :             {
                                488         [ -  + ]:        1101905 :                 Assert(insertstate->bounds_valid);
                                489   [ +  +  -  + ]:        1101905 :                 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
                                490         [ -  + ]:        1101905 :                 Assert(insertstate->low <= insertstate->stricthigh);
 2518                           491         [ -  + ]:        1101905 :                 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
 2552                           492                 :        1101905 :                 break;
                                493                 :                :             }
                                494                 :                : 
                                495                 :                :             /*
                                496                 :                :              * We can skip items that are already marked killed.
                                497                 :                :              *
                                498                 :                :              * In the presence of heavy update activity an index may contain
                                499                 :                :              * many killed items with the same key; running _bt_compare() on
                                500                 :                :              * each killed item gets expensive.  Just advance over killed
                                501                 :                :              * items as quickly as we can.  We only apply _bt_compare() when
                                502                 :                :              * we get to a non-killed item.  We could reuse the bounds to
                                503                 :                :              * avoid _bt_compare() calls for known equal tuples, but it
                                504                 :                :              * doesn't seem worth it.
                                505                 :                :              */
 2209                           506         [ +  + ]:        6135670 :             if (!inposting)
                                507                 :        3877058 :                 curitemid = PageGetItemId(page, offset);
                                508   [ +  +  +  + ]:        6135670 :             if (inposting || !ItemIdIsDead(curitemid))
                                509                 :                :             {
                                510                 :                :                 ItemPointerData htid;
                                511                 :        5838337 :                 bool        all_dead = false;
                                512                 :                : 
                                513         [ +  + ]:        5838337 :                 if (!inposting)
                                514                 :                :                 {
                                515                 :                :                     /* Plain tuple, or first TID in posting list tuple */
                                516         [ +  + ]:        3579725 :                     if (_bt_compare(rel, itup_key, page, offset) != 0)
                                517                 :         105102 :                         break;  /* we're past all the equal tuples */
                                518                 :                : 
                                519                 :                :                     /* Advanced curitup */
                                520                 :        3474623 :                     curitup = (IndexTuple) PageGetItem(page, curitemid);
                                521         [ -  + ]:        3474623 :                     Assert(!BTreeTupleIsPivot(curitup));
                                522                 :                :                 }
                                523                 :                : 
                                524                 :                :                 /* okay, we gotta fetch the heap tuple using htid ... */
                                525         [ +  + ]:        5733235 :                 if (!BTreeTupleIsPosting(curitup))
                                526                 :                :                 {
                                527                 :                :                     /* ... htid is from simple non-pivot tuple */
                                528         [ -  + ]:        3451055 :                     Assert(!inposting);
                                529                 :        3451055 :                     htid = curitup->t_tid;
                                530                 :                :                 }
                                531         [ +  + ]:        2282180 :                 else if (!inposting)
                                532                 :                :                 {
                                533                 :                :                     /* ... htid is first TID in new posting list */
                                534                 :          23568 :                     inposting = true;
                                535                 :          23568 :                     prevalldead = true;
                                536                 :          23568 :                     curposti = 0;
                                537                 :          23568 :                     htid = *BTreeTupleGetPostingN(curitup, 0);
                                538                 :                :                 }
                                539                 :                :                 else
                                540                 :                :                 {
                                541                 :                :                     /* ... htid is second or subsequent TID in posting list */
                                542         [ -  + ]:        2258612 :                     Assert(curposti > 0);
                                543                 :        2258612 :                     htid = *BTreeTupleGetPostingN(curitup, curposti);
                                544                 :                :                 }
                                545                 :                : 
                                546                 :                :                 /*
                                547                 :                :                  * If we are doing a recheck, we expect to find the tuple we
                                548                 :                :                  * are rechecking.  It's not a duplicate, but we have to keep
                                549                 :                :                  * scanning.
                                550                 :                :                  */
 6073 tgl@sss.pgh.pa.us         551   [ +  +  +  + ]:        5733353 :                 if (checkUnique == UNIQUE_CHECK_EXISTING &&
                                552                 :            118 :                     ItemPointerCompare(&htid, &itup->t_tid) == 0)
                                553                 :                :                 {
                                554                 :             27 :                     found = true;
                                555                 :                :                 }
                                556                 :                : 
                                557                 :                :                 /*
                                558                 :                :                  * Check if there's any table tuples for this index entry
                                559                 :                :                  * satisfying SnapshotDirty. This is necessary because for AMs
                                560                 :                :                  * with optimizations like heap's HOT, we have just a single
                                561                 :                :                  * index entry for the entire chain.
                                562                 :                :                  */
 2547 andres@anarazel.de        563         [ +  + ]:        5733208 :                 else if (table_index_fetch_tuple_check(heapRel, &htid,
                                564                 :                :                                                        &SnapshotDirty,
                                565                 :                :                                                        &all_dead))
                                566                 :                :                 {
                                567                 :                :                     TransactionId xwait;
                                568                 :                : 
                                569                 :                :                     /*
                                570                 :                :                      * It is a duplicate. If we are only doing a partial
                                571                 :                :                      * check, then don't bother checking if the tuple is being
                                572                 :                :                      * updated in another transaction. Just return the fact
                                573                 :                :                      * that it is a potential conflict and leave the full
                                574                 :                :                      * check till later. Don't invalidate binary search
                                575                 :                :                      * bounds.
                                576                 :                :                      */
 6073 tgl@sss.pgh.pa.us         577         [ +  + ]:            404 :                     if (checkUnique == UNIQUE_CHECK_PARTIAL)
                                578                 :                :                     {
                                579         [ -  + ]:            128 :                         if (nbuf != InvalidBuffer)
 6073 tgl@sss.pgh.pa.us         580                 :UBC           0 :                             _bt_relbuf(rel, nbuf);
 6073 tgl@sss.pgh.pa.us         581                 :CBC         128 :                         *is_unique = false;
                                582                 :            140 :                         return InvalidTransactionId;
                                583                 :                :                     }
                                584                 :                : 
                                585                 :                :                     /*
                                586                 :                :                      * If this tuple is being updated by other transaction
                                587                 :                :                      * then we have to wait for its commit/abort.
                                588                 :                :                      */
                                589                 :            552 :                     xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
                                590         [ +  + ]:            276 :                         SnapshotDirty.xmin : SnapshotDirty.xmax;
                                591                 :                : 
 8696                           592         [ +  + ]:            276 :                     if (TransactionIdIsValid(xwait))
                                593                 :                :                     {
                                594         [ -  + ]:             12 :                         if (nbuf != InvalidBuffer)
 8696 tgl@sss.pgh.pa.us         595                 :UBC           0 :                             _bt_relbuf(rel, nbuf);
                                596                 :                :                         /* Tell _bt_doinsert to wait... */
 3964 andres@anarazel.de        597                 :CBC          12 :                         *speculativeToken = SnapshotDirty.speculativeToken;
                                598                 :                :                         /* Caller releases lock on buf immediately */
 2537 pg@bowt.ie                599                 :             12 :                         insertstate->bounds_valid = false;
 8696 tgl@sss.pgh.pa.us         600                 :             12 :                         return xwait;
                                601                 :                :                     }
                                602                 :                : 
                                603                 :                :                     /*
                                604                 :                :                      * Otherwise we have a definite conflict.  But before
                                605                 :                :                      * complaining, look to see if the tuple we want to insert
                                606                 :                :                      * is itself now committed dead --- if so, don't complain.
                                607                 :                :                      * This is a waste of time in normal scenarios but we must
                                608                 :                :                      * do it to support CREATE INDEX CONCURRENTLY.
                                609                 :                :                      *
                                610                 :                :                      * We must follow HOT-chains here because during
                                611                 :                :                      * concurrent index build, we insert the root TID though
                                612                 :                :                      * the actual tuple may be somewhere in the HOT-chain.
                                613                 :                :                      * While following the chain we might not stop at the
                                614                 :                :                      * exact tuple which triggered the insert, but that's OK
                                615                 :                :                      * because if we find a live tuple anywhere in this chain,
                                616                 :                :                      * we have a unique key conflict.  The other live tuple is
                                617                 :                :                      * not part of this chain because it had a different index
                                618                 :                :                      * entry.
                                619                 :                :                      */
 2089 pg@bowt.ie                620                 :            264 :                     htid = itup->t_tid;
                                621         [ -  + ]:            264 :                     if (table_index_fetch_tuple_check(heapRel, &htid,
                                622                 :                :                                                       SnapshotSelf, NULL))
                                623                 :                :                     {
                                624                 :                :                         /* Normal case --- it's still live */
                                625                 :                :                     }
                                626                 :                :                     else
                                627                 :                :                     {
                                628                 :                :                         /*
                                629                 :                :                          * It's been deleted, so no error, and no need to
                                630                 :                :                          * continue searching
                                631                 :                :                          */
 7142 tgl@sss.pgh.pa.us         632                 :UBC           0 :                         break;
                                633                 :                :                     }
                                634                 :                : 
                                635                 :                :                     /*
                                636                 :                :                      * Check for a conflict-in as we would if we were going to
                                637                 :                :                      * write to this page.  We aren't actually going to write,
                                638                 :                :                      * but we want a chance to report SSI conflicts that would
                                639                 :                :                      * otherwise be masked by this unique constraint
                                640                 :                :                      * violation.
                                641                 :                :                      */
 2238 tmunro@postgresql.or      642                 :CBC         264 :                     CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
                                643                 :                : 
                                644                 :                :                     /*
                                645                 :                :                      * This is a definite conflict.  Break the tuple down into
                                646                 :                :                      * datums and report the error.  But first, make sure we
                                647                 :                :                      * release the buffer locks we're holding ---
                                648                 :                :                      * BuildIndexValueDescription could make catalog accesses,
                                649                 :                :                      * which in the worst case might touch this same index and
                                650                 :                :                      * cause deadlocks.
                                651                 :                :                      */
 6070 tgl@sss.pgh.pa.us         652         [ -  + ]:            260 :                     if (nbuf != InvalidBuffer)
 6070 tgl@sss.pgh.pa.us         653                 :UBC           0 :                         _bt_relbuf(rel, nbuf);
 2552 pg@bowt.ie                654                 :CBC         260 :                     _bt_relbuf(rel, insertstate->buf);
                                655                 :            260 :                     insertstate->buf = InvalidBuffer;
 2537                           656                 :            260 :                     insertstate->bounds_valid = false;
                                657                 :                : 
                                658                 :                :                     {
                                659                 :                :                         Datum       values[INDEX_MAX_KEYS];
                                660                 :                :                         bool        isnull[INDEX_MAX_KEYS];
                                661                 :                :                         char       *key_desc;
                                662                 :                : 
 6070 tgl@sss.pgh.pa.us         663                 :            260 :                         index_deform_tuple(itup, RelationGetDescr(rel),
                                664                 :                :                                            values, isnull);
                                665                 :                : 
 4080 sfrost@snowman.net        666                 :            260 :                         key_desc = BuildIndexValueDescription(rel, values,
                                667                 :                :                                                               isnull);
                                668                 :                : 
 6070 tgl@sss.pgh.pa.us         669   [ +  -  +  + ]:            260 :                         ereport(ERROR,
                                670                 :                :                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
                                671                 :                :                                  errmsg("duplicate key value violates unique constraint \"%s\"",
                                672                 :                :                                         RelationGetRelationName(rel)),
                                673                 :                :                                  key_desc ? errdetail("Key %s already exists.",
                                674                 :                :                                                       key_desc) : 0,
                                675                 :                :                                  errtableconstraint(heapRel,
                                676                 :                :                                                     RelationGetRelationName(rel))));
                                677                 :                :                     }
                                678                 :                :                 }
 2209 pg@bowt.ie                679   [ +  +  +  +  :        5732804 :                 else if (all_dead && (!inposting ||
                                              +  + ]
                                680                 :          18785 :                                       (prevalldead &&
                                681         [ +  + ]:          18785 :                                        curposti == BTreeTupleGetNPosting(curitup) - 1)))
                                682                 :                :                 {
                                683                 :                :                     /*
                                684                 :                :                      * The conflicting tuple (or all HOT chains pointed to by
                                685                 :                :                      * all posting list TIDs) is dead to everyone, so try to
                                686                 :                :                      * mark the index entry killed. It's ok if we're not
                                687                 :                :                      * allowed to, this isn't required for correctness.
                                688                 :                :                      */
                                689                 :                :                     Buffer      buf;
                                690                 :                : 
                                691                 :                :                     /* Be sure to operate on the proper buffer */
 6751 tgl@sss.pgh.pa.us         692         [ +  + ]:          53350 :                     if (nbuf != InvalidBuffer)
    5 andres@anarazel.de        693                 :GNC           3 :                         buf = nbuf;
                                694                 :                :                     else
                                695                 :          53347 :                         buf = insertstate->buf;
                                696                 :                : 
                                697                 :                :                     /*
                                698                 :                :                      * Use the hint bit infrastructure to check if we can
                                699                 :                :                      * update the page while just holding a share lock.
                                700                 :                :                      *
                                701                 :                :                      * Can't use BufferSetHintBits16() here as we update two
                                702                 :                :                      * different locations.
                                703                 :                :                      */
                                704         [ +  - ]:          53350 :                     if (BufferBeginSetHintBits(buf))
                                705                 :                :                     {
                                706                 :          53350 :                         ItemIdMarkDead(curitemid);
                                707                 :          53350 :                         opaque->btpo_flags |= BTP_HAS_GARBAGE;
                                708                 :          53350 :                         BufferFinishSetHintBits(buf, true, true);
                                709                 :                :                     }
                                710                 :                :                 }
                                711                 :                : 
                                712                 :                :                 /*
                                713                 :                :                  * Remember if posting list tuple has even a single HOT chain
                                714                 :                :                  * whose members are not all dead
                                715                 :                :                  */
 2209 pg@bowt.ie                716   [ +  +  +  + ]:CBC     5732831 :                 if (!all_dead && inposting)
                                717                 :        2263370 :                     prevalldead = false;
                                718                 :                :             }
                                719                 :                :         }
                                720                 :                : 
                                721   [ +  +  +  + ]:        7856070 :         if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
                                722                 :                :         {
                                723                 :                :             /* Advance to next TID in same posting list */
                                724                 :        2258612 :             curposti++;
                                725                 :        2258612 :             continue;
                                726                 :                :         }
                                727         [ +  + ]:        5597458 :         else if (offset < maxoff)
                                728                 :                :         {
                                729                 :                :             /* Advance to next tuple */
                                730                 :        3753708 :             curposti = 0;
                                731                 :        3753708 :             inposting = false;
 9368 tgl@sss.pgh.pa.us         732                 :        3753708 :             offset = OffsetNumberNext(offset);
                                733                 :                :         }
                                734                 :                :         else
                                735                 :                :         {
                                736                 :                :             int         highkeycmp;
                                737                 :                : 
                                738                 :                :             /* If scankey == hikey we gotta check the next page too */
                                739         [ +  + ]:        1843750 :             if (P_RIGHTMOST(opaque))
                                740                 :        1764896 :                 break;
 2552 pg@bowt.ie                741                 :          78854 :             highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
                                742         [ -  + ]:          78854 :             Assert(highkeycmp <= 0);
                                743         [ +  + ]:          78854 :             if (highkeycmp != 0)
 9368 tgl@sss.pgh.pa.us         744                 :          73832 :                 break;
                                745                 :                :             /* Advance to next non-dead page --- there must be one */
                                746                 :                :             for (;;)
 8422 tgl@sss.pgh.pa.us         747                 :UBC           0 :             {
 2209 pg@bowt.ie                748                 :CBC        5022 :                 BlockNumber nblkno = opaque->btpo_next;
                                749                 :                : 
 7998 tgl@sss.pgh.pa.us         750                 :           5022 :                 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
 3616 kgrittn@postgresql.o      751                 :           5022 :                 page = BufferGetPage(nbuf);
 1444 michael@paquier.xyz       752                 :           5022 :                 opaque = BTPageGetOpaque(page);
 8422 tgl@sss.pgh.pa.us         753         [ +  - ]:           5022 :                 if (!P_IGNORE(opaque))
                                754                 :           5022 :                     break;
 8422 tgl@sss.pgh.pa.us         755         [ #  # ]:UBC           0 :                 if (P_RIGHTMOST(opaque))
 6649                           756         [ #  # ]:              0 :                     elog(ERROR, "fell off the end of index \"%s\"",
                                757                 :                :                          RelationGetRelationName(rel));
                                758                 :                :             }
                                759                 :                :             /* Will also advance to next tuple */
 2209 pg@bowt.ie                760                 :CBC        5022 :             curposti = 0;
                                761                 :           5022 :             inposting = false;
 9368 tgl@sss.pgh.pa.us         762                 :           5022 :             maxoff = PageGetMaxOffsetNumber(page);
                                763         [ +  + ]:           5022 :             offset = P_FIRSTDATAKEY(opaque);
                                764                 :                :             /* Don't invalidate binary search bounds */
                                765                 :                :         }
                                766                 :                :     }
                                767                 :                : 
                                768                 :                :     /*
                                769                 :                :      * If we are doing a recheck then we should have found the tuple we are
                                770                 :                :      * checking.  Otherwise there's something very wrong --- probably, the
                                771                 :                :      * index is on a non-immutable expression.
                                772                 :                :      */
 6073                           773   [ +  +  -  + ]:        3045735 :     if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
 6073 tgl@sss.pgh.pa.us         774         [ #  # ]:UBC           0 :         ereport(ERROR,
                                775                 :                :                 (errcode(ERRCODE_INTERNAL_ERROR),
                                776                 :                :                  errmsg("failed to re-find tuple within index \"%s\"",
                                777                 :                :                         RelationGetRelationName(rel)),
                                778                 :                :                  errhint("This may be because of a non-immutable index expression."),
                                779                 :                :                  errtableconstraint(heapRel,
                                780                 :                :                                     RelationGetRelationName(rel))));
                                781                 :                : 
 9368 tgl@sss.pgh.pa.us         782         [ +  + ]:CBC     3045735 :     if (nbuf != InvalidBuffer)
 9009                           783                 :           2880 :         _bt_relbuf(rel, nbuf);
                                784                 :                : 
 8970                           785                 :        3045735 :     return InvalidTransactionId;
                                786                 :                : }
                                787                 :                : 
                                788                 :                : 
                                789                 :                : /*
                                790                 :                :  *  _bt_findinsertloc() -- Finds an insert location for a tuple
                                791                 :                :  *
                                792                 :                :  *      On entry, insertstate buffer contains the page the new tuple belongs
                                793                 :                :  *      on.  It is exclusive-locked and pinned by the caller.
                                794                 :                :  *
                                795                 :                :  *      If 'checkingunique' is true, the buffer on entry is the first page
                                796                 :                :  *      that contains duplicates of the new key.  If there are duplicates on
                                797                 :                :  *      multiple pages, the correct insertion position might be some page to
                                798                 :                :  *      the right, rather than the first page.  In that case, this function
                                799                 :                :  *      moves right to the correct target page.
                                800                 :                :  *
                                801                 :                :  *      (In a !heapkeyspace index, there can be multiple pages with the same
                                802                 :                :  *      high key, where the new tuple could legitimately be placed on.  In
                                803                 :                :  *      that case, the caller passes the first page containing duplicates,
                                804                 :                :  *      just like when checkingunique=true.  If that page doesn't have enough
                                805                 :                :  *      room for the new tuple, this function moves right, trying to find a
                                806                 :                :  *      legal page that does.)
                                807                 :                :  *
                                808                 :                :  *      If 'indexUnchanged' is true, this is for an UPDATE that didn't
                                809                 :                :  *      logically change the indexed value, but must nevertheless have a new
                                810                 :                :  *      entry to point to a successor version.  This hint from the executor
                                811                 :                :  *      will influence our behavior when the page might have to be split and
                                812                 :                :  *      we must consider our options.  Bottom-up index deletion can avoid
                                813                 :                :  *      pathological version-driven page splits, but we only want to go to the
                                814                 :                :  *      trouble of trying it when we already have moderate confidence that
                                815                 :                :  *      it's appropriate.  The hint should not significantly affect our
                                816                 :                :  *      behavior over time unless practically all inserts on to the leaf page
                                817                 :                :  *      get the hint.
                                818                 :                :  *
                                819                 :                :  *      On exit, insertstate buffer contains the chosen insertion page, and
                                820                 :                :  *      the offset within that page is returned.  If _bt_findinsertloc needed
                                821                 :                :  *      to move right, the lock and pin on the original page are released, and
                                822                 :                :  *      the new buffer is exclusively locked and pinned instead.
                                823                 :                :  *
                                824                 :                :  *      If insertstate contains cached binary search bounds, we will take
                                825                 :                :  *      advantage of them.  This avoids repeating comparisons that we made in
                                826                 :                :  *      _bt_check_unique() already.
                                827                 :                :  */
                                828                 :                : static OffsetNumber
 6952 bruce@momjian.us          829                 :        4273682 : _bt_findinsertloc(Relation rel,
                                830                 :                :                   BTInsertState insertstate,
                                831                 :                :                   bool checkingunique,
                                832                 :                :                   bool indexUnchanged,
                                833                 :                :                   BTStack stack,
                                834                 :                :                   Relation heapRel)
                                835                 :                : {
 2552 pg@bowt.ie                836                 :        4273682 :     BTScanInsert itup_key = insertstate->itup_key;
                                837                 :        4273682 :     Page        page = BufferGetPage(insertstate->buf);
                                838                 :                :     BTPageOpaque opaque;
                                839                 :                :     OffsetNumber newitemoff;
                                840                 :                : 
 1444 michael@paquier.xyz       841                 :        4273682 :     opaque = BTPageGetOpaque(page);
                                842                 :                : 
                                843                 :                :     /* Check 1/3 of a page restriction */
  369 pg@bowt.ie                844         [ -  + ]:        4273682 :     if (unlikely(insertstate->itemsz > BTMaxItemSize))
 2552 pg@bowt.ie                845                 :UBC           0 :         _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
                                846                 :                :                              insertstate->itup);
                                847                 :                : 
 1944 pg@bowt.ie                848   [ +  -  -  + ]:CBC     4273682 :     Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
 2552                           849   [ +  +  -  + ]:        4273682 :     Assert(!insertstate->bounds_valid || checkingunique);
                                850   [ +  -  -  + ]:        4273682 :     Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
                                851   [ -  +  -  - ]:        4273682 :     Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
 2209                           852   [ +  +  -  + ]:        4273682 :     Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
                                853                 :                : 
 2552                           854         [ +  - ]:        4273682 :     if (itup_key->heapkeyspace)
                                855                 :                :     {
                                856                 :                :         /* Keep track of whether checkingunique duplicate seen */
 1887                           857                 :        4273682 :         bool        uniquedup = indexUnchanged;
                                858                 :                : 
                                859                 :                :         /*
                                860                 :                :          * If we're inserting into a unique index, we may have to walk right
                                861                 :                :          * through leaf pages to find the one leaf page that we must insert on
                                862                 :                :          * to.
                                863                 :                :          *
                                864                 :                :          * This is needed for checkingunique callers because a scantid was not
                                865                 :                :          * used when we called _bt_search().  scantid can only be set after
                                866                 :                :          * _bt_check_unique() has checked for duplicates.  The buffer
                                867                 :                :          * initially stored in insertstate->buf has the page where the first
                                868                 :                :          * duplicate key might be found, which isn't always the page that new
                                869                 :                :          * tuple belongs on.  The heap TID attribute for new tuple (scantid)
                                870                 :                :          * could force us to insert on a sibling page, though that should be
                                871                 :                :          * very rare in practice.
                                872                 :                :          */
 2552                           873         [ +  + ]:        4273682 :         if (checkingunique)
                                874                 :                :         {
 2209                           875         [ +  + ]:        3045833 :             if (insertstate->low < insertstate->stricthigh)
                                876                 :                :             {
                                877                 :                :                 /* Encountered a duplicate in _bt_check_unique() */
                                878         [ -  + ]:         225450 :                 Assert(insertstate->bounds_valid);
                                879                 :         225450 :                 uniquedup = true;
                                880                 :                :             }
                                881                 :                : 
                                882                 :                :             for (;;)
                                883                 :                :             {
                                884                 :                :                 /*
                                885                 :                :                  * Does the new tuple belong on this page?
                                886                 :                :                  *
                                887                 :                :                  * The earlier _bt_check_unique() call may well have
                                888                 :                :                  * established a strict upper bound on the offset for the new
                                889                 :                :                  * item.  If it's not the last item of the page (i.e. if there
                                890                 :                :                  * is at least one tuple on the page that goes after the tuple
                                891                 :                :                  * we're inserting) then we know that the tuple belongs on
                                892                 :                :                  * this page.  We can skip the high key check.
                                893                 :                :                  */
 2552                           894         [ +  + ]:        3050855 :                 if (insertstate->bounds_valid &&
                                895   [ +  -  +  + ]:        6080736 :                     insertstate->low <= insertstate->stricthigh &&
                                896                 :        3040368 :                     insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
                                897                 :        1193245 :                     break;
                                898                 :                : 
                                899                 :                :                 /* Test '<=', not '!=', since scantid is set now */
 1944                           900   [ +  +  +  + ]:        1944307 :                 if (P_RIGHTMOST(opaque) ||
 2552                           901                 :          86697 :                     _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
                                902                 :                :                     break;
                                903                 :                : 
 1079 andres@anarazel.de        904                 :           5022 :                 _bt_stepright(rel, heapRel, insertstate, stack);
                                905                 :                :                 /* Update local state after stepping right */
 2552 pg@bowt.ie                906                 :           5022 :                 page = BufferGetPage(insertstate->buf);
 1444 michael@paquier.xyz       907                 :           5022 :                 opaque = BTPageGetOpaque(page);
                                908                 :                :                 /* Assume duplicates (if checkingunique) */
 2209 pg@bowt.ie                909                 :           5022 :                 uniquedup = true;
                                910                 :                :             }
                                911                 :                :         }
                                912                 :                : 
                                913                 :                :         /*
                                914                 :                :          * If the target page cannot fit newitem, try to avoid splitting the
                                915                 :                :          * page on insert by performing deletion or deduplication now
                                916                 :                :          */
                                917         [ +  + ]:        4273682 :         if (PageGetFreeSpace(page) < insertstate->itemsz)
 1944                           918                 :          28227 :             _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
                                919                 :                :                                          checkingunique, uniquedup,
                                920                 :                :                                          indexUnchanged);
                                921                 :                :     }
                                922                 :                :     else
                                923                 :                :     {
                                924                 :                :         /*----------
                                925                 :                :          * This is a !heapkeyspace (version 2 or 3) index.  The current page
                                926                 :                :          * is the first page that we could insert the new tuple to, but there
                                927                 :                :          * may be other pages to the right that we could opt to use instead.
                                928                 :                :          *
                                929                 :                :          * If the new key is equal to one or more existing keys, we can
                                930                 :                :          * legitimately place it anywhere in the series of equal keys.  In
                                931                 :                :          * fact, if the new key is equal to the page's "high key" we can place
                                932                 :                :          * it on the next page.  If it is equal to the high key, and there's
                                933                 :                :          * not room to insert the new tuple on the current page without
                                934                 :                :          * splitting, then we move right hoping to find more free space and
                                935                 :                :          * avoid a split.
                                936                 :                :          *
                                937                 :                :          * Keep scanning right until we
                                938                 :                :          *      (a) find a page with enough free space,
                                939                 :                :          *      (b) reach the last page where the tuple can legally go, or
                                940                 :                :          *      (c) get tired of searching.
                                941                 :                :          * (c) is not flippant; it is important because if there are many
                                942                 :                :          * pages' worth of equal keys, it's better to split one of the early
                                943                 :                :          * pages than to scan all the way to the end of the run of equal keys
                                944                 :                :          * on every insert.  We implement "get tired" as a random choice,
                                945                 :                :          * since stopping after scanning a fixed number of pages wouldn't work
                                946                 :                :          * well (we'd never reach the right-hand side of previously split
                                947                 :                :          * pages).  The probability of moving right is set at 0.99, which may
                                948                 :                :          * seem too high to change the behavior much, but it does an excellent
                                949                 :                :          * job of preventing O(N^2) behavior with many equal keys.
                                950                 :                :          *----------
                                951                 :                :          */
 2552 pg@bowt.ie                952         [ #  # ]:UBC           0 :         while (PageGetFreeSpace(page) < insertstate->itemsz)
                                953                 :                :         {
                                954                 :                :             /*
                                955                 :                :              * Before considering moving right, see if we can obtain enough
                                956                 :                :              * space by erasing LP_DEAD items
                                957                 :                :              */
 1944                           958         [ #  # ]:              0 :             if (P_HAS_GARBAGE(opaque))
                                959                 :                :             {
                                960                 :                :                 /* Perform simple deletion */
                                961                 :              0 :                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
                                962                 :                :                                              false, false, false);
                                963                 :                : 
 2552                           964         [ #  # ]:              0 :                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
                                965                 :              0 :                     break;      /* OK, now we have enough space */
                                966                 :                :             }
                                967                 :                : 
                                968                 :                :             /*
                                969                 :                :              * Nope, so check conditions (b) and (c) enumerated above
                                970                 :                :              *
                                971                 :                :              * The earlier _bt_check_unique() call may well have established a
                                972                 :                :              * strict upper bound on the offset for the new item.  If it's not
                                973                 :                :              * the last item of the page (i.e. if there is at least one tuple
                                974                 :                :              * on the page that's greater than the tuple we're inserting to)
                                975                 :                :              * then we know that the tuple belongs on this page.  We can skip
                                976                 :                :              * the high key check.
                                977                 :                :              */
                                978         [ #  # ]:              0 :             if (insertstate->bounds_valid &&
                                979   [ #  #  #  # ]:              0 :                 insertstate->low <= insertstate->stricthigh &&
                                980                 :              0 :                 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
                                981                 :              0 :                 break;
                                982                 :                : 
 1944                           983   [ #  #  #  # ]:              0 :             if (P_RIGHTMOST(opaque) ||
 2552                           984         [ #  # ]:              0 :                 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
 1568 tgl@sss.pgh.pa.us         985                 :              0 :                 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
                                986                 :                :                 break;
                                987                 :                : 
 1079 andres@anarazel.de        988                 :              0 :             _bt_stepright(rel, heapRel, insertstate, stack);
                                989                 :                :             /* Update local state after stepping right */
 2552 pg@bowt.ie                990                 :              0 :             page = BufferGetPage(insertstate->buf);
 1444 michael@paquier.xyz       991                 :              0 :             opaque = BTPageGetOpaque(page);
                                992                 :                :         }
                                993                 :                :     }
                                994                 :                : 
                                995                 :                :     /*
                                996                 :                :      * We should now be on the correct page.  Find the offset within the page
                                997                 :                :      * for the new tuple. (Possibly reusing earlier search bounds.)
                                998                 :                :      */
 1944 pg@bowt.ie                999   [ +  +  -  + ]:CBC     4273682 :     Assert(P_RIGHTMOST(opaque) ||
                               1000                 :                :            _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
                               1001                 :                : 
 2209                          1002                 :        4273682 :     newitemoff = _bt_binsrch_insert(rel, insertstate);
                               1003                 :                : 
                               1004         [ +  + ]:        4273682 :     if (insertstate->postingoff == -1)
                               1005                 :                :     {
                               1006                 :                :         /*
                               1007                 :                :          * There is an overlapping posting list tuple with its LP_DEAD bit
                               1008                 :                :          * set.  We don't want to unnecessarily unset its LP_DEAD bit while
                               1009                 :                :          * performing a posting list split, so perform simple index tuple
                               1010                 :                :          * deletion early.
                               1011                 :                :          */
 1944 pg@bowt.ie               1012                 :GBC           5 :         _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
                               1013                 :                :                                      false, false, false);
                               1014                 :                : 
                               1015                 :                :         /*
                               1016                 :                :          * Do new binary search.  New insert location cannot overlap with any
                               1017                 :                :          * posting list now.
                               1018                 :                :          */
                               1019         [ -  + ]:              5 :         Assert(!insertstate->bounds_valid);
 2209                          1020                 :              5 :         insertstate->postingoff = 0;
                               1021                 :              5 :         newitemoff = _bt_binsrch_insert(rel, insertstate);
                               1022         [ -  + ]:              5 :         Assert(insertstate->postingoff == 0);
                               1023                 :                :     }
                               1024                 :                : 
 2209 pg@bowt.ie               1025                 :CBC     4273682 :     return newitemoff;
                               1026                 :                : }
                               1027                 :                : 
                               1028                 :                : /*
                               1029                 :                :  * Step right to next non-dead page, during insertion.
                               1030                 :                :  *
                               1031                 :                :  * This is a bit more complicated than moving right in a search.  We must
                               1032                 :                :  * write-lock the target page before releasing write lock on current page;
                               1033                 :                :  * else someone else's _bt_check_unique scan could fail to see our insertion.
                               1034                 :                :  * Write locks on intermediate dead pages won't do because we don't know when
                               1035                 :                :  * they will get de-linked from the tree.
                               1036                 :                :  *
                               1037                 :                :  * This is more aggressive than it needs to be for non-unique !heapkeyspace
                               1038                 :                :  * indexes.
                               1039                 :                :  */
                               1040                 :                : static void
 1009                          1041                 :           5022 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
                               1042                 :                :               BTStack stack)
                               1043                 :                : {
                               1044                 :                :     Page        page;
                               1045                 :                :     BTPageOpaque opaque;
                               1046                 :                :     Buffer      rbuf;
                               1047                 :                :     BlockNumber rblkno;
                               1048                 :                : 
                               1049         [ -  + ]:           5022 :     Assert(heaprel != NULL);
 2552                          1050                 :           5022 :     page = BufferGetPage(insertstate->buf);
 1444 michael@paquier.xyz      1051                 :           5022 :     opaque = BTPageGetOpaque(page);
                               1052                 :                : 
 2552 pg@bowt.ie               1053                 :           5022 :     rbuf = InvalidBuffer;
 1944                          1054                 :           5022 :     rblkno = opaque->btpo_next;
                               1055                 :                :     for (;;)
                               1056                 :                :     {
 2552                          1057                 :           5022 :         rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
                               1058                 :           5022 :         page = BufferGetPage(rbuf);
 1444 michael@paquier.xyz      1059                 :           5022 :         opaque = BTPageGetOpaque(page);
                               1060                 :                : 
                               1061                 :                :         /*
                               1062                 :                :          * If this page was incompletely split, finish the split now.  We do
                               1063                 :                :          * this while holding a lock on the left sibling, which is not good
                               1064                 :                :          * because finishing the split could be a fairly lengthy operation.
                               1065                 :                :          * But this should happen very seldom.
                               1066                 :                :          */
 1944 pg@bowt.ie               1067         [ -  + ]:           5022 :         if (P_INCOMPLETE_SPLIT(opaque))
                               1068                 :                :         {
 1079 andres@anarazel.de       1069                 :UBC           0 :             _bt_finish_split(rel, heaprel, rbuf, stack);
 2552 pg@bowt.ie               1070                 :              0 :             rbuf = InvalidBuffer;
                               1071                 :              0 :             continue;
                               1072                 :                :         }
                               1073                 :                : 
 1944 pg@bowt.ie               1074         [ +  - ]:CBC        5022 :         if (!P_IGNORE(opaque))
 2552                          1075                 :           5022 :             break;
 1944 pg@bowt.ie               1076         [ #  # ]:UBC           0 :         if (P_RIGHTMOST(opaque))
 2552                          1077         [ #  # ]:              0 :             elog(ERROR, "fell off the end of index \"%s\"",
                               1078                 :                :                  RelationGetRelationName(rel));
                               1079                 :                : 
 1944                          1080                 :              0 :         rblkno = opaque->btpo_next;
                               1081                 :                :     }
                               1082                 :                :     /* rbuf locked; unlock buf, update state for caller */
 2552 pg@bowt.ie               1083                 :CBC        5022 :     _bt_relbuf(rel, insertstate->buf);
                               1084                 :           5022 :     insertstate->buf = rbuf;
                               1085                 :           5022 :     insertstate->bounds_valid = false;
 6952 bruce@momjian.us         1086                 :           5022 : }
                               1087                 :                : 
                               1088                 :                : /*----------
                               1089                 :                :  *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
                               1090                 :                :  *
                               1091                 :                :  *      This recursive procedure does the following things:
                               1092                 :                :  *
                               1093                 :                :  *          +  if postingoff != 0, splits existing posting list tuple
                               1094                 :                :  *             (since it overlaps with new 'itup' tuple).
                               1095                 :                :  *          +  if necessary, splits the target page, using 'itup_key' for
                               1096                 :                :  *             suffix truncation on leaf pages (caller passes NULL for
                               1097                 :                :  *             non-leaf pages).
                               1098                 :                :  *          +  inserts the new tuple (might be split from posting list).
                               1099                 :                :  *          +  if the page was split, pops the parent stack, and finds the
                               1100                 :                :  *             right place to insert the new child pointer (by walking
                               1101                 :                :  *             right using information stored in the parent stack).
                               1102                 :                :  *          +  invokes itself with the appropriate tuple for the right
                               1103                 :                :  *             child page on the parent.
                               1104                 :                :  *          +  updates the metapage if a true root or fast root is split.
                               1105                 :                :  *
                               1106                 :                :  *      On entry, we must have the correct buffer in which to do the
                               1107                 :                :  *      insertion, and the buffer must be pinned and write-locked.  On return,
                               1108                 :                :  *      we will have dropped both the pin and the lock on the buffer.
                               1109                 :                :  *
                               1110                 :                :  *      This routine only performs retail tuple insertions.  'itup' should
                               1111                 :                :  *      always be either a non-highkey leaf item, or a downlink (new high
                               1112                 :                :  *      key items are created indirectly, when a page is split).  When
                               1113                 :                :  *      inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
                               1114                 :                :  *      we're inserting the downlink for.  This function will clear the
                               1115                 :                :  *      INCOMPLETE_SPLIT flag on it, and release the buffer.
                               1116                 :                :  *----------
                               1117                 :                :  */
                               1118                 :                : static void
                               1119                 :        4286212 : _bt_insertonpg(Relation rel,
                               1120                 :                :                Relation heaprel,
                               1121                 :                :                BTScanInsert itup_key,
                               1122                 :                :                Buffer buf,
                               1123                 :                :                Buffer cbuf,
                               1124                 :                :                BTStack stack,
                               1125                 :                :                IndexTuple itup,
                               1126                 :                :                Size itemsz,
                               1127                 :                :                OffsetNumber newitemoff,
                               1128                 :                :                int postingoff,
                               1129                 :                :                bool split_only_page)
                               1130                 :                : {
                               1131                 :                :     Page        page;
                               1132                 :                :     BTPageOpaque opaque;
                               1133                 :                :     bool        isleaf,
                               1134                 :                :                 isroot,
                               1135                 :                :                 isrightmost,
                               1136                 :                :                 isonly;
 2209 pg@bowt.ie               1137                 :        4286212 :     IndexTuple  oposting = NULL;
                               1138                 :        4286212 :     IndexTuple  origitup = NULL;
                               1139                 :        4286212 :     IndexTuple  nposting = NULL;
                               1140                 :                :     XLogRecPtr  recptr;
                               1141                 :                : 
 3616 kgrittn@postgresql.o     1142                 :        4286212 :     page = BufferGetPage(buf);
 1444 michael@paquier.xyz      1143                 :        4286212 :     opaque = BTPageGetOpaque(page);
 1944 pg@bowt.ie               1144                 :        4286212 :     isleaf = P_ISLEAF(opaque);
                               1145                 :        4286212 :     isroot = P_ISROOT(opaque);
                               1146                 :        4286212 :     isrightmost = P_RIGHTMOST(opaque);
                               1147   [ +  +  +  + ]:        4286212 :     isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
                               1148                 :                : 
                               1149                 :                :     /* child buffer must be given iff inserting on an internal page */
                               1150         [ -  + ]:        4286212 :     Assert(isleaf == !BufferIsValid(cbuf));
                               1151                 :                :     /* tuple must have appropriate number of attributes */
                               1152   [ +  +  -  +  :        4286212 :     Assert(!isleaf ||
                                              -  - ]
                               1153                 :                :            BTreeTupleGetNAtts(itup, rel) ==
                               1154                 :                :            IndexRelationGetNumberOfAttributes(rel));
                               1155   [ +  +  +  -  :        4286212 :     Assert(isleaf ||
                                              -  + ]
                               1156                 :                :            BTreeTupleGetNAtts(itup, rel) <=
                               1157                 :                :            IndexRelationGetNumberOfKeyAttributes(rel));
 2209                          1158         [ -  + ]:        4286212 :     Assert(!BTreeTupleIsPosting(itup));
 2190                          1159         [ -  + ]:        4286212 :     Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
                               1160                 :                :     /* Caller must always finish incomplete split for us */
 1944                          1161         [ -  + ]:        4286212 :     Assert(!P_INCOMPLETE_SPLIT(opaque));
                               1162                 :                : 
                               1163                 :                :     /*
                               1164                 :                :      * Every internal page should have exactly one negative infinity item at
                               1165                 :                :      * all times.  Only _bt_split() and _bt_newlevel() should add items that
                               1166                 :                :      * become negative infinity items through truncation, since they're the
                               1167                 :                :      * only routines that allocate new internal pages.
                               1168                 :                :      */
                               1169   [ +  +  +  +  :        4286212 :     Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
                                              -  + ]
                               1170                 :                : 
                               1171                 :                :     /*
                               1172                 :                :      * Do we need to split an existing posting list item?
                               1173                 :                :      */
 2209                          1174         [ +  + ]:        4286212 :     if (postingoff != 0)
                               1175                 :                :     {
                               1176                 :          16406 :         ItemId      itemid = PageGetItemId(page, newitemoff);
                               1177                 :                : 
                               1178                 :                :         /*
                               1179                 :                :          * The new tuple is a duplicate with a heap TID that falls inside the
                               1180                 :                :          * range of an existing posting list tuple on a leaf page.  Prepare to
                               1181                 :                :          * split an existing posting list.  Overwriting the posting list with
                               1182                 :                :          * its post-split version is treated as an extra step in either the
                               1183                 :                :          * insert or page split critical section.
                               1184                 :                :          */
 1600                          1185   [ +  -  +  -  :          16406 :         Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
                                              -  + ]
 2209                          1186                 :          16406 :         oposting = (IndexTuple) PageGetItem(page, itemid);
                               1187                 :                : 
                               1188                 :                :         /*
                               1189                 :                :          * postingoff value comes from earlier call to _bt_binsrch_posting().
                               1190                 :                :          * Its binary search might think that a plain tuple must be a posting
                               1191                 :                :          * list tuple that needs to be split.  This can happen with corruption
                               1192                 :                :          * involving an existing plain tuple that is a duplicate of the new
                               1193                 :                :          * item, up to and including its table TID.  Check for that here in
                               1194                 :                :          * passing.
                               1195                 :                :          *
                               1196                 :                :          * Also verify that our caller has made sure that the existing posting
                               1197                 :                :          * list tuple does not have its LP_DEAD bit set.
                               1198                 :                :          */
 1600                          1199   [ +  -  -  + ]:          16406 :         if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
 1600 pg@bowt.ie               1200         [ #  # ]:UBC           0 :             ereport(ERROR,
                               1201                 :                :                     (errcode(ERRCODE_INDEX_CORRUPTED),
                               1202                 :                :                      errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
                               1203                 :                :                                      ItemPointerGetBlockNumber(&itup->t_tid),
                               1204                 :                :                                      ItemPointerGetOffsetNumber(&itup->t_tid),
                               1205                 :                :                                      newitemoff, BufferGetBlockNumber(buf),
                               1206                 :                :                                      RelationGetRelationName(rel))));
                               1207                 :                : 
                               1208                 :                :         /* use a mutable copy of itup as our itup from here on */
 2209 pg@bowt.ie               1209                 :CBC       16406 :         origitup = itup;
                               1210                 :          16406 :         itup = CopyIndexTuple(origitup);
                               1211                 :          16406 :         nposting = _bt_swap_posting(itup, oposting, postingoff);
                               1212                 :                :         /* itup now contains rightmost/max TID from oposting */
                               1213                 :                : 
                               1214                 :                :         /* Alter offset so that newitem goes after posting list */
                               1215                 :          16406 :         newitemoff = OffsetNumberNext(newitemoff);
                               1216                 :                :     }
                               1217                 :                : 
                               1218                 :                :     /*
                               1219                 :                :      * Do we need to split the page to fit the item on it?
                               1220                 :                :      *
                               1221                 :                :      * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
                               1222                 :                :      * so this comparison is correct even though we appear to be accounting
                               1223                 :                :      * only for the item and not for its line pointer.
                               1224                 :                :      */
 9368 tgl@sss.pgh.pa.us        1225         [ +  + ]:        4286212 :     if (PageGetFreeSpace(page) < itemsz)
                               1226                 :                :     {
                               1227                 :                :         Buffer      rbuf;
                               1228                 :                : 
 2162 pg@bowt.ie               1229         [ -  + ]:          13241 :         Assert(!split_only_page);
                               1230                 :                : 
                               1231                 :                :         /* split the buffer into left and right halves */
 1079 andres@anarazel.de       1232                 :          13241 :         rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
                               1233                 :                :                          itup, origitup, nposting, postingoff);
 5515 heikki.linnakangas@i     1234                 :          13241 :         PredicateLockPageSplit(rel,
                               1235                 :                :                                BufferGetBlockNumber(buf),
                               1236                 :                :                                BufferGetBlockNumber(rbuf));
                               1237                 :                : 
                               1238                 :                :         /*----------
                               1239                 :                :          * By here,
                               1240                 :                :          *
                               1241                 :                :          *      +  our target page has been split;
                               1242                 :                :          *      +  the original tuple has been inserted;
                               1243                 :                :          *      +  we have write locks on both the old (left half)
                               1244                 :                :          *         and new (right half) buffers, after the split; and
                               1245                 :                :          *      +  we know the key we want to insert into the parent
                               1246                 :                :          *         (it's the "high key" on the left child page).
                               1247                 :                :          *
                               1248                 :                :          * We're ready to do the parent insertion.  We need to hold onto the
                               1249                 :                :          * locks for the child pages until we locate the parent, but we can
                               1250                 :                :          * at least release the lock on the right child before doing the
                               1251                 :                :          * actual insertion.  The lock on the left child will be released
                               1252                 :                :          * last of all by parent insertion, where it is the 'cbuf' of parent
                               1253                 :                :          * page.
                               1254                 :                :          *----------
                               1255                 :                :          */
                               1256                 :                : #ifdef USE_INJECTION_POINTS
  103 heikki.linnakangas@i     1257         [ +  + ]:GNC       13241 :         if (P_ISLEAF(opaque))
                               1258                 :          13083 :             INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
                               1259                 :                :         else
                               1260                 :            158 :             INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
                               1261                 :                : #endif
                               1262                 :                : 
 1079 andres@anarazel.de       1263                 :CBC       13239 :         _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
                               1264                 :                :     }
                               1265                 :                :     else
                               1266                 :                :     {
 8423 tgl@sss.pgh.pa.us        1267                 :        4272971 :         Buffer      metabuf = InvalidBuffer;
                               1268                 :        4272971 :         Page        metapg = NULL;
                               1269                 :        4272971 :         BTMetaPageData *metad = NULL;
                               1270                 :                :         BlockNumber blockcache;
                               1271                 :                : 
                               1272                 :                :         /*
                               1273                 :                :          * If we are doing this insert because we split a page that was the
                               1274                 :                :          * only one on its tree level, but was not the root, it may have been
                               1275                 :                :          * the "fast root".  We need to ensure that the fast root link points
                               1276                 :                :          * at or above the current page.  We can safely acquire a lock on the
                               1277                 :                :          * metapage here --- see comments for _bt_newlevel().
                               1278                 :                :          */
 1944 pg@bowt.ie               1279         [ +  + ]:        4272971 :         if (unlikely(split_only_page))
                               1280                 :                :         {
 2188                          1281         [ -  + ]:              9 :             Assert(!isleaf);
                               1282         [ -  + ]:              9 :             Assert(BufferIsValid(cbuf));
                               1283                 :                : 
 1009                          1284                 :              9 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
 3616 kgrittn@postgresql.o     1285                 :              9 :             metapg = BufferGetPage(metabuf);
 8423 tgl@sss.pgh.pa.us        1286                 :              9 :             metad = BTPageGetMeta(metapg);
                               1287                 :                : 
 1845 pg@bowt.ie               1288         [ -  + ]:              9 :             if (metad->btm_fastlevel >= opaque->btpo_level)
                               1289                 :                :             {
                               1290                 :                :                 /* no update wanted */
 8423 tgl@sss.pgh.pa.us        1291                 :UBC           0 :                 _bt_relbuf(rel, metabuf);
                               1292                 :              0 :                 metabuf = InvalidBuffer;
                               1293                 :                :             }
                               1294                 :                :         }
                               1295                 :                : 
                               1296                 :                :         /* Do the update.  No ereport(ERROR) until changes are logged */
 8423 tgl@sss.pgh.pa.us        1297                 :CBC     4272971 :         START_CRIT_SECTION();
                               1298                 :                : 
 2209 pg@bowt.ie               1299         [ +  + ]:        4272971 :         if (postingoff != 0)
                               1300                 :          16377 :             memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
                               1301                 :                : 
  139 peter@eisentraut.org     1302         [ -  + ]:GNC     4272971 :         if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
 5677 tgl@sss.pgh.pa.us        1303         [ #  # ]:UBC           0 :             elog(PANIC, "failed to add new item to block %u in index \"%s\"",
                               1304                 :                :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
                               1305                 :                : 
 7289 tgl@sss.pgh.pa.us        1306                 :CBC     4272971 :         MarkBufferDirty(buf);
                               1307                 :                : 
 8423                          1308         [ +  + ]:        4272971 :         if (BufferIsValid(metabuf))
                               1309                 :                :         {
                               1310                 :                :             /* upgrade meta-page if needed */
 2552 pg@bowt.ie               1311         [ -  + ]:              9 :             if (metad->btm_version < BTREE_NOVAC_VERSION)
 2902 teodor@sigaev.ru         1312                 :UBC           0 :                 _bt_upgrademetapage(metapg);
 2189 pg@bowt.ie               1313                 :CBC           9 :             metad->btm_fastroot = BufferGetBlockNumber(buf);
 1845                          1314                 :              9 :             metad->btm_fastlevel = opaque->btpo_level;
 7289 tgl@sss.pgh.pa.us        1315                 :              9 :             MarkBufferDirty(metabuf);
                               1316                 :                :         }
                               1317                 :                : 
                               1318                 :                :         /*
                               1319                 :                :          * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
                               1320                 :                :          * finishes a split
                               1321                 :                :          */
 2162 pg@bowt.ie               1322         [ +  + ]:        4272971 :         if (!isleaf)
                               1323                 :                :         {
 3616 kgrittn@postgresql.o     1324                 :          12372 :             Page        cpage = BufferGetPage(cbuf);
 1444 michael@paquier.xyz      1325                 :          12372 :             BTPageOpaque cpageop = BTPageGetOpaque(cpage);
                               1326                 :                : 
 4380 heikki.linnakangas@i     1327         [ -  + ]:          12372 :             Assert(P_INCOMPLETE_SPLIT(cpageop));
                               1328                 :          12372 :             cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
                               1329                 :          12372 :             MarkBufferDirty(cbuf);
                               1330                 :                :         }
                               1331                 :                : 
                               1332                 :                :         /* XLOG stuff */
 5571 rhaas@postgresql.org     1333   [ +  +  +  +  :        4272971 :         if (RelationNeedsWAL(rel))
                                        +  +  +  + ]
 8423 tgl@sss.pgh.pa.us        1334                 :GIC     3932973 :         {
                               1335                 :                :             xl_btree_insert xlrec;
                               1336                 :                :             xl_btree_metadata xlmeta;
                               1337                 :                :             uint8       xlinfo;
                               1338                 :                :             uint16      upostingoff;
                               1339                 :                : 
 2189 pg@bowt.ie               1340                 :CBC     3932973 :             xlrec.offnum = newitemoff;
                               1341                 :                : 
 4133 heikki.linnakangas@i     1342                 :        3932973 :             XLogBeginInsert();
  397 peter@eisentraut.org     1343                 :        3932973 :             XLogRegisterData(&xlrec, SizeOfBtreeInsert);
                               1344                 :                : 
 2188 pg@bowt.ie               1345   [ +  +  +  + ]:        3932973 :             if (isleaf && postingoff == 0)
                               1346                 :                :             {
                               1347                 :                :                 /* Simple leaf insert */
 7276 tgl@sss.pgh.pa.us        1348                 :        3905120 :                 xlinfo = XLOG_BTREE_INSERT_LEAF;
                               1349                 :                :             }
 2209 pg@bowt.ie               1350         [ +  + ]:          27853 :             else if (postingoff != 0)
                               1351                 :                :             {
                               1352                 :                :                 /*
                               1353                 :                :                  * Leaf insert with posting list split.  Must include
                               1354                 :                :                  * postingoff field before newitem/orignewitem.
                               1355                 :                :                  */
 2161                          1356         [ -  + ]:          16377 :                 Assert(isleaf);
 2209                          1357                 :          16377 :                 xlinfo = XLOG_BTREE_INSERT_POST;
                               1358                 :                :             }
                               1359                 :                :             else
                               1360                 :                :             {
                               1361                 :                :                 /* Internal page insert, which finishes a split on cbuf */
 7276 tgl@sss.pgh.pa.us        1362                 :          11476 :                 xlinfo = XLOG_BTREE_INSERT_UPPER;
 2162 pg@bowt.ie               1363                 :          11476 :                 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
                               1364                 :                : 
 2161                          1365         [ +  + ]:          11476 :                 if (BufferIsValid(metabuf))
                               1366                 :                :                 {
                               1367                 :                :                     /* Actually, it's an internal page insert + meta update */
                               1368                 :              9 :                     xlinfo = XLOG_BTREE_INSERT_META;
                               1369                 :                : 
                               1370         [ -  + ]:              9 :                     Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
                               1371                 :              9 :                     xlmeta.version = metad->btm_version;
                               1372                 :              9 :                     xlmeta.root = metad->btm_root;
                               1373                 :              9 :                     xlmeta.level = metad->btm_level;
                               1374                 :              9 :                     xlmeta.fastroot = metad->btm_fastroot;
                               1375                 :              9 :                     xlmeta.fastlevel = metad->btm_fastlevel;
 1845                          1376                 :              9 :                     xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
 2161                          1377                 :              9 :                     xlmeta.allequalimage = metad->btm_allequalimage;
                               1378                 :                : 
                               1379                 :              9 :                     XLogRegisterBuffer(2, metabuf,
                               1380                 :                :                                        REGBUF_WILL_INIT | REGBUF_STANDARD);
  397 peter@eisentraut.org     1381                 :              9 :                     XLogRegisterBufData(2, &xlmeta,
                               1382                 :                :                                         sizeof(xl_btree_metadata));
                               1383                 :                :                 }
                               1384                 :                :             }
                               1385                 :                : 
 4133 heikki.linnakangas@i     1386                 :        3932973 :             XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 2209 pg@bowt.ie               1387         [ +  + ]:        3932973 :             if (postingoff == 0)
                               1388                 :                :             {
                               1389                 :                :                 /* Just log itup from caller */
  397 peter@eisentraut.org     1390                 :        3916596 :                 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
                               1391                 :                :             }
                               1392                 :                :             else
                               1393                 :                :             {
                               1394                 :                :                 /*
                               1395                 :                :                  * Insert with posting list split (XLOG_BTREE_INSERT_POST
                               1396                 :                :                  * record) case.
                               1397                 :                :                  *
                               1398                 :                :                  * Log postingoff.  Also log origitup, not itup.  REDO routine
                               1399                 :                :                  * must reconstruct final itup (as well as nposting) using
                               1400                 :                :                  * _bt_swap_posting().
                               1401                 :                :                  */
 2145 pg@bowt.ie               1402                 :          16377 :                 upostingoff = postingoff;
                               1403                 :                : 
  397 peter@eisentraut.org     1404                 :          16377 :                 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
                               1405                 :          16377 :                 XLogRegisterBufData(0, origitup,
 2209 pg@bowt.ie               1406                 :          16377 :                                     IndexTupleSize(origitup));
                               1407                 :                :             }
                               1408                 :                : 
 4133 heikki.linnakangas@i     1409                 :        3932973 :             recptr = XLogInsert(RM_BTREE_ID, xlinfo);
                               1410                 :                :         }
                               1411                 :                :         else
    2 pg@bowt.ie               1412                 :GNC      339998 :             recptr = XLogGetFakeLSN(rel);
                               1413                 :                : 
                               1414         [ +  + ]:        4272971 :         if (BufferIsValid(metabuf))
                               1415                 :              9 :             PageSetLSN(metapg, recptr);
                               1416         [ +  + ]:        4272971 :         if (!isleaf)
                               1417                 :          12372 :             PageSetLSN(BufferGetPage(cbuf), recptr);
                               1418                 :                : 
                               1419                 :        4272971 :         PageSetLSN(page, recptr);
                               1420                 :                : 
 8423 tgl@sss.pgh.pa.us        1421         [ -  + ]:CBC     4272971 :         END_CRIT_SECTION();
                               1422                 :                : 
                               1423                 :                :         /* Release subsidiary buffers */
                               1424         [ +  + ]:        4272971 :         if (BufferIsValid(metabuf))
 7289                          1425                 :              9 :             _bt_relbuf(rel, metabuf);
 2162 pg@bowt.ie               1426         [ +  + ]:        4272971 :         if (!isleaf)
 4380 heikki.linnakangas@i     1427                 :          12372 :             _bt_relbuf(rel, cbuf);
                               1428                 :                : 
                               1429                 :                :         /*
                               1430                 :                :          * Cache the block number if this is the rightmost leaf page.  Cache
                               1431                 :                :          * may be used by a future inserter within _bt_search_insert().
                               1432                 :                :          */
 2188 pg@bowt.ie               1433                 :        4272971 :         blockcache = InvalidBlockNumber;
 1944                          1434   [ +  +  +  +  :        4272971 :         if (isrightmost && isleaf && !isroot)
                                              +  + ]
 2188                          1435                 :        2532669 :             blockcache = BufferGetBlockNumber(buf);
                               1436                 :                : 
                               1437                 :                :         /* Release buffer for insertion target block */
 7289 tgl@sss.pgh.pa.us        1438                 :        4272971 :         _bt_relbuf(rel, buf);
                               1439                 :                : 
                               1440                 :                :         /*
                               1441                 :                :          * If we decided to cache the insertion target block before releasing
                               1442                 :                :          * its buffer lock, then cache it now.  Check the height of the tree
                               1443                 :                :          * first, though.  We don't go for the optimization with small
                               1444                 :                :          * indexes.  Defer final check to this point to ensure that we don't
                               1445                 :                :          * call _bt_getrootheight while holding a buffer lock.
                               1446                 :                :          */
 2188 pg@bowt.ie               1447   [ +  +  +  + ]:        6805640 :         if (BlockNumberIsValid(blockcache) &&
 1009                          1448                 :        2532669 :             _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
 2188                          1449                 :          34672 :             RelationSetTargetBlock(rel, blockcache);
                               1450                 :                :     }
                               1451                 :                : 
                               1452                 :                :     /* be tidy */
 2209                          1453         [ +  + ]:        4286209 :     if (postingoff != 0)
                               1454                 :                :     {
                               1455                 :                :         /* itup is actually a modified copy of caller's original */
                               1456                 :          16406 :         pfree(nposting);
                               1457                 :          16406 :         pfree(itup);
                               1458                 :                :     }
10841 scrappy@hub.org          1459                 :        4286209 : }
                               1460                 :                : 
                               1461                 :                : /*
                               1462                 :                :  *  _bt_split() -- split a page in the btree.
                               1463                 :                :  *
                               1464                 :                :  *      On entry, buf is the page to split, and is pinned and write-locked.
                               1465                 :                :  *      newitemoff etc. tell us about the new item that must be inserted
                               1466                 :                :  *      along with the data from the original page.
                               1467                 :                :  *
                               1468                 :                :  *      itup_key is used for suffix truncation on leaf pages (internal
                               1469                 :                :  *      page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
                               1470                 :                :  *      is the left-sibling of the page we're inserting the downlink for.
                               1471                 :                :  *      This function will clear the INCOMPLETE_SPLIT flag on it, and
                               1472                 :                :  *      release the buffer.
                               1473                 :                :  *
                               1474                 :                :  *      orignewitem, nposting, and postingoff are needed when an insert of
                               1475                 :                :  *      orignewitem results in both a posting list split and a page split.
                               1476                 :                :  *      These extra posting list split details are used here in the same
                               1477                 :                :  *      way as they are used in the more common case where a posting list
                               1478                 :                :  *      split does not coincide with a page split.  We need to deal with
                               1479                 :                :  *      posting list splits directly in order to ensure that everything
                               1480                 :                :  *      that follows from the insert of orignewitem is handled as a single
                               1481                 :                :  *      atomic operation (though caller's insert of a new pivot/downlink
                               1482                 :                :  *      into parent page will still be a separate operation).  See
                               1483                 :                :  *      nbtree/README for details on the design of posting list splits.
                               1484                 :                :  *
                               1485                 :                :  *      Returns the new right sibling of buf, pinned and write-locked.
                               1486                 :                :  *      The pin and lock on buf are maintained.
                               1487                 :                :  */
                               1488                 :                : static Buffer
 1079 andres@anarazel.de       1489                 :          13241 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
                               1490                 :                :           Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
                               1491                 :                :           IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
                               1492                 :                : {
                               1493                 :                :     Buffer      rbuf;
                               1494                 :                :     Page        origpage;
                               1495                 :                :     Page        leftpage,
                               1496                 :                :                 rightpage;
                               1497                 :                :     PGAlignedBlock leftpage_buf,
                               1498                 :                :                 rightpage_buf;
                               1499                 :                :     BlockNumber origpagenumber,
                               1500                 :                :                 rightpagenumber;
                               1501                 :                :     BTPageOpaque ropaque,
                               1502                 :                :                 lopaque,
                               1503                 :                :                 oopaque;
 8422 tgl@sss.pgh.pa.us        1504                 :          13241 :     Buffer      sbuf = InvalidBuffer;
                               1505                 :          13241 :     Page        spage = NULL;
                               1506                 :          13241 :     BTPageOpaque sopaque = NULL;
                               1507                 :                :     Size        itemsz;
                               1508                 :                :     ItemId      itemid;
                               1509                 :                :     IndexTuple  firstright,
                               1510                 :                :                 lefthighkey;
                               1511                 :                :     OffsetNumber firstrightoff;
                               1512                 :                :     OffsetNumber afterleftoff,
                               1513                 :                :                 afterrightoff,
                               1514                 :                :                 minusinfoff;
                               1515                 :                :     OffsetNumber origpagepostingoff;
                               1516                 :                :     OffsetNumber maxoff;
                               1517                 :                :     OffsetNumber i;
                               1518                 :                :     bool        newitemonleft,
                               1519                 :                :                 isleaf,
                               1520                 :                :                 isrightmost;
                               1521                 :                :     XLogRecPtr  recptr;
                               1522                 :                : 
                               1523                 :                :     /*
                               1524                 :                :      * origpage is the original page to be split.  leftpage is a temporary
                               1525                 :                :      * buffer that receives the left-sibling data, which will be copied back
                               1526                 :                :      * into origpage on success.  rightpage is the new page that will receive
                               1527                 :                :      * the right-sibling data.
                               1528                 :                :      *
                               1529                 :                :      * leftpage is allocated after choosing a split point.  rightpage's new
                               1530                 :                :      * buffer isn't acquired until after leftpage is initialized and has new
                               1531                 :                :      * high key, the last point where splitting the page may fail (barring
                               1532                 :                :      * corruption).  Failing before acquiring new buffer won't have lasting
                               1533                 :                :      * consequences, since origpage won't have been modified and leftpage is
                               1534                 :                :      * only workspace.
                               1535                 :                :      */
 3616 kgrittn@postgresql.o     1536                 :          13241 :     origpage = BufferGetPage(buf);
 1444 michael@paquier.xyz      1537                 :          13241 :     oopaque = BTPageGetOpaque(origpage);
 2162 pg@bowt.ie               1538                 :          13241 :     isleaf = P_ISLEAF(oopaque);
                               1539                 :          13241 :     isrightmost = P_RIGHTMOST(oopaque);
                               1540                 :          13241 :     maxoff = PageGetMaxOffsetNumber(origpage);
 5677 tgl@sss.pgh.pa.us        1541                 :          13241 :     origpagenumber = BufferGetBlockNumber(buf);
                               1542                 :                : 
                               1543                 :                :     /*
                               1544                 :                :      * Choose a point to split origpage at.
                               1545                 :                :      *
                               1546                 :                :      * A split point can be thought of as a point _between_ two existing data
                               1547                 :                :      * items on origpage (the lastleft and firstright tuples), provided you
                               1548                 :                :      * pretend that the new item that didn't fit is already on origpage.
                               1549                 :                :      *
                               1550                 :                :      * Since origpage does not actually contain newitem, the representation of
                               1551                 :                :      * split points needs to work with two boundary cases: splits where
                               1552                 :                :      * newitem is lastleft, and splits where newitem is firstright.
                               1553                 :                :      * newitemonleft resolves the ambiguity that would otherwise exist when
                               1554                 :                :      * newitemoff == firstrightoff.  In all other cases it's clear which side
                               1555                 :                :      * of the split every tuple goes on from context.  newitemonleft is
                               1556                 :                :      * usually (but not always) redundant information.
                               1557                 :                :      *
                               1558                 :                :      * firstrightoff is supposed to be an origpage offset number, but it's
                               1559                 :                :      * possible that its value will be maxoff+1, which is "past the end" of
                               1560                 :                :      * origpage.  This happens in the rare case where newitem goes after all
                               1561                 :                :      * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
                               1562                 :                :      * origpage at the point that leaves newitem alone on new right page.  Any
                               1563                 :                :      * "!newitemonleft && newitemoff == firstrightoff" split point makes
                               1564                 :                :      * newitem the firstright tuple, though, so this case isn't a special
                               1565                 :                :      * case.
                               1566                 :                :      */
 2162 pg@bowt.ie               1567                 :          13241 :     firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
                               1568                 :                :                                      newitem, &newitemonleft);
                               1569                 :                : 
                               1570                 :                :     /* Use temporary buffer for leftpage */
  170 michael@paquier.xyz      1571                 :GNC       13241 :     leftpage = leftpage_buf.data;
 2498 pg@bowt.ie               1572                 :CBC       13241 :     _bt_pageinit(leftpage, BufferGetPageSize(buf));
 1444 michael@paquier.xyz      1573                 :          13241 :     lopaque = BTPageGetOpaque(leftpage);
                               1574                 :                : 
                               1575                 :                :     /*
                               1576                 :                :      * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
                               1577                 :                :      * and HAS_GARBAGE flags.
                               1578                 :                :      */
 9293 vadim4o@yahoo.com        1579                 :          13241 :     lopaque->btpo_flags = oopaque->btpo_flags;
 7173 tgl@sss.pgh.pa.us        1580                 :          13241 :     lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
                               1581                 :                :     /* set flag in leftpage indicating that rightpage has no downlink yet */
 4380 heikki.linnakangas@i     1582                 :          13241 :     lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
10416 bruce@momjian.us         1583                 :          13241 :     lopaque->btpo_prev = oopaque->btpo_prev;
                               1584                 :                :     /* handle btpo_next after rightpage buffer acquired */
 1845 pg@bowt.ie               1585                 :          13241 :     lopaque->btpo_level = oopaque->btpo_level;
                               1586                 :                :     /* handle btpo_cycleid after rightpage buffer acquired */
                               1587                 :                : 
                               1588                 :                :     /*
                               1589                 :                :      * Copy the original page's LSN into leftpage, which will become the
                               1590                 :                :      * updated version of the page.  We need this because XLogInsert will
                               1591                 :                :      * examine the LSN and possibly dump it in a page image.
                               1592                 :                :      */
 2498                          1593                 :          13241 :     PageSetLSN(leftpage, PageGetLSN(origpage));
                               1594                 :                : 
                               1595                 :                :     /*
                               1596                 :                :      * Determine page offset number of existing overlapped-with-orignewitem
                               1597                 :                :      * posting list when it is necessary to perform a posting list split in
                               1598                 :                :      * passing.  Note that newitem was already changed by caller (newitem no
                               1599                 :                :      * longer has the orignewitem TID).
                               1600                 :                :      *
                               1601                 :                :      * This page offset number (origpagepostingoff) will be used to pretend
                               1602                 :                :      * that the posting split has already taken place, even though the
                               1603                 :                :      * required modifications to origpage won't occur until we reach the
                               1604                 :                :      * critical section.  The lastleft and firstright tuples of our page split
                               1605                 :                :      * point should, in effect, come from an imaginary version of origpage
                               1606                 :                :      * that has the nposting tuple instead of the original posting list tuple.
                               1607                 :                :      *
                               1608                 :                :      * Note: _bt_findsplitloc() should have compensated for coinciding posting
                               1609                 :                :      * list splits in just the same way, at least in theory.  It doesn't
                               1610                 :                :      * bother with that, though.  In practice it won't affect its choice of
                               1611                 :                :      * split point.
                               1612                 :                :      */
 2209                          1613                 :          13241 :     origpagepostingoff = InvalidOffsetNumber;
                               1614         [ +  + ]:          13241 :     if (postingoff != 0)
                               1615                 :                :     {
                               1616         [ -  + ]:             29 :         Assert(isleaf);
                               1617         [ -  + ]:             29 :         Assert(ItemPointerCompare(&orignewitem->t_tid,
                               1618                 :                :                                   &newitem->t_tid) < 0);
                               1619         [ -  + ]:             29 :         Assert(BTreeTupleIsPosting(nposting));
                               1620                 :             29 :         origpagepostingoff = OffsetNumberPrev(newitemoff);
                               1621                 :                :     }
                               1622                 :                : 
                               1623                 :                :     /*
                               1624                 :                :      * The high key for the new left page is a possibly-truncated copy of
                               1625                 :                :      * firstright on the leaf level (it's "firstright itself" on internal
                               1626                 :                :      * pages; see !isleaf comments below).  This may seem to be contrary to
                               1627                 :                :      * Lehman & Yao's approach of using a copy of lastleft as the new high key
                               1628                 :                :      * when splitting on the leaf level.  It isn't, though.
                               1629                 :                :      *
                               1630                 :                :      * Suffix truncation will leave the left page's high key fully equal to
                               1631                 :                :      * lastleft when lastleft and firstright are equal prior to heap TID (that
                               1632                 :                :      * is, the tiebreaker TID value comes from lastleft).  It isn't actually
                               1633                 :                :      * necessary for a new leaf high key to be a copy of lastleft for the L&Y
                               1634                 :                :      * "subtree" invariant to hold.  It's sufficient to make sure that the new
                               1635                 :                :      * leaf high key is strictly less than firstright, and greater than or
                               1636                 :                :      * equal to (not necessarily equal to) lastleft.  In other words, when
                               1637                 :                :      * suffix truncation isn't possible during a leaf page split, we take
                               1638                 :                :      * L&Y's exact approach to generating a new high key for the left page.
                               1639                 :                :      * (Actually, that is slightly inaccurate.  We don't just use a copy of
                               1640                 :                :      * lastleft.  A tuple with all the keys from firstright but the max heap
                               1641                 :                :      * TID from lastleft is used, to avoid introducing a special case.)
                               1642                 :                :      */
 2162                          1643   [ +  +  +  + ]:          13241 :     if (!newitemonleft && newitemoff == firstrightoff)
                               1644                 :                :     {
                               1645                 :                :         /* incoming tuple becomes firstright */
 9368 tgl@sss.pgh.pa.us        1646                 :             29 :         itemsz = newitemsz;
 2162 pg@bowt.ie               1647                 :             29 :         firstright = newitem;
                               1648                 :                :     }
                               1649                 :                :     else
                               1650                 :                :     {
                               1651                 :                :         /* existing item at firstrightoff becomes firstright */
                               1652                 :          13212 :         itemid = PageGetItemId(origpage, firstrightoff);
 9368 tgl@sss.pgh.pa.us        1653                 :          13212 :         itemsz = ItemIdGetLength(itemid);
 2162 pg@bowt.ie               1654                 :          13212 :         firstright = (IndexTuple) PageGetItem(origpage, itemid);
                               1655         [ +  + ]:          13212 :         if (firstrightoff == origpagepostingoff)
 2162 pg@bowt.ie               1656                 :GBC           1 :             firstright = nposting;
                               1657                 :                :     }
                               1658                 :                : 
 2162 pg@bowt.ie               1659         [ +  + ]:CBC       13241 :     if (isleaf)
                               1660                 :                :     {
                               1661                 :                :         IndexTuple  lastleft;
                               1662                 :                : 
                               1663                 :                :         /* Attempt suffix truncation for leaf page splits */
                               1664   [ +  +  +  + ]:          13083 :         if (newitemonleft && newitemoff == firstrightoff)
                               1665                 :                :         {
                               1666                 :                :             /* incoming tuple becomes lastleft */
 2552                          1667                 :            242 :             lastleft = newitem;
                               1668                 :                :         }
                               1669                 :                :         else
                               1670                 :                :         {
                               1671                 :                :             OffsetNumber lastleftoff;
                               1672                 :                : 
                               1673                 :                :             /* existing item before firstrightoff becomes lastleft */
 2162                          1674                 :          12841 :             lastleftoff = OffsetNumberPrev(firstrightoff);
 2552                          1675   [ +  +  -  + ]:          12841 :             Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
                               1676                 :          12841 :             itemid = PageGetItemId(origpage, lastleftoff);
                               1677                 :          12841 :             lastleft = (IndexTuple) PageGetItem(origpage, itemid);
 2209                          1678         [ +  + ]:          12841 :             if (lastleftoff == origpagepostingoff)
                               1679                 :              1 :                 lastleft = nposting;
                               1680                 :                :         }
                               1681                 :                : 
 2162                          1682                 :          13083 :         lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
                               1683                 :          13083 :         itemsz = IndexTupleSize(lefthighkey);
                               1684                 :                :     }
                               1685                 :                :     else
                               1686                 :                :     {
                               1687                 :                :         /*
                               1688                 :                :          * Don't perform suffix truncation on a copy of firstright to make
                               1689                 :                :          * left page high key for internal page splits.  Must use firstright
                               1690                 :                :          * as new high key directly.
                               1691                 :                :          *
                               1692                 :                :          * Each distinct separator key value originates as a leaf level high
                               1693                 :                :          * key; all other separator keys/pivot tuples are copied from one
                               1694                 :                :          * level down.  A separator key in a grandparent page must be
                               1695                 :                :          * identical to high key in rightmost parent page of the subtree to
                               1696                 :                :          * its left, which must itself be identical to high key in rightmost
                               1697                 :                :          * child page of that same subtree (this even applies to separator
                               1698                 :                :          * from grandparent's high key).  There must always be an unbroken
                               1699                 :                :          * "seam" of identical separator keys that guide index scans at every
                               1700                 :                :          * level, starting from the grandparent.  That's why suffix truncation
                               1701                 :                :          * is unsafe here.
                               1702                 :                :          *
                               1703                 :                :          * Internal page splits will truncate firstright into a "negative
                               1704                 :                :          * infinity" data item when it gets inserted on the new right page
                               1705                 :                :          * below, though.  This happens during the call to _bt_pgaddtup() for
                               1706                 :                :          * the new first data item for right page.  Do not confuse this
                               1707                 :                :          * mechanism with suffix truncation.  It is just a convenient way of
                               1708                 :                :          * implementing page splits that split the internal page "inside"
                               1709                 :                :          * firstright.  The lefthighkey separator key cannot appear a second
                               1710                 :                :          * time in the right page (only firstright's downlink goes in right
                               1711                 :                :          * page).
                               1712                 :                :          */
                               1713                 :            158 :         lefthighkey = firstright;
                               1714                 :                :     }
                               1715                 :                : 
                               1716                 :                :     /*
                               1717                 :                :      * Add new high key to leftpage
                               1718                 :                :      */
                               1719                 :          13241 :     afterleftoff = P_HIKEY;
                               1720                 :                : 
                               1721   [ +  -  -  + ]:          13241 :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
                               1722   [ +  -  -  + ]:          13241 :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
                               1723                 :                :            IndexRelationGetNumberOfKeyAttributes(rel));
                               1724         [ -  + ]:          13241 :     Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
  139 peter@eisentraut.org     1725         [ -  + ]:GNC       13241 :     if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
 2162 pg@bowt.ie               1726         [ #  # ]:UBC           0 :         elog(ERROR, "failed to add high key to the left sibling"
                               1727                 :                :              " while splitting block %u of index \"%s\"",
                               1728                 :                :              origpagenumber, RelationGetRelationName(rel));
 2162 pg@bowt.ie               1729                 :CBC       13241 :     afterleftoff = OffsetNumberNext(afterleftoff);
                               1730                 :                : 
                               1731                 :                :     /*
                               1732                 :                :      * Acquire a new right page to split into, now that left page has a new
                               1733                 :                :      * high key.
                               1734                 :                :      *
                               1735                 :                :      * To not confuse future VACUUM operations, we zero the right page and
                               1736                 :                :      * work on an in-memory copy of it before writing WAL, then copy its
                               1737                 :                :      * contents back to the actual page once we start the critical section
                               1738                 :                :      * work.  This simplifies the split work, so as there is no need to zero
                               1739                 :                :      * the right page before throwing an error.
                               1740                 :                :      */
 1009                          1741                 :          13241 :     rbuf = _bt_allocbuf(rel, heaprel);
  170 michael@paquier.xyz      1742                 :GNC       13241 :     rightpage = rightpage_buf.data;
                               1743                 :                : 
                               1744                 :                :     /*
                               1745                 :                :      * Copy the contents of the right page into its temporary location, and
                               1746                 :                :      * zero the original space.
                               1747                 :                :      */
                               1748                 :          13241 :     memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
                               1749                 :          13241 :     memset(BufferGetPage(rbuf), 0, BLCKSZ);
 2498 pg@bowt.ie               1750                 :CBC       13241 :     rightpagenumber = BufferGetBlockNumber(rbuf);
                               1751                 :                :     /* rightpage was initialized by _bt_allocbuf */
 1444 michael@paquier.xyz      1752                 :          13241 :     ropaque = BTPageGetOpaque(rightpage);
                               1753                 :                : 
                               1754                 :                :     /*
                               1755                 :                :      * Finish off remaining leftpage special area fields.  They cannot be set
                               1756                 :                :      * before both origpage (leftpage) and rightpage buffers are acquired and
                               1757                 :                :      * locked.
                               1758                 :                :      *
                               1759                 :                :      * btpo_cycleid is only used with leaf pages, though we set it here in all
                               1760                 :                :      * cases just to be consistent.
                               1761                 :                :      */
 2498 pg@bowt.ie               1762                 :          13241 :     lopaque->btpo_next = rightpagenumber;
                               1763                 :          13241 :     lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
                               1764                 :                : 
                               1765                 :                :     /*
                               1766                 :                :      * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
                               1767                 :                :      * and HAS_GARBAGE flags.
                               1768                 :                :      */
                               1769                 :          13241 :     ropaque->btpo_flags = oopaque->btpo_flags;
                               1770                 :          13241 :     ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
                               1771                 :          13241 :     ropaque->btpo_prev = origpagenumber;
                               1772                 :          13241 :     ropaque->btpo_next = oopaque->btpo_next;
 1845                          1773                 :          13241 :     ropaque->btpo_level = oopaque->btpo_level;
 2498                          1774                 :          13241 :     ropaque->btpo_cycleid = lopaque->btpo_cycleid;
                               1775                 :                : 
                               1776                 :                :     /*
                               1777                 :                :      * Add new high key to rightpage where necessary.
                               1778                 :                :      *
                               1779                 :                :      * If the page we're splitting is not the rightmost page at its level in
                               1780                 :                :      * the tree, then the first entry on the page is the high key from
                               1781                 :                :      * origpage.
                               1782                 :                :      */
 2162                          1783                 :          13241 :     afterrightoff = P_HIKEY;
                               1784                 :                : 
                               1785         [ +  + ]:          13241 :     if (!isrightmost)
                               1786                 :                :     {
                               1787                 :                :         IndexTuple  righthighkey;
                               1788                 :                : 
 2498                          1789                 :           5436 :         itemid = PageGetItemId(origpage, P_HIKEY);
                               1790                 :           5436 :         itemsz = ItemIdGetLength(itemid);
 2162                          1791                 :           5436 :         righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
                               1792   [ +  -  -  + ]:           5436 :         Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
                               1793   [ +  -  -  + ]:           5436 :         Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
                               1794                 :                :                IndexRelationGetNumberOfKeyAttributes(rel));
  139 peter@eisentraut.org     1795         [ -  + ]:GNC        5436 :         if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
                               1796                 :                :         {
 2162 pg@bowt.ie               1797         [ #  # ]:UBC           0 :             elog(ERROR, "failed to add high key to the right sibling"
                               1798                 :                :                  " while splitting block %u of index \"%s\"",
                               1799                 :                :                  origpagenumber, RelationGetRelationName(rel));
                               1800                 :                :         }
 2162 pg@bowt.ie               1801                 :CBC        5436 :         afterrightoff = OffsetNumberNext(afterrightoff);
                               1802                 :                :     }
                               1803                 :                : 
                               1804                 :                :     /*
                               1805                 :                :      * Internal page splits truncate first data item on right page -- it
                               1806                 :                :      * becomes "minus infinity" item for the page.  Set this up here.
                               1807                 :                :      */
                               1808                 :          13241 :     minusinfoff = InvalidOffsetNumber;
                               1809         [ +  + ]:          13241 :     if (!isleaf)
                               1810                 :            158 :         minusinfoff = afterrightoff;
                               1811                 :                : 
                               1812                 :                :     /*
                               1813                 :                :      * Now transfer all the data items (non-pivot tuples in isleaf case, or
                               1814                 :                :      * additional pivot tuples in !isleaf case) to the appropriate page.
                               1815                 :                :      *
                               1816                 :                :      * Note: we *must* insert at least the right page's items in item-number
                               1817                 :                :      * order, for the benefit of _bt_restore_page().
                               1818                 :                :      */
 9368 tgl@sss.pgh.pa.us        1819   [ +  +  +  + ]:        4141784 :     for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
                               1820                 :                :     {
                               1821                 :                :         IndexTuple  dataitem;
                               1822                 :                : 
10416 bruce@momjian.us         1823                 :        4128543 :         itemid = PageGetItemId(origpage, i);
                               1824                 :        4128543 :         itemsz = ItemIdGetLength(itemid);
 2162 pg@bowt.ie               1825                 :        4128543 :         dataitem = (IndexTuple) PageGetItem(origpage, itemid);
                               1826                 :                : 
                               1827                 :                :         /* replace original item with nposting due to posting split? */
 2209                          1828         [ +  + ]:        4128543 :         if (i == origpagepostingoff)
                               1829                 :                :         {
 2162                          1830         [ -  + ]:             29 :             Assert(BTreeTupleIsPosting(dataitem));
 2209                          1831         [ -  + ]:             29 :             Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
 2162                          1832                 :             29 :             dataitem = nposting;
                               1833                 :                :         }
                               1834                 :                : 
                               1835                 :                :         /* does new item belong before this one? */
 2209                          1836         [ +  + ]:        4128514 :         else if (i == newitemoff)
                               1837                 :                :         {
 9368 tgl@sss.pgh.pa.us        1838         [ +  + ]:           7033 :             if (newitemonleft)
                               1839                 :                :             {
 2162 pg@bowt.ie               1840         [ -  + ]:           2059 :                 Assert(newitemoff <= firstrightoff);
                               1841         [ -  + ]:           2059 :                 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
                               1842                 :                :                                   false))
                               1843                 :                :                 {
 5677 tgl@sss.pgh.pa.us        1844         [ #  # ]:UBC           0 :                     elog(ERROR, "failed to add new item to the left sibling"
                               1845                 :                :                          " while splitting block %u of index \"%s\"",
                               1846                 :                :                          origpagenumber, RelationGetRelationName(rel));
                               1847                 :                :                 }
 2162 pg@bowt.ie               1848                 :CBC        2059 :                 afterleftoff = OffsetNumberNext(afterleftoff);
                               1849                 :                :             }
                               1850                 :                :             else
                               1851                 :                :             {
                               1852         [ -  + ]:           4974 :                 Assert(newitemoff >= firstrightoff);
                               1853         [ -  + ]:           4974 :                 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
                               1854                 :                :                                   afterrightoff == minusinfoff))
                               1855                 :                :                 {
 5677 tgl@sss.pgh.pa.us        1856         [ #  # ]:UBC           0 :                     elog(ERROR, "failed to add new item to the right sibling"
                               1857                 :                :                          " while splitting block %u of index \"%s\"",
                               1858                 :                :                          origpagenumber, RelationGetRelationName(rel));
                               1859                 :                :                 }
 2162 pg@bowt.ie               1860                 :CBC        4974 :                 afterrightoff = OffsetNumberNext(afterrightoff);
                               1861                 :                :             }
                               1862                 :                :         }
                               1863                 :                : 
                               1864                 :                :         /* decide which page to put it on */
                               1865         [ +  + ]:        4128543 :         if (i < firstrightoff)
                               1866                 :                :         {
                               1867         [ -  + ]:        3186039 :             if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
                               1868                 :                :             {
 5677 tgl@sss.pgh.pa.us        1869         [ #  # ]:UBC           0 :                 elog(ERROR, "failed to add old item to the left sibling"
                               1870                 :                :                      " while splitting block %u of index \"%s\"",
                               1871                 :                :                      origpagenumber, RelationGetRelationName(rel));
                               1872                 :                :             }
 2162 pg@bowt.ie               1873                 :CBC     3186039 :             afterleftoff = OffsetNumberNext(afterleftoff);
                               1874                 :                :         }
                               1875                 :                :         else
                               1876                 :                :         {
                               1877         [ -  + ]:         942504 :             if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
                               1878                 :                :                               afterrightoff == minusinfoff))
                               1879                 :                :             {
 5677 tgl@sss.pgh.pa.us        1880         [ #  # ]:UBC           0 :                 elog(ERROR, "failed to add old item to the right sibling"
                               1881                 :                :                      " while splitting block %u of index \"%s\"",
                               1882                 :                :                      origpagenumber, RelationGetRelationName(rel));
                               1883                 :                :             }
 2162 pg@bowt.ie               1884                 :CBC      942504 :             afterrightoff = OffsetNumberNext(afterrightoff);
                               1885                 :                :         }
                               1886                 :                :     }
                               1887                 :                : 
                               1888                 :                :     /* Handle case where newitem goes at the end of rightpage */
 9368 tgl@sss.pgh.pa.us        1889         [ +  + ]:          13241 :     if (i <= newitemoff)
                               1890                 :                :     {
                               1891                 :                :         /*
                               1892                 :                :          * Can't have newitemonleft here; that would imply we were told to put
                               1893                 :                :          * *everything* on the left page, which cannot fit (if it could, we'd
                               1894                 :                :          * not be splitting the page).
                               1895                 :                :          */
 2162 pg@bowt.ie               1896   [ +  -  -  + ]:           6208 :         Assert(!newitemonleft && newitemoff == maxoff + 1);
                               1897         [ -  + ]:           6208 :         if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
                               1898                 :                :                           afterrightoff == minusinfoff))
                               1899                 :                :         {
 5677 tgl@sss.pgh.pa.us        1900         [ #  # ]:UBC           0 :             elog(ERROR, "failed to add new item to the right sibling"
                               1901                 :                :                  " while splitting block %u of index \"%s\"",
                               1902                 :                :                  origpagenumber, RelationGetRelationName(rel));
                               1903                 :                :         }
 2162 pg@bowt.ie               1904                 :CBC        6208 :         afterrightoff = OffsetNumberNext(afterrightoff);
                               1905                 :                :     }
                               1906                 :                : 
                               1907                 :                :     /*
                               1908                 :                :      * We have to grab the original right sibling (if any) and update its prev
                               1909                 :                :      * link.  We are guaranteed that this is deadlock-free, since we couple
                               1910                 :                :      * the locks in the standard order: left to right.
                               1911                 :                :      */
                               1912         [ +  + ]:          13241 :     if (!isrightmost)
                               1913                 :                :     {
 1009                          1914                 :           5436 :         sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
 3616 kgrittn@postgresql.o     1915                 :           5436 :         spage = BufferGetPage(sbuf);
 1444 michael@paquier.xyz      1916                 :           5436 :         sopaque = BTPageGetOpaque(spage);
 5677 tgl@sss.pgh.pa.us        1917         [ -  + ]:           5436 :         if (sopaque->btpo_prev != origpagenumber)
                               1918                 :                :         {
 2418 peter@eisentraut.org     1919         [ #  # ]:UBC           0 :             ereport(ERROR,
                               1920                 :                :                     (errcode(ERRCODE_INDEX_CORRUPTED),
                               1921                 :                :                      errmsg_internal("right sibling's left-link doesn't match: "
                               1922                 :                :                                      "block %u links to %u instead of expected %u in index \"%s\"",
                               1923                 :                :                                      oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
                               1924                 :                :                                      RelationGetRelationName(rel))));
                               1925                 :                :         }
                               1926                 :                : 
                               1927                 :                :         /*
                               1928                 :                :          * Check to see if we can set the SPLIT_END flag in the right-hand
                               1929                 :                :          * split page; this can save some I/O for vacuum since it need not
                               1930                 :                :          * proceed to the right sibling.  We can set the flag if the right
                               1931                 :                :          * sibling has a different cycleid: that means it could not be part of
                               1932                 :                :          * a group of pages that were all split off from the same ancestor
                               1933                 :                :          * page.  If you're confused, imagine that page A splits to A B and
                               1934                 :                :          * then again, yielding A C B, while vacuum is in progress.  Tuples
                               1935                 :                :          * originally in A could now be in either B or C, hence vacuum must
                               1936                 :                :          * examine both pages.  But if D, our right sibling, has a different
                               1937                 :                :          * cycleid then it could not contain any tuples that were in A when
                               1938                 :                :          * the vacuum started.
                               1939                 :                :          */
 7251 tgl@sss.pgh.pa.us        1940         [ +  + ]:CBC        5436 :         if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
 7251 tgl@sss.pgh.pa.us        1941                 :GBC           1 :             ropaque->btpo_flags |= BTP_SPLIT_END;
                               1942                 :                :     }
                               1943                 :                : 
                               1944                 :                :     /*
                               1945                 :                :      * Right sibling is locked, new siblings are prepared, but original page
                               1946                 :                :      * is not updated yet.
                               1947                 :                :      *
                               1948                 :                :      * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
                               1949                 :                :      * not starting the critical section till here because we haven't been
                               1950                 :                :      * scribbling on the original page yet; see comments above.
                               1951                 :                :      */
 9193 tgl@sss.pgh.pa.us        1952                 :CBC       13241 :     START_CRIT_SECTION();
                               1953                 :                : 
                               1954                 :                :     /*
                               1955                 :                :      * By here, the original data page has been split into two new halves, and
                               1956                 :                :      * these are correct.  The algorithm requires that the left page never
                               1957                 :                :      * move during a split, so we copy the new left page back on top of the
                               1958                 :                :      * original.  We need to do this before writing the WAL record, so that
                               1959                 :                :      * XLogInsert can WAL log an image of the page if necessary.
                               1960                 :                :      */
  170 michael@paquier.xyz      1961                 :GNC       13241 :     memcpy(origpage, leftpage, BLCKSZ);
                               1962                 :                :     /* leftpage, lopaque must not be used below here */
                               1963                 :                : 
                               1964                 :                :     /*
                               1965                 :                :      * Move the contents of the right page from its temporary location to the
                               1966                 :                :      * destination buffer, before writing the WAL record.  Unlike the left
                               1967                 :                :      * page, the right page and its opaque area are still needed to complete
                               1968                 :                :      * the update of the page, so reinitialize them.
                               1969                 :                :      */
                               1970                 :          13241 :     rightpage = BufferGetPage(rbuf);
                               1971                 :          13241 :     memcpy(rightpage, rightpage_buf.data, BLCKSZ);
                               1972                 :          13241 :     ropaque = BTPageGetOpaque(rightpage);
                               1973                 :                : 
 6913 tgl@sss.pgh.pa.us        1974                 :CBC       13241 :     MarkBufferDirty(buf);
                               1975                 :          13241 :     MarkBufferDirty(rbuf);
                               1976                 :                : 
 2162 pg@bowt.ie               1977         [ +  + ]:          13241 :     if (!isrightmost)
                               1978                 :                :     {
 5677 tgl@sss.pgh.pa.us        1979                 :           5436 :         sopaque->btpo_prev = rightpagenumber;
 6913                          1980                 :           5436 :         MarkBufferDirty(sbuf);
                               1981                 :                :     }
                               1982                 :                : 
                               1983                 :                :     /*
                               1984                 :                :      * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
                               1985                 :                :      * a split
                               1986                 :                :      */
 4380 heikki.linnakangas@i     1987         [ +  + ]:          13241 :     if (!isleaf)
                               1988                 :                :     {
 3616 kgrittn@postgresql.o     1989                 :            158 :         Page        cpage = BufferGetPage(cbuf);
 1444 michael@paquier.xyz      1990                 :            158 :         BTPageOpaque cpageop = BTPageGetOpaque(cpage);
                               1991                 :                : 
 4380 heikki.linnakangas@i     1992                 :            158 :         cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
                               1993                 :            158 :         MarkBufferDirty(cbuf);
                               1994                 :                :     }
                               1995                 :                : 
                               1996                 :                :     /* XLOG stuff */
 5571 rhaas@postgresql.org     1997   [ +  +  +  +  :          13241 :     if (RelationNeedsWAL(rel))
                                        +  -  +  - ]
 9293 vadim4o@yahoo.com        1998                 :GIC       12329 :     {
                               1999                 :                :         xl_btree_split xlrec;
                               2000                 :                :         uint8       xlinfo;
                               2001                 :                : 
 1845 pg@bowt.ie               2002                 :CBC       12329 :         xlrec.level = ropaque->btpo_level;
                               2003                 :                :         /* See comments below on newitem, orignewitem, and posting lists */
 2162                          2004                 :          12329 :         xlrec.firstrightoff = firstrightoff;
 4133 heikki.linnakangas@i     2005                 :          12329 :         xlrec.newitemoff = newitemoff;
 2209 pg@bowt.ie               2006                 :          12329 :         xlrec.postingoff = 0;
 2162                          2007   [ +  +  +  + ]:          12329 :         if (postingoff != 0 && origpagepostingoff < firstrightoff)
 2209                          2008                 :             16 :             xlrec.postingoff = postingoff;
                               2009                 :                : 
 4133 heikki.linnakangas@i     2010                 :          12329 :         XLogBeginInsert();
  397 peter@eisentraut.org     2011                 :          12329 :         XLogRegisterData(&xlrec, SizeOfBtreeSplit);
                               2012                 :                : 
 4133 heikki.linnakangas@i     2013                 :          12329 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
                               2014                 :          12329 :         XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
                               2015                 :                :         /* Log original right sibling, since we've changed its prev-pointer */
 2162 pg@bowt.ie               2016         [ +  + ]:          12329 :         if (!isrightmost)
 4133 heikki.linnakangas@i     2017                 :           5430 :             XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
 2162 pg@bowt.ie               2018         [ +  + ]:          12329 :         if (!isleaf)
 4133 heikki.linnakangas@i     2019                 :            158 :             XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
                               2020                 :                : 
                               2021                 :                :         /*
                               2022                 :                :          * Log the new item, if it was inserted on the left page. (If it was
                               2023                 :                :          * put on the right page, we don't need to explicitly WAL log it
                               2024                 :                :          * because it's included with all the other items on the right page.)
                               2025                 :                :          * Show the new item as belonging to the left page buffer, so that it
                               2026                 :                :          * is not stored if XLogInsert decides it needs a full-page image of
                               2027                 :                :          * the left page.  We always store newitemoff in the record, though.
                               2028                 :                :          *
                               2029                 :                :          * The details are sometimes slightly different for page splits that
                               2030                 :                :          * coincide with a posting list split.  If both the replacement
                               2031                 :                :          * posting list and newitem go on the right page, then we don't need
                               2032                 :                :          * to log anything extra, just like the simple !newitemonleft
                               2033                 :                :          * no-posting-split case (postingoff is set to zero in the WAL record,
                               2034                 :                :          * so recovery doesn't need to process a posting list split at all).
                               2035                 :                :          * Otherwise, we set postingoff and log orignewitem instead of
                               2036                 :                :          * newitem, despite having actually inserted newitem.  REDO routine
                               2037                 :                :          * must reconstruct nposting and newitem using _bt_swap_posting().
                               2038                 :                :          *
                               2039                 :                :          * Note: It's possible that our page split point is the point that
                               2040                 :                :          * makes the posting list lastleft and newitem firstright.  This is
                               2041                 :                :          * the only case where we log orignewitem/newitem despite newitem
                               2042                 :                :          * going on the right page.  If XLogInsert decides that it can omit
                               2043                 :                :          * orignewitem due to logging a full-page image of the left page,
                               2044                 :                :          * everything still works out, since recovery only needs to log
                               2045                 :                :          * orignewitem for items on the left page (just like the regular
                               2046                 :                :          * newitem-logged case).
                               2047                 :                :          */
 2209 pg@bowt.ie               2048   [ +  +  +  + ]:          12329 :         if (newitemonleft && xlrec.postingoff == 0)
  397 peter@eisentraut.org     2049                 :           2041 :             XLogRegisterBufData(0, newitem, newitemsz);
 2209 pg@bowt.ie               2050         [ +  + ]:          10288 :         else if (xlrec.postingoff != 0)
                               2051                 :                :         {
 2162                          2052         [ -  + ]:             16 :             Assert(isleaf);
                               2053   [ +  +  -  + ]:             16 :             Assert(newitemonleft || firstrightoff == newitemoff);
                               2054         [ -  + ]:             16 :             Assert(newitemsz == IndexTupleSize(orignewitem));
  397 peter@eisentraut.org     2055                 :             16 :             XLogRegisterBufData(0, orignewitem, newitemsz);
                               2056                 :                :         }
                               2057                 :                : 
                               2058                 :                :         /* Log the left page's new high key */
 2162 pg@bowt.ie               2059         [ +  + ]:          12329 :         if (!isleaf)
                               2060                 :                :         {
                               2061                 :                :             /* lefthighkey isn't local copy, get current pointer */
                               2062                 :            158 :             itemid = PageGetItemId(origpage, P_HIKEY);
                               2063                 :            158 :             lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
                               2064                 :                :         }
  397 peter@eisentraut.org     2065                 :          12329 :         XLogRegisterBufData(0, lefthighkey,
 2162 pg@bowt.ie               2066                 :          12329 :                             MAXALIGN(IndexTupleSize(lefthighkey)));
                               2067                 :                : 
                               2068                 :                :         /*
                               2069                 :                :          * Log the contents of the right page in the format understood by
                               2070                 :                :          * _bt_restore_page().  The whole right page will be recreated.
                               2071                 :                :          *
                               2072                 :                :          * Direct access to page is not good but faster - we should implement
                               2073                 :                :          * some new func in page API.  Note we only store the tuples
                               2074                 :                :          * themselves, knowing that they were inserted in item-number order
                               2075                 :                :          * and so the line pointers can be reconstructed.  See comments for
                               2076                 :                :          * _bt_restore_page().
                               2077                 :                :          */
 4133 heikki.linnakangas@i     2078                 :          12329 :         XLogRegisterBufData(1,
 3189 tgl@sss.pgh.pa.us        2079                 :          12329 :                             (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
 4133 heikki.linnakangas@i     2080                 :          12329 :                             ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
                               2081                 :                : 
 2552 pg@bowt.ie               2082         [ +  + ]:          12329 :         xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
 4133 heikki.linnakangas@i     2083                 :          12329 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
                               2084                 :                :     }
                               2085                 :                :     else
    2 pg@bowt.ie               2086                 :GNC         912 :         recptr = XLogGetFakeLSN(rel);
                               2087                 :                : 
                               2088                 :          13241 :     PageSetLSN(origpage, recptr);
                               2089                 :          13241 :     PageSetLSN(rightpage, recptr);
                               2090         [ +  + ]:          13241 :     if (!isrightmost)
                               2091                 :           5436 :         PageSetLSN(spage, recptr);
                               2092         [ +  + ]:          13241 :     if (!isleaf)
                               2093                 :            158 :         PageSetLSN(BufferGetPage(cbuf), recptr);
                               2094                 :                : 
 9182 tgl@sss.pgh.pa.us        2095         [ -  + ]:CBC       13241 :     END_CRIT_SECTION();
                               2096                 :                : 
                               2097                 :                :     /* release the old right sibling */
 2162 pg@bowt.ie               2098         [ +  + ]:          13241 :     if (!isrightmost)
 7289 tgl@sss.pgh.pa.us        2099                 :           5436 :         _bt_relbuf(rel, sbuf);
                               2100                 :                : 
                               2101                 :                :     /* release the child */
 4380 heikki.linnakangas@i     2102         [ +  + ]:          13241 :     if (!isleaf)
                               2103                 :            158 :         _bt_relbuf(rel, cbuf);
                               2104                 :                : 
                               2105                 :                :     /* be tidy */
 2162 pg@bowt.ie               2106         [ +  + ]:          13241 :     if (isleaf)
                               2107                 :          13083 :         pfree(lefthighkey);
                               2108                 :                : 
                               2109                 :                :     /* split's done */
10057 bruce@momjian.us         2110                 :          13241 :     return rbuf;
                               2111                 :                : }
                               2112                 :                : 
                               2113                 :                : /*
                               2114                 :                :  * _bt_insert_parent() -- Insert downlink into parent, completing split.
                               2115                 :                :  *
                               2116                 :                :  * On entry, buf and rbuf are the left and right split pages, which we
                               2117                 :                :  * still hold write locks on.  Both locks will be released here.  We
                               2118                 :                :  * release the rbuf lock once we have a write lock on the page that we
                               2119                 :                :  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
                               2120                 :                :  * The lock on buf is released at the same point as the lock on the parent
                               2121                 :                :  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
                               2122                 :                :  * atomic operation that completes the split by inserting a new downlink.
                               2123                 :                :  *
                               2124                 :                :  * stack - stack showing how we got here.  Will be NULL when splitting true
                               2125                 :                :  *          root, or during concurrent root split, where we can be inefficient
                               2126                 :                :  * isroot - we split the true root
                               2127                 :                :  * isonly - we split a page alone on its level (might have been fast root)
                               2128                 :                :  */
                               2129                 :                : static void
 8423 tgl@sss.pgh.pa.us        2130                 :          13241 : _bt_insert_parent(Relation rel,
                               2131                 :                :                   Relation heaprel,
                               2132                 :                :                   Buffer buf,
                               2133                 :                :                   Buffer rbuf,
                               2134                 :                :                   BTStack stack,
                               2135                 :                :                   bool isroot,
                               2136                 :                :                   bool isonly)
                               2137                 :                : {
 1009 pg@bowt.ie               2138         [ -  + ]:          13241 :     Assert(heaprel != NULL);
                               2139                 :                : 
                               2140                 :                :     /*
                               2141                 :                :      * Here we have to do something Lehman and Yao don't talk about: deal with
                               2142                 :                :      * a root split and construction of a new root.  If our stack is empty
                               2143                 :                :      * then we have just split a node on what had been the root level when we
                               2144                 :                :      * descended the tree.  If it was still the root then we perform a
                               2145                 :                :      * new-root construction.  If it *wasn't* the root anymore, search to find
                               2146                 :                :      * the next higher level that someone constructed meanwhile, and find the
                               2147                 :                :      * right place to insert as for the normal case.
                               2148                 :                :      *
                               2149                 :                :      * If we have to search for the parent level, we do so by re-descending
                               2150                 :                :      * from the root.  This is not super-efficient, but it's rare enough not
                               2151                 :                :      * to matter.
                               2152                 :                :      */
 1944                          2153         [ +  + ]:          13241 :     if (isroot)
                               2154                 :                :     {
                               2155                 :                :         Buffer      rootbuf;
                               2156                 :                : 
 8103 neilc@samurai.com        2157         [ -  + ]:            711 :         Assert(stack == NULL);
 1944 pg@bowt.ie               2158         [ -  + ]:            711 :         Assert(isonly);
                               2159                 :                :         /* create a new root node one level up and update the metapage */
 1009                          2160                 :            711 :         rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
                               2161                 :                :         /* release the split buffers */
 7289 tgl@sss.pgh.pa.us        2162                 :            711 :         _bt_relbuf(rel, rootbuf);
                               2163                 :            711 :         _bt_relbuf(rel, rbuf);
                               2164                 :            711 :         _bt_relbuf(rel, buf);
                               2165                 :                :     }
                               2166                 :                :     else
                               2167                 :                :     {
 8423                          2168                 :          12530 :         BlockNumber bknum = BufferGetBlockNumber(buf);
                               2169                 :          12530 :         BlockNumber rbknum = BufferGetBlockNumber(rbuf);
 3616 kgrittn@postgresql.o     2170                 :          12530 :         Page        page = BufferGetPage(buf);
                               2171                 :                :         IndexTuple  new_item;
                               2172                 :                :         BTStackData fakestack;
                               2173                 :                :         IndexTuple  ritem;
                               2174                 :                :         Buffer      pbuf;
                               2175                 :                : 
 8103 neilc@samurai.com        2176         [ +  + ]:          12530 :         if (stack == NULL)
                               2177                 :                :         {
                               2178                 :                :             BTPageOpaque opaque;
                               2179                 :                : 
 4203 heikki.linnakangas@i     2180         [ -  + ]:              9 :             elog(DEBUG2, "concurrent ROOT page split");
 1444 michael@paquier.xyz      2181                 :              9 :             opaque = BTPageGetOpaque(page);
                               2182                 :                : 
                               2183                 :                :             /*
                               2184                 :                :              * We should never reach here when a leaf page split takes place
                               2185                 :                :              * despite the insert of newitem being able to apply the fastpath
                               2186                 :                :              * optimization.  Make sure of that with an assertion.
                               2187                 :                :              *
                               2188                 :                :              * This is more of a performance issue than a correctness issue.
                               2189                 :                :              * The fastpath won't have a descent stack.  Using a phony stack
                               2190                 :                :              * here works, but never rely on that.  The fastpath should be
                               2191                 :                :              * rejected within _bt_search_insert() when the rightmost leaf
                               2192                 :                :              * page will split, since it's faster to go through _bt_search()
                               2193                 :                :              * and get a stack in the usual way.
                               2194                 :                :              */
 1944 pg@bowt.ie               2195   [ +  -  +  -  :              9 :             Assert(!(P_ISLEAF(opaque) &&
                                              -  + ]
                               2196                 :                :                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
                               2197                 :                : 
                               2198                 :                :             /* Find the leftmost page at the next level up */
  919 tmunro@postgresql.or     2199                 :              9 :             pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
                               2200                 :                :             /* Set up a phony stack entry pointing there */
 8423 tgl@sss.pgh.pa.us        2201                 :              9 :             stack = &fakestack;
                               2202                 :              9 :             stack->bts_blkno = BufferGetBlockNumber(pbuf);
                               2203                 :              9 :             stack->bts_offset = InvalidOffsetNumber;
                               2204                 :              9 :             stack->bts_parent = NULL;
                               2205                 :              9 :             _bt_relbuf(rel, pbuf);
                               2206                 :                :         }
                               2207                 :                : 
                               2208                 :                :         /* get high key from left, a strict lower bound for new right page */
 7354                          2209                 :          12530 :         ritem = (IndexTuple) PageGetItem(page,
                               2210                 :          12530 :                                          PageGetItemId(page, P_HIKEY));
                               2211                 :                : 
                               2212                 :                :         /* form an index tuple that points at the new right page */
                               2213                 :          12530 :         new_item = CopyIndexTuple(ritem);
 2281 pg@bowt.ie               2214                 :          12530 :         BTreeTupleSetDownLink(new_item, rbknum);
                               2215                 :                : 
                               2216                 :                :         /*
                               2217                 :                :          * Re-find and write lock the parent of buf.
                               2218                 :                :          *
                               2219                 :                :          * It's possible that the location of buf's downlink has changed since
                               2220                 :                :          * our initial _bt_search() descent.  _bt_getstackbuf() will detect
                               2221                 :                :          * and recover from this, updating the stack, which ensures that the
                               2222                 :                :          * new downlink will be inserted at the correct offset. Even buf's
                               2223                 :                :          * parent may have changed.
                               2224                 :                :          */
 1079 andres@anarazel.de       2225                 :          12530 :         pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
                               2226                 :                : 
                               2227                 :                :         /*
                               2228                 :                :          * Unlock the right child.  The left child will be unlocked in
                               2229                 :                :          * _bt_insertonpg().
                               2230                 :                :          *
                               2231                 :                :          * Unlocking the right child must be delayed until here to ensure that
                               2232                 :                :          * no concurrent VACUUM operation can become confused.  Page deletion
                               2233                 :                :          * cannot be allowed to fail to re-find a downlink for the rbuf page.
                               2234                 :                :          * (Actually, this is just a vestige of how things used to work.  The
                               2235                 :                :          * page deletion code is expected to check for the INCOMPLETE_SPLIT
                               2236                 :                :          * flag on the left child.  It won't attempt deletion of the right
                               2237                 :                :          * child until the split is complete.  Despite all this, we opt to
                               2238                 :                :          * conservatively delay unlocking the right child until here.)
                               2239                 :                :          */
 7289 tgl@sss.pgh.pa.us        2240                 :          12530 :         _bt_relbuf(rel, rbuf);
                               2241                 :                : 
 8423                          2242         [ -  + ]:          12530 :         if (pbuf == InvalidBuffer)
 2418 peter@eisentraut.org     2243         [ #  # ]:UBC           0 :             ereport(ERROR,
                               2244                 :                :                     (errcode(ERRCODE_INDEX_CORRUPTED),
                               2245                 :                :                      errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
                               2246                 :                :                                      RelationGetRelationName(rel), bknum, rbknum)));
                               2247                 :                : 
                               2248                 :                :         /* Recursively insert into the parent */
 1079 andres@anarazel.de       2249                 :CBC       25060 :         _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
 2190 pg@bowt.ie               2250                 :          12530 :                        new_item, MAXALIGN(IndexTupleSize(new_item)),
 1944                          2251                 :          12530 :                        stack->bts_offset + 1, 0, isonly);
                               2252                 :                : 
                               2253                 :                :         /* be tidy */
 8423 tgl@sss.pgh.pa.us        2254                 :          12529 :         pfree(new_item);
                               2255                 :                :     }
                               2256                 :          13240 : }
                               2257                 :                : 
                               2258                 :                : /*
                               2259                 :                :  * _bt_finish_split() -- Finish an incomplete split
                               2260                 :                :  *
                               2261                 :                :  * A crash or other failure can leave a split incomplete.  The insertion
                               2262                 :                :  * routines won't allow to insert on a page that is incompletely split.
                               2263                 :                :  * Before inserting on such a page, call _bt_finish_split().
                               2264                 :                :  *
                               2265                 :                :  * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
                               2266                 :                :  * and unpinned.
                               2267                 :                :  *
                               2268                 :                :  * Caller must provide a valid heaprel, since finishing a page split requires
                               2269                 :                :  * allocating a new page if and when the parent page splits in turn.
                               2270                 :                :  */
                               2271                 :                : void
 1079 andres@anarazel.de       2272                 :GBC           2 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
                               2273                 :                : {
 3616 kgrittn@postgresql.o     2274                 :              2 :     Page        lpage = BufferGetPage(lbuf);
 1444 michael@paquier.xyz      2275                 :              2 :     BTPageOpaque lpageop = BTPageGetOpaque(lpage);
                               2276                 :                :     Buffer      rbuf;
                               2277                 :                :     Page        rpage;
                               2278                 :                :     BTPageOpaque rpageop;
                               2279                 :                :     bool        wasroot;
                               2280                 :                :     bool        wasonly;
                               2281                 :                : 
 4380 heikki.linnakangas@i     2282         [ -  + ]:              2 :     Assert(P_INCOMPLETE_SPLIT(lpageop));
 1009 pg@bowt.ie               2283         [ -  + ]:              2 :     Assert(heaprel != NULL);
                               2284                 :                : 
                               2285                 :                :     /* Lock right sibling, the one missing the downlink */
                               2286                 :              2 :     rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
 3616 kgrittn@postgresql.o     2287                 :              2 :     rpage = BufferGetPage(rbuf);
 1444 michael@paquier.xyz      2288                 :              2 :     rpageop = BTPageGetOpaque(rpage);
                               2289                 :                : 
                               2290                 :                :     /* Could this be a root split? */
 4380 heikki.linnakangas@i     2291         [ +  + ]:              2 :     if (!stack)
                               2292                 :                :     {
                               2293                 :                :         Buffer      metabuf;
                               2294                 :                :         Page        metapg;
                               2295                 :                :         BTMetaPageData *metad;
                               2296                 :                : 
                               2297                 :                :         /* acquire lock on the metapage */
 1009 pg@bowt.ie               2298                 :              1 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
 3616 kgrittn@postgresql.o     2299                 :              1 :         metapg = BufferGetPage(metabuf);
 4380 heikki.linnakangas@i     2300                 :              1 :         metad = BTPageGetMeta(metapg);
                               2301                 :                : 
 1944 pg@bowt.ie               2302                 :              1 :         wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
                               2303                 :                : 
 4380 heikki.linnakangas@i     2304                 :              1 :         _bt_relbuf(rel, metabuf);
                               2305                 :                :     }
                               2306                 :                :     else
 1944 pg@bowt.ie               2307                 :              1 :         wasroot = false;
                               2308                 :                : 
                               2309                 :                :     /* Was this the only page on the level before split? */
                               2310   [ +  +  +  - ]:              2 :     wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
                               2311                 :                : 
  103 heikki.linnakangas@i     2312                 :GNC           2 :     INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
 4380 heikki.linnakangas@i     2313         [ -  + ]:GBC           2 :     elog(DEBUG1, "finishing incomplete split of %u/%u",
                               2314                 :                :          BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
                               2315                 :                : 
 1079 andres@anarazel.de       2316                 :              2 :     _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
 4380 heikki.linnakangas@i     2317                 :              2 : }
                               2318                 :                : 
                               2319                 :                : /*
                               2320                 :                :  *  _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
                               2321                 :                :  *                       tuple whose downlink points to child page.
                               2322                 :                :  *
                               2323                 :                :  *      Caller passes child's block number, which is used to identify
                               2324                 :                :  *      associated pivot tuple in parent page using a linear search that
                               2325                 :                :  *      matches on pivot's downlink/block number.  The expected location of
                               2326                 :                :  *      the pivot tuple is taken from the stack one level above the child
                               2327                 :                :  *      page.  This is used as a starting point.  Insertions into the
                               2328                 :                :  *      parent level could cause the pivot tuple to move right; deletions
                               2329                 :                :  *      could cause it to move left, but not left of the page we previously
                               2330                 :                :  *      found it on.
                               2331                 :                :  *
                               2332                 :                :  *      Caller can use its stack to relocate the pivot tuple/downlink for
                               2333                 :                :  *      any same-level page to the right of the page found by its initial
                               2334                 :                :  *      descent.  This is necessary because of the possibility that caller
                               2335                 :                :  *      moved right to recover from a concurrent page split.  It's also
                               2336                 :                :  *      convenient for certain callers to be able to step right when there
                               2337                 :                :  *      wasn't a concurrent page split, while still using their original
                               2338                 :                :  *      stack.  For example, the checkingunique _bt_doinsert() case may
                               2339                 :                :  *      have to step right when there are many physical duplicates, and its
                               2340                 :                :  *      scantid forces an insertion to the right of the "first page the
                               2341                 :                :  *      value could be on".  (This is also relied on by all of our callers
                               2342                 :                :  *      when dealing with !heapkeyspace indexes.)
                               2343                 :                :  *
                               2344                 :                :  *      Returns write-locked parent page buffer, or InvalidBuffer if pivot
                               2345                 :                :  *      tuple not found (should not happen).  Adjusts bts_blkno &
                               2346                 :                :  *      bts_offset if changed.  Page split caller should insert its new
                               2347                 :                :  *      pivot tuple for its new right sibling page on parent page, at the
                               2348                 :                :  *      offset number bts_offset + 1.
                               2349                 :                :  */
                               2350                 :                : Buffer
 1079 andres@anarazel.de       2351                 :CBC       15759 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
                               2352                 :                : {
                               2353                 :                :     BlockNumber blkno;
                               2354                 :                :     OffsetNumber start;
                               2355                 :                : 
 9368 tgl@sss.pgh.pa.us        2356                 :          15759 :     blkno = stack->bts_blkno;
                               2357                 :          15759 :     start = stack->bts_offset;
                               2358                 :                : 
                               2359                 :                :     for (;;)
                               2360                 :             11 :     {
                               2361                 :                :         Buffer      buf;
                               2362                 :                :         Page        page;
                               2363                 :                :         BTPageOpaque opaque;
                               2364                 :                : 
 1009 pg@bowt.ie               2365                 :          15770 :         buf = _bt_getbuf(rel, blkno, BT_WRITE);
 3616 kgrittn@postgresql.o     2366                 :          15770 :         page = BufferGetPage(buf);
 1444 michael@paquier.xyz      2367                 :          15770 :         opaque = BTPageGetOpaque(page);
                               2368                 :                : 
 1009 pg@bowt.ie               2369         [ -  + ]:          15770 :         Assert(heaprel != NULL);
 2575                          2370         [ -  + ]:          15770 :         if (P_INCOMPLETE_SPLIT(opaque))
                               2371                 :                :         {
 1079 andres@anarazel.de       2372                 :UBC           0 :             _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
 4380 heikki.linnakangas@i     2373                 :              0 :             continue;
                               2374                 :                :         }
                               2375                 :                : 
 8422 tgl@sss.pgh.pa.us        2376         [ +  + ]:CBC       15770 :         if (!P_IGNORE(opaque))
                               2377                 :                :         {
                               2378                 :                :             OffsetNumber offnum,
                               2379                 :                :                         minoff,
                               2380                 :                :                         maxoff;
                               2381                 :                :             ItemId      itemid;
                               2382                 :                :             IndexTuple  item;
                               2383                 :                : 
                               2384         [ +  + ]:          15759 :             minoff = P_FIRSTDATAKEY(opaque);
                               2385                 :          15759 :             maxoff = PageGetMaxOffsetNumber(page);
                               2386                 :                : 
                               2387                 :                :             /*
                               2388                 :                :              * start = InvalidOffsetNumber means "search the whole page". We
                               2389                 :                :              * need this test anyway due to possibility that page has a high
                               2390                 :                :              * key now when it didn't before.
                               2391                 :                :              */
                               2392         [ +  + ]:          15759 :             if (start < minoff)
                               2393                 :             20 :                 start = minoff;
                               2394                 :                : 
                               2395                 :                :             /*
                               2396                 :                :              * Need this check too, to guard against possibility that page
                               2397                 :                :              * split since we visited it originally.
                               2398                 :                :              */
 7880                          2399         [ -  + ]:          15759 :             if (start > maxoff)
 7880 tgl@sss.pgh.pa.us        2400                 :UBC           0 :                 start = OffsetNumberNext(maxoff);
                               2401                 :                : 
                               2402                 :                :             /*
                               2403                 :                :              * These loops will check every item on the page --- but in an
                               2404                 :                :              * order that's attuned to the probability of where it actually
                               2405                 :                :              * is.  Scan to the right first, then to the left.
                               2406                 :                :              */
 8422 tgl@sss.pgh.pa.us        2407                 :CBC       15759 :             for (offnum = start;
                               2408         [ +  - ]:          15802 :                  offnum <= maxoff;
                               2409                 :             43 :                  offnum = OffsetNumberNext(offnum))
                               2410                 :                :             {
                               2411                 :          15802 :                 itemid = PageGetItemId(page, offnum);
 7354                          2412                 :          15802 :                 item = (IndexTuple) PageGetItem(page, itemid);
                               2413                 :                : 
 2281 pg@bowt.ie               2414         [ +  + ]:          15802 :                 if (BTreeTupleGetDownLink(item) == child)
                               2415                 :                :                 {
                               2416                 :                :                     /* Return accurate pointer to where link is now */
 8422 tgl@sss.pgh.pa.us        2417                 :          15759 :                     stack->bts_blkno = blkno;
                               2418                 :          15759 :                     stack->bts_offset = offnum;
                               2419                 :          15759 :                     return buf;
                               2420                 :                :                 }
                               2421                 :                :             }
                               2422                 :                : 
 8422 tgl@sss.pgh.pa.us        2423                 :UBC           0 :             for (offnum = OffsetNumberPrev(start);
                               2424         [ #  # ]:              0 :                  offnum >= minoff;
                               2425                 :              0 :                  offnum = OffsetNumberPrev(offnum))
                               2426                 :                :             {
                               2427                 :              0 :                 itemid = PageGetItemId(page, offnum);
 7354                          2428                 :              0 :                 item = (IndexTuple) PageGetItem(page, itemid);
                               2429                 :                : 
 2281 pg@bowt.ie               2430         [ #  # ]:              0 :                 if (BTreeTupleGetDownLink(item) == child)
                               2431                 :                :                 {
                               2432                 :                :                     /* Return accurate pointer to where link is now */
 8422 tgl@sss.pgh.pa.us        2433                 :              0 :                     stack->bts_blkno = blkno;
                               2434                 :              0 :                     stack->bts_offset = offnum;
                               2435                 :              0 :                     return buf;
                               2436                 :                :                 }
                               2437                 :                :             }
                               2438                 :                :         }
                               2439                 :                : 
                               2440                 :                :         /*
                               2441                 :                :          * The item we're looking for moved right at least one page.
                               2442                 :                :          *
                               2443                 :                :          * Lehman and Yao couple/chain locks when moving right here, which we
                               2444                 :                :          * can avoid.  See nbtree/README.
                               2445                 :                :          */
 9368 tgl@sss.pgh.pa.us        2446         [ -  + ]:CBC          11 :         if (P_RIGHTMOST(opaque))
                               2447                 :                :         {
 9009 tgl@sss.pgh.pa.us        2448                 :UBC           0 :             _bt_relbuf(rel, buf);
 7998                          2449                 :              0 :             return InvalidBuffer;
                               2450                 :                :         }
 9368 tgl@sss.pgh.pa.us        2451                 :CBC          11 :         blkno = opaque->btpo_next;
 8423                          2452                 :             11 :         start = InvalidOffsetNumber;
 9009                          2453                 :             11 :         _bt_relbuf(rel, buf);
                               2454                 :                :     }
                               2455                 :                : }
                               2456                 :                : 
                               2457                 :                : /*
                               2458                 :                :  * _bt_freestack() -- free a retracement stack made by _bt_search_insert.
                               2459                 :                :  */
                               2460                 :                : static void
    3 pg@bowt.ie               2461                 :GNC     3773883 : _bt_freestack(BTStack stack)
                               2462                 :                : {
                               2463                 :                :     BTStack     ostack;
                               2464                 :                : 
                               2465         [ +  + ]:        7605926 :     while (stack != NULL)
                               2466                 :                :     {
                               2467                 :        3832043 :         ostack = stack;
                               2468                 :        3832043 :         stack = stack->bts_parent;
                               2469                 :        3832043 :         pfree(ostack);
                               2470                 :                :     }
                               2471                 :        3773883 : }
                               2472                 :                : 
                               2473                 :                : /*
                               2474                 :                :  *  _bt_newlevel() -- Create a new level above root page.
                               2475                 :                :  *
                               2476                 :                :  *      We've just split the old root page and need to create a new one.
                               2477                 :                :  *      In order to do this, we add a new root page to the file, then lock
                               2478                 :                :  *      the metadata page and update it.  This is guaranteed to be deadlock-
                               2479                 :                :  *      free, because all readers release their locks on the metadata page
                               2480                 :                :  *      before trying to lock the root, and all writers lock the root before
                               2481                 :                :  *      trying to lock the metadata page.  We have a write lock on the old
                               2482                 :                :  *      root page, so we have not introduced any cycles into the waits-for
                               2483                 :                :  *      graph.
                               2484                 :                :  *
                               2485                 :                :  *      On entry, lbuf (the old root) and rbuf (its new peer) are write-
                               2486                 :                :  *      locked. On exit, a new root page exists with entries for the
                               2487                 :                :  *      two new children, metapage is updated and unlocked/unpinned.
                               2488                 :                :  *      The new root buffer is returned to caller which has to unlock/unpin
                               2489                 :                :  *      lbuf, rbuf & rootbuf.
                               2490                 :                :  */
                               2491                 :                : static Buffer
 1009 pg@bowt.ie               2492                 :CBC         711 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
                               2493                 :                : {
                               2494                 :                :     Buffer      rootbuf;
                               2495                 :                :     Page        lpage,
                               2496                 :                :                 rootpage;
                               2497                 :                :     BlockNumber lbkno,
                               2498                 :                :                 rbkno;
                               2499                 :                :     BlockNumber rootblknum;
                               2500                 :                :     BTPageOpaque rootopaque;
                               2501                 :                :     BTPageOpaque lopaque;
                               2502                 :                :     ItemId      itemid;
                               2503                 :                :     IndexTuple  item;
                               2504                 :                :     IndexTuple  left_item;
                               2505                 :                :     Size        left_item_sz;
                               2506                 :                :     IndexTuple  right_item;
                               2507                 :                :     Size        right_item_sz;
                               2508                 :                :     Buffer      metabuf;
                               2509                 :                :     Page        metapg;
                               2510                 :                :     BTMetaPageData *metad;
                               2511                 :                :     XLogRecPtr  recptr;
                               2512                 :                : 
 8423 tgl@sss.pgh.pa.us        2513                 :            711 :     lbkno = BufferGetBlockNumber(lbuf);
                               2514                 :            711 :     rbkno = BufferGetBlockNumber(rbuf);
 3616 kgrittn@postgresql.o     2515                 :            711 :     lpage = BufferGetPage(lbuf);
 1444 michael@paquier.xyz      2516                 :            711 :     lopaque = BTPageGetOpaque(lpage);
                               2517                 :                : 
                               2518                 :                :     /* get a new root page */
 1009 pg@bowt.ie               2519                 :            711 :     rootbuf = _bt_allocbuf(rel, heaprel);
 3616 kgrittn@postgresql.o     2520                 :            711 :     rootpage = BufferGetPage(rootbuf);
 9293 vadim4o@yahoo.com        2521                 :            711 :     rootblknum = BufferGetBlockNumber(rootbuf);
                               2522                 :                : 
                               2523                 :                :     /* acquire lock on the metapage */
 1009 pg@bowt.ie               2524                 :            711 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
 3616 kgrittn@postgresql.o     2525                 :            711 :     metapg = BufferGetPage(metabuf);
 9208 vadim4o@yahoo.com        2526                 :            711 :     metad = BTPageGetMeta(metapg);
                               2527                 :                : 
                               2528                 :                :     /*
                               2529                 :                :      * Create downlink item for left page (old root).  The key value used is
                               2530                 :                :      * "minus infinity", a sentinel value that's reliably less than any real
                               2531                 :                :      * key value that could appear in the left page.
                               2532                 :                :      */
 4363 heikki.linnakangas@i     2533                 :            711 :     left_item_sz = sizeof(IndexTupleData);
                               2534                 :            711 :     left_item = (IndexTuple) palloc(left_item_sz);
                               2535                 :            711 :     left_item->t_info = left_item_sz;
 2281 pg@bowt.ie               2536                 :            711 :     BTreeTupleSetDownLink(left_item, lbkno);
 2168                          2537                 :            711 :     BTreeTupleSetNAtts(left_item, 0, false);
                               2538                 :                : 
                               2539                 :                :     /*
                               2540                 :                :      * Create downlink item for right page.  The key for it is obtained from
                               2541                 :                :      * the "high key" position in the left page.
                               2542                 :                :      */
 4363 heikki.linnakangas@i     2543                 :            711 :     itemid = PageGetItemId(lpage, P_HIKEY);
                               2544                 :            711 :     right_item_sz = ItemIdGetLength(itemid);
                               2545                 :            711 :     item = (IndexTuple) PageGetItem(lpage, itemid);
                               2546                 :            711 :     right_item = CopyIndexTuple(item);
 2281 pg@bowt.ie               2547                 :            711 :     BTreeTupleSetDownLink(right_item, rbkno);
                               2548                 :                : 
                               2549                 :                :     /* NO EREPORT(ERROR) from here till newroot op is logged */
 9193 tgl@sss.pgh.pa.us        2550                 :            711 :     START_CRIT_SECTION();
                               2551                 :                : 
                               2552                 :                :     /* upgrade metapage if needed */
 2552 pg@bowt.ie               2553         [ -  + ]:            711 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
 2846 teodor@sigaev.ru         2554                 :UBC           0 :         _bt_upgrademetapage(metapg);
                               2555                 :                : 
                               2556                 :                :     /* set btree special data */
 1444 michael@paquier.xyz      2557                 :CBC         711 :     rootopaque = BTPageGetOpaque(rootpage);
10416 bruce@momjian.us         2558                 :            711 :     rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
 8423 tgl@sss.pgh.pa.us        2559                 :            711 :     rootopaque->btpo_flags = BTP_ROOT;
 1845 pg@bowt.ie               2560                 :            711 :     rootopaque->btpo_level =
 1444 michael@paquier.xyz      2561                 :            711 :         (BTPageGetOpaque(lpage))->btpo_level + 1;
 7251 tgl@sss.pgh.pa.us        2562                 :            711 :     rootopaque->btpo_cycleid = 0;
                               2563                 :                : 
                               2564                 :                :     /* update metapage data */
 8423                          2565                 :            711 :     metad->btm_root = rootblknum;
 1845 pg@bowt.ie               2566                 :            711 :     metad->btm_level = rootopaque->btpo_level;
 8423 tgl@sss.pgh.pa.us        2567                 :            711 :     metad->btm_fastroot = rootblknum;
 1845 pg@bowt.ie               2568                 :            711 :     metad->btm_fastlevel = rootopaque->btpo_level;
                               2569                 :                : 
                               2570                 :                :     /*
                               2571                 :                :      * Insert the left page pointer into the new root page.  The root page is
                               2572                 :                :      * the rightmost page on its level so there is no "high key" in it; the
                               2573                 :                :      * two items will go into positions P_HIKEY and P_FIRSTKEY.
                               2574                 :                :      *
                               2575                 :                :      * Note: we *must* insert the two items in item-number order, for the
                               2576                 :                :      * benefit of _bt_restore_page().
                               2577                 :                :      */
 2887 teodor@sigaev.ru         2578   [ +  -  -  + ]:            711 :     Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
  139 peter@eisentraut.org     2579         [ -  + ]:GNC         711 :     if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
 6649 tgl@sss.pgh.pa.us        2580         [ #  # ]:UBC           0 :         elog(PANIC, "failed to add leftkey to new root page"
                               2581                 :                :              " while splitting block %u of index \"%s\"",
                               2582                 :                :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
                               2583                 :                : 
                               2584                 :                :     /*
                               2585                 :                :      * insert the right page pointer into the new root page.
                               2586                 :                :      */
 2552 pg@bowt.ie               2587   [ +  -  -  + ]:CBC         711 :     Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
                               2588   [ +  -  -  + ]:            711 :     Assert(BTreeTupleGetNAtts(right_item, rel) <=
                               2589                 :                :            IndexRelationGetNumberOfKeyAttributes(rel));
  139 peter@eisentraut.org     2590         [ -  + ]:GNC         711 :     if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
 6649 tgl@sss.pgh.pa.us        2591         [ #  # ]:UBC           0 :         elog(PANIC, "failed to add rightkey to new root page"
                               2592                 :                :              " while splitting block %u of index \"%s\"",
                               2593                 :                :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
                               2594                 :                : 
                               2595                 :                :     /* Clear the incomplete-split flag in the left child */
 4380 heikki.linnakangas@i     2596         [ -  + ]:CBC         711 :     Assert(P_INCOMPLETE_SPLIT(lopaque));
                               2597                 :            711 :     lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
                               2598                 :            711 :     MarkBufferDirty(lbuf);
                               2599                 :                : 
 7289 tgl@sss.pgh.pa.us        2600                 :            711 :     MarkBufferDirty(rootbuf);
                               2601                 :            711 :     MarkBufferDirty(metabuf);
                               2602                 :                : 
                               2603                 :                :     /* XLOG stuff */
 5571 rhaas@postgresql.org     2604   [ +  +  +  +  :            711 :     if (RelationNeedsWAL(rel))
                                        +  -  +  - ]
 9293 vadim4o@yahoo.com        2605                 :GIC         695 :     {
                               2606                 :                :         xl_btree_newroot xlrec;
                               2607                 :                :         xl_btree_metadata md;
                               2608                 :                : 
 8423 tgl@sss.pgh.pa.us        2609                 :CBC         695 :         xlrec.rootblk = rootblknum;
 9208 vadim4o@yahoo.com        2610                 :            695 :         xlrec.level = metad->btm_level;
                               2611                 :                : 
 4133 heikki.linnakangas@i     2612                 :            695 :         XLogBeginInsert();
  397 peter@eisentraut.org     2613                 :            695 :         XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
                               2614                 :                : 
 4133 heikki.linnakangas@i     2615                 :            695 :         XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
                               2616                 :            695 :         XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
 3054 tgl@sss.pgh.pa.us        2617                 :            695 :         XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
                               2618                 :                : 
 2552 pg@bowt.ie               2619         [ -  + ]:            695 :         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
                               2620                 :            695 :         md.version = metad->btm_version;
 4133 heikki.linnakangas@i     2621                 :            695 :         md.root = rootblknum;
                               2622                 :            695 :         md.level = metad->btm_level;
                               2623                 :            695 :         md.fastroot = rootblknum;
                               2624                 :            695 :         md.fastlevel = metad->btm_level;
 1845 pg@bowt.ie               2625                 :            695 :         md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
 2209                          2626                 :            695 :         md.allequalimage = metad->btm_allequalimage;
                               2627                 :                : 
  397 peter@eisentraut.org     2628                 :            695 :         XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
                               2629                 :                : 
                               2630                 :                :         /*
                               2631                 :                :          * Direct access to page is not good but faster - we should implement
                               2632                 :                :          * some new func in page API.
                               2633                 :                :          */
 4133 heikki.linnakangas@i     2634                 :            695 :         XLogRegisterBufData(0,
 3189 tgl@sss.pgh.pa.us        2635                 :            695 :                             (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
 4133 heikki.linnakangas@i     2636                 :            695 :                             ((PageHeader) rootpage)->pd_special -
                               2637                 :            695 :                             ((PageHeader) rootpage)->pd_upper);
                               2638                 :                : 
                               2639                 :            695 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
                               2640                 :                :     }
                               2641                 :                :     else
    2 pg@bowt.ie               2642                 :GNC          16 :         recptr = XLogGetFakeLSN(rel);
                               2643                 :                : 
                               2644                 :            711 :     PageSetLSN(lpage, recptr);
                               2645                 :            711 :     PageSetLSN(rootpage, recptr);
                               2646                 :            711 :     PageSetLSN(metapg, recptr);
                               2647                 :                : 
 9193 tgl@sss.pgh.pa.us        2648         [ -  + ]:CBC         711 :     END_CRIT_SECTION();
                               2649                 :                : 
                               2650                 :                :     /* done with metapage */
 7289                          2651                 :            711 :     _bt_relbuf(rel, metabuf);
                               2652                 :                : 
 4363 heikki.linnakangas@i     2653                 :            711 :     pfree(left_item);
                               2654                 :            711 :     pfree(right_item);
                               2655                 :                : 
 7368 neilc@samurai.com        2656                 :            711 :     return rootbuf;
                               2657                 :                : }
                               2658                 :                : 
                               2659                 :                : /*
                               2660                 :                :  *  _bt_pgaddtup() -- add a data item to a particular page during split.
                               2661                 :                :  *
                               2662                 :                :  *      The difference between this routine and a bare PageAddItem call is
                               2663                 :                :  *      that this code can deal with the first data item on an internal btree
                               2664                 :                :  *      page in passing.  This data item (which is called "firstright" within
                               2665                 :                :  *      _bt_split()) has a key that must be treated as minus infinity after
                               2666                 :                :  *      the split.  Therefore, we truncate away all attributes when caller
                               2667                 :                :  *      specifies it's the first data item on page (downlink is not changed,
                               2668                 :                :  *      though).  This extra step is only needed for the right page of an
                               2669                 :                :  *      internal page split.  There is no need to do this for the first data
                               2670                 :                :  *      item on the existing/left page, since that will already have been
                               2671                 :                :  *      truncated during an earlier page split.
                               2672                 :                :  *
                               2673                 :                :  *      See _bt_split() for a high level explanation of why we truncate here.
                               2674                 :                :  *      Note that this routine has nothing to do with suffix truncation,
                               2675                 :                :  *      despite using some of the same infrastructure.
                               2676                 :                :  */
                               2677                 :                : static inline bool
 5677 tgl@sss.pgh.pa.us        2678                 :        4141784 : _bt_pgaddtup(Page page,
                               2679                 :                :              Size itemsize,
                               2680                 :                :              const IndexTupleData *itup,
                               2681                 :                :              OffsetNumber itup_off,
                               2682                 :                :              bool newfirstdataitem)
                               2683                 :                : {
                               2684                 :                :     IndexTupleData trunctuple;
                               2685                 :                : 
 2162 pg@bowt.ie               2686         [ +  + ]:        4141784 :     if (newfirstdataitem)
                               2687                 :                :     {
 7354 tgl@sss.pgh.pa.us        2688                 :            158 :         trunctuple = *itup;
                               2689                 :            158 :         trunctuple.t_info = sizeof(IndexTupleData);
 2168 pg@bowt.ie               2690                 :            158 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
 7354 tgl@sss.pgh.pa.us        2691                 :            158 :         itup = &trunctuple;
                               2692                 :            158 :         itemsize = sizeof(IndexTupleData);
                               2693                 :                :     }
                               2694                 :                : 
  139 peter@eisentraut.org     2695         [ -  + ]:GNC     4141784 :     if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
 5677 tgl@sss.pgh.pa.us        2696                 :UBC           0 :         return false;
                               2697                 :                : 
 5677 tgl@sss.pgh.pa.us        2698                 :CBC     4141784 :     return true;
                               2699                 :                : }
                               2700                 :                : 
                               2701                 :                : /*
                               2702                 :                :  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
                               2703                 :                :  *
                               2704                 :                :  * There are three operations performed here: simple index deletion, bottom-up
                               2705                 :                :  * index deletion, and deduplication.  If all three operations fail to free
                               2706                 :                :  * enough space for the incoming item then caller will go on to split the
                               2707                 :                :  * page.  We always consider simple deletion first.  If that doesn't work out
                               2708                 :                :  * we consider alternatives.  Callers that only want us to consider simple
                               2709                 :                :  * deletion (without any fallback) ask for that using the 'simpleonly'
                               2710                 :                :  * argument.
                               2711                 :                :  *
                               2712                 :                :  * We usually pick only one alternative "complex" operation when simple
                               2713                 :                :  * deletion alone won't prevent a page split.  The 'checkingunique',
                               2714                 :                :  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
                               2715                 :                :  *
                               2716                 :                :  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
                               2717                 :                :  * level flag was found set.  The flag was useful back when there wasn't
                               2718                 :                :  * necessarily one single page for a duplicate tuple to go on (before heap TID
                               2719                 :                :  * became a part of the key space in version 4 indexes).  But we don't
                               2720                 :                :  * actually look at the flag anymore (it's not a gating condition for our
                               2721                 :                :  * caller).  That would cause us to miss tuples that are safe to delete,
                               2722                 :                :  * without getting any benefit in return.  We know that the alternative is to
                               2723                 :                :  * split the page; scanning the line pointer array in passing won't have
                               2724                 :                :  * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
                               2725                 :                :  * all this because !heapkeyspace indexes must still do a "getting tired"
                               2726                 :                :  * linear search, and so are likely to get some benefit from using it as a
                               2727                 :                :  * gating condition.)
                               2728                 :                :  */
                               2729                 :                : static void
 1944 pg@bowt.ie               2730                 :          28232 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
                               2731                 :                :                              BTInsertState insertstate,
                               2732                 :                :                              bool simpleonly, bool checkingunique,
                               2733                 :                :                              bool uniquedup, bool indexUnchanged)
                               2734                 :                : {
                               2735                 :                :     OffsetNumber deletable[MaxIndexTuplesPerPage];
 7102 bruce@momjian.us         2736                 :          28232 :     int         ndeletable = 0;
                               2737                 :                :     OffsetNumber offnum,
                               2738                 :                :                 minoff,
                               2739                 :                :                 maxoff;
 1944 pg@bowt.ie               2740                 :          28232 :     Buffer      buffer = insertstate->buf;
                               2741                 :          28232 :     BTScanInsert itup_key = insertstate->itup_key;
 3616 kgrittn@postgresql.o     2742                 :          28232 :     Page        page = BufferGetPage(buffer);
 1444 michael@paquier.xyz      2743                 :          28232 :     BTPageOpaque opaque = BTPageGetOpaque(page);
                               2744                 :                : 
 2552 pg@bowt.ie               2745         [ -  + ]:          28232 :     Assert(P_ISLEAF(opaque));
 1887                          2746   [ +  +  -  + ]:          28232 :     Assert(simpleonly || itup_key->heapkeyspace);
                               2747   [ +  +  +  -  :          28232 :     Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
                                        +  -  -  + ]
                               2748                 :                : 
                               2749                 :                :     /*
                               2750                 :                :      * Scan over all items to see which ones need to be deleted according to
                               2751                 :                :      * LP_DEAD flags.  We'll usually manage to delete a few extra items that
                               2752                 :                :      * are not marked LP_DEAD in passing.  Often the extra items that actually
                               2753                 :                :      * end up getting deleted are items that would have had their LP_DEAD bit
                               2754                 :                :      * set before long anyway (if we opted not to include them as extras).
                               2755                 :                :      */
                               2756         [ +  + ]:          28232 :     minoff = P_FIRSTDATAKEY(opaque);
 7173 tgl@sss.pgh.pa.us        2757                 :          28232 :     maxoff = PageGetMaxOffsetNumber(page);
 1887 pg@bowt.ie               2758                 :          28232 :     for (offnum = minoff;
 7173 tgl@sss.pgh.pa.us        2759         [ +  + ]:        7773867 :          offnum <= maxoff;
                               2760                 :        7745635 :          offnum = OffsetNumberNext(offnum))
                               2761                 :                :     {
 7102 bruce@momjian.us         2762                 :        7745635 :         ItemId      itemId = PageGetItemId(page, offnum);
                               2763                 :                : 
 6759 tgl@sss.pgh.pa.us        2764         [ +  + ]:        7745635 :         if (ItemIdIsDead(itemId))
 7173                          2765                 :         130296 :             deletable[ndeletable++] = offnum;
                               2766                 :                :     }
                               2767                 :                : 
                               2768         [ +  + ]:          28232 :     if (ndeletable > 0)
                               2769                 :                :     {
 1887 pg@bowt.ie               2770                 :           3790 :         _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
                               2771                 :                :                            insertstate->itup, minoff, maxoff);
 1944                          2772                 :           3790 :         insertstate->bounds_valid = false;
                               2773                 :                : 
                               2774                 :                :         /* Return when a page split has already been avoided */
                               2775         [ +  + ]:           3790 :         if (PageGetFreeSpace(page) >= insertstate->itemsz)
                               2776                 :          13076 :             return;
                               2777                 :                : 
                               2778                 :                :         /* Might as well assume duplicates (if checkingunique) */
                               2779                 :             52 :         uniquedup = true;
                               2780                 :                :     }
                               2781                 :                : 
                               2782                 :                :     /*
                               2783                 :                :      * We're done with simple deletion.  Return early with callers that only
                               2784                 :                :      * call here so that simple deletion can be considered.  This includes
                               2785                 :                :      * callers that explicitly ask for this and checkingunique callers that
                               2786                 :                :      * probably don't have any version churn duplicates on the page.
                               2787                 :                :      *
                               2788                 :                :      * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
                               2789                 :                :      * return at this point (or when we go on the try either or both of our
                               2790                 :                :      * other strategies and they also fail).  We do not bother expending a
                               2791                 :                :      * separate write to clear it, however.  Caller will definitely clear it
                               2792                 :                :      * when it goes on to split the page (note also that the deduplication
                               2793                 :                :      * process will clear the flag in passing, just to keep things tidy).
                               2794                 :                :      */
 1887                          2795   [ +  -  +  +  :          24494 :     if (simpleonly || (checkingunique && !uniquedup))
                                              +  + ]
                               2796                 :                :     {
                               2797         [ -  + ]:           9124 :         Assert(!indexUnchanged);
 1944                          2798                 :           9124 :         return;
                               2799                 :                :     }
                               2800                 :                : 
                               2801                 :                :     /* Assume bounds about to be invalidated (this is almost certain now) */
                               2802                 :          15370 :     insertstate->bounds_valid = false;
                               2803                 :                : 
                               2804                 :                :     /*
                               2805                 :                :      * Perform bottom-up index deletion pass when executor hint indicated that
                               2806                 :                :      * incoming item is logically unchanged, or for a unique index that is
                               2807                 :                :      * known to have physical duplicates for some other reason.  (There is a
                               2808                 :                :      * large overlap between these two cases for a unique index.  It's worth
                               2809                 :                :      * having both triggering conditions in order to apply the optimization in
                               2810                 :                :      * the event of successive related INSERT and DELETE statements.)
                               2811                 :                :      *
                               2812                 :                :      * We'll go on to do a deduplication pass when a bottom-up pass fails to
                               2813                 :                :      * delete an acceptable amount of free space (a significant fraction of
                               2814                 :                :      * the page, or space for the new item, whichever is greater).
                               2815                 :                :      *
                               2816                 :                :      * Note: Bottom-up index deletion uses the same equality/equivalence
                               2817                 :                :      * routines as deduplication internally.  However, it does not merge
                               2818                 :                :      * together index tuples, so the same correctness considerations do not
                               2819                 :                :      * apply.  We deliberately omit an index-is-allequalimage test here.
                               2820                 :                :      */
 1887                          2821   [ +  +  +  +  :          17357 :     if ((indexUnchanged || uniquedup) &&
                                              +  + ]
                               2822                 :           1987 :         _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
                               2823                 :            214 :         return;
                               2824                 :                : 
                               2825                 :                :     /* Perform deduplication pass (when enabled and index-is-allequalimage) */
 1944                          2826   [ +  -  -  +  :          15156 :     if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
                                     +  +  +  +  +  
                                           +  +  - ]
 1062                          2827                 :          15147 :         _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
                               2828   [ +  +  +  + ]:          15147 :                        (indexUnchanged || uniquedup));
                               2829                 :                : }
                               2830                 :                : 
                               2831                 :                : /*
                               2832                 :                :  * _bt_simpledel_pass - Simple index tuple deletion pass.
                               2833                 :                :  *
                               2834                 :                :  * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
                               2835                 :                :  * of all such tuples are determined by caller (caller passes these to us as
                               2836                 :                :  * its 'deletable' argument).
                               2837                 :                :  *
                               2838                 :                :  * We might also delete extra index tuples that turn out to be safe to delete
                               2839                 :                :  * in passing (though they must be cheap to check in passing to begin with).
                               2840                 :                :  * There is no certainty that any extra tuples will be deleted, though.  The
                               2841                 :                :  * high level goal of the approach we take is to get the most out of each call
                               2842                 :                :  * here (without noticeably increasing the per-call overhead compared to what
                               2843                 :                :  * we need to do just to be able to delete the page's LP_DEAD-marked index
                               2844                 :                :  * tuples).
                               2845                 :                :  *
                               2846                 :                :  * The number of extra index tuples that turn out to be deletable might
                               2847                 :                :  * greatly exceed the number of LP_DEAD-marked index tuples due to various
                               2848                 :                :  * locality related effects.  For example, it's possible that the total number
                               2849                 :                :  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
                               2850                 :                :  * quite low, in which case we might end up checking if it's possible to
                               2851                 :                :  * delete _most_ index tuples on the page (without the tableam needing to
                               2852                 :                :  * access additional table blocks).  The tableam will sometimes stumble upon
                               2853                 :                :  * _many_ extra deletable index tuples in indexes where this pattern is
                               2854                 :                :  * common.
                               2855                 :                :  *
                               2856                 :                :  * See nbtree/README for further details on simple index tuple deletion.
                               2857                 :                :  */
                               2858                 :                : static void
 1887                          2859                 :           3790 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
                               2860                 :                :                    OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
                               2861                 :                :                    OffsetNumber minoff, OffsetNumber maxoff)
                               2862                 :                : {
                               2863                 :           3790 :     Page        page = BufferGetPage(buffer);
                               2864                 :                :     BlockNumber *deadblocks;
                               2865                 :                :     int         ndeadblocks;
                               2866                 :                :     TM_IndexDeleteOp delstate;
                               2867                 :                :     OffsetNumber offnum;
                               2868                 :                : 
                               2869                 :                :     /* Get array of table blocks pointed to by LP_DEAD-set tuples */
                               2870                 :           3790 :     deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
                               2871                 :                :                                 &ndeadblocks);
                               2872                 :                : 
                               2873                 :                :     /* Initialize tableam state that describes index deletion operation */
 1592                          2874                 :           3790 :     delstate.irel = rel;
                               2875                 :           3790 :     delstate.iblknum = BufferGetBlockNumber(buffer);
 1887                          2876                 :           3790 :     delstate.bottomup = false;
                               2877                 :           3790 :     delstate.bottomupfreespace = 0;
                               2878                 :           3790 :     delstate.ndeltids = 0;
                               2879                 :           3790 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
                               2880                 :           3790 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
                               2881                 :                : 
                               2882                 :           3790 :     for (offnum = minoff;
                               2883         [ +  + ]:        1086104 :          offnum <= maxoff;
                               2884                 :        1082314 :          offnum = OffsetNumberNext(offnum))
                               2885                 :                :     {
                               2886                 :        1082314 :         ItemId      itemid = PageGetItemId(page, offnum);
                               2887                 :        1082314 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                               2888                 :        1082314 :         TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
                               2889                 :        1082314 :         TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
                               2890                 :                :         BlockNumber tidblock;
                               2891                 :                :         void       *match;
                               2892                 :                : 
                               2893         [ +  + ]:        1082314 :         if (!BTreeTupleIsPosting(itup))
                               2894                 :                :         {
                               2895                 :        1033103 :             tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
                               2896                 :        1033103 :             match = bsearch(&tidblock, deadblocks, ndeadblocks,
                               2897                 :                :                             sizeof(BlockNumber), _bt_blk_cmp);
                               2898                 :                : 
                               2899         [ +  + ]:        1033103 :             if (!match)
                               2900                 :                :             {
                               2901         [ -  + ]:         650883 :                 Assert(!ItemIdIsDead(itemid));
                               2902                 :         650883 :                 continue;
                               2903                 :                :             }
                               2904                 :                : 
                               2905                 :                :             /*
                               2906                 :                :              * TID's table block is among those pointed to by the TIDs from
                               2907                 :                :              * LP_DEAD-bit set tuples on page -- add TID to deltids
                               2908                 :                :              */
                               2909                 :         382220 :             odeltid->tid = itup->t_tid;
                               2910                 :         382220 :             odeltid->id = delstate.ndeltids;
                               2911                 :         382220 :             ostatus->idxoffnum = offnum;
                               2912                 :         382220 :             ostatus->knowndeletable = ItemIdIsDead(itemid);
                               2913                 :         382220 :             ostatus->promising = false; /* unused */
                               2914                 :         382220 :             ostatus->freespace = 0; /* unused */
                               2915                 :                : 
                               2916                 :         382220 :             delstate.ndeltids++;
                               2917                 :                :         }
                               2918                 :                :         else
                               2919                 :                :         {
                               2920                 :          49211 :             int         nitem = BTreeTupleGetNPosting(itup);
                               2921                 :                : 
                               2922         [ +  + ]:         245684 :             for (int p = 0; p < nitem; p++)
                               2923                 :                :             {
                               2924                 :         196473 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
                               2925                 :                : 
                               2926                 :         196473 :                 tidblock = ItemPointerGetBlockNumber(tid);
                               2927                 :         196473 :                 match = bsearch(&tidblock, deadblocks, ndeadblocks,
                               2928                 :                :                                 sizeof(BlockNumber), _bt_blk_cmp);
                               2929                 :                : 
                               2930         [ +  + ]:         196473 :                 if (!match)
                               2931                 :                :                 {
                               2932         [ -  + ]:         172714 :                     Assert(!ItemIdIsDead(itemid));
                               2933                 :         172714 :                     continue;
                               2934                 :                :                 }
                               2935                 :                : 
                               2936                 :                :                 /*
                               2937                 :                :                  * TID's table block is among those pointed to by the TIDs
                               2938                 :                :                  * from LP_DEAD-bit set tuples on page -- add TID to deltids
                               2939                 :                :                  */
                               2940                 :          23759 :                 odeltid->tid = *tid;
                               2941                 :          23759 :                 odeltid->id = delstate.ndeltids;
                               2942                 :          23759 :                 ostatus->idxoffnum = offnum;
                               2943                 :          23759 :                 ostatus->knowndeletable = ItemIdIsDead(itemid);
                               2944                 :          23759 :                 ostatus->promising = false; /* unused */
                               2945                 :          23759 :                 ostatus->freespace = 0; /* unused */
                               2946                 :                : 
                               2947                 :          23759 :                 odeltid++;
                               2948                 :          23759 :                 ostatus++;
                               2949                 :          23759 :                 delstate.ndeltids++;
                               2950                 :                :             }
                               2951                 :                :         }
                               2952                 :                :     }
                               2953                 :                : 
                               2954                 :           3790 :     pfree(deadblocks);
                               2955                 :                : 
                               2956         [ -  + ]:           3790 :     Assert(delstate.ndeltids >= ndeletable);
                               2957                 :                : 
                               2958                 :                :     /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
                               2959                 :           3790 :     _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
                               2960                 :                : 
                               2961                 :           3790 :     pfree(delstate.deltids);
                               2962                 :           3790 :     pfree(delstate.status);
                               2963                 :           3790 : }
                               2964                 :                : 
                               2965                 :                : /*
                               2966                 :                :  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
                               2967                 :                :  *
                               2968                 :                :  * Builds sorted and unique-ified array of table block numbers from index
                               2969                 :                :  * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
                               2970                 :                :  * block from incoming newitem just in case it isn't among the LP_DEAD-related
                               2971                 :                :  * table blocks.
                               2972                 :                :  *
                               2973                 :                :  * Always counting the newitem's table block as an LP_DEAD related block makes
                               2974                 :                :  * sense because the cost is consistently low; it is practically certain that
                               2975                 :                :  * the table block will not incur a buffer miss in tableam.  On the other hand
                               2976                 :                :  * the benefit is often quite high.  There is a decent chance that there will
                               2977                 :                :  * be some deletable items from this block, since in general most garbage
                               2978                 :                :  * tuples became garbage in the recent past (in many cases this won't be the
                               2979                 :                :  * first logical row that core code added to/modified in table block
                               2980                 :                :  * recently).
                               2981                 :                :  *
                               2982                 :                :  * Returns final array, and sets *nblocks to its final size for caller.
                               2983                 :                :  */
                               2984                 :                : static BlockNumber *
                               2985                 :           3790 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
                               2986                 :                :                IndexTuple newitem, int *nblocks)
                               2987                 :                : {
                               2988                 :                :     int         spacentids,
                               2989                 :                :                 ntids;
                               2990                 :                :     BlockNumber *tidblocks;
                               2991                 :                : 
                               2992                 :                :     /*
                               2993                 :                :      * Accumulate each TID's block in array whose initial size has space for
                               2994                 :                :      * one table block per LP_DEAD-set tuple (plus space for the newitem table
                               2995                 :                :      * block).  Array will only need to grow when there are LP_DEAD-marked
                               2996                 :                :      * posting list tuples (which is not that common).
                               2997                 :                :      */
                               2998                 :           3790 :     spacentids = ndeletable + 1;
                               2999                 :           3790 :     ntids = 0;
   95 michael@paquier.xyz      3000                 :GNC        3790 :     tidblocks = palloc_array(BlockNumber, spacentids);
                               3001                 :                : 
                               3002                 :                :     /*
                               3003                 :                :      * First add the table block for the incoming newitem.  This is the one
                               3004                 :                :      * case where simple deletion can visit a table block that doesn't have
                               3005                 :                :      * any known deletable items.
                               3006                 :                :      */
 1887 pg@bowt.ie               3007   [ +  -  -  + ]:CBC        3790 :     Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
                               3008                 :           3790 :     tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
                               3009                 :                : 
                               3010         [ +  + ]:         134086 :     for (int i = 0; i < ndeletable; i++)
                               3011                 :                :     {
                               3012                 :         130296 :         ItemId      itemid = PageGetItemId(page, deletable[i]);
                               3013                 :         130296 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                               3014                 :                : 
                               3015         [ -  + ]:         130296 :         Assert(ItemIdIsDead(itemid));
                               3016                 :                : 
                               3017         [ +  + ]:         130296 :         if (!BTreeTupleIsPosting(itup))
                               3018                 :                :         {
                               3019         [ +  + ]:         126062 :             if (ntids + 1 > spacentids)
                               3020                 :                :             {
                               3021                 :            107 :                 spacentids *= 2;
                               3022                 :                :                 tidblocks = (BlockNumber *)
                               3023                 :            107 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
                               3024                 :                :             }
                               3025                 :                : 
                               3026                 :         126062 :             tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
                               3027                 :                :         }
                               3028                 :                :         else
                               3029                 :                :         {
                               3030                 :           4234 :             int         nposting = BTreeTupleGetNPosting(itup);
                               3031                 :                : 
                               3032         [ +  + ]:           4234 :             if (ntids + nposting > spacentids)
                               3033                 :                :             {
                               3034                 :            111 :                 spacentids = Max(spacentids * 2, ntids + nposting);
                               3035                 :                :                 tidblocks = (BlockNumber *)
                               3036                 :            111 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
                               3037                 :                :             }
                               3038                 :                : 
                               3039         [ +  + ]:          14381 :             for (int j = 0; j < nposting; j++)
                               3040                 :                :             {
                               3041                 :          10147 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
                               3042                 :                : 
                               3043                 :          10147 :                 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
                               3044                 :                :             }
                               3045                 :                :         }
                               3046                 :                :     }
                               3047                 :                : 
                               3048                 :           3790 :     qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
                               3049                 :           3790 :     *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
                               3050                 :                : 
                               3051                 :           3790 :     return tidblocks;
                               3052                 :                : }
                               3053                 :                : 
                               3054                 :                : /*
                               3055                 :                :  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
                               3056                 :                :  */
                               3057                 :                : static inline int
                               3058                 :        2750511 : _bt_blk_cmp(const void *arg1, const void *arg2)
                               3059                 :                : {
   48 peter@eisentraut.org     3060                 :GNC     2750511 :     BlockNumber b1 = *((const BlockNumber *) arg1);
                               3061                 :        2750511 :     BlockNumber b2 = *((const BlockNumber *) arg2);
                               3062                 :                : 
  758 nathan@postgresql.or     3063                 :CBC     2750511 :     return pg_cmp_u32(b1, b2);
                               3064                 :                : }
        

Generated by: LCOV version 2.4-beta