LCOV - differential code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions) Coverage Total Hit UBC CBC
Current: c70b6db34ffeab48beef1fb4ce61bcad3772b8dd vs 06473f5a344df8c9594ead90a609b86f6724cff8 Lines: 96.8 % 346 335 11 335
Current Date: 2025-09-06 07:49:51 +0900 Functions: 100.0 % 12 12 12
Baseline: lcov-20250906-005545-baseline Branches: 70.4 % 226 159 67 159
Baseline Date: 2025-09-05 08:21:35 +0100 Line coverage date bins:
Legend: Lines:     hit not hit
Branches: + taken - not taken # not executed
(30,360] days: 100.0 % 5 5 5
(360..) days: 96.8 % 341 330 11 330
Function coverage date bins:
(360..) days: 100.0 % 12 12 12
Branch coverage date bins:
(30,360] days: 50.0 % 4 2 2 2
(360..) days: 70.7 % 222 157 65 157

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * nbtdedup.c
                                  4                 :                :  *    Deduplicate or bottom-up delete items in Postgres btrees.
                                  5                 :                :  *
                                  6                 :                :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
                                  7                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :                :  *
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/backend/access/nbtree/nbtdedup.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : #include "postgres.h"
                                 16                 :                : 
                                 17                 :                : #include "access/nbtree.h"
                                 18                 :                : #include "access/nbtxlog.h"
                                 19                 :                : #include "access/tableam.h"
                                 20                 :                : #include "access/xloginsert.h"
                                 21                 :                : #include "miscadmin.h"
                                 22                 :                : #include "utils/rel.h"
                                 23                 :                : 
                                 24                 :                : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                                 25                 :                :                                            TM_IndexDeleteOp *delstate);
                                 26                 :                : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
                                 27                 :                :                              OffsetNumber minoff, IndexTuple newitem);
                                 28                 :                : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
                                 29                 :                :                                      Size newitemsz);
                                 30                 :                : #ifdef USE_ASSERT_CHECKING
                                 31                 :                : static bool _bt_posting_valid(IndexTuple posting);
                                 32                 :                : #endif
                                 33                 :                : 
                                 34                 :                : /*
                                 35                 :                :  * Perform a deduplication pass.
                                 36                 :                :  *
                                 37                 :                :  * The general approach taken here is to perform as much deduplication as
                                 38                 :                :  * possible to free as much space as possible.  Note, however, that "single
                                 39                 :                :  * value" strategy is used for !bottomupdedup callers when the page is full of
                                 40                 :                :  * tuples of a single value.  Deduplication passes that apply the strategy
                                 41                 :                :  * will leave behind a few untouched tuples at the end of the page, preparing
                                 42                 :                :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
                                 43                 :                :  * value strategy.  Our high level goal is to delay merging the untouched
                                 44                 :                :  * tuples until after the page splits.
                                 45                 :                :  *
                                 46                 :                :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
                                 47                 :                :  * high level goal is to prevent a page split entirely by buying more time.
                                 48                 :                :  * We still hope that a page split can be avoided altogether.  That's why
                                 49                 :                :  * single value strategy is not even considered for bottomupdedup callers.
                                 50                 :                :  *
                                 51                 :                :  * The page will have to be split if we cannot successfully free at least
                                 52                 :                :  * newitemsz (we also need space for newitem's line pointer, which isn't
                                 53                 :                :  * included in caller's newitemsz).
                                 54                 :                :  *
                                 55                 :                :  * Note: Caller should have already deleted all existing items with their
                                 56                 :                :  * LP_DEAD bits set.
                                 57                 :                :  */
                                 58                 :                : void
  872 pg@bowt.ie                 59                 :CBC       13283 : _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
                                 60                 :                :                bool bottomupdedup)
                                 61                 :                : {
                                 62                 :                :     OffsetNumber offnum,
                                 63                 :                :                 minoff,
                                 64                 :                :                 maxoff;
 2019                            65                 :          13283 :     Page        page = BufferGetPage(buf);
 1254 michael@paquier.xyz        66                 :          13283 :     BTPageOpaque opaque = BTPageGetOpaque(page);
                                 67                 :                :     Page        newpage;
                                 68                 :                :     BTDedupState state;
 1249 tgl@sss.pgh.pa.us          69                 :          13283 :     Size        pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
 2019 pg@bowt.ie                 70                 :          13283 :     bool        singlevalstrat = false;
 1988                            71                 :          13283 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
                                 72                 :                : 
                                 73                 :                :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
 2019                            74                 :          13283 :     newitemsz += sizeof(ItemIdData);
                                 75                 :                : 
                                 76                 :                :     /*
                                 77                 :                :      * Initialize deduplication state.
                                 78                 :                :      *
                                 79                 :                :      * It would be possible for maxpostingsize (limit on posting list tuple
                                 80                 :                :      * size) to be set to one third of the page.  However, it seems like a
                                 81                 :                :      * good idea to limit the size of posting lists to one sixth of a page.
                                 82                 :                :      * That ought to leave us with a good split point when pages full of
                                 83                 :                :      * duplicates can be split several times.
                                 84                 :                :      */
                                 85                 :          13283 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
                                 86                 :          13283 :     state->deduplicate = true;
 1905                            87                 :          13283 :     state->nmaxitems = 0;
  179                            88                 :          13283 :     state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
                                 89                 :                :     /* Metadata about base tuple of current pending posting list */
 2019                            90                 :          13283 :     state->base = NULL;
                                 91                 :          13283 :     state->baseoff = InvalidOffsetNumber;
                                 92                 :          13283 :     state->basetupsize = 0;
                                 93                 :                :     /* Metadata about current pending posting list TIDs */
                                 94                 :          13283 :     state->htids = palloc(state->maxpostingsize);
                                 95                 :          13283 :     state->nhtids = 0;
                                 96                 :          13283 :     state->nitems = 0;
                                 97                 :                :     /* Size of all physical tuples to be replaced by pending posting list */
                                 98                 :          13283 :     state->phystupsize = 0;
                                 99                 :                :     /* nintervals should be initialized to zero */
                                100                 :          13283 :     state->nintervals = 0;
                                101                 :                : 
 1754                           102         [ +  + ]:          13283 :     minoff = P_FIRSTDATAKEY(opaque);
                                103                 :          13283 :     maxoff = PageGetMaxOffsetNumber(page);
                                104                 :                : 
                                105                 :                :     /*
                                106                 :                :      * Consider applying "single value" strategy, though only if the page
                                107                 :                :      * seems likely to be split in the near future
                                108                 :                :      */
 1446                           109         [ +  + ]:          13283 :     if (!bottomupdedup)
 2019                           110                 :          11561 :         singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
                                111                 :                : 
                                112                 :                :     /*
                                113                 :                :      * Deduplicate items from page, and write them to newpage.
                                114                 :                :      *
                                115                 :                :      * Copy the original page's LSN into newpage copy.  This will become the
                                116                 :                :      * updated version of the page.  We need this because XLogInsert will
                                117                 :                :      * examine the LSN and possibly dump it in a page image.
                                118                 :                :      */
                                119                 :          13283 :     newpage = PageGetTempPageCopySpecial(page);
                                120                 :          13283 :     PageSetLSN(newpage, PageGetLSN(page));
                                121                 :                : 
                                122                 :                :     /* Copy high key, if any */
                                123         [ +  + ]:          13283 :     if (!P_RIGHTMOST(opaque))
                                124                 :                :     {
                                125                 :          10508 :         ItemId      hitemid = PageGetItemId(page, P_HIKEY);
                                126                 :          10508 :         Size        hitemsz = ItemIdGetLength(hitemid);
                                127                 :          10508 :         IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);
                                128                 :                : 
                                129         [ -  + ]:          10508 :         if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
                                130                 :                :                         false, false) == InvalidOffsetNumber)
 2019 pg@bowt.ie                131         [ #  # ]:UBC           0 :             elog(ERROR, "deduplication failed to add highkey");
                                132                 :                :     }
                                133                 :                : 
 2019 pg@bowt.ie                134                 :CBC       13283 :     for (offnum = minoff;
                                135         [ +  + ]:        2984592 :          offnum <= maxoff;
                                136                 :        2971309 :          offnum = OffsetNumberNext(offnum))
                                137                 :                :     {
                                138                 :        2971309 :         ItemId      itemid = PageGetItemId(page, offnum);
                                139                 :        2971309 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                                140                 :                : 
                                141         [ -  + ]:        2971309 :         Assert(!ItemIdIsDead(itemid));
                                142                 :                : 
                                143         [ +  + ]:        2971309 :         if (offnum == minoff)
                                144                 :                :         {
                                145                 :                :             /*
                                146                 :                :              * No previous/base tuple for the data item -- use the data item
                                147                 :                :              * as base tuple of pending posting list
                                148                 :                :              */
                                149                 :          13283 :             _bt_dedup_start_pending(state, itup, offnum);
                                150                 :                :         }
                                151   [ +  +  +  + ]:        5914926 :         else if (state->deduplicate &&
 1988                           152         [ +  + ]:        3370534 :                  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
 2019                           153                 :         413634 :                  _bt_dedup_save_htid(state, itup))
                                154                 :                :         {
                                155                 :                :             /*
                                156                 :                :              * Tuple is equal to base tuple of pending posting list.  Heap
                                157                 :                :              * TID(s) for itup have been saved in state.
                                158                 :                :              */
                                159                 :                :         }
                                160                 :                :         else
                                161                 :                :         {
                                162                 :                :             /*
                                163                 :                :              * Tuple is not equal to pending posting list tuple, or
                                164                 :                :              * _bt_dedup_save_htid() opted to not merge current item into
                                165                 :                :              * pending posting list for some other reason (e.g., adding more
                                166                 :                :              * TIDs would have caused posting list to exceed current
                                167                 :                :              * maxpostingsize).
                                168                 :                :              *
                                169                 :                :              * If state contains pending posting list with more than one item,
                                170                 :                :              * form new posting tuple and add it to our temp page (newpage).
                                171                 :                :              * Else add pending interval's base tuple to the temp page as-is.
                                172                 :                :              */
                                173                 :        2550766 :             pagesaving += _bt_dedup_finish_pending(newpage, state);
                                174                 :                : 
                                175         [ +  + ]:        2550766 :             if (singlevalstrat)
                                176                 :                :             {
                                177                 :                :                 /*
                                178                 :                :                  * Single value strategy's extra steps.
                                179                 :                :                  *
                                180                 :                :                  * Lower maxpostingsize for sixth and final large posting list
                                181                 :                :                  * tuple at the point where 5 maxpostingsize-capped tuples
                                182                 :                :                  * have either been formed or observed.
                                183                 :                :                  *
                                184                 :                :                  * When a sixth maxpostingsize-capped item is formed/observed,
                                185                 :                :                  * stop merging together tuples altogether.  The few tuples
                                186                 :                :                  * that remain at the end of the page won't be merged together
                                187                 :                :                  * at all (at least not until after a future page split takes
                                188                 :                :                  * place, when this page's newly allocated right sibling page
                                189                 :                :                  * gets its first deduplication pass).
                                190                 :                :                  */
 1905                           191         [ +  + ]:           2674 :                 if (state->nmaxitems == 5)
 2019                           192                 :            326 :                     _bt_singleval_fillfactor(page, state, newitemsz);
 1905                           193         [ +  + ]:           2348 :                 else if (state->nmaxitems == 6)
                                194                 :                :                 {
 2019                           195                 :            127 :                     state->deduplicate = false;
                                196                 :            127 :                     singlevalstrat = false; /* won't be back here */
                                197                 :                :                 }
                                198                 :                :             }
                                199                 :                : 
                                200                 :                :             /* itup starts new pending posting list */
                                201                 :        2550766 :             _bt_dedup_start_pending(state, itup, offnum);
                                202                 :                :         }
                                203                 :                :     }
                                204                 :                : 
                                205                 :                :     /* Handle the last item */
                                206                 :          13283 :     pagesaving += _bt_dedup_finish_pending(newpage, state);
                                207                 :                : 
                                208                 :                :     /*
                                209                 :                :      * If no items suitable for deduplication were found, newpage must be
                                210                 :                :      * exactly the same as the original page, so just return from function.
                                211                 :                :      *
                                212                 :                :      * We could determine whether or not to proceed on the basis the space
                                213                 :                :      * savings being sufficient to avoid an immediate page split instead.  We
                                214                 :                :      * don't do that because there is some small value in nbtsplitloc.c always
                                215                 :                :      * operating against a page that is fully deduplicated (apart from
                                216                 :                :      * newitem).  Besides, most of the cost has already been paid.
                                217                 :                :      */
                                218         [ +  + ]:          13283 :     if (state->nintervals == 0)
                                219                 :                :     {
                                220                 :                :         /* cannot leak memory here */
                                221                 :           2303 :         pfree(newpage);
                                222                 :           2303 :         pfree(state->htids);
                                223                 :           2303 :         pfree(state);
                                224                 :           2303 :         return;
                                225                 :                :     }
                                226                 :                : 
                                227                 :                :     /*
                                228                 :                :      * By here, it's clear that deduplication will definitely go ahead.
                                229                 :                :      *
                                230                 :                :      * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
                                231                 :                :      * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
                                232                 :                :      * But keep things tidy.
                                233                 :                :      */
                                234         [ -  + ]:          10980 :     if (P_HAS_GARBAGE(opaque))
                                235                 :                :     {
 1254 michael@paquier.xyz       236                 :UBC           0 :         BTPageOpaque nopaque = BTPageGetOpaque(newpage);
                                237                 :                : 
 2019 pg@bowt.ie                238                 :              0 :         nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
                                239                 :                :     }
                                240                 :                : 
 2019 pg@bowt.ie                241                 :CBC       10980 :     START_CRIT_SECTION();
                                242                 :                : 
                                243                 :          10980 :     PageRestoreTempPage(newpage, page);
                                244                 :          10980 :     MarkBufferDirty(buf);
                                245                 :                : 
                                246                 :                :     /* XLOG stuff */
                                247   [ +  +  +  +  :          10980 :     if (RelationNeedsWAL(rel))
                                        +  -  +  - ]
                                248                 :                :     {
                                249                 :                :         XLogRecPtr  recptr;
                                250                 :                :         xl_btree_dedup xlrec_dedup;
                                251                 :                : 
                                252                 :          10917 :         xlrec_dedup.nintervals = state->nintervals;
                                253                 :                : 
                                254                 :          10917 :         XLogBeginInsert();
                                255                 :          10917 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
  207 peter@eisentraut.org      256                 :          10917 :         XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
                                257                 :                : 
                                258                 :                :         /*
                                259                 :                :          * The intervals array is not in the buffer, but pretend that it is.
                                260                 :                :          * When XLogInsert stores the whole buffer, the array need not be
                                261                 :                :          * stored too.
                                262                 :                :          */
                                263                 :          10917 :         XLogRegisterBufData(0, state->intervals,
 2019 pg@bowt.ie                264                 :          10917 :                             state->nintervals * sizeof(BTDedupInterval));
                                265                 :                : 
                                266                 :          10917 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
                                267                 :                : 
                                268                 :          10917 :         PageSetLSN(page, recptr);
                                269                 :                :     }
                                270                 :                : 
                                271         [ -  + ]:          10980 :     END_CRIT_SECTION();
                                272                 :                : 
                                273                 :                :     /* Local space accounting should agree with page accounting */
                                274   [ +  +  -  + ]:          10980 :     Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
                                275                 :                : 
                                276                 :                :     /* cannot leak memory here */
                                277                 :          10980 :     pfree(state->htids);
                                278                 :          10980 :     pfree(state);
                                279                 :                : }
                                280                 :                : 
                                281                 :                : /*
                                282                 :                :  * Perform bottom-up index deletion pass.
                                283                 :                :  *
                                284                 :                :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
                                285                 :                :  * be deleted via bottom-up index deletion.  The high level goal here is to
                                286                 :                :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
                                287                 :                :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
                                288                 :                :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
                                289                 :                :  * do not particularly care about once-off opportunities to delete many index
                                290                 :                :  * tuples together.
                                291                 :                :  *
                                292                 :                :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
                                293                 :                :  * See access/tableam.h for a description of how we're expected to cooperate
                                294                 :                :  * with the tableam.
                                295                 :                :  *
                                296                 :                :  * Returns true on success, in which case caller can assume page split will be
                                297                 :                :  * avoided for a reasonable amount of time.  Returns false when caller should
                                298                 :                :  * deduplicate the page (if possible at all).
                                299                 :                :  *
                                300                 :                :  * Note: Occasionally we return true despite failing to delete enough items to
                                301                 :                :  * avoid a split.  This makes caller skip deduplication and go split the page
                                302                 :                :  * right away.  Our return value is always just advisory information.
                                303                 :                :  *
                                304                 :                :  * Note: Caller should have already deleted all existing items with their
                                305                 :                :  * LP_DEAD bits set.
                                306                 :                :  */
                                307                 :                : bool
 1697                           308                 :           1930 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
                                309                 :                :                      Size newitemsz)
                                310                 :                : {
                                311                 :                :     OffsetNumber offnum,
                                312                 :                :                 minoff,
                                313                 :                :                 maxoff;
                                314                 :           1930 :     Page        page = BufferGetPage(buf);
 1254 michael@paquier.xyz       315                 :           1930 :     BTPageOpaque opaque = BTPageGetOpaque(page);
                                316                 :                :     BTDedupState state;
                                317                 :                :     TM_IndexDeleteOp delstate;
                                318                 :                :     bool        neverdedup;
 1697 pg@bowt.ie                319                 :           1930 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
                                320                 :                : 
                                321                 :                :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
                                322                 :           1930 :     newitemsz += sizeof(ItemIdData);
                                323                 :                : 
                                324                 :                :     /* Initialize deduplication state */
                                325                 :           1930 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
                                326                 :           1930 :     state->deduplicate = true;
                                327                 :           1930 :     state->nmaxitems = 0;
                                328                 :           1930 :     state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
                                329                 :           1930 :     state->base = NULL;
                                330                 :           1930 :     state->baseoff = InvalidOffsetNumber;
                                331                 :           1930 :     state->basetupsize = 0;
                                332                 :           1930 :     state->htids = palloc(state->maxpostingsize);
                                333                 :           1930 :     state->nhtids = 0;
                                334                 :           1930 :     state->nitems = 0;
                                335                 :           1930 :     state->phystupsize = 0;
                                336                 :           1930 :     state->nintervals = 0;
                                337                 :                : 
                                338                 :                :     /*
                                339                 :                :      * Initialize tableam state that describes bottom-up index deletion
                                340                 :                :      * operation.
                                341                 :                :      *
                                342                 :                :      * We'll go on to ask the tableam to search for TIDs whose index tuples we
                                343                 :                :      * can safely delete.  The tableam will search until our leaf page space
                                344                 :                :      * target is satisfied, or until the cost of continuing with the tableam
                                345                 :                :      * operation seems too high.  It focuses its efforts on TIDs associated
                                346                 :                :      * with duplicate index tuples that we mark "promising".
                                347                 :                :      *
                                348                 :                :      * This space target is a little arbitrary.  The tableam must be able to
                                349                 :                :      * keep the costs and benefits in balance.  We provide the tableam with
                                350                 :                :      * exhaustive information about what might work, without directly
                                351                 :                :      * concerning ourselves with avoiding work during the tableam call.  Our
                                352                 :                :      * role in costing the bottom-up deletion process is strictly advisory.
                                353                 :                :      */
 1402                           354                 :           1930 :     delstate.irel = rel;
                                355                 :           1930 :     delstate.iblknum = BufferGetBlockNumber(buf);
 1697                           356                 :           1930 :     delstate.bottomup = true;
                                357                 :           1930 :     delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
                                358                 :           1930 :     delstate.ndeltids = 0;
                                359                 :           1930 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
                                360                 :           1930 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
                                361                 :                : 
                                362         [ +  + ]:           1930 :     minoff = P_FIRSTDATAKEY(opaque);
                                363                 :           1930 :     maxoff = PageGetMaxOffsetNumber(page);
                                364                 :           1930 :     for (offnum = minoff;
                                365         [ +  + ]:         530851 :          offnum <= maxoff;
                                366                 :         528921 :          offnum = OffsetNumberNext(offnum))
                                367                 :                :     {
                                368                 :         528921 :         ItemId      itemid = PageGetItemId(page, offnum);
                                369                 :         528921 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                                370                 :                : 
                                371         [ -  + ]:         528921 :         Assert(!ItemIdIsDead(itemid));
                                372                 :                : 
                                373         [ +  + ]:         528921 :         if (offnum == minoff)
                                374                 :                :         {
                                375                 :                :             /* itup starts first pending interval */
                                376                 :           1930 :             _bt_dedup_start_pending(state, itup, offnum);
                                377                 :                :         }
                                378   [ +  +  +  - ]:         601309 :         else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                                379                 :          74318 :                  _bt_dedup_save_htid(state, itup))
                                380                 :                :         {
                                381                 :                :             /* Tuple is equal; just added its TIDs to pending interval */
                                382                 :                :         }
                                383                 :                :         else
                                384                 :                :         {
                                385                 :                :             /* Finalize interval -- move its TIDs to delete state */
                                386                 :         452673 :             _bt_bottomupdel_finish_pending(page, state, &delstate);
                                387                 :                : 
                                388                 :                :             /* itup starts new pending interval */
                                389                 :         452673 :             _bt_dedup_start_pending(state, itup, offnum);
                                390                 :                :         }
                                391                 :                :     }
                                392                 :                :     /* Finalize final interval -- move its TIDs to delete state */
                                393                 :           1930 :     _bt_bottomupdel_finish_pending(page, state, &delstate);
                                394                 :                : 
                                395                 :                :     /*
                                396                 :                :      * We don't give up now in the event of having few (or even zero)
                                397                 :                :      * promising tuples for the tableam because it's not up to us as the index
                                398                 :                :      * AM to manage costs (note that the tableam might have heuristics of its
                                399                 :                :      * own that work out what to do).  We should at least avoid having our
                                400                 :                :      * caller do a useless deduplication pass after we return in the event of
                                401                 :                :      * zero promising tuples, though.
                                402                 :                :      */
                                403                 :           1930 :     neverdedup = false;
                                404         [ +  + ]:           1930 :     if (state->nintervals == 0)
                                405                 :              4 :         neverdedup = true;
                                406                 :                : 
                                407                 :           1930 :     pfree(state->htids);
                                408                 :           1930 :     pfree(state);
                                409                 :                : 
                                410                 :                :     /* Ask tableam which TIDs are deletable, then physically delete them */
                                411                 :           1930 :     _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
                                412                 :                : 
                                413                 :           1930 :     pfree(delstate.deltids);
                                414                 :           1930 :     pfree(delstate.status);
                                415                 :                : 
                                416                 :                :     /* Report "success" to caller unconditionally to avoid deduplication */
                                417         [ +  + ]:           1930 :     if (neverdedup)
                                418                 :              4 :         return true;
                                419                 :                : 
                                420                 :                :     /* Don't dedup when we won't end up back here any time soon anyway */
                                421                 :           1926 :     return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
                                422                 :                : }
                                423                 :                : 
                                424                 :                : /*
                                425                 :                :  * Create a new pending posting list tuple based on caller's base tuple.
                                426                 :                :  *
                                427                 :                :  * Every tuple processed by deduplication either becomes the base tuple for a
                                428                 :                :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
                                429                 :                :  * A tuple that starts out as the base tuple for a posting list will only
                                430                 :                :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
                                431                 :                :  * that there are duplicates that can be merged into the base tuple.
                                432                 :                :  */
                                433                 :                : void
 2019                           434                 :        5790210 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
                                435                 :                :                         OffsetNumber baseoff)
                                436                 :                : {
                                437         [ -  + ]:        5790210 :     Assert(state->nhtids == 0);
                                438         [ -  + ]:        5790210 :     Assert(state->nitems == 0);
                                439         [ -  + ]:        5790210 :     Assert(!BTreeTupleIsPivot(base));
                                440                 :                : 
                                441                 :                :     /*
                                442                 :                :      * Copy heap TID(s) from new base tuple for new candidate posting list
                                443                 :                :      * into working state's array
                                444                 :                :      */
                                445         [ +  + ]:        5790210 :     if (!BTreeTupleIsPosting(base))
                                446                 :                :     {
                                447                 :        4841822 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
                                448                 :        4841822 :         state->nhtids = 1;
                                449                 :        4841822 :         state->basetupsize = IndexTupleSize(base);
                                450                 :                :     }
                                451                 :                :     else
                                452                 :                :     {
                                453                 :                :         int         nposting;
                                454                 :                : 
                                455                 :         948388 :         nposting = BTreeTupleGetNPosting(base);
                                456                 :         948388 :         memcpy(state->htids, BTreeTupleGetPosting(base),
                                457                 :                :                sizeof(ItemPointerData) * nposting);
                                458                 :         948388 :         state->nhtids = nposting;
                                459                 :                :         /* basetupsize should not include existing posting list */
                                460                 :         948388 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
                                461                 :                :     }
                                462                 :                : 
                                463                 :                :     /*
                                464                 :                :      * Save new base tuple itself -- it'll be needed if we actually create a
                                465                 :                :      * new posting list from new pending posting list.
                                466                 :                :      *
                                467                 :                :      * Must maintain physical size of all existing tuples (including line
                                468                 :                :      * pointer overhead) so that we can calculate space savings on page.
                                469                 :                :      */
                                470                 :        5790210 :     state->nitems = 1;
                                471                 :        5790210 :     state->base = base;
                                472                 :        5790210 :     state->baseoff = baseoff;
                                473                 :        5790210 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
                                474                 :                :     /* Also save baseoff in pending state for interval */
                                475                 :        5790210 :     state->intervals[state->nintervals].baseoff = state->baseoff;
                                476                 :        5790210 : }
                                477                 :                : 
                                478                 :                : /*
                                479                 :                :  * Save itup heap TID(s) into pending posting list where possible.
                                480                 :                :  *
                                481                 :                :  * Returns bool indicating if the pending posting list managed by state now
                                482                 :                :  * includes itup's heap TID(s).
                                483                 :                :  */
                                484                 :                : bool
                                485                 :        1254475 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
                                486                 :                : {
                                487                 :                :     int         nhtids;
                                488                 :                :     ItemPointer htids;
                                489                 :                :     Size        mergedtupsz;
                                490                 :                : 
                                491         [ -  + ]:        1254475 :     Assert(!BTreeTupleIsPivot(itup));
                                492                 :                : 
                                493         [ +  + ]:        1254475 :     if (!BTreeTupleIsPosting(itup))
                                494                 :                :     {
                                495                 :        1248115 :         nhtids = 1;
                                496                 :        1248115 :         htids = &itup->t_tid;
                                497                 :                :     }
                                498                 :                :     else
                                499                 :                :     {
                                500                 :           6360 :         nhtids = BTreeTupleGetNPosting(itup);
                                501                 :           6360 :         htids = BTreeTupleGetPosting(itup);
                                502                 :                :     }
                                503                 :                : 
                                504                 :                :     /*
                                505                 :                :      * Don't append (have caller finish pending posting list as-is) if
                                506                 :                :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
                                507                 :                :      *
                                508                 :                :      * This calculation needs to match the code used within _bt_form_posting()
                                509                 :                :      * for new posting list tuples.
                                510                 :                :      */
                                511                 :        1254475 :     mergedtupsz = MAXALIGN(state->basetupsize +
                                512                 :                :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
                                513                 :                : 
                                514         [ +  + ]:        1254475 :     if (mergedtupsz > state->maxpostingsize)
                                515                 :                :     {
                                516                 :                :         /*
                                517                 :                :          * Count this as an oversized item for single value strategy, though
                                518                 :                :          * only when there are 50 TIDs in the final posting list tuple.  This
                                519                 :                :          * limit (which is fairly arbitrary) avoids confusion about how many
                                520                 :                :          * 1/6 of a page tuples have been encountered/created by the current
                                521                 :                :          * deduplication pass.
                                522                 :                :          *
                                523                 :                :          * Note: We deliberately don't consider which deduplication pass
                                524                 :                :          * merged together tuples to create this item (could be a previous
                                525                 :                :          * deduplication pass, or current pass).  See _bt_do_singleval()
                                526                 :                :          * comments.
                                527                 :                :          */
 1905                           528         [ +  + ]:          10297 :         if (state->nhtids > 50)
                                529                 :           9685 :             state->nmaxitems++;
                                530                 :                : 
 2019                           531                 :          10297 :         return false;
                                532                 :                :     }
                                533                 :                : 
                                534                 :                :     /*
                                535                 :                :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
                                536                 :                :      * pending posting list
                                537                 :                :      */
                                538                 :        1244178 :     state->nitems++;
                                539                 :        1244178 :     memcpy(state->htids + state->nhtids, htids,
                                540                 :                :            sizeof(ItemPointerData) * nhtids);
                                541                 :        1244178 :     state->nhtids += nhtids;
                                542                 :        1244178 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
                                543                 :                : 
                                544                 :        1244178 :     return true;
                                545                 :                : }
                                546                 :                : 
                                547                 :                : /*
                                548                 :                :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
                                549                 :                :  * is based on saved base tuple, and saved list of heap TIDs.
                                550                 :                :  *
                                551                 :                :  * Returns space saving from deduplicating to make a new posting list tuple.
                                552                 :                :  * Note that this includes line pointer overhead.  This is zero in the case
                                553                 :                :  * where no deduplication was possible.
                                554                 :                :  */
                                555                 :                : Size
                                556                 :        2976729 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
                                557                 :                : {
                                558                 :                :     OffsetNumber tupoff;
                                559                 :                :     Size        tuplesz;
                                560                 :                :     Size        spacesaving;
                                561                 :                : 
                                562         [ -  + ]:        2976729 :     Assert(state->nitems > 0);
                                563         [ -  + ]:        2976729 :     Assert(state->nitems <= state->nhtids);
                                564         [ -  + ]:        2976729 :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
                                565                 :                : 
                                566                 :        2976729 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
                                567         [ +  + ]:        2976729 :     if (state->nitems == 1)
                                568                 :                :     {
                                569                 :                :         /* Use original, unchanged base tuple */
                                570                 :        2775542 :         tuplesz = IndexTupleSize(state->base);
 1128                           571         [ -  + ]:        2775542 :         Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
  179                           572         [ -  + ]:        2775542 :         Assert(tuplesz <= BTMaxItemSize);
 2019                           573         [ -  + ]:        2775542 :         if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
                                574                 :                :                         false, false) == InvalidOffsetNumber)
 2019 pg@bowt.ie                575         [ #  # ]:UBC           0 :             elog(ERROR, "deduplication failed to add tuple to page");
                                576                 :                : 
 2019 pg@bowt.ie                577                 :CBC     2775542 :         spacesaving = 0;
                                578                 :                :     }
                                579                 :                :     else
                                580                 :                :     {
                                581                 :                :         IndexTuple  final;
                                582                 :                : 
                                583                 :                :         /* Form a tuple with a posting list */
                                584                 :         201187 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
                                585                 :         201187 :         tuplesz = IndexTupleSize(final);
                                586         [ -  + ]:         201187 :         Assert(tuplesz <= state->maxpostingsize);
                                587                 :                : 
                                588                 :                :         /* Save final number of items for posting list */
                                589                 :         201187 :         state->intervals[state->nintervals].nitems = state->nitems;
                                590                 :                : 
                                591         [ -  + ]:         201187 :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
  179                           592         [ -  + ]:         201187 :         Assert(tuplesz <= BTMaxItemSize);
 2019                           593         [ -  + ]:         201187 :         if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
                                594                 :                :                         false) == InvalidOffsetNumber)
 2019 pg@bowt.ie                595         [ #  # ]:UBC           0 :             elog(ERROR, "deduplication failed to add tuple to page");
                                596                 :                : 
 2019 pg@bowt.ie                597                 :CBC      201187 :         pfree(final);
                                598                 :         201187 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
                                599                 :                :         /* Increment nintervals, since we wrote a new posting list tuple */
                                600                 :         201187 :         state->nintervals++;
                                601   [ +  -  -  + ]:         201187 :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
                                602                 :                :     }
                                603                 :                : 
                                604                 :                :     /* Reset state for next pending posting list */
                                605                 :        2976729 :     state->nhtids = 0;
                                606                 :        2976729 :     state->nitems = 0;
                                607                 :        2976729 :     state->phystupsize = 0;
                                608                 :                : 
                                609                 :        2976729 :     return spacesaving;
                                610                 :                : }
                                611                 :                : 
                                612                 :                : /*
                                613                 :                :  * Finalize interval during bottom-up index deletion.
                                614                 :                :  *
                                615                 :                :  * During a bottom-up pass we expect that TIDs will be recorded in dedup state
                                616                 :                :  * first, and then get moved over to delstate (in variable-sized batches) by
                                617                 :                :  * calling here.  Call here happens when the number of TIDs in a dedup
                                618                 :                :  * interval is known, and interval gets finalized (i.e. when caller sees next
                                619                 :                :  * tuple on the page is not a duplicate, or when caller runs out of tuples to
                                620                 :                :  * process from leaf page).
                                621                 :                :  *
                                622                 :                :  * This is where bottom-up deletion determines and remembers which entries are
                                623                 :                :  * duplicates.  This will be important information to the tableam delete
                                624                 :                :  * infrastructure later on.  Plain index tuple duplicates are marked
                                625                 :                :  * "promising" here, per tableam contract.
                                626                 :                :  *
                                627                 :                :  * Our approach to marking entries whose TIDs come from posting lists is more
                                628                 :                :  * complicated.  Posting lists can only be formed by a deduplication pass (or
                                629                 :                :  * during an index build), so recent version churn affecting the pointed-to
                                630                 :                :  * logical rows is not particularly likely.  We may still give a weak signal
                                631                 :                :  * about posting list tuples' entries (by marking just one of its TIDs/entries
                                632                 :                :  * promising), though this is only a possibility in the event of further
                                633                 :                :  * duplicate index tuples in final interval that covers posting list tuple (as
                                634                 :                :  * in the plain tuple case).  A weak signal/hint will be useful to the tableam
                                635                 :                :  * when it has no stronger signal to go with for the deletion operation as a
                                636                 :                :  * whole.
                                637                 :                :  *
                                638                 :                :  * The heuristics we use work well in practice because we only need to give
                                639                 :                :  * the tableam the right _general_ idea about where to look.  Garbage tends to
                                640                 :                :  * naturally get concentrated in relatively few table blocks with workloads
                                641                 :                :  * that bottom-up deletion targets.  The tableam cannot possibly rank all
                                642                 :                :  * available table blocks sensibly based on the hints we provide, but that's
                                643                 :                :  * okay -- only the extremes matter.  The tableam just needs to be able to
                                644                 :                :  * predict which few table blocks will have the most tuples that are safe to
                                645                 :                :  * delete for each deletion operation, with low variance across related
                                646                 :                :  * deletion operations.
                                647                 :                :  */
                                648                 :                : static void
 1697                           649                 :         454603 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                                650                 :                :                                TM_IndexDeleteOp *delstate)
                                651                 :                : {
                                652                 :         454603 :     bool        dupinterval = (state->nitems > 1);
                                653                 :                : 
                                654         [ -  + ]:         454603 :     Assert(state->nitems > 0);
                                655         [ -  + ]:         454603 :     Assert(state->nitems <= state->nhtids);
                                656         [ -  + ]:         454603 :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
                                657                 :                : 
                                658         [ +  + ]:         983524 :     for (int i = 0; i < state->nitems; i++)
                                659                 :                :     {
                                660                 :         528921 :         OffsetNumber offnum = state->baseoff + i;
                                661                 :         528921 :         ItemId      itemid = PageGetItemId(page, offnum);
                                662                 :         528921 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                                663                 :         528921 :         TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
                                664                 :         528921 :         TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
                                665                 :                : 
                                666         [ +  + ]:         528921 :         if (!BTreeTupleIsPosting(itup))
                                667                 :                :         {
                                668                 :                :             /* Simple case: A plain non-pivot tuple */
                                669                 :         422665 :             ideltid->tid = itup->t_tid;
                                670                 :         422665 :             ideltid->id = delstate->ndeltids;
                                671                 :         422665 :             istatus->idxoffnum = offnum;
                                672                 :         422665 :             istatus->knowndeletable = false; /* for now */
                                673                 :         422665 :             istatus->promising = dupinterval;    /* simple rule */
                                674                 :         422665 :             istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
                                675                 :                : 
                                676                 :         422665 :             delstate->ndeltids++;
                                677                 :                :         }
                                678                 :                :         else
                                679                 :                :         {
                                680                 :                :             /*
                                681                 :                :              * Complicated case: A posting list tuple.
                                682                 :                :              *
                                683                 :                :              * We make the conservative assumption that there can only be at
                                684                 :                :              * most one affected logical row per posting list tuple.  There
                                685                 :                :              * will be at most one promising entry in deltids to represent
                                686                 :                :              * this presumed lone logical row.  Note that this isn't even
                                687                 :                :              * considered unless the posting list tuple is also in an interval
                                688                 :                :              * of duplicates -- this complicated rule is just a variant of the
                                689                 :                :              * simple rule used to decide if plain index tuples are promising.
                                690                 :                :              */
                                691                 :         106256 :             int         nitem = BTreeTupleGetNPosting(itup);
                                692                 :         106256 :             bool        firstpromising = false;
                                693                 :         106256 :             bool        lastpromising = false;
                                694                 :                : 
                                695         [ -  + ]:         106256 :             Assert(_bt_posting_valid(itup));
                                696                 :                : 
                                697         [ +  + ]:         106256 :             if (dupinterval)
                                698                 :                :             {
                                699                 :                :                 /*
                                700                 :                :                  * Complicated rule: either the first or last TID in the
                                701                 :                :                  * posting list gets marked promising (if any at all)
                                702                 :                :                  */
                                703                 :                :                 BlockNumber minblocklist,
                                704                 :                :                             midblocklist,
                                705                 :                :                             maxblocklist;
                                706                 :                :                 ItemPointer mintid,
                                707                 :                :                             midtid,
                                708                 :                :                             maxtid;
                                709                 :                : 
                                710                 :           8535 :                 mintid = BTreeTupleGetHeapTID(itup);
                                711                 :           8535 :                 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
                                712                 :           8535 :                 maxtid = BTreeTupleGetMaxHeapTID(itup);
                                713                 :           8535 :                 minblocklist = ItemPointerGetBlockNumber(mintid);
                                714                 :           8535 :                 midblocklist = ItemPointerGetBlockNumber(midtid);
                                715                 :           8535 :                 maxblocklist = ItemPointerGetBlockNumber(maxtid);
                                716                 :                : 
                                717                 :                :                 /* Only entry with predominant table block can be promising */
                                718                 :           8535 :                 firstpromising = (minblocklist == midblocklist);
                                719   [ +  +  +  + ]:           8535 :                 lastpromising = (!firstpromising &&
                                720                 :                :                                  midblocklist == maxblocklist);
                                721                 :                :             }
                                722                 :                : 
                                723         [ +  + ]:         592174 :             for (int p = 0; p < nitem; p++)
                                724                 :                :             {
                                725                 :         485918 :                 ItemPointer htid = BTreeTupleGetPostingN(itup, p);
                                726                 :                : 
                                727                 :         485918 :                 ideltid->tid = *htid;
                                728                 :         485918 :                 ideltid->id = delstate->ndeltids;
                                729                 :         485918 :                 istatus->idxoffnum = offnum;
                                730                 :         485918 :                 istatus->knowndeletable = false; /* for now */
                                731                 :         485918 :                 istatus->promising = false;
                                732   [ +  +  +  +  :         485918 :                 if ((firstpromising && p == 0) ||
                                              +  + ]
                                733         [ +  + ]:          61888 :                     (lastpromising && p == nitem - 1))
                                734                 :           5804 :                     istatus->promising = true;
                                735                 :         485918 :                 istatus->freespace = sizeof(ItemPointerData);    /* at worst */
                                736                 :                : 
                                737                 :         485918 :                 ideltid++;
                                738                 :         485918 :                 istatus++;
                                739                 :         485918 :                 delstate->ndeltids++;
                                740                 :                :             }
                                741                 :                :         }
                                742                 :                :     }
                                743                 :                : 
                                744         [ +  + ]:         454603 :     if (dupinterval)
                                745                 :                :     {
                                746                 :          49281 :         state->intervals[state->nintervals].nitems = state->nitems;
                                747                 :          49281 :         state->nintervals++;
                                748                 :                :     }
                                749                 :                : 
                                750                 :                :     /* Reset state for next interval */
                                751                 :         454603 :     state->nhtids = 0;
                                752                 :         454603 :     state->nitems = 0;
                                753                 :         454603 :     state->phystupsize = 0;
                                754                 :         454603 : }
                                755                 :                : 
                                756                 :                : /*
                                757                 :                :  * Determine if page non-pivot tuples (data items) are all duplicates of the
                                758                 :                :  * same value -- if they are, deduplication's "single value" strategy should
                                759                 :                :  * be applied.  The general goal of this strategy is to ensure that
                                760                 :                :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
                                761                 :                :  * split point as further duplicates are inserted, and successive rightmost
                                762                 :                :  * page splits occur among pages that store the same duplicate value.  When
                                763                 :                :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
                                764                 :                :  * just like it would if deduplication were disabled.
                                765                 :                :  *
                                766                 :                :  * We expect that affected workloads will require _several_ single value
                                767                 :                :  * strategy deduplication passes (over a page that only stores duplicates)
                                768                 :                :  * before the page is finally split.  The first deduplication pass should only
                                769                 :                :  * find regular non-pivot tuples.  Later deduplication passes will find
                                770                 :                :  * existing maxpostingsize-capped posting list tuples, which must be skipped
                                771                 :                :  * over.  The penultimate pass is generally the first pass that actually
                                772                 :                :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
                                773                 :                :  * few untouched non-pivot tuples.  The final deduplication pass won't free
                                774                 :                :  * any space -- it will skip over everything without merging anything (it
                                775                 :                :  * retraces the steps of the penultimate pass).
                                776                 :                :  *
                                777                 :                :  * Fortunately, having several passes isn't too expensive.  Each pass (after
                                778                 :                :  * the first pass) won't spend many cycles on the large posting list tuples
                                779                 :                :  * left by previous passes.  Each pass will find a large contiguous group of
                                780                 :                :  * smaller duplicate tuples to merge together at the end of the page.
                                781                 :                :  */
                                782                 :                : static bool
 2019                           783                 :          11561 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
                                784                 :                :                  OffsetNumber minoff, IndexTuple newitem)
                                785                 :                : {
 1988                           786                 :          11561 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
                                787                 :                :     ItemId      itemid;
                                788                 :                :     IndexTuple  itup;
                                789                 :                : 
 2019                           790                 :          11561 :     itemid = PageGetItemId(page, minoff);
                                791                 :          11561 :     itup = (IndexTuple) PageGetItem(page, itemid);
                                792                 :                : 
 1988                           793         [ +  + ]:          11561 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
                                794                 :                :     {
 2019                           795                 :           1021 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
                                796                 :           1021 :         itup = (IndexTuple) PageGetItem(page, itemid);
                                797                 :                : 
 1988                           798         [ +  + ]:           1021 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
 2019                           799                 :            664 :             return true;
                                800                 :                :     }
                                801                 :                : 
                                802                 :          10897 :     return false;
                                803                 :                : }
                                804                 :                : 
                                805                 :                : /*
                                806                 :                :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
                                807                 :                :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
                                808                 :                :  * tuple will end up somewhat smaller than the first five.  (Note: The first
                                809                 :                :  * five tuples could actually just be very large duplicate tuples that
                                810                 :                :  * couldn't be merged together at all.  Deduplication will simply not modify
                                811                 :                :  * the page when that happens.)
                                812                 :                :  *
                                813                 :                :  * When there are six posting lists on the page (after current deduplication
                                814                 :                :  * pass goes on to create/observe a sixth very large tuple), caller should end
                                815                 :                :  * its deduplication pass.  It isn't useful to try to deduplicate items that
                                816                 :                :  * are supposed to end up on the new right sibling page following the
                                817                 :                :  * anticipated page split.  A future deduplication pass of future right
                                818                 :                :  * sibling page might take care of it.  (This is why the first single value
                                819                 :                :  * strategy deduplication pass for a given leaf page will generally find only
                                820                 :                :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
                                821                 :                :  */
                                822                 :                : static void
                                823                 :            326 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
                                824                 :                : {
                                825                 :                :     Size        leftfree;
                                826                 :                :     int         reduction;
                                827                 :                : 
                                828                 :                :     /* This calculation needs to match nbtsplitloc.c */
                                829                 :            326 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
                                830                 :                :         MAXALIGN(sizeof(BTPageOpaqueData));
                                831                 :                :     /* Subtract size of new high key (includes pivot heap TID space) */
                                832                 :            326 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
                                833                 :                : 
                                834                 :                :     /*
                                835                 :                :      * Reduce maxpostingsize by an amount equal to target free space on left
                                836                 :                :      * half of page
                                837                 :                :      */
                                838                 :            326 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
                                839         [ +  - ]:            326 :     if (state->maxpostingsize > reduction)
                                840                 :            326 :         state->maxpostingsize -= reduction;
                                841                 :                :     else
 2019 pg@bowt.ie                842                 :UBC           0 :         state->maxpostingsize = 0;
 2019 pg@bowt.ie                843                 :CBC         326 : }
                                844                 :                : 
                                845                 :                : /*
                                846                 :                :  * Build a posting list tuple based on caller's "base" index tuple and list of
                                847                 :                :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
                                848                 :                :  * posting list. (Posting list tuples can never have a single heap TID, partly
                                849                 :                :  * because that ensures that deduplication always reduces final MAXALIGN()'d
                                850                 :                :  * size of entire tuple.)
                                851                 :                :  *
                                852                 :                :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
                                853                 :                :  * than a SHORTALIGN()'d offset), in line with the approach taken when
                                854                 :                :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
                                855                 :                :  * This sometimes wastes a little space that was only needed as alignment
                                856                 :                :  * padding in the original tuple.  Following this convention simplifies the
                                857                 :                :  * space accounting used when deduplicating a page (the same convention
                                858                 :                :  * simplifies the accounting for choosing a point to split a page at).
                                859                 :                :  *
                                860                 :                :  * Note: Caller's "htids" array must be unique and already in ascending TID
                                861                 :                :  * order.  Any existing heap TIDs from "base" won't automatically appear in
                                862                 :                :  * returned posting list tuple (they must be included in htids array.)
                                863                 :                :  */
                                864                 :                : IndexTuple
                                865                 :         246742 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
                                866                 :                : {
                                867                 :                :     uint32      keysize,
                                868                 :                :                 newsize;
                                869                 :                :     IndexTuple  itup;
                                870                 :                : 
                                871         [ +  + ]:         246742 :     if (BTreeTupleIsPosting(base))
                                872                 :          66861 :         keysize = BTreeTupleGetPostingOffset(base);
                                873                 :                :     else
                                874                 :         179881 :         keysize = IndexTupleSize(base);
                                875                 :                : 
                                876         [ -  + ]:         246742 :     Assert(!BTreeTupleIsPivot(base));
                                877   [ +  -  -  + ]:         246742 :     Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
                                878         [ -  + ]:         246742 :     Assert(keysize == MAXALIGN(keysize));
                                879                 :                : 
                                880                 :                :     /* Determine final size of new tuple */
                                881         [ +  + ]:         246742 :     if (nhtids > 1)
                                882                 :         221219 :         newsize = MAXALIGN(keysize +
                                883                 :                :                            nhtids * sizeof(ItemPointerData));
                                884                 :                :     else
                                885                 :          25523 :         newsize = keysize;
                                886                 :                : 
                                887         [ -  + ]:         246742 :     Assert(newsize <= INDEX_SIZE_MASK);
                                888         [ -  + ]:         246742 :     Assert(newsize == MAXALIGN(newsize));
                                889                 :                : 
                                890                 :                :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
                                891                 :         246742 :     itup = palloc0(newsize);
                                892                 :         246742 :     memcpy(itup, base, keysize);
                                893                 :         246742 :     itup->t_info &= ~INDEX_SIZE_MASK;
                                894                 :         246742 :     itup->t_info |= newsize;
                                895         [ +  + ]:         246742 :     if (nhtids > 1)
                                896                 :                :     {
                                897                 :                :         /* Form posting list tuple */
                                898                 :         221219 :         BTreeTupleSetPosting(itup, nhtids, keysize);
                                899                 :         221219 :         memcpy(BTreeTupleGetPosting(itup), htids,
                                900                 :                :                sizeof(ItemPointerData) * nhtids);
                                901         [ -  + ]:         221219 :         Assert(_bt_posting_valid(itup));
                                902                 :                :     }
                                903                 :                :     else
                                904                 :                :     {
                                905                 :                :         /* Form standard non-pivot tuple */
                                906                 :          25523 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
                                907                 :          25523 :         ItemPointerCopy(htids, &itup->t_tid);
                                908         [ -  + ]:          25523 :         Assert(ItemPointerIsValid(&itup->t_tid));
                                909                 :                :     }
                                910                 :                : 
                                911                 :         246742 :     return itup;
                                912                 :                : }
                                913                 :                : 
                                914                 :                : /*
                                915                 :                :  * Generate a replacement tuple by "updating" a posting list tuple so that it
                                916                 :                :  * no longer has TIDs that need to be deleted.
                                917                 :                :  *
                                918                 :                :  * Used by both VACUUM and index deletion.  Caller's vacposting argument
                                919                 :                :  * points to the existing posting list tuple to be updated.
                                920                 :                :  *
                                921                 :                :  * On return, caller's vacposting argument will point to final "updated"
                                922                 :                :  * tuple, which will be palloc()'d in caller's memory context.
                                923                 :                :  */
                                924                 :                : void
                                925                 :          36842 : _bt_update_posting(BTVacuumPosting vacposting)
                                926                 :                : {
                                927                 :          36842 :     IndexTuple  origtuple = vacposting->itup;
                                928                 :                :     uint32      keysize,
                                929                 :                :                 newsize;
                                930                 :                :     IndexTuple  itup;
                                931                 :                :     int         nhtids;
                                932                 :                :     int         ui,
                                933                 :                :                 d;
                                934                 :                :     ItemPointer htids;
                                935                 :                : 
                                936                 :          36842 :     nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
                                937                 :                : 
                                938         [ -  + ]:          36842 :     Assert(_bt_posting_valid(origtuple));
                                939   [ +  -  -  + ]:          36842 :     Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
                                940                 :                : 
                                941                 :                :     /*
                                942                 :                :      * Determine final size of new tuple.
                                943                 :                :      *
                                944                 :                :      * This calculation needs to match the code used within _bt_form_posting()
                                945                 :                :      * for new posting list tuples.  We avoid calling _bt_form_posting() here
                                946                 :                :      * to save ourselves a second memory allocation for a htids workspace.
                                947                 :                :      */
 2015                           948                 :          36842 :     keysize = BTreeTupleGetPostingOffset(origtuple);
 2019                           949         [ +  + ]:          36842 :     if (nhtids > 1)
                                950                 :           7726 :         newsize = MAXALIGN(keysize +
                                951                 :                :                            nhtids * sizeof(ItemPointerData));
                                952                 :                :     else
                                953                 :          29116 :         newsize = keysize;
                                954                 :                : 
 2014                           955         [ -  + ]:          36842 :     Assert(newsize <= INDEX_SIZE_MASK);
                                956         [ -  + ]:          36842 :     Assert(newsize == MAXALIGN(newsize));
                                957                 :                : 
                                958                 :                :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
 2019                           959                 :          36842 :     itup = palloc0(newsize);
                                960                 :          36842 :     memcpy(itup, origtuple, keysize);
                                961                 :          36842 :     itup->t_info &= ~INDEX_SIZE_MASK;
                                962                 :          36842 :     itup->t_info |= newsize;
                                963                 :                : 
                                964         [ +  + ]:          36842 :     if (nhtids > 1)
                                965                 :                :     {
                                966                 :                :         /* Form posting list tuple */
                                967                 :           7726 :         BTreeTupleSetPosting(itup, nhtids, keysize);
                                968                 :           7726 :         htids = BTreeTupleGetPosting(itup);
                                969                 :                :     }
                                970                 :                :     else
                                971                 :                :     {
                                972                 :                :         /* Form standard non-pivot tuple */
                                973                 :          29116 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
                                974                 :          29116 :         htids = &itup->t_tid;
                                975                 :                :     }
                                976                 :                : 
                                977                 :          36842 :     ui = 0;
                                978                 :          36842 :     d = 0;
                                979         [ +  + ]:         200482 :     for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
                                980                 :                :     {
                                981   [ +  +  +  + ]:         163640 :         if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
                                982                 :                :         {
                                983                 :          79438 :             d++;
                                984                 :          79438 :             continue;
                                985                 :                :         }
                                986                 :          84202 :         htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
                                987                 :                :     }
                                988         [ -  + ]:          36842 :     Assert(ui == nhtids);
                                989         [ -  + ]:          36842 :     Assert(d == vacposting->ndeletedtids);
                                990   [ +  +  -  + ]:          36842 :     Assert(nhtids == 1 || _bt_posting_valid(itup));
 2014                           991   [ +  +  -  + ]:          36842 :     Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
                                992                 :                : 
                                993                 :                :     /* vacposting arg's itup will now point to updated version */
 2019                           994                 :          36842 :     vacposting->itup = itup;
                                995                 :          36842 : }
                                996                 :                : 
                                997                 :                : /*
                                998                 :                :  * Prepare for a posting list split by swapping heap TID in newitem with heap
                                999                 :                :  * TID from original posting list (the 'oposting' heap TID located at offset
                               1000                 :                :  * 'postingoff').  Modifies newitem, so caller should pass their own private
                               1001                 :                :  * copy that can safely be modified.
                               1002                 :                :  *
                               1003                 :                :  * Returns new posting list tuple, which is palloc()'d in caller's context.
                               1004                 :                :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
                               1005                 :                :  * what caller actually inserts. (This happens inside the same critical
                               1006                 :                :  * section that performs an in-place update of old posting list using new
                               1007                 :                :  * posting list returned here.)
                               1008                 :                :  *
                               1009                 :                :  * While the keys from newitem and oposting must be opclass equal, and must
                               1010                 :                :  * generate identical output when run through the underlying type's output
                               1011                 :                :  * function, it doesn't follow that their representations match exactly.
                               1012                 :                :  * Caller must avoid assuming that there can't be representational differences
                               1013                 :                :  * that make datums from oposting bigger or smaller than the corresponding
                               1014                 :                :  * datums from newitem.  For example, differences in TOAST input state might
                               1015                 :                :  * break a faulty assumption about tuple size (the executor is entitled to
                               1016                 :                :  * apply TOAST compression based on its own criteria).  It also seems possible
                               1017                 :                :  * that further representational variation will be introduced in the future,
                               1018                 :                :  * in order to support nbtree features like page-level prefix compression.
                               1019                 :                :  *
                               1020                 :                :  * See nbtree/README for details on the design of posting list splits.
                               1021                 :                :  */
                               1022                 :                : IndexTuple
                               1023                 :          17222 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
                               1024                 :                : {
                               1025                 :                :     int         nhtids;
                               1026                 :                :     char       *replacepos;
                               1027                 :                :     char       *replaceposright;
                               1028                 :                :     Size        nmovebytes;
                               1029                 :                :     IndexTuple  nposting;
                               1030                 :                : 
                               1031                 :          17222 :     nhtids = BTreeTupleGetNPosting(oposting);
                               1032         [ -  + ]:          17222 :     Assert(_bt_posting_valid(oposting));
                               1033                 :                : 
                               1034                 :                :     /*
                               1035                 :                :      * The postingoff argument originated as a _bt_binsrch_posting() return
                               1036                 :                :      * value.  It will be 0 in the event of corruption that makes a leaf page
                               1037                 :                :      * contain a non-pivot tuple that's somehow identical to newitem (no two
                               1038                 :                :      * non-pivot tuples should ever have the same TID).  This has been known
                               1039                 :                :      * to happen in the field from time to time.
                               1040                 :                :      *
                               1041                 :                :      * Perform a basic sanity check to catch this case now.
                               1042                 :                :      */
 1576                          1043   [ +  -  -  + ]:          17222 :     if (!(postingoff > 0 && postingoff < nhtids))
 1576 pg@bowt.ie               1044         [ #  # ]:UBC           0 :         elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
                               1045                 :                :              nhtids, postingoff);
                               1046                 :                : 
                               1047                 :                :     /*
                               1048                 :                :      * Move item pointers in posting list to make a gap for the new item's
                               1049                 :                :      * heap TID.  We shift TIDs one place to the right, losing original
                               1050                 :                :      * rightmost TID. (nmovebytes must not include TIDs to the left of
                               1051                 :                :      * postingoff, nor the existing rightmost/max TID that gets overwritten.)
                               1052                 :                :      */
 2019 pg@bowt.ie               1053                 :CBC       17222 :     nposting = CopyIndexTuple(oposting);
                               1054                 :          17222 :     replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
                               1055                 :          17222 :     replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
                               1056                 :          17222 :     nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
                               1057                 :          17222 :     memmove(replaceposright, replacepos, nmovebytes);
                               1058                 :                : 
                               1059                 :                :     /* Fill the gap at postingoff with TID of new item (original new TID) */
                               1060   [ +  -  -  + ]:          17222 :     Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
                               1061                 :          17222 :     ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
                               1062                 :                : 
                               1063                 :                :     /* Now copy oposting's rightmost/max TID into new item (final new TID) */
                               1064                 :          17222 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
                               1065                 :                : 
                               1066         [ -  + ]:          17222 :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
                               1067                 :                :                               BTreeTupleGetHeapTID(newitem)) < 0);
                               1068         [ -  + ]:          17222 :     Assert(_bt_posting_valid(nposting));
                               1069                 :                : 
                               1070                 :          17222 :     return nposting;
                               1071                 :                : }
                               1072                 :                : 
                               1073                 :                : /*
                               1074                 :                :  * Verify posting list invariants for "posting", which must be a posting list
                               1075                 :                :  * tuple.  Used within assertions.
                               1076                 :                :  */
                               1077                 :                : #ifdef USE_ASSERT_CHECKING
                               1078                 :                : static bool
                               1079                 :         406487 : _bt_posting_valid(IndexTuple posting)
                               1080                 :                : {
                               1081                 :                :     ItemPointerData last;
                               1082                 :                :     ItemPointer htid;
                               1083                 :                : 
                               1084   [ +  -  -  + ]:         406487 :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
 2019 pg@bowt.ie               1085                 :UBC           0 :         return false;
                               1086                 :                : 
                               1087                 :                :     /* Remember first heap TID for loop */
 2019 pg@bowt.ie               1088                 :CBC      406487 :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
                               1089         [ -  + ]:         406487 :     if (!ItemPointerIsValid(&last))
 2019 pg@bowt.ie               1090                 :UBC           0 :         return false;
                               1091                 :                : 
                               1092                 :                :     /* Iterate, starting from second TID */
 2019 pg@bowt.ie               1093         [ +  + ]:CBC     7893245 :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
                               1094                 :                :     {
                               1095                 :        7486758 :         htid = BTreeTupleGetPostingN(posting, i);
                               1096                 :                : 
                               1097         [ -  + ]:        7486758 :         if (!ItemPointerIsValid(htid))
 2019 pg@bowt.ie               1098                 :UBC           0 :             return false;
 2019 pg@bowt.ie               1099         [ -  + ]:CBC     7486758 :         if (ItemPointerCompare(htid, &last) <= 0)
 2019 pg@bowt.ie               1100                 :UBC           0 :             return false;
 2019 pg@bowt.ie               1101                 :CBC     7486758 :         ItemPointerCopy(htid, &last);
                               1102                 :                :     }
                               1103                 :                : 
                               1104                 :         406487 :     return true;
                               1105                 :                : }
                               1106                 :                : #endif
        

Generated by: LCOV version 2.4-beta