Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtdedup.c
4 : : * Deduplicate or bottom-up delete items in Postgres btrees.
5 : : *
6 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtdedup.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #include "postgres.h"
16 : :
17 : : #include "access/nbtree.h"
18 : : #include "access/nbtxlog.h"
19 : : #include "access/tableam.h"
20 : : #include "access/xloginsert.h"
21 : : #include "miscadmin.h"
22 : : #include "utils/rel.h"
23 : :
24 : : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
25 : : TM_IndexDeleteOp *delstate);
26 : : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
27 : : OffsetNumber minoff, IndexTuple newitem);
28 : : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
29 : : Size newitemsz);
30 : : #ifdef USE_ASSERT_CHECKING
31 : : static bool _bt_posting_valid(IndexTuple posting);
32 : : #endif
33 : :
34 : : /*
35 : : * Perform a deduplication pass.
36 : : *
37 : : * The general approach taken here is to perform as much deduplication as
38 : : * possible to free as much space as possible. Note, however, that "single
39 : : * value" strategy is used for !bottomupdedup callers when the page is full of
40 : : * tuples of a single value. Deduplication passes that apply the strategy
41 : : * will leave behind a few untouched tuples at the end of the page, preparing
42 : : * the page for an anticipated page split that uses nbtsplitloc.c's own single
43 : : * value strategy. Our high level goal is to delay merging the untouched
44 : : * tuples until after the page splits.
45 : : *
46 : : * When a call to _bt_bottomupdel_pass() just took place (and failed), our
47 : : * high level goal is to prevent a page split entirely by buying more time.
48 : : * We still hope that a page split can be avoided altogether. That's why
49 : : * single value strategy is not even considered for bottomupdedup callers.
50 : : *
51 : : * The page will have to be split if we cannot successfully free at least
52 : : * newitemsz (we also need space for newitem's line pointer, which isn't
53 : : * included in caller's newitemsz).
54 : : *
55 : : * Note: Caller should have already deleted all existing items with their
56 : : * LP_DEAD bits set.
57 : : */
58 : : void
1062 pg@bowt.ie 59 :CBC 15147 : _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
60 : : bool bottomupdedup)
61 : : {
62 : : OffsetNumber offnum,
63 : : minoff,
64 : : maxoff;
2209 65 : 15147 : Page page = BufferGetPage(buf);
1444 michael@paquier.xyz 66 : 15147 : BTPageOpaque opaque = BTPageGetOpaque(page);
67 : : Page newpage;
68 : : BTDedupState state;
1439 tgl@sss.pgh.pa.us 69 : 15147 : Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
2209 pg@bowt.ie 70 : 15147 : bool singlevalstrat = false;
2178 71 : 15147 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
72 : : XLogRecPtr recptr;
73 : :
74 : : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
2209 75 : 15147 : newitemsz += sizeof(ItemIdData);
76 : :
77 : : /*
78 : : * Initialize deduplication state.
79 : : *
80 : : * It would be possible for maxpostingsize (limit on posting list tuple
81 : : * size) to be set to one third of the page. However, it seems like a
82 : : * good idea to limit the size of posting lists to one sixth of a page.
83 : : * That ought to leave us with a good split point when pages full of
84 : : * duplicates can be split several times.
85 : : */
95 michael@paquier.xyz 86 :GNC 15147 : state = palloc_object(BTDedupStateData);
2209 pg@bowt.ie 87 :CBC 15147 : state->deduplicate = true;
2095 88 : 15147 : state->nmaxitems = 0;
369 89 : 15147 : state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
90 : : /* Metadata about base tuple of current pending posting list */
2209 91 : 15147 : state->base = NULL;
92 : 15147 : state->baseoff = InvalidOffsetNumber;
93 : 15147 : state->basetupsize = 0;
94 : : /* Metadata about current pending posting list TIDs */
95 : 15147 : state->htids = palloc(state->maxpostingsize);
96 : 15147 : state->nhtids = 0;
97 : 15147 : state->nitems = 0;
98 : : /* Size of all physical tuples to be replaced by pending posting list */
99 : 15147 : state->phystupsize = 0;
100 : : /* nintervals should be initialized to zero */
101 : 15147 : state->nintervals = 0;
102 : :
1944 103 [ + + ]: 15147 : minoff = P_FIRSTDATAKEY(opaque);
104 : 15147 : maxoff = PageGetMaxOffsetNumber(page);
105 : :
106 : : /*
107 : : * Consider applying "single value" strategy, though only if the page
108 : : * seems likely to be split in the near future
109 : : */
1636 110 [ + + ]: 15147 : if (!bottomupdedup)
2209 111 : 13383 : singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
112 : :
113 : : /*
114 : : * Deduplicate items from page, and write them to newpage.
115 : : *
116 : : * Copy the original page's LSN into newpage copy. This will become the
117 : : * updated version of the page. We need this because XLogInsert will
118 : : * examine the LSN and possibly dump it in a page image.
119 : : */
120 : 15147 : newpage = PageGetTempPageCopySpecial(page);
121 : 15147 : PageSetLSN(newpage, PageGetLSN(page));
122 : :
123 : : /* Copy high key, if any */
124 [ + + ]: 15147 : if (!P_RIGHTMOST(opaque))
125 : : {
126 : 11788 : ItemId hitemid = PageGetItemId(page, P_HIKEY);
127 : 11788 : Size hitemsz = ItemIdGetLength(hitemid);
128 : 11788 : IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
129 : :
139 peter@eisentraut.org 130 [ - + ]:GNC 11788 : if (PageAddItem(newpage, hitem, hitemsz, P_HIKEY, false, false) == InvalidOffsetNumber)
2209 pg@bowt.ie 131 [ # # ]:UBC 0 : elog(ERROR, "deduplication failed to add highkey");
132 : : }
133 : :
2209 pg@bowt.ie 134 :CBC 15147 : for (offnum = minoff;
135 [ + + ]: 3499807 : offnum <= maxoff;
136 : 3484660 : offnum = OffsetNumberNext(offnum))
137 : : {
138 : 3484660 : ItemId itemid = PageGetItemId(page, offnum);
139 : 3484660 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
140 : :
141 [ - + ]: 3484660 : Assert(!ItemIdIsDead(itemid));
142 : :
143 [ + + ]: 3484660 : if (offnum == minoff)
144 : : {
145 : : /*
146 : : * No previous/base tuple for the data item -- use the data item
147 : : * as base tuple of pending posting list
148 : : */
149 : 15147 : _bt_dedup_start_pending(state, itup, offnum);
150 : : }
151 [ + + + + ]: 6937785 : else if (state->deduplicate &&
2178 152 [ + + ]: 3928248 : _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
2209 153 : 459976 : _bt_dedup_save_htid(state, itup))
154 : : {
155 : : /*
156 : : * Tuple is equal to base tuple of pending posting list. Heap
157 : : * TID(s) for itup have been saved in state.
158 : : */
159 : : }
160 : : else
161 : : {
162 : : /*
163 : : * Tuple is not equal to pending posting list tuple, or
164 : : * _bt_dedup_save_htid() opted to not merge current item into
165 : : * pending posting list for some other reason (e.g., adding more
166 : : * TIDs would have caused posting list to exceed current
167 : : * maxpostingsize).
168 : : *
169 : : * If state contains pending posting list with more than one item,
170 : : * form new posting tuple and add it to our temp page (newpage).
171 : : * Else add pending interval's base tuple to the temp page as-is.
172 : : */
173 : 3016360 : pagesaving += _bt_dedup_finish_pending(newpage, state);
174 : :
175 [ + + ]: 3016360 : if (singlevalstrat)
176 : : {
177 : : /*
178 : : * Single value strategy's extra steps.
179 : : *
180 : : * Lower maxpostingsize for sixth and final large posting list
181 : : * tuple at the point where 5 maxpostingsize-capped tuples
182 : : * have either been formed or observed.
183 : : *
184 : : * When a sixth maxpostingsize-capped item is formed/observed,
185 : : * stop merging together tuples altogether. The few tuples
186 : : * that remain at the end of the page won't be merged together
187 : : * at all (at least not until after a future page split takes
188 : : * place, when this page's newly allocated right sibling page
189 : : * gets its first deduplication pass).
190 : : */
2095 191 [ + + ]: 2611 : if (state->nmaxitems == 5)
2209 192 : 317 : _bt_singleval_fillfactor(page, state, newitemsz);
2095 193 [ + + ]: 2294 : else if (state->nmaxitems == 6)
194 : : {
2209 195 : 98 : state->deduplicate = false;
196 : 98 : singlevalstrat = false; /* won't be back here */
197 : : }
198 : : }
199 : :
200 : : /* itup starts new pending posting list */
201 : 3016360 : _bt_dedup_start_pending(state, itup, offnum);
202 : : }
203 : : }
204 : :
205 : : /* Handle the last item */
206 : 15147 : pagesaving += _bt_dedup_finish_pending(newpage, state);
207 : :
208 : : /*
209 : : * If no items suitable for deduplication were found, newpage must be
210 : : * exactly the same as the original page, so just return from function.
211 : : *
212 : : * We could determine whether or not to proceed on the basis the space
213 : : * savings being sufficient to avoid an immediate page split instead. We
214 : : * don't do that because there is some small value in nbtsplitloc.c always
215 : : * operating against a page that is fully deduplicated (apart from
216 : : * newitem). Besides, most of the cost has already been paid.
217 : : */
218 [ + + ]: 15147 : if (state->nintervals == 0)
219 : : {
220 : : /* cannot leak memory here */
221 : 2869 : pfree(newpage);
222 : 2869 : pfree(state->htids);
223 : 2869 : pfree(state);
224 : 2869 : return;
225 : : }
226 : :
227 : : /*
228 : : * By here, it's clear that deduplication will definitely go ahead.
229 : : *
230 : : * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
231 : : * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
232 : : * But keep things tidy.
233 : : */
234 [ - + ]: 12278 : if (P_HAS_GARBAGE(opaque))
235 : : {
1444 michael@paquier.xyz 236 :UBC 0 : BTPageOpaque nopaque = BTPageGetOpaque(newpage);
237 : :
2209 pg@bowt.ie 238 : 0 : nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
239 : : }
240 : :
2209 pg@bowt.ie 241 :CBC 12278 : START_CRIT_SECTION();
242 : :
243 : 12278 : PageRestoreTempPage(newpage, page);
244 : 12278 : MarkBufferDirty(buf);
245 : :
246 : : /* XLOG stuff */
247 [ + + + + : 12278 : if (RelationNeedsWAL(rel))
+ - + - ]
2209 pg@bowt.ie 248 :GIC 12215 : {
249 : : xl_btree_dedup xlrec_dedup;
250 : :
2209 pg@bowt.ie 251 :CBC 12215 : xlrec_dedup.nintervals = state->nintervals;
252 : :
253 : 12215 : XLogBeginInsert();
254 : 12215 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
397 peter@eisentraut.org 255 : 12215 : XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
256 : :
257 : : /*
258 : : * The intervals array is not in the buffer, but pretend that it is.
259 : : * When XLogInsert stores the whole buffer, the array need not be
260 : : * stored too.
261 : : */
262 : 12215 : XLogRegisterBufData(0, state->intervals,
2209 pg@bowt.ie 263 : 12215 : state->nintervals * sizeof(BTDedupInterval));
264 : :
265 : 12215 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266 : : }
267 : : else
2 pg@bowt.ie 268 :GNC 63 : recptr = XLogGetFakeLSN(rel);
269 : :
270 : 12278 : PageSetLSN(page, recptr);
271 : :
2209 pg@bowt.ie 272 [ - + ]:CBC 12278 : END_CRIT_SECTION();
273 : :
274 : : /* Local space accounting should agree with page accounting */
275 [ + + - + ]: 12278 : Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
276 : :
277 : : /* cannot leak memory here */
278 : 12278 : pfree(state->htids);
279 : 12278 : pfree(state);
280 : : }
281 : :
282 : : /*
283 : : * Perform bottom-up index deletion pass.
284 : : *
285 : : * See if duplicate index tuples (plus certain nearby tuples) are eligible to
286 : : * be deleted via bottom-up index deletion. The high level goal here is to
287 : : * entirely prevent "unnecessary" page splits caused by MVCC version churn
288 : : * from UPDATEs (when the UPDATEs don't logically modify any of the columns
289 : : * covered by the 'rel' index). This is qualitative, not quantitative -- we
290 : : * do not particularly care about once-off opportunities to delete many index
291 : : * tuples together.
292 : : *
293 : : * See nbtree/README for details on the design of nbtree bottom-up deletion.
294 : : * See access/tableam.h for a description of how we're expected to cooperate
295 : : * with the tableam.
296 : : *
297 : : * Returns true on success, in which case caller can assume page split will be
298 : : * avoided for a reasonable amount of time. Returns false when caller should
299 : : * deduplicate the page (if possible at all).
300 : : *
301 : : * Note: Occasionally we return true despite failing to delete enough items to
302 : : * avoid a split. This makes caller skip deduplication and go split the page
303 : : * right away. Our return value is always just advisory information.
304 : : *
305 : : * Note: Caller should have already deleted all existing items with their
306 : : * LP_DEAD bits set.
307 : : */
308 : : bool
1887 309 : 1987 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
310 : : Size newitemsz)
311 : : {
312 : : OffsetNumber offnum,
313 : : minoff,
314 : : maxoff;
315 : 1987 : Page page = BufferGetPage(buf);
1444 michael@paquier.xyz 316 : 1987 : BTPageOpaque opaque = BTPageGetOpaque(page);
317 : : BTDedupState state;
318 : : TM_IndexDeleteOp delstate;
319 : : bool neverdedup;
1887 pg@bowt.ie 320 : 1987 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
321 : :
322 : : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
323 : 1987 : newitemsz += sizeof(ItemIdData);
324 : :
325 : : /* Initialize deduplication state */
95 michael@paquier.xyz 326 :GNC 1987 : state = palloc_object(BTDedupStateData);
1887 pg@bowt.ie 327 :CBC 1987 : state->deduplicate = true;
328 : 1987 : state->nmaxitems = 0;
329 : 1987 : state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
330 : 1987 : state->base = NULL;
331 : 1987 : state->baseoff = InvalidOffsetNumber;
332 : 1987 : state->basetupsize = 0;
333 : 1987 : state->htids = palloc(state->maxpostingsize);
334 : 1987 : state->nhtids = 0;
335 : 1987 : state->nitems = 0;
336 : 1987 : state->phystupsize = 0;
337 : 1987 : state->nintervals = 0;
338 : :
339 : : /*
340 : : * Initialize tableam state that describes bottom-up index deletion
341 : : * operation.
342 : : *
343 : : * We'll go on to ask the tableam to search for TIDs whose index tuples we
344 : : * can safely delete. The tableam will search until our leaf page space
345 : : * target is satisfied, or until the cost of continuing with the tableam
346 : : * operation seems too high. It focuses its efforts on TIDs associated
347 : : * with duplicate index tuples that we mark "promising".
348 : : *
349 : : * This space target is a little arbitrary. The tableam must be able to
350 : : * keep the costs and benefits in balance. We provide the tableam with
351 : : * exhaustive information about what might work, without directly
352 : : * concerning ourselves with avoiding work during the tableam call. Our
353 : : * role in costing the bottom-up deletion process is strictly advisory.
354 : : */
1592 355 : 1987 : delstate.irel = rel;
356 : 1987 : delstate.iblknum = BufferGetBlockNumber(buf);
1887 357 : 1987 : delstate.bottomup = true;
358 : 1987 : delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
359 : 1987 : delstate.ndeltids = 0;
95 michael@paquier.xyz 360 :GNC 1987 : delstate.deltids = palloc_array(TM_IndexDelete, MaxTIDsPerBTreePage);
361 : 1987 : delstate.status = palloc_array(TM_IndexStatus, MaxTIDsPerBTreePage);
362 : :
1887 pg@bowt.ie 363 [ + + ]:CBC 1987 : minoff = P_FIRSTDATAKEY(opaque);
364 : 1987 : maxoff = PageGetMaxOffsetNumber(page);
365 : 1987 : for (offnum = minoff;
366 [ + + ]: 562526 : offnum <= maxoff;
367 : 560539 : offnum = OffsetNumberNext(offnum))
368 : : {
369 : 560539 : ItemId itemid = PageGetItemId(page, offnum);
370 : 560539 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
371 : :
372 [ - + ]: 560539 : Assert(!ItemIdIsDead(itemid));
373 : :
374 [ + + ]: 560539 : if (offnum == minoff)
375 : : {
376 : : /* itup starts first pending interval */
377 : 1987 : _bt_dedup_start_pending(state, itup, offnum);
378 : : }
379 [ + + + - ]: 635696 : else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
380 : 77144 : _bt_dedup_save_htid(state, itup))
381 : : {
382 : : /* Tuple is equal; just added its TIDs to pending interval */
383 : : }
384 : : else
385 : : {
386 : : /* Finalize interval -- move its TIDs to delete state */
387 : 481408 : _bt_bottomupdel_finish_pending(page, state, &delstate);
388 : :
389 : : /* itup starts new pending interval */
390 : 481408 : _bt_dedup_start_pending(state, itup, offnum);
391 : : }
392 : : }
393 : : /* Finalize final interval -- move its TIDs to delete state */
394 : 1987 : _bt_bottomupdel_finish_pending(page, state, &delstate);
395 : :
396 : : /*
397 : : * We don't give up now in the event of having few (or even zero)
398 : : * promising tuples for the tableam because it's not up to us as the index
399 : : * AM to manage costs (note that the tableam might have heuristics of its
400 : : * own that work out what to do). We should at least avoid having our
401 : : * caller do a useless deduplication pass after we return in the event of
402 : : * zero promising tuples, though.
403 : : */
404 : 1987 : neverdedup = false;
405 [ + + ]: 1987 : if (state->nintervals == 0)
406 : 9 : neverdedup = true;
407 : :
408 : 1987 : pfree(state->htids);
409 : 1987 : pfree(state);
410 : :
411 : : /* Ask tableam which TIDs are deletable, then physically delete them */
412 : 1987 : _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
413 : :
414 : 1987 : pfree(delstate.deltids);
415 : 1987 : pfree(delstate.status);
416 : :
417 : : /* Report "success" to caller unconditionally to avoid deduplication */
418 [ + + ]: 1987 : if (neverdedup)
419 : 9 : return true;
420 : :
421 : : /* Don't dedup when we won't end up back here any time soon anyway */
422 : 1978 : return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
423 : : }
424 : :
425 : : /*
426 : : * Create a new pending posting list tuple based on caller's base tuple.
427 : : *
428 : : * Every tuple processed by deduplication either becomes the base tuple for a
429 : : * posting list, or gets its heap TID(s) accepted into a pending posting list.
430 : : * A tuple that starts out as the base tuple for a posting list will only
431 : : * actually be rewritten within _bt_dedup_finish_pending() when it turns out
432 : : * that there are duplicates that can be merged into the base tuple.
433 : : */
434 : : void
2209 435 : 6508632 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
436 : : OffsetNumber baseoff)
437 : : {
438 [ - + ]: 6508632 : Assert(state->nhtids == 0);
439 [ - + ]: 6508632 : Assert(state->nitems == 0);
440 [ - + ]: 6508632 : Assert(!BTreeTupleIsPivot(base));
441 : :
442 : : /*
443 : : * Copy heap TID(s) from new base tuple for new candidate posting list
444 : : * into working state's array
445 : : */
446 [ + + ]: 6508632 : if (!BTreeTupleIsPosting(base))
447 : : {
448 : 5457517 : memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
449 : 5457517 : state->nhtids = 1;
450 : 5457517 : state->basetupsize = IndexTupleSize(base);
451 : : }
452 : : else
453 : : {
454 : : int nposting;
455 : :
456 : 1051115 : nposting = BTreeTupleGetNPosting(base);
457 : 1051115 : memcpy(state->htids, BTreeTupleGetPosting(base),
458 : : sizeof(ItemPointerData) * nposting);
459 : 1051115 : state->nhtids = nposting;
460 : : /* basetupsize should not include existing posting list */
461 : 1051115 : state->basetupsize = BTreeTupleGetPostingOffset(base);
462 : : }
463 : :
464 : : /*
465 : : * Save new base tuple itself -- it'll be needed if we actually create a
466 : : * new posting list from new pending posting list.
467 : : *
468 : : * Must maintain physical size of all existing tuples (including line
469 : : * pointer overhead) so that we can calculate space savings on page.
470 : : */
471 : 6508632 : state->nitems = 1;
472 : 6508632 : state->base = base;
473 : 6508632 : state->baseoff = baseoff;
474 : 6508632 : state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
475 : : /* Also save baseoff in pending state for interval */
476 : 6508632 : state->intervals[state->nintervals].baseoff = state->baseoff;
477 : 6508632 : }
478 : :
479 : : /*
480 : : * Save itup heap TID(s) into pending posting list where possible.
481 : : *
482 : : * Returns bool indicating if the pending posting list managed by state now
483 : : * includes itup's heap TID(s).
484 : : */
485 : : bool
486 : 1422657 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
487 : : {
488 : : int nhtids;
489 : : ItemPointer htids;
490 : : Size mergedtupsz;
491 : :
492 [ - + ]: 1422657 : Assert(!BTreeTupleIsPivot(itup));
493 : :
494 [ + + ]: 1422657 : if (!BTreeTupleIsPosting(itup))
495 : : {
496 : 1415955 : nhtids = 1;
497 : 1415955 : htids = &itup->t_tid;
498 : : }
499 : : else
500 : : {
501 : 6702 : nhtids = BTreeTupleGetNPosting(itup);
502 : 6702 : htids = BTreeTupleGetPosting(itup);
503 : : }
504 : :
505 : : /*
506 : : * Don't append (have caller finish pending posting list as-is) if
507 : : * appending heap TID(s) from itup would put us over maxpostingsize limit.
508 : : *
509 : : * This calculation needs to match the code used within _bt_form_posting()
510 : : * for new posting list tuples.
511 : : */
512 : 1422657 : mergedtupsz = MAXALIGN(state->basetupsize +
513 : : (state->nhtids + nhtids) * sizeof(ItemPointerData));
514 : :
515 [ + + ]: 1422657 : if (mergedtupsz > state->maxpostingsize)
516 : : {
517 : : /*
518 : : * Count this as an oversized item for single value strategy, though
519 : : * only when there are 50 TIDs in the final posting list tuple. This
520 : : * limit (which is fairly arbitrary) avoids confusion about how many
521 : : * 1/6 of a page tuples have been encountered/created by the current
522 : : * deduplication pass.
523 : : *
524 : : * Note: We deliberately don't consider which deduplication pass
525 : : * merged together tuples to create this item (could be a previous
526 : : * deduplication pass, or current pass). See _bt_do_singleval()
527 : : * comments.
528 : : */
2095 529 [ + + ]: 11558 : if (state->nhtids > 50)
530 : 10982 : state->nmaxitems++;
531 : :
2209 532 : 11558 : return false;
533 : : }
534 : :
535 : : /*
536 : : * Save heap TIDs to pending posting list tuple -- itup can be merged into
537 : : * pending posting list
538 : : */
539 : 1411099 : state->nitems++;
540 : 1411099 : memcpy(state->htids + state->nhtids, htids,
541 : : sizeof(ItemPointerData) * nhtids);
542 : 1411099 : state->nhtids += nhtids;
543 : 1411099 : state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
544 : :
545 : 1411099 : return true;
546 : : }
547 : :
548 : : /*
549 : : * Finalize pending posting list tuple, and add it to the page. Final tuple
550 : : * is based on saved base tuple, and saved list of heap TIDs.
551 : : *
552 : : * Returns space saving from deduplicating to make a new posting list tuple.
553 : : * Note that this includes line pointer overhead. This is zero in the case
554 : : * where no deduplication was possible.
555 : : */
556 : : Size
557 : 3492138 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
558 : : {
559 : : OffsetNumber tupoff;
560 : : Size tuplesz;
561 : : Size spacesaving;
562 : :
563 [ - + ]: 3492138 : Assert(state->nitems > 0);
564 [ - + ]: 3492138 : Assert(state->nitems <= state->nhtids);
565 [ - + ]: 3492138 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
566 : :
567 : 3492138 : tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
568 [ + + ]: 3492138 : if (state->nitems == 1)
569 : : {
570 : : /* Use original, unchanged base tuple */
571 : 3262925 : tuplesz = IndexTupleSize(state->base);
1318 572 [ - + ]: 3262925 : Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
369 573 [ - + ]: 3262925 : Assert(tuplesz <= BTMaxItemSize);
139 peter@eisentraut.org 574 [ - + ]:GNC 3262925 : if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
2209 pg@bowt.ie 575 [ # # ]:UBC 0 : elog(ERROR, "deduplication failed to add tuple to page");
576 : :
2209 pg@bowt.ie 577 :CBC 3262925 : spacesaving = 0;
578 : : }
579 : : else
580 : : {
581 : : IndexTuple final;
582 : :
583 : : /* Form a tuple with a posting list */
584 : 229213 : final = _bt_form_posting(state->base, state->htids, state->nhtids);
585 : 229213 : tuplesz = IndexTupleSize(final);
586 [ - + ]: 229213 : Assert(tuplesz <= state->maxpostingsize);
587 : :
588 : : /* Save final number of items for posting list */
589 : 229213 : state->intervals[state->nintervals].nitems = state->nitems;
590 : :
591 [ - + ]: 229213 : Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
369 592 [ - + ]: 229213 : Assert(tuplesz <= BTMaxItemSize);
139 peter@eisentraut.org 593 [ - + ]:GNC 229213 : if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
2209 pg@bowt.ie 594 [ # # ]:UBC 0 : elog(ERROR, "deduplication failed to add tuple to page");
595 : :
2209 pg@bowt.ie 596 :CBC 229213 : pfree(final);
597 : 229213 : spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598 : : /* Increment nintervals, since we wrote a new posting list tuple */
599 : 229213 : state->nintervals++;
600 [ + - - + ]: 229213 : Assert(spacesaving > 0 && spacesaving < BLCKSZ);
601 : : }
602 : :
603 : : /* Reset state for next pending posting list */
604 : 3492138 : state->nhtids = 0;
605 : 3492138 : state->nitems = 0;
606 : 3492138 : state->phystupsize = 0;
607 : :
608 : 3492138 : return spacesaving;
609 : : }
610 : :
611 : : /*
612 : : * Finalize interval during bottom-up index deletion.
613 : : *
614 : : * During a bottom-up pass we expect that TIDs will be recorded in dedup state
615 : : * first, and then get moved over to delstate (in variable-sized batches) by
616 : : * calling here. Call here happens when the number of TIDs in a dedup
617 : : * interval is known, and interval gets finalized (i.e. when caller sees next
618 : : * tuple on the page is not a duplicate, or when caller runs out of tuples to
619 : : * process from leaf page).
620 : : *
621 : : * This is where bottom-up deletion determines and remembers which entries are
622 : : * duplicates. This will be important information to the tableam delete
623 : : * infrastructure later on. Plain index tuple duplicates are marked
624 : : * "promising" here, per tableam contract.
625 : : *
626 : : * Our approach to marking entries whose TIDs come from posting lists is more
627 : : * complicated. Posting lists can only be formed by a deduplication pass (or
628 : : * during an index build), so recent version churn affecting the pointed-to
629 : : * logical rows is not particularly likely. We may still give a weak signal
630 : : * about posting list tuples' entries (by marking just one of its TIDs/entries
631 : : * promising), though this is only a possibility in the event of further
632 : : * duplicate index tuples in final interval that covers posting list tuple (as
633 : : * in the plain tuple case). A weak signal/hint will be useful to the tableam
634 : : * when it has no stronger signal to go with for the deletion operation as a
635 : : * whole.
636 : : *
637 : : * The heuristics we use work well in practice because we only need to give
638 : : * the tableam the right _general_ idea about where to look. Garbage tends to
639 : : * naturally get concentrated in relatively few table blocks with workloads
640 : : * that bottom-up deletion targets. The tableam cannot possibly rank all
641 : : * available table blocks sensibly based on the hints we provide, but that's
642 : : * okay -- only the extremes matter. The tableam just needs to be able to
643 : : * predict which few table blocks will have the most tuples that are safe to
644 : : * delete for each deletion operation, with low variance across related
645 : : * deletion operations.
646 : : */
647 : : static void
1887 648 : 483395 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
649 : : TM_IndexDeleteOp *delstate)
650 : : {
651 : 483395 : bool dupinterval = (state->nitems > 1);
652 : :
653 [ - + ]: 483395 : Assert(state->nitems > 0);
654 [ - + ]: 483395 : Assert(state->nitems <= state->nhtids);
655 [ - + ]: 483395 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
656 : :
657 [ + + ]: 1043934 : for (int i = 0; i < state->nitems; i++)
658 : : {
659 : 560539 : OffsetNumber offnum = state->baseoff + i;
660 : 560539 : ItemId itemid = PageGetItemId(page, offnum);
661 : 560539 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
662 : 560539 : TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
663 : 560539 : TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
664 : :
665 [ + + ]: 560539 : if (!BTreeTupleIsPosting(itup))
666 : : {
667 : : /* Simple case: A plain non-pivot tuple */
668 : 452846 : ideltid->tid = itup->t_tid;
669 : 452846 : ideltid->id = delstate->ndeltids;
670 : 452846 : istatus->idxoffnum = offnum;
671 : 452846 : istatus->knowndeletable = false; /* for now */
672 : 452846 : istatus->promising = dupinterval; /* simple rule */
673 : 452846 : istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
674 : :
675 : 452846 : delstate->ndeltids++;
676 : : }
677 : : else
678 : : {
679 : : /*
680 : : * Complicated case: A posting list tuple.
681 : : *
682 : : * We make the conservative assumption that there can only be at
683 : : * most one affected logical row per posting list tuple. There
684 : : * will be at most one promising entry in deltids to represent
685 : : * this presumed lone logical row. Note that this isn't even
686 : : * considered unless the posting list tuple is also in an interval
687 : : * of duplicates -- this complicated rule is just a variant of the
688 : : * simple rule used to decide if plain index tuples are promising.
689 : : */
690 : 107693 : int nitem = BTreeTupleGetNPosting(itup);
691 : 107693 : bool firstpromising = false;
692 : 107693 : bool lastpromising = false;
693 : :
694 [ - + ]: 107693 : Assert(_bt_posting_valid(itup));
695 : :
696 [ + + ]: 107693 : if (dupinterval)
697 : : {
698 : : /*
699 : : * Complicated rule: either the first or last TID in the
700 : : * posting list gets marked promising (if any at all)
701 : : */
702 : : BlockNumber minblocklist,
703 : : midblocklist,
704 : : maxblocklist;
705 : : ItemPointer mintid,
706 : : midtid,
707 : : maxtid;
708 : :
709 : 10225 : mintid = BTreeTupleGetHeapTID(itup);
710 : 10225 : midtid = BTreeTupleGetPostingN(itup, nitem / 2);
711 : 10225 : maxtid = BTreeTupleGetMaxHeapTID(itup);
712 : 10225 : minblocklist = ItemPointerGetBlockNumber(mintid);
713 : 10225 : midblocklist = ItemPointerGetBlockNumber(midtid);
714 : 10225 : maxblocklist = ItemPointerGetBlockNumber(maxtid);
715 : :
716 : : /* Only entry with predominant table block can be promising */
717 : 10225 : firstpromising = (minblocklist == midblocklist);
718 [ + + + + ]: 10225 : lastpromising = (!firstpromising &&
719 : : midblocklist == maxblocklist);
720 : : }
721 : :
722 [ + + ]: 602944 : for (int p = 0; p < nitem; p++)
723 : : {
724 : 495251 : ItemPointer htid = BTreeTupleGetPostingN(itup, p);
725 : :
726 : 495251 : ideltid->tid = *htid;
727 : 495251 : ideltid->id = delstate->ndeltids;
728 : 495251 : istatus->idxoffnum = offnum;
729 : 495251 : istatus->knowndeletable = false; /* for now */
730 : 495251 : istatus->promising = false;
731 [ + + + + : 495251 : if ((firstpromising && p == 0) ||
+ + ]
732 [ + + ]: 64224 : (lastpromising && p == nitem - 1))
733 : 6969 : istatus->promising = true;
734 : 495251 : istatus->freespace = sizeof(ItemPointerData); /* at worst */
735 : :
736 : 495251 : ideltid++;
737 : 495251 : istatus++;
738 : 495251 : delstate->ndeltids++;
739 : : }
740 : : }
741 : : }
742 : :
743 [ + + ]: 483395 : if (dupinterval)
744 : : {
745 : 51276 : state->intervals[state->nintervals].nitems = state->nitems;
746 : 51276 : state->nintervals++;
747 : : }
748 : :
749 : : /* Reset state for next interval */
750 : 483395 : state->nhtids = 0;
751 : 483395 : state->nitems = 0;
752 : 483395 : state->phystupsize = 0;
753 : 483395 : }
754 : :
755 : : /*
756 : : * Determine if page non-pivot tuples (data items) are all duplicates of the
757 : : * same value -- if they are, deduplication's "single value" strategy should
758 : : * be applied. The general goal of this strategy is to ensure that
759 : : * nbtsplitloc.c (which uses its own single value strategy) will find a useful
760 : : * split point as further duplicates are inserted, and successive rightmost
761 : : * page splits occur among pages that store the same duplicate value. When
762 : : * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
763 : : * just like it would if deduplication were disabled.
764 : : *
765 : : * We expect that affected workloads will require _several_ single value
766 : : * strategy deduplication passes (over a page that only stores duplicates)
767 : : * before the page is finally split. The first deduplication pass should only
768 : : * find regular non-pivot tuples. Later deduplication passes will find
769 : : * existing maxpostingsize-capped posting list tuples, which must be skipped
770 : : * over. The penultimate pass is generally the first pass that actually
771 : : * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
772 : : * few untouched non-pivot tuples. The final deduplication pass won't free
773 : : * any space -- it will skip over everything without merging anything (it
774 : : * retraces the steps of the penultimate pass).
775 : : *
776 : : * Fortunately, having several passes isn't too expensive. Each pass (after
777 : : * the first pass) won't spend many cycles on the large posting list tuples
778 : : * left by previous passes. Each pass will find a large contiguous group of
779 : : * smaller duplicate tuples to merge together at the end of the page.
780 : : */
781 : : static bool
2209 782 : 13383 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
783 : : OffsetNumber minoff, IndexTuple newitem)
784 : : {
2178 785 : 13383 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
786 : : ItemId itemid;
787 : : IndexTuple itup;
788 : :
2209 789 : 13383 : itemid = PageGetItemId(page, minoff);
790 : 13383 : itup = (IndexTuple) PageGetItem(page, itemid);
791 : :
2178 792 [ + + ]: 13383 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
793 : : {
2209 794 : 1125 : itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
795 : 1125 : itup = (IndexTuple) PageGetItem(page, itemid);
796 : :
2178 797 [ + + ]: 1125 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
2209 798 : 667 : return true;
799 : : }
800 : :
801 : 12716 : return false;
802 : : }
803 : :
804 : : /*
805 : : * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
806 : : * and final maxpostingsize-capped tuple. The sixth and final posting list
807 : : * tuple will end up somewhat smaller than the first five. (Note: The first
808 : : * five tuples could actually just be very large duplicate tuples that
809 : : * couldn't be merged together at all. Deduplication will simply not modify
810 : : * the page when that happens.)
811 : : *
812 : : * When there are six posting lists on the page (after current deduplication
813 : : * pass goes on to create/observe a sixth very large tuple), caller should end
814 : : * its deduplication pass. It isn't useful to try to deduplicate items that
815 : : * are supposed to end up on the new right sibling page following the
816 : : * anticipated page split. A future deduplication pass of future right
817 : : * sibling page might take care of it. (This is why the first single value
818 : : * strategy deduplication pass for a given leaf page will generally find only
819 : : * plain non-pivot tuples -- see _bt_do_singleval() comments.)
820 : : */
821 : : static void
822 : 317 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
823 : : {
824 : : Size leftfree;
825 : : int reduction;
826 : :
827 : : /* This calculation needs to match nbtsplitloc.c */
828 : 317 : leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
829 : : MAXALIGN(sizeof(BTPageOpaqueData));
830 : : /* Subtract size of new high key (includes pivot heap TID space) */
831 : 317 : leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
832 : :
833 : : /*
834 : : * Reduce maxpostingsize by an amount equal to target free space on left
835 : : * half of page
836 : : */
837 : 317 : reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
838 [ + - ]: 317 : if (state->maxpostingsize > reduction)
839 : 317 : state->maxpostingsize -= reduction;
840 : : else
2209 pg@bowt.ie 841 :UBC 0 : state->maxpostingsize = 0;
2209 pg@bowt.ie 842 :CBC 317 : }
843 : :
844 : : /*
845 : : * Build a posting list tuple based on caller's "base" index tuple and list of
846 : : * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
847 : : * posting list. (Posting list tuples can never have a single heap TID, partly
848 : : * because that ensures that deduplication always reduces final MAXALIGN()'d
849 : : * size of entire tuple.)
850 : : *
851 : : * Convention is that posting list starts at a MAXALIGN()'d offset (rather
852 : : * than a SHORTALIGN()'d offset), in line with the approach taken when
853 : : * appending a heap TID to new pivot tuple/high key during suffix truncation.
854 : : * This sometimes wastes a little space that was only needed as alignment
855 : : * padding in the original tuple. Following this convention simplifies the
856 : : * space accounting used when deduplicating a page (the same convention
857 : : * simplifies the accounting for choosing a point to split a page at).
858 : : *
859 : : * Note: Caller's "htids" array must be unique and already in ascending TID
860 : : * order. Any existing heap TIDs from "base" won't automatically appear in
861 : : * returned posting list tuple (they must be included in htids array.)
862 : : */
863 : : IndexTuple
136 peter@eisentraut.org 864 :GNC 277411 : _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
865 : : {
866 : : uint32 keysize,
867 : : newsize;
868 : : IndexTuple itup;
869 : :
2209 pg@bowt.ie 870 [ + + ]:CBC 277411 : if (BTreeTupleIsPosting(base))
871 : 71378 : keysize = BTreeTupleGetPostingOffset(base);
872 : : else
873 : 206033 : keysize = IndexTupleSize(base);
874 : :
875 [ - + ]: 277411 : Assert(!BTreeTupleIsPivot(base));
876 [ + - - + ]: 277411 : Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877 [ - + ]: 277411 : Assert(keysize == MAXALIGN(keysize));
878 : :
879 : : /* Determine final size of new tuple */
880 [ + + ]: 277411 : if (nhtids > 1)
881 : 250295 : newsize = MAXALIGN(keysize +
882 : : nhtids * sizeof(ItemPointerData));
883 : : else
884 : 27116 : newsize = keysize;
885 : :
886 [ - + ]: 277411 : Assert(newsize <= INDEX_SIZE_MASK);
887 [ - + ]: 277411 : Assert(newsize == MAXALIGN(newsize));
888 : :
889 : : /* Allocate memory using palloc0() (matches index_form_tuple()) */
890 : 277411 : itup = palloc0(newsize);
891 : 277411 : memcpy(itup, base, keysize);
892 : 277411 : itup->t_info &= ~INDEX_SIZE_MASK;
893 : 277411 : itup->t_info |= newsize;
894 [ + + ]: 277411 : if (nhtids > 1)
895 : : {
896 : : /* Form posting list tuple */
897 : 250295 : BTreeTupleSetPosting(itup, nhtids, keysize);
898 : 250295 : memcpy(BTreeTupleGetPosting(itup), htids,
899 : : sizeof(ItemPointerData) * nhtids);
900 [ - + ]: 250295 : Assert(_bt_posting_valid(itup));
901 : : }
902 : : else
903 : : {
904 : : /* Form standard non-pivot tuple */
905 : 27116 : itup->t_info &= ~INDEX_ALT_TID_MASK;
906 : 27116 : ItemPointerCopy(htids, &itup->t_tid);
907 [ - + ]: 27116 : Assert(ItemPointerIsValid(&itup->t_tid));
908 : : }
909 : :
910 : 277411 : return itup;
911 : : }
912 : :
913 : : /*
914 : : * Generate a replacement tuple by "updating" a posting list tuple so that it
915 : : * no longer has TIDs that need to be deleted.
916 : : *
917 : : * Used by both VACUUM and index deletion. Caller's vacposting argument
918 : : * points to the existing posting list tuple to be updated.
919 : : *
920 : : * On return, caller's vacposting argument will point to final "updated"
921 : : * tuple, which will be palloc()'d in caller's memory context.
922 : : */
void
_bt_update_posting(BTVacuumPosting vacposting)
{
	IndexTuple	origtuple = vacposting->itup;
	uint32		keysize,
				newsize;
	IndexTuple	itup;
	int			nhtids;
	int			ui,
				d;
	ItemPointer htids;

	/* Number of heap TIDs that survive the deletion */
	nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;

	Assert(_bt_posting_valid(origtuple));
	/* At least one TID survives, and at least one is deleted */
	Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));

	/*
	 * Determine final size of new tuple.
	 *
	 * This calculation needs to match the code used within _bt_form_posting()
	 * for new posting list tuples.  We avoid calling _bt_form_posting() here
	 * to save ourselves a second memory allocation for a htids workspace.
	 */
	keysize = BTreeTupleGetPostingOffset(origtuple);
	if (nhtids > 1)
		newsize = MAXALIGN(keysize +
						   nhtids * sizeof(ItemPointerData));
	else
		newsize = keysize;

	Assert(newsize <= INDEX_SIZE_MASK);
	Assert(newsize == MAXALIGN(newsize));

	/* Allocate memory using palloc0() (matches index_form_tuple()) */
	itup = palloc0(newsize);
	memcpy(itup, origtuple, keysize);
	itup->t_info &= ~INDEX_SIZE_MASK;
	itup->t_info |= newsize;

	if (nhtids > 1)
	{
		/* Form posting list tuple */
		BTreeTupleSetPosting(itup, nhtids, keysize);
		htids = BTreeTupleGetPosting(itup);
	}
	else
	{
		/* Form standard non-pivot tuple -- its one TID lives in t_tid */
		itup->t_info &= ~INDEX_ALT_TID_MASK;
		htids = &itup->t_tid;
	}

	/*
	 * Copy surviving TIDs from origtuple, skipping the posting list offsets
	 * recorded (in ascending order) in caller's deletetids[] array
	 */
	ui = 0;
	d = 0;
	for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
	{
		if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
		{
			d++;
			continue;
		}
		htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
	}
	Assert(ui == nhtids);
	Assert(d == vacposting->ndeletedtids);
	/* New tuple must itself be a valid posting list (or plain) tuple */
	Assert(nhtids == 1 || _bt_posting_valid(itup));
	Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));

	/* vacposting arg's itup will now point to updated version */
	vacposting->itup = itup;
}
995 : :
996 : : /*
997 : : * Prepare for a posting list split by swapping heap TID in newitem with heap
998 : : * TID from original posting list (the 'oposting' heap TID located at offset
999 : : * 'postingoff'). Modifies newitem, so caller should pass their own private
1000 : : * copy that can safely be modified.
1001 : : *
1002 : : * Returns new posting list tuple, which is palloc()'d in caller's context.
1003 : : * This is guaranteed to be the same size as 'oposting'. Modified newitem is
1004 : : * what caller actually inserts. (This happens inside the same critical
1005 : : * section that performs an in-place update of old posting list using new
1006 : : * posting list returned here.)
1007 : : *
1008 : : * While the keys from newitem and oposting must be opclass equal, and must
1009 : : * generate identical output when run through the underlying type's output
1010 : : * function, it doesn't follow that their representations match exactly.
1011 : : * Caller must avoid assuming that there can't be representational differences
1012 : : * that make datums from oposting bigger or smaller than the corresponding
1013 : : * datums from newitem. For example, differences in TOAST input state might
1014 : : * break a faulty assumption about tuple size (the executor is entitled to
1015 : : * apply TOAST compression based on its own criteria). It also seems possible
1016 : : * that further representational variation will be introduced in the future,
1017 : : * in order to support nbtree features like page-level prefix compression.
1018 : : *
1019 : : * See nbtree/README for details on the design of posting list splits.
1020 : : */
IndexTuple
_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
{
	int			nhtids;
	char	   *replacepos;
	char	   *replaceposright;
	Size		nmovebytes;
	IndexTuple	nposting;

	nhtids = BTreeTupleGetNPosting(oposting);
	Assert(_bt_posting_valid(oposting));

	/*
	 * The postingoff argument originated as a _bt_binsrch_posting() return
	 * value.  It will be 0 in the event of corruption that makes a leaf page
	 * contain a non-pivot tuple that's somehow identical to newitem (no two
	 * non-pivot tuples should ever have the same TID).  This has been known
	 * to happen in the field from time to time.
	 *
	 * Perform a basic sanity check to catch this case now.
	 */
	if (!(postingoff > 0 && postingoff < nhtids))
		elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
			 nhtids, postingoff);

	/*
	 * Move item pointers in posting list to make a gap for the new item's
	 * heap TID.  We shift TIDs one place to the right, losing original
	 * rightmost TID. (nmovebytes must not include TIDs to the left of
	 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
	 */
	nposting = CopyIndexTuple(oposting);
	replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
	replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
	nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
	/* memmove, not memcpy: source and destination ranges overlap */
	memmove(replaceposright, replacepos, nmovebytes);

	/* Fill the gap at postingoff with TID of new item (original new TID) */
	Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
	ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);

	/* Now copy oposting's rightmost/max TID into new item (final new TID) */
	ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);

	/* All of nposting's TIDs must now sort before (modified) newitem's TID */
	Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
							  BTreeTupleGetHeapTID(newitem)) < 0);
	Assert(_bt_posting_valid(nposting));

	return nposting;
}
1071 : :
1072 : : /*
1073 : : * Verify posting list invariants for "posting", which must be a posting list
1074 : : * tuple. Used within assertions.
1075 : : */
1076 : : #ifdef USE_ASSERT_CHECKING
1077 : : static bool
1078 : 436901 : _bt_posting_valid(IndexTuple posting)
1079 : : {
1080 : : ItemPointerData last;
1081 : : ItemPointer htid;
1082 : :
1083 [ + - - + ]: 436901 : if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
2209 pg@bowt.ie 1084 :UBC 0 : return false;
1085 : :
1086 : : /* Remember first heap TID for loop */
2209 pg@bowt.ie 1087 :CBC 436901 : ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
1088 [ - + ]: 436901 : if (!ItemPointerIsValid(&last))
2209 pg@bowt.ie 1089 :UBC 0 : return false;
1090 : :
1091 : : /* Iterate, starting from second TID */
2209 pg@bowt.ie 1092 [ + + ]:CBC 9561858 : for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
1093 : : {
1094 : 9124957 : htid = BTreeTupleGetPostingN(posting, i);
1095 : :
1096 [ - + ]: 9124957 : if (!ItemPointerIsValid(htid))
2209 pg@bowt.ie 1097 :UBC 0 : return false;
2209 pg@bowt.ie 1098 [ - + ]:CBC 9124957 : if (ItemPointerCompare(htid, &last) <= 0)
2209 pg@bowt.ie 1099 :UBC 0 : return false;
2209 pg@bowt.ie 1100 :CBC 9124957 : ItemPointerCopy(htid, &last);
1101 : : }
1102 : :
1103 : 436901 : return true;
1104 : : }
1105 : : #endif
|