Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtutils.c
4 : : * Utility code for Postgres btree implementation.
5 : : *
6 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtutils.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include <time.h>
19 : :
20 : : #include "access/nbtree.h"
21 : : #include "access/reloptions.h"
22 : : #include "access/relscan.h"
23 : : #include "commands/progress.h"
24 : : #include "common/int.h"
25 : : #include "lib/qunique.h"
26 : : #include "miscadmin.h"
27 : : #include "utils/datum.h"
28 : : #include "utils/lsyscache.h"
29 : : #include "utils/rel.h"
30 : :
31 : :
32 : : static int _bt_compare_int(const void *va, const void *vb);
33 : : static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
34 : : IndexTuple firstright, BTScanInsert itup_key);
35 : :
36 : :
37 : : /*
38 : : * _bt_mkscankey
39 : : * Build an insertion scan key that contains comparison data from itup
40 : : * as well as comparator routines appropriate to the key datatypes.
41 : : *
42 : : * The result is intended for use with _bt_compare() and _bt_truncate().
43 : : * Callers that don't need to fill out the insertion scankey arguments
44 : : * (e.g. they use an ad-hoc comparison routine, or only need a scankey
45 : : * for _bt_truncate()) can pass a NULL index tuple. The scankey will
46 : : * be initialized as if an "all truncated" pivot tuple were passed
47 : : * instead.
48 : : *
49 : : * Note that we may occasionally have to share lock the metapage to
50 : : * determine whether or not the keys in the index are expected to be
51 : : * unique (i.e. if this is a "heapkeyspace" index). We assume a
52 : : * heapkeyspace index when caller passes a NULL tuple, allowing index
53 : : * build callers to avoid accessing the non-existent metapage. We
54 : : * also assume that the index is _not_ allequalimage when a NULL tuple
55 : : * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
56 : : * field themselves.
57 : : */
58 : : BTScanInsert
920 pg@bowt.ie 59 :CBC 6095871 : _bt_mkscankey(Relation rel, IndexTuple itup)
60 : : {
61 : : BTScanInsert key;
62 : : ScanKey skey;
63 : : TupleDesc itupdesc;
64 : : int indnkeyatts;
65 : : int16 *indoption;
66 : : int tupnatts;
67 : : int i;
68 : :
9968 bruce@momjian.us 69 : 6095871 : itupdesc = RelationGetDescr(rel);
2810 teodor@sigaev.ru 70 : 6095871 : indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
6916 tgl@sss.pgh.pa.us 71 : 6095871 : indoption = rel->rd_indoption;
2463 pg@bowt.ie 72 [ + + + + ]: 6095871 : tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
73 : :
74 [ - + ]: 6095871 : Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
75 : :
76 : : /*
77 : : * We'll execute the search using a scan key constructed from the key columns.
78 : : * Truncated attributes and non-key attributes are omitted from the final
79 : : * scan key.
80 : : */
81 : 6095871 : key = palloc(offsetof(BTScanInsertData, scankeys) +
82 : 6095871 : sizeof(ScanKeyData) * indnkeyatts);
2120 83 [ + + ]: 6095871 : if (itup)
920 84 : 6022741 : _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
85 : : else
86 : : {
87 : : /* Utility statement callers can set these fields themselves */
2120 88 : 73130 : key->heapkeyspace = true;
89 : 73130 : key->allequalimage = false;
90 : : }
2400 tgl@sss.pgh.pa.us 91 : 6095871 : key->anynullkeys = false; /* initial assumption */
739 pg@bowt.ie 92 : 6095871 : key->nextkey = false; /* usual case, required by btinsert */
93 : 6095871 : key->backward = false; /* usual case, required by btinsert */
2463 94 : 6095871 : key->keysz = Min(indnkeyatts, tupnatts);
95 [ + + ]: 6095871 : key->scantid = key->heapkeyspace && itup ?
96 [ + - ]: 12191742 : BTreeTupleGetHeapTID(itup) : NULL;
97 : 6095871 : skey = key->scankeys;
2810 teodor@sigaev.ru 98 [ + + ]: 16548638 : for (i = 0; i < indnkeyatts; i++)
99 : : {
100 : : FmgrInfo *procinfo;
101 : : Datum arg;
102 : : bool null;
103 : : int flags;
104 : :
105 : : /*
106 : : * We can use the cached (default) support procs since no cross-type
107 : : * comparison can be needed.
108 : : */
8837 tgl@sss.pgh.pa.us 109 : 10452767 : procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
110 : :
111 : : /*
112 : : * Key arguments built from truncated attributes (or when caller
113 : : * provides no tuple) are defensively represented as NULL values. They
114 : : * should never be used.
115 : : */
2463 pg@bowt.ie 116 [ + + ]: 10452767 : if (i < tupnatts)
117 : 10321815 : arg = index_getattr(itup, i + 1, itupdesc, &null);
118 : : else
119 : : {
120 : 130952 : arg = (Datum) 0;
121 : 130952 : null = true;
122 : : }
123 : 10452767 : flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
8837 tgl@sss.pgh.pa.us 124 : 10452767 : ScanKeyEntryInitializeWithInfo(&skey[i],
125 : : flags,
126 : 10452767 : (AttrNumber) (i + 1),
127 : : InvalidStrategy,
128 : : InvalidOid,
5362 129 : 10452767 : rel->rd_indcollation[i],
130 : : procinfo,
131 : : arg);
132 : : /* Record if any key attribute is NULL (or truncated) */
2429 pg@bowt.ie 133 [ + + ]: 10452767 : if (null)
134 : 141276 : key->anynullkeys = true;
135 : : }
136 : :
137 : : /*
138 : : * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
139 : : * that a full uniqueness check is done.
140 : : */
1412 peter@eisentraut.org 141 [ + + ]: 6095871 : if (rel->rd_index->indnullsnotdistinct)
142 : 93 : key->anynullkeys = false;
143 : :
2463 pg@bowt.ie 144 : 6095871 : return key;
145 : : }
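 : :
 : : /*
 : :  * Illustrative sketch of the two caller patterns described above (not
 : :  * verbatim from any caller; see nbtinsert.c and nbtsort.c for the real
 : :  * ones, whose surrounding code differs):
 : :  *
 : :  *     itup_key = _bt_mkscankey(rel, itup);   insertion: reads metapage
 : :  *
 : :  *     itup_key = _bt_mkscankey(rel, NULL);   CREATE INDEX: no metapage
 : :  *     itup_key->allequalimage =              access, so the caller sets
 : :  *         _bt_allequalimage(rel, true);      the field itself
 : :  */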
146 : :
147 : : /*
148 : : * free a retracement stack made by _bt_search.
149 : : */
150 : : void
10752 scrappy@hub.org 151 : 10792349 : _bt_freestack(BTStack stack)
152 : : {
153 : : BTStack ostack;
154 : :
8014 neilc@samurai.com 155 [ + + ]: 19903398 : while (stack != NULL)
156 : : {
10327 bruce@momjian.us 157 : 9111049 : ostack = stack;
158 : 9111049 : stack = stack->bts_parent;
159 : 9111049 : pfree(ostack);
160 : : }
10752 scrappy@hub.org 161 : 10792349 : }
162 : :
163 : : /*
164 : : * qsort comparison function for int arrays
165 : : */
166 : : static int
6 pg@bowt.ie 167 :GNC 260148 : _bt_compare_int(const void *va, const void *vb)
168 : : {
169 : 260148 : int a = *((const int *) va);
170 : 260148 : int b = *((const int *) vb);
171 : :
172 : 260148 : return pg_cmp_s32(a, b);
173 : : }
174 : :
175 : : /*
176 : : * _bt_killitems - set LP_DEAD state for items an indexscan caller has
177 : : * told us were killed
178 : : *
179 : : * scan->opaque, referenced locally through so, contains information about the
180 : : * current page and killed tuples thereon (generally, this should only be
181 : : * called if so->numKilled > 0).
182 : : *
183 : : * Caller should not have a lock on the so->currPos page, but must hold a
184 : : * buffer pin when !so->dropPin. When we return, the page still won't be
185 : : * locked, and the caller continues to hold whatever pins it held before.
186 : : *
187 : : * We match items by heap TID before assuming they are the right ones to set
188 : : * LP_DEAD. If the scan is one that holds a buffer pin on the target page
189 : : * continuously from initially reading the items until applying this function
190 : : * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
191 : : * page, so the page's TIDs can't have been recycled by now. There's no risk
192 : : * that we'll confuse a new index tuple that happens to use a recycled TID
193 : : * with a now-removed tuple with the same TID (that used to be on this same
194 : : * page). We can't rely on that during scans that drop buffer pins eagerly
195 : : * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
196 : : * the page LSN having not changed since back when _bt_readpage saw the page.
197 : : * We totally give up on setting LP_DEAD bits when the page LSN changed.
198 : : *
199 : : * We give up much less often during !so->dropPin scans, but it still happens.
200 : : * We cope with cases where items have moved right due to insertions. If an
201 : : * item has moved off the current page due to a split, we'll fail to find it
202 : : * and just give up on it.
203 : : */
204 : : void
3919 kgrittn@postgresql.o 205 : 89511 : _bt_killitems(IndexScanDesc scan)
206 : : {
193 pg@bowt.ie 207 :CBC 89511 : Relation rel = scan->indexRelation;
7163 tgl@sss.pgh.pa.us 208 : 89511 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 : : Page page;
210 : : BTPageOpaque opaque;
211 : : OffsetNumber minoff;
212 : : OffsetNumber maxoff;
3919 kgrittn@postgresql.o 213 :GNC 89511 : int numKilled = so->numKilled;
7163 tgl@sss.pgh.pa.us 214 : 89511 : bool killedsomething = false;
215 : : Buffer buf;
216 : :
193 pg@bowt.ie 217 [ - + ]: 89511 : Assert(numKilled > 0);
3919 kgrittn@postgresql.o 218 [ - + - - : 89511 : Assert(BTScanPosIsValid(so->currPos));
- + ]
193 pg@bowt.ie 219 [ - + ]: 89511 : Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
220 : :
221 : : /* Always invalidate so->killedItems[] before leaving so->currPos */
3919 kgrittn@postgresql.o 222 : 89511 : so->numKilled = 0;
223 : :
224 : : /*
225 : : * We need to iterate through so->killedItems[] in leaf page order; the
226 : : * loop below expects this (when marking posting list tuples, at least).
227 : : * so->killedItems[] is now in whatever order the scan returned items in.
228 : : * Scrollable cursor scans might have even saved the same item/TID twice.
229 : : *
230 : : * Sort and unique-ify so->killedItems[] to deal with all this.
231 : : */
6 pg@bowt.ie 232 [ + + ]: 89511 : if (numKilled > 1)
233 : : {
234 : 8623 : qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
235 : 8623 : numKilled = qunique(so->killedItems, numKilled, sizeof(int),
236 : : _bt_compare_int);
237 : : }
238 : :
193 239 [ + + ]: 89511 : if (!so->dropPin)
240 : : {
241 : : /*
242 : : * We have held the pin on this page since we read the index tuples,
243 : : * so all we need to do is lock it. The pin will have prevented
244 : : * concurrent VACUUMs from recycling any of the TIDs on the page.
245 : : */
246 [ - + - - : 19595 : Assert(BTScanPosIsPinned(so->currPos));
- + ]
188 247 : 19595 : buf = so->currPos.buf;
248 : 19595 : _bt_lockbuf(rel, buf, BT_READ);
249 : : }
250 : : else
251 : : {
252 : : XLogRecPtr latestlsn;
253 : :
193 254 [ - + - - : 69916 : Assert(!BTScanPosIsPinned(so->currPos));
- + ]
255 [ + - + + : 69916 : Assert(RelationNeedsWAL(rel));
+ - - + ]
256 : 69916 : buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
257 : :
258 : 69916 : latestlsn = BufferGetLSNAtomic(buf);
259 [ - + ]: 69916 : Assert(so->currPos.lsn <= latestlsn);
260 [ + + ]: 69916 : if (so->currPos.lsn != latestlsn)
261 : : {
262 : : /* Modified, give up on hinting */
263 : 73 : _bt_relbuf(rel, buf);
3919 kgrittn@postgresql.o 264 : 73 : return;
265 : : }
266 : :
267 : : /* Unmodified, hinting is safe */
268 : : }
269 : :
188 pg@bowt.ie 270 : 89438 : page = BufferGetPage(buf);
1355 michael@paquier.xyz 271 : 89438 : opaque = BTPageGetOpaque(page);
7163 tgl@sss.pgh.pa.us 272 [ + + ]: 89438 : minoff = P_FIRSTDATAKEY(opaque);
273 : 89438 : maxoff = PageGetMaxOffsetNumber(page);
274 : :
275 : : /* Iterate through so->killedItems[] in leaf page order */
193 pg@bowt.ie 276 [ + + ]: 292458 : for (int i = 0; i < numKilled; i++)
277 : : {
7013 bruce@momjian.us 278 : 203020 : int itemIndex = so->killedItems[i];
279 : 203020 : BTScanPosItem *kitem = &so->currPos.items[itemIndex];
280 : 203020 : OffsetNumber offnum = kitem->indexOffset;
281 : :
7163 tgl@sss.pgh.pa.us 282 [ + - - + ]: 203020 : Assert(itemIndex >= so->currPos.firstItem &&
283 : : itemIndex <= so->currPos.lastItem);
6 pg@bowt.ie 284 [ + + - + ]: 203020 : Assert(i == 0 ||
285 : : offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset);
286 : :
7163 tgl@sss.pgh.pa.us 287 [ - + ]:CBC 203020 : if (offnum < minoff)
7163 tgl@sss.pgh.pa.us 288 :UBC 0 : continue; /* pure paranoia */
7163 tgl@sss.pgh.pa.us 289 [ + + ]:CBC 4224884 : while (offnum <= maxoff)
290 : : {
291 : 4193985 : ItemId iid = PageGetItemId(page, offnum);
292 : 4193985 : IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
2120 pg@bowt.ie 293 : 4193985 : bool killtuple = false;
294 : :
295 [ + + ]: 4193985 : if (BTreeTupleIsPosting(ituple))
296 : : {
297 : 1046682 : int pi = i + 1;
298 : 1046682 : int nposting = BTreeTupleGetNPosting(ituple);
299 : : int j;
300 : :
301 : : /*
302 : : * Note that the page may have been modified in almost any way
303 : : * since we first read it (in the !so->dropPin case), so it's
304 : : * possible that this posting list tuple wasn't a posting list
305 : : * tuple when we first encountered its heap TIDs.
306 : : */
307 [ + + ]: 1073704 : for (j = 0; j < nposting; j++)
308 : : {
309 : 1072893 : ItemPointer item = BTreeTupleGetPostingN(ituple, j);
310 : :
311 [ + + ]: 1072893 : if (!ItemPointerEquals(item, &kitem->heapTid))
312 : 1045871 : break; /* out of posting list loop */
313 : :
314 : : /*
315 : : * kitem must have matching offnum when heap TIDs match,
316 : : * though only in the common case where the page can't
317 : : * have been concurrently modified
318 : : */
193 319 [ - + - - ]: 27022 : Assert(kitem->indexOffset == offnum || !so->dropPin);
320 : :
321 : : /*
322 : : * Read-ahead to later kitems here.
323 : : *
324 : : * We rely on the assumption that not advancing kitem here
325 : : * will prevent us from considering the posting list tuple
326 : : * fully dead by not matching its next heap TID in next
327 : : * loop iteration.
328 : : *
329 : : * If, on the other hand, this is the final heap TID in
330 : : * the posting list tuple, then the tuple gets killed
331 : : * regardless (i.e. we handle the case where the last
332 : : * kitem is also the last heap TID in the last index tuple
333 : : * correctly -- posting tuple still gets killed).
334 : : */
2120 335 [ + + ]: 27022 : if (pi < numKilled)
336 : 9446 : kitem = &so->currPos.items[so->killedItems[pi++]];
337 : : }
338 : :
339 : : /*
340 : : * Don't bother advancing the outermost loop's int iterator to
341 : : * avoid processing killed items that relate to the same
342 : : * offnum/posting list tuple. This micro-optimization hardly
343 : : * seems worth it. (Further iterations of the outermost loop
344 : : * will fail to match on this same posting list's first heap
345 : : * TID instead, so we'll advance to the next offnum/index
346 : : * tuple pretty quickly.)
347 : : */
348 [ + + ]: 1046682 : if (j == nposting)
349 : 811 : killtuple = true;
350 : : }
351 [ + + ]: 3147303 : else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
352 : 171706 : killtuple = true;
353 : :
354 : : /*
355 : : * Mark index item as dead, if it isn't already. Since this
356 : : * happens while holding a buffer lock possibly in shared mode,
357 : : * it's possible that multiple processes attempt to do this
358 : : * simultaneously, leading to multiple full-page images being sent
359 : : * to WAL (if wal_log_hints or data checksums are enabled), which
360 : : * is undesirable.
361 : : */
2041 alvherre@alvh.no-ip. 362 [ + + + + ]: 4193985 : if (killtuple && !ItemIdIsDead(iid))
363 : : {
364 : : /* found the item/all posting list items */
6670 tgl@sss.pgh.pa.us 365 : 172121 : ItemIdMarkDead(iid);
7163 366 : 172121 : killedsomething = true;
367 : 172121 : break; /* out of inner search loop */
368 : : }
369 : 4021864 : offnum = OffsetNumberNext(offnum);
370 : : }
371 : : }
372 : :
373 : : /*
374 : : * Since this can be redone later if needed, mark as dirty hint.
375 : : *
376 : : * Whenever we mark anything LP_DEAD, we also set the page's
377 : : * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
378 : : * only rely on the page-level flag in !heapkeyspace indexes.)
379 : : */
380 [ + + ]: 89438 : if (killedsomething)
381 : : {
7084 382 : 70730 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
188 pg@bowt.ie 383 : 70730 : MarkBufferDirtyHint(buf, true);
384 : : }
385 : :
386 [ + + ]: 89438 : if (!so->dropPin)
387 : 19595 : _bt_unlockbuf(rel, buf);
388 : : else
389 : 69843 : _bt_relbuf(rel, buf);
390 : : }
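 : :
 : : /*
 : :  * The control flow above, reduced to a sketch (hedged summary; posting
 : :  * list handling and assertions omitted):
 : :  *
 : :  *     sort and unique-ify so->killedItems[]
 : :  *     if (!so->dropPin)
 : :  *         lock the still-pinned page        TIDs cannot have been recycled
 : :  *     else if (page LSN changed since _bt_readpage saw the page)
 : :  *         give up                           TIDs may have been recycled
 : :  *     for each killed item, in leaf page order:
 : :  *         match by heap TID, then ItemIdMarkDead()
 : :  *     if anything was marked:
 : :  *         set BTP_HAS_GARBAGE and MarkBufferDirtyHint()
 : :  */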
391 : :
392 : :
393 : : /*
394 : : * The following routines manage a shared-memory area in which we track
395 : : * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
396 : : * operations. There is a single counter which increments each time we
397 : : * start a vacuum to assign it a cycle ID. Since multiple vacuums could
398 : : * be active concurrently, we have to track the cycle ID for each active
399 : : * vacuum; this requires at most MaxBackends entries (usually far fewer).
400 : : * We assume at most one vacuum can be active for a given index.
401 : : *
402 : : * Access to the shared memory area is controlled by BtreeVacuumLock.
403 : : * In principle we could use a separate lmgr locktag for each index,
404 : : * but a single LWLock is much cheaper, and given the short time that
405 : : * the lock is ever held, the concurrency hit should be minimal.
406 : : */
407 : :
408 : : typedef struct BTOneVacInfo
409 : : {
410 : : LockRelId relid; /* global identifier of an index */
411 : : BTCycleId cycleid; /* cycle ID for its active VACUUM */
412 : : } BTOneVacInfo;
413 : :
414 : : typedef struct BTVacInfo
415 : : {
416 : : BTCycleId cycle_ctr; /* cycle ID most recently assigned */
417 : : int num_vacuums; /* number of currently active VACUUMs */
418 : : int max_vacuums; /* allocated length of vacuums[] array */
419 : : BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
420 : : } BTVacInfo;
421 : :
422 : : static BTVacInfo *btvacinfo;
423 : :
424 : :
425 : : /*
426 : : * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
427 : : * or zero if there is no active VACUUM
428 : : *
429 : : * Note: for correct interlocking, the caller must already hold pin and
430 : : * exclusive lock on each buffer it will store the cycle ID into. This
431 : : * ensures that even if a VACUUM starts immediately afterwards, it cannot
432 : : * process those pages until the page split is complete.
433 : : */
434 : : BTCycleId
7162 tgl@sss.pgh.pa.us 435 : 11803 : _bt_vacuum_cycleid(Relation rel)
436 : : {
437 : 11803 : BTCycleId result = 0;
438 : : int i;
439 : :
440 : : /* Share lock is enough since this is a read-only operation */
441 : 11803 : LWLockAcquire(BtreeVacuumLock, LW_SHARED);
442 : :
443 [ + + ]: 11827 : for (i = 0; i < btvacinfo->num_vacuums; i++)
444 : : {
445 : 24 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
446 : :
447 [ - + ]: 24 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
7162 tgl@sss.pgh.pa.us 448 [ # # ]:LBC (2) : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
449 : : {
450 : (2) : result = vac->cycleid;
451 : (2) : break;
452 : : }
453 : : }
454 : :
7162 tgl@sss.pgh.pa.us 455 :CBC 11803 : LWLockRelease(BtreeVacuumLock);
456 : 11803 : return result;
457 : : }
458 : :
459 : : /*
460 : : * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
461 : : *
462 : : * Note: the caller must guarantee that it will eventually call
463 : : * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
464 : : * that this happens even in elog(FATAL) scenarios, the appropriate coding
465 : : * is not just a PG_TRY, but
466 : : * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
467 : : */
468 : : BTCycleId
469 : 1540 : _bt_start_vacuum(Relation rel)
470 : : {
471 : : BTCycleId result;
472 : : int i;
473 : : BTOneVacInfo *vac;
474 : :
475 : 1540 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
476 : :
477 : : /*
478 : : * Assign the next cycle ID, being careful to avoid zero as well as the
479 : : * reserved high values.
480 : : */
6826 481 : 1540 : result = ++(btvacinfo->cycle_ctr);
482 [ + - - + ]: 1540 : if (result == 0 || result > MAX_BT_CYCLE_ID)
6826 tgl@sss.pgh.pa.us 483 :UBC 0 : result = btvacinfo->cycle_ctr = 1;
484 : :
485 : : /* Let's just make sure there's no entry already for this index */
7162 tgl@sss.pgh.pa.us 486 [ + + ]:CBC 1541 : for (i = 0; i < btvacinfo->num_vacuums; i++)
487 : : {
7162 tgl@sss.pgh.pa.us 488 :GBC 1 : vac = &btvacinfo->vacuums[i];
489 [ - + ]: 1 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
7162 tgl@sss.pgh.pa.us 490 [ # # ]:UBC 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
491 : : {
492 : : /*
493 : : * Unlike most places in the backend, we have to explicitly
494 : : * release our LWLock before throwing an error. This is because
495 : : * we expect _bt_end_vacuum() to be called before transaction
496 : : * abort cleanup can run to release LWLocks.
497 : : */
6836 498 : 0 : LWLockRelease(BtreeVacuumLock);
7162 499 [ # # ]: 0 : elog(ERROR, "multiple active vacuums for index \"%s\"",
500 : : RelationGetRelationName(rel));
501 : : }
502 : : }
503 : :
504 : : /* OK, add an entry */
7162 tgl@sss.pgh.pa.us 505 [ - + ]:CBC 1540 : if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
506 : : {
6836 tgl@sss.pgh.pa.us 507 :UBC 0 : LWLockRelease(BtreeVacuumLock);
7162 508 [ # # ]: 0 : elog(ERROR, "out of btvacinfo slots");
509 : : }
7162 tgl@sss.pgh.pa.us 510 :CBC 1540 : vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
511 : 1540 : vac->relid = rel->rd_lockInfo.lockRelId;
512 : 1540 : vac->cycleid = result;
513 : 1540 : btvacinfo->num_vacuums++;
514 : :
515 : 1540 : LWLockRelease(BtreeVacuumLock);
516 : 1540 : return result;
517 : : }
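 : :
 : : /*
 : :  * The cleanup-safe calling convention from the header comment, written
 : :  * out in full (this matches the shape of the VACUUM caller in nbtree.c):
 : :  *
 : :  *     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 : :  *     {
 : :  *         cycleid = _bt_start_vacuum(rel);
 : :  *         ... scan the index, stamping cycleid into pages being split ...
 : :  *     }
 : :  *     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 : :  *     _bt_end_vacuum(rel);
 : :  */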
518 : :
519 : : /*
520 : : * _bt_end_vacuum --- mark a btree VACUUM operation as done
521 : : *
522 : : * Note: this is deliberately coded not to complain if no entry is found;
523 : : * this allows the caller to put PG_TRY around the start_vacuum operation.
524 : : */
525 : : void
526 : 1540 : _bt_end_vacuum(Relation rel)
527 : : {
528 : : int i;
529 : :
530 : 1540 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
531 : :
532 : : /* Find the array entry */
533 [ + - ]: 1541 : for (i = 0; i < btvacinfo->num_vacuums; i++)
534 : : {
535 : 1541 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
536 : :
537 [ + + ]: 1541 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
538 [ + - ]: 1540 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
539 : : {
540 : : /* Remove it by shifting down the last entry */
541 : 1540 : *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
542 : 1540 : btvacinfo->num_vacuums--;
543 : 1540 : break;
544 : : }
545 : : }
546 : :
547 : 1540 : LWLockRelease(BtreeVacuumLock);
548 : 1540 : }
549 : :
550 : : /*
551 : : * _bt_end_vacuum wrapped as an on_shmem_exit callback function
552 : : */
553 : : void
6453 tgl@sss.pgh.pa.us 554 :UBC 0 : _bt_end_vacuum_callback(int code, Datum arg)
555 : : {
556 : 0 : _bt_end_vacuum((Relation) DatumGetPointer(arg));
557 : 0 : }
558 : :
559 : : /*
560 : : * BTreeShmemSize --- report amount of shared memory space needed
561 : : */
562 : : Size
7162 tgl@sss.pgh.pa.us 563 :CBC 3055 : BTreeShmemSize(void)
564 : : {
565 : : Size size;
566 : :
3952 567 : 3055 : size = offsetof(BTVacInfo, vacuums);
1344 rhaas@postgresql.org 568 : 3055 : size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
7162 tgl@sss.pgh.pa.us 569 : 3055 : return size;
570 : : }
571 : :
572 : : /*
573 : : * BTreeShmemInit --- initialize this module's shared memory
574 : : */
575 : : void
576 : 1069 : BTreeShmemInit(void)
577 : : {
578 : : bool found;
579 : :
580 : 1069 : btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
581 : : BTreeShmemSize(),
582 : : &found);
583 : :
584 [ + - ]: 1069 : if (!IsUnderPostmaster)
585 : : {
586 : : /* Initialize shared memory area */
587 [ - + ]: 1069 : Assert(!found);
588 : :
589 : : /*
590 : : * It doesn't really matter what the cycle counter starts at, but
591 : : * having it always start the same doesn't seem good. Seed with
592 : : * low-order bits of time() instead.
593 : : */
594 : 1069 : btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
595 : :
596 : 1069 : btvacinfo->num_vacuums = 0;
1344 rhaas@postgresql.org 597 : 1069 : btvacinfo->max_vacuums = MaxBackends;
598 : : }
599 : : else
7162 tgl@sss.pgh.pa.us 600 [ # # ]:UBC 0 : Assert(found);
7162 tgl@sss.pgh.pa.us 601 :CBC 1069 : }
602 : :
603 : : bytea *
3621 604 : 165 : btoptions(Datum reloptions, bool validate)
605 : : {
606 : : static const relopt_parse_elt tab[] = {
607 : : {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
608 : : {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
609 : : offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
610 : : {"deduplicate_items", RELOPT_TYPE_BOOL,
611 : : offsetof(BTOptions, deduplicate_items)}
612 : : };
613 : :
2213 michael@paquier.xyz 614 : 165 : return (bytea *) build_reloptions(reloptions, validate,
615 : : RELOPT_KIND_BTREE,
616 : : sizeof(BTOptions),
617 : : tab, lengthof(tab));
618 : : }
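 : :
 : : /*
 : :  * The reloptions parsed above are the user-visible B-Tree storage
 : :  * parameters, e.g. (illustrative SQL):
 : :  *
 : :  *     CREATE INDEX idx ON tab (col)
 : :  *         WITH (fillfactor = 70, deduplicate_items = off);
 : :  */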
619 : :
620 : : /*
621 : : * btproperty() -- Check boolean properties of indexes.
622 : : *
623 : : * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
624 : : * to call btcanreturn.
625 : : */
626 : : bool
3412 tgl@sss.pgh.pa.us 627 : 378 : btproperty(Oid index_oid, int attno,
628 : : IndexAMProperty prop, const char *propname,
629 : : bool *res, bool *isnull)
630 : : {
631 [ + + ]: 378 : switch (prop)
632 : : {
633 : 21 : case AMPROP_RETURNABLE:
634 : : /* answer only for columns, not AM or whole index */
635 [ + + ]: 21 : if (attno == 0)
636 : 6 : return false;
637 : : /* otherwise, btree can always return data */
638 : 15 : *res = true;
639 : 15 : return true;
640 : :
641 : 357 : default:
642 : 357 : return false; /* punt to generic code */
643 : : }
644 : : }
645 : :
646 : : /*
647 : : * btbuildphasename() -- Return name of index build phase.
648 : : */
649 : : char *
2450 alvherre@alvh.no-ip. 650 :UBC 0 : btbuildphasename(int64 phasenum)
651 : : {
652 [ # # # # : 0 : switch (phasenum)
# # ]
653 : : {
654 : 0 : case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
655 : 0 : return "initializing";
656 : 0 : case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
657 : 0 : return "scanning table";
658 : 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
659 : 0 : return "sorting live tuples";
660 : 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
661 : 0 : return "sorting dead tuples";
662 : 0 : case PROGRESS_BTREE_PHASE_LEAF_LOAD:
663 : 0 : return "loading tuples in tree";
664 : 0 : default:
665 : 0 : return NULL;
666 : : }
667 : : }
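 : :
 : : /*
 : :  * These phase names are surfaced to users through the CREATE INDEX
 : :  * progress view, e.g. (illustrative SQL; the view prefixes the name):
 : :  *
 : :  *     SELECT phase FROM pg_stat_progress_create_index;
 : :  *            e.g. "building index: scanning table"
 : :  */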
668 : :
669 : : /*
670 : : * _bt_truncate() -- create tuple without unneeded suffix attributes.
671 : : *
672 : : * Returns truncated pivot index tuple allocated in caller's memory context,
673 : : * with key attributes copied from caller's firstright argument. If rel is
674 : : * an INCLUDE index, non-key attributes will definitely be truncated away,
675 : : * since they're not part of the key space. More aggressive suffix
676 : : * truncation can take place when it's clear that the returned tuple does not
677 : : * need one or more suffix key attributes. We only need to keep firstright
678 : : * attributes up to and including the first non-lastleft-equal attribute.
679 : : * Caller's insertion scankey is used to compare the tuples; the scankey's
680 : : * argument values are not considered here.
681 : : *
682 : : * Note that returned tuple's t_tid offset will hold the number of attributes
683 : : * present, so the original item pointer offset is not represented. Caller
684 : : * should only change truncated tuple's downlink. Note also that truncated
685 : : * key attributes are treated as containing "minus infinity" values by
686 : : * _bt_compare().
687 : : *
688 : : * In the worst case (when a heap TID must be appended to distinguish lastleft
689 : : * from firstright), the size of the returned tuple is the size of firstright
690 : : * plus the size of an additional MAXALIGN()'d item pointer. This guarantee
691 : : * is important, since callers need to stay under the 1/3 of a page
692 : : * restriction on tuple size. If this routine is ever taught to truncate
693 : : * within an attribute/datum, it will need to avoid returning an enlarged
694 : : * tuple to caller when truncation + TOAST compression ends up enlarging the
695 : : * final datum.
696 : : */
697 : : IndexTuple
2463 pg@bowt.ie 698 :CBC 31718 : _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
699 : : BTScanInsert itup_key)
700 : : {
701 : 31718 : TupleDesc itupdesc = RelationGetDescr(rel);
702 : 31718 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
703 : : int keepnatts;
704 : : IndexTuple pivot;
705 : : IndexTuple tidpivot;
706 : : ItemPointer pivotheaptid;
707 : : Size newsize;
708 : :
709 : : /*
710 : : * We should only ever truncate non-pivot tuples from leaf pages. It's
711 : : * never okay to truncate when splitting an internal page.
712 : : */
2120 713 [ + - - + ]: 31718 : Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
714 : :
715 : : /* Determine how many attributes must be kept in truncated tuple */
2463 716 : 31718 : keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
717 : :
718 : : #ifdef DEBUG_NO_TRUNCATE
719 : : /* Force truncation to be ineffective for testing purposes */
720 : : keepnatts = nkeyatts + 1;
721 : : #endif
722 : :
2087 723 : 31718 : pivot = index_truncate_tuple(itupdesc, firstright,
724 : : Min(keepnatts, nkeyatts));
725 : :
726 [ + + ]: 31718 : if (BTreeTupleIsPosting(pivot))
727 : : {
728 : : /*
729 : : * index_truncate_tuple() just returns a straight copy of firstright
730 : : * when it has no attributes to truncate. When that happens, we may
731 : : * need to truncate away a posting list here instead.
732 : : */
733 [ + + - + ]: 611 : Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
734 [ - + ]: 611 : Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
735 : 611 : pivot->t_info &= ~INDEX_SIZE_MASK;
736 : 611 : pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
737 : : }
738 : :
739 : : /*
740 : : * If there is a distinguishing key attribute within pivot tuple, we're
741 : : * done
742 : : */
743 [ + + ]: 31718 : if (keepnatts <= nkeyatts)
744 : : {
2079 745 : 31171 : BTreeTupleSetNAtts(pivot, keepnatts, false);
2087 746 : 31171 : return pivot;
747 : : }
748 : :
749 : : /*
750 : : * We have to store a heap TID in the new pivot tuple, since no non-TID
751 : : * key attribute value in firstright distinguishes the right side of the
752 : : * split from the left side. nbtree conceptualizes this case as an
753 : : * inability to truncate away any key attributes, since heap TID is
754 : : * treated as just another key attribute (despite lacking a pg_attribute
755 : : * entry).
756 : : *
757 : : * Use enlarged space that holds a copy of pivot. We need the extra space
758 : : * to store a heap TID at the end (using the special pivot tuple
759 : : * representation). Note that the original pivot already has firstright's
760 : : * possible posting list/non-key attribute values removed at this point.
761 : : */
762 : 547 : newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
763 : 547 : tidpivot = palloc0(newsize);
764 : 547 : memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
765 : : /* Cannot leak memory here */
766 : 547 : pfree(pivot);
767 : :
768 : : /*
769 : : * Store all of firstright's key attribute values plus a tiebreaker heap
770 : : * TID value in enlarged pivot tuple
771 : : */
772 : 547 : tidpivot->t_info &= ~INDEX_SIZE_MASK;
773 : 547 : tidpivot->t_info |= newsize;
2079 774 : 547 : BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
2087 775 : 547 : pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
776 : :
777 : : /*
778 : : * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
779 : : * consider suffix truncation. It seems like a good idea to follow that
780 : : * example in cases where no truncation takes place -- use lastleft's heap
781 : : * TID. (This is also the closest value to negative infinity that's
782 : : * legally usable.)
783 : : */
2120 784 : 547 : ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
785 : :
786 : : /*
787 : : * We're done. Assert() that heap TID invariants hold before returning.
788 : : *
789 : : * Lehman and Yao require that the downlink to the right page, which is to
790 : : * be inserted into the parent page in the second phase of a page split, be
791 : : * a strict lower bound on items on the right page, and a non-strict upper
792 : : * bound for items on the left page. Assert that heap TIDs follow these
793 : : * invariants, since a heap TID value is apparently needed as a
794 : : * tiebreaker.
795 : : */
796 : : #ifndef DEBUG_NO_TRUNCATE
797 [ - + ]: 547 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
798 : : BTreeTupleGetHeapTID(firstright)) < 0);
799 [ - + ]: 547 : Assert(ItemPointerCompare(pivotheaptid,
800 : : BTreeTupleGetHeapTID(lastleft)) >= 0);
801 [ - + ]: 547 : Assert(ItemPointerCompare(pivotheaptid,
802 : : BTreeTupleGetHeapTID(firstright)) < 0);
803 : : #else
804 : :
805 : : /*
806 : : * Those invariants aren't guaranteed to hold for lastleft + firstright
807 : : * heap TID attribute values when they're considered here only because
808 : : * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
809 : : * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
810 : : * TID value that always works as a strict lower bound for items to the
811 : : * right. In particular, it must avoid using firstright's leading key
812 : : * attribute values along with lastleft's heap TID value when lastleft's
813 : : * TID happens to be greater than firstright's TID.
814 : : */
815 : : ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
816 : :
817 : : /*
818 : : * Pivot heap TID should never be fully equal to firstright. Note that
819 : : * the pivot heap TID will still end up equal to lastleft's heap TID when
820 : : * that's the only usable value.
821 : : */
822 : : ItemPointerSetOffsetNumber(pivotheaptid,
823 : : OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
824 : : Assert(ItemPointerCompare(pivotheaptid,
825 : : BTreeTupleGetHeapTID(firstright)) < 0);
826 : : #endif
827 : :
2087 828 : 547 : return tidpivot;
829 : : }
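 : :
 : : /*
 : :  * Worked example of the header comment's size guarantee: on a typical
 : :  * 64-bit platform (MAXALIGN 8), sizeof(ItemPointerData) is 6 bytes, so
 : :  * the appended heap TID tiebreaker costs MAXALIGN(6) = 8 bytes on top
 : :  * of MAXALIGN(IndexTupleSize(pivot)), exactly the newsize computed above.
 : :  */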
830 : :
831 : : /*
832 : : * _bt_keep_natts - how many key attributes to keep when truncating.
833 : : *
834 : : * Caller provides two tuples that enclose a split point. Caller's insertion
835 : : * scankey is used to compare the tuples; the scankey's argument values are
836 : : * not considered here.
837 : : *
838 : : * This can return a number of attributes that is one greater than the
839 : : * number of key attributes for the index relation. This indicates that the
840 : : * caller must use a heap TID as a unique-ifier in new pivot tuple.
841 : : */
842 : : static int
2463 843 : 31718 : _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
844 : : BTScanInsert itup_key)
845 : : {
846 : 31718 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
847 : 31718 : TupleDesc itupdesc = RelationGetDescr(rel);
848 : : int keepnatts;
849 : : ScanKey scankey;
850 : :
851 : : /*
852 : : * _bt_compare() treats truncated key attributes as having the value minus
853 : : * infinity, which would break searches within !heapkeyspace indexes. We
854 : : * must still truncate away non-key attribute values, though.
855 : : */
856 [ - + ]: 31718 : if (!itup_key->heapkeyspace)
2463 pg@bowt.ie 857 :UBC 0 : return nkeyatts;
858 : :
2463 pg@bowt.ie 859 :CBC 31718 : scankey = itup_key->scankeys;
860 : 31718 : keepnatts = 1;
861 [ + + ]: 38161 : for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
862 : : {
863 : : Datum datum1,
864 : : datum2;
865 : : bool isNull1,
866 : : isNull2;
867 : :
868 : 37614 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
869 : 37614 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
870 : :
871 [ - + ]: 37614 : if (isNull1 != isNull2)
872 : 31171 : break;
873 : :
874 [ + + + + ]: 75213 : if (!isNull1 &&
875 : 37599 : DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
876 : : scankey->sk_collation,
877 : : datum1,
878 : : datum2)) != 0)
879 : 31171 : break;
880 : :
881 : 6443 : keepnatts++;
882 : : }
883 : :
884 : : /*
885 : : * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
886 : : * expected in an allequalimage index.
887 : : */
2120 888 [ + + - + ]: 31718 : Assert(!itup_key->allequalimage ||
889 : : keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
890 : :
2463 891 : 31718 : return keepnatts;
892 : : }
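 : :
 : : /*
 : :  * Worked example for a two-column index: lastleft = ('apple', 5) and
 : :  * firstright = ('apple', 7) gives keepnatts = 2 (the second attribute is
 : :  * the first distinguishing one); ('apple', 5) vs ('banana', 5) gives
 : :  * keepnatts = 1; two tuples with fully equal keys give keepnatts = 3,
 : :  * telling caller that a heap TID tiebreaker is required.
 : :  */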
893 : :
894 : : /*
895 : : * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
896 : : *
897 : : * This is exported so that a candidate split point can have its effect on
898 : : * suffix truncation inexpensively evaluated ahead of time when finding a
899 : : * split location. A naive bitwise approach to datum comparisons is used to
900 : : * save cycles.
901 : : *
902 : : * The approach taken here usually provides the same answer as _bt_keep_natts
903 : : * will (for the same pair of tuples from a heapkeyspace index), since the
904 : : * majority of btree opclasses can never indicate that two datums are equal
905 : : * unless they're bitwise equal after detoasting. When an index only has
906 : : * "equal image" columns, routine is guaranteed to give the same result as
907 : : * _bt_keep_natts would.
908 : : *
909 : : * Callers can rely on the fact that attributes considered equal here are
910 : : * definitely also equal according to _bt_keep_natts, even when the index uses
911 : : * an opclass or collation that is not "allequalimage"/deduplication-safe.
912 : : * This weaker guarantee is good enough for the nbtsplitloc.c caller, since
913 : : * false negatives generally only have the effect of making leaf page splits
914 : : * use a more balanced split point.
915 : : */
916 : : int
917 : 6799746 : _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
918 : : {
919 : 6799746 : TupleDesc itupdesc = RelationGetDescr(rel);
920 : 6799746 : int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
921 : : int keepnatts;
922 : :
923 : 6799746 : keepnatts = 1;
924 [ + + ]: 11419872 : for (int attnum = 1; attnum <= keysz; attnum++)
925 : : {
926 : : Datum datum1,
927 : : datum2;
928 : : bool isNull1,
929 : : isNull2;
930 : : CompactAttribute *att;
931 : :
932 : 10226305 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
933 : 10226305 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
361 drowley@postgresql.o 934 : 10226305 : att = TupleDescCompactAttr(itupdesc, attnum - 1);
935 : :
2463 pg@bowt.ie 936 [ + + ]: 10226305 : if (isNull1 != isNull2)
937 : 5606179 : break;
938 : :
939 [ + + ]: 10226203 : if (!isNull1 &&
2226 940 [ + + ]: 10202660 : !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
2463 941 : 5606077 : break;
942 : :
943 : 4620126 : keepnatts++;
944 : : }
945 : :
946 : 6799746 : return keepnatts;
947 : : }
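 : :
 : : /*
 : :  * Example of a false negative: in a numeric column, 5.0 and 5.00 compare
 : :  * as equal, yet their stored representations differ (display scale), so
 : :  * datum_image_eq() reports them unequal and this routine stops earlier
 : :  * than _bt_keep_natts would. Per the header comment, that only biases
 : :  * nbtsplitloc.c's choice of split point; actual truncation always goes
 : :  * through _bt_keep_natts.
 : :  */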
948 : :
949 : : /*
950 : : * _bt_check_natts() -- Verify tuple has expected number of attributes.
951 : : *
952 : : * Returns value indicating if the expected number of attributes were found
953 : : * for a particular offset on page. This can be used as a general purpose
954 : : * sanity check.
955 : : *
956 : : * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
957 : : * preferred to calling here. That's usually more convenient, and is always
958 : : * more explicit. Call here instead when offnum's tuple may be a negative
959 : : * infinity tuple that uses the pre-v11 on-disk representation, or when a low
960 : : * context check is appropriate. This routine is as strict as possible about
961 : : * what is expected on each version of btree.
962 : : */
963 : : bool
964 : 136219296 : _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
965 : : {
2791 tgl@sss.pgh.pa.us 966 : 136219296 : int16 natts = IndexRelationGetNumberOfAttributes(rel);
967 : 136219296 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1355 michael@paquier.xyz 968 : 136219296 : BTPageOpaque opaque = BTPageGetOpaque(page);
969 : : IndexTuple itup;
970 : : int tupnatts;
971 : :
972 : : /*
973 : : * We cannot reliably test a deleted or half-dead page, since they have
974 : : * dummy high keys
975 : : */
2798 teodor@sigaev.ru 976 [ - + ]: 136219296 : if (P_IGNORE(opaque))
2798 teodor@sigaev.ru 977 :UBC 0 : return true;
978 : :
2798 teodor@sigaev.ru 979 [ + - - + ]:CBC 136219296 : Assert(offnum >= FirstOffsetNumber &&
980 : : offnum <= PageGetMaxOffsetNumber(page));
981 : :
982 : 136219296 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2463 pg@bowt.ie 983 [ + + ]: 136219296 : tupnatts = BTreeTupleGetNAtts(itup, rel);
984 : :
985 : : /* !heapkeyspace indexes do not support deduplication */
2120 986 [ - + - - ]: 136219296 : if (!heapkeyspace && BTreeTupleIsPosting(itup))
2120 pg@bowt.ie 987 :UBC 0 : return false;
988 : :
989 : : /* Posting list tuples should never have "pivot heap TID" bit set */
2120 pg@bowt.ie 990 [ + + ]:CBC 136219296 : if (BTreeTupleIsPosting(itup) &&
991 [ - + ]: 1665970 : (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
992 : : BT_PIVOT_HEAP_TID_ATTR) != 0)
2120 pg@bowt.ie 993 :UBC 0 : return false;
994 : :
995 : : /* INCLUDE indexes do not support deduplication */
2120 pg@bowt.ie 996 [ + + - + ]:CBC 136219296 : if (natts != nkeyatts && BTreeTupleIsPosting(itup))
2120 pg@bowt.ie 997 :UBC 0 : return false;
998 : :
2798 teodor@sigaev.ru 999 [ + + ]:CBC 136219296 : if (P_ISLEAF(opaque))
1000 : : {
1001 [ + + + + ]: 100095796 : if (offnum >= P_FIRSTDATAKEY(opaque))
1002 : : {
1003 : : /*
1004 : : * Non-pivot tuple should never be explicitly marked as a pivot
1005 : : * tuple
1006 : : */
2120 pg@bowt.ie 1007 [ - + ]: 92079381 : if (BTreeTupleIsPivot(itup))
2463 pg@bowt.ie 1008 :UBC 0 : return false;
1009 : :
1010 : : /*
1011 : : * Leaf tuples that are not the page high key (non-pivot tuples)
1012 : : * should never be truncated. (Note that tupnatts must have been
1013 : : * inferred, even with a posting list tuple, because only pivot
1014 : : * tuples store tupnatts directly.)
1015 : : */
2463 pg@bowt.ie 1016 :CBC 92079381 : return tupnatts == natts;
1017 : : }
1018 : : else
1019 : : {
1020 : : /*
1021 : : * Rightmost page doesn't contain a page high key, so tuple was
1022 : : * checked above as ordinary leaf tuple
1023 : : */
2798 teodor@sigaev.ru 1024 [ - + ]: 8016415 : Assert(!P_RIGHTMOST(opaque));
1025 : :
1026 : : /*
1027 : : * !heapkeyspace high key tuple contains only key attributes. Note
1028 : : * that tupnatts will only have been explicitly represented in
1029 : : * !heapkeyspace indexes that happen to have non-key attributes.
1030 : : */
2463 pg@bowt.ie 1031 [ - + ]: 8016415 : if (!heapkeyspace)
2463 pg@bowt.ie 1032 :UBC 0 : return tupnatts == nkeyatts;
1033 : :
1034 : : /* Use generic heapkeyspace pivot tuple handling */
1035 : : }
1036 : : }
1037 : : else /* !P_ISLEAF(opaque) */
1038 : : {
2798 teodor@sigaev.ru 1039 [ + + + + ]:CBC 36123500 : if (offnum == P_FIRSTDATAKEY(opaque))
1040 : : {
1041 : : /*
1042 : : * The first tuple on any internal page (possibly the first after
1043 : : * its high key) is its negative infinity tuple. Negative
1044 : : * infinity tuples are always truncated to zero attributes. They
1045 : : * are a particular kind of pivot tuple.
1046 : : */
2463 pg@bowt.ie 1047 [ + - ]: 1568612 : if (heapkeyspace)
1048 : 1568612 : return tupnatts == 0;
1049 : :
1050 : : /*
1051 : : * The number of attributes won't be explicitly represented if the
1052 : : * negative infinity tuple was generated during a page split that
1053 : : * occurred with a version of Postgres before v11. There must be
1054 : : * a problem when there is an explicit representation that is
1055 : : * non-zero, or when there is no explicit representation and the
1056 : : * tuple is evidently not a pre-pg_upgrade tuple.
1057 : : *
1058 : : * Prior to v11, downlinks always had P_HIKEY as their offset.
1059 : : * Accept that as an alternative indication of a valid
1060 : : * !heapkeyspace negative infinity tuple.
1061 : : */
2463 pg@bowt.ie 1062 [ # # # # ]:UBC 0 : return tupnatts == 0 ||
2120 1063 : 0 : ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
1064 : : }
1065 : : else
1066 : : {
1067 : : /*
1068 : : * !heapkeyspace downlink tuple with separator key contains only
1069 : : * key attributes. Note that tupnatts will only have been
1070 : : * explicitly represented in !heapkeyspace indexes that happen to
1071 : : * have non-key attributes.
1072 : : */
2463 pg@bowt.ie 1073 [ - + ]:CBC 34554888 : if (!heapkeyspace)
2463 pg@bowt.ie 1074 :UBC 0 : return tupnatts == nkeyatts;
1075 : :
1076 : : /* Use generic heapkeyspace pivot tuple handling */
1077 : : }
1078 : : }
1079 : :
1080 : : /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2463 pg@bowt.ie 1081 [ - + ]:CBC 42571303 : Assert(heapkeyspace);
1082 : :
1083 : : /*
1084 : : * Explicit representation of the number of attributes is mandatory with
1085 : : * heapkeyspace index pivot tuples, regardless of whether or not there are
1086 : : * non-key attributes.
1087 : : */
2120 1088 [ - + ]: 42571303 : if (!BTreeTupleIsPivot(itup))
2120 pg@bowt.ie 1089 :UBC 0 : return false;
1090 : :
1091 : : /* Pivot tuple should not use posting list representation (redundant) */
2120 pg@bowt.ie 1092 [ - + ]:CBC 42571303 : if (BTreeTupleIsPosting(itup))
2463 pg@bowt.ie 1093 :UBC 0 : return false;
1094 : :
1095 : : /*
1096 : : * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
1097 : : * when any other key attribute is truncated
1098 : : */
2463 pg@bowt.ie 1099 [ + + - + ]:CBC 42571303 : if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2463 pg@bowt.ie 1100 :UBC 0 : return false;
1101 : :
1102 : : /*
1103 : : * Pivot tuple must have at least one untruncated key attribute (minus
1104 : : * infinity pivot tuples are the only exception). Pivot tuples can never
1105 : : * represent that there is a value present for a key attribute that
1106 : : * exceeds pg_index.indnkeyatts for the index.
1107 : : */
2463 pg@bowt.ie 1108 [ + - + - ]:CBC 42571303 : return tupnatts > 0 && tupnatts <= nkeyatts;
1109 : : }
1110 : :
1111 : : /*
1112 : : *
1113 : : * _bt_check_third_page() -- check whether tuple fits on a btree page at all.
1114 : : *
1115 : : * We actually need to be able to fit three items on every page, so restrict
1116 : : * any one item to 1/3 the per-page available space. Note that itemsz should
1117 : : * not include the ItemId overhead.
1118 : : *
1119 : : * It might be useful to apply TOAST methods rather than throw an error here.
1120 : : * Using out-of-line storage would break assumptions made by suffix truncation
1121 : : * and by contrib/amcheck, though.
1122 : : */
1123 : : void
1124 : 132 : _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
1125 : : Page page, IndexTuple newtup)
1126 : : {
1127 : : Size itemsz;
1128 : : BTPageOpaque opaque;
1129 : :
1130 : 132 : itemsz = MAXALIGN(IndexTupleSize(newtup));
1131 : :
1132 : : /* Double check item size against limit */
280 1133 [ - + ]: 132 : if (itemsz <= BTMaxItemSize)
2463 pg@bowt.ie 1134 :UBC 0 : return;
1135 : :
1136 : : /*
1137 : : * Tuple is probably too large to fit on page, but it's possible that the
1138 : : * index uses version 2 or version 3, or that page is an internal page, in
1139 : : * which case a slightly higher limit applies.
1140 : : */
280 pg@bowt.ie 1141 [ + - + - ]:CBC 132 : if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
2463 1142 : 132 : return;
1143 : :
1144 : : /*
1145 : : * Internal page insertions cannot fail here, because that would mean that
1146 : : * an earlier leaf level insertion that should have failed didn't
1147 : : */
1355 michael@paquier.xyz 1148 :UBC 0 : opaque = BTPageGetOpaque(page);
2463 pg@bowt.ie 1149 [ # # ]: 0 : if (!P_ISLEAF(opaque))
1150 [ # # ]: 0 : elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
1151 : : itemsz, RelationGetRelationName(rel));
1152 : :
1153 [ # # # # : 0 : ereport(ERROR,
# # ]
1154 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1155 : : errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
1156 : : itemsz,
1157 : : needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
1158 : : needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid,
1159 : : RelationGetRelationName(rel)),
1160 : : errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
1161 : : ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
1162 : : ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
1163 : : RelationGetRelationName(heap)),
1164 : : errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
1165 : : "Consider a function index of an MD5 hash of the value, "
1166 : : "or use full text indexing."),
1167 : : errtableconstraint(heap, RelationGetRelationName(rel))));
1168 : : }
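 : :
 : : /*
 : :  * Worked arithmetic for the default 8kB block size (illustrative; the
 : :  * authoritative definitions are in nbtree.h): 8192 bytes minus
 : :  * MAXALIGN(page header + 3 line pointers) = 40 and the 16-byte special
 : :  * area leaves 8136 bytes, a third of which is BTMaxItemSizeNoHeapTid =
 : :  * 2712; reserving MAXALIGN(sizeof(ItemPointerData)) = 8 further bytes
 : :  * for the heap TID tiebreaker yields BTMaxItemSize = 2704.
 : :  */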
1169 : :
1170 : : /*
1171 : : * Are all attributes in rel "equality is image equality" attributes?
1172 : : *
1173 : : * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any
1174 : : * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
1175 : : * return false; otherwise we return true.
1176 : : *
1177 : : * Returned boolean value is stored in index metapage during index builds.
1178 : : * Deduplication can only be used when we return true.
1179 : : */
1180 : : bool
2120 pg@bowt.ie 1181 :CBC 29888 : _bt_allequalimage(Relation rel, bool debugmessage)
1182 : : {
1183 : 29888 : bool allequalimage = true;
1184 : :
1185 : : /* INCLUDE indexes can never support deduplication */
1186 : 29888 : if (IndexRelationGetNumberOfAttributes(rel) !=
1187 [ + + ]: 29888 : IndexRelationGetNumberOfKeyAttributes(rel))
1188 : 136 : return false;
1189 : :
1190 [ + + ]: 78306 : for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
1191 : : {
1192 : 48813 : Oid opfamily = rel->rd_opfamily[i];
1193 : 48813 : Oid opcintype = rel->rd_opcintype[i];
1194 : 48813 : Oid collation = rel->rd_indcollation[i];
1195 : : Oid equalimageproc;
1196 : :
1197 : 48813 : equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
1198 : : BTEQUALIMAGE_PROC);
1199 : :
1200 : : /*
1201 : : * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
1202 : : * be unsafe. Otherwise, actually call proc and see what it says.
1203 : : */
1204 [ + + ]: 48813 : if (!OidIsValid(equalimageproc) ||
1205 [ + + ]: 48576 : !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
1206 : : ObjectIdGetDatum(opcintype))))
1207 : : {
1208 : 259 : allequalimage = false;
1209 : 259 : break;
1210 : : }
1211 : : }
1212 : :
1213 [ + + ]: 29752 : if (debugmessage)
1214 : : {
1215 [ + + ]: 25730 : if (allequalimage)
1216 [ + + ]: 25471 : elog(DEBUG1, "index \"%s\" can safely use deduplication",
1217 : : RelationGetRelationName(rel));
1218 : : else
1219 [ - + ]: 259 : elog(DEBUG1, "index \"%s\" cannot use deduplication",
1220 : : RelationGetRelationName(rel));
1221 : : }
1222 : :
1223 : 29752 : return allequalimage;
1224 : : }
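 : :
 : : /*
 : :  * Concrete examples (decided by the opclass infrastructure, not here):
 : :  * indexes on int4, or on text with a deterministic collation, are
 : :  * allequalimage and can use deduplication; an index on numeric is not,
 : :  * since equal values such as 5.0 and 5.00 have different representations,
 : :  * and text under a nondeterministic (e.g. case-insensitive) collation is
 : :  * likewise not.
 : :  */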