Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtsearch.c
4 : : * Search code for postgres btrees.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtsearch.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include "access/nbtree.h"
19 : : #include "access/relscan.h"
20 : : #include "access/xact.h"
21 : : #include "miscadmin.h"
22 : : #include "pgstat.h"
23 : : #include "storage/predicate.h"
24 : : #include "utils/lsyscache.h"
25 : : #include "utils/rel.h"
26 : :
27 : :
28 : : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
29 : : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
30 : : Buffer buf, bool forupdate, BTStack stack,
31 : : int access);
32 : : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
33 : : static int _bt_binsrch_posting(BTScanInsert key, Page page,
34 : : OffsetNumber offnum);
35 : : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
36 : : OffsetNumber offnum, bool firstpage);
37 : : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
38 : : OffsetNumber offnum, IndexTuple itup);
39 : : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
40 : : OffsetNumber offnum, ItemPointer heapTid,
41 : : IndexTuple itup);
42 : : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
43 : : OffsetNumber offnum,
44 : : ItemPointer heapTid, int tupleOffset);
45 : : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
46 : : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
47 : : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
48 : : ScanDirection dir);
49 : : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
50 : : BlockNumber lastcurrblkno, ScanDirection dir,
51 : : bool seized);
52 : : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
53 : : BlockNumber lastcurrblkno);
54 : : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
55 : :
56 : :
57 : : /*
58 : : * _bt_drop_lock_and_maybe_pin()
59 : : *
60 : : * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
61 : : * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
62 : : */
63 : : static inline void
92 pg@bowt.ie 64 :CBC 4779385 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
65 : : {
66 [ + + ]: 4779385 : if (!so->dropPin)
67 : : {
68 : : /* Just drop the lock (not the pin) */
69 : 245415 : _bt_unlockbuf(rel, so->currPos.buf);
70 : 245415 : return;
71 : : }
72 : :
73 : : /*
74 : : * Drop both the lock and the pin.
75 : : *
76 : : * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
77 : : * when concurrent heap TID recycling by VACUUM might have taken place.
78 : : */
79 [ + - + + : 4533970 : Assert(RelationNeedsWAL(rel));
+ - - + ]
80 : 4533970 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
81 : 4533970 : _bt_relbuf(rel, so->currPos.buf);
82 : 4533970 : so->currPos.buf = InvalidBuffer;
83 : : }
84 : :
85 : : /*
86 : : * _bt_search() -- Search the tree for a particular scankey,
87 : : * or more precisely for the first leaf page it could be on.
88 : : *
89 : : * The passed scankey is an insertion-type scankey (see nbtree/README),
90 : : * but it can omit the rightmost column(s) of the index.
91 : : *
92 : : * Return value is a stack of parent-page pointers (i.e. there is no entry for
93 : : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
94 : : * which is locked and pinned. No locks are held on the parent pages,
95 : : * however!
96 : : *
97 : : * The returned buffer is locked according to access parameter. Additionally,
98 : : * access = BT_WRITE will allow an empty root page to be created and returned.
99 : : * When access = BT_READ, an empty index will result in *bufP being set to
100 : : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
101 : : * during the search will be finished.
102 : : *
103 : : * heaprel must be provided by callers that pass access = BT_WRITE, since we
104 : : * might need to allocate a new root page for caller -- see _bt_allocbuf.
105 : : */
106 : : BTStack
889 andres@anarazel.de 107 : 10781608 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
108 : : int access)
109 : : {
8934 bruce@momjian.us 110 : 10781608 : BTStack stack_in = NULL;
2597 akorotkov@postgresql 111 : 10781608 : int page_access = BT_READ;
112 : :
113 : : /* heaprel must be set whenever _bt_allocbuf is reachable */
819 pg@bowt.ie 114 [ + + - + ]: 10781608 : Assert(access == BT_READ || access == BT_WRITE);
115 [ + + - + ]: 10781608 : Assert(access == BT_READ || heaprel != NULL);
116 : :
117 : : /* Get the root page to start with */
889 andres@anarazel.de 118 : 10781608 : *bufP = _bt_getroot(rel, heaprel, access);
119 : :
120 : : /* If index is empty and access = BT_READ, no root page is created. */
8934 bruce@momjian.us 121 [ + + ]: 10781608 : if (!BufferIsValid(*bufP))
9178 tgl@sss.pgh.pa.us 122 : 252625 : return (BTStack) NULL;
123 : :
124 : : /* Loop iterates once per level descended in the tree */
125 : : for (;;)
126 : 8726379 : {
127 : : Page page;
128 : : BTPageOpaque opaque;
129 : : OffsetNumber offnum;
130 : : ItemId itemid;
131 : : IndexTuple itup;
132 : : BlockNumber child;
133 : : BTStack new_stack;
134 : :
135 : : /*
136 : : * Race -- the page we just grabbed may have split since we read its
137 : : * downlink in its parent page (or the metapage). If it has, we may
138 : : * need to move right to its new sibling. Do that.
139 : : *
140 : : * In write-mode, allow _bt_moveright to finish any incomplete splits
141 : : * along the way. Strictly speaking, we'd only need to finish an
142 : : * incomplete split on the leaf page we're about to insert to, not on
143 : : * any of the upper levels (internal pages with incomplete splits are
144 : : * also taken care of in _bt_getstackbuf). But this is a good
145 : : * opportunity to finish splits of internal pages too.
146 : : */
889 andres@anarazel.de 147 : 19255362 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
148 : : stack_in, page_access);
149 : :
150 : : /* if this is a leaf page, we're done */
3426 kgrittn@postgresql.o 151 : 19255362 : page = BufferGetPage(*bufP);
1254 michael@paquier.xyz 152 : 19255362 : opaque = BTPageGetOpaque(page);
9178 tgl@sss.pgh.pa.us 153 [ + + ]: 19255362 : if (P_ISLEAF(opaque))
154 : 10528983 : break;
155 : :
156 : : /*
157 : : * Find the appropriate pivot tuple on this page. Its downlink points
158 : : * to the child page that we're about to descend to.
159 : : */
2362 pg@bowt.ie 160 : 8726379 : offnum = _bt_binsrch(rel, key, *bufP);
9178 tgl@sss.pgh.pa.us 161 : 8726379 : itemid = PageGetItemId(page, offnum);
7164 162 : 8726379 : itup = (IndexTuple) PageGetItem(page, itemid);
2019 pg@bowt.ie 163 [ - + - - ]: 8726379 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
1892 164 : 8726379 : child = BTreeTupleGetDownLink(itup);
165 : :
166 : : /*
167 : : * We need to save the location of the pivot tuple we chose in a new
168 : : * stack entry for this page/level. If caller ends up splitting a
169 : : * page one level down, it usually ends up inserting a new pivot
170 : : * tuple/downlink immediately after the location recorded here.
171 : : */
9178 tgl@sss.pgh.pa.us 172 : 8726379 : new_stack = (BTStack) palloc(sizeof(BTStackData));
1892 pg@bowt.ie 173 : 8726379 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
9178 tgl@sss.pgh.pa.us 174 : 8726379 : new_stack->bts_offset = offnum;
175 : 8726379 : new_stack->bts_parent = stack_in;
176 : :
177 : : /*
178 : : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
179 : : * we're on the level 1 and asked to lock leaf page in write mode,
180 : : * then lock next page in write mode, because it must be a leaf.
181 : : */
1655 pg@bowt.ie 182 [ + + + + ]: 8726379 : if (opaque->btpo_level == 1 && access == BT_WRITE)
2597 akorotkov@postgresql 183 : 3165996 : page_access = BT_WRITE;
184 : :
185 : : /* drop the read lock on the page, then acquire one on its child */
1892 pg@bowt.ie 186 : 8726379 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
187 : :
188 : : /* okay, all set to move down a level */
9178 tgl@sss.pgh.pa.us 189 : 8726379 : stack_in = new_stack;
190 : : }
191 : :
192 : : /*
193 : : * If we're asked to lock leaf in write mode, but didn't manage to, then
194 : : * relock. This should only happen when the root page is a leaf page (and
195 : : * the only page in the index other than the metapage).
196 : : */
2597 akorotkov@postgresql 197 [ + + + + ]: 10528983 : if (access == BT_WRITE && page_access == BT_READ)
198 : : {
199 : : /* trade in our read lock for a write lock */
1873 pg@bowt.ie 200 : 445502 : _bt_unlockbuf(rel, *bufP);
201 : 445502 : _bt_lockbuf(rel, *bufP, BT_WRITE);
202 : :
203 : : /*
204 : : * Race -- the leaf page may have split after we dropped the read lock
205 : : * but before we acquired a write lock. If it has, we may need to
206 : : * move right to its new sibling. Do that.
207 : : */
729 tmunro@postgresql.or 208 : 445502 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
209 : : }
210 : :
9178 tgl@sss.pgh.pa.us 211 : 10528983 : return stack_in;
212 : : }
213 : :
214 : : /*
215 : : * _bt_moveright() -- move right in the btree if necessary.
216 : : *
217 : : * When we follow a pointer to reach a page, it is possible that
218 : : * the page has changed in the meanwhile. If this happens, we're
219 : : * guaranteed that the page has "split right" -- that is, that any
220 : : * data that appeared on the page originally is either on the page
221 : : * or strictly to the right of it.
222 : : *
223 : : * This routine decides whether or not we need to move right in the
224 : : * tree by examining the high key entry on the page. If that entry is
225 : : * strictly less than the scankey, or <= the scankey in the
226 : : * key.nextkey=true case, then we followed the wrong link and we need
227 : : * to move right.
228 : : *
229 : : * The passed insertion-type scankey can omit the rightmost column(s) of the
230 : : * index. (see nbtree/README)
231 : : *
232 : : * When key.nextkey is false (the usual case), we are looking for the first
233 : : * item >= key. When key.nextkey is true, we are looking for the first item
234 : : * strictly greater than key.
235 : : *
236 : : * If forupdate is true, we will attempt to finish any incomplete splits
237 : : * that we encounter. This is required when locking a target page for an
238 : : * insertion, because we don't allow inserting on a page before the split is
239 : : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
240 : : *
241 : : * On entry, we have the buffer pinned and a lock of the type specified by
242 : : * 'access'. If we move right, we release the buffer and lock and acquire
243 : : * the same on the right sibling. Return value is the buffer we stop at.
244 : : */
245 : : static Buffer
10651 scrappy@hub.org 246 : 19700864 : _bt_moveright(Relation rel,
247 : : Relation heaprel,
248 : : BTScanInsert key,
249 : : Buffer buf,
250 : : bool forupdate,
251 : : BTStack stack,
252 : : int access)
253 : : {
254 : : Page page;
255 : : BTPageOpaque opaque;
256 : : int32 cmpval;
257 : :
819 pg@bowt.ie 258 [ + + - + ]: 19700864 : Assert(!forupdate || heaprel != NULL);
259 : :
260 : : /*
261 : : * When nextkey = false (normal case): if the scan key that brought us to
262 : : * this page is > the high key stored on the page, then the page has split
263 : : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
264 : : * have some duplicates to the right as well as the left, but that's
265 : : * something that's only ever dealt with on the leaf level, after
266 : : * _bt_search has found an initial leaf page.)
267 : : *
268 : : * When nextkey = true: move right if the scan key is >= page's high key.
269 : : * (Note that key.scantid cannot be set in this case.)
270 : : *
271 : : * The page could even have split more than once, so scan as far as
272 : : * needed.
273 : : *
274 : : * We also have to move right if we followed a link that brought us to a
275 : : * dead page.
276 : : */
2362 277 : 19700864 : cmpval = key->nextkey ? 0 : 1;
278 : :
279 : : for (;;)
280 : : {
3426 kgrittn@postgresql.o 281 : 19701608 : page = BufferGetPage(buf);
1254 michael@paquier.xyz 282 : 19701608 : opaque = BTPageGetOpaque(page);
283 : :
4190 heikki.linnakangas@i 284 [ + + ]: 19701608 : if (P_RIGHTMOST(opaque))
285 : 15141562 : break;
286 : :
287 : : /*
288 : : * Finish any incomplete splits we encounter along the way.
289 : : */
290 [ + + - + ]: 4560046 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
4190 heikki.linnakangas@i 291 :UBC 0 : {
292 : 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
293 : :
294 : : /* upgrade our lock if necessary */
295 [ # # ]: 0 : if (access == BT_READ)
296 : : {
1873 pg@bowt.ie 297 : 0 : _bt_unlockbuf(rel, buf);
298 : 0 : _bt_lockbuf(rel, buf, BT_WRITE);
299 : : }
300 : :
4190 heikki.linnakangas@i 301 [ # # ]: 0 : if (P_INCOMPLETE_SPLIT(opaque))
889 andres@anarazel.de 302 : 0 : _bt_finish_split(rel, heaprel, buf, stack);
303 : : else
4190 heikki.linnakangas@i 304 : 0 : _bt_relbuf(rel, buf);
305 : :
306 : : /* re-acquire the lock in the right mode, and re-check */
819 pg@bowt.ie 307 : 0 : buf = _bt_getbuf(rel, blkno, access);
4190 heikki.linnakangas@i 308 : 0 : continue;
309 : : }
310 : :
2362 pg@bowt.ie 311 [ + - + + ]:CBC 4560046 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
312 : : {
313 : : /* step right one page */
4190 heikki.linnakangas@i 314 : 744 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
315 : 744 : continue;
316 : : }
317 : : else
318 : : break;
319 : : }
320 : :
8232 tgl@sss.pgh.pa.us 321 [ - + ]: 19700864 : if (P_IGNORE(opaque))
6459 tgl@sss.pgh.pa.us 322 [ # # ]:UBC 0 : elog(ERROR, "fell off the end of index \"%s\"",
323 : : RelationGetRelationName(rel));
324 : :
9867 bruce@momjian.us 325 :CBC 19700864 : return buf;
326 : : }
327 : :
328 : : /*
329 : : * _bt_binsrch() -- Do a binary search for a key on a particular page.
330 : : *
331 : : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
332 : : * of the last key < given scankey, or last key <= given scankey if nextkey
333 : : * is true. (Since _bt_compare treats the first data key of such a page as
334 : : * minus infinity, there will be at least one key < scankey, so the result
335 : : * always points at one of the keys on the page.)
336 : : *
337 : : * On a leaf page, _bt_binsrch() returns the final result of the initial
338 : : * positioning process that started with _bt_first's call to _bt_search.
339 : : * We're returning a non-pivot tuple offset, so things are a little different.
340 : : * It is possible that we'll return an offset that's either past the last
341 : : * non-pivot slot, or (in the case of a backward scan) before the first slot.
342 : : *
343 : : * This procedure is not responsible for walking right, it just examines
344 : : * the given page. _bt_binsrch() has no lock or refcount side effects
345 : : * on the buffer.
346 : : */
347 : : static OffsetNumber
10651 scrappy@hub.org 348 : 15439868 : _bt_binsrch(Relation rel,
349 : : BTScanInsert key,
350 : : Buffer buf)
351 : : {
352 : : Page page;
353 : : BTPageOpaque opaque;
354 : : OffsetNumber low,
355 : : high;
356 : : int32 result,
357 : : cmpval;
358 : :
3426 kgrittn@postgresql.o 359 : 15439868 : page = BufferGetPage(buf);
1254 michael@paquier.xyz 360 : 15439868 : opaque = BTPageGetOpaque(page);
361 : :
362 : : /* Requesting nextkey semantics while using scantid seems nonsensical */
2189 pg@bowt.ie 363 [ + + - + ]: 15439868 : Assert(!key->nextkey || key->scantid == NULL);
364 : : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
365 [ + + - + ]: 15439868 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
366 : :
9178 tgl@sss.pgh.pa.us 367 [ + + ]: 15439868 : low = P_FIRSTDATAKEY(opaque);
10226 bruce@momjian.us 368 : 15439868 : high = PageGetMaxOffsetNumber(page);
369 : :
370 : : /*
371 : : * If there are no keys on the page, return the first available slot. Note
372 : : * this covers two cases: the page is really empty (no keys), or it
373 : : * contains only a high key. The latter case is possible after vacuuming.
374 : : * This can never happen on an internal page, however, since they are
375 : : * never empty (an internal page must have at least one child).
376 : : */
2362 pg@bowt.ie 377 [ + + ]: 15439868 : if (unlikely(high < low))
9867 bruce@momjian.us 378 : 6330 : return low;
379 : :
380 : : /*
381 : : * Binary search to find the first key on the page >= scan key, or first
382 : : * key > scankey when nextkey is true.
383 : : *
384 : : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
385 : : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
386 : : *
387 : : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
388 : : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
389 : : *
390 : : * We can fall out when high == low.
391 : : */
9549 tgl@sss.pgh.pa.us 392 : 15433538 : high++; /* establish the loop invariant for high */
393 : :
2362 pg@bowt.ie 394 : 15433538 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
395 : :
9549 tgl@sss.pgh.pa.us 396 [ + + ]: 100481660 : while (high > low)
397 : : {
9278 bruce@momjian.us 398 : 85048122 : OffsetNumber mid = low + ((high - low) / 2);
399 : :
400 : : /* We have low <= mid < high, so mid points at a real slot */
401 : :
2362 pg@bowt.ie 402 : 85048122 : result = _bt_compare(rel, key, page, mid);
403 : :
7930 tgl@sss.pgh.pa.us 404 [ + + ]: 85048122 : if (result >= cmpval)
9549 405 : 54232740 : low = mid + 1;
406 : : else
407 : 30815382 : high = mid;
408 : : }
409 : :
410 : : /*
411 : : * At this point we have high == low.
412 : : *
413 : : * On a leaf page we always return the first non-pivot tuple >= scan key
414 : : * (resp. > scan key) for forward scan callers. For backward scans, it's
415 : : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
416 : : */
9178 417 [ + + ]: 15433538 : if (P_ISLEAF(opaque))
418 : : {
419 : : /*
420 : : * In the backward scan case we're supposed to locate the last
421 : : * matching tuple on the leaf level -- not the first matching tuple
422 : : * (the last tuple will be the first one returned by the scan).
423 : : *
424 : : * At this point we've located the first non-pivot tuple immediately
425 : : * after the last matching tuple (which might just be maxoff + 1).
426 : : * Compensate by stepping back.
427 : : */
638 pg@bowt.ie 428 [ + + ]: 6707159 : if (key->backward)
429 : 28217 : return OffsetNumberPrev(low);
430 : :
9549 tgl@sss.pgh.pa.us 431 : 6678942 : return low;
432 : : }
433 : :
434 : : /*
435 : : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
436 : : * There must be one if _bt_compare() is playing by the rules.
437 : : *
438 : : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
439 : : * a truncated -inf heap TID is usually enough to prevent it altogether.
440 : : * Even omitted scan key entries are treated as > truncated attributes.
441 : : *
442 : : * However, during backward scans _bt_compare() interprets omitted scan
443 : : * key attributes as == corresponding truncated -inf attributes instead.
444 : : * This works just like < would work here. Under this scheme, < strategy
445 : : * backward scans will always directly descend to the correct leaf page.
446 : : * In particular, they will never incur an "extra" leaf page access with a
447 : : * scan key that happens to contain the same prefix of values as some
448 : : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
449 : : * it uses a leaf page high key to "re-find" a page undergoing deletion.
450 : : */
9178 451 [ + + - + ]: 8726379 : Assert(low > P_FIRSTDATAKEY(opaque));
452 : :
9549 453 : 8726379 : return OffsetNumberPrev(low);
454 : : }
455 : :
456 : : /*
457 : : *
458 : : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
459 : : *
460 : : * Like _bt_binsrch(), but with support for caching the binary search
461 : : * bounds. Only used during insertion, and only on the leaf page that it
462 : : * looks like caller will insert tuple on. Exclusive-locked and pinned
463 : : * leaf page is contained within insertstate.
464 : : *
465 : : * Caches the bounds fields in insertstate so that a subsequent call can
466 : : * reuse the low and strict high bounds of original binary search. Callers
467 : : * that use these fields directly must be prepared for the case where low
468 : : * and/or stricthigh are not on the same page (one or both exceed maxoff
469 : : * for the page). The case where there are no items on the page (high <
470 : : * low) makes bounds invalid.
471 : : *
472 : : * Caller is responsible for invalidating bounds when it modifies the page
473 : : * before calling here a second time, and for dealing with posting list
474 : : * tuple matches (callers can use insertstate's postingoff field to
475 : : * determine which existing heap TID will need to be replaced by a posting
476 : : * list split).
477 : : */
478 : : OffsetNumber
2362 pg@bowt.ie 479 : 6483429 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
480 : : {
481 : 6483429 : BTScanInsert key = insertstate->itup_key;
482 : : Page page;
483 : : BTPageOpaque opaque;
484 : : OffsetNumber low,
485 : : high,
486 : : stricthigh;
487 : : int32 result,
488 : : cmpval;
489 : :
490 : 6483429 : page = BufferGetPage(insertstate->buf);
1254 michael@paquier.xyz 491 : 6483429 : opaque = BTPageGetOpaque(page);
492 : :
2362 pg@bowt.ie 493 [ - + ]: 6483429 : Assert(P_ISLEAF(opaque));
494 [ - + ]: 6483429 : Assert(!key->nextkey);
2019 495 [ - + ]: 6483429 : Assert(insertstate->postingoff == 0);
496 : :
2362 497 [ + + ]: 6483429 : if (!insertstate->bounds_valid)
498 : : {
499 : : /* Start new binary search */
500 [ + + ]: 3861391 : low = P_FIRSTDATAKEY(opaque);
501 : 3861391 : high = PageGetMaxOffsetNumber(page);
502 : : }
503 : : else
504 : : {
505 : : /* Restore result of previous binary search against same page */
506 : 2622038 : low = insertstate->low;
507 : 2622038 : high = insertstate->stricthigh;
508 : : }
509 : :
510 : : /* If there are no keys on the page, return the first available slot */
511 [ + + ]: 6483429 : if (unlikely(high < low))
512 : : {
513 : : /* Caller can't reuse bounds */
514 : 11495 : insertstate->low = InvalidOffsetNumber;
515 : 11495 : insertstate->stricthigh = InvalidOffsetNumber;
516 : 11495 : insertstate->bounds_valid = false;
517 : 11495 : return low;
518 : : }
519 : :
520 : : /*
521 : : * Binary search to find the first key on the page >= scan key. (nextkey
522 : : * is always false when inserting).
523 : : *
524 : : * The loop invariant is: all slots before 'low' are < scan key, all slots
525 : : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
526 : : * maintained to save additional search effort for caller.
527 : : *
528 : : * We can fall out when high == low.
529 : : */
530 [ + + ]: 6471934 : if (!insertstate->bounds_valid)
531 : 3849896 : high++; /* establish the loop invariant for high */
532 : 6471934 : stricthigh = high; /* high initially strictly higher */
533 : :
534 : 6471934 : cmpval = 1; /* !nextkey comparison value */
535 : :
536 [ + + ]: 34696811 : while (high > low)
537 : : {
538 : 28224877 : OffsetNumber mid = low + ((high - low) / 2);
539 : :
540 : : /* We have low <= mid < high, so mid points at a real slot */
541 : :
542 : 28224877 : result = _bt_compare(rel, key, page, mid);
543 : :
544 [ + + ]: 28224877 : if (result >= cmpval)
545 : 21470304 : low = mid + 1;
546 : : else
547 : : {
548 : 6754573 : high = mid;
549 [ + + ]: 6754573 : if (result != 0)
550 : 6201701 : stricthigh = high;
551 : : }
552 : :
553 : : /*
554 : : * If tuple at offset located by binary search is a posting list whose
555 : : * TID range overlaps with caller's scantid, perform posting list
556 : : * binary search to set postingoff for caller. Caller must split the
557 : : * posting list when postingoff is set. This should happen
558 : : * infrequently.
559 : : */
2019 560 [ + + + + : 28224877 : if (unlikely(result == 0 && key->scantid != NULL))
+ + ]
561 : : {
562 : : /*
563 : : * postingoff should never be set more than once per leaf page
564 : : * binary search. That would mean that there are duplicate table
565 : : * TIDs in the index, which is never okay. Check for that here.
566 : : */
1410 567 [ - + ]: 214506 : if (insertstate->postingoff != 0)
1410 pg@bowt.ie 568 [ # # ]:UBC 0 : ereport(ERROR,
569 : : (errcode(ERRCODE_INDEX_CORRUPTED),
570 : : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
571 : : ItemPointerGetBlockNumber(key->scantid),
572 : : ItemPointerGetOffsetNumber(key->scantid),
573 : : low, stricthigh,
574 : : BufferGetBlockNumber(insertstate->buf),
575 : : RelationGetRelationName(rel))));
576 : :
2019 pg@bowt.ie 577 :CBC 214506 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
578 : : }
579 : : }
580 : :
581 : : /*
582 : : * On a leaf page, a binary search always returns the first key >= scan
583 : : * key (at least in !nextkey case), which could be the last slot + 1. This
584 : : * is also the lower bound of cached search.
585 : : *
586 : : * stricthigh may also be the last slot + 1, which prevents caller from
587 : : * using bounds directly, but is still useful to us if we're called a
588 : : * second time with cached bounds (cached low will be < stricthigh when
589 : : * that happens).
590 : : */
2362 591 : 6471934 : insertstate->low = low;
592 : 6471934 : insertstate->stricthigh = stricthigh;
593 : 6471934 : insertstate->bounds_valid = true;
594 : :
595 : 6471934 : return low;
596 : : }
597 : :
598 : : /*----------
599 : : * _bt_binsrch_posting() -- posting list binary search.
600 : : *
601 : : * Helper routine for _bt_binsrch_insert().
602 : : *
603 : : * Returns offset into posting list where caller's scantid belongs.
604 : : *----------
605 : : */
606 : : static int
2019 607 : 214506 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
608 : : {
609 : : IndexTuple itup;
610 : : ItemId itemid;
611 : : int low,
612 : : high,
613 : : mid,
614 : : res;
615 : :
616 : : /*
617 : : * If this isn't a posting tuple, then the index must be corrupt (if it is
618 : : * an ordinary non-pivot tuple then there must be an existing tuple with a
619 : : * heap TID that equals inserter's new heap TID/scantid). Defensively
620 : : * check that tuple is a posting list tuple whose posting list range
621 : : * includes caller's scantid.
622 : : *
623 : : * (This is also needed because contrib/amcheck's rootdescend option needs
624 : : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
625 : : */
626 : 214506 : itemid = PageGetItemId(page, offnum);
627 : 214506 : itup = (IndexTuple) PageGetItem(page, itemid);
628 [ + + ]: 214506 : if (!BTreeTupleIsPosting(itup))
629 : 201098 : return 0;
630 : :
631 [ + - - + ]: 13408 : Assert(key->heapkeyspace && key->allequalimage);
632 : :
633 : : /*
634 : : * In the event that posting list tuple has LP_DEAD bit set, indicate this
635 : : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
636 : : * second call to _bt_binsrch_insert() can take place when its caller has
637 : : * removed the dead item.
638 : : */
639 [ + + ]: 13408 : if (ItemIdIsDead(itemid))
640 : 1 : return -1;
641 : :
642 : : /* "high" is past end of posting list for loop invariant */
643 : 13407 : low = 0;
644 : 13407 : high = BTreeTupleGetNPosting(itup);
645 [ - + ]: 13407 : Assert(high >= 2);
646 : :
647 [ + + ]: 107041 : while (high > low)
648 : : {
649 : 93634 : mid = low + ((high - low) / 2);
650 : 93634 : res = ItemPointerCompare(key->scantid,
651 : : BTreeTupleGetPostingN(itup, mid));
652 : :
653 [ + + ]: 93634 : if (res > 0)
654 : 49143 : low = mid + 1;
655 [ + - ]: 44491 : else if (res < 0)
656 : 44491 : high = mid;
657 : : else
2019 pg@bowt.ie 658 :UBC 0 : return mid;
659 : : }
660 : :
661 : : /* Exact match not found */
2019 pg@bowt.ie 662 :CBC 13407 : return low;
663 : : }
664 : :
665 : : /*----------
666 : : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
667 : : *
668 : : * page/offnum: location of btree item to be compared to.
669 : : *
670 : : * This routine returns:
671 : : * <0 if scankey < tuple at offnum;
672 : : * 0 if scankey == tuple at offnum;
673 : : * >0 if scankey > tuple at offnum.
674 : : *
675 : : * NULLs in the keys are treated as sortable values. Therefore
676 : : * "equality" does not necessarily mean that the item should be returned
677 : : * to the caller as a matching key. Similarly, an insertion scankey
678 : : * with its scantid set is treated as equal to a posting tuple whose TID
679 : : * range overlaps with their scantid. There generally won't be a
680 : : * matching TID in the posting tuple, which caller must handle
681 : : * themselves (e.g., by splitting the posting list tuple).
682 : : *
683 : : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
684 : : * "minus infinity": this routine will always claim it is less than the
685 : : * scankey. The actual key value stored is explicitly truncated to 0
686 : : * attributes (explicitly minus infinity) with version 3+ indexes, but
687 : : * that isn't relied upon. This allows us to implement the Lehman and
688 : : * Yao convention that the first down-link pointer is before the first
689 : : * key. See backend/access/nbtree/README for details.
690 : : *----------
691 : : */
692 : : int32
10651 scrappy@hub.org 693 : 128692627 : _bt_compare(Relation rel,
694 : : BTScanInsert key,
695 : : Page page,
696 : : OffsetNumber offnum)
697 : : {
9178 tgl@sss.pgh.pa.us 698 : 128692627 : TupleDesc itupdesc = RelationGetDescr(rel);
1254 michael@paquier.xyz 699 : 128692627 : BTPageOpaque opaque = BTPageGetOpaque(page);
700 : : IndexTuple itup;
701 : : ItemPointer heapTid;
702 : : ScanKey scankey;
703 : : int ncmpkey;
704 : : int ntupatts;
705 : : int32 result;
706 : :
2362 pg@bowt.ie 707 [ - + ]: 128692627 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
708 [ - + ]: 128692627 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
709 [ - + - - ]: 128692627 : Assert(key->heapkeyspace || key->scantid == NULL);
710 : :
711 : : /*
712 : : * Force result ">" if target item is first data item on an internal page
713 : : * --- see NOTE above.
714 : : */
8934 bruce@momjian.us 715 [ + + + + : 128692627 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+ + ]
9867 716 : 1481744 : return 1;
717 : :
7164 tgl@sss.pgh.pa.us 718 : 127210883 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2362 pg@bowt.ie 719 [ + + ]: 127210883 : ntupatts = BTreeTupleGetNAtts(itup, rel);
720 : :
721 : : /*
722 : : * The scan key is set up with the attribute number associated with each
723 : : * term in the key. It is important that, if the index is multi-key, the
724 : : * scan contain the first k key attributes, and that they be in order. If
725 : : * you think about how multi-key ordering works, you'll understand why
726 : : * this is.
727 : : *
728 : : * We don't test for violation of this condition here, however. The
729 : : * initial setup for the index scan had better have gotten it right (see
730 : : * _bt_first).
731 : : */
732 : :
733 : 127210883 : ncmpkey = Min(ntupatts, key->keysz);
734 [ - + - - ]: 127210883 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
2019 735 [ + + - + ]: 127210883 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
2362 736 : 127210883 : scankey = key->scankeys;
737 [ + + ]: 159918612 : for (int i = 1; i <= ncmpkey; i++)
738 : : {
739 : : Datum datum;
740 : : bool isNull;
741 : :
7969 tgl@sss.pgh.pa.us 742 : 148910474 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
743 : :
2999 744 [ + + ]: 148910474 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
745 : : {
9178 746 [ + + ]: 279810 : if (isNull)
9230 747 : 78692 : result = 0; /* NULL "=" NULL */
6815 748 [ + + ]: 201118 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
749 : 312 : result = -1; /* NULL "<" NOT_NULL */
750 : : else
9230 751 : 200806 : result = 1; /* NULL ">" NOT_NULL */
752 : : }
9178 753 [ + + ]: 148630664 : else if (isNull) /* key is NOT_NULL and item is NULL */
754 : : {
6815 755 [ - + ]: 174 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
6815 tgl@sss.pgh.pa.us 756 :UBC 0 : result = 1; /* NOT_NULL ">" NULL */
757 : : else
6815 tgl@sss.pgh.pa.us 758 :CBC 174 : result = -1; /* NOT_NULL "<" NULL */
759 : : }
760 : : else
761 : : {
762 : : /*
763 : : * The sk_func needs to be passed the index value as left arg and
764 : : * the sk_argument as right arg (they might be of different
765 : : * types). Since it is convenient for callers to think of
766 : : * _bt_compare as comparing the scankey to the index item, we have
767 : : * to flip the sign of the comparison result. (Unless it's a DESC
768 : : * column, in which case we *don't* flip the sign.)
769 : : */
5261 770 : 148630490 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
771 : : scankey->sk_collation,
772 : : datum,
773 : : scankey->sk_argument));
774 : :
6815 775 [ + + ]: 148630490 : if (!(scankey->sk_flags & SK_BT_DESC))
2528 776 [ + + ]: 148630457 : INVERT_COMPARE_RESULT(result);
777 : : }
778 : :
779 : : /* if the keys are unequal, return the difference */
10226 bruce@momjian.us 780 [ + + ]: 148910474 : if (result != 0)
9867 781 : 116202745 : return result;
782 : :
7969 tgl@sss.pgh.pa.us 783 : 32707729 : scankey++;
784 : : }
785 : :
786 : : /*
787 : : * All non-truncated attributes (other than heap TID) were found to be
788 : : * equal. Treat truncated attributes as minus infinity when scankey has a
789 : : * key attribute value that would otherwise be compared directly.
790 : : *
791 : : * Note: it doesn't matter if ntupatts includes non-key attributes;
792 : : * scankey won't, so explicitly excluding non-key attributes isn't
793 : : * necessary.
794 : : */
2362 pg@bowt.ie 795 [ + + ]: 11008138 : if (key->keysz > ntupatts)
796 : 95831 : return 1;
797 : :
798 : : /*
799 : : * Use the heap TID attribute and scantid to try to break the tie. The
800 : : * rules are the same as any other key attribute -- only the
801 : : * representation differs.
802 : : */
803 : 10912307 : heapTid = BTreeTupleGetHeapTID(itup);
804 [ + + ]: 10912307 : if (key->scantid == NULL)
805 : : {
806 : : /*
807 : : * Forward scans have a scankey that is considered greater than a
808 : : * truncated pivot tuple if and when the scankey has equal values for
809 : : * attributes up to and including the least significant untruncated
810 : : * attribute in tuple. Even attributes that were omitted from the
811 : : * scan key are considered greater than -inf truncated attributes.
812 : : * (See _bt_binsrch for an explanation of our backward scan behavior.)
813 : : *
814 : : * For example, if an index has the minimum two attributes (single
815 : : * user key attribute, plus heap TID attribute), and a page's high key
816 : : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
817 : : * will not descend to the page to the left. The search will descend
818 : : * right instead. The truncated attribute in pivot tuple means that
819 : : * all non-pivot tuples on the page to the left are strictly < 'foo',
820 : : * so it isn't necessary to descend left. In other words, search
821 : : * doesn't have to descend left because it isn't interested in a match
822 : : * that has a heap TID value of -inf.
823 : : *
824 : : * Note: the heap TID part of the test ensures that scankey is being
825 : : * compared to a pivot tuple with one or more truncated -inf key
826 : : * attributes. The heap TID attribute is the last key attribute in
827 : : * every index, of course, but other than that it isn't special.
828 : : */
638 829 [ + + + + : 8666558 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
+ + ]
830 [ + - ]: 4458 : key->heapkeyspace)
2362 831 : 4458 : return 1;
832 : :
833 : : /* All provided scankey arguments found to be equal */
834 : 8662100 : return 0;
835 : : }
836 : :
837 : : /*
838 : : * Treat truncated heap TID as minus infinity, since scankey has a key
839 : : * attribute value (scantid) that would otherwise be compared directly
840 : : */
841 [ - + ]: 2245749 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
842 [ + + ]: 2245749 : if (heapTid == NULL)
843 : 1895 : return 1;
844 : :
845 : : /*
846 : : * Scankey must be treated as equal to a posting list tuple if its scantid
847 : : * value falls within the range of the posting list. In all other cases
848 : : * there can only be a single heap TID value, which is compared directly
849 : : * with scantid.
850 : : */
851 [ - + ]: 2243854 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
2019 852 : 2243854 : result = ItemPointerCompare(key->scantid, heapTid);
853 [ + + + + ]: 2243854 : if (result <= 0 || !BTreeTupleIsPosting(itup))
854 : 2159080 : return result;
855 : : else
856 : : {
857 : 84774 : result = ItemPointerCompare(key->scantid,
858 : : BTreeTupleGetMaxHeapTID(itup));
859 [ + + ]: 84774 : if (result > 0)
860 : 71366 : return 1;
861 : : }
862 : :
863 : 13408 : return 0;
864 : : }
865 : :
866 : : /*
867 : : * _bt_first() -- Find the first item in a scan.
868 : : *
869 : : * We need to be clever about the direction of scan, the search
870 : : * conditions, and the tree ordering. We find the first item (or,
871 : : * if backwards scan, the last item) in the tree that satisfies the
872 : : * qualifications in the scan key. On success exit, data about the
873 : : * matching tuple(s) on the page has been loaded into so->currPos. We'll
874 : : * drop all locks and hold onto a pin on page's buffer, except during
875 : : * so->dropPin scans, when we drop both the lock and the pin.
876 : : * _bt_returnitem sets the next item to return to scan on success exit.
877 : : *
878 : : * If there are no matching items in the index, we return false, with no
879 : : * pins or locks held. so->currPos will remain invalid.
880 : : *
881 : : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
882 : : * are both search-type scankeys (see nbtree/README for more about this).
883 : : * Within this routine, we build a temporary insertion-type scankey to use
884 : : * in locating the scan start position.
885 : : */
886 : : bool
10651 scrappy@hub.org 887 : 7007870 : _bt_first(IndexScanDesc scan, ScanDirection dir)
888 : : {
8506 tgl@sss.pgh.pa.us 889 : 7007870 : Relation rel = scan->indexRelation;
890 : 7007870 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
891 : : BTStack stack;
892 : : OffsetNumber offnum;
893 : : BTScanInsertData inskey;
894 : : ScanKey startKeys[INDEX_MAX_KEYS];
895 : : ScanKeyData notnullkey;
638 pg@bowt.ie 896 : 7007870 : int keysz = 0;
52 pg@bowt.ie 897 :GNC 7007870 : StrategyNumber strat_total = InvalidStrategy;
242 pg@bowt.ie 898 :CBC 7007870 : BlockNumber blkno = InvalidBlockNumber,
899 : : lastcurrblkno;
900 : :
3818 kgrittn@postgresql.o 901 [ + - - + : 7007870 : Assert(!BTScanPosIsValid(so->currPos));
- + ]
902 : :
903 : : /*
904 : : * Examine the scan keys and eliminate any redundant keys; also mark the
905 : : * keys that must be matched to continue the scan.
906 : : */
7969 tgl@sss.pgh.pa.us 907 : 7007870 : _bt_preprocess_keys(scan);
908 : :
909 : : /*
910 : : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
911 : : * never be satisfied (eg, x == 1 AND x > 2).
912 : : */
8934 bruce@momjian.us 913 [ + + ]: 7007870 : if (!so->qual_ok)
914 : : {
242 pg@bowt.ie 915 [ - + ]: 1051 : Assert(!so->needPrimScan);
1815 akapila@postgresql.o 916 : 1051 : _bt_parallel_done(scan);
8510 tgl@sss.pgh.pa.us 917 : 1051 : return false;
918 : : }
919 : :
920 : : /*
921 : : * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
922 : : * will likely release the parallel scan later on.
923 : : */
242 pg@bowt.ie 924 [ + + ]: 7006819 : if (scan->parallel_scan != NULL &&
925 [ + + ]: 223 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
926 : 118 : return false;
927 : :
928 : : /*
929 : : * Initialize the scan's arrays (if any) for the current scan direction
930 : : * (except when they were already set to later values as part of
931 : : * scheduling the primitive index scan that is now underway)
932 : : */
933 [ + + + + ]: 7006701 : if (so->numArrayKeys && !so->needPrimScan)
934 : 35478 : _bt_start_array_keys(scan, dir);
935 : :
936 [ + + ]: 7006701 : if (blkno != InvalidBlockNumber)
937 : : {
938 : : /*
939 : : * We anticipated calling _bt_search, but another worker bet us to it.
940 : : * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
941 : : */
942 [ - + ]: 43 : Assert(scan->parallel_scan != NULL);
943 [ - + ]: 43 : Assert(!so->needPrimScan);
302 944 [ - + ]: 43 : Assert(blkno != P_NONE);
945 : :
242 946 [ + + ]: 43 : if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
242 pg@bowt.ie 947 :GBC 1 : return false;
948 : :
242 pg@bowt.ie 949 :CBC 42 : _bt_returnitem(scan, so);
950 : 42 : return true;
951 : : }
952 : :
953 : : /*
954 : : * Count an indexscan for stats, now that we know that we'll call
955 : : * _bt_search/_bt_endpoint below
956 : : */
351 957 [ + + + + : 7006658 : pgstat_count_index_scan(rel);
+ + ]
179 958 [ + + ]: 7006658 : if (scan->instrument)
959 : 395911 : scan->instrument->nsearches++;
960 : :
961 : : /*----------
962 : : * Examine the scan keys to discover where we need to start the scan.
963 : : * The selected scan keys (at most one per index column) are remembered by
964 : : * storing their addresses into the local startKeys[] array. The final
965 : : * startKeys[] entry's strategy is set in strat_total. (Actually, there
966 : : * are a couple of cases where we force a less/more restrictive strategy.)
967 : : *
968 : : * We must use the key that was marked required (in the direction opposite
969 : : * our own scan's) during preprocessing. Each index attribute can only
970 : : * have one such required key. In general, the keys that we use to find
971 : : * an initial position when scanning forwards are the same keys that end
972 : : * the scan on the leaf level when scanning backwards (and vice-versa).
973 : : *
974 : : * When the scan keys include cross-type operators, _bt_preprocess_keys
975 : : * may not be able to eliminate redundant keys; in such cases it will
976 : : * arbitrarily pick a usable key for each attribute (and scan direction),
977 : : * ensuring that there is no more than one key required in each direction.
978 : : * We stop considering further keys once we reach the first nonrequired
979 : : * key (which must come after all required keys), so this can't affect us.
980 : : *
981 : : * The required keys that we use as starting boundaries have to be =, >,
982 : : * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
983 : : * We can use keys for multiple attributes so long as the prior attributes
984 : : * had only =, >= (resp. =, <=) keys. These rules are very similar to the
985 : : * rules that preprocessing used to determine which keys to mark required.
986 : : * We cannot always use every required key as a positioning key, though.
987 : : * Skip arrays necessitate independently applying our own rules here.
988 : : * Skip arrays are always generally considered = array keys, but we'll
989 : : * nevertheless treat them as inequalities at certain points of the scan.
990 : : * When that happens, it _might_ have implications for the number of
991 : : * required keys that we can safely use for initial positioning purposes.
992 : : *
993 : : * For example, a forward scan with a skip array on its leading attribute
994 : : * (with no low_compare/high_compare) will have at least two required scan
995 : : * keys, but we won't use any of them as boundary keys during the scan's
996 : : * initial call here. Our positioning key during the first call here can
997 : : * be thought of as representing "> -infinity". Similarly, if such a skip
998 : : * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
999 : : * during the scan's initial call here; a lower-order key such as "b = 42"
1000 : : * can't be used until the "a" array advances beyond MINVAL/low_compare.
1001 : : *
1002 : : * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
1003 : : * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
1004 : : * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
1005 : : * that we treat = and >= as equivalent when scanning forwards (just as we
1006 : : * treat = and <= as equivalent when scanning backwards). We effectively
1007 : : * do the same thing (though with a distinct "a" element/value) each time.
1008 : : *
1009 : : * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1010 : : * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1011 : : * If the index stores nulls at the end of the index we'll be starting
1012 : : * from, and we have no boundary key for the column (which means the key
1013 : : * we deduced NOT NULL from is an inequality key that constrains the other
1014 : : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1015 : : * use as a boundary key. If we didn't do this, we might find ourselves
1016 : : * traversing a lot of null entries at the start of the scan.
1017 : : *
1018 : : * In this loop, row-comparison keys are treated the same as keys on their
1019 : : * first (leftmost) columns. We'll add all lower-order columns of the row
1020 : : * comparison that were marked required during preprocessing below.
1021 : : *
1022 : : * _bt_advance_array_keys needs to know exactly how we'll reposition the
1023 : : * scan (should it opt to schedule another primitive index scan). It is
1024 : : * critical that primscans only be scheduled when they'll definitely make
1025 : : * some useful progress. _bt_advance_array_keys does this by calling
1026 : : * _bt_checkkeys routines that report whether a tuple is past the end of
1027 : : * matches for the scan's keys (given the scan's current array elements).
1028 : : * If the page's final tuple is "after the end of matches" for a scan that
1029 : : * uses the *opposite* scan direction, then it must follow that it's also
1030 : : * "before the start of matches" for the actual current scan direction.
1031 : : * It is therefore essential that all of our initial positioning rules are
1032 : : * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1033 : : * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1034 : : * need to be kept in sync.
1035 : : *----------
1036 : : */
9174 tgl@sss.pgh.pa.us 1037 [ + + ]: 7006658 : if (so->numberOfKeys > 0)
1038 : : {
1039 : : AttrNumber curattr;
1040 : : ScanKey bkey;
1041 : : ScanKey impliesNN;
1042 : : ScanKey cur;
1043 : :
1044 : : /*
1045 : : * bkey will be set to the key that preprocessing left behind as the
1046 : : * boundary key for this attribute, in this scan direction (if any)
1047 : : */
302 pg@bowt.ie 1048 : 7000150 : cur = so->keyData;
7969 tgl@sss.pgh.pa.us 1049 : 7000150 : curattr = 1;
66 pg@bowt.ie 1050 : 7000150 : bkey = NULL;
1051 : : /* Also remember any scankey that implies a NOT NULL constraint */
5057 tgl@sss.pgh.pa.us 1052 : 7000150 : impliesNN = NULL;
1053 : :
1054 : : /*
1055 : : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1056 : : * pass to handle after-last-key processing. Actual exit from the
1057 : : * loop is at one of the "break" statements below.
1058 : : */
302 pg@bowt.ie 1059 : 18261313 : for (int i = 0;; cur++, i++)
1060 : : {
7969 tgl@sss.pgh.pa.us 1061 [ + + + + ]: 18261313 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1062 : : {
1063 : : /* Done looking for the curattr boundary key */
66 pg@bowt.ie 1064 [ + + + - : 11260252 : Assert(bkey == NULL ||
- + ]
1065 : : (bkey->sk_attno == curattr &&
1066 : : (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1067 [ + + + - : 11260252 : Assert(impliesNN == NULL ||
- + ]
1068 : : (impliesNN->sk_attno == curattr &&
1069 : : (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1070 : :
1071 : : /*
1072 : : * If this is a scan key for a skip array whose current
1073 : : * element is MINVAL, choose low_compare (when scanning
1074 : : * backwards it'll be MAXVAL, and we'll choose high_compare).
1075 : : *
1076 : : * Note: if the array's low_compare key makes 'bkey' NULL,
1077 : : * then we behave as if the array's first element is -inf,
1078 : : * except when !array->null_elem implies a usable NOT NULL
1079 : : * constraint.
1080 : : */
1081 [ + + ]: 11260252 : if (bkey != NULL &&
1082 [ + + ]: 11224880 : (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1083 : : {
1084 : 1836 : int ikey = bkey - so->keyData;
1085 : 1836 : ScanKey skipequalitykey = bkey;
155 1086 : 1836 : BTArrayKeyInfo *array = NULL;
1087 : :
1088 [ + - ]: 1891 : for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1089 : : {
1090 : 1891 : array = &so->arrayKeys[arridx];
1091 [ + + ]: 1891 : if (array->scan_key == ikey)
1092 : 1836 : break;
1093 : : }
1094 : :
1095 [ + + ]: 1836 : if (ScanDirectionIsForward(dir))
1096 : : {
1097 [ - + ]: 1827 : Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
66 1098 : 1827 : bkey = array->low_compare;
1099 : : }
1100 : : else
1101 : : {
155 1102 [ - + ]: 9 : Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
66 1103 : 9 : bkey = array->high_compare;
1104 : : }
1105 : :
1106 [ + + - + ]: 1836 : Assert(bkey == NULL ||
1107 : : bkey->sk_attno == skipequalitykey->sk_attno);
1108 : :
155 1109 [ + + ]: 1836 : if (!array->null_elem)
1110 : 68 : impliesNN = skipequalitykey;
1111 : : else
66 1112 [ + - - + ]: 1768 : Assert(bkey == NULL && impliesNN == NULL);
1113 : : }
1114 : :
1115 : : /*
1116 : : * If we didn't find a usable boundary key, see if we can
1117 : : * deduce a NOT NULL key
1118 : : */
1119 [ + + + + : 11295654 : if (bkey == NULL && impliesNN != NULL &&
+ + ]
5057 tgl@sss.pgh.pa.us 1120 [ + + ]: 35402 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1121 : : ScanDirectionIsForward(dir) :
1122 : : ScanDirectionIsBackward(dir)))
1123 : : {
1124 : : /* Final startKeys[] entry will be deduced NOT NULL key */
52 pg@bowt.ie 1125 :GNC 15 : bkey = ¬nullkey;
66 pg@bowt.ie 1126 [ + + ]:CBC 15 : ScanKeyEntryInitialize(bkey,
1127 : : (SK_SEARCHNOTNULL | SK_ISNULL |
5057 tgl@sss.pgh.pa.us 1128 : 15 : (impliesNN->sk_flags &
1129 : : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1130 : : curattr,
1131 : : ScanDirectionIsForward(dir) ?
1132 : : BTGreaterStrategyNumber : BTLessStrategyNumber,
1133 : : InvalidOid,
1134 : : InvalidOid,
1135 : : InvalidOid,
1136 : : (Datum) 0);
1137 : : }
1138 : :
1139 : : /*
1140 : : * If preprocessing didn't leave a usable boundary key, quit;
1141 : : * else save the boundary key pointer in startKeys[]
1142 : : */
66 pg@bowt.ie 1143 [ + + ]: 11260252 : if (bkey == NULL)
7969 tgl@sss.pgh.pa.us 1144 : 37155 : break;
66 pg@bowt.ie 1145 : 11223097 : startKeys[keysz++] = bkey;
1146 : :
1147 : : /*
1148 : : * We can only consider adding more boundary keys when the one
1149 : : * that we just chose to add uses either the = or >= strategy
1150 : : * (during backwards scans we can only do so when the key that
1151 : : * we just added to startKeys[] uses the = or <= strategy)
1152 : : */
1153 : 11223097 : strat_total = bkey->sk_strategy;
306 1154 [ + + + + ]: 11223097 : if (strat_total == BTGreaterStrategyNumber ||
1155 : : strat_total == BTLessStrategyNumber)
1156 : : break;
1157 : :
1158 : : /*
1159 : : * If the key that we just added to startKeys[] is a skip
1160 : : * array = key whose current element is marked NEXT or PRIOR,
1161 : : * make strat_total > or < (and stop adding boundary keys).
1162 : : * This can only happen with opclasses that lack skip support.
1163 : : */
66 1164 [ + + ]: 10548764 : if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1165 : : {
1166 [ - + ]: 6 : Assert(bkey->sk_flags & SK_BT_SKIP);
155 1167 [ - + ]: 6 : Assert(strat_total == BTEqualStrategyNumber);
1168 : :
1169 [ + + ]: 6 : if (ScanDirectionIsForward(dir))
1170 : : {
66 1171 [ - + ]: 3 : Assert(!(bkey->sk_flags & SK_BT_PRIOR));
155 1172 : 3 : strat_total = BTGreaterStrategyNumber;
1173 : : }
1174 : : else
1175 : : {
66 1176 [ - + ]: 3 : Assert(!(bkey->sk_flags & SK_BT_NEXT));
155 1177 : 3 : strat_total = BTLessStrategyNumber;
1178 : : }
1179 : :
1180 : : /*
1181 : : * We're done. We'll never find an exact = match for a
1182 : : * NEXT or PRIOR sentinel sk_argument value. There's no
1183 : : * sense in trying to add more keys to startKeys[].
1184 : : */
1185 : 6 : break;
1186 : : }
1187 : :
1188 : : /*
1189 : : * Done if that was the last scan key output by preprocessing.
1190 : : * Also done if we've now examined all keys marked required.
1191 : : */
7390 tgl@sss.pgh.pa.us 1192 [ + + ]: 10548758 : if (i >= so->numberOfKeys ||
66 pg@bowt.ie 1193 [ + + ]: 4260105 : !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1194 : : break;
1195 : :
1196 : : /*
1197 : : * Reset for next attr.
1198 : : */
1199 [ - + ]: 4260102 : Assert(cur->sk_attno == curattr + 1);
7969 tgl@sss.pgh.pa.us 1200 : 4260102 : curattr = cur->sk_attno;
66 pg@bowt.ie 1201 : 4260102 : bkey = NULL;
5057 tgl@sss.pgh.pa.us 1202 : 4260102 : impliesNN = NULL;
1203 : : }
1204 : :
1205 : : /*
1206 : : * If we've located the starting boundary key for curattr, we have
1207 : : * no interest in curattr's other required key
1208 : : */
66 pg@bowt.ie 1209 [ + + ]: 11261163 : if (bkey != NULL)
1210 : 899 : continue;
1211 : :
1212 : : /*
1213 : : * Is this key the starting boundary key for curattr?
1214 : : *
1215 : : * If not, does it imply a NOT NULL constraint? (Because
1216 : : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1217 : : * *any* inequality key works for that; we need not test.)
1218 : : */
7969 tgl@sss.pgh.pa.us 1219 [ + + + - ]: 11260264 : switch (cur->sk_strategy)
1220 : : {
1221 : 63490 : case BTLessStrategyNumber:
1222 : : case BTLessEqualStrategyNumber:
66 pg@bowt.ie 1223 [ + + ]: 63490 : if (ScanDirectionIsBackward(dir))
1224 : 28127 : bkey = cur;
1225 [ + - ]: 35363 : else if (impliesNN == NULL)
1226 : 35363 : impliesNN = cur;
7969 tgl@sss.pgh.pa.us 1227 : 63490 : break;
1228 : 10548307 : case BTEqualStrategyNumber:
66 pg@bowt.ie 1229 : 10548307 : bkey = cur;
7969 tgl@sss.pgh.pa.us 1230 : 10548307 : break;
1231 : 648467 : case BTGreaterEqualStrategyNumber:
1232 : : case BTGreaterStrategyNumber:
66 pg@bowt.ie 1233 [ + + ]: 648467 : if (ScanDirectionIsForward(dir))
1234 : 648446 : bkey = cur;
1235 [ + - ]: 21 : else if (impliesNN == NULL)
1236 : 21 : impliesNN = cur;
9643 bruce@momjian.us 1237 : 648467 : break;
1238 : : }
1239 : : }
1240 : : }
1241 : :
1242 : : /*
1243 : : * If we found no usable boundary keys, we have to start from one end of
1244 : : * the tree. Walk down that edge to the first or last key, and scan from
1245 : : * there.
1246 : : *
1247 : : * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1248 : : */
638 pg@bowt.ie 1249 [ + + ]: 7006658 : if (keysz == 0)
306 1250 : 43298 : return _bt_endpoint(scan, dir);
1251 : :
1252 : : /*
1253 : : * We want to start the scan somewhere within the index. Set up an
1254 : : * insertion scankey we can use to search for the boundary point we
1255 : : * identified above. The insertion scankey is built using the keys
1256 : : * identified by startKeys[]. (Remaining insertion scankey fields are
1257 : : * initialized after initial-positioning scan keys are finalized.)
1258 : : */
638 1259 [ - + ]: 6963360 : Assert(keysz <= INDEX_MAX_KEYS);
302 1260 [ + + ]: 18186433 : for (int i = 0; i < keysz; i++)
1261 : : {
66 1262 : 11223097 : ScanKey bkey = startKeys[i];
1263 : :
1264 [ - + ]: 11223097 : Assert(bkey->sk_attno == i + 1);
1265 : :
1266 [ + + ]: 11223097 : if (bkey->sk_flags & SK_ROW_HEADER)
1267 : : {
1268 : : /*
1269 : : * Row comparison header: look to the first row member instead
1270 : : */
1271 : 24 : ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1272 : 24 : bool loosen_strat = false,
1273 : 24 : tighten_strat = false;
1274 : :
1275 : : /*
1276 : : * Cannot be a NULL in the first row member: _bt_preprocess_keys
1277 : : * would've marked the qual as unsatisfiable, preventing us from
1278 : : * ever getting this far
1279 : : */
6815 tgl@sss.pgh.pa.us 1280 [ - + ]: 24 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
66 pg@bowt.ie 1281 [ - + ]: 24 : Assert(subkey->sk_attno == bkey->sk_attno);
242 1282 [ - + ]: 24 : Assert(!(subkey->sk_flags & SK_ISNULL));
1283 : :
1284 : : /*
1285 : : * This is either a > or >= key (during backwards scans it is
1286 : : * either < or <=) that was marked required during preprocessing.
1287 : : * Later so->keyData[] keys can't have been marked required, so
1288 : : * our row compare header key must be the final startKeys[] entry.
1289 : : */
66 1290 [ - + ]: 24 : Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
1291 [ - + ]: 24 : Assert(i == keysz - 1);
1292 : :
1293 : : /*
1294 : : * The member scankeys are already in insertion format (ie, they
1295 : : * have sk_func = 3-way-comparison function)
1296 : : */
2362 1297 : 24 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1298 : :
1299 : : /*
1300 : : * Now look to later row compare members.
1301 : : *
1302 : : * If there's an "index attribute gap" between two row compare
1303 : : * members, the second member won't have been marked required, and
1304 : : * so can't be used as a starting boundary key here. The part of
1305 : : * the row comparison that we do still use has to be treated as a
1306 : : * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1307 : : * with an omitted intervening index attribute "b" will use an
1308 : : * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1309 : : * the leaf level might satisfy the row compare qual.
1310 : : *
1311 : : * We're able to use a _more_ restrictive strategy when we reach a
1312 : : * NULL row compare member, since they're always unsatisfiable.
1313 : : * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1314 : : * insertion scan key "a > 1". All tuples where "a = 1" cannot
1315 : : * possibly satisfy the row compare qual, so this is safe.
1316 : : */
66 1317 [ - + ]: 24 : Assert(!(subkey->sk_flags & SK_ROW_END));
1318 : : for (;;)
1319 : : {
1320 : 24 : subkey++;
1321 [ - + ]: 24 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1322 : :
1323 [ + + ]: 24 : if (subkey->sk_flags & SK_ISNULL)
1324 : : {
1325 : : /*
1326 : : * NULL member key, can only use earlier keys.
1327 : : *
1328 : : * We deliberately avoid checking if this key is marked
1329 : : * required. All earlier keys are required, and this key
1330 : : * is unsatisfiable either way, so we can't miss anything.
1331 : : */
1332 : 6 : tighten_strat = true;
1333 : 6 : break;
1334 : : }
1335 : :
1336 [ + + ]: 18 : if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1337 : : {
1338 : : /* nonrequired member key, can only use earlier keys */
1339 : 6 : loosen_strat = true;
1340 : 6 : break;
1341 : : }
1342 : :
1343 [ - + ]: 12 : Assert(subkey->sk_attno == keysz + 1);
1344 [ - + ]: 12 : Assert(subkey->sk_strategy == bkey->sk_strategy);
1345 [ - + ]: 12 : Assert(keysz < INDEX_MAX_KEYS);
1346 : :
1347 : 12 : memcpy(inskey.scankeys + keysz, subkey,
1348 : : sizeof(ScanKeyData));
1349 : 12 : keysz++;
1350 [ + - ]: 12 : if (subkey->sk_flags & SK_ROW_END)
1351 : 12 : break;
1352 : : }
1353 [ + + - + ]: 24 : Assert(!(loosen_strat && tighten_strat));
1354 [ + + ]: 24 : if (loosen_strat)
1355 : : {
1356 : : /* Use less restrictive strategy (and fewer member keys) */
1357 [ + + - ]: 6 : switch (strat_total)
1358 : : {
1359 : 3 : case BTLessStrategyNumber:
1360 : 3 : strat_total = BTLessEqualStrategyNumber;
1361 : 3 : break;
1362 : 3 : case BTGreaterStrategyNumber:
1363 : 3 : strat_total = BTGreaterEqualStrategyNumber;
1364 : 3 : break;
1365 : : }
1366 : : }
1367 [ + + ]: 24 : if (tighten_strat)
1368 : : {
1369 : : /* Use more restrictive strategy (and fewer member keys) */
1370 [ + + - ]: 6 : switch (strat_total)
1371 : : {
1372 : 3 : case BTLessEqualStrategyNumber:
1373 : 3 : strat_total = BTLessStrategyNumber;
1374 : 3 : break;
1375 : 3 : case BTGreaterEqualStrategyNumber:
1376 : 3 : strat_total = BTGreaterStrategyNumber;
1377 : 3 : break;
1378 : : }
1379 : : }
1380 : :
1381 : : /* done adding to inskey (row comparison keys always come last) */
1382 : 24 : break;
1383 : : }
1384 : :
1385 : : /*
1386 : : * Ordinary comparison key/search-style key.
1387 : : *
1388 : : * Transform the search-style scan key to an insertion scan key by
1389 : : * replacing the sk_func with the appropriate btree 3-way-comparison
1390 : : * function.
1391 : : *
1392 : : * If scankey operator is not a cross-type comparison, we can use the
1393 : : * cached comparison function; otherwise gotta look it up in the
1394 : : * catalogs. (That can't lead to infinite recursion, since no
1395 : : * indexscan initiated by syscache lookup will use cross-data-type
1396 : : * operators.)
1397 : : *
1398 : : * We support the convention that sk_subtype == InvalidOid means the
1399 : : * opclass input type; this hack simplifies life for ScanKeyInit().
1400 : : */
1401 [ + + ]: 11223073 : if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1402 [ + + ]: 10804823 : bkey->sk_subtype == InvalidOid)
1403 : 11217644 : {
1404 : : FmgrInfo *procinfo;
1405 : :
1406 : 11217644 : procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1407 : 11217644 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1408 : : bkey->sk_flags,
1409 : 11217644 : bkey->sk_attno,
1410 : : InvalidStrategy,
1411 : : bkey->sk_subtype,
1412 : : bkey->sk_collation,
1413 : : procinfo,
1414 : : bkey->sk_argument);
1415 : : }
1416 : : else
1417 : : {
1418 : : RegProcedure cmp_proc;
1419 : :
1420 : 5429 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1421 : 5429 : rel->rd_opcintype[i],
1422 : : bkey->sk_subtype, BTORDER_PROC);
1423 [ - + ]: 5429 : if (!RegProcedureIsValid(cmp_proc))
66 pg@bowt.ie 1424 [ # # ]:UBC 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1425 : : BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1426 : : bkey->sk_attno, RelationGetRelationName(rel));
66 pg@bowt.ie 1427 :CBC 5429 : ScanKeyEntryInitialize(inskey.scankeys + i,
1428 : : bkey->sk_flags,
1429 : 5429 : bkey->sk_attno,
1430 : : InvalidStrategy,
1431 : : bkey->sk_subtype,
1432 : : bkey->sk_collation,
1433 : : cmp_proc,
1434 : : bkey->sk_argument);
1435 : : }
1436 : : }
1437 : :
1438 : : /*----------
1439 : : * Examine the selected initial-positioning strategy to determine exactly
1440 : : * where we need to start the scan, and set flag variables to control the
1441 : : * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1442 : : * page _bt_search returns).
1443 : : *----------
1444 : : */
638 1445 : 6963360 : _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1446 : 6963360 : inskey.anynullkeys = false; /* unused */
1447 : 6963360 : inskey.scantid = NULL;
1448 : 6963360 : inskey.keysz = keysz;
7930 tgl@sss.pgh.pa.us 1449 [ + + + + : 6963360 : switch (strat_total)
+ - ]
1450 : : {
1451 : 28130 : case BTLessStrategyNumber:
1452 : :
638 pg@bowt.ie 1453 : 28130 : inskey.nextkey = false;
1454 : 28130 : inskey.backward = true;
7930 tgl@sss.pgh.pa.us 1455 : 28130 : break;
1456 : :
1457 : 9 : case BTLessEqualStrategyNumber:
1458 : :
638 pg@bowt.ie 1459 : 9 : inskey.nextkey = true;
1460 : 9 : inskey.backward = true;
7930 tgl@sss.pgh.pa.us 1461 : 9 : break;
1462 : :
1463 : 6286763 : case BTEqualStrategyNumber:
1464 : :
1465 : : /*
1466 : : * If a backward scan was specified, need to start with last equal
1467 : : * item not first one.
1468 : : */
1469 [ + + ]: 6286763 : if (ScanDirectionIsBackward(dir))
1470 : : {
1471 : : /*
1472 : : * This is the same as the <= strategy
1473 : : */
638 pg@bowt.ie 1474 : 88 : inskey.nextkey = true;
1475 : 88 : inskey.backward = true;
1476 : : }
1477 : : else
1478 : : {
1479 : : /*
1480 : : * This is the same as the >= strategy
1481 : : */
1482 : 6286675 : inskey.nextkey = false;
1483 : 6286675 : inskey.backward = false;
1484 : : }
7930 tgl@sss.pgh.pa.us 1485 : 6286763 : break;
1486 : :
1487 : 2249 : case BTGreaterEqualStrategyNumber:
1488 : :
1489 : : /*
1490 : : * Find first item >= scankey
1491 : : */
638 pg@bowt.ie 1492 : 2249 : inskey.nextkey = false;
1493 : 2249 : inskey.backward = false;
7930 tgl@sss.pgh.pa.us 1494 : 2249 : break;
1495 : :
1496 : 646209 : case BTGreaterStrategyNumber:
1497 : :
1498 : : /*
1499 : : * Find first item > scankey
1500 : : */
638 pg@bowt.ie 1501 : 646209 : inskey.nextkey = true;
1502 : 646209 : inskey.backward = false;
7930 tgl@sss.pgh.pa.us 1503 : 646209 : break;
1504 : :
7930 tgl@sss.pgh.pa.us 1505 :UBC 0 : default:
1506 : : /* can't get here, but keep compiler quiet */
1507 [ # # ]: 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1508 : : return false;
1509 : : }
1510 : :
1511 : : /*
1512 : : * Use the manufactured insertion scan key to descend the tree and
1513 : : * position ourselves on the target leaf page.
1514 : : */
638 pg@bowt.ie 1515 [ - + ]:CBC 6963360 : Assert(ScanDirectionIsBackward(dir) == inskey.backward);
323 1516 : 6963360 : stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1517 : :
1518 : : /* don't need to keep the stack around... */
9178 tgl@sss.pgh.pa.us 1519 : 6963360 : _bt_freestack(stack);
1520 : :
323 pg@bowt.ie 1521 [ + + ]: 6963360 : if (!BufferIsValid(so->currPos.buf))
1522 : : {
66 1523 [ - + ]: 249871 : Assert(!so->needPrimScan);
1524 : :
1525 : : /*
1526 : : * We only get here if the index is completely empty. Lock relation
1527 : : * because nothing finer to lock exists. Without a buffer lock, it's
1528 : : * possible for another transaction to insert data between
1529 : : * _bt_search() and PredicateLockRelation(). We have to try again
1530 : : * after taking the relation-level predicate lock, to close a narrow
1531 : : * window where we wouldn't scan concurrently inserted tuples, but the
1532 : : * writer wouldn't see our predicate lock.
1533 : : */
796 tmunro@postgresql.or 1534 [ + + ]: 249871 : if (IsolationIsSerializable())
1535 : : {
1536 : 2754 : PredicateLockRelation(rel, scan->xs_snapshot);
323 pg@bowt.ie 1537 : 2754 : stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
796 tmunro@postgresql.or 1538 : 2754 : _bt_freestack(stack);
1539 : : }
1540 : :
323 pg@bowt.ie 1541 [ + - ]: 249871 : if (!BufferIsValid(so->currPos.buf))
1542 : : {
796 tmunro@postgresql.or 1543 : 249871 : _bt_parallel_done(scan);
1544 : 249871 : return false;
1545 : : }
1546 : : }
1547 : :
1548 : : /* position to the precise item on the page */
323 pg@bowt.ie 1549 : 6713489 : offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1550 : :
1551 : : /*
1552 : : * Now load data from the first page of the scan (usually the page
1553 : : * currently in so->currPos.buf).
1554 : : *
1555 : : * If inskey.nextkey = false and inskey.backward = false, offnum is
1556 : : * positioned at the first non-pivot tuple >= inskey.scankeys.
1557 : : *
1558 : : * If inskey.nextkey = false and inskey.backward = true, offnum is
1559 : : * positioned at the last non-pivot tuple < inskey.scankeys.
1560 : : *
1561 : : * If inskey.nextkey = true and inskey.backward = false, offnum is
1562 : : * positioned at the first non-pivot tuple > inskey.scankeys.
1563 : : *
1564 : : * If inskey.nextkey = true and inskey.backward = true, offnum is
1565 : : * positioned at the last non-pivot tuple <= inskey.scankeys.
1566 : : *
1567 : : * It's possible that _bt_binsrch returned an offnum that is out of bounds
1568 : : * for the page. For example, when inskey is both < the leaf page's high
1569 : : * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1570 : : */
1571 [ + + ]: 6713489 : if (!_bt_readfirstpage(scan, offnum, dir))
1572 : 1986951 : return false;
1573 : :
297 1574 : 4726538 : _bt_returnitem(scan, so);
7062 tgl@sss.pgh.pa.us 1575 : 4726538 : return true;
1576 : : }
1577 : :
1578 : : /*
1579 : : * _bt_next() -- Get the next item in a scan.
1580 : : *
1581 : : * On entry, so->currPos describes the current page, which may be pinned
1582 : : * but is not locked, and so->currPos.itemIndex identifies which item was
1583 : : * previously returned.
1584 : : *
1585 : : * On success exit, so->currPos is updated as needed, and _bt_returnitem
1586 : : * sets the next item to return to the scan. so->currPos remains valid.
1587 : : *
1588 : : * On failure exit (no more tuples), we invalidate so->currPos. It'll
1589 : : * still be possible for the scan to return tuples by changing direction,
1590 : : * though we'll need to call _bt_first anew in that other direction.
1591 : : */
1592 : : bool
1593 : 9133082 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1594 : : {
1595 : 9133082 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1596 : :
306 pg@bowt.ie 1597 [ - + - - : 9133082 : Assert(BTScanPosIsValid(so->currPos));
- + ]
1598 : :
1599 : : /*
1600 : : * Advance to next tuple on current page; or if there's no more, try to
1601 : : * step to the next page with data.
1602 : : */
7062 tgl@sss.pgh.pa.us 1603 [ + + ]: 9133082 : if (ScanDirectionIsForward(dir))
1604 : : {
1605 [ + + ]: 9112873 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1606 : : {
1607 [ + + ]: 1163197 : if (!_bt_steppage(scan, dir))
1608 : 1149223 : return false;
1609 : : }
1610 : : }
1611 : : else
1612 : : {
1613 [ + + ]: 20209 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1614 : : {
1615 [ + + ]: 69 : if (!_bt_steppage(scan, dir))
8510 1616 : 46 : return false;
1617 : : }
1618 : : }
1619 : :
297 pg@bowt.ie 1620 : 7983812 : _bt_returnitem(scan, so);
7062 tgl@sss.pgh.pa.us 1621 : 7983812 : return true;
1622 : : }
1623 : :
/*
 * _bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction
 * (could just be for the current primitive index scan when scan has arrays).
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
			 bool firstpage)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	BTReadPageState pstate;
	bool		arrayKeys;
	int			itemIndex,
				indnatts;

	/* save the page/buffer block number, along with its sibling links */
	page = BufferGetPage(so->currPos.buf);
	opaque = BTPageGetOpaque(page);
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
	so->currPos.prevPage = opaque->btpo_prev;
	so->currPos.nextPage = opaque->btpo_next;
	/* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
	so->currPos.dir = dir;
	so->currPos.nextTupleOffset = 0;

	/* either moreRight or moreLeft should be set now (may be unset later) */
	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
		   so->currPos.moreLeft);
	Assert(!P_IGNORE(opaque));
	Assert(BTScanPosIsPinned(so->currPos));
	Assert(!so->needPrimScan);

	if (scan->parallel_scan)
	{
		/* allow next/prev page to be read by other worker without delay */
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, so->currPos.nextPage,
								 so->currPos.currPage);
		else
			_bt_parallel_release(scan, so->currPos.prevPage,
								 so->currPos.currPage);
	}

	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);

	/* initialize local variables */
	indnatts = IndexRelationGetNumberOfAttributes(rel);
	arrayKeys = so->numArrayKeys != 0;
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/* initialize page-level state that we'll pass to _bt_checkkeys */
	pstate.minoff = minoff;
	pstate.maxoff = maxoff;
	pstate.finaltup = NULL;
	pstate.page = page;
	pstate.firstpage = firstpage;
	pstate.forcenonrequired = false;
	pstate.startikey = 0;
	pstate.offnum = InvalidOffsetNumber;
	pstate.skip = InvalidOffsetNumber;
	pstate.continuescan = true; /* default assumption */
	pstate.rechecks = 0;
	pstate.targetdistance = 0;
	pstate.nskipadvances = 0;

	if (ScanDirectionIsForward(dir))
	{
		/* SK_SEARCHARRAY forward scans must provide high key up front */
		if (arrayKeys)
		{
			if (!P_RIGHTMOST(opaque))
			{
				ItemId		iid = PageGetItemId(page, P_HIKEY);

				pstate.finaltup = (IndexTuple) PageGetItem(page, iid);

				if (so->scanBehind &&
					!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
				{
					/* Schedule another primitive index scan after all */
					so->currPos.moreRight = false;
					so->needPrimScan = true;
					if (scan->parallel_scan)
						_bt_parallel_primscan_schedule(scan,
													   so->currPos.currPage);
					return false;
				}
			}

			so->scanBehind = so->oppositeDirCheck = false;	/* reset */
		}

		/*
		 * Consider pstate.startikey optimization once the ongoing primitive
		 * index scan has already read at least one page
		 */
		if (!pstate.firstpage && minoff < maxoff)
			_bt_set_startikey(scan, &pstate);

		/* load items[] in ascending order */
		itemIndex = 0;

		/* offnum from caller may be before the first data key */
		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum < pstate.skip);
				Assert(!pstate.forcenonrequired);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals)
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					_bt_saveitem(so, itemIndex, offnum, itup);
					itemIndex++;
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID
					 */
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					itemIndex++;
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
						itemIndex++;
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			/* Reset arrays, per _bt_set_startikey contract */
			if (pstate.forcenonrequired)
				_bt_start_array_keys(scan, dir);
			pstate.forcenonrequired = false;
			pstate.startikey = 0;	/* _bt_set_startikey ignores P_HIKEY */

			/* high key may be truncated; check only the attrs it retains */
			truncatt = BTreeTupleGetNAtts(itup, rel);
			_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
		}

		if (!pstate.continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxTIDsPerBTreePage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* SK_SEARCHARRAY backward scans must provide final tuple up front */
		if (arrayKeys)
		{
			if (minoff <= maxoff && !P_LEFTMOST(opaque))
			{
				ItemId		iid = PageGetItemId(page, minoff);

				pstate.finaltup = (IndexTuple) PageGetItem(page, iid);

				if (so->scanBehind &&
					!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
				{
					/* Schedule another primitive index scan after all */
					so->currPos.moreLeft = false;
					so->needPrimScan = true;
					if (scan->parallel_scan)
						_bt_parallel_primscan_schedule(scan,
													   so->currPos.currPage);
					return false;
				}
			}

			so->scanBehind = so->oppositeDirCheck = false;	/* reset */
		}

		/*
		 * Consider pstate.startikey optimization once the ongoing primitive
		 * index scan has already read at least one page
		 */
		if (!pstate.firstpage && minoff < maxoff)
			_bt_set_startikey(scan, &pstate);

		/* load items[] in descending order */
		itemIndex = MaxTIDsPerBTreePage;

		/* offnum from caller may be past the last data key */
		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				if (offnum > minoff)
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
			{
				/* Reset arrays, per _bt_set_startikey contract */
				pstate.forcenonrequired = false;
				pstate.startikey = 0;
				_bt_start_array_keys(scan, dir);
			}
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			if (arrayKeys && so->scanBehind)
			{
				/*
				 * Done scanning this page, but not done with the current
				 * primscan.
				 *
				 * Note: Forward scans don't check this explicitly, since they
				 * prefer to reuse pstate.skip for this instead.
				 */
				Assert(!passes_quals && pstate.continuescan);
				Assert(!pstate.forcenonrequired);

				break;
			}

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum > pstate.skip);
				Assert(!pstate.forcenonrequired);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					itemIndex--;
					_bt_saveitem(so, itemIndex, offnum, itup);
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID.
					 *
					 * Note that we deliberately save/return items from
					 * posting lists in ascending heap TID order for backwards
					 * scans.  This allows _bt_killitems() to make a
					 * consistent assumption about the order of items
					 * associated with the same posting list tuple.
					 */
					itemIndex--;
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						itemIndex--;
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberPrev(offnum);
		}

		/*
		 * We don't need to visit page to the left when no more matches will
		 * be found there
		 */
		if (!pstate.continuescan)
			so->currPos.moreLeft = false;

		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
	}

	/*
	 * If _bt_set_startikey told us to temporarily treat the scan's keys as
	 * nonrequired (possible only during scans with array keys), there must be
	 * no lasting consequences for the scan's array keys.  The scan's arrays
	 * should now have exactly the same elements as they would have had if the
	 * nonrequired behavior had never been used.  (In general, a scan's arrays
	 * are expected to track its progress through the index's key space.)
	 *
	 * We are required (by _bt_set_startikey) to call _bt_checkkeys against
	 * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
	 * arrays to recover.  Assert that that step hasn't been missed.
	 */
	Assert(!pstate.forcenonrequired);

	return (so->currPos.firstItem <= so->currPos.lastItem);
}
2048 : :
2049 : : /* Save an index item into so->currPos.items[itemIndex] */
2050 : : static void
5081 2051 : 22370815 : _bt_saveitem(BTScanOpaque so, int itemIndex,
2052 : : OffsetNumber offnum, IndexTuple itup)
2053 : : {
2054 : 22370815 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2055 : :
2019 pg@bowt.ie 2056 [ + - - + ]: 22370815 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
2057 : :
5081 tgl@sss.pgh.pa.us 2058 : 22370815 : currItem->heapTid = itup->t_tid;
2059 : 22370815 : currItem->indexOffset = offnum;
2060 [ + + ]: 22370815 : if (so->currTuples)
2061 : : {
2062 : 11356919 : Size itupsz = IndexTupleSize(itup);
2063 : :
2064 : 11356919 : currItem->tupleOffset = so->currPos.nextTupleOffset;
2065 : 11356919 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
2066 : 11356919 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
2067 : : }
2068 : 22370815 : }
2069 : :
2070 : : /*
2071 : : * Setup state to save TIDs/items from a single posting list tuple.
2072 : : *
2073 : : * Saves an index item into so->currPos.items[itemIndex] for TID that is
2074 : : * returned to scan first. Second or subsequent TIDs for posting list should
2075 : : * be saved by calling _bt_savepostingitem().
2076 : : *
2077 : : * Returns an offset into tuple storage space that main tuple is stored at if
2078 : : * needed.
2079 : : */
2080 : : static int
2019 pg@bowt.ie 2081 : 240563 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2082 : : ItemPointer heapTid, IndexTuple itup)
2083 : : {
2084 : 240563 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2085 : :
2086 [ - + ]: 240563 : Assert(BTreeTupleIsPosting(itup));
2087 : :
2088 : 240563 : currItem->heapTid = *heapTid;
2089 : 240563 : currItem->indexOffset = offnum;
2090 [ + + ]: 240563 : if (so->currTuples)
2091 : : {
2092 : : /* Save base IndexTuple (truncate posting list) */
2093 : : IndexTuple base;
2094 : 95652 : Size itupsz = BTreeTupleGetPostingOffset(itup);
2095 : :
2096 : 95652 : itupsz = MAXALIGN(itupsz);
2097 : 95652 : currItem->tupleOffset = so->currPos.nextTupleOffset;
2098 : 95652 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2099 : 95652 : memcpy(base, itup, itupsz);
2100 : : /* Defensively reduce work area index tuple header size */
2101 : 95652 : base->t_info &= ~INDEX_SIZE_MASK;
2102 : 95652 : base->t_info |= itupsz;
2103 : 95652 : so->currPos.nextTupleOffset += itupsz;
2104 : :
2105 : 95652 : return currItem->tupleOffset;
2106 : : }
2107 : :
2108 : 144911 : return 0;
2109 : : }
2110 : :
2111 : : /*
2112 : : * Save an index item into so->currPos.items[itemIndex] for current posting
2113 : : * tuple.
2114 : : *
2115 : : * Assumes that _bt_setuppostingitems() has already been called for current
2116 : : * posting list tuple. Caller passes its return value as tupleOffset.
2117 : : */
2118 : : static inline void
2119 : 1214701 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2120 : : ItemPointer heapTid, int tupleOffset)
2121 : : {
2122 : 1214701 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2123 : :
2124 : 1214701 : currItem->heapTid = *heapTid;
2125 : 1214701 : currItem->indexOffset = offnum;
2126 : :
2127 : : /*
2128 : : * Have index-only scans return the same base IndexTuple for every TID
2129 : : * that originates from the same posting list
2130 : : */
2131 [ + + ]: 1214701 : if (so->currTuples)
2132 : 491814 : currItem->tupleOffset = tupleOffset;
2133 : 1214701 : }
2134 : :
2135 : : /*
2136 : : * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
2137 : : * index scan by setting the relevant fields in caller's index scan descriptor
2138 : : */
2139 : : static inline void
297 2140 : 12749201 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
2141 : : {
2142 : 12749201 : BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
2143 : :
2144 : : /* Most recent _bt_readpage must have succeeded */
2145 [ - + - - : 12749201 : Assert(BTScanPosIsValid(so->currPos));
- + ]
2146 [ - + ]: 12749201 : Assert(so->currPos.itemIndex >= so->currPos.firstItem);
2147 [ - + ]: 12749201 : Assert(so->currPos.itemIndex <= so->currPos.lastItem);
2148 : :
2149 : : /* Return next item, per amgettuple contract */
2150 : 12749201 : scan->xs_heaptid = currItem->heapTid;
2151 [ + + ]: 12749201 : if (so->currTuples)
2152 : 2190382 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2153 : 12749201 : }
2154 : :
/*
 * _bt_steppage() -- Step to next page containing valid data for scan
 *
 * Wrapper on _bt_readnextpage that performs final steps for the current page.
 *
 * On entry, so->currPos must be valid.  Its buffer will be pinned, though
 * never locked. (Actually, when so->dropPin there won't even be a pin held,
 * though so->currPos.currPage must still be set to a valid block number.)
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno,
				lastcurrblkno;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
		/* copy header plus only the items[] entries actually in use */
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;

		/*
		 * If we're just about to start the next primitive index scan
		 * (possible with a scan that has arrays keys, and needs to skip to
		 * continue in the current scan direction), moreLeft/moreRight only
		 * indicate the end of the current primitive index scan.  They must
		 * never be taken to indicate that the top-level index scan has ended
		 * (that would be wrong).
		 *
		 * We could handle this case by treating the current array keys as
		 * markPos state.  But depending on the current array state like this
		 * would add complexity.  Instead, we just unset markPos's copy of
		 * moreRight or moreLeft (whichever might be affected), while making
		 * btrestrpos reset the scan's arrays to their initial scan positions.
		 * In effect, btrestrpos leaves advancing the arrays up to the first
		 * _bt_readpage call (that takes place after it has restored markPos).
		 */
		if (so->needPrimScan)
		{
			if (ScanDirectionIsForward(so->currPos.dir))
				so->markPos.moreRight = true;
			else
				so->markPos.moreLeft = true;
		}

		/* mark/restore not supported by parallel scans */
		Assert(!scan->parallel_scan);
	}

	BTScanPosUnpinIfPinned(so->currPos);

	/* Walk to the next page with data */
	if (ScanDirectionIsForward(dir))
		blkno = so->currPos.nextPage;
	else
		blkno = so->currPos.prevPage;
	lastcurrblkno = so->currPos.currPage;

	/*
	 * Cancel primitive index scans that were scheduled when the call to
	 * _bt_readpage for currPos happened to use the opposite direction to the
	 * one that we're stepping in now.  (It's okay to leave the scan's array
	 * keys as-is, since the next _bt_readpage will advance them.)
	 */
	if (so->currPos.dir != dir)
		so->needPrimScan = false;

	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
}
2243 : :
2244 : : /*
2245 : : * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
2246 : : *
2247 : : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
2248 : : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
2249 : : * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
2250 : : * its scan key was marked required), so _bt_first _must_ pass us an offnum
2251 : : * exactly at the beginning of where equal tuples are to be found. When we're
2252 : : * passed an offnum past the end of the page, we might still manage to stop
2253 : : * the scan on this page by calling _bt_checkkeys against the high key. See
2254 : : * _bt_readpage for full details.
2255 : : *
2256 : : * On entry, so->currPos must be pinned and locked (so offnum stays valid).
2257 : : * Parallel scan callers must have seized the scan before calling here.
2258 : : *
2259 : : * On exit, we'll have updated so->currPos and retained locks and pins
2260 : : * according to the same rules as those laid out for _bt_readnextpage exit.
2261 : : * Like _bt_readnextpage, our return value indicates if there are any matching
2262 : : * records in the given direction.
2263 : : *
2264 : : * We always release the scan for a parallel scan caller, regardless of
2265 : : * success or failure; we'll call _bt_parallel_release as soon as possible.
2266 : : */
2267 : : static bool
323 2268 : 6753186 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
2269 : : {
2270 : 6753186 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2271 : :
2272 : 6753186 : so->numKilled = 0; /* just paranoia */
2273 : 6753186 : so->markItemIndex = -1; /* ditto */
2274 : :
2275 : : /* Initialize so->currPos for the first page (page in so->currPos.buf) */
2276 [ + + ]: 6753186 : if (so->needPrimScan)
2277 : : {
2278 [ - + ]: 8751 : Assert(so->numArrayKeys);
2279 : :
2280 : 8751 : so->currPos.moreLeft = true;
2281 : 8751 : so->currPos.moreRight = true;
2282 : 8751 : so->needPrimScan = false;
2283 : : }
2284 [ + + ]: 6744435 : else if (ScanDirectionIsForward(dir))
2285 : : {
2286 : 6716193 : so->currPos.moreLeft = false;
3125 rhaas@postgresql.org 2287 : 6716193 : so->currPos.moreRight = true;
2288 : : }
2289 : : else
2290 : : {
323 pg@bowt.ie 2291 : 28242 : so->currPos.moreLeft = true;
2292 : 28242 : so->currPos.moreRight = false;
2293 : : }
2294 : :
2295 : : /*
2296 : : * Attempt to load matching tuples from the first page.
2297 : : *
2298 : : * Note that _bt_readpage will finish initializing the so->currPos fields.
2299 : : * _bt_readpage also releases parallel scan (even when it returns false).
2300 : : */
2301 [ + + ]: 6753186 : if (_bt_readpage(scan, dir, offnum, true))
2302 : : {
92 2303 : 4765262 : Relation rel = scan->indexRelation;
2304 : :
2305 : : /*
2306 : : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2307 : : * so->currPos.buf in preparation for btgettuple returning tuples.
2308 : : */
323 2309 [ - + - - : 4765262 : Assert(BTScanPosIsPinned(so->currPos));
- + ]
92 2310 : 4765262 : _bt_drop_lock_and_maybe_pin(rel, so);
323 2311 : 4765262 : return true;
2312 : : }
2313 : :
2314 : : /* There's no actually-matching data on the page in so->currPos.buf */
2315 : 1987924 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2316 : :
2317 : : /* Call _bt_readnextpage using its _bt_steppage wrapper function */
2318 [ + + ]: 1987924 : if (!_bt_steppage(scan, dir))
2319 : 1987839 : return false;
2320 : :
2321 : : /* _bt_readpage for a later page (now in so->currPos) succeeded */
3125 rhaas@postgresql.org 2322 : 85 : return true;
2323 : : }
2324 : :
/*
 * _bt_readnextpage() -- Read next page containing valid data for _bt_next
 *
 * Caller's blkno is the next interesting page's link, taken from either the
 * previously-saved right link or left link.  lastcurrblkno is the page that
 * was current at the point where the blkno link was saved, which we use to
 * reason about concurrent page splits/page deletions during backwards scans.
 * In the common case where seized=false, blkno is either so->currPos.nextPage
 * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
 *
 * On entry, so->currPos shouldn't be locked by caller.  so->currPos.buf must
 * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
 * won't need to be read again in almost all cases).  Parallel scan callers
 * that seized the scan before calling here should pass seized=true; such a
 * caller's blkno and lastcurrblkno arguments come from the seized scan.
 * seized=false callers just pass us the blkno/lastcurrblkno taken from their
 * so->currPos, which (along with so->currPos itself) can be used to end the
 * scan.  A seized=false caller's blkno can never be assumed to be the page
 * that must be read next during a parallel scan, though.  We must figure that
 * part out for ourselves by seizing the scan (the correct page to read might
 * already be beyond the seized=false caller's blkno during a parallel scan,
 * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
 * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page, and we return true.  We hold a pin on the buffer on
 * success exit (except during so->dropPin index scans, when we drop the pin
 * eagerly to avoid blocking VACUUM).
 *
 * If there are no more matching records in the given direction, we invalidate
 * so->currPos (while ensuring it retains no locks or pins), and return false.
 *
 * We always release the scan for a parallel scan caller, regardless of
 * success or failure; we'll call _bt_parallel_release as soon as possible.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
				 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	Assert(so->currPos.currPage == lastcurrblkno || seized);
	Assert(!(blkno == P_NONE && seized));
	Assert(!BTScanPosIsPinned(so->currPos));

	/*
	 * Remember that the scan already read lastcurrblkno, a page to the left
	 * of blkno (or remember reading a page to the right, for backwards scans)
	 */
	if (ScanDirectionIsForward(dir))
		so->currPos.moreLeft = true;
	else
		so->currPos.moreRight = true;

	/* Loop until we read a page with matches, or determine the scan is over */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;

		if (blkno == P_NONE ||
			(ScanDirectionIsForward(dir) ?
			 !so->currPos.moreRight : !so->currPos.moreLeft))
		{
			/* most recent _bt_readpage call (for lastcurrblkno) ended scan */
			Assert(so->currPos.currPage == lastcurrblkno && !seized);
			BTScanPosInvalidate(so->currPos);
			_bt_parallel_done(scan);	/* iff !so->needPrimScan */
			return false;
		}

		Assert(!so->needPrimScan);

		/* parallel scan must never actually visit so->currPos blkno */
		if (!seized && scan->parallel_scan != NULL &&
			!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
		{
			/* whole scan is now done (or another primitive scan required) */
			BTScanPosInvalidate(so->currPos);
			return false;
		}

		if (ScanDirectionIsForward(dir))
		{
			/* read blkno, but check for interrupts first */
			CHECK_FOR_INTERRUPTS();
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
		}
		else
		{
			/* read blkno, avoiding race (also checks for interrupts) */
			so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
														 lastcurrblkno);
			if (so->currPos.buf == InvalidBuffer)
			{
				/* must have been a concurrent deletion of leftmost page */
				BTScanPosInvalidate(so->currPos);
				_bt_parallel_done(scan);
				return false;
			}
		}

		page = BufferGetPage(so->currPos.buf);
		opaque = BTPageGetOpaque(page);
		lastcurrblkno = blkno;	/* blkno is now the page we've just read */
		if (likely(!P_IGNORE(opaque)))
		{
			/* see if there are any matches on this page */
			if (ScanDirectionIsForward(dir))
			{
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
					break;
				blkno = so->currPos.nextPage;
			}
			else
			{
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
					break;
				blkno = so->currPos.prevPage;
			}
		}
		else
		{
			/* _bt_readpage not called, so do all this for ourselves */
			if (ScanDirectionIsForward(dir))
				blkno = opaque->btpo_next;
			else
				blkno = opaque->btpo_prev;
			if (scan->parallel_scan != NULL)
				_bt_parallel_release(scan, blkno, lastcurrblkno);
		}

		/* no matching tuples on this page */
		_bt_relbuf(rel, so->currPos.buf);
		seized = false;			/* released by _bt_readpage (or by us) */
	}

	/*
	 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
	 * so->currPos.buf in preparation for btgettuple returning tuples.
	 */
	Assert(so->currPos.currPage == blkno);
	Assert(BTScanPosIsPinned(so->currPos));
	_bt_drop_lock_and_maybe_pin(rel, so);

	return true;
}
2474 : :
/*
 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
 * recovering from concurrent page splits/page deletions when necessary
 *
 * Called during backwards scans, to deal with their unique concurrency rules.
 *
 * blkno points to the block number of the page that we expect to move the
 * scan to.  We'll successfully move the scan there when we find that its
 * right sibling link still points to lastcurrblkno (the page we just read).
 * Otherwise, we have to figure out which page is the correct one for the scan
 * to now read the hard way, reasoning about concurrent splits and deletions.
 * See nbtree/README.
 *
 * On return, we have both a pin and a read lock on the returned page, whose
 * block number will be set in *blkno.  Returns InvalidBuffer if there is no
 * page to the left (no lock or pin is held in that case).
 *
 * It is possible for the returned leaf page to be half-dead; caller must
 * check that condition and step left again when required.
 */
static Buffer
_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
						   BlockNumber lastcurrblkno)
{
	BlockNumber origblkno = *blkno; /* detects circular links */

	/* Outer loop: restart from scratch whenever *blkno proves stale */
	for (;;)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;
		int			tries;

		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, *blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit).  If we
		 * don't find the correct page by then, the most likely bet is that
		 * lastcurrblkno got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.
		 */
		tries = 0;
		for (;;)
		{
			if (likely(!P_ISDELETED(opaque) &&
					   opaque->btpo_next == lastcurrblkno))
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			/* step right */
			*blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = BTPageGetOpaque(page);
		}

		/*
		 * Return to the original page (usually the page most recently read by
		 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
		 * what's up with its prev sibling link
		 */
		buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				lastcurrblkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
				page = BufferGetPage(buf);
				opaque = BTPageGetOpaque(page);
				if (!P_ISDELETED(opaque))
					break;
			}
		}
		else
		{
			/*
			 * Original lastcurrblkno wasn't deleted; the explanation had
			 * better be that the page to the left got split or deleted.
			 * Without this check, we risk going into an infinite loop.
			 */
			if (opaque->btpo_prev == origblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 lastcurrblkno, RelationGetRelationName(rel));
			/* Okay to try again, since left sibling link changed */
		}

		/*
		 * Original lastcurrblkno from caller was concurrently deleted (could
		 * also have been a great many concurrent left sibling page splits).
		 * Found a non-deleted page that should now act as our lastcurrblkno.
		 */
		if (P_LEFTMOST(opaque))
		{
			/* New lastcurrblkno has no left sibling (concurrently deleted) */
			_bt_relbuf(rel, buf);
			break;
		}

		/* Start from scratch with new lastcurrblkno's blkno/prev link */
		*blkno = origblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
	}

	return InvalidBuffer;
}
2603 : :
2604 : : /*
2605 : : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2606 : : *
2607 : : * If the index is empty, we will return InvalidBuffer; any other failure
2608 : : * condition causes ereport(). We will not return a dead page.
2609 : : *
2610 : : * The returned buffer is pinned and read-locked.
2611 : : */
2612 : : Buffer
729 tmunro@postgresql.or 2613 :CBC 43310 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2614 : : {
2615 : : Buffer buf;
2616 : : Page page;
2617 : : BTPageOpaque opaque;
2618 : : OffsetNumber offnum;
2619 : : BlockNumber blkno;
2620 : : IndexTuple itup;
2621 : :
2622 : : /*
2623 : : * If we are looking for a leaf page, okay to descend from fast root;
2624 : : * otherwise better descend from true root. (There is no point in being
2625 : : * smarter about intermediate levels.)
2626 : : */
8233 tgl@sss.pgh.pa.us 2627 [ + + ]: 43310 : if (level == 0)
819 pg@bowt.ie 2628 : 43298 : buf = _bt_getroot(rel, NULL, BT_READ);
2629 : : else
2630 : 12 : buf = _bt_gettrueroot(rel);
2631 : :
8233 tgl@sss.pgh.pa.us 2632 [ + + ]: 43310 : if (!BufferIsValid(buf))
2633 : 3601 : return InvalidBuffer;
2634 : :
3426 kgrittn@postgresql.o 2635 : 39709 : page = BufferGetPage(buf);
1254 michael@paquier.xyz 2636 : 39709 : opaque = BTPageGetOpaque(page);
2637 : :
2638 : : for (;;)
2639 : : {
2640 : : /*
2641 : : * If we landed on a deleted page, step right to find a live page
2642 : : * (there must be one). Also, if we want the rightmost page, step
2643 : : * right if needed to get to it (this could happen if the page split
2644 : : * since we obtained a pointer to it).
2645 : : */
8232 tgl@sss.pgh.pa.us 2646 [ - + + + ]: 50866 : while (P_IGNORE(opaque) ||
8233 2647 [ - + ]: 33 : (rightmost && !P_RIGHTMOST(opaque)))
2648 : : {
8233 tgl@sss.pgh.pa.us 2649 :UBC 0 : blkno = opaque->btpo_next;
2650 [ # # ]: 0 : if (blkno == P_NONE)
6459 2651 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
2652 : : RelationGetRelationName(rel));
7808 2653 : 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
3426 kgrittn@postgresql.o 2654 : 0 : page = BufferGetPage(buf);
1254 michael@paquier.xyz 2655 : 0 : opaque = BTPageGetOpaque(page);
2656 : : }
2657 : :
2658 : : /* Done? */
1655 pg@bowt.ie 2659 [ + + ]:CBC 50866 : if (opaque->btpo_level == level)
8233 tgl@sss.pgh.pa.us 2660 : 39709 : break;
1655 pg@bowt.ie 2661 [ - + ]: 11157 : if (opaque->btpo_level < level)
2228 peter@eisentraut.org 2662 [ # # ]:UBC 0 : ereport(ERROR,
2663 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2664 : : errmsg_internal("btree level %u not found in index \"%s\"",
2665 : : level, RelationGetRelationName(rel))));
2666 : :
2667 : : /* Descend to leftmost or rightmost child page */
8233 tgl@sss.pgh.pa.us 2668 [ + + ]:CBC 11157 : if (rightmost)
2669 : 3 : offnum = PageGetMaxOffsetNumber(page);
2670 : : else
2671 [ + + ]: 11154 : offnum = P_FIRSTDATAKEY(opaque);
2672 : :
7164 2673 : 11157 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2091 pg@bowt.ie 2674 : 11157 : blkno = BTreeTupleGetDownLink(itup);
2675 : :
7808 tgl@sss.pgh.pa.us 2676 : 11157 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
3426 kgrittn@postgresql.o 2677 : 11157 : page = BufferGetPage(buf);
1254 michael@paquier.xyz 2678 : 11157 : opaque = BTPageGetOpaque(page);
2679 : : }
2680 : :
8233 tgl@sss.pgh.pa.us 2681 : 39709 : return buf;
2682 : : }
2683 : :
2684 : : /*
2685 : : * _bt_endpoint() -- Find the first or last page in the index, and scan
2686 : : * from there to the first key satisfying all the quals.
2687 : : *
2688 : : * This is used by _bt_first() to set up a scan when we've determined
2689 : : * that the scan must start at the beginning or end of the index (for
2690 : : * a forward or backward scan respectively).
2691 : : *
2692 : : * Parallel scan callers must have seized the scan before calling here.
2693 : : * Exit conditions are the same as for _bt_first().
2694 : : */
2695 : : static bool
10651 scrappy@hub.org 2696 : 43298 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2697 : : {
7062 tgl@sss.pgh.pa.us 2698 : 43298 : Relation rel = scan->indexRelation;
2699 : 43298 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2700 : : Page page;
2701 : : BTPageOpaque opaque;
2702 : : OffsetNumber start;
2703 : :
306 pg@bowt.ie 2704 [ + - - + : 43298 : Assert(!BTScanPosIsValid(so->currPos));
- + ]
242 2705 [ - + ]: 43298 : Assert(!so->needPrimScan);
2706 : :
2707 : : /*
2708 : : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2709 : : * version of _bt_search().
2710 : : */
323 2711 : 43298 : so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2712 : :
2713 [ + + ]: 43298 : if (!BufferIsValid(so->currPos.buf))
2714 : : {
2715 : : /*
2716 : : * Empty index. Lock the whole relation, as nothing finer to lock
2717 : : * exists.
2718 : : */
5197 heikki.linnakangas@i 2719 : 3601 : PredicateLockRelation(rel, scan->xs_snapshot);
306 pg@bowt.ie 2720 : 3601 : _bt_parallel_done(scan);
8510 tgl@sss.pgh.pa.us 2721 : 3601 : return false;
2722 : : }
2723 : :
323 pg@bowt.ie 2724 : 39697 : page = BufferGetPage(so->currPos.buf);
1254 michael@paquier.xyz 2725 : 39697 : opaque = BTPageGetOpaque(page);
8233 tgl@sss.pgh.pa.us 2726 [ - + ]: 39697 : Assert(P_ISLEAF(opaque));
2727 : :
10226 bruce@momjian.us 2728 [ + + ]: 39697 : if (ScanDirectionIsForward(dir))
2729 : : {
2730 : : /* There could be dead pages to the left, so not this: */
2731 : : /* Assert(P_LEFTMOST(opaque)); */
2732 : :
9178 tgl@sss.pgh.pa.us 2733 [ + + ]: 39667 : start = P_FIRSTDATAKEY(opaque);
2734 : : }
10226 bruce@momjian.us 2735 [ + - ]: 30 : else if (ScanDirectionIsBackward(dir))
2736 : : {
9178 tgl@sss.pgh.pa.us 2737 [ - + ]: 30 : Assert(P_RIGHTMOST(opaque));
2738 : :
2739 : 30 : start = PageGetMaxOffsetNumber(page);
2740 : : }
2741 : : else
2742 : : {
8083 tgl@sss.pgh.pa.us 2743 [ # # ]:UBC 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2744 : : start = 0; /* keep compiler quiet */
2745 : : }
2746 : :
2747 : : /*
2748 : : * Now load data from the first page of the scan.
2749 : : */
323 pg@bowt.ie 2750 [ + + ]:CBC 39697 : if (!_bt_readfirstpage(scan, start, dir))
2751 : 888 : return false;
2752 : :
297 2753 : 38809 : _bt_returnitem(scan, so);
7062 tgl@sss.pgh.pa.us 2754 : 38809 : return true;
2755 : : }
|