Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtsearch.c
4 : : * Search code for postgres btrees.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtsearch.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include "access/nbtree.h"
19 : : #include "access/relscan.h"
20 : : #include "access/xact.h"
21 : : #include "executor/instrument_node.h"
22 : : #include "miscadmin.h"
23 : : #include "pgstat.h"
24 : : #include "storage/predicate.h"
25 : : #include "utils/lsyscache.h"
26 : : #include "utils/rel.h"
27 : :
28 : :
29 : : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
30 : : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
31 : : Buffer buf, bool forupdate, BTStack stack,
32 : : int access);
33 : : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
34 : : static int _bt_binsrch_posting(BTScanInsert key, Page page,
35 : : OffsetNumber offnum);
36 : : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
37 : : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
38 : : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
39 : : ScanDirection dir);
40 : : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
41 : : BlockNumber lastcurrblkno, ScanDirection dir,
42 : : bool seized);
43 : : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
44 : : BlockNumber lastcurrblkno);
45 : : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
46 : :
47 : :
48 : : /*
49 : : * _bt_drop_lock_and_maybe_pin()
50 : : *
51 : : * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
52 : : * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
53 : : */
54 : : static inline void
282 pg@bowt.ie 55 :CBC 5448311 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
56 : : {
57 [ + + ]: 5448311 : if (!so->dropPin)
58 : : {
59 : : /* Just drop the lock (not the pin) */
60 : 261175 : _bt_unlockbuf(rel, so->currPos.buf);
61 : 261175 : return;
62 : : }
63 : :
64 : : /*
65 : : * Drop both the lock and the pin.
66 : : *
67 : : * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
68 : : * when concurrent heap TID recycling by VACUUM might have taken place.
69 : : */
70 : 5187136 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
71 : 5187136 : _bt_relbuf(rel, so->currPos.buf);
72 : 5187136 : so->currPos.buf = InvalidBuffer;
73 : : }
74 : :
75 : : /*
76 : : * _bt_search() -- Search the tree for a particular scankey,
77 : : * or more precisely for the first leaf page it could be on.
78 : : *
79 : : * The passed scankey is an insertion-type scankey (see nbtree/README),
80 : : * but it can omit the rightmost column(s) of the index.
81 : : *
82 : : * If returnstack is true, return value is a stack of parent-page pointers
83 : : * (i.e. there is no entry for the leaf level/page). If returnstack is false,
84 : : * we just return NULL. This scheme allows callers that don't need a descent
85 : : * stack to avoid palloc churn.
86 : : *
87 : : * When we return, *bufP is set to the address of the leaf-page buffer, which
88 : : * is locked and pinned. No locks are held on the parent pages, however!
89 : : *
90 : : * The returned buffer is locked according to access parameter. Additionally,
91 : : * access = BT_WRITE will allow an empty root page to be created and returned.
92 : : * When access = BT_READ, an empty index will result in *bufP being set to
93 : : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
94 : : * during the search will be finished.
95 : : *
96 : : * heaprel must be provided by callers that pass access = BT_WRITE, since we
97 : : * might need to allocate a new root page for caller -- see _bt_allocbuf.
98 : : */
99 : : BTStack
1079 andres@anarazel.de 100 : 12889508 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
101 : : int access, bool returnstack)
102 : : {
9124 bruce@momjian.us 103 : 12889508 : BTStack stack_in = NULL;
2787 akorotkov@postgresql 104 : 12889508 : int page_access = BT_READ;
105 : :
106 : : /* heaprel must be set whenever _bt_allocbuf is reachable */
1009 pg@bowt.ie 107 [ + + - + ]: 12889508 : Assert(access == BT_READ || access == BT_WRITE);
108 [ + + - + ]: 12889508 : Assert(access == BT_READ || heaprel != NULL);
109 : :
110 : : /* Get the root page to start with */
1079 andres@anarazel.de 111 : 12889508 : *bufP = _bt_getroot(rel, heaprel, access);
112 : :
113 : : /* If index is empty and access = BT_READ, no root page is created. */
9124 bruce@momjian.us 114 [ + + ]: 12889508 : if (!BufferIsValid(*bufP))
9368 tgl@sss.pgh.pa.us 115 : 288322 : return (BTStack) NULL;
116 : :
117 : : /* Loop iterates once per level descended in the tree */
118 : : for (;;)
119 : 11033665 : {
120 : : Page page;
121 : : BTPageOpaque opaque;
122 : : OffsetNumber offnum;
123 : : ItemId itemid;
124 : : IndexTuple itup;
125 : : BlockNumber child;
126 : : BTStack new_stack;
127 : :
128 : : /*
129 : : * Race -- the page we just grabbed may have split since we read its
130 : : * downlink in its parent page (or the metapage). If it has, we may
131 : : * need to move right to its new sibling. Do that.
132 : : *
133 : : * In write-mode, allow _bt_moveright to finish any incomplete splits
134 : : * along the way. Strictly speaking, we'd only need to finish an
135 : : * incomplete split on the leaf page we're about to insert to, not on
136 : : * any of the upper levels (internal pages with incomplete splits are
137 : : * also taken care of in _bt_getstackbuf). But this is a good
138 : : * opportunity to finish splits of internal pages too.
139 : : */
1079 andres@anarazel.de 140 : 23634851 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
141 : : stack_in, page_access);
142 : :
143 : : /* if this is a leaf page, we're done */
3616 kgrittn@postgresql.o 144 : 23634851 : page = BufferGetPage(*bufP);
1444 michael@paquier.xyz 145 : 23634851 : opaque = BTPageGetOpaque(page);
9368 tgl@sss.pgh.pa.us 146 [ + + ]: 23634851 : if (P_ISLEAF(opaque))
147 : 12601186 : break;
148 : :
149 : : /*
150 : : * Find the appropriate pivot tuple on this page. Its downlink points
151 : : * to the child page that we're about to descend to.
152 : : */
2552 pg@bowt.ie 153 : 11033665 : offnum = _bt_binsrch(rel, key, *bufP);
9368 tgl@sss.pgh.pa.us 154 : 11033665 : itemid = PageGetItemId(page, offnum);
7354 155 : 11033665 : itup = (IndexTuple) PageGetItem(page, itemid);
2209 pg@bowt.ie 156 [ - + - - ]: 11033665 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
2082 157 : 11033665 : child = BTreeTupleGetDownLink(itup);
158 : :
159 : : /*
160 : : * We need to save the location of the pivot tuple we chose in a new
161 : : * stack entry for this page/level. If caller ends up splitting a
162 : : * page one level down, it usually ends up inserting a new pivot
163 : : * tuple/downlink immediately after the location recorded here.
164 : : */
3 pg@bowt.ie 165 [ + + ]:GNC 11033665 : if (returnstack)
166 : : {
167 : 3837471 : new_stack = (BTStack) palloc_object(BTStackData);
168 : 3837471 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169 : 3837471 : new_stack->bts_offset = offnum;
170 : 3837471 : new_stack->bts_parent = stack_in;
171 : 3837471 : stack_in = new_stack;
172 : : }
173 : :
174 : : /*
175 : : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
176 : : * we're on the level 1 and asked to lock leaf page in write mode,
177 : : * then lock next page in write mode, because it must be a leaf.
178 : : */
1845 pg@bowt.ie 179 [ + + + + ]:CBC 11033665 : if (opaque->btpo_level == 1 && access == BT_WRITE)
2787 akorotkov@postgresql 180 : 3773891 : page_access = BT_WRITE;
181 : :
182 : : /* drop the read lock on the page, then acquire one on its child */
2082 pg@bowt.ie 183 : 11033665 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
184 : :
185 : : /* okay, all set to move down a level */
186 : : }
187 : :
188 : : /*
189 : : * If we're asked to lock leaf in write mode, but didn't manage to, then
190 : : * relock. This should only happen when the root page is a leaf page (and
191 : : * the only page in the index other than the metapage).
192 : : */
2787 akorotkov@postgresql 193 [ + + + + ]: 12601186 : if (access == BT_WRITE && page_access == BT_READ)
194 : : {
195 : : /* trade in our read lock for a write lock */
2063 pg@bowt.ie 196 : 466552 : _bt_unlockbuf(rel, *bufP);
197 : 466552 : _bt_lockbuf(rel, *bufP, BT_WRITE);
198 : :
199 : : /*
200 : : * Race -- the leaf page may have split after we dropped the read lock
201 : : * but before we acquired a write lock. If it has, we may need to
202 : : * move right to its new sibling. Do that.
203 : : */
919 tmunro@postgresql.or 204 : 466552 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
205 : : }
206 : :
9368 tgl@sss.pgh.pa.us 207 : 12601186 : return stack_in;
208 : : }
209 : :
210 : : /*
211 : : * _bt_moveright() -- move right in the btree if necessary.
212 : : *
213 : : * When we follow a pointer to reach a page, it is possible that
214 : : * the page has changed in the meanwhile. If this happens, we're
215 : : * guaranteed that the page has "split right" -- that is, that any
216 : : * data that appeared on the page originally is either on the page
217 : : * or strictly to the right of it.
218 : : *
219 : : * This routine decides whether or not we need to move right in the
220 : : * tree by examining the high key entry on the page. If that entry is
221 : : * strictly less than the scankey, or <= the scankey in the
222 : : * key.nextkey=true case, then we followed the wrong link and we need
223 : : * to move right.
224 : : *
225 : : * The passed insertion-type scankey can omit the rightmost column(s) of the
226 : : * index. (see nbtree/README)
227 : : *
228 : : * When key.nextkey is false (the usual case), we are looking for the first
229 : : * item >= key. When key.nextkey is true, we are looking for the first item
230 : : * strictly greater than key.
231 : : *
232 : : * If forupdate is true, we will attempt to finish any incomplete splits
233 : : * that we encounter. This is required when locking a target page for an
234 : : * insertion, because we don't allow inserting on a page before the split is
235 : : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
236 : : *
237 : : * On entry, we have the buffer pinned and a lock of the type specified by
238 : : * 'access'. If we move right, we release the buffer and lock and acquire
239 : : * the same on the right sibling. Return value is the buffer we stop at.
240 : : */
241 : : static Buffer
10841 scrappy@hub.org 242 : 24101403 : _bt_moveright(Relation rel,
243 : : Relation heaprel,
244 : : BTScanInsert key,
245 : : Buffer buf,
246 : : bool forupdate,
247 : : BTStack stack,
248 : : int access)
249 : : {
250 : : Page page;
251 : : BTPageOpaque opaque;
252 : : int32 cmpval;
253 : :
1009 pg@bowt.ie 254 [ + + - + ]: 24101403 : Assert(!forupdate || heaprel != NULL);
255 : :
256 : : /*
257 : : * When nextkey = false (normal case): if the scan key that brought us to
258 : : * this page is > the high key stored on the page, then the page has split
259 : : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
260 : : * have some duplicates to the right as well as the left, but that's
261 : : * something that's only ever dealt with on the leaf level, after
262 : : * _bt_search has found an initial leaf page.)
263 : : *
264 : : * When nextkey = true: move right if the scan key is >= page's high key.
265 : : * (Note that key.scantid cannot be set in this case.)
266 : : *
267 : : * The page could even have split more than once, so scan as far as
268 : : * needed.
269 : : *
270 : : * We also have to move right if we followed a link that brought us to a
271 : : * dead page.
272 : : */
2552 273 : 24101403 : cmpval = key->nextkey ? 0 : 1;
274 : :
275 : : for (;;)
276 : : {
3616 kgrittn@postgresql.o 277 : 24147422 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 278 : 24147422 : opaque = BTPageGetOpaque(page);
279 : :
4380 heikki.linnakangas@i 280 [ + + ]: 24147422 : if (P_RIGHTMOST(opaque))
281 : 17958945 : break;
282 : :
283 : : /*
284 : : * Finish any incomplete splits we encounter along the way.
285 : : */
286 [ + + + + ]: 6188477 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
4380 heikki.linnakangas@i 287 :GBC 2 : {
288 : 2 : BlockNumber blkno = BufferGetBlockNumber(buf);
289 : :
290 : : /* upgrade our lock if necessary */
291 [ + + ]: 2 : if (access == BT_READ)
292 : : {
2063 pg@bowt.ie 293 : 1 : _bt_unlockbuf(rel, buf);
294 : 1 : _bt_lockbuf(rel, buf, BT_WRITE);
295 : : }
296 : :
4380 heikki.linnakangas@i 297 [ + - ]: 2 : if (P_INCOMPLETE_SPLIT(opaque))
1079 andres@anarazel.de 298 : 2 : _bt_finish_split(rel, heaprel, buf, stack);
299 : : else
4380 heikki.linnakangas@i 300 :UBC 0 : _bt_relbuf(rel, buf);
301 : :
302 : : /* re-acquire the lock in the right mode, and re-check */
1009 pg@bowt.ie 303 :GBC 2 : buf = _bt_getbuf(rel, blkno, access);
4380 heikki.linnakangas@i 304 : 2 : continue;
305 : : }
306 : :
2552 pg@bowt.ie 307 [ + - + + ]:CBC 6188475 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
308 : : {
309 : : /* step right one page */
4380 heikki.linnakangas@i 310 : 46017 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
311 : 46017 : continue;
312 : : }
313 : : else
314 : : break;
315 : : }
316 : :
8422 tgl@sss.pgh.pa.us 317 [ - + ]: 24101403 : if (P_IGNORE(opaque))
6649 tgl@sss.pgh.pa.us 318 [ # # ]:UBC 0 : elog(ERROR, "fell off the end of index \"%s\"",
319 : : RelationGetRelationName(rel));
320 : :
10057 bruce@momjian.us 321 :CBC 24101403 : return buf;
322 : : }
323 : :
324 : : /*
325 : : * _bt_binsrch() -- Do a binary search for a key on a particular page.
326 : : *
327 : : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
328 : : * of the last key < given scankey, or last key <= given scankey if nextkey
329 : : * is true. (Since _bt_compare treats the first data key of such a page as
330 : : * minus infinity, there will be at least one key < scankey, so the result
331 : : * always points at one of the keys on the page.)
332 : : *
333 : : * On a leaf page, _bt_binsrch() returns the final result of the initial
334 : : * positioning process that started with _bt_first's call to _bt_search.
335 : : * We're returning a non-pivot tuple offset, so things are a little different.
336 : : * It is possible that we'll return an offset that's either past the last
337 : : * non-pivot slot, or (in the case of a backward scan) before the first slot.
338 : : *
339 : : * This procedure is not responsible for walking right, it just examines
340 : : * the given page. _bt_binsrch() has no lock or refcount side effects
341 : : * on the buffer.
342 : : */
343 : : static OffsetNumber
10841 scrappy@hub.org 344 : 18608850 : _bt_binsrch(Relation rel,
345 : : BTScanInsert key,
346 : : Buffer buf)
347 : : {
348 : : Page page;
349 : : BTPageOpaque opaque;
350 : : OffsetNumber low,
351 : : high;
352 : : int32 result,
353 : : cmpval;
354 : :
3616 kgrittn@postgresql.o 355 : 18608850 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 356 : 18608850 : opaque = BTPageGetOpaque(page);
357 : :
358 : : /* Requesting nextkey semantics while using scantid seems nonsensical */
2379 pg@bowt.ie 359 [ + + - + ]: 18608850 : Assert(!key->nextkey || key->scantid == NULL);
360 : : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
361 [ + + - + ]: 18608850 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
362 : :
9368 tgl@sss.pgh.pa.us 363 [ + + ]: 18608850 : low = P_FIRSTDATAKEY(opaque);
10416 bruce@momjian.us 364 : 18608850 : high = PageGetMaxOffsetNumber(page);
365 : :
366 : : /*
367 : : * If there are no keys on the page, return the first available slot. Note
368 : : * this covers two cases: the page is really empty (no keys), or it
369 : : * contains only a high key. The latter case is possible after vacuuming.
370 : : * This can never happen on an internal page, however, since they are
371 : : * never empty (an internal page must have at least one child).
372 : : */
2552 pg@bowt.ie 373 [ + + ]: 18608850 : if (unlikely(high < low))
10057 bruce@momjian.us 374 : 5363 : return low;
375 : :
376 : : /*
377 : : * Binary search to find the first key on the page >= scan key, or first
378 : : * key > scankey when nextkey is true.
379 : : *
380 : : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 : : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
382 : : *
383 : : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 : : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
385 : : *
386 : : * We can fall out when high == low.
387 : : */
9739 tgl@sss.pgh.pa.us 388 : 18603487 : high++; /* establish the loop invariant for high */
389 : :
2552 pg@bowt.ie 390 : 18603487 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
391 : :
9739 tgl@sss.pgh.pa.us 392 [ + + ]: 121438850 : while (high > low)
393 : : {
9468 bruce@momjian.us 394 : 102835363 : OffsetNumber mid = low + ((high - low) / 2);
395 : :
396 : : /* We have low <= mid < high, so mid points at a real slot */
397 : :
2552 pg@bowt.ie 398 : 102835363 : result = _bt_compare(rel, key, page, mid);
399 : :
8120 tgl@sss.pgh.pa.us 400 [ + + ]: 102835363 : if (result >= cmpval)
9739 401 : 64832494 : low = mid + 1;
402 : : else
403 : 38002869 : high = mid;
404 : : }
405 : :
406 : : /*
407 : : * At this point we have high == low.
408 : : *
409 : : * On a leaf page we always return the first non-pivot tuple >= scan key
410 : : * (resp. > scan key) for forward scan callers. For backward scans, it's
411 : : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
412 : : */
9368 413 [ + + ]: 18603487 : if (P_ISLEAF(opaque))
414 : : {
415 : : /*
416 : : * In the backward scan case we're supposed to locate the last
417 : : * matching tuple on the leaf level -- not the first matching tuple
418 : : * (the last tuple will be the first one returned by the scan).
419 : : *
420 : : * At this point we've located the first non-pivot tuple immediately
421 : : * after the last matching tuple (which might just be maxoff + 1).
422 : : * Compensate by stepping back.
423 : : */
828 pg@bowt.ie 424 [ + + ]: 7569822 : if (key->backward)
425 : 33287 : return OffsetNumberPrev(low);
426 : :
9739 tgl@sss.pgh.pa.us 427 : 7536535 : return low;
428 : : }
429 : :
430 : : /*
431 : : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
432 : : * There must be one if _bt_compare() is playing by the rules.
433 : : *
434 : : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
435 : : * a truncated -inf heap TID is usually enough to prevent it altogether.
436 : : * Even omitted scan key entries are treated as > truncated attributes.
437 : : *
438 : : * However, during backward scans _bt_compare() interprets omitted scan
439 : : * key attributes as == corresponding truncated -inf attributes instead.
440 : : * This works just like < would work here. Under this scheme, < strategy
441 : : * backward scans will always directly descend to the correct leaf page.
442 : : * In particular, they will never incur an "extra" leaf page access with a
443 : : * scan key that happens to contain the same prefix of values as some
444 : : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
445 : : * it uses a leaf page high key to "re-find" a page undergoing deletion.
446 : : */
9368 447 [ + + - + ]: 11033665 : Assert(low > P_FIRSTDATAKEY(opaque));
448 : :
9739 449 : 11033665 : return OffsetNumberPrev(low);
450 : : }
451 : :
452 : : /*
453 : : *
454 : : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
455 : : *
456 : : * Like _bt_binsrch(), but with support for caching the binary search
457 : : * bounds. Only used during insertion, and only on the leaf page that it
458 : : * looks like caller will insert tuple on. Exclusive-locked and pinned
459 : : * leaf page is contained within insertstate.
460 : : *
461 : : * Caches the bounds fields in insertstate so that a subsequent call can
462 : : * reuse the low and strict high bounds of original binary search. Callers
463 : : * that use these fields directly must be prepared for the case where low
464 : : * and/or stricthigh are not on the same page (one or both exceed maxoff
465 : : * for the page). The case where there are no items on the page (high <
466 : : * low) makes bounds invalid.
467 : : *
468 : : * Caller is responsible for invalidating bounds when it modifies the page
469 : : * before calling here a second time, and for dealing with posting list
470 : : * tuple matches (callers can use insertstate's postingoff field to
471 : : * determine which existing heap TID will need to be replaced by a posting
472 : : * list split).
473 : : */
474 : : OffsetNumber
2552 pg@bowt.ie 475 : 8102313 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
476 : : {
477 : 8102313 : BTScanInsert key = insertstate->itup_key;
478 : : Page page;
479 : : BTPageOpaque opaque;
480 : : OffsetNumber low,
481 : : high,
482 : : stricthigh;
483 : : int32 result,
484 : : cmpval;
485 : :
486 : 8102313 : page = BufferGetPage(insertstate->buf);
1444 michael@paquier.xyz 487 : 8102313 : opaque = BTPageGetOpaque(page);
488 : :
2552 pg@bowt.ie 489 [ - + ]: 8102313 : Assert(P_ISLEAF(opaque));
490 [ - + ]: 8102313 : Assert(!key->nextkey);
2209 491 [ - + ]: 8102313 : Assert(insertstate->postingoff == 0);
492 : :
2552 493 [ + + ]: 8102313 : if (!insertstate->bounds_valid)
494 : : {
495 : : /* Start new binary search */
496 [ + + ]: 5068905 : low = P_FIRSTDATAKEY(opaque);
497 : 5068905 : high = PageGetMaxOffsetNumber(page);
498 : : }
499 : : else
500 : : {
501 : : /* Restore result of previous binary search against same page */
502 : 3033408 : low = insertstate->low;
503 : 3033408 : high = insertstate->stricthigh;
504 : : }
505 : :
506 : : /* If there are no keys on the page, return the first available slot */
507 [ + + ]: 8102313 : if (unlikely(high < low))
508 : : {
509 : : /* Caller can't reuse bounds */
510 : 11921 : insertstate->low = InvalidOffsetNumber;
511 : 11921 : insertstate->stricthigh = InvalidOffsetNumber;
512 : 11921 : insertstate->bounds_valid = false;
513 : 11921 : return low;
514 : : }
515 : :
516 : : /*
517 : : * Binary search to find the first key on the page >= scan key. (nextkey
518 : : * is always false when inserting).
519 : : *
520 : : * The loop invariant is: all slots before 'low' are < scan key, all slots
521 : : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
522 : : * maintained to save additional search effort for caller.
523 : : *
524 : : * We can fall out when high == low.
525 : : */
526 [ + + ]: 8090392 : if (!insertstate->bounds_valid)
527 : 5056984 : high++; /* establish the loop invariant for high */
528 : 8090392 : stricthigh = high; /* high initially strictly higher */
529 : :
530 : 8090392 : cmpval = 1; /* !nextkey comparison value */
531 : :
532 [ + + ]: 45829321 : while (high > low)
533 : : {
534 : 37738929 : OffsetNumber mid = low + ((high - low) / 2);
535 : :
536 : : /* We have low <= mid < high, so mid points at a real slot */
537 : :
538 : 37738929 : result = _bt_compare(rel, key, page, mid);
539 : :
540 [ + + ]: 37738929 : if (result >= cmpval)
541 : 27925665 : low = mid + 1;
542 : : else
543 : : {
544 : 9813264 : high = mid;
545 [ + + ]: 9813264 : if (result != 0)
546 : 8668550 : stricthigh = high;
547 : : }
548 : :
549 : : /*
550 : : * If tuple at offset located by binary search is a posting list whose
551 : : * TID range overlaps with caller's scantid, perform posting list
552 : : * binary search to set postingoff for caller. Caller must split the
553 : : * posting list when postingoff is set. This should happen
554 : : * infrequently.
555 : : */
2209 556 [ + + + + : 37738929 : if (unlikely(result == 0 && key->scantid != NULL))
+ + ]
557 : : {
558 : : /*
559 : : * postingoff should never be set more than once per leaf page
560 : : * binary search. That would mean that there are duplicate table
561 : : * TIDs in the index, which is never okay. Check for that here.
562 : : */
1600 563 [ - + ]: 798898 : if (insertstate->postingoff != 0)
1600 pg@bowt.ie 564 [ # # ]:UBC 0 : ereport(ERROR,
565 : : (errcode(ERRCODE_INDEX_CORRUPTED),
566 : : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
567 : : ItemPointerGetBlockNumber(key->scantid),
568 : : ItemPointerGetOffsetNumber(key->scantid),
569 : : low, stricthigh,
570 : : BufferGetBlockNumber(insertstate->buf),
571 : : RelationGetRelationName(rel))));
572 : :
2209 pg@bowt.ie 573 :CBC 798898 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
574 : : }
575 : : }
576 : :
577 : : /*
578 : : * On a leaf page, a binary search always returns the first key >= scan
579 : : * key (at least in !nextkey case), which could be the last slot + 1. This
580 : : * is also the lower bound of cached search.
581 : : *
582 : : * stricthigh may also be the last slot + 1, which prevents caller from
583 : : * using bounds directly, but is still useful to us if we're called a
584 : : * second time with cached bounds (cached low will be < stricthigh when
585 : : * that happens).
586 : : */
2552 587 : 8090392 : insertstate->low = low;
588 : 8090392 : insertstate->stricthigh = stricthigh;
589 : 8090392 : insertstate->bounds_valid = true;
590 : :
591 : 8090392 : return low;
592 : : }
593 : :
594 : : /*----------
595 : : * _bt_binsrch_posting() -- posting list binary search.
596 : : *
597 : : * Helper routine for _bt_binsrch_insert().
598 : : *
599 : : * Returns offset into posting list where caller's scantid belongs.
600 : : *----------
601 : : */
602 : : static int
2209 603 : 798898 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
604 : : {
605 : : IndexTuple itup;
606 : : ItemId itemid;
607 : : int low,
608 : : high,
609 : : mid,
610 : : res;
611 : :
612 : : /*
613 : : * If this isn't a posting tuple, then the index must be corrupt (if it is
614 : : * an ordinary non-pivot tuple then there must be an existing tuple with a
615 : : * heap TID that equals inserter's new heap TID/scantid). Defensively
616 : : * check that tuple is a posting list tuple whose posting list range
617 : : * includes caller's scantid.
618 : : *
619 : : * (This is also needed because contrib/amcheck's rootdescend option needs
620 : : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
621 : : */
622 : 798898 : itemid = PageGetItemId(page, offnum);
623 : 798898 : itup = (IndexTuple) PageGetItem(page, itemid);
624 [ + + ]: 798898 : if (!BTreeTupleIsPosting(itup))
625 : 782485 : return 0;
626 : :
627 [ + - - + ]: 16413 : Assert(key->heapkeyspace && key->allequalimage);
628 : :
629 : : /*
630 : : * In the event that posting list tuple has LP_DEAD bit set, indicate this
631 : : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
632 : : * second call to _bt_binsrch_insert() can take place when its caller has
633 : : * removed the dead item.
634 : : */
635 [ + + ]: 16413 : if (ItemIdIsDead(itemid))
2209 pg@bowt.ie 636 :GBC 5 : return -1;
637 : :
638 : : /* "high" is past end of posting list for loop invariant */
2209 pg@bowt.ie 639 :CBC 16408 : low = 0;
640 : 16408 : high = BTreeTupleGetNPosting(itup);
641 [ - + ]: 16408 : Assert(high >= 2);
642 : :
643 [ + + ]: 132920 : while (high > low)
644 : : {
645 : 116514 : mid = low + ((high - low) / 2);
646 : 116514 : res = ItemPointerCompare(key->scantid,
2209 pg@bowt.ie 647 :GIC 116514 : BTreeTupleGetPostingN(itup, mid));
648 : :
2209 pg@bowt.ie 649 [ + + ]:CBC 116514 : if (res > 0)
650 : 61016 : low = mid + 1;
651 [ + + ]: 55498 : else if (res < 0)
652 : 55496 : high = mid;
653 : : else
2209 pg@bowt.ie 654 :GBC 2 : return mid;
655 : : }
656 : :
657 : : /* Exact match not found */
2209 pg@bowt.ie 658 :CBC 16406 : return low;
659 : : }
660 : :
661 : : /*----------
662 : : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
663 : : *
664 : : * page/offnum: location of btree item to be compared to.
665 : : *
666 : : * This routine returns:
667 : : * <0 if scankey < tuple at offnum;
668 : : * 0 if scankey == tuple at offnum;
669 : : * >0 if scankey > tuple at offnum.
670 : : *
671 : : * NULLs in the keys are treated as sortable values. Therefore
672 : : * "equality" does not necessarily mean that the item should be returned
673 : : * to the caller as a matching key. Similarly, an insertion scankey
674 : : * with its scantid set is treated as equal to a posting tuple whose TID
675 : : * range overlaps with their scantid. There generally won't be a
676 : : * matching TID in the posting tuple, which caller must handle
677 : : * themselves (e.g., by splitting the posting list tuple).
678 : : *
679 : : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
680 : : * "minus infinity": this routine will always claim it is less than the
681 : : * scankey. The actual key value stored is explicitly truncated to 0
682 : : * attributes (explicitly minus infinity) with version 3+ indexes, but
683 : : * that isn't relied upon. This allows us to implement the Lehman and
684 : : * Yao convention that the first down-link pointer is before the first
685 : : * key. See backend/access/nbtree/README for details.
686 : : *----------
687 : : */
688 : : int32
10841 scrappy@hub.org 689 : 160142855 : _bt_compare(Relation rel,
690 : : BTScanInsert key,
691 : : Page page,
692 : : OffsetNumber offnum)
693 : : {
9368 tgl@sss.pgh.pa.us 694 : 160142855 : TupleDesc itupdesc = RelationGetDescr(rel);
1444 michael@paquier.xyz 695 : 160142855 : BTPageOpaque opaque = BTPageGetOpaque(page);
696 : : IndexTuple itup;
697 : : ItemPointer heapTid;
698 : : ScanKey scankey;
699 : : int ncmpkey;
700 : : int ntupatts;
701 : : int32 result;
702 : :
2552 pg@bowt.ie 703 [ - + ]: 160142855 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
704 [ - + ]: 160142855 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
705 [ - + - - ]: 160142855 : Assert(key->heapkeyspace || key->scantid == NULL);
706 : :
707 : : /*
708 : : * Force result ">" if target item is first data item on an internal page
709 : : * --- see NOTE above.
710 : : */
9124 bruce@momjian.us 711 [ + + + + : 160142855 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
+ + ]
10057 712 : 2112719 : return 1;
713 : :
7354 tgl@sss.pgh.pa.us 714 : 158030136 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2552 pg@bowt.ie 715 [ + + ]: 158030136 : ntupatts = BTreeTupleGetNAtts(itup, rel);
716 : :
717 : : /*
718 : : * The scan key is set up with the attribute number associated with each
719 : : * term in the key. It is important that, if the index is multi-key, the
720 : : * scan contain the first k key attributes, and that they be in order. If
721 : : * you think about how multi-key ordering works, you'll understand why
722 : : * this is.
723 : : *
724 : : * We don't test for violation of this condition here, however. The
725 : : * initial setup for the index scan had better have gotten it right (see
726 : : * _bt_first).
727 : : */
728 : :
729 : 158030136 : ncmpkey = Min(ntupatts, key->keysz);
730 [ - + - - ]: 158030136 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
2209 731 [ + + - + ]: 158030136 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
2552 732 : 158030136 : scankey = key->scankeys;
733 [ + + ]: 195131718 : for (int i = 1; i <= ncmpkey; i++)
734 : : {
735 : : Datum datum;
736 : : bool isNull;
737 : :
8159 tgl@sss.pgh.pa.us 738 : 182108334 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
739 : :
3189 740 [ + + ]: 182108334 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
741 : : {
9368 742 [ + + ]: 313653 : if (isNull)
9420 743 : 78692 : result = 0; /* NULL "=" NULL */
7005 744 [ + + ]: 234961 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
745 : 312 : result = -1; /* NULL "<" NOT_NULL */
746 : : else
9420 747 : 234649 : result = 1; /* NULL ">" NOT_NULL */
748 : : }
9368 749 [ + + ]: 181794681 : else if (isNull) /* key is NOT_NULL and item is NULL */
750 : : {
7005 751 [ - + ]: 174 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
7005 tgl@sss.pgh.pa.us 752 :UBC 0 : result = 1; /* NOT_NULL ">" NULL */
753 : : else
7005 tgl@sss.pgh.pa.us 754 :CBC 174 : result = -1; /* NOT_NULL "<" NULL */
755 : : }
756 : : else
757 : : {
758 : : /*
759 : : * The sk_func needs to be passed the index value as left arg and
760 : : * the sk_argument as right arg (they might be of different
761 : : * types). Since it is convenient for callers to think of
762 : : * _bt_compare as comparing the scankey to the index item, we have
763 : : * to flip the sign of the comparison result. (Unless it's a DESC
764 : : * column, in which case we *don't* flip the sign.)
765 : : */
5451 766 : 181794507 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
767 : : scankey->sk_collation,
768 : : datum,
769 : : scankey->sk_argument));
770 : :
7005 771 [ + + ]: 181794507 : if (!(scankey->sk_flags & SK_BT_DESC))
2718 772 [ + + ]: 181794474 : INVERT_COMPARE_RESULT(result);
773 : : }
774 : :
775 : : /* if the keys are unequal, return the difference */
10416 bruce@momjian.us 776 [ + + ]: 182108334 : if (result != 0)
10057 777 : 145006752 : return result;
778 : :
8159 tgl@sss.pgh.pa.us 779 : 37101582 : scankey++;
780 : : }
781 : :
782 : : /*
783 : : * All non-truncated attributes (other than heap TID) were found to be
784 : : * equal. Treat truncated attributes as minus infinity when scankey has a
785 : : * key attribute value that would otherwise be compared directly.
786 : : *
787 : : * Note: it doesn't matter if ntupatts includes non-key attributes;
788 : : * scankey won't, so explicitly excluding non-key attributes isn't
789 : : * necessary.
790 : : */
2552 pg@bowt.ie 791 [ + + ]: 13023384 : if (key->keysz > ntupatts)
792 : 97471 : return 1;
793 : :
794 : : /*
795 : : * Use the heap TID attribute and scantid to try to break the tie. The
796 : : * rules are the same as any other key attribute -- only the
797 : : * representation differs.
798 : : */
799 : 12925913 : heapTid = BTreeTupleGetHeapTID(itup);
800 [ + + ]: 12925913 : if (key->scantid == NULL)
801 : : {
802 : : /*
803 : : * Forward scans have a scankey that is considered greater than a
804 : : * truncated pivot tuple if and when the scankey has equal values for
805 : : * attributes up to and including the least significant untruncated
806 : : * attribute in tuple. Even attributes that were omitted from the
807 : : * scan key are considered greater than -inf truncated attributes.
808 : : * (See _bt_binsrch for an explanation of our backward scan behavior.)
809 : : *
810 : : * For example, if an index has the minimum two attributes (single
811 : : * user key attribute, plus heap TID attribute), and a page's high key
812 : : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
813 : : * will not descend to the page to the left. The search will descend
814 : : * right instead. The truncated attribute in pivot tuple means that
815 : : * all non-pivot tuples on the page to the left are strictly < 'foo',
816 : : * so it isn't necessary to descend left. In other words, search
817 : : * doesn't have to descend left because it isn't interested in a match
818 : : * that has a heap TID value of -inf.
819 : : *
820 : : * Note: the heap TID part of the test ensures that scankey is being
821 : : * compared to a pivot tuple with one or more truncated -inf key
822 : : * attributes. The heap TID attribute is the last key attribute in
823 : : * every index, of course, but other than that it isn't special.
824 : : */
828 825 [ + + + + : 9408617 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
+ + ]
826 [ + - ]: 5415 : key->heapkeyspace)
2552 827 : 5415 : return 1;
828 : :
829 : : /* All provided scankey arguments found to be equal */
830 : 9403202 : return 0;
831 : : }
832 : :
833 : : /*
834 : : * Treat truncated heap TID as minus infinity, since scankey has a key
835 : : * attribute value (scantid) that would otherwise be compared directly
836 : : */
837 [ - + ]: 3517296 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
838 [ + + ]: 3517296 : if (heapTid == NULL)
839 : 3675 : return 1;
840 : :
841 : : /*
842 : : * Scankey must be treated as equal to a posting list tuple if its scantid
843 : : * value falls within the range of the posting list. In all other cases
844 : : * there can only be a single heap TID value, which is compared directly
845 : : * with scantid.
846 : : */
847 [ - + ]: 3513621 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
2209 848 : 3513621 : result = ItemPointerCompare(key->scantid, heapTid);
849 [ + + + + ]: 3513621 : if (result <= 0 || !BTreeTupleIsPosting(itup))
850 : 3420507 : return result;
851 : : else
852 : : {
853 : 93114 : result = ItemPointerCompare(key->scantid,
2209 pg@bowt.ie 854 :GIC 93114 : BTreeTupleGetMaxHeapTID(itup));
2209 pg@bowt.ie 855 [ + + ]:CBC 93114 : if (result > 0)
856 : 76703 : return 1;
857 : : }
858 : :
859 : 16411 : return 0;
860 : : }
861 : :
862 : : /*
863 : : * _bt_first() -- Find the first item in a scan.
864 : : *
865 : : * We need to be clever about the direction of scan, the search
866 : : * conditions, and the tree ordering. We find the first item (or,
867 : : * if backwards scan, the last item) in the tree that satisfies the
868 : : * qualifications in the scan key. On success exit, data about the
869 : : * matching tuple(s) on the page has been loaded into so->currPos. We'll
870 : : * drop all locks and hold onto a pin on page's buffer, except during
871 : : * so->dropPin scans, when we drop both the lock and the pin.
872 : : * _bt_returnitem sets the next item to return to scan on success exit.
873 : : *
874 : : * If there are no matching items in the index, we return false, with no
875 : : * pins or locks held. so->currPos will remain invalid.
876 : : *
877 : : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
878 : : * are both search-type scankeys (see nbtree/README for more about this).
879 : : * Within this routine, we build a temporary insertion-type scankey to use
880 : : * in locating the scan start position.
881 : : */
882 : : bool
10841 scrappy@hub.org 883 : 7910153 : _bt_first(IndexScanDesc scan, ScanDirection dir)
884 : : {
8696 tgl@sss.pgh.pa.us 885 : 7910153 : Relation rel = scan->indexRelation;
886 : 7910153 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
887 : : OffsetNumber offnum;
888 : : BTScanInsertData inskey;
889 : : ScanKey startKeys[INDEX_MAX_KEYS];
890 : : ScanKeyData notnullkey;
828 pg@bowt.ie 891 : 7910153 : int keysz = 0;
242 pg@bowt.ie 892 :GNC 7910153 : StrategyNumber strat_total = InvalidStrategy;
432 pg@bowt.ie 893 :CBC 7910153 : BlockNumber blkno = InvalidBlockNumber,
894 : : lastcurrblkno;
895 : :
4008 kgrittn@postgresql.o 896 [ + - - + : 7910153 : Assert(!BTScanPosIsValid(so->currPos));
- + ]
897 : :
898 : : /*
899 : : * Examine the scan keys and eliminate any redundant keys; also mark the
900 : : * keys that must be matched to continue the scan.
901 : : */
8159 tgl@sss.pgh.pa.us 902 : 7910153 : _bt_preprocess_keys(scan);
903 : :
904 : : /*
905 : : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 : : * never be satisfied (eg, x == 1 AND x > 2).
907 : : */
9124 bruce@momjian.us 908 [ + + ]: 7910153 : if (!so->qual_ok)
909 : : {
432 pg@bowt.ie 910 [ - + ]: 1139 : Assert(!so->needPrimScan);
2005 akapila@postgresql.o 911 : 1139 : _bt_parallel_done(scan);
8700 tgl@sss.pgh.pa.us 912 : 1139 : return false;
913 : : }
914 : :
915 : : /*
916 : : * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 : : * will likely release the parallel scan later on.
918 : : */
432 pg@bowt.ie 919 [ + + ]: 7909014 : if (scan->parallel_scan != NULL &&
920 [ + + ]: 223 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 : 114 : return false;
922 : :
923 : : /*
924 : : * Initialize the scan's arrays (if any) for the current scan direction
925 : : * (except when they were already set to later values as part of
926 : : * scheduling the primitive index scan that is now underway)
927 : : */
928 [ + + + + ]: 7908900 : if (so->numArrayKeys && !so->needPrimScan)
929 : 35697 : _bt_start_array_keys(scan, dir);
930 : :
931 [ + + ]: 7908900 : if (blkno != InvalidBlockNumber)
932 : : {
933 : : /*
934 : : * We anticipated calling _bt_search, but another worker bet us to it.
935 : : * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 : : */
937 [ - + ]: 47 : Assert(scan->parallel_scan != NULL);
938 [ - + ]: 47 : Assert(!so->needPrimScan);
492 939 [ - + ]: 47 : Assert(blkno != P_NONE);
940 : :
432 941 [ - + ]: 47 : if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
432 pg@bowt.ie 942 :UBC 0 : return false;
943 : :
432 pg@bowt.ie 944 :CBC 47 : _bt_returnitem(scan, so);
945 : 47 : return true;
946 : : }
947 : :
948 : : /*
949 : : * Count an indexscan for stats, now that we know that we'll call
950 : : * _bt_search/_bt_endpoint below
951 : : */
541 952 [ + + + + : 7908853 : pgstat_count_index_scan(rel);
+ + ]
369 953 [ + + ]: 7908853 : if (scan->instrument)
954 : 543227 : scan->instrument->nsearches++;
955 : :
956 : : /*----------
957 : : * Examine the scan keys to discover where we need to start the scan.
958 : : * The selected scan keys (at most one per index column) are remembered by
959 : : * storing their addresses into the local startKeys[] array. The final
960 : : * startKeys[] entry's strategy is set in strat_total. (Actually, there
961 : : * are a couple of cases where we force a less/more restrictive strategy.)
962 : : *
963 : : * We must use the key that was marked required (in the direction opposite
964 : : * our own scan's) during preprocessing. Each index attribute can only
965 : : * have one such required key. In general, the keys that we use to find
966 : : * an initial position when scanning forwards are the same keys that end
967 : : * the scan on the leaf level when scanning backwards (and vice-versa).
968 : : *
969 : : * When the scan keys include cross-type operators, _bt_preprocess_keys
970 : : * may not be able to eliminate redundant keys; in such cases it will
971 : : * arbitrarily pick a usable key for each attribute (and scan direction),
972 : : * ensuring that there is no more than one key required in each direction.
973 : : * We stop considering further keys once we reach the first nonrequired
974 : : * key (which must come after all required keys), so this can't affect us.
975 : : *
976 : : * The required keys that we use as starting boundaries have to be =, >,
977 : : * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
978 : : * We can use keys for multiple attributes so long as the prior attributes
979 : : * had only =, >= (resp. =, <=) keys. These rules are very similar to the
980 : : * rules that preprocessing used to determine which keys to mark required.
981 : : * We cannot always use every required key as a positioning key, though.
982 : : * Skip arrays necessitate independently applying our own rules here.
983 : : * Skip arrays are always generally considered = array keys, but we'll
984 : : * nevertheless treat them as inequalities at certain points of the scan.
985 : : * When that happens, it _might_ have implications for the number of
986 : : * required keys that we can safely use for initial positioning purposes.
987 : : *
988 : : * For example, a forward scan with a skip array on its leading attribute
989 : : * (with no low_compare/high_compare) will have at least two required scan
990 : : * keys, but we won't use any of them as boundary keys during the scan's
991 : : * initial call here. Our positioning key during the first call here can
992 : : * be thought of as representing "> -infinity". Similarly, if such a skip
993 : : * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
994 : : * during the scan's initial call here; a lower-order key such as "b = 42"
995 : : * can't be used until the "a" array advances beyond MINVAL/low_compare.
996 : : *
997 : : * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
998 : : * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
999 : : * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
1000 : : * that we treat = and >= as equivalent when scanning forwards (just as we
1001 : : * treat = and <= as equivalent when scanning backwards). We effectively
1002 : : * do the same thing (though with a distinct "a" element/value) each time.
1003 : : *
1004 : : * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1005 : : * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1006 : : * If the index stores nulls at the end of the index we'll be starting
1007 : : * from, and we have no boundary key for the column (which means the key
1008 : : * we deduced NOT NULL from is an inequality key that constrains the other
1009 : : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1010 : : * use as a boundary key. If we didn't do this, we might find ourselves
1011 : : * traversing a lot of null entries at the start of the scan.
1012 : : *
1013 : : * In this loop, row-comparison keys are treated the same as keys on their
1014 : : * first (leftmost) columns. We'll add all lower-order columns of the row
1015 : : * comparison that were marked required during preprocessing below.
1016 : : *
1017 : : * _bt_advance_array_keys needs to know exactly how we'll reposition the
1018 : : * scan (should it opt to schedule another primitive index scan). It is
1019 : : * critical that primscans only be scheduled when they'll definitely make
1020 : : * some useful progress. _bt_advance_array_keys does this by calling
1021 : : * _bt_checkkeys routines that report whether a tuple is past the end of
1022 : : * matches for the scan's keys (given the scan's current array elements).
1023 : : * If the page's final tuple is "after the end of matches" for a scan that
1024 : : * uses the *opposite* scan direction, then it must follow that it's also
1025 : : * "before the start of matches" for the actual current scan direction.
1026 : : * It is therefore essential that all of our initial positioning rules are
1027 : : * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1028 : : * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1029 : : * need to be kept in sync.
1030 : : *----------
1031 : : */
9364 tgl@sss.pgh.pa.us 1032 [ + + ]: 7908853 : if (so->numberOfKeys > 0)
1033 : : {
1034 : : AttrNumber curattr;
1035 : : ScanKey bkey;
1036 : : ScanKey impliesNN;
1037 : : ScanKey cur;
1038 : :
1039 : : /*
1040 : : * bkey will be set to the key that preprocessing left behind as the
1041 : : * boundary key for this attribute, in this scan direction (if any)
1042 : : */
492 pg@bowt.ie 1043 : 7902077 : cur = so->keyData;
8159 tgl@sss.pgh.pa.us 1044 : 7902077 : curattr = 1;
256 pg@bowt.ie 1045 : 7902077 : bkey = NULL;
1046 : : /* Also remember any scankey that implies a NOT NULL constraint */
5247 tgl@sss.pgh.pa.us 1047 : 7902077 : impliesNN = NULL;
1048 : :
1049 : : /*
1050 : : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1051 : : * pass to handle after-last-key processing. Actual exit from the
1052 : : * loop is at one of the "break" statements below.
1053 : : */
492 pg@bowt.ie 1054 : 20617663 : for (int i = 0;; cur++, i++)
1055 : : {
8159 tgl@sss.pgh.pa.us 1056 [ + + + + ]: 20617663 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1057 : : {
1058 : : /* Done looking for the curattr boundary key */
256 pg@bowt.ie 1059 [ + + + - : 12714665 : Assert(bkey == NULL ||
- + ]
1060 : : (bkey->sk_attno == curattr &&
1061 : : (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1062 [ + + + - : 12714665 : Assert(impliesNN == NULL ||
- + ]
1063 : : (impliesNN->sk_attno == curattr &&
1064 : : (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1065 : :
1066 : : /*
1067 : : * If this is a scan key for a skip array whose current
1068 : : * element is MINVAL, choose low_compare (when scanning
1069 : : * backwards it'll be MAXVAL, and we'll choose high_compare).
1070 : : *
1071 : : * Note: if the array's low_compare key makes 'bkey' NULL,
1072 : : * then we behave as if the array's first element is -inf,
1073 : : * except when !array->null_elem implies a usable NOT NULL
1074 : : * constraint.
1075 : : */
1076 [ + + ]: 12714665 : if (bkey != NULL &&
1077 [ + + ]: 12675223 : (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1078 : : {
1079 : 1882 : int ikey = bkey - so->keyData;
1080 : 1882 : ScanKey skipequalitykey = bkey;
345 1081 : 1882 : BTArrayKeyInfo *array = NULL;
1082 : :
1083 [ + - ]: 1938 : for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1084 : : {
1085 : 1938 : array = &so->arrayKeys[arridx];
1086 [ + + ]: 1938 : if (array->scan_key == ikey)
1087 : 1882 : break;
1088 : : }
1089 : :
1090 [ + + ]: 1882 : if (ScanDirectionIsForward(dir))
1091 : : {
1092 [ - + ]: 1873 : Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
256 1093 : 1873 : bkey = array->low_compare;
1094 : : }
1095 : : else
1096 : : {
345 1097 [ - + ]: 9 : Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
256 1098 : 9 : bkey = array->high_compare;
1099 : : }
1100 : :
1101 [ + + - + ]: 1882 : Assert(bkey == NULL ||
1102 : : bkey->sk_attno == skipequalitykey->sk_attno);
1103 : :
345 1104 [ + + ]: 1882 : if (!array->null_elem)
1105 : 63 : impliesNN = skipequalitykey;
1106 : : else
256 1107 [ + - - + ]: 1819 : Assert(bkey == NULL && impliesNN == NULL);
1108 : : }
1109 : :
1110 : : /*
1111 : : * If we didn't find a usable boundary key, see if we can
1112 : : * deduce a NOT NULL key
1113 : : */
1114 [ + + + + : 12754137 : if (bkey == NULL && impliesNN != NULL &&
+ + ]
5247 tgl@sss.pgh.pa.us 1115 [ + + ]: 39472 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1116 : : ScanDirectionIsForward(dir) :
1117 : : ScanDirectionIsBackward(dir)))
1118 : : {
1119 : : /* Final startKeys[] entry will be deduced NOT NULL key */
242 pg@bowt.ie 1120 :GNC 15 : bkey = ¬nullkey;
256 pg@bowt.ie 1121 [ + + ]:CBC 15 : ScanKeyEntryInitialize(bkey,
1122 : : (SK_SEARCHNOTNULL | SK_ISNULL |
5247 tgl@sss.pgh.pa.us 1123 : 15 : (impliesNN->sk_flags &
1124 : : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1125 : : curattr,
1126 : : ScanDirectionIsForward(dir) ?
1127 : : BTGreaterStrategyNumber : BTLessStrategyNumber,
1128 : : InvalidOid,
1129 : : InvalidOid,
1130 : : InvalidOid,
1131 : : (Datum) 0);
1132 : : }
1133 : :
1134 : : /*
1135 : : * If preprocessing didn't leave a usable boundary key, quit;
1136 : : * else save the boundary key pointer in startKeys[]
1137 : : */
256 pg@bowt.ie 1138 [ + + ]: 12714665 : if (bkey == NULL)
8159 tgl@sss.pgh.pa.us 1139 : 41276 : break;
256 pg@bowt.ie 1140 : 12673389 : startKeys[keysz++] = bkey;
1141 : :
1142 : : /*
1143 : : * We can only consider adding more boundary keys when the one
1144 : : * that we just chose to add uses either the = or >= strategy
1145 : : * (during backwards scans we can only do so when the key that
1146 : : * we just added to startKeys[] uses the = or <= strategy)
1147 : : */
1148 : 12673389 : strat_total = bkey->sk_strategy;
496 1149 [ + + + + ]: 12673389 : if (strat_total == BTGreaterStrategyNumber ||
1150 : : strat_total == BTLessStrategyNumber)
1151 : : break;
1152 : :
1153 : : /*
1154 : : * If the key that we just added to startKeys[] is a skip
1155 : : * array = key whose current element is marked NEXT or PRIOR,
1156 : : * make strat_total > or < (and stop adding boundary keys).
1157 : : * This can only happen with opclasses that lack skip support.
1158 : : */
256 1159 [ + + ]: 11933936 : if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1160 : : {
1161 [ - + ]: 6 : Assert(bkey->sk_flags & SK_BT_SKIP);
345 1162 [ - + ]: 6 : Assert(strat_total == BTEqualStrategyNumber);
1163 : :
1164 [ + + ]: 6 : if (ScanDirectionIsForward(dir))
1165 : : {
256 1166 [ - + ]: 3 : Assert(!(bkey->sk_flags & SK_BT_PRIOR));
345 1167 : 3 : strat_total = BTGreaterStrategyNumber;
1168 : : }
1169 : : else
1170 : : {
256 1171 [ - + ]: 3 : Assert(!(bkey->sk_flags & SK_BT_NEXT));
345 1172 : 3 : strat_total = BTLessStrategyNumber;
1173 : : }
1174 : :
1175 : : /*
1176 : : * We're done. We'll never find an exact = match for a
1177 : : * NEXT or PRIOR sentinel sk_argument value. There's no
1178 : : * sense in trying to add more keys to startKeys[].
1179 : : */
1180 : 6 : break;
1181 : : }
1182 : :
1183 : : /*
1184 : : * Done if that was the last scan key output by preprocessing.
1185 : : * Also done if we've now examined all keys marked required.
1186 : : */
7580 tgl@sss.pgh.pa.us 1187 [ + + ]: 11933930 : if (i >= so->numberOfKeys ||
256 pg@bowt.ie 1188 [ + + ]: 4812591 : !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1189 : : break;
1190 : :
1191 : : /*
1192 : : * Reset for next attr.
1193 : : */
1194 [ - + ]: 4812588 : Assert(cur->sk_attno == curattr + 1);
8159 tgl@sss.pgh.pa.us 1195 : 4812588 : curattr = cur->sk_attno;
256 pg@bowt.ie 1196 : 4812588 : bkey = NULL;
5247 tgl@sss.pgh.pa.us 1197 : 4812588 : impliesNN = NULL;
1198 : : }
1199 : :
1200 : : /*
1201 : : * If we've located the starting boundary key for curattr, we have
1202 : : * no interest in curattr's other required key
1203 : : */
256 pg@bowt.ie 1204 [ + + ]: 12715586 : if (bkey != NULL)
1205 : 909 : continue;
1206 : :
1207 : : /*
1208 : : * Is this key the starting boundary key for curattr?
1209 : : *
1210 : : * If not, does it imply a NOT NULL constraint? (Because
1211 : : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1212 : : * *any* inequality key works for that; we need not test.)
1213 : : */
8159 tgl@sss.pgh.pa.us 1214 [ + + + - ]: 12714677 : switch (cur->sk_strategy)
1215 : : {
1216 : 72646 : case BTLessStrategyNumber:
1217 : : case BTLessEqualStrategyNumber:
256 pg@bowt.ie 1218 [ + + ]: 72646 : if (ScanDirectionIsBackward(dir))
1219 : 33213 : bkey = cur;
1220 [ + - ]: 39433 : else if (impliesNN == NULL)
1221 : 39433 : impliesNN = cur;
8159 tgl@sss.pgh.pa.us 1222 : 72646 : break;
1223 : 11930704 : case BTEqualStrategyNumber:
256 pg@bowt.ie 1224 : 11930704 : bkey = cur;
8159 tgl@sss.pgh.pa.us 1225 : 11930704 : break;
1226 : 711327 : case BTGreaterEqualStrategyNumber:
1227 : : case BTGreaterStrategyNumber:
256 pg@bowt.ie 1228 [ + + ]: 711327 : if (ScanDirectionIsForward(dir))
1229 : 711306 : bkey = cur;
1230 [ + - ]: 21 : else if (impliesNN == NULL)
1231 : 21 : impliesNN = cur;
9833 bruce@momjian.us 1232 : 711327 : break;
1233 : : }
1234 : : }
1235 : : }
1236 : :
1237 : : /*
1238 : : * If we found no usable boundary keys, we have to start from one end of
1239 : : * the tree. Walk down that edge to the first or last key, and scan from
1240 : : * there.
1241 : : *
1242 : : * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1243 : : */
828 pg@bowt.ie 1244 [ + + ]: 7908853 : if (keysz == 0)
496 1245 : 47682 : return _bt_endpoint(scan, dir);
1246 : :
1247 : : /*
1248 : : * We want to start the scan somewhere within the index. Set up an
1249 : : * insertion scankey we can use to search for the boundary point we
1250 : : * identified above. The insertion scankey is built using the keys
1251 : : * identified by startKeys[]. (Remaining insertion scankey fields are
1252 : : * initialized after initial-positioning scan keys are finalized.)
1253 : : */
828 1254 [ - + ]: 7861171 : Assert(keysz <= INDEX_MAX_KEYS);
492 1255 [ + + ]: 20534536 : for (int i = 0; i < keysz; i++)
1256 : : {
256 1257 : 12673389 : ScanKey bkey = startKeys[i];
1258 : :
1259 [ - + ]: 12673389 : Assert(bkey->sk_attno == i + 1);
1260 : :
1261 [ + + ]: 12673389 : if (bkey->sk_flags & SK_ROW_HEADER)
1262 : : {
1263 : : /*
1264 : : * Row comparison header: look to the first row member instead
1265 : : */
1266 : 24 : ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1267 : 24 : bool loosen_strat = false,
1268 : 24 : tighten_strat = false;
1269 : :
1270 : : /*
1271 : : * Cannot be a NULL in the first row member: _bt_preprocess_keys
1272 : : * would've marked the qual as unsatisfiable, preventing us from
1273 : : * ever getting this far
1274 : : */
7005 tgl@sss.pgh.pa.us 1275 [ - + ]: 24 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
256 pg@bowt.ie 1276 [ - + ]: 24 : Assert(subkey->sk_attno == bkey->sk_attno);
432 1277 [ - + ]: 24 : Assert(!(subkey->sk_flags & SK_ISNULL));
1278 : :
1279 : : /*
1280 : : * This is either a > or >= key (during backwards scans it is
1281 : : * either < or <=) that was marked required during preprocessing.
1282 : : * Later so->keyData[] keys can't have been marked required, so
1283 : : * our row compare header key must be the final startKeys[] entry.
1284 : : */
256 1285 [ - + ]: 24 : Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
133 1286 [ - + ]: 24 : Assert(subkey->sk_strategy == bkey->sk_strategy);
1287 [ - + ]: 24 : Assert(subkey->sk_strategy == strat_total);
256 1288 [ - + ]: 24 : Assert(i == keysz - 1);
1289 : :
1290 : : /*
1291 : : * The member scankeys are already in insertion format (ie, they
1292 : : * have sk_func = 3-way-comparison function)
1293 : : */
2552 1294 : 24 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1295 : :
1296 : : /*
1297 : : * Now look to later row compare members.
1298 : : *
1299 : : * If there's an "index attribute gap" between two row compare
1300 : : * members, the second member won't have been marked required, and
1301 : : * so can't be used as a starting boundary key here. The part of
1302 : : * the row comparison that we do still use has to be treated as a
1303 : : * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1304 : : * with an omitted intervening index attribute "b" will use an
1305 : : * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1306 : : * the leaf level might satisfy the row compare qual.
1307 : : *
1308 : : * We're able to use a _more_ restrictive strategy when we reach a
1309 : : * NULL row compare member, since they're always unsatisfiable.
1310 : : * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1311 : : * insertion scan key "a > 1". All tuples where "a = 1" cannot
1312 : : * possibly satisfy the row compare qual, so this is safe.
1313 : : */
256 1314 [ - + ]: 24 : Assert(!(subkey->sk_flags & SK_ROW_END));
1315 : : for (;;)
1316 : : {
1317 : 24 : subkey++;
1318 [ - + ]: 24 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1319 : :
1320 [ + + ]: 24 : if (subkey->sk_flags & SK_ISNULL)
1321 : : {
1322 : : /*
1323 : : * NULL member key, can only use earlier keys.
1324 : : *
1325 : : * We deliberately avoid checking if this key is marked
1326 : : * required. All earlier keys are required, and this key
1327 : : * is unsatisfiable either way, so we can't miss anything.
1328 : : */
1329 : 6 : tighten_strat = true;
1330 : 6 : break;
1331 : : }
1332 : :
1333 [ + + ]: 18 : if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1334 : : {
1335 : : /* nonrequired member key, can only use earlier keys */
1336 : 6 : loosen_strat = true;
1337 : 6 : break;
1338 : : }
1339 : :
1340 [ - + ]: 12 : Assert(subkey->sk_attno == keysz + 1);
1341 [ - + ]: 12 : Assert(subkey->sk_strategy == bkey->sk_strategy);
1342 [ - + ]: 12 : Assert(keysz < INDEX_MAX_KEYS);
1343 : :
133 1344 : 12 : memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
256 1345 : 12 : keysz++;
1346 : :
1347 [ + - ]: 12 : if (subkey->sk_flags & SK_ROW_END)
1348 : 12 : break;
1349 : : }
1350 [ + + - + ]: 24 : Assert(!(loosen_strat && tighten_strat));
1351 [ + + ]: 24 : if (loosen_strat)
1352 : : {
1353 : : /* Use less restrictive strategy (and fewer member keys) */
1354 [ + + - ]: 6 : switch (strat_total)
1355 : : {
1356 : 3 : case BTLessStrategyNumber:
1357 : 3 : strat_total = BTLessEqualStrategyNumber;
1358 : 3 : break;
1359 : 3 : case BTGreaterStrategyNumber:
1360 : 3 : strat_total = BTGreaterEqualStrategyNumber;
1361 : 3 : break;
1362 : : }
1363 : : }
1364 [ + + ]: 24 : if (tighten_strat)
1365 : : {
1366 : : /* Use more restrictive strategy (and fewer member keys) */
1367 [ + + - ]: 6 : switch (strat_total)
1368 : : {
1369 : 3 : case BTLessEqualStrategyNumber:
1370 : 3 : strat_total = BTLessStrategyNumber;
1371 : 3 : break;
1372 : 3 : case BTGreaterEqualStrategyNumber:
1373 : 3 : strat_total = BTGreaterStrategyNumber;
1374 : 3 : break;
1375 : : }
1376 : : }
1377 : :
1378 : : /* Done (row compare header key is always last startKeys[] key) */
1379 : 24 : break;
1380 : : }
1381 : :
1382 : : /*
1383 : : * Ordinary comparison key/search-style key.
1384 : : *
1385 : : * Transform the search-style scan key to an insertion scan key by
1386 : : * replacing the sk_func with the appropriate btree 3-way-comparison
1387 : : * function.
1388 : : *
1389 : : * If scankey operator is not a cross-type comparison, we can use the
1390 : : * cached comparison function; otherwise gotta look it up in the
1391 : : * catalogs. (That can't lead to infinite recursion, since no
1392 : : * indexscan initiated by syscache lookup will use cross-data-type
1393 : : * operators.)
1394 : : *
1395 : : * We support the convention that sk_subtype == InvalidOid means the
1396 : : * opclass input type; this hack simplifies life for ScanKeyInit().
1397 : : */
1398 [ + + ]: 12673365 : if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1399 [ + + ]: 12107197 : bkey->sk_subtype == InvalidOid)
1400 : 12667930 : {
1401 : : FmgrInfo *procinfo;
1402 : :
1403 : 12667930 : procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1404 : 12667930 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1405 : : bkey->sk_flags,
1406 : 12667930 : bkey->sk_attno,
1407 : : InvalidStrategy,
1408 : : bkey->sk_subtype,
1409 : : bkey->sk_collation,
1410 : : procinfo,
1411 : : bkey->sk_argument);
1412 : : }
1413 : : else
1414 : : {
1415 : : RegProcedure cmp_proc;
1416 : :
1417 : 5435 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1418 : 5435 : rel->rd_opcintype[i],
1419 : : bkey->sk_subtype, BTORDER_PROC);
1420 [ - + ]: 5435 : if (!RegProcedureIsValid(cmp_proc))
256 pg@bowt.ie 1421 [ # # ]:UBC 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1422 : : BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1423 : : bkey->sk_attno, RelationGetRelationName(rel));
256 pg@bowt.ie 1424 :CBC 5435 : ScanKeyEntryInitialize(inskey.scankeys + i,
1425 : : bkey->sk_flags,
1426 : 5435 : bkey->sk_attno,
1427 : : InvalidStrategy,
1428 : : bkey->sk_subtype,
1429 : : bkey->sk_collation,
1430 : : cmp_proc,
1431 : : bkey->sk_argument);
1432 : : }
1433 : : }
1434 : :
1435 : : /*----------
1436 : : * Examine the selected initial-positioning strategy to determine exactly
1437 : : * where we need to start the scan, and set flag variables to control the
1438 : : * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1439 : : * page _bt_search returns).
1440 : : *----------
1441 : : */
828 1442 : 7861171 : _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1443 : 7861171 : inskey.anynullkeys = false; /* unused */
1444 : 7861171 : inskey.scantid = NULL;
1445 : 7861171 : inskey.keysz = keysz;
8120 tgl@sss.pgh.pa.us 1446 [ + + + + : 7861171 : switch (strat_total)
+ - ]
1447 : : {
1448 : 33216 : case BTLessStrategyNumber:
1449 : :
828 pg@bowt.ie 1450 : 33216 : inskey.nextkey = false;
1451 : 33216 : inskey.backward = true;
8120 tgl@sss.pgh.pa.us 1452 : 33216 : break;
1453 : :
1454 : 9 : case BTLessEqualStrategyNumber:
1455 : :
828 pg@bowt.ie 1456 : 9 : inskey.nextkey = true;
1457 : 9 : inskey.backward = true;
8120 tgl@sss.pgh.pa.us 1458 : 9 : break;
1459 : :
1460 : 7116628 : case BTEqualStrategyNumber:
1461 : :
1462 : : /*
1463 : : * If a backward scan was specified, need to start with last equal
1464 : : * item not first one.
1465 : : */
1466 [ + + ]: 7116628 : if (ScanDirectionIsBackward(dir))
1467 : : {
1468 : : /*
1469 : : * This is the same as the <= strategy
1470 : : */
828 pg@bowt.ie 1471 : 103 : inskey.nextkey = true;
1472 : 103 : inskey.backward = true;
1473 : : }
1474 : : else
1475 : : {
1476 : : /*
1477 : : * This is the same as the >= strategy
1478 : : */
1479 : 7116525 : inskey.nextkey = false;
1480 : 7116525 : inskey.backward = false;
1481 : : }
8120 tgl@sss.pgh.pa.us 1482 : 7116628 : break;
1483 : :
1484 : 5075 : case BTGreaterEqualStrategyNumber:
1485 : :
1486 : : /*
1487 : : * Find first item >= scankey
1488 : : */
828 pg@bowt.ie 1489 : 5075 : inskey.nextkey = false;
1490 : 5075 : inskey.backward = false;
8120 tgl@sss.pgh.pa.us 1491 : 5075 : break;
1492 : :
1493 : 706243 : case BTGreaterStrategyNumber:
1494 : :
1495 : : /*
1496 : : * Find first item > scankey
1497 : : */
828 pg@bowt.ie 1498 : 706243 : inskey.nextkey = true;
1499 : 706243 : inskey.backward = false;
8120 tgl@sss.pgh.pa.us 1500 : 706243 : break;
1501 : :
8120 tgl@sss.pgh.pa.us 1502 :UBC 0 : default:
1503 : : /* can't get here, but keep compiler quiet */
1504 [ # # ]: 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1505 : : return false;
1506 : : }
1507 : :
1508 : : /*
1509 : : * Use the manufactured insertion scan key to descend the tree and
1510 : : * position ourselves on the target leaf page.
1511 : : */
828 pg@bowt.ie 1512 [ - + ]:CBC 7861171 : Assert(ScanDirectionIsBackward(dir) == inskey.backward);
3 pg@bowt.ie 1513 :GNC 7861171 : _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
1514 : :
513 pg@bowt.ie 1515 [ + + ]:CBC 7861171 : if (!BufferIsValid(so->currPos.buf))
1516 : : {
256 1517 [ - + ]: 285500 : Assert(!so->needPrimScan);
1518 : :
1519 : : /*
1520 : : * We only get here if the index is completely empty. Lock relation
1521 : : * because nothing finer to lock exists. Without a buffer lock, it's
1522 : : * possible for another transaction to insert data between
1523 : : * _bt_search() and PredicateLockRelation(). We have to try again
1524 : : * after taking the relation-level predicate lock, to close a narrow
1525 : : * window where we wouldn't scan concurrently inserted tuples, but the
1526 : : * writer wouldn't see our predicate lock.
1527 : : */
986 tmunro@postgresql.or 1528 [ + + ]: 285500 : if (IsolationIsSerializable())
1529 : : {
1530 : 2826 : PredicateLockRelation(rel, scan->xs_snapshot);
3 pg@bowt.ie 1531 :GNC 2826 : _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
1532 : : }
1533 : :
513 pg@bowt.ie 1534 [ + - ]:CBC 285500 : if (!BufferIsValid(so->currPos.buf))
1535 : : {
986 tmunro@postgresql.or 1536 : 285500 : _bt_parallel_done(scan);
1537 : 285500 : return false;
1538 : : }
1539 : : }
1540 : :
1541 : : /* position to the precise item on the page */
513 pg@bowt.ie 1542 : 7575671 : offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1543 : :
1544 : : /*
1545 : : * Now load data from the first page of the scan (usually the page
1546 : : * currently in so->currPos.buf).
1547 : : *
1548 : : * If inskey.nextkey = false and inskey.backward = false, offnum is
1549 : : * positioned at the first non-pivot tuple >= inskey.scankeys.
1550 : : *
1551 : : * If inskey.nextkey = false and inskey.backward = true, offnum is
1552 : : * positioned at the last non-pivot tuple < inskey.scankeys.
1553 : : *
1554 : : * If inskey.nextkey = true and inskey.backward = false, offnum is
1555 : : * positioned at the first non-pivot tuple > inskey.scankeys.
1556 : : *
1557 : : * If inskey.nextkey = true and inskey.backward = true, offnum is
1558 : : * positioned at the last non-pivot tuple <= inskey.scankeys.
1559 : : *
1560 : : * It's possible that _bt_binsrch returned an offnum that is out of bounds
1561 : : * for the page. For example, when inskey is both < the leaf page's high
1562 : : * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1563 : : */
1564 [ + + ]: 7575671 : if (!_bt_readfirstpage(scan, offnum, dir))
1565 : 2186303 : return false;
1566 : :
487 1567 : 5389368 : _bt_returnitem(scan, so);
7252 tgl@sss.pgh.pa.us 1568 : 5389368 : return true;
1569 : : }
1570 : :
1571 : : /*
1572 : : * _bt_next() -- Get the next item in a scan.
1573 : : *
1574 : : * On entry, so->currPos describes the current page, which may be pinned
1575 : : * but is not locked, and so->currPos.itemIndex identifies which item was
1576 : : * previously returned.
1577 : : *
1578 : : * On success exit, so->currPos is updated as needed, and _bt_returnitem
1579 : : * sets the next item to return to the scan. so->currPos remains valid.
1580 : : *
1581 : : * On failure exit (no more tuples), we invalidate so->currPos. It'll
1582 : : * still be possible for the scan to return tuples by changing direction,
1583 : : * though we'll need to call _bt_first anew in that other direction.
1584 : : */
1585 : : bool
1586 : 9885262 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1587 : : {
1588 : 9885262 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1589 : :
496 pg@bowt.ie 1590 [ - + - - : 9885262 : Assert(BTScanPosIsValid(so->currPos));
- + ]
1591 : :
1592 : : /*
1593 : : * Advance to next tuple on current page; or if there's no more, try to
1594 : : * step to the next page with data.
1595 : : */
7252 tgl@sss.pgh.pa.us 1596 [ + + ]: 9885262 : if (ScanDirectionIsForward(dir))
1597 : : {
1598 [ + + ]: 9866617 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1599 : : {
1600 [ + + ]: 1312894 : if (!_bt_steppage(scan, dir))
1601 : 1296445 : return false;
1602 : : }
1603 : : }
1604 : : else
1605 : : {
1606 [ + + ]: 18645 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1607 : : {
1608 [ + + ]: 67 : if (!_bt_steppage(scan, dir))
8700 1609 : 46 : return false;
1610 : : }
1611 : : }
1612 : :
487 pg@bowt.ie 1613 : 8588771 : _bt_returnitem(scan, so);
7252 tgl@sss.pgh.pa.us 1614 : 8588771 : return true;
1615 : : }
1616 : :
1617 : : /*
1618 : : * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
1619 : : * index scan by setting the relevant fields in caller's index scan descriptor
1620 : : */
1621 : : static inline void
487 pg@bowt.ie 1622 : 14021097 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
1623 : : {
1624 : 14021097 : BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1625 : :
1626 : : /* Most recent _bt_readpage must have succeeded */
1627 [ - + - - : 14021097 : Assert(BTScanPosIsValid(so->currPos));
- + ]
1628 [ - + ]: 14021097 : Assert(so->currPos.itemIndex >= so->currPos.firstItem);
1629 [ - + ]: 14021097 : Assert(so->currPos.itemIndex <= so->currPos.lastItem);
1630 : :
1631 : : /* Return next item, per amgettuple contract */
1632 : 14021097 : scan->xs_heaptid = currItem->heapTid;
1633 [ + + ]: 14021097 : if (so->currTuples)
1634 : 2202788 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1635 : 14021097 : }
1636 : :
1637 : : /*
1638 : : * _bt_steppage() -- Step to next page containing valid data for scan
1639 : : *
1640 : : * Wrapper on _bt_readnextpage that performs final steps for the current page.
1641 : : *
1642 : : * On entry, so->currPos must be valid. Its buffer will be pinned, though
1643 : : * never locked. (Actually, when so->dropPin there won't even be a pin held,
1644 : : * though so->currPos.currPage must still be set to a valid block number.)
1645 : : */
1646 : : static bool
7252 tgl@sss.pgh.pa.us 1647 : 3500238 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1648 : : {
9368 1649 : 3500238 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1650 : : BlockNumber blkno,
1651 : : lastcurrblkno;
1652 : :
4008 kgrittn@postgresql.o 1653 [ - + - - : 3500238 : Assert(BTScanPosIsValid(so->currPos));
- + ]
1654 : :
1655 : : /* Before leaving current page, deal with any killed items */
7252 tgl@sss.pgh.pa.us 1656 [ + + ]: 3500238 : if (so->numKilled > 0)
4008 kgrittn@postgresql.o 1657 : 42325 : _bt_killitems(scan);
1658 : :
1659 : : /*
1660 : : * Before we modify currPos, make a copy of the page data if there was a
1661 : : * mark position that needs it.
1662 : : */
7143 tgl@sss.pgh.pa.us 1663 [ + + ]: 3500238 : if (so->markItemIndex >= 0)
1664 : : {
1665 : : /* bump pin on current buffer for assignment to mark buffer */
4008 kgrittn@postgresql.o 1666 [ - + - - : 187 : if (BTScanPosIsPinned(so->currPos))
+ + ]
1667 : 174 : IncrBufferRefCount(so->currPos.buf);
7143 tgl@sss.pgh.pa.us 1668 : 187 : memcpy(&so->markPos, &so->currPos,
1669 : : offsetof(BTScanPosData, items[1]) +
1670 : 187 : so->currPos.lastItem * sizeof(BTScanPosItem));
5271 1671 [ + + ]: 187 : if (so->markTuples)
1672 : 174 : memcpy(so->markTuples, so->currTuples,
1673 : 174 : so->currPos.nextTupleOffset);
7143 1674 : 187 : so->markPos.itemIndex = so->markItemIndex;
1675 : 187 : so->markItemIndex = -1;
1676 : :
1677 : : /*
1678 : : * If we're just about to start the next primitive index scan
1679 : : * (possible with a scan that has arrays keys, and needs to skip to
1680 : : * continue in the current scan direction), moreLeft/moreRight only
1681 : : * indicate the end of the current primitive index scan. They must
1682 : : * never be taken to indicate that the top-level index scan has ended
1683 : : * (that would be wrong).
1684 : : *
1685 : : * We could handle this case by treating the current array keys as
1686 : : * markPos state. But depending on the current array state like this
1687 : : * would add complexity. Instead, we just unset markPos's copy of
1688 : : * moreRight or moreLeft (whichever might be affected), while making
1689 : : * btrestrpos reset the scan's arrays to their initial scan positions.
1690 : : * In effect, btrestrpos leaves advancing the arrays up to the first
1691 : : * _bt_readpage call (that takes place after it has restored markPos).
1692 : : */
708 pg@bowt.ie 1693 [ - + ]: 187 : if (so->needPrimScan)
1694 : : {
501 pg@bowt.ie 1695 [ # # ]:UBC 0 : if (ScanDirectionIsForward(so->currPos.dir))
708 1696 : 0 : so->markPos.moreRight = true;
1697 : : else
1698 : 0 : so->markPos.moreLeft = true;
1699 : : }
1700 : :
1701 : : /* mark/restore not supported by parallel scans */
513 pg@bowt.ie 1702 [ - + ]:CBC 187 : Assert(!scan->parallel_scan);
1703 : : }
1704 : :
1705 [ - + - - : 3500238 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
1706 : :
1707 : : /* Walk to the next page with data */
492 1708 [ + + ]: 3500238 : if (ScanDirectionIsForward(dir))
1709 : 3500114 : blkno = so->currPos.nextPage;
1710 : : else
1711 : 124 : blkno = so->currPos.prevPage;
1712 : 3500238 : lastcurrblkno = so->currPos.currPage;
1713 : :
1714 : : /*
1715 : : * Cancel primitive index scans that were scheduled when the call to
1716 : : * _bt_readpage for currPos happened to use the opposite direction to the
1717 : : * one that we're stepping in now. (It's okay to leave the scan's array
1718 : : * keys as-is, since the next _bt_readpage will advance them.)
1719 : : */
1720 [ + + ]: 3500238 : if (so->currPos.dir != dir)
1721 : 18 : so->needPrimScan = false;
1722 : :
1723 : 3500238 : return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
1724 : : }
1725 : :
1726 : : /*
1727 : : * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
1728 : : *
1729 : : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1730 : : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1731 : : * When we're passed an offnum past the end of the page, we might still manage
1732 : : * to stop the scan on this page by calling _bt_checkkeys against the high
1733 : : * key. See _bt_readpage for full details.
1734 : : *
1735 : : * On entry, so->currPos must be pinned and locked (so offnum stays valid).
1736 : : * Parallel scan callers must have seized the scan before calling here.
1737 : : *
1738 : : * On exit, we'll have updated so->currPos and retained locks and pins
1739 : : * according to the same rules as those laid out for _bt_readnextpage exit.
1740 : : * Like _bt_readnextpage, our return value indicates if there are any matching
1741 : : * records in the given direction.
1742 : : *
1743 : : * We always release the scan for a parallel scan caller, regardless of
1744 : : * success or failure; we'll call _bt_parallel_release as soon as possible.
1745 : : */
1746 : : static bool
513 1747 : 7619473 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
1748 : : {
1749 : 7619473 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1750 : :
1751 : 7619473 : so->numKilled = 0; /* just paranoia */
1752 : 7619473 : so->markItemIndex = -1; /* ditto */
1753 : :
1754 : : /* Initialize so->currPos for the first page (page in so->currPos.buf) */
1755 [ + + ]: 7619473 : if (so->needPrimScan)
1756 : : {
1757 [ - + ]: 8788 : Assert(so->numArrayKeys);
1758 : :
1759 : 8788 : so->currPos.moreLeft = true;
1760 : 8788 : so->currPos.moreRight = true;
1761 : 8788 : so->needPrimScan = false;
1762 : : }
1763 [ + + ]: 7610685 : else if (ScanDirectionIsForward(dir))
1764 : : {
1765 : 7577342 : so->currPos.moreLeft = false;
3315 rhaas@postgresql.org 1766 : 7577342 : so->currPos.moreRight = true;
1767 : : }
1768 : : else
1769 : : {
513 pg@bowt.ie 1770 : 33343 : so->currPos.moreLeft = true;
1771 : 33343 : so->currPos.moreRight = false;
1772 : : }
1773 : :
1774 : : /*
1775 : : * Attempt to load matching tuples from the first page.
1776 : : *
1777 : : * Note that _bt_readpage will finish initializing the so->currPos fields.
1778 : : * _bt_readpage also releases parallel scan (even when it returns false).
1779 : : */
1780 [ + + ]: 7619473 : if (_bt_readpage(scan, dir, offnum, true))
1781 : : {
282 1782 : 5432196 : Relation rel = scan->indexRelation;
1783 : :
1784 : : /*
1785 : : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1786 : : * so->currPos.buf in preparation for btgettuple returning tuples.
1787 : : */
513 1788 [ - + - - : 5432196 : Assert(BTScanPosIsPinned(so->currPos));
- + ]
282 1789 : 5432196 : _bt_drop_lock_and_maybe_pin(rel, so);
513 1790 : 5432196 : return true;
1791 : : }
1792 : :
1793 : : /* There's no actually-matching data on the page in so->currPos.buf */
1794 : 2187277 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1795 : :
1796 : : /* Call _bt_readnextpage using its _bt_steppage wrapper function */
1797 [ + + ]: 2187277 : if (!_bt_steppage(scan, dir))
1798 : 2187194 : return false;
1799 : :
1800 : : /* _bt_readpage for a later page (now in so->currPos) succeeded */
3315 rhaas@postgresql.org 1801 : 83 : return true;
1802 : : }
1803 : :
1804 : : /*
1805 : : * _bt_readnextpage() -- Read next page containing valid data for _bt_next
1806 : : *
1807 : : * Caller's blkno is the next interesting page's link, taken from either the
1808 : : * previously-saved right link or left link. lastcurrblkno is the page that
1809 : : * was current at the point where the blkno link was saved, which we use to
1810 : : * reason about concurrent page splits/page deletions during backwards scans.
1811 : : * In the common case where seized=false, blkno is either so->currPos.nextPage
1812 : : * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
1813 : : *
1814 : : * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
1815 : : * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
1816 : : * won't need to be read again in almost all cases). Parallel scan callers
1817 : : * that seized the scan before calling here should pass seized=true; such a
1818 : : * caller's blkno and lastcurrblkno arguments come from the seized scan.
1819 : : * seized=false callers just pass us the blkno/lastcurrblkno taken from their
1820 : : * so->currPos, which (along with so->currPos itself) can be used to end the
1821 : : * scan. A seized=false caller's blkno can never be assumed to be the page
1822 : : * that must be read next during a parallel scan, though. We must figure that
1823 : : * part out for ourselves by seizing the scan (the correct page to read might
1824 : : * already be beyond the seized=false caller's blkno during a parallel scan,
1825 : : * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
1826 : : * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
1827 : : *
1828 : : * On success exit, so->currPos is updated to contain data from the next
1829 : : * interesting page, and we return true. We hold a pin on the buffer on
1830 : : * success exit (except during so->dropPin index scans, when we drop the pin
1831 : : * eagerly to avoid blocking VACUUM).
1832 : : *
1833 : : * If there are no more matching records in the given direction, we invalidate
1834 : : * so->currPos (while ensuring it retains no locks or pins), and return false.
1835 : : *
1836 : : * We always release the scan for a parallel scan caller, regardless of
1837 : : * success or failure; we'll call _bt_parallel_release as soon as possible.
1838 : : */
1839 : : static bool
513 pg@bowt.ie 1840 : 3500285 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
1841 : : BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
1842 : : {
1843 : 3500285 : Relation rel = scan->indexRelation;
3315 rhaas@postgresql.org 1844 : 3500285 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1845 : :
492 pg@bowt.ie 1846 [ + + - + ]: 3500285 : Assert(so->currPos.currPage == lastcurrblkno || seized);
373 1847 [ + + - + ]: 3500285 : Assert(!(blkno == P_NONE && seized));
513 1848 [ + + - + : 3500285 : Assert(!BTScanPosIsPinned(so->currPos));
- + ]
1849 : :
1850 : : /*
1851 : : * Remember that the scan already read lastcurrblkno, a page to the left
1852 : : * of blkno (or remember reading a page to the right, for backwards scans)
1853 : : */
3315 rhaas@postgresql.org 1854 [ + + ]: 3500285 : if (ScanDirectionIsForward(dir))
513 pg@bowt.ie 1855 : 3500161 : so->currPos.moreLeft = true;
1856 : : else
1857 : 124 : so->currPos.moreRight = true;
1858 : :
1859 : : for (;;)
10416 bruce@momjian.us 1860 : 1154 : {
1861 : : Page page;
1862 : : BTPageOpaque opaque;
1863 : :
513 pg@bowt.ie 1864 [ + + + + ]: 3501439 : if (blkno == P_NONE ||
1865 : : (ScanDirectionIsForward(dir) ?
1866 [ + + + + ]: 1188514 : !so->currPos.moreRight : !so->currPos.moreLeft))
1867 : : {
1868 : : /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
492 1869 [ + - - + ]: 3483638 : Assert(so->currPos.currPage == lastcurrblkno && !seized);
581 1870 : 3483638 : BTScanPosInvalidate(so->currPos);
492 1871 : 3483638 : _bt_parallel_done(scan); /* iff !so->needPrimScan */
581 1872 : 3483638 : return false;
1873 : : }
1874 : :
501 1875 [ - + ]: 17801 : Assert(!so->needPrimScan);
1876 : :
1877 : : /* parallel scan must never actually visit so->currPos blkno */
492 1878 [ + + + + ]: 17801 : if (!seized && scan->parallel_scan != NULL &&
1879 [ + + ]: 606 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
1880 : : {
1881 : : /* whole scan is now done (or another primitive scan required) */
1882 : 47 : BTScanPosInvalidate(so->currPos);
1883 : 47 : return false;
1884 : : }
1885 : :
513 1886 [ + + ]: 17754 : if (ScanDirectionIsForward(dir))
1887 : : {
1888 : : /* read blkno, but check for interrupts first */
1889 [ - + ]: 17681 : CHECK_FOR_INTERRUPTS();
1890 : 17681 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1891 : : }
1892 : : else
1893 : : {
1894 : : /* read blkno, avoiding race (also checks for interrupts) */
1895 : 73 : so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
1896 : : lastcurrblkno);
1897 [ - + ]: 73 : if (so->currPos.buf == InvalidBuffer)
1898 : : {
1899 : : /* must have been a concurrent deletion of leftmost page */
4008 kgrittn@postgresql.o 1900 :UBC 0 : BTScanPosInvalidate(so->currPos);
496 pg@bowt.ie 1901 : 0 : _bt_parallel_done(scan);
7252 tgl@sss.pgh.pa.us 1902 : 0 : return false;
1903 : : }
1904 : : }
1905 : :
513 pg@bowt.ie 1906 :CBC 17754 : page = BufferGetPage(so->currPos.buf);
1907 : 17754 : opaque = BTPageGetOpaque(page);
1908 : 17754 : lastcurrblkno = blkno;
1909 [ + + ]: 17754 : if (likely(!P_IGNORE(opaque)))
1910 : : {
1911 : : /* see if there are any matches on this page */
1912 [ + + ]: 17753 : if (ScanDirectionIsForward(dir))
1913 : : {
1914 : : /* note that this will clear moreRight if we can stop */
358 1915 [ + + + + ]: 17680 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
513 1916 : 16537 : break;
1917 : 1143 : blkno = so->currPos.nextPage;
1918 : : }
1919 : : else
1920 : : {
1921 : : /* note that this will clear moreLeft if we can stop */
358 1922 [ + + ]: 73 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
7252 tgl@sss.pgh.pa.us 1923 : 63 : break;
513 pg@bowt.ie 1924 : 10 : blkno = so->currPos.prevPage;
1925 : : }
1926 : : }
1927 : : else
1928 : : {
1929 : : /* _bt_readpage not called, so do all this for ourselves */
513 pg@bowt.ie 1930 [ + - ]:GBC 1 : if (ScanDirectionIsForward(dir))
1931 : 1 : blkno = opaque->btpo_next;
1932 : : else
513 pg@bowt.ie 1933 :UBC 0 : blkno = opaque->btpo_prev;
3315 rhaas@postgresql.org 1934 [ - + ]:GBC 1 : if (scan->parallel_scan != NULL)
513 pg@bowt.ie 1935 :UBC 0 : _bt_parallel_release(scan, blkno, lastcurrblkno);
1936 : : }
1937 : :
1938 : : /* no matching tuples on this page */
513 pg@bowt.ie 1939 :CBC 1154 : _bt_relbuf(rel, so->currPos.buf);
492 1940 : 1154 : seized = false; /* released by _bt_readpage (or by us) */
1941 : : }
1942 : :
1943 : : /*
1944 : : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1945 : : * so->currPos.buf in preparation for btgettuple returning tuples.
1946 : : */
513 1947 [ - + ]: 16600 : Assert(so->currPos.currPage == blkno);
1948 [ - + - - : 16600 : Assert(BTScanPosIsPinned(so->currPos));
- + ]
282 1949 : 16600 : _bt_drop_lock_and_maybe_pin(rel, so);
1950 : :
10057 bruce@momjian.us 1951 : 16600 : return true;
1952 : : }
1953 : :
1954 : : /*
1955 : : * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
1956 : : * recovering from concurrent page splits/page deletions when necessary
1957 : : *
1958 : : * Called during backwards scans, to deal with their unique concurrency rules.
1959 : : *
1960 : : * blkno points to the block number of the page that we expect to move the
1961 : : * scan to. We'll successfully move the scan there when we find that its
1962 : : * right sibling link still points to lastcurrblkno (the page we just read).
1963 : : * Otherwise, we have to figure out which page is the correct one for the scan
1964 : : * to now read the hard way, reasoning about concurrent splits and deletions.
1965 : : * See nbtree/README.
1966 : : *
1967 : : * On return, we have both a pin and a read lock on the returned page, whose
1968 : : * block number will be set in *blkno. Returns InvalidBuffer if there is no
1969 : : * page to the left (no lock or pin is held in that case).
1970 : : *
1971 : : * It is possible for the returned leaf page to be half-dead; caller must
1972 : : * check that condition and step left again when required.
1973 : : */
1974 : : static Buffer
513 pg@bowt.ie 1975 : 73 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
1976 : : BlockNumber lastcurrblkno)
1977 : : {
1978 : 73 : BlockNumber origblkno = *blkno; /* detects circular links */
1979 : :
1980 : : for (;;)
8422 tgl@sss.pgh.pa.us 1981 :UBC 0 : {
1982 : : Buffer buf;
1983 : : Page page;
1984 : : BTPageOpaque opaque;
1985 : : int tries;
1986 : :
1987 : : /* check for interrupts while we're not holding any buffer lock */
6158 tgl@sss.pgh.pa.us 1988 [ - + ]:CBC 73 : CHECK_FOR_INTERRUPTS();
513 pg@bowt.ie 1989 : 73 : buf = _bt_getbuf(rel, *blkno, BT_READ);
3616 kgrittn@postgresql.o 1990 : 73 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 1991 : 73 : opaque = BTPageGetOpaque(page);
1992 : :
1993 : : /*
1994 : : * If this isn't the page we want, walk right till we find what we
1995 : : * want --- but go no more than four hops (an arbitrary limit). If we
1996 : : * don't find the correct page by then, the most likely bet is that
1997 : : * lastcurrblkno got deleted and isn't in the sibling chain at all
1998 : : * anymore, not that its left sibling got split more than four times.
1999 : : *
2000 : : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2001 : : * because half-dead pages are still in the sibling chain.
2002 : : */
8422 tgl@sss.pgh.pa.us 2003 : 73 : tries = 0;
2004 : : for (;;)
2005 : : {
513 pg@bowt.ie 2006 [ + - + - : 73 : if (likely(!P_ISDELETED(opaque) &&
+ - ]
2007 : : opaque->btpo_next == lastcurrblkno))
2008 : : {
2009 : : /* Found desired page, return it */
8422 tgl@sss.pgh.pa.us 2010 : 73 : return buf;
2011 : : }
8422 tgl@sss.pgh.pa.us 2012 [ # # # # ]:UBC 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2013 : : break;
2014 : : /* step right */
513 pg@bowt.ie 2015 : 0 : *blkno = opaque->btpo_next;
2016 : 0 : buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
3616 kgrittn@postgresql.o 2017 : 0 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2018 : 0 : opaque = BTPageGetOpaque(page);
2019 : : }
2020 : :
2021 : : /*
2022 : : * Return to the original page (usually the page most recently read by
2023 : : * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2024 : : * what's up with its prev sibling link
2025 : : */
513 pg@bowt.ie 2026 : 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
3616 kgrittn@postgresql.o 2027 : 0 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2028 : 0 : opaque = BTPageGetOpaque(page);
8422 tgl@sss.pgh.pa.us 2029 [ # # ]: 0 : if (P_ISDELETED(opaque))
2030 : : {
2031 : : /*
2032 : : * It was deleted. Move right to first nondeleted page (there
2033 : : * must be one); that is the page that has acquired the deleted
2034 : : * one's keyspace, so stepping left from it will take us where we
2035 : : * want to be.
2036 : : */
2037 : : for (;;)
2038 : : {
2039 [ # # ]: 0 : if (P_RIGHTMOST(opaque))
6649 2040 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
2041 : : RelationGetRelationName(rel));
513 pg@bowt.ie 2042 : 0 : lastcurrblkno = opaque->btpo_next;
2043 : 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
3616 kgrittn@postgresql.o 2044 : 0 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2045 : 0 : opaque = BTPageGetOpaque(page);
8422 tgl@sss.pgh.pa.us 2046 [ # # ]: 0 : if (!P_ISDELETED(opaque))
2047 : 0 : break;
2048 : : }
2049 : : }
2050 : : else
2051 : : {
2052 : : /*
2053 : : * Original lastcurrblkno wasn't deleted; the explanation had
2054 : : * better be that the page to the left got split or deleted.
2055 : : * Without this check, we risk going into an infinite loop.
2056 : : */
513 pg@bowt.ie 2057 [ # # ]: 0 : if (opaque->btpo_prev == origblkno)
6649 tgl@sss.pgh.pa.us 2058 [ # # ]: 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2059 : : lastcurrblkno, RelationGetRelationName(rel));
2060 : : /* Okay to try again, since left sibling link changed */
2061 : : }
2062 : :
2063 : : /*
2064 : : * Original lastcurrblkno from caller was concurrently deleted (could
2065 : : * also have been a great many concurrent left sibling page splits).
2066 : : * Found a non-deleted page that should now act as our lastcurrblkno.
2067 : : */
513 pg@bowt.ie 2068 [ # # ]: 0 : if (P_LEFTMOST(opaque))
2069 : : {
2070 : : /* New lastcurrblkno has no left sibling (concurrently deleted) */
2071 : 0 : _bt_relbuf(rel, buf);
2072 : 0 : break;
2073 : : }
2074 : :
2075 : : /* Start from scratch with new lastcurrblkno's blkno/prev link */
2076 : 0 : *blkno = origblkno = opaque->btpo_prev;
2077 : 0 : _bt_relbuf(rel, buf);
2078 : : }
2079 : :
8422 tgl@sss.pgh.pa.us 2080 : 0 : return InvalidBuffer;
2081 : : }
2082 : :
2083 : : /*
2084 : : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2085 : : *
2086 : : * If the index is empty, we will return InvalidBuffer; any other failure
2087 : : * condition causes ereport(). We will not return a dead page.
2088 : : *
2089 : : * The returned buffer is pinned and read-locked.
2090 : : */
2091 : : Buffer
919 tmunro@postgresql.or 2092 :CBC 47691 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2093 : : {
2094 : : Buffer buf;
2095 : : Page page;
2096 : : BTPageOpaque opaque;
2097 : : OffsetNumber offnum;
2098 : : BlockNumber blkno;
2099 : : IndexTuple itup;
2100 : :
2101 : : /*
2102 : : * If we are looking for a leaf page, okay to descend from fast root;
2103 : : * otherwise better descend from true root. (There is no point in being
2104 : : * smarter about intermediate levels.)
2105 : : */
8423 tgl@sss.pgh.pa.us 2106 [ + + ]: 47691 : if (level == 0)
1009 pg@bowt.ie 2107 : 47682 : buf = _bt_getroot(rel, NULL, BT_READ);
2108 : : else
2109 : 9 : buf = _bt_gettrueroot(rel);
2110 : :
8423 tgl@sss.pgh.pa.us 2111 [ + + ]: 47691 : if (!BufferIsValid(buf))
2112 : 3880 : return InvalidBuffer;
2113 : :
3616 kgrittn@postgresql.o 2114 : 43811 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2115 : 43811 : opaque = BTPageGetOpaque(page);
2116 : :
2117 : : for (;;)
2118 : : {
2119 : : /*
2120 : : * If we landed on a deleted page, step right to find a live page
2121 : : * (there must be one). Also, if we want the rightmost page, step
2122 : : * right if needed to get to it (this could happen if the page split
2123 : : * since we obtained a pointer to it).
2124 : : */
8422 tgl@sss.pgh.pa.us 2125 [ - + + + ]: 56580 : while (P_IGNORE(opaque) ||
8423 2126 [ - + ]: 33 : (rightmost && !P_RIGHTMOST(opaque)))
2127 : : {
8423 tgl@sss.pgh.pa.us 2128 :UBC 0 : blkno = opaque->btpo_next;
2129 [ # # ]: 0 : if (blkno == P_NONE)
6649 2130 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
2131 : : RelationGetRelationName(rel));
7998 2132 : 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
3616 kgrittn@postgresql.o 2133 : 0 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2134 : 0 : opaque = BTPageGetOpaque(page);
2135 : : }
2136 : :
2137 : : /* Done? */
1845 pg@bowt.ie 2138 [ + + ]:CBC 56580 : if (opaque->btpo_level == level)
8423 tgl@sss.pgh.pa.us 2139 : 43811 : break;
1845 pg@bowt.ie 2140 [ - + ]: 12769 : if (opaque->btpo_level < level)
2418 peter@eisentraut.org 2141 [ # # ]:UBC 0 : ereport(ERROR,
2142 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2143 : : errmsg_internal("btree level %u not found in index \"%s\"",
2144 : : level, RelationGetRelationName(rel))));
2145 : :
2146 : : /* Descend to leftmost or rightmost child page */
8423 tgl@sss.pgh.pa.us 2147 [ + + ]:CBC 12769 : if (rightmost)
2148 : 3 : offnum = PageGetMaxOffsetNumber(page);
2149 : : else
2150 [ + + ]: 12766 : offnum = P_FIRSTDATAKEY(opaque);
2151 : :
90 tgl@sss.pgh.pa.us 2152 [ + - - + ]:GNC 12769 : if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page))
90 tgl@sss.pgh.pa.us 2153 [ # # ]:UNC 0 : elog(PANIC, "offnum out of range");
2154 : :
7354 tgl@sss.pgh.pa.us 2155 :CBC 12769 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2281 pg@bowt.ie 2156 : 12769 : blkno = BTreeTupleGetDownLink(itup);
2157 : :
7998 tgl@sss.pgh.pa.us 2158 : 12769 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
3616 kgrittn@postgresql.o 2159 : 12769 : page = BufferGetPage(buf);
1444 michael@paquier.xyz 2160 : 12769 : opaque = BTPageGetOpaque(page);
2161 : : }
2162 : :
8423 tgl@sss.pgh.pa.us 2163 : 43811 : return buf;
2164 : : }
2165 : :
2166 : : /*
2167 : : * _bt_endpoint() -- Find the first or last page in the index, and scan
2168 : : * from there to the first key satisfying all the quals.
2169 : : *
2170 : : * This is used by _bt_first() to set up a scan when we've determined
2171 : : * that the scan must start at the beginning or end of the index (for
2172 : : * a forward or backward scan respectively).
2173 : : *
2174 : : * Parallel scan callers must have seized the scan before calling here.
2175 : : * Exit conditions are the same as for _bt_first().
2176 : : */
2177 : : static bool
10841 scrappy@hub.org 2178 : 47682 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2179 : : {
7252 tgl@sss.pgh.pa.us 2180 : 47682 : Relation rel = scan->indexRelation;
2181 : 47682 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2182 : : Page page;
2183 : : BTPageOpaque opaque;
2184 : : OffsetNumber start;
2185 : :
496 pg@bowt.ie 2186 [ + - - + : 47682 : Assert(!BTScanPosIsValid(so->currPos));
- + ]
432 2187 [ - + ]: 47682 : Assert(!so->needPrimScan);
2188 : :
2189 : : /*
2190 : : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2191 : : * version of _bt_search().
2192 : : */
513 2193 : 47682 : so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2194 : :
2195 [ + + ]: 47682 : if (!BufferIsValid(so->currPos.buf))
2196 : : {
2197 : : /*
2198 : : * Empty index. Lock the whole relation, as nothing finer to lock
2199 : : * exists.
2200 : : */
5387 heikki.linnakangas@i 2201 : 3880 : PredicateLockRelation(rel, scan->xs_snapshot);
496 pg@bowt.ie 2202 : 3880 : _bt_parallel_done(scan);
8700 tgl@sss.pgh.pa.us 2203 : 3880 : return false;
2204 : : }
2205 : :
513 pg@bowt.ie 2206 : 43802 : page = BufferGetPage(so->currPos.buf);
1444 michael@paquier.xyz 2207 : 43802 : opaque = BTPageGetOpaque(page);
8423 tgl@sss.pgh.pa.us 2208 [ - + ]: 43802 : Assert(P_ISLEAF(opaque));
2209 : :
10416 bruce@momjian.us 2210 [ + + ]: 43802 : if (ScanDirectionIsForward(dir))
2211 : : {
2212 : : /* There could be dead pages to the left, so not this: */
2213 : : /* Assert(P_LEFTMOST(opaque)); */
2214 : :
9368 tgl@sss.pgh.pa.us 2215 [ + + ]: 43772 : start = P_FIRSTDATAKEY(opaque);
2216 : : }
10416 bruce@momjian.us 2217 [ + - ]: 30 : else if (ScanDirectionIsBackward(dir))
2218 : : {
9368 tgl@sss.pgh.pa.us 2219 [ - + ]: 30 : Assert(P_RIGHTMOST(opaque));
2220 : :
2221 : 30 : start = PageGetMaxOffsetNumber(page);
2222 : : }
2223 : : else
2224 : : {
8273 tgl@sss.pgh.pa.us 2225 [ # # ]:UBC 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2226 : : start = 0; /* keep compiler quiet */
2227 : : }
2228 : :
2229 : : /*
2230 : : * Now load data from the first page of the scan.
2231 : : */
513 pg@bowt.ie 2232 [ + + ]:CBC 43802 : if (!_bt_readfirstpage(scan, start, dir))
2233 : 891 : return false;
2234 : :
487 2235 : 42911 : _bt_returnitem(scan, so);
7252 tgl@sss.pgh.pa.us 2236 : 42911 : return true;
2237 : : }
|