Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtree.c
4 : : * Implementation of Lehman and Yao's btree management algorithm for
5 : : * Postgres.
6 : : *
7 : : * NOTES
8 : : * This file contains only the public interface routines.
9 : : *
10 : : *
11 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 : : * Portions Copyright (c) 1994, Regents of the University of California
13 : : *
14 : : * IDENTIFICATION
15 : : * src/backend/access/nbtree/nbtree.c
16 : : *
17 : : *-------------------------------------------------------------------------
18 : : */
19 : : #include "postgres.h"
20 : :
21 : : #include "access/nbtree.h"
22 : : #include "access/relscan.h"
23 : : #include "access/stratnum.h"
24 : : #include "commands/progress.h"
25 : : #include "commands/vacuum.h"
26 : : #include "nodes/execnodes.h"
27 : : #include "pgstat.h"
28 : : #include "storage/bulk_write.h"
29 : : #include "storage/condition_variable.h"
30 : : #include "storage/indexfsm.h"
31 : : #include "storage/ipc.h"
32 : : #include "storage/lmgr.h"
33 : : #include "storage/read_stream.h"
34 : : #include "utils/datum.h"
35 : : #include "utils/fmgrprotos.h"
36 : : #include "utils/index_selfuncs.h"
37 : : #include "utils/memutils.h"
38 : : #include "utils/wait_event.h"
39 : :
40 : :
41 : : /*
42 : : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
43 : : *
44 : : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
45 : : * scan to advance it via another call to _bt_first.
46 : : *
47 : : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
48 : : * a new page; others must wait.
49 : : *
50 : : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
51 : : * to a new page; some process can start doing that.
52 : : *
53 : : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
54 : : */
55 : : typedef enum
56 : : {
57 : : BTPARALLEL_NOT_INITIALIZED,
58 : : BTPARALLEL_NEED_PRIMSCAN,
59 : : BTPARALLEL_ADVANCING,
60 : : BTPARALLEL_IDLE,
61 : : BTPARALLEL_DONE,
62 : : } BTPS_State;
63 : :
64 : : /*
65 : : * BTParallelScanDescData contains btree specific shared information required
66 : : * for parallel scan.
67 : : */
68 : : typedef struct BTParallelScanDescData
69 : : {
70 : : BlockNumber btps_nextScanPage; /* next page to be scanned */
71 : : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
72 : : * btps_nextScanPage */
73 : : BTPS_State btps_pageStatus; /* indicates whether next page is
74 : : * available for scan. see above for
75 : : * possible states of parallel scan. */
76 : : LWLock btps_lock; /* protects shared parallel state */
77 : : ConditionVariable btps_cv; /* used to synchronize parallel scan */
78 : :
79 : : /*
80 : : * btps_arrElems is used when scans need to schedule another primitive
81 : : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
82 : : * offsets for each = scan key associated with a ScalarArrayOp array.
83 : : */
84 : : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
85 : :
86 : : /*
87 : : * Additional space (at the end of the struct) is used when scans need to
88 : : * schedule another primitive index scan with one or more skip arrays.
89 : : * Holds a flattened datum representation for each = scan key associated
90 : : * with a skip array.
91 : : */
92 : : } BTParallelScanDescData;
93 : :
94 : : typedef struct BTParallelScanDescData *BTParallelScanDesc;
95 : :
96 : :
97 : : static bool _bt_start_prim_scan(IndexScanDesc scan);
98 : : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
99 : : BTScanOpaque so);
100 : : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
101 : : BTScanOpaque so);
102 : : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
103 : : IndexBulkDeleteCallback callback, void *callback_state,
104 : : BTCycleId cycleid);
105 : : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
106 : : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
107 : : IndexTuple posting,
108 : : OffsetNumber updatedoffset,
109 : : int *nremaining);
110 : :
111 : :
112 : : /*
113 : : * Btree handler function: return IndexAmRoutine with access method parameters
114 : : * and callbacks.
115 : : */
116 : : Datum
3710 tgl@sss.pgh.pa.us 117 :CBC 1582860 : bthandler(PG_FUNCTION_ARGS)
118 : : {
119 : : static const IndexAmRoutine amroutine = {
120 : : .type = T_IndexAmRoutine,
121 : : .amstrategies = BTMaxStrategyNumber,
122 : : .amsupport = BTNProcs,
123 : : .amoptsprocnum = BTOPTIONS_PROC,
124 : : .amcanorder = true,
125 : : .amcanorderbyop = false,
126 : : .amcanhash = false,
127 : : .amconsistentequality = true,
128 : : .amconsistentordering = true,
129 : : .amcanbackward = true,
130 : : .amcanunique = true,
131 : : .amcanmulticol = true,
132 : : .amoptionalkey = true,
133 : : .amsearcharray = true,
134 : : .amsearchnulls = true,
135 : : .amstorage = false,
136 : : .amclusterable = true,
137 : : .ampredlocks = true,
138 : : .amcanparallel = true,
139 : : .amcanbuildparallel = true,
140 : : .amcaninclude = true,
141 : : .amusemaintenanceworkmem = false,
142 : : .amsummarizing = false,
143 : : .amparallelvacuumoptions =
144 : : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
145 : : .amkeytype = InvalidOid,
146 : :
147 : : .ambuild = btbuild,
148 : : .ambuildempty = btbuildempty,
149 : : .aminsert = btinsert,
150 : : .aminsertcleanup = NULL,
151 : : .ambulkdelete = btbulkdelete,
152 : : .amvacuumcleanup = btvacuumcleanup,
153 : : .amcanreturn = btcanreturn,
154 : : .amcostestimate = btcostestimate,
155 : : .amgettreeheight = btgettreeheight,
156 : : .amoptions = btoptions,
157 : : .amproperty = btproperty,
158 : : .ambuildphasename = btbuildphasename,
159 : : .amvalidate = btvalidate,
160 : : .amadjustmembers = btadjustmembers,
161 : : .ambeginscan = btbeginscan,
162 : : .amrescan = btrescan,
163 : : .amgettuple = btgettuple,
164 : : .amgetbitmap = btgetbitmap,
165 : : .amendscan = btendscan,
166 : : .ammarkpos = btmarkpos,
167 : : .amrestrpos = btrestrpos,
168 : : .amestimateparallelscan = btestimateparallelscan,
169 : : .aminitparallelscan = btinitparallelscan,
170 : : .amparallelrescan = btparallelrescan,
171 : : .amtranslatestrategy = bttranslatestrategy,
172 : : .amtranslatecmptype = bttranslatecmptype,
173 : : };
174 : :
75 tgl@sss.pgh.pa.us 175 :GNC 1582860 : PG_RETURN_POINTER(&amroutine);
176 : : }
177 : :
178 : : /*
179 : : * btbuildempty() -- build an empty btree index in the initialization fork
180 : : */
181 : : void
3710 tgl@sss.pgh.pa.us 182 :CBC 95 : btbuildempty(Relation index)
183 : : {
935 heikki.linnakangas@i 184 : 95 : bool allequalimage = _bt_allequalimage(index, false);
185 : : BulkWriteState *bulkstate;
186 : : BulkWriteBuffer metabuf;
187 : :
751 188 : 95 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
189 : :
190 : : /* Construct metapage. */
191 : 95 : metabuf = smgr_bulk_get_buf(bulkstate);
192 : 95 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
193 : 95 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
194 : :
195 : 95 : smgr_bulk_finish(bulkstate);
5555 rhaas@postgresql.org 196 : 95 : }
197 : :
198 : : /*
199 : : * btinsert() -- insert an index tuple into a btree.
200 : : *
201 : : * Descend the tree recursively, find the appropriate location for our
202 : : * new tuple, and put it there.
203 : : */
204 : : bool
3710 tgl@sss.pgh.pa.us 205 : 4273976 : btinsert(Relation rel, Datum *values, bool *isnull,
206 : : ItemPointer ht_ctid, Relation heapRel,
207 : : IndexUniqueCheck checkUnique,
208 : : bool indexUnchanged,
209 : : IndexInfo *indexInfo)
210 : : {
211 : : bool result;
212 : : IndexTuple itup;
213 : :
214 : : /* generate an index tuple */
7664 215 : 4273976 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
10416 bruce@momjian.us 216 : 4273976 : itup->t_tid = *ht_ctid;
217 : :
1887 pg@bowt.ie 218 : 4273976 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
219 : :
10416 bruce@momjian.us 220 : 4273707 : pfree(itup);
221 : :
3710 tgl@sss.pgh.pa.us 222 : 4273707 : return result;
223 : : }
224 : :
225 : : /*
226 : : * btgettuple() -- Get the next tuple in the scan.
227 : : */
228 : : bool
229 : 17765945 : btgettuple(IndexScanDesc scan, ScanDirection dir)
230 : : {
8696 231 : 17765945 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
232 : : bool res;
233 : :
282 pg@bowt.ie 234 [ - + ]: 17765945 : Assert(scan->heapRelation != NULL);
235 : :
236 : : /* btree indexes are never lossy */
6545 tgl@sss.pgh.pa.us 237 : 17765945 : scan->xs_recheck = false;
238 : :
239 : : /* Each loop iteration performs another primitive index scan */
240 : : do
241 : : {
242 : : /*
243 : : * If we've already initialized this scan, we can just advance it in
244 : : * the appropriate direction. If we haven't done so yet, we call
245 : : * _bt_first() to get the first item in the scan.
246 : : */
5264 247 [ + + - + : 17774446 : if (!BTScanPosIsValid(so->currPos))
+ + ]
248 : 7900328 : res = _bt_first(scan, dir);
249 : : else
250 : : {
251 : : /*
252 : : * Check to see if we should kill the previously-fetched tuple.
253 : : */
254 [ + + ]: 9874118 : if (scan->kill_prior_tuple)
255 : : {
256 : : /*
257 : : * Yes, remember it for later. (We'll deal with all such
258 : : * tuples at once right before leaving the index page.) The
259 : : * test for numKilled overrun is not just paranoia: if the
260 : : * caller reverses direction in the indexscan then the same
261 : : * item might get entered multiple times. It's not worth
262 : : * trying to optimize that, so we don't detect it, but instead
263 : : * just forget any excess entries.
264 : : */
265 [ + + ]: 203828 : if (so->killedItems == NULL)
95 michael@paquier.xyz 266 :GNC 86388 : so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
2209 pg@bowt.ie 267 [ + - ]:CBC 203828 : if (so->numKilled < MaxTIDsPerBTreePage)
5264 tgl@sss.pgh.pa.us 268 : 203828 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269 : : }
270 : :
271 : : /*
272 : : * Now continue the scan.
273 : : */
274 : 9874118 : res = _bt_next(scan, dir);
275 : : }
276 : :
277 : : /* If we have a tuple, return it ... */
278 [ + + ]: 17774446 : if (res)
279 : 14009475 : break;
280 : : /* ... otherwise see if we need another primitive index scan */
97 pg@bowt.ie 281 [ + + + + ]:GNC 3764971 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
282 : :
3710 tgl@sss.pgh.pa.us 283 :CBC 17765945 : return res;
284 : : }
285 : :
286 : : /*
287 : : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288 : : */
289 : : int64
290 : 9048 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
291 : : {
7658 292 : 9048 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
6548 293 : 9048 : int64 ntids = 0;
294 : : ItemPointer heapTid;
295 : :
282 pg@bowt.ie 296 [ - + ]: 9048 : Assert(scan->heapRelation == NULL);
297 : :
298 : : /* Each loop iteration performs another primitive index scan */
299 : : do
300 : : {
301 : : /* Fetch the first page & tuple */
5264 tgl@sss.pgh.pa.us 302 [ + + ]: 9335 : if (_bt_first(scan, ForwardScanDirection))
303 : : {
304 : : /* Save tuple ID, and continue scanning */
2561 andres@anarazel.de 305 : 7815 : heapTid = &scan->xs_heaptid;
5264 tgl@sss.pgh.pa.us 306 : 7815 : tbm_add_tuples(tbm, heapTid, 1, false);
307 : 7815 : ntids++;
308 : :
309 : : for (;;)
310 : : {
311 : : /*
312 : : * Advance to next tuple within page. This is the same as the
313 : : * easy case in _bt_next().
314 : : */
315 [ + + ]: 1104289 : if (++so->currPos.itemIndex > so->currPos.lastItem)
316 : : {
317 : : /* let _bt_next do the heavy lifting */
318 [ + + ]: 10795 : if (!_bt_next(scan, ForwardScanDirection))
319 : 7815 : break;
320 : : }
321 : :
322 : : /* Save tuple ID, and continue scanning */
323 : 1096474 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
324 : 1096474 : tbm_add_tuples(tbm, heapTid, 1, false);
325 : 1096474 : ntids++;
326 : : }
327 : : }
328 : : /* Now see if we need another primitive index scan */
97 pg@bowt.ie 329 [ + + + + ]:GNC 9335 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
330 : :
3710 tgl@sss.pgh.pa.us 331 :CBC 9048 : return ntids;
332 : : }
333 : :
334 : : /*
335 : : * btbeginscan() -- start a scan on a btree index
336 : : */
337 : : IndexScanDesc
338 : 7547869 : btbeginscan(Relation rel, int nkeys, int norderbys)
339 : : {
340 : : IndexScanDesc scan;
341 : : BTScanOpaque so;
342 : :
343 : : /* no order by operators allowed */
5582 344 [ - + ]: 7547869 : Assert(norderbys == 0);
345 : :
346 : : /* get the scan */
347 : 7547869 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
348 : :
349 : : /* allocate private workspace */
95 michael@paquier.xyz 350 :GNC 7547869 : so = palloc_object(BTScanOpaqueData);
4008 kgrittn@postgresql.o 351 :CBC 7547869 : BTScanPosInvalidate(so->currPos);
352 : 7547869 : BTScanPosInvalidate(so->markPos);
5582 tgl@sss.pgh.pa.us 353 [ + + ]: 7547869 : if (scan->numberOfKeys > 0)
354 : 7541144 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
355 : : else
356 : 6725 : so->keyData = NULL;
357 : :
345 pg@bowt.ie 358 : 7547869 : so->skipScan = false;
708 359 : 7547869 : so->needPrimScan = false;
360 : 7547869 : so->scanBehind = false;
515 361 : 7547869 : so->oppositeDirCheck = false;
5264 tgl@sss.pgh.pa.us 362 : 7547869 : so->arrayKeys = NULL;
708 pg@bowt.ie 363 : 7547869 : so->orderProcs = NULL;
5264 tgl@sss.pgh.pa.us 364 : 7547869 : so->arrayContext = NULL;
365 : :
5582 366 : 7547869 : so->killedItems = NULL; /* until needed */
367 : 7547869 : so->numKilled = 0;
368 : :
369 : : /*
370 : : * We don't know yet whether the scan will be index-only, so we do not
371 : : * allocate the tuple workspace arrays until btrescan. However, we set up
372 : : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
373 : : */
5271 374 : 7547869 : so->currTuples = so->markTuples = NULL;
375 : :
5264 376 : 7547869 : scan->xs_itupdesc = RelationGetDescr(rel);
377 : :
5582 378 : 7547869 : scan->opaque = so;
379 : :
3710 380 : 7547869 : return scan;
381 : : }
382 : :
383 : : /*
384 : : * btrescan() -- rescan an index relation
385 : : */
386 : : void
387 : 7901555 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
388 : : ScanKey orderbys, int norderbys)
389 : : {
5582 390 : 7901555 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
391 : :
392 : : /* we aren't holding any read locks, but gotta drop the pins */
7252 393 [ + + - + : 7901555 : if (BTScanPosIsValid(so->currPos))
+ + ]
394 : : {
395 : : /* Before leaving current page, deal with any killed items */
396 [ + + ]: 39862 : if (so->numKilled > 0)
4008 kgrittn@postgresql.o 397 : 408 : _bt_killitems(scan);
398 [ - + - - : 39862 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
399 : 39862 : BTScanPosInvalidate(so->currPos);
400 : : }
401 : :
402 : : /*
403 : : * We prefer to eagerly drop leaf page pins before btgettuple returns.
404 : : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
405 : : *
406 : : * We cannot safely drop leaf page pins during index-only scans due to a
407 : : * race condition involving VACUUM setting pages all-visible in the VM.
408 : : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
409 : : *
410 : : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
411 : : * Pins cannot be held for more than an instant during bitmap scans either
412 : : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
413 : : *
414 : : * See nbtree/README section on making concurrent TID recycling safe.
415 : : *
416 : : * Note: so->dropPin should never change across rescans.
417 : : */
282 pg@bowt.ie 418 : 23518776 : so->dropPin = (!scan->xs_want_itup &&
2 andres@anarazel.de 419 [ + + + + :GNC 14972692 : IsMVCCLikeSnapshot(scan->xs_snapshot) &&
+ + ]
282 pg@bowt.ie 420 [ + + ]:CBC 7071137 : scan->heapRelation != NULL);
421 : :
7143 tgl@sss.pgh.pa.us 422 : 7901555 : so->markItemIndex = -1;
708 pg@bowt.ie 423 : 7901555 : so->needPrimScan = false;
424 : 7901555 : so->scanBehind = false;
515 425 : 7901555 : so->oppositeDirCheck = false;
4008 kgrittn@postgresql.o 426 [ + - - + : 7901555 : BTScanPosUnpinIfPinned(so->markPos);
- + ]
427 : 7901555 : BTScanPosInvalidate(so->markPos);
428 : :
429 : : /*
430 : : * Allocate tuple workspace arrays, if needed for an index-only scan and
431 : : * not already done in a previous rescan call. To save on palloc
432 : : * overhead, both workspaces are allocated as one palloc block; only this
433 : : * function and btendscan know that.
434 : : */
5271 tgl@sss.pgh.pa.us 435 [ + + + + ]: 7901555 : if (scan->xs_want_itup && so->currTuples == NULL)
436 : : {
437 : 76054 : so->currTuples = (char *) palloc(BLCKSZ * 2);
438 : 76054 : so->markTuples = so->currTuples + BLCKSZ;
439 : : }
440 : :
441 : : /*
442 : : * Reset the scan keys
443 : : */
8393 444 [ + + + + ]: 7901555 : if (scankey && scan->numberOfKeys > 0)
550 peter@eisentraut.org 445 : 7894715 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
8159 tgl@sss.pgh.pa.us 446 : 7901555 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
708 pg@bowt.ie 447 : 7901555 : so->numArrayKeys = 0; /* ditto */
10841 scrappy@hub.org 448 : 7901555 : }
449 : :
450 : : /*
451 : : * btendscan() -- close down a scan
452 : : */
453 : : void
3710 tgl@sss.pgh.pa.us 454 : 7547048 : btendscan(IndexScanDesc scan)
455 : : {
7252 456 : 7547048 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
457 : :
458 : : /* we aren't holding any read locks, but gotta drop the pins */
459 [ + + - + : 7547048 : if (BTScanPosIsValid(so->currPos))
+ + ]
460 : : {
461 : : /* Before leaving current page, deal with any killed items */
462 [ + + ]: 4094906 : if (so->numKilled > 0)
4008 kgrittn@postgresql.o 463 : 44445 : _bt_killitems(scan);
464 [ - + - - : 4094906 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
465 : : }
466 : :
7143 tgl@sss.pgh.pa.us 467 : 7547048 : so->markItemIndex = -1;
4008 kgrittn@postgresql.o 468 [ + + - + : 7547048 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
469 : :
470 : : /* No need to invalidate positions, the RAM is about to be freed. */
471 : :
472 : : /* Release storage */
8103 neilc@samurai.com 473 [ + + ]: 7547048 : if (so->keyData != NULL)
10416 bruce@momjian.us 474 : 7540338 : pfree(so->keyData);
475 : : /* so->arrayKeys and so->orderProcs are in arrayContext */
5264 tgl@sss.pgh.pa.us 476 [ + + ]: 7547048 : if (so->arrayContext != NULL)
477 : 2487 : MemoryContextDelete(so->arrayContext);
478 [ + + ]: 7547048 : if (so->killedItems != NULL)
479 : 86361 : pfree(so->killedItems);
5271 480 [ + + ]: 7547048 : if (so->currTuples != NULL)
481 : 76032 : pfree(so->currTuples);
482 : : /* so->markTuples should not be pfree'd, see btrescan */
10416 bruce@momjian.us 483 : 7547048 : pfree(so);
10841 scrappy@hub.org 484 : 7547048 : }
485 : :
486 : : /*
487 : : * btmarkpos() -- save current scan position
488 : : */
489 : : void
3710 tgl@sss.pgh.pa.us 490 : 65082 : btmarkpos(IndexScanDesc scan)
491 : : {
7252 492 : 65082 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
493 : :
494 : : /* There may be an old mark with a pin (but no lock). */
4008 kgrittn@postgresql.o 495 [ + + - + : 65082 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
496 : :
497 : : /*
498 : : * Just record the current itemIndex. If we later step to next page
499 : : * before releasing the marked position, _bt_steppage makes a full copy of
500 : : * the currPos struct in markPos. If (as often happens) the mark is moved
501 : : * before we leave the page, we don't have to do that work.
502 : : */
7252 tgl@sss.pgh.pa.us 503 [ - + - - : 65082 : if (BTScanPosIsValid(so->currPos))
+ - ]
7143 504 : 65082 : so->markItemIndex = so->currPos.itemIndex;
505 : : else
506 : : {
4008 kgrittn@postgresql.o 507 :UBC 0 : BTScanPosInvalidate(so->markPos);
7143 tgl@sss.pgh.pa.us 508 : 0 : so->markItemIndex = -1;
509 : : }
10841 scrappy@hub.org 510 :CBC 65082 : }
511 : :
512 : : /*
513 : : * btrestrpos() -- restore scan to last saved position
514 : : */
515 : : void
3710 tgl@sss.pgh.pa.us 516 : 27027 : btrestrpos(IndexScanDesc scan)
517 : : {
7252 518 : 27027 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
519 : :
7143 520 [ + + ]: 27027 : if (so->markItemIndex >= 0)
521 : : {
522 : : /*
523 : : * The scan has never moved to a new page since the last mark. Just
524 : : * restore the itemIndex.
525 : : *
526 : : * NB: In this case we can't count on anything in so->markPos to be
527 : : * accurate.
528 : : */
529 : 26973 : so->currPos.itemIndex = so->markItemIndex;
530 : : }
531 : : else
532 : : {
533 : : /*
534 : : * The scan moved to a new page after last mark or restore, and we are
535 : : * now restoring to the marked page. We aren't holding any read
536 : : * locks, but if we're still holding the pin for the current position,
537 : : * we must drop it.
538 : : */
539 [ - + - - : 54 : if (BTScanPosIsValid(so->currPos))
+ - ]
540 : : {
541 : : /* Before leaving current page, deal with any killed items */
4008 kgrittn@postgresql.o 542 [ - + ]: 54 : if (so->numKilled > 0)
4008 kgrittn@postgresql.o 543 :UBC 0 : _bt_killitems(scan);
4008 kgrittn@postgresql.o 544 [ - + - - :CBC 54 : BTScanPosUnpinIfPinned(so->currPos);
- + ]
545 : : }
546 : :
7143 tgl@sss.pgh.pa.us 547 [ - + - - : 54 : if (BTScanPosIsValid(so->markPos))
+ - ]
548 : : {
549 : : /* bump pin on mark buffer for assignment to current buffer */
4008 kgrittn@postgresql.o 550 [ - + - - : 54 : if (BTScanPosIsPinned(so->markPos))
- + ]
4008 kgrittn@postgresql.o 551 :UBC 0 : IncrBufferRefCount(so->markPos.buf);
7143 tgl@sss.pgh.pa.us 552 :CBC 54 : memcpy(&so->currPos, &so->markPos,
553 : : offsetof(BTScanPosData, items[1]) +
554 : 54 : so->markPos.lastItem * sizeof(BTScanPosItem));
5271 555 [ - + ]: 54 : if (so->currTuples)
5271 tgl@sss.pgh.pa.us 556 :UBC 0 : memcpy(so->currTuples, so->markTuples,
557 : 0 : so->markPos.nextTupleOffset);
558 : : /* Reset the scan's array keys (see _bt_steppage for why) */
708 pg@bowt.ie 559 [ - + ]:CBC 54 : if (so->numArrayKeys)
560 : : {
708 pg@bowt.ie 561 :UBC 0 : _bt_start_array_keys(scan, so->currPos.dir);
562 : 0 : so->needPrimScan = false;
563 : : }
564 : : }
565 : : else
4008 kgrittn@postgresql.o 566 : 0 : BTScanPosInvalidate(so->currPos);
567 : : }
10841 scrappy@hub.org 568 :CBC 27027 : }
569 : :
570 : : /*
571 : : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
572 : : */
573 : : Size
345 pg@bowt.ie 574 : 32 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
575 : : {
576 : 32 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
577 : : Size estnbtreeshared,
578 : : genericattrspace;
579 : :
580 : : /*
581 : : * Pessimistically assume that every input scan key will be output with
582 : : * its own SAOP array
583 : : */
584 : 32 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
585 : : sizeof(int) * nkeys;
586 : :
587 : : /* Single column indexes cannot possibly use a skip array */
588 [ + + ]: 32 : if (nkeyatts == 1)
589 : 23 : return estnbtreeshared;
590 : :
591 : : /*
592 : : * Pessimistically assume that all attributes prior to the least
593 : : * significant attribute require a skip array (and an associated key)
594 : : */
595 : 9 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
596 : : sizeof(Datum));
597 [ + + ]: 18 : for (int attnum = 1; attnum < nkeyatts; attnum++)
598 : : {
599 : : CompactAttribute *attr;
600 : :
601 : : /*
602 : : * We make the conservative assumption that every index column will
603 : : * also require a skip array.
604 : : *
605 : : * Every skip array must have space to store its scan key's sk_flags.
606 : : */
607 : 9 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
608 : :
609 : : /* Consider space required to store a datum of opclass input type */
610 : 9 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
611 [ + - ]: 9 : if (attr->attbyval)
612 : 9 : {
613 : : /* This index attribute stores pass-by-value datums */
614 : 9 : Size estfixed = datumEstimateSpace((Datum) 0, false,
615 : 9 : true, attr->attlen);
616 : :
617 : 9 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
618 : 9 : continue;
619 : : }
620 : :
621 : : /*
622 : : * This index attribute stores pass-by-reference datums.
623 : : *
624 : : * Assume that serializing this array will use just as much space as a
625 : : * pass-by-value datum, in addition to space for the largest possible
626 : : * whole index tuple (this is not just a per-datum portion of the
627 : : * largest possible tuple because that'd be almost as large anyway).
628 : : *
629 : : * This is quite conservative, but it's not clear how we could do much
630 : : * better. The executor requires an up-front storage request size
631 : : * that reliably covers the scan's high watermark memory usage. We
632 : : * can't be sure of the real high watermark until the scan is over.
633 : : */
345 pg@bowt.ie 634 :UBC 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
635 : 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
636 : : }
637 : :
345 pg@bowt.ie 638 :CBC 9 : return estnbtreeshared;
639 : : }
640 : :
641 : : /*
642 : : * _bt_start_prim_scan() -- start scheduled primitive index scan?
643 : : *
644 : : * Returns true if _bt_checkkeys scheduled another primitive index scan, just
645 : : * as the last one ended. Otherwise returns false, indicating that the array
646 : : * keys are now fully exhausted.
647 : : *
648 : : * Only call here during scans with one or more equality type array scan keys,
649 : : * after _bt_first or _bt_next return false.
650 : : */
651 : : static bool
97 pg@bowt.ie 652 :GNC 44474 : _bt_start_prim_scan(IndexScanDesc scan)
653 : : {
654 : 44474 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
655 : :
656 [ - + ]: 44474 : Assert(so->numArrayKeys);
657 : :
658 : 44474 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
659 : :
660 : : /*
661 : : * Array keys are advanced within _bt_checkkeys when the scan reaches the
662 : : * leaf level (more precisely, they're advanced when the scan reaches the
663 : : * end of each distinct set of array elements). This process avoids
664 : : * repeat access to leaf pages (across multiple primitive index scans) by
665 : : * advancing the scan's array keys when it allows the primitive index scan
666 : : * to find nearby matching tuples (or when it eliminates ranges of array
667 : : * key space that can't possibly be satisfied by any index tuple).
668 : : *
669 : : * _bt_checkkeys sets a simple flag variable to schedule another primitive
670 : : * index scan. The flag tells us what to do.
671 : : *
672 : : * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
673 : : * various cases where that won't happen. For example, if the index is
674 : : * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
675 : : * We also don't expect a call to _bt_checkkeys during searches for a
676 : : * non-existent value that happens to be lower/higher than any existing
677 : : * value in the index.
678 : : *
679 : : * We don't require special handling for these cases -- we don't need to
680 : : * be explicitly instructed to _not_ perform another primitive index scan.
681 : : * It's up to code under the control of _bt_first to always set the flag
682 : : * when another primitive index scan will be required.
683 : : *
684 : : * This works correctly, even with the tricky cases listed above, which
685 : : * all involve access to leaf pages "near the boundaries of the key space"
686 : : * (whether it's from a leftmost/rightmost page, or an imaginary empty
687 : : * leaf root page). If _bt_checkkeys cannot be reached by a primitive
688 : : * index scan for one set of array keys, then it also won't be reached for
689 : : * any later set ("later" in terms of the direction that we scan the index
690 : : * and advance the arrays). The array keys won't have advanced in these
691 : : * cases, but that's the correct behavior (even _bt_advance_array_keys
692 : : * won't always advance the arrays at the point they become "exhausted").
693 : : */
694 [ + + ]: 44474 : if (so->needPrimScan)
695 : : {
696 : : /*
697 : : * Flag was set -- must call _bt_first again, which will reset the
698 : : * scan's needPrimScan flag
699 : : */
700 : 8788 : return true;
701 : : }
702 : :
703 : : /* The top-level index scan ran out of tuples in this scan direction */
704 [ + + ]: 35686 : if (scan->parallel_scan != NULL)
705 : 15 : _bt_parallel_done(scan);
706 : :
707 : 35686 : return false;
708 : : }
709 : :
710 : : /*
711 : : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
712 : : *
713 : : * Caller must have exclusively locked btscan->btps_lock when called.
714 : : */
715 : : static void
345 pg@bowt.ie 716 :CBC 18 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
717 : : BTScanOpaque so)
718 : : {
719 : : char *datumshared;
720 : :
721 : : /* Space for serialized datums begins immediately after btps_arrElems[] */
722 : 18 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
723 [ + + ]: 36 : for (int i = 0; i < so->numArrayKeys; i++)
724 : : {
725 : 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
726 : 18 : ScanKey skey = &so->keyData[array->scan_key];
727 : :
728 [ + - ]: 18 : if (array->num_elems != -1)
729 : : {
730 : : /* Save SAOP array's cur_elem (no need to copy key/datum) */
731 [ - + ]: 18 : Assert(!(skey->sk_flags & SK_BT_SKIP));
732 : 18 : btscan->btps_arrElems[i] = array->cur_elem;
733 : 18 : continue;
734 : : }
735 : :
736 : : /* Save all mutable state associated with skip array's key */
345 pg@bowt.ie 737 [ # # ]:UBC 0 : Assert(skey->sk_flags & SK_BT_SKIP);
738 : 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
739 : 0 : datumshared += sizeof(int);
740 : :
741 [ # # ]: 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
742 : : {
743 : : /* No sk_argument datum to serialize */
744 [ # # ]: 0 : Assert(skey->sk_argument == 0);
745 : 0 : continue;
746 : : }
747 : :
748 : 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
749 : 0 : array->attbyval, array->attlen, &datumshared);
750 : : }
345 pg@bowt.ie 751 :CBC 18 : }
752 : :
/*
 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
 *
 * Reads back the per-array state written by _bt_parallel_serialize_arrays,
 * and installs it into caller's backend-local scan state (so->arrayKeys and
 * the corresponding so->keyData scan keys).  The on-wire layout must stay in
 * sync with the serialization side: SAOP arrays store only a cur_elem index
 * in btps_arrElems[], while skip arrays store an int copy of sk_flags
 * followed (unless the MINVAL/MAXVAL flags are set) by a datumSerialize'd
 * sk_argument, all in the space immediately after btps_arrElems[].
 *
 * NOTE(review): the rel argument is unused in this function's body at
 * present; presumably kept for symmetry with the serialize side.
 *
 * Caller must have exclusively locked btscan->btps_lock when called.
 */
static void
_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
							BTScanOpaque so)
{
	char	   *datumshared;

	/* Space for serialized datums begins immediately after btps_arrElems[] */
	datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
	for (int i = 0; i < so->numArrayKeys; i++)
	{
		BTArrayKeyInfo *array = &so->arrayKeys[i];
		ScanKey		skey = &so->keyData[array->scan_key];
		bool		isnull;

		if (array->num_elems != -1)
		{
			/* Restore SAOP array using its saved cur_elem */
			Assert(!(skey->sk_flags & SK_BT_SKIP));
			array->cur_elem = btscan->btps_arrElems[i];
			skey->sk_argument = array->elem_values[array->cur_elem];
			continue;
		}

		/* Restore skip array by restoring its key directly */
		if (!array->attbyval && skey->sk_argument)
			pfree(DatumGetPointer(skey->sk_argument));	/* free old pass-by-ref datum */
		skey->sk_argument = (Datum) 0;
		/* sk_flags always comes first in the serialized representation */
		memcpy(&skey->sk_flags, datumshared, sizeof(int));
		datumshared += sizeof(int);

		Assert(skey->sk_flags & SK_BT_SKIP);

		if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
		{
			/* No sk_argument datum to restore */
			continue;
		}

		/* Deserialize the datum (may palloc for pass-by-ref types) */
		skey->sk_argument = datumRestore(&datumshared, &isnull);
		if (isnull)
		{
			/* A NULL restores as a zero Datum with matching NULL flags */
			Assert(skey->sk_argument == 0);
			Assert(skey->sk_flags & SK_SEARCHNULL);
			Assert(skey->sk_flags & SK_ISNULL);
		}
	}
}
805 : :
806 : : /*
807 : : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
808 : : */
809 : : void
810 : 32 : btinitparallelscan(void *target)
811 : : {
812 : 32 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
813 : :
372 pg@bowt.ie 814 : 32 : LWLockInitialize(&bt_target->btps_lock,
815 : : LWTRANCHE_PARALLEL_BTREE_SCAN);
513 816 : 32 : bt_target->btps_nextScanPage = InvalidBlockNumber;
817 : 32 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
3315 rhaas@postgresql.org 818 : 32 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
819 : 32 : ConditionVariableInit(&bt_target->btps_cv);
820 : 32 : }
821 : :
822 : : /*
823 : : * btparallelrescan() -- reset parallel scan
824 : : */
825 : : void
826 : 12 : btparallelrescan(IndexScanDesc scan)
827 : : {
828 : : BTParallelScanDesc btscan;
829 : 12 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
830 : :
831 [ - + ]: 12 : Assert(parallel_scan);
832 : :
472 peter@eisentraut.org 833 : 12 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
834 : : parallel_scan->ps_offset_am);
835 : :
836 : : /*
837 : : * In theory, we don't need to acquire the LWLock here, because there
838 : : * shouldn't be any other workers running at this point, but we do so for
839 : : * consistency.
840 : : */
372 pg@bowt.ie 841 : 12 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
513 842 : 12 : btscan->btps_nextScanPage = InvalidBlockNumber;
843 : 12 : btscan->btps_lastCurrPage = InvalidBlockNumber;
3315 rhaas@postgresql.org 844 : 12 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
372 pg@bowt.ie 845 : 12 : LWLockRelease(&btscan->btps_lock);
3315 rhaas@postgresql.org 846 : 12 : }
847 : :
/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
				   BlockNumber *last_curr_page, bool first)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		exit_loop = false,
				status = true,
				endscan = false;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	/* Defaults for the "ignore these on false return" contract above */
	*next_scan_page = InvalidBlockNumber;
	*last_curr_page = InvalidBlockNumber;

	/*
	 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
	 * call to _bt_readnextpage treats this backend similarly to a serial
	 * backend that steps from *last_curr_page to *next_scan_page (unless this
	 * backend's so->currPos is initialized by _bt_readfirstpage before then).
	 */
	BTScanPosInvalidate(so->currPos);
	so->currPos.moreLeft = so->currPos.moreRight = true;

	if (first)
	{
		/*
		 * Initialize array related state when called from _bt_first, assuming
		 * that this will be the first primitive index scan for the scan
		 */
		so->needPrimScan = false;
		so->scanBehind = false;
		so->oppositeDirCheck = false;
	}
	else
	{
		/*
		 * Don't attempt to seize the scan when it requires another primitive
		 * index scan, since caller's backend cannot start it right now
		 */
		if (so->needPrimScan)
			return false;
	}

	btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
												  parallel_scan->ps_offset_am);

	/* Loop until we seize the scan, or learn that it's over for us */
	while (1)
	{
		LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);

		if (btscan->btps_pageStatus == BTPARALLEL_DONE)
		{
			/* We're done with this parallel index scan */
			status = false;
		}
		else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
				 btscan->btps_nextScanPage == P_NONE)
		{
			/* End this parallel index scan */
			status = false;
			endscan = true;
		}
		else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
		{
			Assert(so->numArrayKeys);

			if (first)
			{
				/* Can start scheduled primitive scan right away, so do so */
				btscan->btps_pageStatus = BTPARALLEL_ADVANCING;

				/* Restore scan's array keys from serialized values */
				_bt_parallel_restore_arrays(rel, btscan, so);
				exit_loop = true;
			}
			else
			{
				/*
				 * Don't attempt to seize the scan when it requires another
				 * primitive index scan, since caller's backend cannot start
				 * it right now
				 */
				status = false;
			}

			/*
			 * Either way, update backend local state to indicate that a
			 * pending primitive scan is required
			 */
			so->needPrimScan = true;
			so->scanBehind = false;
			so->oppositeDirCheck = false;
		}
		else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			Assert(btscan->btps_nextScanPage != P_NONE);
			*next_scan_page = btscan->btps_nextScanPage;
			*last_curr_page = btscan->btps_lastCurrPage;
			exit_loop = true;
		}
		LWLockRelease(&btscan->btps_lock);
		if (exit_loop || !status)
			break;
		/* Another backend is advancing the scan; wait to be woken */
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	/* When the scan has reached the rightmost (or leftmost) page, end it */
	if (endscan)
		_bt_parallel_done(scan);

	return status;
}
987 : :
988 : : /*
989 : : * _bt_parallel_release() -- Complete the process of advancing the scan to a
990 : : * new page. We now have the new value btps_nextScanPage; another backend
991 : : * can now begin advancing the scan.
992 : : *
993 : : * Callers whose scan uses array keys must save their curr_page argument so
994 : : * that it can be passed to _bt_parallel_primscan_schedule, should caller
995 : : * determine that another primitive index scan is required.
996 : : *
997 : : * If caller's next_scan_page is P_NONE, the scan has reached the index's
998 : : * rightmost/leftmost page. This is treated as reaching the end of the scan
999 : : * within _bt_parallel_seize.
1000 : : *
1001 : : * Note: unlike the serial case, parallel scans don't need to remember both
1002 : : * sibling links. next_scan_page is whichever link is next given the scan's
1003 : : * direction. That's all we'll ever need, since the direction of a parallel
1004 : : * scan can never change.
1005 : : */
1006 : : void
513 pg@bowt.ie 1007 : 668 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1008 : : BlockNumber curr_page)
1009 : : {
3315 rhaas@postgresql.org 1010 : 668 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1011 : : BTParallelScanDesc btscan;
1012 : :
492 pg@bowt.ie 1013 [ - + ]: 668 : Assert(BlockNumberIsValid(next_scan_page));
1014 : :
472 peter@eisentraut.org 1015 : 668 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1016 : : parallel_scan->ps_offset_am);
1017 : :
372 pg@bowt.ie 1018 : 668 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
513 1019 : 668 : btscan->btps_nextScanPage = next_scan_page;
1020 : 668 : btscan->btps_lastCurrPage = curr_page;
3315 rhaas@postgresql.org 1021 : 668 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
372 pg@bowt.ie 1022 : 668 : LWLockRelease(&btscan->btps_lock);
3315 rhaas@postgresql.org 1023 : 668 : ConditionVariableSignal(&btscan->btps_cv);
1024 : 668 : }
1025 : :
/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 *
 * Safe to call for serial scans too; it's a no-op when scan->parallel_scan
 * is NULL.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	Assert(!BTScanPosIsValid(so->currPos));

	/* Do nothing, for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	/*
	 * Should not mark parallel scan done when there's still a pending
	 * primitive index scan
	 */
	if (so->needPrimScan)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
												  parallel_scan->ps_offset_am);

	/*
	 * Mark the parallel scan as done, unless some other process did so
	 * already
	 */
	LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
	Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
	if (btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	LWLockRelease(&btscan->btps_lock);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}
1074 : :
/*
 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
 *
 * Caller passes the curr_page most recently passed to _bt_parallel_release
 * by its backend.  Caller successfully schedules the next primitive index scan
 * if the shared parallel state hasn't been seized since caller's backend last
 * advanced the scan.
 *
 * If some other backend already seized (or finished) the scan in the
 * meantime, scheduling silently does nothing; the condition check below
 * detects that via btps_lastCurrPage/btps_pageStatus.
 */
void
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	/* Only scans with array keys ever schedule another primitive scan */
	Assert(so->numArrayKeys);

	btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
												  parallel_scan->ps_offset_am);

	LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
	if (btscan->btps_lastCurrPage == curr_page &&
		btscan->btps_pageStatus == BTPARALLEL_IDLE)
	{
		btscan->btps_nextScanPage = InvalidBlockNumber;
		btscan->btps_lastCurrPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;

		/* Serialize scan's current array keys */
		_bt_parallel_serialize_arrays(rel, btscan, so);
	}
	LWLockRelease(&btscan->btps_lock);
}
1109 : :
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = palloc0_object(IndexBulkDeleteResult);

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		/* Do the actual scan; deletable tuples identified via callback */
		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	/* Normal exit: clear the vacuum cycle ID ourselves */
	_bt_end_vacuum(rel);

	return stats;
}
1141 : :
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 * May return NULL (or caller's stats unchanged in ANALYZE ONLY mode) when
 * no index scan was needed.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	BlockNumber num_delpages;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything (we just maintain
	 * the information used within _bt_vacuum_needs_cleanup() by calling
	 * _bt_set_cleanup_info() below).
	 *
	 * If btbulkdelete was _not_ called, then we have a choice to make: we
	 * must decide whether or not a btvacuumscan() call is needed now (i.e.
	 * whether the ongoing VACUUM operation can entirely avoid a physical scan
	 * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
	 * now.
	 */
	if (stats == NULL)
	{
		/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
		if (!_bt_vacuum_needs_cleanup(info->index))
			return NULL;

		/*
		 * Since we aren't going to actually delete any leaf items, there's no
		 * need to go through all the vacuum-cycle-ID pushups here.
		 *
		 * Posting list tuples are a source of inaccuracy for cleanup-only
		 * scans.  btvacuumscan() will assume that the number of index tuples
		 * from each page can be used as num_index_tuples, even though
		 * num_index_tuples is supposed to represent the number of TIDs in the
		 * index.  This naive approach can underestimate the number of tuples
		 * in the index significantly.
		 *
		 * We handle the problem by making num_index_tuples an estimate in
		 * cleanup-only case.
		 */
		stats = palloc0_object(IndexBulkDeleteResult);
		btvacuumscan(info, stats, NULL, NULL, 0);	/* cycleid 0: no bulkdelete */
		stats->estimated_count = true;
	}

	/*
	 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
	 *
	 * num_delpages is the number of deleted pages now in the index that were
	 * not safe to place in the FSM to be recycled just yet.  num_delpages is
	 * greater than 0 only when _bt_pagedel() actually deleted pages during
	 * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
	 * have failed to place any newly deleted pages in the FSM just moments
	 * ago.  (Actually, there are edge cases where recycling of the current
	 * VACUUM's newly deleted pages does not even become safe by the time the
	 * next VACUUM comes around.  See nbtree/README.)
	 */
	Assert(stats->pages_deleted >= stats->pages_free);
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(info->index, num_delpages);

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}
1222 : :
/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	bool		needLock;
	BlockRangeReadStreamPrivate p;
	ReadStream *stream = NULL;

	/*
	 * Reset fields that track information about the entire index now.  This
	 * avoids double-counting in the case where a single VACUUM command
	 * requires multiple scans of the index.
	 *
	 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
	 * since they track information about the VACUUM command, and so must last
	 * across each call to btvacuumscan().
	 *
	 * (Note that pages_free is treated as state about the whole index, not
	 * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
	 * calls are idempotent, and get repeated for the same deleted pages in
	 * some scenarios.  The point for us is to track the number of recyclable
	 * pages in the index at the end of the VACUUM command.)
	 */
	stats->num_pages = 0;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;
	stats->pages_free = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/* Initialize vstate fields used by _bt_pendingfsm_finalize */
	vstate.bufsize = 0;
	vstate.maxbufsize = 0;
	vstate.pendingpages = NULL;
	vstate.npendingpages = 0;
	/* Consider applying _bt_pendingfsm_finalize optimization */
	_bt_pendingfsm_init(rel, &vstate, (callback == NULL));

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  Also, we need not worry if a page is added
	 * immediately after we look; the page splitting code already has
	 * write-lock on the left page before it adds a right page, so we must
	 * already have processed any tuples due to be moved into such a page.
	 *
	 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
	 * think the use of the extension lock is still required.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	/* Start just past the metapage; the read stream walks forward from here */
	p.current_blocknum = BTREE_METAPAGE + 1;

	/*
	 * It is safe to use batchmode as block_range_read_stream_cb takes no
	 * locks.
	 */
	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
										READ_STREAM_FULL |
										READ_STREAM_USE_BATCHING,
										info->strategy,
										rel,
										MAIN_FORKNUM,
										block_range_read_stream_cb,
										&p,
										0);
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (info->report_progress)
			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
										 num_pages);

		/* Quit if we've scanned the whole relation */
		if (p.current_blocknum >= num_pages)
			break;

		/* Let the stream's callback hand out blocks up to the new length */
		p.last_exclusive = num_pages;

		/* Iterate over pages, then loop back to recheck relation length */
		while (true)
		{
			BlockNumber current_block;
			Buffer		buf;

			/* call vacuum_delay_point while not holding any buffer lock */
			vacuum_delay_point(false);

			buf = read_stream_next_buffer(stream, NULL);

			/* InvalidBuffer means the stream has exhausted the block range */
			if (!BufferIsValid(buf))
				break;

			current_block = btvacuumpage(&vstate, buf);

			if (info->report_progress)
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 current_block);
		}

		/*
		 * We have to reset the read stream to use it again. After returning
		 * InvalidBuffer, the read stream API won't invoke our callback again
		 * until the stream has been reset.
		 */
		read_stream_reset(stream);
	}

	read_stream_end(stream);

	/* Set statistics num_pages field to final size of index */
	stats->num_pages = num_pages;

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If there were any calls to _bt_pagedel() during scan of the index then
	 * see if any of the resulting pages can be placed in the FSM now.  When
	 * it's not safe we'll have to leave it up to a future VACUUM operation.
	 *
	 * Finally, if we placed any pages in the FSM (either just now or during
	 * the scan), forcibly update the upper-level FSM pages to ensure that
	 * searchers can find them.
	 */
	_bt_pendingfsm_finalize(rel, &vstate);
	if (stats->pages_free > 0)
		IndexFreeSpaceMapVacuum(rel);
}
1397 : :
1398 : : /*
1399 : : * btvacuumpage --- VACUUM one page
1400 : : *
1401 : : * This processes a single page for btvacuumscan(). In some cases we must
1402 : : * backtrack to re-examine and VACUUM pages that were on buf's page during
1403 : : * a previous call here. This is how we handle page splits (that happened
1404 : : * after our cycleid was acquired) whose right half page happened to reuse
1405 : : * a block that we might have processed at some point before it was
1406 : : * recycled (i.e. before the page split).
1407 : : *
1408 : : * Returns BlockNumber of a scanned page (not backtracked).
1409 : : */
1410 : : static BlockNumber
359 melanieplageman@gmai 1411 : 17658 : btvacuumpage(BTVacState *vstate, Buffer buf)
1412 : : {
7251 tgl@sss.pgh.pa.us 1413 : 17658 : IndexVacuumInfo *info = vstate->info;
1414 : 17658 : IndexBulkDeleteResult *stats = vstate->stats;
1415 : 17658 : IndexBulkDeleteCallback callback = vstate->callback;
1416 : 17658 : void *callback_state = vstate->callback_state;
1417 : 17658 : Relation rel = info->index;
1077 pg@bowt.ie 1418 : 17658 : Relation heaprel = info->heaprel;
1419 : : bool attempt_pagedel;
1420 : : BlockNumber blkno,
1421 : : backtrack_to;
359 melanieplageman@gmai 1422 : 17658 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1423 : : Page page;
1424 : : BTPageOpaque opaque;
1425 : :
2143 pg@bowt.ie 1426 : 17658 : blkno = scanblkno;
1427 : :
1428 : 17658 : backtrack:
1429 : :
1430 : 17658 : attempt_pagedel = false;
1431 : 17658 : backtrack_to = P_NONE;
1432 : :
2063 1433 : 17658 : _bt_lockbuf(rel, buf, BT_READ);
3616 kgrittn@postgresql.o 1434 : 17658 : page = BufferGetPage(buf);
2143 pg@bowt.ie 1435 : 17658 : opaque = NULL;
7251 tgl@sss.pgh.pa.us 1436 [ + - ]: 17658 : if (!PageIsNew(page))
1437 : : {
1438 : 17658 : _bt_checkpage(rel, buf);
1444 michael@paquier.xyz 1439 : 17658 : opaque = BTPageGetOpaque(page);
1440 : : }
1441 : :
2143 pg@bowt.ie 1442 [ - + ]: 17658 : Assert(blkno <= scanblkno);
1443 [ - + ]: 17658 : if (blkno != scanblkno)
1444 : : {
1445 : : /*
1446 : : * We're backtracking.
1447 : : *
1448 : : * We followed a right link to a sibling leaf page (a page that
1449 : : * happens to be from a block located before scanblkno). The only
1450 : : * case we want to do anything with is a live leaf page having the
1451 : : * current vacuum cycle ID.
1452 : : *
1453 : : * The page had better be in a state that's consistent with what we
1454 : : * expect. Check for conditions that imply corruption in passing. It
1455 : : * can't be half-dead because only an interrupted VACUUM process can
1456 : : * leave pages in that state, so we'd definitely have dealt with it
1457 : : * back when the page was the scanblkno page (half-dead pages are
1458 : : * always marked fully deleted by _bt_pagedel(), barring corruption).
1459 : : */
2143 pg@bowt.ie 1460 [ # # # # :UBC 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
# # ]
1461 : : {
1462 : 0 : Assert(false);
1463 : : ereport(LOG,
1464 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1465 : : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1466 : : blkno, scanblkno, RelationGetRelationName(rel))));
1467 : : _bt_relbuf(rel, buf);
1468 : : return scanblkno;
1469 : : }
1470 : :
1471 : : /*
1472 : : * We may have already processed the page in an earlier call, when the
1473 : : * page was scanblkno. This happens when the leaf page split occurred
1474 : : * after the scan began, but before the right sibling page became the
1475 : : * scanblkno.
1476 : : *
1477 : : * Page may also have been deleted by current btvacuumpage() call,
1478 : : * since _bt_pagedel() sometimes deletes the right sibling page of
1479 : : * scanblkno in passing (it does so after we decided where to
1480 : : * backtrack to). We don't need to process this page as a deleted
1481 : : * page a second time now (in fact, it would be wrong to count it as a
1482 : : * deleted page in the bulk delete statistics a second time).
1483 : : */
1484 [ # # # # ]: 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1485 : : {
1486 : : /* Done with current scanblkno (and all lower split pages) */
7251 tgl@sss.pgh.pa.us 1487 : 0 : _bt_relbuf(rel, buf);
359 melanieplageman@gmai 1488 : 0 : return scanblkno;
1489 : : }
1490 : : }
1491 : :
1077 pg@bowt.ie 1492 [ + - + + ]:CBC 17658 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1493 : : {
1494 : : /* Okay to recycle this page (which could be leaf or internal) */
6375 heikki.linnakangas@i 1495 : 1593 : RecordFreeIndexPage(rel, blkno);
7251 tgl@sss.pgh.pa.us 1496 : 1593 : stats->pages_deleted++;
1845 pg@bowt.ie 1497 : 1593 : stats->pages_free++;
1498 : : }
7251 tgl@sss.pgh.pa.us 1499 [ + + ]: 16065 : else if (P_ISDELETED(opaque))
1500 : : {
1501 : : /*
1502 : : * Already deleted page (which could be leaf or internal). Can't
1503 : : * recycle yet.
1504 : : */
1505 : 149 : stats->pages_deleted++;
1506 : : }
7074 1507 [ + + ]: 15916 : else if (P_ISHALFDEAD(opaque))
1508 : : {
1509 : : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1844 pg@bowt.ie 1510 :GBC 1 : attempt_pagedel = true;
1511 : :
1512 : : /*
1513 : : * _bt_pagedel() will increment both pages_newly_deleted and
1514 : : * pages_deleted stats in all cases (barring corruption)
1515 : : */
1516 : : }
7251 tgl@sss.pgh.pa.us 1517 [ + + ]:CBC 15915 : else if (P_ISLEAF(opaque))
1518 : : {
1519 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
1520 : : int ndeletable;
1521 : : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1522 : : int nupdatable;
1523 : : OffsetNumber offnum,
1524 : : minoff,
1525 : : maxoff;
1526 : : int nhtidsdead,
1527 : : nhtidslive;
1528 : :
1529 : : /*
1530 : : * Trade in the initial read lock for a full cleanup lock on this
1531 : : * page. We must get such a lock on every leaf page over the course
1532 : : * of the vacuum scan, whether or not it actually contains any
1533 : : * deletable tuples --- see nbtree/README.
1534 : : */
2063 pg@bowt.ie 1535 : 15027 : _bt_upgradelockbufcleanup(rel, buf);
1536 : :
1537 : : /*
1538 : : * Check whether we need to backtrack to earlier pages. What we are
1539 : : * concerned about is a page split that happened since we started the
1540 : : * vacuum scan. If the split moved tuples on the right half of the
1541 : : * split (i.e. the tuples that sort high) to a block that we already
1542 : : * passed over, then we might have missed the tuples. We need to
1543 : : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1544 : : * or deleting scanblkno page below!)
1545 : : */
7251 tgl@sss.pgh.pa.us 1546 [ + + ]: 15027 : if (vstate->cycleid != 0 &&
1547 [ + + ]: 14949 : opaque->btpo_cycleid == vstate->cycleid &&
7251 tgl@sss.pgh.pa.us 1548 [ + + ]:GBC 4 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1549 [ + + ]: 3 : !P_RIGHTMOST(opaque) &&
2143 pg@bowt.ie 1550 [ - + ]: 2 : opaque->btpo_next < scanblkno)
2143 pg@bowt.ie 1551 :UBC 0 : backtrack_to = opaque->btpo_next;
1552 : :
7251 tgl@sss.pgh.pa.us 1553 :CBC 15027 : ndeletable = 0;
2209 pg@bowt.ie 1554 : 15027 : nupdatable = 0;
7251 tgl@sss.pgh.pa.us 1555 [ + + ]: 15027 : minoff = P_FIRSTDATAKEY(opaque);
1556 : 15027 : maxoff = PageGetMaxOffsetNumber(page);
2209 pg@bowt.ie 1557 : 15027 : nhtidsdead = 0;
1558 : 15027 : nhtidslive = 0;
7251 tgl@sss.pgh.pa.us 1559 [ + + ]: 15027 : if (callback)
1560 : : {
1561 : : /* btbulkdelete callback tells us what to delete (or update) */
1562 : 14949 : for (offnum = minoff;
1563 [ + + ]: 2939839 : offnum <= maxoff;
1564 : 2924890 : offnum = OffsetNumberNext(offnum))
1565 : : {
1566 : : IndexTuple itup;
1567 : :
1568 : 2924890 : itup = (IndexTuple) PageGetItem(page,
1569 : 2924890 : PageGetItemId(page, offnum));
1570 : :
2209 pg@bowt.ie 1571 [ - + ]: 2924890 : Assert(!BTreeTupleIsPivot(itup));
1572 [ + + ]: 2924890 : if (!BTreeTupleIsPosting(itup))
1573 : : {
1574 : : /* Regular tuple, standard table TID representation */
1575 [ + + ]: 2827304 : if (callback(&itup->t_tid, callback_state))
1576 : : {
1577 : 1017897 : deletable[ndeletable++] = offnum;
1578 : 1017897 : nhtidsdead++;
1579 : : }
1580 : : else
1581 : 1809407 : nhtidslive++;
1582 : : }
1583 : : else
1584 : : {
1585 : : BTVacuumPosting vacposting;
1586 : : int nremaining;
1587 : :
1588 : : /* Posting list tuple */
1589 : 97586 : vacposting = btreevacuumposting(vstate, itup, offnum,
1590 : : &nremaining);
1591 [ + + ]: 97586 : if (vacposting == NULL)
1592 : : {
1593 : : /*
1594 : : * All table TIDs from the posting tuple remain, so no
1595 : : * delete or update required
1596 : : */
1597 [ - + ]: 34875 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1598 : : }
1599 [ + + ]: 62711 : else if (nremaining > 0)
1600 : : {
1601 : :
1602 : : /*
1603 : : * Store metadata about posting list tuple in
1604 : : * updatable array for entire page. Existing tuple
1605 : : * will be updated during the later call to
1606 : : * _bt_delitems_vacuum().
1607 : : */
1608 [ - + ]: 23839 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1609 : 23839 : updatable[nupdatable++] = vacposting;
1610 : 23839 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1611 : : }
1612 : : else
1613 : : {
1614 : : /*
1615 : : * All table TIDs from the posting list must be
1616 : : * deleted. We'll delete the index tuple completely
1617 : : * (no update required).
1618 : : */
1619 [ - + ]: 38872 : Assert(nremaining == 0);
1620 : 38872 : deletable[ndeletable++] = offnum;
1621 : 38872 : nhtidsdead += BTreeTupleGetNPosting(itup);
1622 : 38872 : pfree(vacposting);
1623 : : }
1624 : :
1625 : 97586 : nhtidslive += nremaining;
1626 : : }
1627 : : }
1628 : : }
1629 : :
1630 : : /*
1631 : : * Apply any needed deletes or updates. We issue just one
1632 : : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1633 : : */
1634 [ + + + + ]: 15027 : if (ndeletable > 0 || nupdatable > 0)
1635 : : {
2143 1636 [ - + ]: 8484 : Assert(nhtidsdead >= ndeletable + nupdatable);
2209 1637 : 8484 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1638 : : nupdatable);
1639 : :
1640 : 8484 : stats->tuples_removed += nhtidsdead;
1641 : : /* must recompute maxoff */
7251 tgl@sss.pgh.pa.us 1642 : 8484 : maxoff = PageGetMaxOffsetNumber(page);
1643 : :
1644 : : /* can't leak memory here */
2209 pg@bowt.ie 1645 [ + + ]: 32323 : for (int i = 0; i < nupdatable; i++)
1646 : 23839 : pfree(updatable[i]);
1647 : : }
1648 : : else
1649 : : {
1650 : : /*
1651 : : * If the leaf page has been split during this vacuum cycle, it
1652 : : * seems worth expending a write to clear btpo_cycleid even if we
1653 : : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1654 : : * takes care of this.) This ensures we won't process the page
1655 : : * again.
1656 : : *
1657 : : * We treat this like a hint-bit update because there's no need to
1658 : : * WAL-log it.
1659 : : */
1660 [ - + ]: 6543 : Assert(nhtidsdead == 0);
7251 tgl@sss.pgh.pa.us 1661 [ + + ]: 6543 : if (vstate->cycleid != 0 &&
1662 [ + + ]: 6465 : opaque->btpo_cycleid == vstate->cycleid)
1663 : : {
7251 tgl@sss.pgh.pa.us 1664 :GBC 1 : opaque->btpo_cycleid = 0;
4654 jdavis@postgresql.or 1665 : 1 : MarkBufferDirtyHint(buf, true);
1666 : : }
1667 : : }
1668 : :
1669 : : /*
1670 : : * If the leaf page is now empty, try to delete it; else count the
1671 : : * live tuples (live table TIDs in posting lists are counted as
1672 : : * separate live tuples). We don't delete when backtracking, though,
1673 : : * since that would require teaching _bt_pagedel() about backtracking
1674 : : * (doesn't seem worth adding more complexity to deal with that).
1675 : : *
1676 : : * We don't count the number of live TIDs during cleanup-only calls to
1677 : : * btvacuumscan (i.e. when callback is not set). We count the number
1678 : : * of index tuples directly instead. This avoids the expense of
1679 : : * directly examining all of the tuples on each page. VACUUM will
1680 : : * treat num_index_tuples as an estimate in cleanup-only case, so it
1681 : : * doesn't matter that this underestimates num_index_tuples
1682 : : * significantly in some cases.
1683 : : */
7251 tgl@sss.pgh.pa.us 1684 [ + + ]:CBC 15027 : if (minoff > maxoff)
2143 pg@bowt.ie 1685 : 3166 : attempt_pagedel = (blkno == scanblkno);
1957 1686 [ + + ]: 11861 : else if (callback)
2209 1687 : 11787 : stats->num_index_tuples += nhtidslive;
1688 : : else
1957 1689 : 74 : stats->num_index_tuples += maxoff - minoff + 1;
1690 : :
2143 1691 [ + + - + ]: 15027 : Assert(!attempt_pagedel || nhtidslive == 0);
1692 : : }
1693 : :
1694 [ + + ]: 17658 : if (attempt_pagedel)
1695 : : {
1696 : : MemoryContext oldcontext;
1697 : :
1698 : : /* Run pagedel in a temp context to avoid memory leakage */
7251 tgl@sss.pgh.pa.us 1699 : 3167 : MemoryContextReset(vstate->pagedelcontext);
1700 : 3167 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1701 : :
1702 : : /*
1703 : : * _bt_pagedel maintains the bulk delete stats on our behalf;
1704 : : * pages_newly_deleted and pages_deleted are likely to be incremented
1705 : : * during call
1706 : : */
2143 pg@bowt.ie 1707 [ - + ]: 3167 : Assert(blkno == scanblkno);
1844 1708 : 3167 : _bt_pagedel(rel, buf, vstate);
1709 : :
7251 tgl@sss.pgh.pa.us 1710 : 3166 : MemoryContextSwitchTo(oldcontext);
1711 : : /* pagedel released buffer, so we shouldn't */
1712 : : }
1713 : : else
1714 : 14491 : _bt_relbuf(rel, buf);
1715 : :
2143 pg@bowt.ie 1716 [ - + ]: 17657 : if (backtrack_to != P_NONE)
1717 : : {
2143 pg@bowt.ie 1718 :UBC 0 : blkno = backtrack_to;
1719 : :
1720 : : /* check for vacuum delay while not holding any buffer lock */
359 melanieplageman@gmai 1721 : 0 : vacuum_delay_point(false);
1722 : :
1723 : : /*
1724 : : * We can't use _bt_getbuf() here because it always applies
1725 : : * _bt_checkpage(), which will barf on an all-zero page. We want to
1726 : : * recycle all-zero pages, not fail. Also, we want to use a
1727 : : * nondefault buffer access strategy.
1728 : : */
1729 : 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1730 : : info->strategy);
2143 pg@bowt.ie 1731 : 0 : goto backtrack;
1732 : : }
1733 : :
359 melanieplageman@gmai 1734 :CBC 17657 : return scanblkno;
1735 : : }
1736 : :
1737 : : /*
1738 : : * btreevacuumposting --- determine TIDs still needed in posting list
1739 : : *
1740 : : * Returns metadata describing how to build replacement tuple without the TIDs
1741 : : * that VACUUM needs to delete. Returned value is NULL in the common case
1742 : : * where no changes are needed to caller's posting list tuple (we avoid
1743 : : * allocating memory here as an optimization).
1744 : : *
1745 : : * The number of TIDs that should remain in the posting list tuple is set for
1746 : : * caller in *nremaining.
1747 : : */
1748 : : static BTVacuumPosting
2209 pg@bowt.ie 1749 : 97586 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1750 : : OffsetNumber updatedoffset, int *nremaining)
1751 : : {
1752 : 97586 : int live = 0;
1753 : 97586 : int nitem = BTreeTupleGetNPosting(posting);
1754 : 97586 : ItemPointer items = BTreeTupleGetPosting(posting);
1755 : 97586 : BTVacuumPosting vacposting = NULL;
1756 : :
1757 [ + + ]: 526197 : for (int i = 0; i < nitem; i++)
1758 : : {
1759 [ + + ]: 428611 : if (!vstate->callback(items + i, vstate->callback_state))
1760 : : {
1761 : : /* Live table TID */
1762 : 221916 : live++;
1763 : : }
1764 [ + + ]: 206695 : else if (vacposting == NULL)
1765 : : {
1766 : : /*
1767 : : * First dead table TID encountered.
1768 : : *
1769 : : * It's now clear that we need to delete one or more dead table
1770 : : * TIDs, so start maintaining metadata describing how to update
1771 : : * existing posting list tuple.
1772 : : */
1773 : 62711 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1774 : : nitem * sizeof(uint16));
1775 : :
1776 : 62711 : vacposting->itup = posting;
1777 : 62711 : vacposting->updatedoffset = updatedoffset;
1778 : 62711 : vacposting->ndeletedtids = 0;
1779 : 62711 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1780 : : }
1781 : : else
1782 : : {
1783 : : /* Second or subsequent dead table TID */
1784 : 143984 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1785 : : }
1786 : : }
1787 : :
1788 : 97586 : *nremaining = live;
1789 : 97586 : return vacposting;
1790 : : }
1791 : :
1792 : : /*
1793 : : * btcanreturn() -- Check whether btree indexes support index-only scans.
1794 : : *
1795 : : * btrees always do, so this is trivial.
1796 : : */
1797 : : bool
3710 tgl@sss.pgh.pa.us 1798 : 640643 : btcanreturn(Relation index, int attno)
1799 : : {
1800 : 640643 : return true;
1801 : : }
1802 : :
1803 : : /*
1804 : : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1805 : : */
1806 : : int
551 peter@eisentraut.org 1807 : 410925 : btgettreeheight(Relation rel)
1808 : : {
1809 : 410925 : return _bt_getrootheight(rel);
1810 : : }
1811 : :
1812 : : CompareType
387 peter@eisentraut.org 1813 :UBC 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1814 : : {
406 1815 [ # # # # : 0 : switch (strategy)
# # ]
1816 : : {
1817 : 0 : case BTLessStrategyNumber:
1818 : 0 : return COMPARE_LT;
1819 : 0 : case BTLessEqualStrategyNumber:
1820 : 0 : return COMPARE_LE;
1821 : 0 : case BTEqualStrategyNumber:
1822 : 0 : return COMPARE_EQ;
1823 : 0 : case BTGreaterEqualStrategyNumber:
1824 : 0 : return COMPARE_GE;
1825 : 0 : case BTGreaterStrategyNumber:
1826 : 0 : return COMPARE_GT;
1827 : 0 : default:
1828 : 0 : return COMPARE_INVALID;
1829 : : }
1830 : : }
1831 : :
1832 : : StrategyNumber
387 1833 : 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1834 : : {
406 1835 [ # # # # : 0 : switch (cmptype)
# # ]
1836 : : {
1837 : 0 : case COMPARE_LT:
1838 : 0 : return BTLessStrategyNumber;
1839 : 0 : case COMPARE_LE:
1840 : 0 : return BTLessEqualStrategyNumber;
1841 : 0 : case COMPARE_EQ:
1842 : 0 : return BTEqualStrategyNumber;
1843 : 0 : case COMPARE_GE:
1844 : 0 : return BTGreaterEqualStrategyNumber;
1845 : 0 : case COMPARE_GT:
1846 : 0 : return BTGreaterStrategyNumber;
1847 : 0 : default:
1848 : 0 : return InvalidStrategy;
1849 : : }
1850 : : }
|