Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtree.c
4 : : * Implementation of Lehman and Yao's btree management algorithm for
5 : : * Postgres.
6 : : *
7 : : * NOTES
8 : : * This file contains only the public interface routines.
9 : : *
10 : : *
11 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 : : * Portions Copyright (c) 1994, Regents of the University of California
13 : : *
14 : : * IDENTIFICATION
15 : : * src/backend/access/nbtree/nbtree.c
16 : : *
17 : : *-------------------------------------------------------------------------
18 : : */
19 : : #include "postgres.h"
20 : :
21 : : #include "access/nbtree.h"
22 : : #include "access/relscan.h"
23 : : #include "access/stratnum.h"
24 : : #include "commands/progress.h"
25 : : #include "commands/vacuum.h"
26 : : #include "nodes/execnodes.h"
27 : : #include "pgstat.h"
28 : : #include "storage/bulk_write.h"
29 : : #include "storage/condition_variable.h"
30 : : #include "storage/indexfsm.h"
31 : : #include "storage/ipc.h"
32 : : #include "storage/lmgr.h"
33 : : #include "storage/read_stream.h"
34 : : #include "utils/datum.h"
35 : : #include "utils/fmgrprotos.h"
36 : : #include "utils/index_selfuncs.h"
37 : : #include "utils/memutils.h"
38 : :
39 : :
40 : : /*
41 : : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 : : *
43 : : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 : : * scan to advance it via another call to _bt_first.
45 : : *
46 : : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 : : * a new page; others must wait.
48 : : *
49 : : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 : : * to a new page; some process can start doing that.
51 : : *
52 : : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 : : */
54 : : typedef enum
55 : : {
56 : : BTPARALLEL_NOT_INITIALIZED,
57 : : BTPARALLEL_NEED_PRIMSCAN,
58 : : BTPARALLEL_ADVANCING,
59 : : BTPARALLEL_IDLE,
60 : : BTPARALLEL_DONE,
61 : : } BTPS_State;
62 : :
63 : : /*
 64 : : * BTParallelScanDescData contains the btree-specific shared information
 65 : : * required for a parallel scan.
66 : : */
67 : : typedef struct BTParallelScanDescData
68 : : {
69 : : BlockNumber btps_nextScanPage; /* next page to be scanned */
70 : : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
71 : : * btps_nextScanPage */
 72 : : BTPS_State btps_pageStatus; /* indicates whether the next page is
 73 : : * available for scanning. See above for
 74 : : * the possible parallel scan states. */
75 : : LWLock btps_lock; /* protects shared parallel state */
76 : : ConditionVariable btps_cv; /* used to synchronize parallel scan */
77 : :
78 : : /*
79 : : * btps_arrElems is used when scans need to schedule another primitive
80 : : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
81 : : * offsets for each = scan key associated with a ScalarArrayOp array.
82 : : */
83 : : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
84 : :
85 : : /*
86 : : * Additional space (at the end of the struct) is used when scans need to
87 : : * schedule another primitive index scan with one or more skip arrays.
88 : : * Holds a flattened datum representation for each = scan key associated
89 : : * with a skip array.
90 : : */
91 : : } BTParallelScanDescData;
92 : :
93 : : typedef struct BTParallelScanDescData *BTParallelScanDesc;
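/*
 * Illustrative sketch (not part of nbtree.c): how the shared space described
 * by the two comments inside the struct above is laid out.  The flattened
 * skip-array datums live immediately after the btps_arrElems[] flexible
 * array, which is how _bt_parallel_serialize_arrays and
 * _bt_parallel_restore_arrays (below) locate them:
 *
 *   BTParallelScanDescData             fixed fields, lock, condition variable
 *   int btps_arrElems[numArrayKeys]    cur_elem offsets for SAOP arrays
 *   <flattened datums>                 sk_flags plus serialized sk_argument
 *                                      for each = scan key with a skip array
 *
 *   char *datumshared = (char *) &btscan->btps_arrElems[so->numArrayKeys];
 */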
94 : :
95 : :
96 : : static bool _bt_start_prim_scan(IndexScanDesc scan);
97 : : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
98 : : BTScanOpaque so);
99 : : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
100 : : BTScanOpaque so);
101 : : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
102 : : IndexBulkDeleteCallback callback, void *callback_state,
103 : : BTCycleId cycleid);
104 : : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
105 : : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
106 : : IndexTuple posting,
107 : : OffsetNumber updatedoffset,
108 : : int *nremaining);
109 : :
110 : :
111 : : /*
112 : : * Btree handler function: return IndexAmRoutine with access method parameters
113 : : * and callbacks.
114 : : */
115 : : Datum
3621 tgl@sss.pgh.pa.us 116 :CBC 1452068 : bthandler(PG_FUNCTION_ARGS)
117 : : {
118 : 1452068 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
119 : :
3519 teodor@sigaev.ru 120 : 1452068 : amroutine->amstrategies = BTMaxStrategyNumber;
121 : 1452068 : amroutine->amsupport = BTNProcs;
2087 akorotkov@postgresql 122 : 1452068 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
3621 tgl@sss.pgh.pa.us 123 : 1452068 : amroutine->amcanorder = true;
124 : 1452068 : amroutine->amcanorderbyop = false;
292 peter@eisentraut.org 125 : 1452068 : amroutine->amcanhash = false;
284 126 : 1452068 : amroutine->amconsistentequality = true;
127 : 1452068 : amroutine->amconsistentordering = true;
3621 tgl@sss.pgh.pa.us 128 : 1452068 : amroutine->amcanbackward = true;
129 : 1452068 : amroutine->amcanunique = true;
130 : 1452068 : amroutine->amcanmulticol = true;
131 : 1452068 : amroutine->amoptionalkey = true;
132 : 1452068 : amroutine->amsearcharray = true;
133 : 1452068 : amroutine->amsearchnulls = true;
134 : 1452068 : amroutine->amstorage = false;
135 : 1452068 : amroutine->amclusterable = true;
136 : 1452068 : amroutine->ampredlocks = true;
3226 rhaas@postgresql.org 137 : 1452068 : amroutine->amcanparallel = true;
739 tomas.vondra@postgre 138 : 1452068 : amroutine->amcanbuildparallel = true;
2810 teodor@sigaev.ru 139 : 1452068 : amroutine->amcaninclude = true;
2162 akapila@postgresql.o 140 : 1452068 : amroutine->amusemaintenanceworkmem = false;
1002 tomas.vondra@postgre 141 : 1452068 : amroutine->amsummarizing = false;
2162 akapila@postgresql.o 142 : 1452068 : amroutine->amparallelvacuumoptions =
143 : : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
3621 tgl@sss.pgh.pa.us 144 : 1452068 : amroutine->amkeytype = InvalidOid;
145 : :
146 : 1452068 : amroutine->ambuild = btbuild;
147 : 1452068 : amroutine->ambuildempty = btbuildempty;
148 : 1452068 : amroutine->aminsert = btinsert;
752 tomas.vondra@postgre 149 : 1452068 : amroutine->aminsertcleanup = NULL;
3621 tgl@sss.pgh.pa.us 150 : 1452068 : amroutine->ambulkdelete = btbulkdelete;
151 : 1452068 : amroutine->amvacuumcleanup = btvacuumcleanup;
152 : 1452068 : amroutine->amcanreturn = btcanreturn;
153 : 1452068 : amroutine->amcostestimate = btcostestimate;
462 peter@eisentraut.org 154 : 1452068 : amroutine->amgettreeheight = btgettreeheight;
3621 tgl@sss.pgh.pa.us 155 : 1452068 : amroutine->amoptions = btoptions;
3412 156 : 1452068 : amroutine->amproperty = btproperty;
2450 alvherre@alvh.no-ip. 157 : 1452068 : amroutine->ambuildphasename = btbuildphasename;
3621 tgl@sss.pgh.pa.us 158 : 1452068 : amroutine->amvalidate = btvalidate;
1963 159 : 1452068 : amroutine->amadjustmembers = btadjustmembers;
3621 160 : 1452068 : amroutine->ambeginscan = btbeginscan;
161 : 1452068 : amroutine->amrescan = btrescan;
162 : 1452068 : amroutine->amgettuple = btgettuple;
163 : 1452068 : amroutine->amgetbitmap = btgetbitmap;
164 : 1452068 : amroutine->amendscan = btendscan;
165 : 1452068 : amroutine->ammarkpos = btmarkpos;
166 : 1452068 : amroutine->amrestrpos = btrestrpos;
3226 rhaas@postgresql.org 167 : 1452068 : amroutine->amestimateparallelscan = btestimateparallelscan;
168 : 1452068 : amroutine->aminitparallelscan = btinitparallelscan;
169 : 1452068 : amroutine->amparallelrescan = btparallelrescan;
317 peter@eisentraut.org 170 : 1452068 : amroutine->amtranslatestrategy = bttranslatestrategy;
171 : 1452068 : amroutine->amtranslatecmptype = bttranslatecmptype;
172 : :
3621 tgl@sss.pgh.pa.us 173 : 1452068 : PG_RETURN_POINTER(amroutine);
174 : : }
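/*
 * Illustrative sketch (not part of nbtree.c): bthandler is not called
 * directly by index code.  It is registered as pg_am.amhandler for btree,
 * and core code obtains the struct through the generic amapi machinery,
 * roughly:
 *
 *   IndexAmRoutine *amroutine = GetIndexAmRoutine(amhandler_oid);
 *
 * after which the callbacks assigned above (btbuild, btinsert, btgettuple,
 * and so on) are only ever reached through this struct.
 */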
175 : :
176 : : /*
177 : : * btbuildempty() -- build an empty btree index in the initialization fork
178 : : */
179 : : void
180 : 80 : btbuildempty(Relation index)
181 : : {
846 heikki.linnakangas@i 182 : 80 : bool allequalimage = _bt_allequalimage(index, false);
183 : : BulkWriteState *bulkstate;
184 : : BulkWriteBuffer metabuf;
185 : :
662 186 : 80 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
187 : :
188 : : /* Construct metapage. */
189 : 80 : metabuf = smgr_bulk_get_buf(bulkstate);
190 : 80 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
191 : 80 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
192 : :
193 : 80 : smgr_bulk_finish(bulkstate);
5466 rhaas@postgresql.org 194 : 80 : }
195 : :
196 : : /*
197 : : * btinsert() -- insert an index tuple into a btree.
198 : : *
199 : : * Descend the tree recursively, find the appropriate location for our
200 : : * new tuple, and put it there.
201 : : */
202 : : bool
3621 tgl@sss.pgh.pa.us 203 : 3757192 : btinsert(Relation rel, Datum *values, bool *isnull,
204 : : ItemPointer ht_ctid, Relation heapRel,
205 : : IndexUniqueCheck checkUnique,
206 : : bool indexUnchanged,
207 : : IndexInfo *indexInfo)
208 : : {
209 : : bool result;
210 : : IndexTuple itup;
211 : :
212 : : /* generate an index tuple */
7575 213 : 3757192 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
10327 bruce@momjian.us 214 : 3757192 : itup->t_tid = *ht_ctid;
215 : :
1798 pg@bowt.ie 216 : 3757192 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
217 : :
10327 bruce@momjian.us 218 : 3756926 : pfree(itup);
219 : :
3621 tgl@sss.pgh.pa.us 220 : 3756926 : return result;
221 : : }
222 : :
223 : : /*
224 : : * btgettuple() -- Get the next tuple in the scan.
225 : : */
226 : : bool
227 : 16846424 : btgettuple(IndexScanDesc scan, ScanDirection dir)
228 : : {
8607 229 : 16846424 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
230 : : bool res;
231 : :
193 pg@bowt.ie 232 [ - + ]: 16846424 : Assert(scan->heapRelation != NULL);
233 : :
234 : : /* btree indexes are never lossy */
6456 tgl@sss.pgh.pa.us 235 : 16846424 : scan->xs_recheck = false;
236 : :
237 : : /* Each loop iteration performs another primitive index scan */
238 : : do
239 : : {
240 : : /*
241 : : * If we've already initialized this scan, we can just advance it in
242 : : * the appropriate direction. If we haven't done so yet, we call
243 : : * _bt_first() to get the first item in the scan.
244 : : */
5175 245 [ + + - + : 16854923 : if (!BTScanPosIsValid(so->currPos))
+ + ]
246 : 7353894 : res = _bt_first(scan, dir);
247 : : else
248 : : {
249 : : /*
250 : : * Check to see if we should kill the previously-fetched tuple.
251 : : */
252 [ + + ]: 9501029 : if (scan->kill_prior_tuple)
253 : : {
254 : : /*
255 : : * Yes, remember it for later. (We'll deal with all such
256 : : * tuples at once right before leaving the index page.) The
257 : : * test for numKilled overrun is not just paranoia: if the
258 : : * caller reverses direction in the indexscan then the same
259 : : * item might get entered multiple times. It's not worth
260 : : * trying to optimize that, so we don't detect it, but instead
261 : : * just forget any excess entries.
262 : : */
263 [ + + ]: 203215 : if (so->killedItems == NULL)
6 michael@paquier.xyz 264 :GNC 88701 : so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
2120 pg@bowt.ie 265 [ + - ]:CBC 203215 : if (so->numKilled < MaxTIDsPerBTreePage)
5175 tgl@sss.pgh.pa.us 266 : 203215 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267 : : }
268 : :
269 : : /*
270 : : * Now continue the scan.
271 : : */
272 : 9501029 : res = _bt_next(scan, dir);
273 : : }
274 : :
275 : : /* If we have a tuple, return it ... */
276 [ + + ]: 16854923 : if (res)
277 : 13315606 : break;
278 : : /* ... otherwise see if we need another primitive index scan */
8 pg@bowt.ie 279 [ + + + + ]:GNC 3539317 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
280 : :
3621 tgl@sss.pgh.pa.us 281 :CBC 16846424 : return res;
282 : : }
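/*
 * Illustrative sketch (not part of nbtree.c): btgettuple is reached through
 * the amgettuple pointer installed by bthandler().  A consumer loop, as
 * driven by access/index/indexam.c, looks roughly like this (heap access
 * and error handling omitted):
 */
static void
example_drive_index_scan(IndexScanDesc scan)
{
    ItemPointer tid;

    /* index_getnext_tid() calls amgettuple (here, btgettuple) internally */
    while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
    {
        /* fetch the heap tuple for *tid; may set scan->kill_prior_tuple */
    }
}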
283 : :
284 : : /*
285 : : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
286 : : */
287 : : int64
288 : 8909 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
289 : : {
7569 290 : 8909 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
6459 291 : 8909 : int64 ntids = 0;
292 : : ItemPointer heapTid;
293 : :
193 pg@bowt.ie 294 [ - + ]: 8909 : Assert(scan->heapRelation == NULL);
295 : :
296 : : /* Each loop iteration performs another primitive index scan */
297 : : do
298 : : {
299 : : /* Fetch the first page & tuple */
5175 tgl@sss.pgh.pa.us 300 [ + + ]: 9201 : if (_bt_first(scan, ForwardScanDirection))
301 : : {
302 : : /* Save tuple ID, and continue scanning */
2472 andres@anarazel.de 303 : 7642 : heapTid = &scan->xs_heaptid;
5175 tgl@sss.pgh.pa.us 304 : 7642 : tbm_add_tuples(tbm, heapTid, 1, false);
305 : 7642 : ntids++;
306 : :
307 : : for (;;)
308 : : {
309 : : /*
310 : : * Advance to next tuple within page. This is the same as the
311 : : * easy case in _bt_next().
312 : : */
313 [ + + ]: 1015876 : if (++so->currPos.itemIndex > so->currPos.lastItem)
314 : : {
315 : : /* let _bt_next do the heavy lifting */
316 [ + + ]: 10290 : if (!_bt_next(scan, ForwardScanDirection))
317 : 7642 : break;
318 : : }
319 : :
320 : : /* Save tuple ID, and continue scanning */
321 : 1008234 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
322 : 1008234 : tbm_add_tuples(tbm, heapTid, 1, false);
323 : 1008234 : ntids++;
324 : : }
325 : : }
326 : : /* Now see if we need another primitive index scan */
8 pg@bowt.ie 327 [ + + + + ]:GNC 9201 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
328 : :
3621 tgl@sss.pgh.pa.us 329 :CBC 8909 : return ntids;
330 : : }
331 : :
332 : : /*
333 : : * btbeginscan() -- start a scan on a btree index
334 : : */
335 : : IndexScanDesc
336 : 7037939 : btbeginscan(Relation rel, int nkeys, int norderbys)
337 : : {
338 : : IndexScanDesc scan;
339 : : BTScanOpaque so;
340 : :
341 : : /* no order by operators allowed */
5493 342 [ - + ]: 7037939 : Assert(norderbys == 0);
343 : :
344 : : /* get the scan */
345 : 7037939 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
346 : :
347 : : /* allocate private workspace */
6 michael@paquier.xyz 348 :GNC 7037939 : so = palloc_object(BTScanOpaqueData);
3919 kgrittn@postgresql.o 349 :CBC 7037939 : BTScanPosInvalidate(so->currPos);
350 : 7037939 : BTScanPosInvalidate(so->markPos);
5493 tgl@sss.pgh.pa.us 351 [ + + ]: 7037939 : if (scan->numberOfKeys > 0)
352 : 7031438 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
353 : : else
354 : 6501 : so->keyData = NULL;
355 : :
256 pg@bowt.ie 356 : 7037939 : so->skipScan = false;
619 357 : 7037939 : so->needPrimScan = false;
358 : 7037939 : so->scanBehind = false;
426 359 : 7037939 : so->oppositeDirCheck = false;
5175 tgl@sss.pgh.pa.us 360 : 7037939 : so->arrayKeys = NULL;
619 pg@bowt.ie 361 : 7037939 : so->orderProcs = NULL;
5175 tgl@sss.pgh.pa.us 362 : 7037939 : so->arrayContext = NULL;
363 : :
5493 364 : 7037939 : so->killedItems = NULL; /* until needed */
365 : 7037939 : so->numKilled = 0;
366 : :
367 : : /*
368 : : * We don't know yet whether the scan will be index-only, so we do not
369 : : * allocate the tuple workspace arrays until btrescan. However, we set up
370 : : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
371 : : */
5182 372 : 7037939 : so->currTuples = so->markTuples = NULL;
373 : :
5175 374 : 7037939 : scan->xs_itupdesc = RelationGetDescr(rel);
375 : :
5493 376 : 7037939 : scan->opaque = so;
377 : :
3621 378 : 7037939 : return scan;
379 : : }
380 : :
381 : : /*
382 : : * btrescan() -- rescan an index relation
383 : : */
384 : : void
385 : 7354977 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
386 : : ScanKey orderbys, int norderbys)
387 : : {
5493 388 : 7354977 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
389 : :
390 : : /* we aren't holding any read locks, but gotta drop the pins */
7163 391 [ + + - + : 7354977 : if (BTScanPosIsValid(so->currPos))
+ + ]
392 : : {
393 : : /* Before leaving current page, deal with any killed items */
394 [ + + ]: 40357 : if (so->numKilled > 0)
3919 kgrittn@postgresql.o 395 : 453 : _bt_killitems(scan);
396 [ - + - - : 40357 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
397 : 40357 : BTScanPosInvalidate(so->currPos);
398 : : }
399 : :
400 : : /*
401 : : * We prefer to eagerly drop leaf page pins before btgettuple returns.
402 : : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
403 : : *
404 : : * We cannot safely drop leaf page pins during index-only scans due to a
405 : : * race condition involving VACUUM setting pages all-visible in the VM.
406 : : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
407 : : *
408 : : * When we drop pins eagerly, the mechanism that marks so->killedItems[]
409 : : * index tuples LP_DEAD has to deal with concurrent TID recycling races.
410 : : * The scheme used to detect unsafe TID recycling won't work when scanning
411 : : * unlogged relations (since it involves saving an affected page's LSN).
412 : : * Opt out of eager pin dropping during unlogged relation scans for now
413 : : * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
414 : : *
415 : : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
416 : : * Pins cannot be held for more than an instant during bitmap scans either
417 : : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
418 : : *
419 : : * See nbtree/README section on making concurrent TID recycling safe.
420 : : *
421 : : * Note: so->dropPin should never change across rescans.
422 : : */
193 pg@bowt.ie 423 : 21884740 : so->dropPin = (!scan->xs_want_itup &&
424 [ + + + + ]: 7174786 : IsMVCCSnapshot(scan->xs_snapshot) &&
425 [ + + + + : 21072161 : RelationNeedsWAL(scan->indexRelation) &&
+ + + + +
+ ]
426 [ + + ]: 6542398 : scan->heapRelation != NULL);
427 : :
7054 tgl@sss.pgh.pa.us 428 : 7354977 : so->markItemIndex = -1;
619 pg@bowt.ie 429 : 7354977 : so->needPrimScan = false;
430 : 7354977 : so->scanBehind = false;
426 431 : 7354977 : so->oppositeDirCheck = false;
3919 kgrittn@postgresql.o 432 [ + - - + : 7354977 : BTScanPosUnpinIfPinned(so->markPos);
- + ]
433 : 7354977 : BTScanPosInvalidate(so->markPos);
434 : :
435 : : /*
436 : : * Allocate tuple workspace arrays, if needed for an index-only scan and
437 : : * not already done in a previous rescan call. To save on palloc
438 : : * overhead, both workspaces are allocated as one palloc block; only this
439 : : * function and btendscan know that.
440 : : *
441 : : * NOTE: this data structure also makes it safe to return data from a
442 : : * "name" column, even though btree name_ops uses an underlying storage
443 : : * datatype of cstring. The risk there is that "name" is supposed to be
444 : : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
445 : : * However, since we only return data out of tuples sitting in the
446 : : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
447 : : * data out of the markTuples array --- running off the end of memory for
448 : : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
449 : : * adding special-case treatment for name_ops elsewhere.
450 : : */
5182 tgl@sss.pgh.pa.us 451 [ + + + + ]: 7354977 : if (scan->xs_want_itup && so->currTuples == NULL)
452 : : {
453 : 70428 : so->currTuples = (char *) palloc(BLCKSZ * 2);
454 : 70428 : so->markTuples = so->currTuples + BLCKSZ;
455 : : }
456 : :
457 : : /*
458 : : * Reset the scan keys
459 : : */
8304 460 [ + + + + ]: 7354977 : if (scankey && scan->numberOfKeys > 0)
461 peter@eisentraut.org 461 : 7348361 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
8070 tgl@sss.pgh.pa.us 462 : 7354977 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
619 pg@bowt.ie 463 : 7354977 : so->numArrayKeys = 0; /* ditto */
10752 scrappy@hub.org 464 : 7354977 : }
465 : :
466 : : /*
467 : : * btendscan() -- close down a scan
468 : : */
469 : : void
3621 tgl@sss.pgh.pa.us 470 : 7037129 : btendscan(IndexScanDesc scan)
471 : : {
7163 472 : 7037129 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
473 : :
474 : : /* we aren't holding any read locks, but gotta drop the pins */
475 [ + + - + : 7037129 : if (BTScanPosIsValid(so->currPos))
+ + ]
476 : : {
477 : : /* Before leaving current page, deal with any killed items */
478 [ + + ]: 3773641 : if (so->numKilled > 0)
3919 kgrittn@postgresql.o 479 : 46404 : _bt_killitems(scan);
480 [ - + - - : 3773641 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
481 : : }
482 : :
7054 tgl@sss.pgh.pa.us 483 : 7037129 : so->markItemIndex = -1;
3919 kgrittn@postgresql.o 484 [ + + - + : 7037129 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
485 : :
486 : : /* No need to invalidate positions, the RAM is about to be freed. */
487 : :
488 : : /* Release storage */
8014 neilc@samurai.com 489 [ + + ]: 7037129 : if (so->keyData != NULL)
10327 bruce@momjian.us 490 : 7030643 : pfree(so->keyData);
491 : : /* so->arrayKeys and so->orderProcs are in arrayContext */
5175 tgl@sss.pgh.pa.us 492 [ + + ]: 7037129 : if (so->arrayContext != NULL)
493 : 2468 : MemoryContextDelete(so->arrayContext);
494 [ + + ]: 7037129 : if (so->killedItems != NULL)
495 : 88671 : pfree(so->killedItems);
5182 496 [ + + ]: 7037129 : if (so->currTuples != NULL)
497 : 70406 : pfree(so->currTuples);
498 : : /* so->markTuples should not be pfree'd, see btrescan */
10327 bruce@momjian.us 499 : 7037129 : pfree(so);
10752 scrappy@hub.org 500 : 7037129 : }
501 : :
502 : : /*
503 : : * btmarkpos() -- save current scan position
504 : : */
505 : : void
3621 tgl@sss.pgh.pa.us 506 : 65056 : btmarkpos(IndexScanDesc scan)
507 : : {
7163 508 : 65056 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
509 : :
510 : : /* There may be an old mark with a pin (but no lock). */
3919 kgrittn@postgresql.o 511 [ + + - + : 65056 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
512 : :
513 : : /*
514 : : * Just record the current itemIndex. If we later step to next page
515 : : * before releasing the marked position, _bt_steppage makes a full copy of
516 : : * the currPos struct in markPos. If (as often happens) the mark is moved
517 : : * before we leave the page, we don't have to do that work.
518 : : */
7163 tgl@sss.pgh.pa.us 519 [ - + - - : 65056 : if (BTScanPosIsValid(so->currPos))
+ - ]
7054 520 : 65056 : so->markItemIndex = so->currPos.itemIndex;
521 : : else
522 : : {
3919 kgrittn@postgresql.o 523 :UBC 0 : BTScanPosInvalidate(so->markPos);
7054 tgl@sss.pgh.pa.us 524 : 0 : so->markItemIndex = -1;
525 : : }
10752 scrappy@hub.org 526 :CBC 65056 : }
527 : :
528 : : /*
529 : : * btrestrpos() -- restore scan to last saved position
530 : : */
531 : : void
3621 tgl@sss.pgh.pa.us 532 : 27015 : btrestrpos(IndexScanDesc scan)
533 : : {
7163 534 : 27015 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
535 : :
7054 536 [ + + ]: 27015 : if (so->markItemIndex >= 0)
537 : : {
538 : : /*
539 : : * The scan has never moved to a new page since the last mark. Just
540 : : * restore the itemIndex.
541 : : *
542 : : * NB: In this case we can't count on anything in so->markPos to be
543 : : * accurate.
544 : : */
545 : 26961 : so->currPos.itemIndex = so->markItemIndex;
546 : : }
547 : : else
548 : : {
549 : : /*
550 : : * The scan moved to a new page after last mark or restore, and we are
551 : : * now restoring to the marked page. We aren't holding any read
552 : : * locks, but if we're still holding the pin for the current position,
553 : : * we must drop it.
554 : : */
555 [ - + - - : 54 : if (BTScanPosIsValid(so->currPos))
+ - ]
556 : : {
557 : : /* Before leaving current page, deal with any killed items */
3919 kgrittn@postgresql.o 558 [ - + ]: 54 : if (so->numKilled > 0)
3919 kgrittn@postgresql.o 559 :UBC 0 : _bt_killitems(scan);
3919 kgrittn@postgresql.o 560 [ - + - - :CBC 54 : BTScanPosUnpinIfPinned(so->currPos);
- + ]
561 : : }
562 : :
7054 tgl@sss.pgh.pa.us 563 [ - + - - : 54 : if (BTScanPosIsValid(so->markPos))
+ - ]
564 : : {
565 : : /* bump pin on mark buffer for assignment to current buffer */
3919 kgrittn@postgresql.o 566 [ - + - - : 54 : if (BTScanPosIsPinned(so->markPos))
- + ]
3919 kgrittn@postgresql.o 567 :UBC 0 : IncrBufferRefCount(so->markPos.buf);
7054 tgl@sss.pgh.pa.us 568 :CBC 54 : memcpy(&so->currPos, &so->markPos,
569 : : offsetof(BTScanPosData, items[1]) +
570 : 54 : so->markPos.lastItem * sizeof(BTScanPosItem));
5182 571 [ - + ]: 54 : if (so->currTuples)
5182 tgl@sss.pgh.pa.us 572 :UBC 0 : memcpy(so->currTuples, so->markTuples,
573 : 0 : so->markPos.nextTupleOffset);
574 : : /* Reset the scan's array keys (see _bt_steppage for why) */
619 pg@bowt.ie 575 [ - + ]:CBC 54 : if (so->numArrayKeys)
576 : : {
619 pg@bowt.ie 577 :UBC 0 : _bt_start_array_keys(scan, so->currPos.dir);
578 : 0 : so->needPrimScan = false;
579 : : }
580 : : }
581 : : else
3919 kgrittn@postgresql.o 582 : 0 : BTScanPosInvalidate(so->currPos);
583 : : }
10752 scrappy@hub.org 584 :CBC 27015 : }
585 : :
586 : : /*
587 : : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
588 : : */
589 : : Size
256 pg@bowt.ie 590 : 32 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
591 : : {
592 : 32 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
593 : : Size estnbtreeshared,
594 : : genericattrspace;
595 : :
596 : : /*
597 : : * Pessimistically assume that every input scan key will be output with
598 : : * its own SAOP array
599 : : */
600 : 32 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
601 : : sizeof(int) * nkeys;
602 : :
603 : : /* Single column indexes cannot possibly use a skip array */
604 [ + + ]: 32 : if (nkeyatts == 1)
605 : 23 : return estnbtreeshared;
606 : :
607 : : /*
608 : : * Pessimistically assume that all attributes prior to the least
609 : : * significant attribute require a skip array (and an associated key)
610 : : */
611 : 9 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
612 : : sizeof(Datum));
613 [ + + ]: 18 : for (int attnum = 1; attnum < nkeyatts; attnum++)
614 : : {
615 : : CompactAttribute *attr;
616 : :
617 : : /*
618 : : * We make the conservative assumption that every index column will
619 : : * also require a skip array.
620 : : *
621 : : * Every skip array must have space to store its scan key's sk_flags.
622 : : */
623 : 9 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
624 : :
625 : : /* Consider space required to store a datum of opclass input type */
626 : 9 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
627 [ + - ]: 9 : if (attr->attbyval)
628 : 9 : {
629 : : /* This index attribute stores pass-by-value datums */
630 : 9 : Size estfixed = datumEstimateSpace((Datum) 0, false,
631 : 9 : true, attr->attlen);
632 : :
633 : 9 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
634 : 9 : continue;
635 : : }
636 : :
637 : : /*
638 : : * This index attribute stores pass-by-reference datums.
639 : : *
640 : : * Assume that serializing this array will use just as much space as a
641 : : * pass-by-value datum, in addition to space for the largest possible
642 : : * whole index tuple (this is not just a per-datum portion of the
643 : : * largest possible tuple because that'd be almost as large anyway).
644 : : *
645 : : * This is quite conservative, but it's not clear how we could do much
646 : : * better. The executor requires an up-front storage request size
647 : : * that reliably covers the scan's high watermark memory usage. We
648 : : * can't be sure of the real high watermark until the scan is over.
649 : : */
256 pg@bowt.ie 650 :UBC 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
651 : 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
652 : : }
653 : :
256 pg@bowt.ie 654 :CBC 9 : return estnbtreeshared;
655 : : }
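/*
 * Illustrative summary (not part of nbtree.c) of the pessimistic estimate
 * computed above:
 *
 *   estimate  = offsetof(BTParallelScanDescData, btps_arrElems)
 *             + nkeys * sizeof(int)          -- SAOP cur_elem slots
 *
 * plus, for multi-column indexes, for every attribute other than the last:
 *
 *   estimate += sizeof(int)                  -- skip array's sk_flags
 *             + per-datum space              -- fixed size when the attribute
 *                                               is pass-by-value; otherwise
 *                                               genericattrspace + BTMaxItemSize
 */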
656 : :
657 : : /*
658 : : * _bt_start_prim_scan() -- start scheduled primitive index scan?
659 : : *
660 : : * Returns true if _bt_checkkeys scheduled another primitive index scan, just
661 : : * as the last one ended. Otherwise returns false, indicating that the array
662 : : * keys are now fully exhausted.
663 : : *
 664 : : * Only call here during scans with one or more equality-type array scan keys,
 665 : : * after _bt_first or _bt_next returns false.
666 : : */
667 : : static bool
8 pg@bowt.ie 668 :GNC 44474 : _bt_start_prim_scan(IndexScanDesc scan)
669 : : {
670 : 44474 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
671 : :
672 [ - + ]: 44474 : Assert(so->numArrayKeys);
673 : :
674 : 44474 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
675 : :
676 : : /*
677 : : * Array keys are advanced within _bt_checkkeys when the scan reaches the
678 : : * leaf level (more precisely, they're advanced when the scan reaches the
679 : : * end of each distinct set of array elements). This process avoids
680 : : * repeat access to leaf pages (across multiple primitive index scans) by
681 : : * advancing the scan's array keys when it allows the primitive index scan
682 : : * to find nearby matching tuples (or when it eliminates ranges of array
683 : : * key space that can't possibly be satisfied by any index tuple).
684 : : *
685 : : * _bt_checkkeys sets a simple flag variable to schedule another primitive
686 : : * index scan. The flag tells us what to do.
687 : : *
688 : : * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
689 : : * various cases where that won't happen. For example, if the index is
690 : : * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
691 : : * We also don't expect a call to _bt_checkkeys during searches for a
692 : : * non-existent value that happens to be lower/higher than any existing
693 : : * value in the index.
694 : : *
695 : : * We don't require special handling for these cases -- we don't need to
696 : : * be explicitly instructed to _not_ perform another primitive index scan.
697 : : * It's up to code under the control of _bt_first to always set the flag
698 : : * when another primitive index scan will be required.
699 : : *
700 : : * This works correctly, even with the tricky cases listed above, which
701 : : * all involve access to leaf pages "near the boundaries of the key space"
702 : : * (whether it's from a leftmost/rightmost page, or an imaginary empty
703 : : * leaf root page). If _bt_checkkeys cannot be reached by a primitive
704 : : * index scan for one set of array keys, then it also won't be reached for
705 : : * any later set ("later" in terms of the direction that we scan the index
706 : : * and advance the arrays). The array keys won't have advanced in these
707 : : * cases, but that's the correct behavior (even _bt_advance_array_keys
708 : : * won't always advance the arrays at the point they become "exhausted").
709 : : */
710 [ + + ]: 44474 : if (so->needPrimScan)
711 : : {
712 : : /*
713 : : * Flag was set -- must call _bt_first again, which will reset the
714 : : * scan's needPrimScan flag
715 : : */
716 : 8791 : return true;
717 : : }
718 : :
719 : : /* The top-level index scan ran out of tuples in this scan direction */
720 [ + + ]: 35683 : if (scan->parallel_scan != NULL)
721 : 15 : _bt_parallel_done(scan);
722 : :
723 : 35683 : return false;
724 : : }
725 : :
726 : : /*
727 : : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
728 : : *
729 : : * Caller must have exclusively locked btscan->btps_lock when called.
730 : : */
731 : : static void
256 pg@bowt.ie 732 :CBC 18 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
733 : : BTScanOpaque so)
734 : : {
735 : : char *datumshared;
736 : :
737 : : /* Space for serialized datums begins immediately after btps_arrElems[] */
738 : 18 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
739 [ + + ]: 36 : for (int i = 0; i < so->numArrayKeys; i++)
740 : : {
741 : 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
742 : 18 : ScanKey skey = &so->keyData[array->scan_key];
743 : :
744 [ + - ]: 18 : if (array->num_elems != -1)
745 : : {
746 : : /* Save SAOP array's cur_elem (no need to copy key/datum) */
747 [ - + ]: 18 : Assert(!(skey->sk_flags & SK_BT_SKIP));
748 : 18 : btscan->btps_arrElems[i] = array->cur_elem;
749 : 18 : continue;
750 : : }
751 : :
752 : : /* Save all mutable state associated with skip array's key */
256 pg@bowt.ie 753 [ # # ]:UBC 0 : Assert(skey->sk_flags & SK_BT_SKIP);
754 : 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
755 : 0 : datumshared += sizeof(int);
756 : :
757 [ # # ]: 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
758 : : {
759 : : /* No sk_argument datum to serialize */
760 [ # # ]: 0 : Assert(skey->sk_argument == 0);
761 : 0 : continue;
762 : : }
763 : :
764 : 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
765 : 0 : array->attbyval, array->attlen, &datumshared);
766 : : }
256 pg@bowt.ie 767 :CBC 18 : }
768 : :
769 : : /*
770 : : * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
771 : : *
772 : : * Caller must have exclusively locked btscan->btps_lock when called.
773 : : */
774 : : static void
775 : 18 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
776 : : BTScanOpaque so)
777 : : {
778 : : char *datumshared;
779 : :
780 : : /* Space for serialized datums begins immediately after btps_arrElems[] */
781 : 18 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
782 [ + + ]: 36 : for (int i = 0; i < so->numArrayKeys; i++)
783 : : {
784 : 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
785 : 18 : ScanKey skey = &so->keyData[array->scan_key];
786 : : bool isnull;
787 : :
788 [ + - ]: 18 : if (array->num_elems != -1)
789 : : {
790 : : /* Restore SAOP array using its saved cur_elem */
791 [ - + ]: 18 : Assert(!(skey->sk_flags & SK_BT_SKIP));
792 : 18 : array->cur_elem = btscan->btps_arrElems[i];
793 : 18 : skey->sk_argument = array->elem_values[array->cur_elem];
794 : 18 : continue;
795 : : }
796 : :
797 : : /* Restore skip array by restoring its key directly */
256 pg@bowt.ie 798 [ # # # # ]:UBC 0 : if (!array->attbyval && skey->sk_argument)
799 : 0 : pfree(DatumGetPointer(skey->sk_argument));
800 : 0 : skey->sk_argument = (Datum) 0;
801 : 0 : memcpy(&skey->sk_flags, datumshared, sizeof(int));
802 : 0 : datumshared += sizeof(int);
803 : :
804 [ # # ]: 0 : Assert(skey->sk_flags & SK_BT_SKIP);
805 : :
806 [ # # ]: 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
807 : : {
808 : : /* No sk_argument datum to restore */
809 : 0 : continue;
810 : : }
811 : :
812 : 0 : skey->sk_argument = datumRestore(&datumshared, &isnull);
813 [ # # ]: 0 : if (isnull)
814 : : {
815 [ # # ]: 0 : Assert(skey->sk_argument == 0);
816 [ # # ]: 0 : Assert(skey->sk_flags & SK_SEARCHNULL);
817 [ # # ]: 0 : Assert(skey->sk_flags & SK_ISNULL);
818 : : }
819 : : }
3226 rhaas@postgresql.org 820 :CBC 18 : }
821 : :
822 : : /*
823 : : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
824 : : */
825 : : void
826 : 32 : btinitparallelscan(void *target)
827 : : {
828 : 32 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
829 : :
283 pg@bowt.ie 830 : 32 : LWLockInitialize(&bt_target->btps_lock,
831 : : LWTRANCHE_PARALLEL_BTREE_SCAN);
424 832 : 32 : bt_target->btps_nextScanPage = InvalidBlockNumber;
833 : 32 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
3226 rhaas@postgresql.org 834 : 32 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
835 : 32 : ConditionVariableInit(&bt_target->btps_cv);
836 : 32 : }
837 : :
838 : : /*
839 : : * btparallelrescan() -- reset parallel scan
840 : : */
841 : : void
842 : 12 : btparallelrescan(IndexScanDesc scan)
843 : : {
844 : : BTParallelScanDesc btscan;
845 : 12 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
846 : :
847 [ - + ]: 12 : Assert(parallel_scan);
848 : :
383 peter@eisentraut.org 849 : 12 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
850 : : parallel_scan->ps_offset_am);
851 : :
852 : : /*
853 : : * In theory, we don't need to acquire the LWLock here, because there
854 : : * shouldn't be any other workers running at this point, but we do so for
855 : : * consistency.
856 : : */
283 pg@bowt.ie 857 : 12 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
424 858 : 12 : btscan->btps_nextScanPage = InvalidBlockNumber;
859 : 12 : btscan->btps_lastCurrPage = InvalidBlockNumber;
3226 rhaas@postgresql.org 860 : 12 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
283 pg@bowt.ie 861 : 12 : LWLockRelease(&btscan->btps_lock);
3226 rhaas@postgresql.org 862 : 12 : }
863 : :
864 : : /*
865 : : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
866 : : * page. Other scans must wait until we call _bt_parallel_release()
867 : : * or _bt_parallel_done().
868 : : *
869 : : * The return value is true if we successfully seized the scan and false
870 : : * if we did not. The latter case occurs when no pages remain, or when
871 : : * another primitive index scan is scheduled that caller's backend cannot
872 : : * start just yet (only backends that call from _bt_first are capable of
873 : : * starting primitive index scans, which they indicate by passing first=true).
874 : : *
875 : : * If the return value is true, *next_scan_page returns the next page of the
876 : : * scan, and *last_curr_page returns the page that *next_scan_page came from.
877 : : * An invalid *next_scan_page means the scan hasn't yet started, or that
878 : : * caller needs to start the next primitive index scan (if it's the latter
879 : : * case we'll set so.needPrimScan).
880 : : *
881 : : * Callers should ignore the value of *next_scan_page and *last_curr_page if
882 : : * the return value is false.
883 : : */
884 : : bool
424 pg@bowt.ie 885 : 829 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
886 : : BlockNumber *last_curr_page, bool first)
887 : : {
256 888 : 829 : Relation rel = scan->indexRelation;
3226 rhaas@postgresql.org 889 : 829 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
403 pg@bowt.ie 890 : 829 : bool exit_loop = false,
891 : 829 : status = true,
892 : 829 : endscan = false;
3226 rhaas@postgresql.org 893 : 829 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
894 : : BTParallelScanDesc btscan;
895 : :
403 pg@bowt.ie 896 : 829 : *next_scan_page = InvalidBlockNumber;
424 897 : 829 : *last_curr_page = InvalidBlockNumber;
898 : :
899 : : /*
900 : : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
901 : : * call to _bt_readnextpage treats this backend similarly to a serial
902 : : * backend that steps from *last_curr_page to *next_scan_page (unless this
903 : : * backend's so->currPos is initialized by _bt_readfirstpage before then).
904 : : */
905 : 829 : BTScanPosInvalidate(so->currPos);
906 : 829 : so->currPos.moreLeft = so->currPos.moreRight = true;
907 : :
619 908 [ + + ]: 829 : if (first)
909 : : {
910 : : /*
911 : : * Initialize array related state when called from _bt_first, assuming
912 : : * that this will be the first primitive index scan for the scan
913 : : */
914 : 223 : so->needPrimScan = false;
915 : 223 : so->scanBehind = false;
426 916 : 223 : so->oppositeDirCheck = false;
917 : : }
918 : : else
919 : : {
920 : : /*
921 : : * Don't attempt to seize the scan when it requires another primitive
922 : : * index scan, since caller's backend cannot start it right now
923 : : */
619 924 [ - + ]: 606 : if (so->needPrimScan)
619 pg@bowt.ie 925 :UBC 0 : return false;
926 : : }
927 : :
383 peter@eisentraut.org 928 :CBC 829 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
929 : : parallel_scan->ps_offset_am);
930 : :
931 : : while (1)
932 : : {
283 pg@bowt.ie 933 : 829 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
934 : :
619 935 [ + + ]: 829 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
936 : : {
937 : : /* We're done with this parallel index scan */
3226 rhaas@postgresql.org 938 : 147 : status = false;
939 : : }
403 pg@bowt.ie 940 [ + + ]: 682 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
941 [ + + ]: 620 : btscan->btps_nextScanPage == P_NONE)
942 : : {
943 : : /* End this parallel index scan */
944 : 14 : status = false;
945 : 14 : endscan = true;
946 : : }
619 947 [ + + ]: 668 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
948 : : {
949 [ - + ]: 18 : Assert(so->numArrayKeys);
950 : :
951 [ + - ]: 18 : if (first)
952 : : {
953 : : /* Can start scheduled primitive scan right away, so do so */
954 : 18 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
955 : :
956 : : /* Restore scan's array keys from serialized values */
256 957 : 18 : _bt_parallel_restore_arrays(rel, btscan, so);
619 958 : 18 : exit_loop = true;
959 : : }
960 : : else
961 : : {
962 : : /*
963 : : * Don't attempt to seize the scan when it requires another
964 : : * primitive index scan, since caller's backend cannot start
965 : : * it right now
966 : : */
455 pg@bowt.ie 967 :UBC 0 : status = false;
968 : : }
969 : :
970 : : /*
971 : : * Either way, update backend local state to indicate that a
972 : : * pending primitive scan is required
973 : : */
455 pg@bowt.ie 974 :CBC 18 : so->needPrimScan = true;
975 : 18 : so->scanBehind = false;
426 976 : 18 : so->oppositeDirCheck = false;
977 : : }
619 978 [ + - ]: 650 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
979 : : {
980 : : /*
981 : : * We have successfully seized control of the scan for the purpose
982 : : * of advancing it to a new page!
983 : : */
3226 rhaas@postgresql.org 984 : 650 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
403 pg@bowt.ie 985 [ - + ]: 650 : Assert(btscan->btps_nextScanPage != P_NONE);
424 986 : 650 : *next_scan_page = btscan->btps_nextScanPage;
987 : 650 : *last_curr_page = btscan->btps_lastCurrPage;
3226 rhaas@postgresql.org 988 : 650 : exit_loop = true;
989 : : }
283 pg@bowt.ie 990 : 829 : LWLockRelease(&btscan->btps_lock);
3226 rhaas@postgresql.org 991 [ + + - + ]: 829 : if (exit_loop || !status)
992 : : break;
3226 rhaas@postgresql.org 993 :LBC (5) : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
994 : : }
3226 rhaas@postgresql.org 995 :CBC 829 : ConditionVariableCancelSleep();
996 : :
997 : : /* When the scan has reached the rightmost (or leftmost) page, end it */
403 pg@bowt.ie 998 [ + + ]: 829 : if (endscan)
999 : 14 : _bt_parallel_done(scan);
1000 : :
3226 rhaas@postgresql.org 1001 : 829 : return status;
1002 : : }
1003 : :
1004 : : /*
1005 : : * _bt_parallel_release() -- Complete the process of advancing the scan to a
1006 : : * new page. We now have the new value btps_nextScanPage; another backend
1007 : : * can now begin advancing the scan.
1008 : : *
1009 : : * Callers whose scan uses array keys must save their curr_page argument so
1010 : : * that it can be passed to _bt_parallel_primscan_schedule, should caller
1011 : : * determine that another primitive index scan is required.
1012 : : *
1013 : : * If caller's next_scan_page is P_NONE, the scan has reached the index's
1014 : : * rightmost/leftmost page. This is treated as reaching the end of the scan
1015 : : * within _bt_parallel_seize.
1016 : : *
1017 : : * Note: unlike the serial case, parallel scans don't need to remember both
1018 : : * sibling links. next_scan_page is whichever link is next given the scan's
1019 : : * direction. That's all we'll ever need, since the direction of a parallel
1020 : : * scan can never change.
1021 : : */
1022 : : void
424 pg@bowt.ie 1023 : 668 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1024 : : BlockNumber curr_page)
1025 : : {
3226 rhaas@postgresql.org 1026 : 668 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1027 : : BTParallelScanDesc btscan;
1028 : :
403 pg@bowt.ie 1029 [ - + ]: 668 : Assert(BlockNumberIsValid(next_scan_page));
1030 : :
383 peter@eisentraut.org 1031 : 668 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1032 : : parallel_scan->ps_offset_am);
1033 : :
283 pg@bowt.ie 1034 : 668 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
424 1035 : 668 : btscan->btps_nextScanPage = next_scan_page;
1036 : 668 : btscan->btps_lastCurrPage = curr_page;
3226 rhaas@postgresql.org 1037 : 668 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
283 pg@bowt.ie 1038 : 668 : LWLockRelease(&btscan->btps_lock);
3226 rhaas@postgresql.org 1039 : 668 : ConditionVariableSignal(&btscan->btps_cv);
1040 : 668 : }
1041 : :
1042 : : /*
1043 : : * _bt_parallel_done() -- Mark the parallel scan as complete.
1044 : : *
1045 : : * When there are no pages left to scan, this function should be called to
1046 : : * notify other workers. Otherwise, they might wait forever for the scan to
1047 : : * advance to the next page.
1048 : : */
1049 : : void
1050 : 3548386 : _bt_parallel_done(IndexScanDesc scan)
1051 : : {
455 pg@bowt.ie 1052 : 3548386 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
3226 rhaas@postgresql.org 1053 : 3548386 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1054 : : BTParallelScanDesc btscan;
1055 : 3548386 : bool status_changed = false;
1056 : :
407 pg@bowt.ie 1057 [ + - - + : 3548386 : Assert(!BTScanPosIsValid(so->currPos));
- + ]
1058 : :
1059 : : /* Do nothing, for non-parallel scans */
3226 rhaas@postgresql.org 1060 [ + + ]: 3548386 : if (parallel_scan == NULL)
1061 : 3548298 : return;
1062 : :
1063 : : /*
1064 : : * Should not mark parallel scan done when there's still a pending
1065 : : * primitive index scan
1066 : : */
455 pg@bowt.ie 1067 [ + + ]: 88 : if (so->needPrimScan)
1068 : 18 : return;
1069 : :
383 peter@eisentraut.org 1070 : 70 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1071 : : parallel_scan->ps_offset_am);
1072 : :
1073 : : /*
1074 : : * Mark the parallel scan as done, unless some other process did so
1075 : : * already
1076 : : */
283 pg@bowt.ie 1077 : 70 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
455 1078 [ - + ]: 70 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
619 1079 [ + + ]: 70 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1080 : : {
3226 rhaas@postgresql.org 1081 : 44 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1082 : 44 : status_changed = true;
1083 : : }
283 pg@bowt.ie 1084 : 70 : LWLockRelease(&btscan->btps_lock);
1085 : :
1086 : : /* wake up all the workers associated with this parallel scan */
3226 rhaas@postgresql.org 1087 [ + + ]: 70 : if (status_changed)
1088 : 44 : ConditionVariableBroadcast(&btscan->btps_cv);
1089 : : }
1090 : :
1091 : : /*
1092 : : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1093 : : *
1094 : : * Caller passes the curr_page most recently passed to _bt_parallel_release
1095 : : * by its backend. Caller successfully schedules the next primitive index scan
1096 : : * if the shared parallel state hasn't been seized since caller's backend last
1097 : : * advanced the scan.
1098 : : */
1099 : : void
424 pg@bowt.ie 1100 : 18 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1101 : : {
256 1102 : 18 : Relation rel = scan->indexRelation;
3226 rhaas@postgresql.org 1103 : 18 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1104 : 18 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1105 : : BTParallelScanDesc btscan;
1106 : :
619 pg@bowt.ie 1107 [ - + ]: 18 : Assert(so->numArrayKeys);
1108 : :
383 peter@eisentraut.org 1109 : 18 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1110 : : parallel_scan->ps_offset_am);
1111 : :
283 pg@bowt.ie 1112 : 18 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
424 1113 [ + - ]: 18 : if (btscan->btps_lastCurrPage == curr_page &&
619 1114 [ + - ]: 18 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1115 : : {
424 1116 : 18 : btscan->btps_nextScanPage = InvalidBlockNumber;
1117 : 18 : btscan->btps_lastCurrPage = InvalidBlockNumber;
619 1118 : 18 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1119 : :
1120 : : /* Serialize scan's current array keys */
256 1121 : 18 : _bt_parallel_serialize_arrays(rel, btscan, so);
1122 : : }
283 1123 : 18 : LWLockRelease(&btscan->btps_lock);
3226 rhaas@postgresql.org 1124 : 18 : }
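/*
 * Illustrative sketch (not part of nbtree.c): how scheduling pairs up with
 * _bt_parallel_release.  The backend that last released the scan remembers
 * curr_page, and passes it back here if _bt_checkkeys later decides that
 * another primitive index scan is required:
 *
 *   _bt_parallel_release(scan, next_scan_page, curr_page);
 *   ...
 *   if (another primitive index scan is needed)
 *       _bt_parallel_primscan_schedule(scan, curr_page);
 *
 * The schedule call has an effect only if no other backend has seized the
 * scan in the meantime, which is what the btps_lastCurrPage == curr_page
 * test above verifies.
 */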
1125 : :
1126 : : /*
1127 : : * Bulk deletion of all index entries pointing to a set of heap tuples.
1128 : : * The set of target tuples is specified via a callback routine that tells
1129 : : * whether any given heap tuple (identified by ItemPointer) is being deleted.
1130 : : *
1131 : : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1132 : : */
1133 : : IndexBulkDeleteResult *
3621 tgl@sss.pgh.pa.us 1134 : 1540 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1135 : : IndexBulkDeleteCallback callback, void *callback_state)
1136 : : {
7168 1137 : 1540 : Relation rel = info->index;
1138 : : BTCycleId cycleid;
1139 : :
1140 : : /* allocate stats if first time through, else re-use existing struct */
7162 1141 [ + + ]: 1540 : if (stats == NULL)
6 michael@paquier.xyz 1142 :GNC 1530 : stats = palloc0_object(IndexBulkDeleteResult);
1143 : :
1144 : : /* Establish the vacuum cycle ID to use for this scan */
1145 : : /* The ENSURE stuff ensures we clean up shared memory on failure */
6453 tgl@sss.pgh.pa.us 1146 [ + - ]:CBC 1540 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1147 : : {
7162 1148 : 1540 : cycleid = _bt_start_vacuum(rel);
1149 : :
2055 pg@bowt.ie 1150 : 1540 : btvacuumscan(info, stats, callback, callback_state, cycleid);
1151 : : }
6453 tgl@sss.pgh.pa.us 1152 [ - + ]: 1540 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1153 : 1540 : _bt_end_vacuum(rel);
1154 : :
3621 1155 : 1540 : return stats;
1156 : : }
1157 : :
1158 : : /*
1159 : : * Post-VACUUM cleanup.
1160 : : *
1161 : : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1162 : : */
1163 : : IndexBulkDeleteResult *
1164 : 28172 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1165 : : {
1166 : : BlockNumber num_delpages;
1167 : :
1168 : : /* No-op in ANALYZE ONLY mode */
6111 1169 [ + + ]: 28172 : if (info->analyze_only)
3621 1170 : 9059 : return stats;
1171 : :
1172 : : /*
1173 : : * If btbulkdelete was called, we need not do anything (we just maintain
1174 : : * the information used within _bt_vacuum_needs_cleanup() by calling
1175 : : * _bt_set_cleanup_info() below).
1176 : : *
1177 : : * If btbulkdelete was _not_ called, then we have a choice to make: we
1178 : : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1179 : : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1180 : : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1181 : : * now.
1182 : : */
7168 1183 [ + + ]: 19113 : if (stats == NULL)
1184 : : {
1185 : : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
920 pg@bowt.ie 1186 [ + + ]: 17882 : if (!_bt_vacuum_needs_cleanup(info->index))
2813 teodor@sigaev.ru 1187 : 17875 : return NULL;
1188 : :
1189 : : /*
1190 : : * Since we aren't going to actually delete any leaf items, there's no
1191 : : * need to go through all the vacuum-cycle-ID pushups here.
1192 : : *
1193 : : * Posting list tuples are a source of inaccuracy for cleanup-only
1194 : : * scans. btvacuumscan() will assume that the number of index tuples
1195 : : * from each page can be used as num_index_tuples, even though
1196 : : * num_index_tuples is supposed to represent the number of TIDs in the
1197 : : * index. This naive approach can underestimate the number of tuples
1198 : : * in the index significantly.
1199 : : *
1200 : : * We handle the problem by making num_index_tuples an estimate in
1201 : : * cleanup-only case.
1202 : : */
6 michael@paquier.xyz 1203 :GNC 7 : stats = palloc0_object(IndexBulkDeleteResult);
2055 pg@bowt.ie 1204 :CBC 7 : btvacuumscan(info, stats, NULL, NULL, 0);
1742 1205 : 7 : stats->estimated_count = true;
1206 : : }
1207 : :
1208 : : /*
1209 : : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1210 : : *
1211 : : * num_delpages is the number of deleted pages now in the index that were
1212 : : * not safe to place in the FSM to be recycled just yet. num_delpages is
1213 : : * greater than 0 only when _bt_pagedel() actually deleted pages during
1214 : : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1215 : : * have failed to place any newly deleted pages in the FSM just moments
1216 : : * ago. (Actually, there are edge cases where recycling of the current
1217 : : * VACUUM's newly deleted pages does not even become safe by the time the
1218 : : * next VACUUM comes around. See nbtree/README.)
1219 : : */
1756 1220 [ - + ]: 1238 : Assert(stats->pages_deleted >= stats->pages_free);
1221 : 1238 : num_delpages = stats->pages_deleted - stats->pages_free;
920 1222 : 1238 : _bt_set_cleanup_info(info->index, num_delpages);
1223 : :
1224 : : /*
1225 : : * It's quite possible for us to be fooled by concurrent page splits into
1226 : : * double-counting some index tuples, so disbelieve any total that exceeds
1227 : : * the underlying heap's count ... if we know that accurately. Otherwise
1228 : : * this might just make matters worse.
1229 : : */
5790 tgl@sss.pgh.pa.us 1230 [ + + ]: 1238 : if (!info->estimated_count)
1231 : : {
7162 1232 [ + + ]: 1206 : if (stats->num_index_tuples > info->num_heap_tuples)
1233 : 37 : stats->num_index_tuples = info->num_heap_tuples;
1234 : : }
1235 : :
3621 1236 : 1238 : return stats;
1237 : : }
1238 : :
1239 : : /*
1240 : : * btvacuumscan --- scan the index for VACUUMing purposes
1241 : : *
1242 : : * This combines the functions of looking for leaf tuples that are deletable
1243 : : * according to the vacuum callback, looking for empty pages that can be
1244 : : * deleted, and looking for old deleted pages that can be recycled. Both
1245 : : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1246 : : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1247 : : *
1248 : : * The caller is responsible for initially allocating/zeroing a stats struct
1249 : : * and for obtaining a vacuum cycle ID if necessary.
1250 : : */
1251 : : static void
7162 1252 : 1547 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1253 : : IndexBulkDeleteCallback callback, void *callback_state,
1254 : : BTCycleId cycleid)
1255 : : {
1256 : 1547 : Relation rel = info->index;
1257 : : BTVacState vstate;
1258 : : BlockNumber num_pages;
1259 : : bool needLock;
1260 : : BlockRangeReadStreamPrivate p;
270 melanieplageman@gmai 1261 : 1547 : ReadStream *stream = NULL;
1262 : :
1263 : : /*
1264 : : * Reset fields that track information about the entire index now. This
1265 : : * avoids double-counting in the case where a single VACUUM command
1266 : : * requires multiple scans of the index.
1267 : : *
1268 : : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1269 : : * since they track information about the VACUUM command, and so must last
1270 : : * across each call to btvacuumscan().
1271 : : *
1272 : : * (Note that pages_free is treated as state about the whole index, not
1273 : : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1274 : : * calls are idempotent, and get repeated for the same deleted pages in
1275 : : * some scenarios. The point for us is to track the number of recyclable
1276 : : * pages in the index at the end of the VACUUM command.)
1277 : : */
1756 pg@bowt.ie 1278 : 1547 : stats->num_pages = 0;
7162 tgl@sss.pgh.pa.us 1279 : 1547 : stats->num_index_tuples = 0;
1280 : 1547 : stats->pages_deleted = 0;
1756 pg@bowt.ie 1281 : 1547 : stats->pages_free = 0;
1282 : :
1283 : : /* Set up info to pass down to btvacuumpage */
7162 tgl@sss.pgh.pa.us 1284 : 1547 : vstate.info = info;
1285 : 1547 : vstate.stats = stats;
1286 : 1547 : vstate.callback = callback;
1287 : 1547 : vstate.callback_state = callback_state;
1288 : 1547 : vstate.cycleid = cycleid;
1289 : :
1290 : : /* Create a temporary memory context to run _bt_pagedel in */
1291 : 1547 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1292 : : "_bt_pagedel",
1293 : : ALLOCSET_DEFAULT_SIZES);
1294 : :
1295 : : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1731 pg@bowt.ie 1296 : 1547 : vstate.bufsize = 0;
1297 : 1547 : vstate.maxbufsize = 0;
1298 : 1547 : vstate.pendingpages = NULL;
1299 : 1547 : vstate.npendingpages = 0;
1300 : : /* Consider applying _bt_pendingfsm_finalize optimization */
1301 : 1547 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1302 : :
1303 : : /*
1304 : : * The outer loop iterates over all index pages except the metapage, in
1305 : : * physical order (we hope the kernel will cooperate in providing
1306 : : * read-ahead for speed). It is critical that we visit all leaf pages,
1307 : : * including ones added after we start the scan, else we might fail to
1308 : : * delete some deletable tuples. Hence, we must repeatedly check the
1309 : : * relation length. We must acquire the relation-extension lock while
1310 : : * doing so to avoid a race condition: if someone else is extending the
1311 : : * relation, there is a window where bufmgr/smgr have created a new
1312 : : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1313 : : * we manage to scan such a page here, we'll improperly assume it can be
1314 : : * recycled. Taking the lock synchronizes things enough to prevent a
1315 : : * problem: either num_pages won't include the new page, or _bt_getbuf
1316 : : * already has write lock on the buffer and it will be fully initialized
1317 : : * before we can examine it. Also, we need not worry if a page is added
1318 : : * immediately after we look; the page splitting code already has
1319 : : * write-lock on the left page before it adds a right page, so we must
1320 : : * already have processed any tuples due to be moved into such a page.
1321 : : *
1322 : : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1323 : : * think the use of the extension lock is still required.
1324 : : *
1325 : : * We can skip locking for new or temp relations, however, since no one
1326 : : * else could be accessing them.
1327 : : */
7162 tgl@sss.pgh.pa.us 1328 [ + + + - ]: 1547 : needLock = !RELATION_IS_LOCAL(rel);
1329 : :
270 melanieplageman@gmai 1330 : 1547 : p.current_blocknum = BTREE_METAPAGE + 1;
1331 : :
1332 : : /*
1333 : : * It is safe to use batchmode as block_range_read_stream_cb takes no
1334 : : * locks.
1335 : : */
232 1336 : 1547 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1337 : : READ_STREAM_FULL |
1338 : : READ_STREAM_USE_BATCHING,
1339 : : info->strategy,
1340 : : rel,
1341 : : MAIN_FORKNUM,
1342 : : block_range_read_stream_cb,
1343 : : &p,
1344 : : 0);
1345 : : for (;;)
1346 : : {
1347 : : /* Get the current relation length */
7162 tgl@sss.pgh.pa.us 1348 [ + + ]: 2961 : if (needLock)
1349 : 2959 : LockRelationForExtension(rel, ExclusiveLock);
1350 : 2961 : num_pages = RelationGetNumberOfBlocks(rel);
1351 [ + + ]: 2961 : if (needLock)
1352 : 2959 : UnlockRelationForExtension(rel, ExclusiveLock);
1353 : :
2450 alvherre@alvh.no-ip. 1354 [ + + ]: 2961 : if (info->report_progress)
1355 : 465 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1356 : : num_pages);
1357 : :
1358 : : /* Quit if we've scanned the whole relation */
270 melanieplageman@gmai 1359 [ + + ]: 2961 : if (p.current_blocknum >= num_pages)
7162 tgl@sss.pgh.pa.us 1360 : 1547 : break;
1361 : :
270 melanieplageman@gmai 1362 : 1414 : p.last_exclusive = num_pages;
1363 : :
1364 : : /* Iterate over pages, then loop back to recheck relation length */
1365 : : while (true)
7247 tgl@sss.pgh.pa.us 1366 : 15044 : {
1367 : : BlockNumber current_block;
1368 : : Buffer buf;
1369 : :
1370 : : /* call vacuum_delay_point while not holding any buffer lock */
270 melanieplageman@gmai 1371 : 16458 : vacuum_delay_point(false);
1372 : :
1373 : 16458 : buf = read_stream_next_buffer(stream, NULL);
1374 : :
1375 [ + + ]: 16458 : if (!BufferIsValid(buf))
1376 : 1414 : break;
1377 : :
1378 : 15044 : current_block = btvacuumpage(&vstate, buf);
1379 : :
2450 alvherre@alvh.no-ip. 1380 [ + + ]: 15044 : if (info->report_progress)
1381 : 487 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1382 : : current_block);
1383 : : }
1384 : :
1385 : : /*
1386 : : * We have to reset the read stream to use it again. After returning
1387 : : * InvalidBuffer, the read stream API won't invoke our callback again
1388 : : * until the stream has been reset.
1389 : : */
270 melanieplageman@gmai 1390 : 1414 : read_stream_reset(stream);
1391 : : }
1392 : :
1393 : 1547 : read_stream_end(stream);
1394 : :
1395 : : /* Set statistics num_pages field to final size of index */
1756 pg@bowt.ie 1396 : 1547 : stats->num_pages = num_pages;
1397 : :
7162 tgl@sss.pgh.pa.us 1398 : 1547 : MemoryContextDelete(vstate.pagedelcontext);
1399 : :
1400 : : /*
1401 : : * If there were any calls to _bt_pagedel() during scan of the index then
1402 : : * see if any of the resulting pages can be placed in the FSM now. When
1403 : : * it's not safe we'll have to leave it up to a future VACUUM operation.
1404 : : *
1405 : : * Finally, if we placed any pages in the FSM (either just now or during
1406 : : * the scan), forcibly update the upper-level FSM pages to ensure that
1407 : : * searchers can find them.
1408 : : */
1731 pg@bowt.ie 1409 : 1547 : _bt_pendingfsm_finalize(rel, &vstate);
1756 1410 [ + + ]: 1547 : if (stats->pages_free > 0)
2818 tgl@sss.pgh.pa.us 1411 : 56 : IndexFreeSpaceMapVacuum(rel);
7162 1412 : 1547 : }
1413 : :
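/*
 * Editor's sketch (not part of nbtree.c): the callback contract that the
 * read stream loop in btvacuumscan() relies on.  Assuming the documented
 * behavior of block_range_read_stream_cb, the callback hands back the
 * blocks current_blocknum .. last_exclusive - 1 in order and then
 * InvalidBlockNumber, at which point read_stream_next_buffer() returns
 * InvalidBuffer and the inner while loop above ends until the stream is
 * reset.  The function name below is hypothetical; it only restates that
 * assumed contract.
 */
static inline BlockNumber
sketch_block_range_cb(ReadStream *stream, void *callback_private_data,
					  void *per_buffer_data)
{
	BlockRangeReadStreamPrivate *p = callback_private_data;

	if (p->current_blocknum < p->last_exclusive)
		return p->current_blocknum++;	/* next block in the range */

	return InvalidBlockNumber;	/* range exhausted until read_stream_reset() */
}
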
1414 : : /*
1415 : : * btvacuumpage --- VACUUM one page
1416 : : *
1417 : : * This processes a single page for btvacuumscan(). In some cases we must
1418 : : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1419 : : * a previous call here. This is how we handle page splits (that happened
1420 : : * after our cycleid was acquired) whose right half page happened to reuse
1421 : : * a block that we might have processed at some point before it was
1422 : : * recycled (i.e. before the page split).
1423 : : *
1424 : : * Returns BlockNumber of a scanned page (not backtracked).
1425 : : */
1426 : : static BlockNumber
270 melanieplageman@gmai 1427 : 15044 : btvacuumpage(BTVacState *vstate, Buffer buf)
1428 : : {
7162 tgl@sss.pgh.pa.us 1429 : 15044 : IndexVacuumInfo *info = vstate->info;
1430 : 15044 : IndexBulkDeleteResult *stats = vstate->stats;
1431 : 15044 : IndexBulkDeleteCallback callback = vstate->callback;
1432 : 15044 : void *callback_state = vstate->callback_state;
1433 : 15044 : Relation rel = info->index;
988 pg@bowt.ie 1434 : 15044 : Relation heaprel = info->heaprel;
1435 : : bool attempt_pagedel;
1436 : : BlockNumber blkno,
1437 : : backtrack_to;
270 melanieplageman@gmai 1438 : 15044 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1439 : : Page page;
1440 : : BTPageOpaque opaque;
1441 : :
2054 pg@bowt.ie 1442 : 15044 : blkno = scanblkno;
1443 : :
1444 : 15044 : backtrack:
1445 : :
1446 : 15044 : attempt_pagedel = false;
1447 : 15044 : backtrack_to = P_NONE;
1448 : :
1974 1449 : 15044 : _bt_lockbuf(rel, buf, BT_READ);
3527 kgrittn@postgresql.o 1450 : 15044 : page = BufferGetPage(buf);
2054 pg@bowt.ie 1451 : 15044 : opaque = NULL;
7162 tgl@sss.pgh.pa.us 1452 [ + - ]: 15044 : if (!PageIsNew(page))
1453 : : {
1454 : 15044 : _bt_checkpage(rel, buf);
1355 michael@paquier.xyz 1455 : 15044 : opaque = BTPageGetOpaque(page);
1456 : : }
1457 : :
2054 pg@bowt.ie 1458 [ - + ]: 15044 : Assert(blkno <= scanblkno);
1459 [ - + ]: 15044 : if (blkno != scanblkno)
1460 : : {
1461 : : /*
1462 : : * We're backtracking.
1463 : : *
1464 : : * We followed a right link to a sibling leaf page (a page that
1465 : : * happens to be from a block located before scanblkno). The only
1466 : : * case we want to do anything with is a live leaf page having the
1467 : : * current vacuum cycle ID.
1468 : : *
1469 : : * The page had better be in a state that's consistent with what we
1470 : : * expect. Check for conditions that imply corruption in passing. It
1471 : : * can't be half-dead because only an interrupted VACUUM process can
1472 : : * leave pages in that state, so we'd definitely have dealt with it
1473 : : * back when the page was the scanblkno page (half-dead pages are
1474 : : * always marked fully deleted by _bt_pagedel(), barring corruption).
1475 : : */
2054 pg@bowt.ie 1476 [ # # # # :UBC 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
# # ]
1477 : : {
1478 : 0 : Assert(false);
1479 : : ereport(LOG,
1480 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1481 : : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1482 : : blkno, scanblkno, RelationGetRelationName(rel))));
1483 : : _bt_relbuf(rel, buf);
1484 : : return scanblkno;
1485 : : }
1486 : :
1487 : : /*
1488 : : * We may have already processed the page in an earlier call, when the
1489 : : * page was scanblkno. This happens when the leaf page split occurred
1490 : : * after the scan began, but before the right sibling page became the
1491 : : * scanblkno.
1492 : : *
1493 : : * Page may also have been deleted by the current btvacuumpage() call,
1494 : : * since _bt_pagedel() sometimes deletes the right sibling page of
1495 : : * scanblkno in passing (it does so after we decided where to
1496 : : * backtrack to). We don't need to process this page as a deleted
1497 : : * page a second time now (in fact, it would be wrong to count it as a
1498 : : * deleted page in the bulk delete statistics a second time).
1499 : : */
1500 [ # # # # ]: 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1501 : : {
1502 : : /* Done with current scanblkno (and all lower split pages) */
7162 tgl@sss.pgh.pa.us 1503 : 0 : _bt_relbuf(rel, buf);
270 melanieplageman@gmai 1504 : 0 : return scanblkno;
1505 : : }
1506 : : }
1507 : :
988 pg@bowt.ie 1508 [ + - + + ]:CBC 15044 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1509 : : {
1510 : : /* Okay to recycle this page (which could be leaf or internal) */
6286 heikki.linnakangas@i 1511 : 617 : RecordFreeIndexPage(rel, blkno);
7162 tgl@sss.pgh.pa.us 1512 : 617 : stats->pages_deleted++;
1756 pg@bowt.ie 1513 : 617 : stats->pages_free++;
1514 : : }
7162 tgl@sss.pgh.pa.us 1515 [ + + ]: 14427 : else if (P_ISDELETED(opaque))
1516 : : {
1517 : : /*
1518 : : * Already deleted page (which could be leaf or internal). Can't
1519 : : * recycle yet.
1520 : : */
1521 : 62 : stats->pages_deleted++;
1522 : : }
6985 1523 [ - + ]: 14365 : else if (P_ISHALFDEAD(opaque))
1524 : : {
1525 : : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1755 pg@bowt.ie 1526 :UBC 0 : attempt_pagedel = true;
1527 : :
1528 : : /*
1529 : : * _bt_pagedel() will increment both pages_newly_deleted and
1530 : : * pages_deleted stats in all cases (barring corruption)
1531 : : */
1532 : : }
7162 tgl@sss.pgh.pa.us 1533 [ + + ]:CBC 14365 : else if (P_ISLEAF(opaque))
1534 : : {
1535 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
1536 : : int ndeletable;
1537 : : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1538 : : int nupdatable;
1539 : : OffsetNumber offnum,
1540 : : minoff,
1541 : : maxoff;
1542 : : int nhtidsdead,
1543 : : nhtidslive;
1544 : :
1545 : : /*
1546 : : * Trade in the initial read lock for a full cleanup lock on this
1547 : : * page. We must get such a lock on every leaf page over the course
1548 : : * of the vacuum scan, whether or not it actually contains any
1549 : : * deletable tuples --- see nbtree/README.
1550 : : */
1974 pg@bowt.ie 1551 : 13544 : _bt_upgradelockbufcleanup(rel, buf);
1552 : :
1553 : : /*
1554 : : * Check whether we need to backtrack to earlier pages. What we are
1555 : : * concerned about is a page split that happened since we started the
1556 : : * vacuum scan. If the split moved tuples on the right half of the
1557 : : * split (i.e. the tuples that sort high) to a block that we already
1558 : : * passed over, then we might have missed the tuples. We need to
1559 : : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1560 : : * or deleting scanblkno page below!)
1561 : : */
7162 tgl@sss.pgh.pa.us 1562 [ + + ]: 13544 : if (vstate->cycleid != 0 &&
1563 [ - + ]: 13470 : opaque->btpo_cycleid == vstate->cycleid &&
7162 tgl@sss.pgh.pa.us 1564 [ # # ]:LBC (4) : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1565 [ # # ]: (3) : !P_RIGHTMOST(opaque) &&
2054 pg@bowt.ie 1566 [ # # ]: (2) : opaque->btpo_next < scanblkno)
2054 pg@bowt.ie 1567 :UBC 0 : backtrack_to = opaque->btpo_next;
1568 : :
7162 tgl@sss.pgh.pa.us 1569 :CBC 13544 : ndeletable = 0;
2120 pg@bowt.ie 1570 : 13544 : nupdatable = 0;
7162 tgl@sss.pgh.pa.us 1571 [ + + ]: 13544 : minoff = P_FIRSTDATAKEY(opaque);
1572 : 13544 : maxoff = PageGetMaxOffsetNumber(page);
2120 pg@bowt.ie 1573 : 13544 : nhtidsdead = 0;
1574 : 13544 : nhtidslive = 0;
7162 tgl@sss.pgh.pa.us 1575 [ + + ]: 13544 : if (callback)
1576 : : {
1577 : : /* btbulkdelete callback tells us what to delete (or update) */
1578 : 13470 : for (offnum = minoff;
1579 [ + + ]: 2593018 : offnum <= maxoff;
1580 : 2579548 : offnum = OffsetNumberNext(offnum))
1581 : : {
1582 : : IndexTuple itup;
1583 : :
1584 : 2579548 : itup = (IndexTuple) PageGetItem(page,
1585 : 2579548 : PageGetItemId(page, offnum));
1586 : :
2120 pg@bowt.ie 1587 [ - + ]: 2579548 : Assert(!BTreeTupleIsPivot(itup));
1588 [ + + ]: 2579548 : if (!BTreeTupleIsPosting(itup))
1589 : : {
1590 : : /* Regular tuple, standard table TID representation */
1591 [ + + ]: 2488456 : if (callback(&itup->t_tid, callback_state))
1592 : : {
1593 : 975620 : deletable[ndeletable++] = offnum;
1594 : 975620 : nhtidsdead++;
1595 : : }
1596 : : else
1597 : 1512836 : nhtidslive++;
1598 : : }
1599 : : else
1600 : : {
1601 : : BTVacuumPosting vacposting;
1602 : : int nremaining;
1603 : :
1604 : : /* Posting list tuple */
1605 : 91092 : vacposting = btreevacuumposting(vstate, itup, offnum,
1606 : : &nremaining);
1607 [ + + ]: 91092 : if (vacposting == NULL)
1608 : : {
1609 : : /*
1610 : : * All table TIDs from the posting tuple remain, so no
1611 : : * delete or update required
1612 : : */
1613 [ - + ]: 32334 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1614 : : }
1615 [ + + ]: 58758 : else if (nremaining > 0)
1616 : : {
1617 : :
1618 : : /*
1619 : : * Store metadata about posting list tuple in
1620 : : * updatable array for entire page. Existing tuple
1621 : : * will be updated during the later call to
1622 : : * _bt_delitems_vacuum().
1623 : : */
1624 [ - + ]: 22658 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1625 : 22658 : updatable[nupdatable++] = vacposting;
1626 : 22658 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1627 : : }
1628 : : else
1629 : : {
1630 : : /*
1631 : : * All table TIDs from the posting list must be
1632 : : * deleted. We'll delete the index tuple completely
1633 : : * (no update required).
1634 : : */
1635 [ - + ]: 36100 : Assert(nremaining == 0);
1636 : 36100 : deletable[ndeletable++] = offnum;
1637 : 36100 : nhtidsdead += BTreeTupleGetNPosting(itup);
1638 : 36100 : pfree(vacposting);
1639 : : }
1640 : :
1641 : 91092 : nhtidslive += nremaining;
1642 : : }
1643 : : }
1644 : : }
1645 : :
1646 : : /*
1647 : : * Apply any needed deletes or updates. We issue just one
1648 : : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1649 : : */
1650 [ + + + + ]: 13544 : if (ndeletable > 0 || nupdatable > 0)
1651 : : {
2054 1652 [ - + ]: 8445 : Assert(nhtidsdead >= ndeletable + nupdatable);
2120 1653 : 8445 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1654 : : nupdatable);
1655 : :
1656 : 8445 : stats->tuples_removed += nhtidsdead;
1657 : : /* must recompute maxoff */
7162 tgl@sss.pgh.pa.us 1658 : 8445 : maxoff = PageGetMaxOffsetNumber(page);
1659 : :
1660 : : /* can't leak memory here */
2120 pg@bowt.ie 1661 [ + + ]: 31103 : for (int i = 0; i < nupdatable; i++)
1662 : 22658 : pfree(updatable[i]);
1663 : : }
1664 : : else
1665 : : {
1666 : : /*
1667 : : * If the leaf page has been split during this vacuum cycle, it
1668 : : * seems worth expending a write to clear btpo_cycleid even if we
1669 : : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1670 : : * takes care of this.) This ensures we won't process the page
1671 : : * again.
1672 : : *
1673 : : * We treat this like a hint-bit update because there's no need to
1674 : : * WAL-log it.
1675 : : */
1676 [ - + ]: 5099 : Assert(nhtidsdead == 0);
7162 tgl@sss.pgh.pa.us 1677 [ + + ]: 5099 : if (vstate->cycleid != 0 &&
1678 [ - + ]: 5025 : opaque->btpo_cycleid == vstate->cycleid)
1679 : : {
7162 tgl@sss.pgh.pa.us 1680 :LBC (1) : opaque->btpo_cycleid = 0;
4565 jdavis@postgresql.or 1681 : (1) : MarkBufferDirtyHint(buf, true);
1682 : : }
1683 : : }
1684 : :
1685 : : /*
1686 : : * If the leaf page is now empty, try to delete it; else count the
1687 : : * live tuples (live table TIDs in posting lists are counted as
1688 : : * separate live tuples). We don't delete when backtracking, though,
1689 : : * since that would require teaching _bt_pagedel() about backtracking
1690 : : * (doesn't seem worth adding more complexity to deal with that).
1691 : : *
1692 : : * We don't count the number of live TIDs during cleanup-only calls to
1693 : : * btvacuumscan (i.e. when callback is not set). We count the number
1694 : : * of index tuples directly instead. This avoids the expense of
1695 : : * directly examining all of the tuples on each page. VACUUM will
1696 : : * treat num_index_tuples as an estimate in cleanup-only case, so it
1697 : : * doesn't matter that this underestimates num_index_tuples
1698 : : * significantly in some cases.
1699 : : */
7162 tgl@sss.pgh.pa.us 1700 [ + + ]:CBC 13544 : if (minoff > maxoff)
2054 pg@bowt.ie 1701 : 3147 : attempt_pagedel = (blkno == scanblkno);
1868 1702 [ + + ]: 10397 : else if (callback)
2120 1703 : 10327 : stats->num_index_tuples += nhtidslive;
1704 : : else
1868 1705 : 70 : stats->num_index_tuples += maxoff - minoff + 1;
1706 : :
2054 1707 [ + + - + ]: 13544 : Assert(!attempt_pagedel || nhtidslive == 0);
1708 : : }
1709 : :
1710 [ + + ]: 15044 : if (attempt_pagedel)
1711 : : {
1712 : : MemoryContext oldcontext;
1713 : :
1714 : : /* Run pagedel in a temp context to avoid memory leakage */
7162 tgl@sss.pgh.pa.us 1715 : 3147 : MemoryContextReset(vstate->pagedelcontext);
1716 : 3147 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1717 : :
1718 : : /*
1719 : : * _bt_pagedel maintains the bulk delete stats on our behalf;
1720 : : * pages_newly_deleted and pages_deleted are likely to be incremented
1721 : : * during call
1722 : : */
2054 pg@bowt.ie 1723 [ - + ]: 3147 : Assert(blkno == scanblkno);
1755 1724 : 3147 : _bt_pagedel(rel, buf, vstate);
1725 : :
7162 tgl@sss.pgh.pa.us 1726 : 3147 : MemoryContextSwitchTo(oldcontext);
1727 : : /* pagedel released buffer, so we shouldn't */
1728 : : }
1729 : : else
1730 : 11897 : _bt_relbuf(rel, buf);
1731 : :
2054 pg@bowt.ie 1732 [ - + ]: 15044 : if (backtrack_to != P_NONE)
1733 : : {
2054 pg@bowt.ie 1734 :UBC 0 : blkno = backtrack_to;
1735 : :
1736 : : /* check for vacuum delay while not holding any buffer lock */
270 melanieplageman@gmai 1737 : 0 : vacuum_delay_point(false);
1738 : :
1739 : : /*
1740 : : * We can't use _bt_getbuf() here because it always applies
1741 : : * _bt_checkpage(), which will barf on an all-zero page. We want to
1742 : : * recycle all-zero pages, not fail. Also, we want to use a
1743 : : * nondefault buffer access strategy.
1744 : : */
1745 : 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1746 : : info->strategy);
2054 pg@bowt.ie 1747 : 0 : goto backtrack;
1748 : : }
1749 : :
270 melanieplageman@gmai 1750 :CBC 15044 : return scanblkno;
1751 : : }
1752 : :
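/*
 * Editor's sketch (not part of nbtree.c): the backtrack test applied to
 * each leaf page above, restated in isolation.  btvacuumpage() only
 * schedules a backtrack to the current page's right sibling when the page
 * was split during this VACUUM (matching cycle IDs), it is not flagged
 * BTP_SPLIT_END, it is not the rightmost page, and its right sibling lives
 * at a block number that the scan has already passed.  The helper name is
 * hypothetical.
 */
static inline bool
sketch_needs_backtrack(BTPageOpaque opaque, BTCycleId cycleid,
					   BlockNumber scanblkno)
{
	return cycleid != 0 &&
		opaque->btpo_cycleid == cycleid &&
		!(opaque->btpo_flags & BTP_SPLIT_END) &&
		!P_RIGHTMOST(opaque) &&
		opaque->btpo_next < scanblkno;
}
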
1753 : : /*
1754 : : * btreevacuumposting --- determine TIDs still needed in posting list
1755 : : *
1756 : : * Returns metadata describing how to build a replacement tuple without the TIDs
1757 : : * that VACUUM needs to delete. Returned value is NULL in the common case
1758 : : * where no changes are needed to caller's posting list tuple (we avoid
1759 : : * allocating memory here as an optimization).
1760 : : *
1761 : : * The number of TIDs that should remain in the posting list tuple is set for
1762 : : * caller in *nremaining.
1763 : : */
1764 : : static BTVacuumPosting
2120 pg@bowt.ie 1765 : 91092 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1766 : : OffsetNumber updatedoffset, int *nremaining)
1767 : : {
1768 : 91092 : int live = 0;
1769 : 91092 : int nitem = BTreeTupleGetNPosting(posting);
1770 : 91092 : ItemPointer items = BTreeTupleGetPosting(posting);
1771 : 91092 : BTVacuumPosting vacposting = NULL;
1772 : :
1773 [ + + ]: 498079 : for (int i = 0; i < nitem; i++)
1774 : : {
1775 [ + + ]: 406987 : if (!vstate->callback(items + i, vstate->callback_state))
1776 : : {
1777 : : /* Live table TID */
1778 : 202927 : live++;
1779 : : }
1780 [ + + ]: 204060 : else if (vacposting == NULL)
1781 : : {
1782 : : /*
1783 : : * First dead table TID encountered.
1784 : : *
1785 : : * It's now clear that we need to delete one or more dead table
1786 : : * TIDs, so start maintaining metadata describing how to update
1787 : : * the existing posting list tuple.
1788 : : */
1789 : 58758 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1790 : : nitem * sizeof(uint16));
1791 : :
1792 : 58758 : vacposting->itup = posting;
1793 : 58758 : vacposting->updatedoffset = updatedoffset;
1794 : 58758 : vacposting->ndeletedtids = 0;
1795 : 58758 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1796 : : }
1797 : : else
1798 : : {
1799 : : /* Second or subsequent dead table TID */
1800 : 145302 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1801 : : }
1802 : : }
1803 : :
1804 : 91092 : *nremaining = live;
1805 : 91092 : return vacposting;
1806 : : }
1807 : :
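/*
 * Editor's sketch (not part of nbtree.c): how the metadata returned by
 * btreevacuumposting() is meant to be read.  deletetids[] holds ascending
 * 0-based positions into the original posting list; every position not
 * listed there survives.  The real replacement tuple is built later, on
 * the _bt_delitems_vacuum() path.  The helper name and the caller-supplied
 * destination array are hypothetical.
 */
static inline void
sketch_surviving_tids(BTVacuumPosting vacposting, ItemPointerData *dst)
{
	ItemPointer src = BTreeTupleGetPosting(vacposting->itup);
	int			nitem = BTreeTupleGetNPosting(vacposting->itup);
	int			nextdead = 0;
	int			live = 0;

	for (int i = 0; i < nitem; i++)
	{
		if (nextdead < vacposting->ndeletedtids &&
			vacposting->deletetids[nextdead] == i)
			nextdead++;			/* position i is being deleted */
		else
			dst[live++] = src[i];	/* position i remains live */
	}

	Assert(nextdead == vacposting->ndeletedtids);
	Assert(live == nitem - vacposting->ndeletedtids);
}
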
1808 : : /*
1809 : : * btcanreturn() -- Check whether btree indexes support index-only scans.
1810 : : *
1811 : : * btrees always do, so this is trivial.
1812 : : */
1813 : : bool
3621 tgl@sss.pgh.pa.us 1814 : 575950 : btcanreturn(Relation index, int attno)
1815 : : {
1816 : 575950 : return true;
1817 : : }
1818 : :
1819 : : /*
1820 : : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1821 : : */
1822 : : int
462 peter@eisentraut.org 1823 : 367917 : btgettreeheight(Relation rel)
1824 : : {
1825 : 367917 : return _bt_getrootheight(rel);
1826 : : }
1827 : :
1828 : : CompareType
298 peter@eisentraut.org 1829 :UBC 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1830 : : {
317 1831 [ # # # # : 0 : switch (strategy)
# # ]
1832 : : {
1833 : 0 : case BTLessStrategyNumber:
1834 : 0 : return COMPARE_LT;
1835 : 0 : case BTLessEqualStrategyNumber:
1836 : 0 : return COMPARE_LE;
1837 : 0 : case BTEqualStrategyNumber:
1838 : 0 : return COMPARE_EQ;
1839 : 0 : case BTGreaterEqualStrategyNumber:
1840 : 0 : return COMPARE_GE;
1841 : 0 : case BTGreaterStrategyNumber:
1842 : 0 : return COMPARE_GT;
1843 : 0 : default:
1844 : 0 : return COMPARE_INVALID;
1845 : : }
1846 : : }
1847 : :
1848 : : StrategyNumber
298 1849 : 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1850 : : {
317 1851 [ # # # # : 0 : switch (cmptype)
# # ]
1852 : : {
1853 : 0 : case COMPARE_LT:
1854 : 0 : return BTLessStrategyNumber;
1855 : 0 : case COMPARE_LE:
1856 : 0 : return BTLessEqualStrategyNumber;
1857 : 0 : case COMPARE_EQ:
1858 : 0 : return BTEqualStrategyNumber;
1859 : 0 : case COMPARE_GE:
1860 : 0 : return BTGreaterEqualStrategyNumber;
1861 : 0 : case COMPARE_GT:
1862 : 0 : return BTGreaterStrategyNumber;
1863 : 0 : default:
1864 : 0 : return InvalidStrategy;
1865 : : }
1866 : : }
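
/*
 * Editor's sketch (not part of nbtree.c): for the five btree strategies,
 * the two translation functions above are inverses of one another.  The
 * opfamily argument is unused by both, so any OID works here; InvalidOid
 * is passed purely for illustration, and the helper name is hypothetical.
 */
static inline void
sketch_check_translation_roundtrip(void)
{
	Assert(bttranslatecmptype(bttranslatestrategy(BTLessStrategyNumber,
												  InvalidOid),
							  InvalidOid) == BTLessStrategyNumber);
	Assert(bttranslatestrategy(bttranslatecmptype(COMPARE_EQ, InvalidOid),
							   InvalidOid) == COMPARE_EQ);
}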
|