/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/stratnum.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/lwlock.h"
#include "storage/read_stream.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
 * scan to advance it via another call to _bt_first.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 */
typedef enum
{
    BTPARALLEL_NOT_INITIALIZED,
    BTPARALLEL_NEED_PRIMSCAN,
    BTPARALLEL_ADVANCING,
    BTPARALLEL_IDLE,
    BTPARALLEL_DONE,
} BTPS_State;
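
/*
 * Illustrative sketch only (never compiled): the btps_pageStatus transitions
 * implied by the routines below.  _bt_parallel_seize moves the scan to
 * BTPARALLEL_ADVANCING, _bt_parallel_release moves it back to
 * BTPARALLEL_IDLE, _bt_parallel_primscan_schedule moves an idle scan to
 * BTPARALLEL_NEED_PRIMSCAN, and _bt_parallel_done enters the terminal state.
 * The ADVANCING-to-DONE edge is this sketch's reading of the code, not
 * something the file states explicitly.
 */
#ifdef NOT_USED
static bool
btps_transition_is_legal(BTPS_State from, BTPS_State to)
{
    switch (from)
    {
        case BTPARALLEL_NOT_INITIALIZED:
            /* some backend seizes the scan, or ends it */
            return to == BTPARALLEL_ADVANCING || to == BTPARALLEL_DONE;
        case BTPARALLEL_NEED_PRIMSCAN:
            /* only a backend calling from _bt_first may seize the scan */
            return to == BTPARALLEL_ADVANCING;
        case BTPARALLEL_ADVANCING:
            /* the seizing backend releases the scan, or ends it */
            return to == BTPARALLEL_IDLE || to == BTPARALLEL_DONE;
        case BTPARALLEL_IDLE:
            return to == BTPARALLEL_ADVANCING ||
                to == BTPARALLEL_NEED_PRIMSCAN ||
                to == BTPARALLEL_DONE;
        case BTPARALLEL_DONE:
            return false;       /* terminal */
    }
    return false;               /* keep compiler quiet */
}
#endif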

/*
 * BTParallelScanDescData contains btree-specific shared information required
 * for a parallel scan.
 */
typedef struct BTParallelScanDescData
{
    BlockNumber btps_nextScanPage;  /* next page to be scanned */
    BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
                                     * btps_nextScanPage */
    BTPS_State  btps_pageStatus;    /* indicates whether the next page is
                                     * available for scan.  See above for the
                                     * possible states of a parallel scan. */
    LWLock      btps_lock;      /* protects shared parallel state */
    ConditionVariable btps_cv;  /* used to synchronize parallel scan */

    /*
     * btps_arrElems is used when scans need to schedule another primitive
     * index scan with one or more SAOP arrays.  Holds BTArrayKeyInfo.cur_elem
     * offsets for each = scan key associated with a ScalarArrayOp array.
     */
    int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];

    /*
     * Additional space (at the end of the struct) is used when scans need to
     * schedule another primitive index scan with one or more skip arrays.
     * Holds a flattened datum representation for each = scan key associated
     * with a skip array.
     */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;
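
/*
 * Minimal layout sketch (never compiled): with numArrayKeys array keys, slots
 * btps_arrElems[0..numArrayKeys - 1] hold SAOP cur_elem offsets, and any
 * flattened skip array datums live in the space that begins immediately
 * afterwards -- exactly the address arithmetic that
 * _bt_parallel_serialize_arrays and _bt_parallel_restore_arrays use below.
 */
#ifdef NOT_USED
static char *
btps_datum_area(BTParallelScanDesc btscan, int numArrayKeys)
{
    return (char *) &btscan->btps_arrElems[numArrayKeys];
}
#endif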

static bool _bt_start_prim_scan(IndexScanDesc scan);
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
                                          BTScanOpaque so);
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
                                        BTScanOpaque so);
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                         IndexBulkDeleteCallback callback, void *callback_state,
                         BTCycleId cycleid);
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
                                          IndexTuple posting,
                                          OffsetNumber updatedoffset,
                                          int *nremaining);

/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
    static const IndexAmRoutine amroutine = {
        .type = T_IndexAmRoutine,
        .amstrategies = BTMaxStrategyNumber,
        .amsupport = BTNProcs,
        .amoptsprocnum = BTOPTIONS_PROC,
        .amcanorder = true,
        .amcanorderbyop = false,
        .amcanhash = false,
        .amconsistentequality = true,
        .amconsistentordering = true,
        .amcanbackward = true,
        .amcanunique = true,
        .amcanmulticol = true,
        .amoptionalkey = true,
        .amsearcharray = true,
        .amsearchnulls = true,
        .amstorage = false,
        .amclusterable = true,
        .ampredlocks = true,
        .amcanparallel = true,
        .amcanbuildparallel = true,
        .amcaninclude = true,
        .amusemaintenanceworkmem = false,
        .amsummarizing = false,
        .amparallelvacuumoptions =
            VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
        .amkeytype = InvalidOid,

        .ambuild = btbuild,
        .ambuildempty = btbuildempty,
        .aminsert = btinsert,
        .aminsertcleanup = NULL,
        .ambulkdelete = btbulkdelete,
        .amvacuumcleanup = btvacuumcleanup,
        .amcanreturn = btcanreturn,
        .amcostestimate = btcostestimate,
        .amgettreeheight = btgettreeheight,
        .amoptions = btoptions,
        .amproperty = btproperty,
        .ambuildphasename = btbuildphasename,
        .amvalidate = btvalidate,
        .amadjustmembers = btadjustmembers,
        .ambeginscan = btbeginscan,
        .amrescan = btrescan,
        .amgettuple = btgettuple,
        .amgetbitmap = btgetbitmap,
        .amendscan = btendscan,
        .ammarkpos = btmarkpos,
        .amrestrpos = btrestrpos,
        .amestimateparallelscan = btestimateparallelscan,
        .aminitparallelscan = btinitparallelscan,
        .amparallelrescan = btparallelrescan,
        .amtranslatestrategy = bttranslatestrategy,
        .amtranslatecmptype = bttranslatecmptype,
    };

    PG_RETURN_POINTER(&amroutine);
}

/*
 * btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
    bool        allequalimage = _bt_allequalimage(index, false);
    BulkWriteState *bulkstate;
    BulkWriteBuffer metabuf;

    bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

    /* Construct metapage. */
    metabuf = smgr_bulk_get_buf(bulkstate);
    _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
    smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

    smgr_bulk_finish(bulkstate);
}

/*
 * btinsert() -- insert an index tuple into a btree.
 *
 *      Descend the tree recursively, find the appropriate location for our
 *      new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         bool indexUnchanged,
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);

    return result;
}

/*
 * btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    Assert(scan->heapRelation != NULL);

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later. (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times. It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
                if (so->numKilled < MaxTIDsPerBTreePage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan));

    return res;
}

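/*
 * Hypothetical usage sketch (never compiled): the call order in which the
 * amgettuple interface above is driven.  Real callers go through the
 * indexam.c wrappers (index_beginscan and friends), which also supply the
 * snapshot and heap relation that btgettuple's assertion expects; this only
 * illustrates how the routines defined in this file fit together.
 */
#ifdef NOT_USED
static int64
bt_count_matches_sketch(Relation rel, ScanKey keys, int nkeys)
{
    IndexScanDesc scan = btbeginscan(rel, nkeys, 0);
    int64       ntuples = 0;

    btrescan(scan, keys, nkeys, NULL, 0);
    while (btgettuple(scan, ForwardScanDirection))
        ntuples++;
    btendscan(scan);

    return ntuples;
}
#endif
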
/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    Assert(scan->heapRelation == NULL);

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_heaptid;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan));

    return ntids;
}

/*
 * btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = palloc_object(BTScanOpaqueData);
    BTScanPosInvalidate(so->currPos);
    BTScanPosInvalidate(so->markPos);
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->skipScan = false;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    so->arrayKeys = NULL;
    so->orderProcs = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    return scan;
}

/*
 * btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         ScanKey orderbys, int norderbys)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
        BTScanPosInvalidate(so->currPos);
    }

    /*
     * We prefer to eagerly drop leaf page pins before btgettuple returns.
     * This avoids making VACUUM wait to acquire a cleanup lock on the page.
     *
     * We cannot safely drop leaf page pins during index-only scans due to a
     * race condition involving VACUUM setting pages all-visible in the VM.
     * It's also unsafe for plain index scans that use a non-MVCC snapshot.
     *
     * Also opt out of dropping leaf page pins eagerly during bitmap scans.
     * Pins cannot be held for more than an instant during bitmap scans either
     * way, so we might as well avoid wasting cycles on acquiring page LSNs.
     *
     * See nbtree/README section on making concurrent TID recycling safe.
     *
     * Note: so->dropPin should never change across rescans.
     */
    so->dropPin = (!scan->xs_want_itup &&
                   IsMVCCLikeSnapshot(scan->xs_snapshot) &&
                   scan->heapRelation != NULL);

    so->markItemIndex = -1;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    BTScanPosUnpinIfPinned(so->markPos);
    BTScanPosInvalidate(so->markPos);

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys
     */
    if (scankey && scan->numberOfKeys > 0)
        memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
    so->numArrayKeys = 0;       /* ditto */
}

/*
 * btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
    }

    so->markItemIndex = -1;
    BTScanPosUnpinIfPinned(so->markPos);

    /* No need to invalidate positions, the RAM is about to be freed. */

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeys and so->orderProcs are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);
}

/*
 * btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* There may be an old mark with a pin (but no lock). */
    BTScanPosUnpinIfPinned(so->markPos);

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
    {
        BTScanPosInvalidate(so->markPos);
        so->markItemIndex = -1;
    }
}

/*
 * btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The scan has never moved to a new page since the last mark.  Just
         * restore the itemIndex.
         *
         * NB: In this case we can't count on anything in so->markPos to be
         * accurate.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /*
         * The scan moved to a new page after last mark or restore, and we are
         * now restoring to the marked page.  We aren't holding any read
         * locks, but if we're still holding the pin for the current position,
         * we must drop it.
         */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0)
                _bt_killitems(scan);
            BTScanPosUnpinIfPinned(so->currPos);
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            if (BTScanPosIsPinned(so->markPos))
                IncrBufferRefCount(so->markPos.buf);
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
            /* Reset the scan's array keys (see _bt_steppage for why) */
            if (so->numArrayKeys)
            {
                _bt_start_array_keys(scan, so->currPos.dir);
                so->needPrimScan = false;
            }
        }
        else
            BTScanPosInvalidate(so->currPos);
    }
}
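
/*
 * Hypothetical sketch (never compiled): the mark/restore protocol that a
 * caller such as a merge join relies on.  After btmarkpos, any number of
 * btgettuple calls can be undone by btrestrpos, which returns the scan to
 * the marked item.
 */
#ifdef NOT_USED
static void
bt_mark_restore_sketch(IndexScanDesc scan, ScanDirection dir)
{
    if (btgettuple(scan, dir))
    {
        btmarkpos(scan);        /* remember the item just returned */
        (void) btgettuple(scan, dir);   /* move past it... */
        btrestrpos(scan);       /* ...and come back to the mark */
    }
}
#endif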

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(Relation rel, int nkeys, int norderbys)
{
    int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    Size        estnbtreeshared,
                genericattrspace;

    /*
     * Pessimistically assume that every input scan key will be output with
     * its own SAOP array
     */
    estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
        sizeof(int) * nkeys;

    /* Single column indexes cannot possibly use a skip array */
    if (nkeyatts == 1)
        return estnbtreeshared;

    /*
     * Pessimistically assume that all attributes prior to the least
     * significant attribute require a skip array (and an associated key)
     */
    genericattrspace = datumEstimateSpace((Datum) 0, false, true,
                                          sizeof(Datum));
    for (int attnum = 1; attnum < nkeyatts; attnum++)
    {
        CompactAttribute *attr;

        /*
         * We make the conservative assumption that every index column will
         * also require a skip array.
         *
         * Every skip array must have space to store its scan key's sk_flags.
         * We also need space for each skip array's unused btps_arrElems slot
         * (we need to be able to subscript btps_arrElems using a simple
         * so->arrayKeys[]-wise offset for any subsequent SAOP arrays).
         */
        estnbtreeshared = add_size(estnbtreeshared, sizeof(int) * 2);

        /* Consider space required to store a datum of opclass input type */
        attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
        if (attr->attbyval)
        {
            /* This index attribute stores pass-by-value datums */
            Size        estfixed = datumEstimateSpace((Datum) 0, false,
                                                      true, attr->attlen);

            estnbtreeshared = add_size(estnbtreeshared, estfixed);
            continue;
        }

        /*
         * This index attribute stores pass-by-reference datums.
         *
         * Assume that serializing this array will use just as much space as a
         * pass-by-value datum, in addition to space for the largest possible
         * whole index tuple (this is not just a per-datum portion of the
         * largest possible tuple because that'd be almost as large anyway).
         *
         * This is quite conservative, but it's not clear how we could do much
         * better.  The executor requires an up-front storage request size
         * that reliably covers the scan's high watermark memory usage.  We
         * can't be sure of the real high watermark until the scan is over.
         */
        estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
        estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
    }

    return estnbtreeshared;
}
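
/*
 * Worked example (sketch, never compiled): for a single-column index scanned
 * with two input keys, the skip array loop above is never reached, so the
 * estimate is just the flexible array member's offset plus one btps_arrElems
 * slot per input key.
 */
#ifdef NOT_USED
static Size
btestimate_sketch(void)
{
    /* corresponds to nkeyatts == 1, nkeys == 2 */
    return offsetof(BTParallelScanDescData, btps_arrElems) +
        sizeof(int) * 2;
}
#endif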

/*
 * _bt_start_prim_scan() -- start scheduled primitive index scan?
 *
 * Returns true if _bt_checkkeys scheduled another primitive index scan, just
 * as the last one ended.  Otherwise returns false, indicating that the array
 * keys are now fully exhausted.
 *
 * Only call here during scans with one or more equality type array scan keys,
 * after _bt_first or _bt_next return false.
 */
static bool
_bt_start_prim_scan(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    Assert(so->numArrayKeys);

    so->scanBehind = so->oppositeDirCheck = false;  /* reset */

    /*
     * Array keys are advanced within _bt_checkkeys when the scan reaches the
     * leaf level (more precisely, they're advanced when the scan reaches the
     * end of each distinct set of array elements).  This process avoids
     * repeat access to leaf pages (across multiple primitive index scans) by
     * advancing the scan's array keys when it allows the primitive index scan
     * to find nearby matching tuples (or when it eliminates ranges of array
     * key space that can't possibly be satisfied by any index tuple).
     *
     * _bt_checkkeys sets a simple flag variable to schedule another primitive
     * index scan.  The flag tells us what to do.
     *
     * We cannot rely on _bt_first always reaching _bt_checkkeys.  There are
     * various cases where that won't happen.  For example, if the index is
     * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
     * We also don't expect a call to _bt_checkkeys during searches for a
     * non-existent value that happens to be lower/higher than any existing
     * value in the index.
     *
     * We don't require special handling for these cases -- we don't need to
     * be explicitly instructed to _not_ perform another primitive index scan.
     * It's up to code under the control of _bt_first to always set the flag
     * when another primitive index scan will be required.
     *
     * This works correctly, even with the tricky cases listed above, which
     * all involve access to leaf pages "near the boundaries of the key space"
     * (whether it's from a leftmost/rightmost page, or an imaginary empty
     * leaf root page).  If _bt_checkkeys cannot be reached by a primitive
     * index scan for one set of array keys, then it also won't be reached for
     * any later set ("later" in terms of the direction that we scan the index
     * and advance the arrays).  The array keys won't have advanced in these
     * cases, but that's the correct behavior (even _bt_advance_array_keys
     * won't always advance the arrays at the point they become "exhausted").
     */
    if (so->needPrimScan)
    {
        /*
         * Flag was set -- must call _bt_first again, which will reset the
         * scan's needPrimScan flag
         */
        return true;
    }

    /* The top-level index scan ran out of tuples in this scan direction */
    if (scan->parallel_scan != NULL)
        _bt_parallel_done(scan);

    return false;
}

/*
 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
 *
 * Caller must have exclusively locked btscan->btps_lock when called.
 */
static void
_bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
                              BTScanOpaque so)
{
    char       *datumshared;

    /* Space for serialized datums begins immediately after btps_arrElems[] */
    datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
    for (int i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *array = &so->arrayKeys[i];
        ScanKey     skey = &so->keyData[array->scan_key];

        if (array->num_elems != -1)
        {
            /* Save SAOP array's cur_elem (no need to copy key/datum) */
            Assert(!(skey->sk_flags & SK_BT_SKIP));
            btscan->btps_arrElems[i] = array->cur_elem;
            continue;
        }

        /* Save all mutable state associated with skip array's key */
        Assert(skey->sk_flags & SK_BT_SKIP);
        memcpy(datumshared, &skey->sk_flags, sizeof(int));
        datumshared += sizeof(int);

        if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
        {
            /* No sk_argument datum to serialize */
            Assert(skey->sk_argument == 0);
            continue;
        }

        datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
                       array->attbyval, array->attlen, &datumshared);
    }
}

/*
 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
 *
 * Caller must have exclusively locked btscan->btps_lock when called.
 */
static void
_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
                            BTScanOpaque so)
{
    char       *datumshared;

    /* Space for serialized datums begins immediately after btps_arrElems[] */
    datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
    for (int i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *array = &so->arrayKeys[i];
        ScanKey     skey = &so->keyData[array->scan_key];
        bool        isnull;

        if (array->num_elems != -1)
        {
            /* Restore SAOP array using its saved cur_elem */
            Assert(!(skey->sk_flags & SK_BT_SKIP));
            array->cur_elem = btscan->btps_arrElems[i];
            skey->sk_argument = array->elem_values[array->cur_elem];
            continue;
        }

        /* Restore skip array by restoring its key directly */
        if (!array->attbyval && skey->sk_argument)
            pfree(DatumGetPointer(skey->sk_argument));
        skey->sk_argument = (Datum) 0;
        memcpy(&skey->sk_flags, datumshared, sizeof(int));
        datumshared += sizeof(int);

        Assert(skey->sk_flags & SK_BT_SKIP);

        if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
        {
            /* No sk_argument datum to restore */
            continue;
        }

        skey->sk_argument = datumRestore(&datumshared, &isnull);
        if (isnull)
        {
            Assert(skey->sk_argument == 0);
            Assert(skey->sk_flags & SK_SEARCHNULL);
            Assert(skey->sk_flags & SK_ISNULL);
        }
    }
}
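
/*
 * Minimal sketch (never compiled) of the datumSerialize/datumRestore round
 * trip used above, shown with a pass-by-value 4-byte datum; buffer sizing
 * comes from datumEstimateSpace, just as btestimateparallelscan's sizing
 * logic assumes.
 */
#ifdef NOT_USED
static Datum
bt_datum_roundtrip_sketch(Datum value)
{
    Size        need = datumEstimateSpace(value, false, true, sizeof(int32));
    char       *buf = palloc(need);
    char       *cursor = buf;
    bool        isnull;
    Datum       restored;

    /* flatten the datum into the buffer, advancing the write cursor */
    datumSerialize(value, false, true, sizeof(int32), &cursor);
    Assert((Size) (cursor - buf) <= need);

    /* rewind and reconstruct the datum from the flattened form */
    cursor = buf;
    restored = datumRestore(&cursor, &isnull);
    Assert(!isnull);
    pfree(buf);

    return restored;
}
#endif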

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
    BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

    LWLockInitialize(&bt_target->btps_lock,
                     LWTRANCHE_PARALLEL_BTREE_SCAN);
    bt_target->btps_nextScanPage = InvalidBlockNumber;
    bt_target->btps_lastCurrPage = InvalidBlockNumber;
    bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    ConditionVariableInit(&bt_target->btps_cv);
}

/*
 * btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
    BTParallelScanDesc btscan;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

    Assert(parallel_scan);

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    /*
     * In theory, we don't need to acquire the LWLock here, because there
     * shouldn't be any other workers running at this point, but we do so for
     * consistency.
     */
    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = InvalidBlockNumber;
    btscan->btps_lastCurrPage = InvalidBlockNumber;
    btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    LWLockRelease(&btscan->btps_lock);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *      page.  Other scans must wait until we call _bt_parallel_release()
 *      or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
                   BlockNumber *last_curr_page, bool first)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        exit_loop = false,
                status = true,
                endscan = false;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    *next_scan_page = InvalidBlockNumber;
    *last_curr_page = InvalidBlockNumber;

    /*
     * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     * call to _bt_readnextpage treats this backend similarly to a serial
     * backend that steps from *last_curr_page to *next_scan_page (unless this
     * backend's so->currPos is initialized by _bt_readfirstpage before then).
     */
    BTScanPosInvalidate(so->currPos);
    so->currPos.moreLeft = so->currPos.moreRight = true;

    if (first)
    {
        /*
         * Initialize array related state when called from _bt_first, assuming
         * that this will be the first primitive index scan for the scan
         */
        so->needPrimScan = false;
        so->scanBehind = false;
        so->oppositeDirCheck = false;
    }
    else
    {
        /*
         * Don't attempt to seize the scan when it requires another primitive
         * index scan, since caller's backend cannot start it right now
         */
        if (so->needPrimScan)
            return false;
    }

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    while (1)
    {
        LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);

        if (btscan->btps_pageStatus == BTPARALLEL_DONE)
        {
            /* We're done with this parallel index scan */
            status = false;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
                 btscan->btps_nextScanPage == P_NONE)
        {
            /* End this parallel index scan */
            status = false;
            endscan = true;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
        {
            Assert(so->numArrayKeys);

            if (first)
            {
                /* Can start scheduled primitive scan right away, so do so */
                btscan->btps_pageStatus = BTPARALLEL_ADVANCING;

                /* Restore scan's array keys from serialized values */
                _bt_parallel_restore_arrays(rel, btscan, so);
                exit_loop = true;
            }
            else
            {
                /*
                 * Don't attempt to seize the scan when it requires another
                 * primitive index scan, since caller's backend cannot start
                 * it right now
                 */
                status = false;
            }

            /*
             * Either way, update backend local state to indicate that a
             * pending primitive scan is required
             */
            so->needPrimScan = true;
            so->scanBehind = false;
            so->oppositeDirCheck = false;
        }
        else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
        {
            /*
             * We have successfully seized control of the scan for the purpose
             * of advancing it to a new page!
             */
            btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
            Assert(btscan->btps_nextScanPage != P_NONE);
            *next_scan_page = btscan->btps_nextScanPage;
            *last_curr_page = btscan->btps_lastCurrPage;
            exit_loop = true;
        }
        LWLockRelease(&btscan->btps_lock);
        if (exit_loop || !status)
            break;
        ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    }
    ConditionVariableCancelSleep();

    /* When the scan has reached the rightmost (or leftmost) page, end it */
    if (endscan)
        _bt_parallel_done(scan);

    return status;
}

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *      new page.  We now have the new value btps_nextScanPage; another backend
 *      can now begin advancing the scan.
 *
 * Callers whose scan uses array keys must save their curr_page argument so
 * that it can be passed to _bt_parallel_primscan_schedule, should caller
 * determine that another primitive index scan is required.
 *
 * If caller's next_scan_page is P_NONE, the scan has reached the index's
 * rightmost/leftmost page.  This is treated as reaching the end of the scan
 * within _bt_parallel_seize.
 *
 * Note: unlike the serial case, parallel scans don't need to remember both
 * sibling links.  next_scan_page is whichever link is next given the scan's
 * direction.  That's all we'll ever need, since the direction of a parallel
 * scan can never change.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
                     BlockNumber curr_page)
{
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(BlockNumberIsValid(next_scan_page));

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = next_scan_page;
    btscan->btps_lastCurrPage = curr_page;
    btscan->btps_pageStatus = BTPARALLEL_IDLE;
    LWLockRelease(&btscan->btps_lock);
    ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;
    bool        status_changed = false;

    Assert(!BTScanPosIsValid(so->currPos));

    /* Do nothing, for non-parallel scans */
    if (parallel_scan == NULL)
        return;

    /*
     * Should not mark parallel scan done when there's still a pending
     * primitive index scan
     */
    if (so->needPrimScan)
        return;

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    /*
     * Mark the parallel scan as done, unless some other process did so
     * already
     */
    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
    if (btscan->btps_pageStatus != BTPARALLEL_DONE)
    {
        btscan->btps_pageStatus = BTPARALLEL_DONE;
        status_changed = true;
    }
    LWLockRelease(&btscan->btps_lock);

    /* wake up all the workers associated with this parallel scan */
    if (status_changed)
        ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
 *
 * Caller passes the curr_page most recently passed to _bt_parallel_release
 * by its backend.  Caller successfully schedules the next primitive index scan
 * if the shared parallel state hasn't been seized since caller's backend last
 * advanced the scan.
 */
void
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(so->numArrayKeys);

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    if (btscan->btps_lastCurrPage == curr_page &&
        btscan->btps_pageStatus == BTPARALLEL_IDLE)
    {
        btscan->btps_nextScanPage = InvalidBlockNumber;
        btscan->btps_lastCurrPage = InvalidBlockNumber;
        btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;

        /* Serialize scan's current array keys */
        _bt_parallel_serialize_arrays(rel, btscan, so);
    }
    LWLockRelease(&btscan->btps_lock);
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = palloc0_object(IndexBulkDeleteResult);

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    BlockNumber num_delpages;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * If btbulkdelete was called, we need not do anything (we just maintain
     * the information used within _bt_vacuum_needs_cleanup() by calling
     * _bt_set_cleanup_info() below).
     *
     * If btbulkdelete was _not_ called, then we have a choice to make: we
     * must decide whether or not a btvacuumscan() call is needed now (i.e.
     * whether the ongoing VACUUM operation can entirely avoid a physical scan
     * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
     * now.
     */
    if (stats == NULL)
    {
        /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
        if (!_bt_vacuum_needs_cleanup(info->index))
            return NULL;

        /*
         * Since we aren't going to actually delete any leaf items, there's no
         * need to go through all the vacuum-cycle-ID pushups here.
         *
         * Posting list tuples are a source of inaccuracy for cleanup-only
         * scans.  btvacuumscan() will assume that the number of index tuples
         * from each page can be used as num_index_tuples, even though
         * num_index_tuples is supposed to represent the number of TIDs in the
         * index.  This naive approach can underestimate the number of tuples
         * in the index significantly.
         *
         * We handle the problem by making num_index_tuples an estimate in
         * cleanup-only case.
         */
        stats = palloc0_object(IndexBulkDeleteResult);
        btvacuumscan(info, stats, NULL, NULL, 0);
        stats->estimated_count = true;
    }

    /*
     * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
     *
     * num_delpages is the number of deleted pages now in the index that were
     * not safe to place in the FSM to be recycled just yet.  num_delpages is
     * greater than 0 only when _bt_pagedel() actually deleted pages during
     * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
     * have failed to place any newly deleted pages in the FSM just moments
     * ago.  (Actually, there are edge cases where recycling of the current
     * VACUUM's newly deleted pages does not even become safe by the time the
     * next VACUUM comes around.  See nbtree/README.)
     */
    Assert(stats->pages_deleted >= stats->pages_free);
    num_delpages = stats->pages_deleted - stats->pages_free;
    _bt_set_cleanup_info(info->index, num_delpages);

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    return stats;
}
1226 : :
1227 : : /*
1228 : : * btvacuumscan --- scan the index for VACUUMing purposes
1229 : : *
1230 : : * This combines the functions of looking for leaf tuples that are deletable
1231 : : * according to the vacuum callback, looking for empty pages that can be
1232 : : * deleted, and looking for old deleted pages that can be recycled. Both
1233 : : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1234 : : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1235 : : *
1236 : : * The caller is responsible for initially allocating/zeroing a stats struct
1237 : : * and for obtaining a vacuum cycle ID if necessary.
1238 : : */
1239 : : static void
7302 1240 : 2065 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1241 : : IndexBulkDeleteCallback callback, void *callback_state,
1242 : : BTCycleId cycleid)
1243 : : {
1244 : 2065 : Relation rel = info->index;
1245 : : BTVacState vstate;
1246 : : BlockNumber num_pages;
1247 : : bool needLock;
1248 : : BlockRangeReadStreamPrivate p;
410 melanieplageman@gmai 1249 : 2065 : ReadStream *stream = NULL;
1250 : :
1251 : : /*
1252 : : * Reset fields that track information about the entire index now. This
1253 : : * avoids double-counting in the case where a single VACUUM command
1254 : : * requires multiple scans of the index.
1255 : : *
1256 : : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1257 : : * since they track information about the VACUUM command, and so must last
1258 : : * across each call to btvacuumscan().
1259 : : *
1260 : : * (Note that pages_free is treated as state about the whole index, not
1261 : : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1262 : : * calls are idempotent, and get repeated for the same deleted pages in
1263 : : * some scenarios. The point for us is to track the number of recyclable
1264 : : * pages in the index at the end of the VACUUM command.)
1265 : : */
1896 pg@bowt.ie 1266 : 2065 : stats->num_pages = 0;
7302 tgl@sss.pgh.pa.us 1267 : 2065 : stats->num_index_tuples = 0;
1268 : 2065 : stats->pages_deleted = 0;
1896 pg@bowt.ie 1269 : 2065 : stats->pages_free = 0;
1270 : :
1271 : : /* Set up info to pass down to btvacuumpage */
7302 tgl@sss.pgh.pa.us 1272 : 2065 : vstate.info = info;
1273 : 2065 : vstate.stats = stats;
1274 : 2065 : vstate.callback = callback;
1275 : 2065 : vstate.callback_state = callback_state;
1276 : 2065 : vstate.cycleid = cycleid;
1277 : :
1278 : : /* Create a temporary memory context to run _bt_pagedel in */
1279 : 2065 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1280 : : "_bt_pagedel",
1281 : : ALLOCSET_DEFAULT_SIZES);
1282 : :
1283 : : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1871 pg@bowt.ie 1284 : 2065 : vstate.bufsize = 0;
1285 : 2065 : vstate.maxbufsize = 0;
1286 : 2065 : vstate.pendingpages = NULL;
1287 : 2065 : vstate.npendingpages = 0;
1288 : : /* Consider applying _bt_pendingfsm_finalize optimization */
1289 : 2065 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1290 : :
1291 : : /*
1292 : : * The outer loop iterates over all index pages except the metapage, in
1293 : : * physical order (we hope the kernel will cooperate in providing
1294 : : * read-ahead for speed). It is critical that we visit all leaf pages,
1295 : : * including ones added after we start the scan, else we might fail to
1296 : : * delete some deletable tuples. Hence, we must repeatedly check the
1297 : : * relation length. We must acquire the relation-extension lock while
1298 : : * doing so to avoid a race condition: if someone else is extending the
1299 : : * relation, there is a window where bufmgr/smgr have created a new
1300 : : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1301 : : * we manage to scan such a page here, we'll improperly assume it can be
1302 : : * recycled. Taking the lock synchronizes things enough to prevent a
1303 : : * problem: either num_pages won't include the new page, or _bt_getbuf
1304 : : * already has write lock on the buffer and it will be fully initialized
1305 : : * before we can examine it. Also, we need not worry if a page is added
1306 : : * immediately after we look; the page splitting code already has
1307 : : * write-lock on the left page before it adds a right page, so we must
1308 : : * already have processed any tuples due to be moved into such a page.
1309 : : *
1310 : : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1311 : : * think the use of the extension lock is still required.
1312 : : *
1313 : : * We can skip locking for new or temp relations, however, since no one
1314 : : * else could be accessing them.
1315 : : */
7302 tgl@sss.pgh.pa.us 1316 [ + + + - ]: 2065 : needLock = !RELATION_IS_LOCAL(rel);
1317 : :
410 melanieplageman@gmai 1318 : 2065 : p.current_blocknum = BTREE_METAPAGE + 1;
1319 : :
1320 : : /*
1321 : : * It is safe to use batchmode as block_range_read_stream_cb takes no
1322 : : * locks.
1323 : : */
372 1324 : 2065 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1325 : : READ_STREAM_FULL |
1326 : : READ_STREAM_USE_BATCHING,
1327 : : info->strategy,
1328 : : rel,
1329 : : MAIN_FORKNUM,
1330 : : block_range_read_stream_cb,
1331 : : &p,
1332 : : 0);
1333 : : for (;;)
1334 : : {
1335 : : /* Get the current relation length */
7302 tgl@sss.pgh.pa.us 1336 [ + + ]: 3941 : if (needLock)
1337 : 3939 : LockRelationForExtension(rel, ExclusiveLock);
1338 : 3941 : num_pages = RelationGetNumberOfBlocks(rel);
1339 [ + + ]: 3941 : if (needLock)
1340 : 3939 : UnlockRelationForExtension(rel, ExclusiveLock);
1341 : :
2590 alvherre@alvh.no-ip. 1342 [ + + ]: 3941 : if (info->report_progress)
1343 : 538 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1344 : : num_pages);
1345 : :
1346 : : /* Quit if we've scanned the whole relation */
410 melanieplageman@gmai 1347 [ + + ]: 3941 : if (p.current_blocknum >= num_pages)
7302 tgl@sss.pgh.pa.us 1348 : 2063 : break;
1349 : :
410 melanieplageman@gmai 1350 : 1878 : p.last_exclusive = num_pages;
1351 : :
1352 : : /* Iterate over pages, then loop back to recheck relation length */
1353 : : while (true)
7387 tgl@sss.pgh.pa.us 1354 : 22153 : {
1355 : : BlockNumber current_block;
1356 : : Buffer buf;
1357 : :
1358 : : /* call vacuum_delay_point while not holding any buffer lock */
410 melanieplageman@gmai 1359 : 24031 : vacuum_delay_point(false);
1360 : :
1361 : 24030 : buf = read_stream_next_buffer(stream, NULL);
1362 : :
1363 [ + + ]: 24030 : if (!BufferIsValid(buf))
1364 : 1876 : break;
1365 : :
1366 : 22154 : current_block = btvacuumpage(&vstate, buf);
1367 : :
2590 alvherre@alvh.no-ip. 1368 [ + + ]: 22153 : if (info->report_progress)
1369 : 511 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1370 : : current_block);
1371 : : }
1372 : :
1373 : : /*
1374 : : * We have to reset the read stream to use it again. After returning
1375 : : * InvalidBuffer, the read stream API won't invoke our callback again
1376 : : * until the stream has been reset.
1377 : : */
410 melanieplageman@gmai 1378 : 1876 : read_stream_reset(stream);
1379 : : }
1380 : :
1381 : 2063 : read_stream_end(stream);
1382 : :
1383 : : /* Set statistics num_pages field to final size of index */
1896 pg@bowt.ie 1384 : 2063 : stats->num_pages = num_pages;
1385 : :
7302 tgl@sss.pgh.pa.us 1386 : 2063 : MemoryContextDelete(vstate.pagedelcontext);
1387 : :
1388 : : /*
 1389 : : * If there were any calls to _bt_pagedel() during the scan of the index,
 1390 : : * then see if any of the resulting pages can be placed in the FSM now.
 1391 : : * When it's not safe we'll have to leave it up to a future VACUUM operation.
1392 : : *
1393 : : * Finally, if we placed any pages in the FSM (either just now or during
1394 : : * the scan), forcibly update the upper-level FSM pages to ensure that
1395 : : * searchers can find them.
1396 : : */
1871 pg@bowt.ie 1397 : 2063 : _bt_pendingfsm_finalize(rel, &vstate);
1896 1398 [ + + ]: 2063 : if (stats->pages_free > 0)
2958 tgl@sss.pgh.pa.us 1399 : 68 : IndexFreeSpaceMapVacuum(rel);
7302 1400 : 2063 : }
1401 : :
1402 : : /*
1403 : : * btvacuumpage --- VACUUM one page
1404 : : *
1405 : : * This processes a single page for btvacuumscan(). In some cases we must
 1406 : : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1407 : : * a previous call here. This is how we handle page splits (that happened
1408 : : * after our cycleid was acquired) whose right half page happened to reuse
1409 : : * a block that we might have processed at some point before it was
1410 : : * recycled (i.e. before the page split).
1411 : : *
1412 : : * Returns BlockNumber of a scanned page (not backtracked).
1413 : : */
1414 : : static BlockNumber
410 melanieplageman@gmai 1415 : 22154 : btvacuumpage(BTVacState *vstate, Buffer buf)
1416 : : {
7302 tgl@sss.pgh.pa.us 1417 : 22154 : IndexVacuumInfo *info = vstate->info;
1418 : 22154 : IndexBulkDeleteResult *stats = vstate->stats;
1419 : 22154 : IndexBulkDeleteCallback callback = vstate->callback;
1420 : 22154 : void *callback_state = vstate->callback_state;
1421 : 22154 : Relation rel = info->index;
1128 pg@bowt.ie 1422 : 22154 : Relation heaprel = info->heaprel;
1423 : : bool attempt_pagedel;
1424 : : BlockNumber blkno,
1425 : : backtrack_to;
410 melanieplageman@gmai 1426 : 22154 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1427 : : Page page;
1428 : : BTPageOpaque opaque;
1429 : :
2194 pg@bowt.ie 1430 : 22154 : blkno = scanblkno;
1431 : :
1432 : 22154 : backtrack:
1433 : :
1434 : 22154 : attempt_pagedel = false;
1435 : 22154 : backtrack_to = P_NONE;
1436 : :
2114 1437 : 22154 : _bt_lockbuf(rel, buf, BT_READ);
3667 kgrittn@postgresql.o 1438 : 22154 : page = BufferGetPage(buf);
2194 pg@bowt.ie 1439 : 22154 : opaque = NULL;
7302 tgl@sss.pgh.pa.us 1440 [ + - ]: 22154 : if (!PageIsNew(page))
1441 : : {
1442 : 22154 : _bt_checkpage(rel, buf);
1495 michael@paquier.xyz 1443 : 22154 : opaque = BTPageGetOpaque(page);
1444 : : }
1445 : :
2194 pg@bowt.ie 1446 [ - + ]: 22154 : Assert(blkno <= scanblkno);
1447 [ - + ]: 22154 : if (blkno != scanblkno)
1448 : : {
1449 : : /*
1450 : : * We're backtracking.
1451 : : *
1452 : : * We followed a right link to a sibling leaf page (a page that
1453 : : * happens to be from a block located before scanblkno). The only
1454 : : * case we want to do anything with is a live leaf page having the
1455 : : * current vacuum cycle ID.
1456 : : *
1457 : : * The page had better be in a state that's consistent with what we
1458 : : * expect. Check for conditions that imply corruption in passing. It
1459 : : * can't be half-dead because only an interrupted VACUUM process can
1460 : : * leave pages in that state, so we'd definitely have dealt with it
1461 : : * back when the page was the scanblkno page (half-dead pages are
1462 : : * always marked fully deleted by _bt_pagedel(), barring corruption).
1463 : : */
2194 pg@bowt.ie 1464 [ # # # # :UBC 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
# # ]
1465 : : {
1466 : 0 : Assert(false);
1467 : : ereport(LOG,
1468 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1469 : : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1470 : : blkno, scanblkno, RelationGetRelationName(rel))));
1471 : : _bt_relbuf(rel, buf);
1472 : : return scanblkno;
1473 : : }
1474 : :
1475 : : /*
1476 : : * We may have already processed the page in an earlier call, when the
1477 : : * page was scanblkno. This happens when the leaf page split occurred
1478 : : * after the scan began, but before the right sibling page became the
1479 : : * scanblkno.
1480 : : *
1481 : : * Page may also have been deleted by current btvacuumpage() call,
1482 : : * since _bt_pagedel() sometimes deletes the right sibling page of
1483 : : * scanblkno in passing (it does so after we decided where to
1484 : : * backtrack to). We don't need to process this page as a deleted
1485 : : * page a second time now (in fact, it would be wrong to count it as a
1486 : : * deleted page in the bulk delete statistics a second time).
1487 : : */
1488 [ # # # # ]: 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1489 : : {
1490 : : /* Done with current scanblkno (and all lower split pages) */
7302 tgl@sss.pgh.pa.us 1491 : 0 : _bt_relbuf(rel, buf);
410 melanieplageman@gmai 1492 : 0 : return scanblkno;
1493 : : }
1494 : : }
1495 : :
1128 pg@bowt.ie 1496 [ + - + + ]:CBC 22154 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1497 : : {
1498 : : /* Okay to recycle this page (which could be leaf or internal) */
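        /*
         * (BTPageIsRecyclable treats never-initialized pages as recyclable,
         * and deleted pages only once the page's safexid shows that no
         * possible in-flight index scan could still land on them.)
         */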
6426 heikki.linnakangas@i 1499 : 838 : RecordFreeIndexPage(rel, blkno);
7302 tgl@sss.pgh.pa.us 1500 : 838 : stats->pages_deleted++;
1896 pg@bowt.ie 1501 : 838 : stats->pages_free++;
1502 : : }
7302 tgl@sss.pgh.pa.us 1503 [ + + ]: 21316 : else if (P_ISDELETED(opaque))
1504 : : {
1505 : : /*
1506 : : * Already deleted page (which could be leaf or internal). Can't
1507 : : * recycle yet.
1508 : : */
1509 : 124 : stats->pages_deleted++;
1510 : : }
7125 1511 [ + + ]: 21192 : else if (P_ISHALFDEAD(opaque))
1512 : : {
1513 : : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1895 pg@bowt.ie 1514 :GBC 1 : attempt_pagedel = true;
1515 : :
1516 : : /*
1517 : : * _bt_pagedel() will increment both pages_newly_deleted and
1518 : : * pages_deleted stats in all cases (barring corruption)
1519 : : */
1520 : : }
7302 tgl@sss.pgh.pa.us 1521 [ + + ]:CBC 21191 : else if (P_ISLEAF(opaque))
1522 : : {
1523 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
1524 : : int ndeletable;
1525 : : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1526 : : int nupdatable;
1527 : : OffsetNumber offnum,
1528 : : minoff,
1529 : : maxoff;
1530 : : int nhtidsdead,
1531 : : nhtidslive;
1532 : :
1533 : : /*
1534 : : * Trade in the initial read lock for a full cleanup lock on this
1535 : : * page. We must get such a lock on every leaf page over the course
1536 : : * of the vacuum scan, whether or not it actually contains any
1537 : : * deletable tuples --- see nbtree/README.
1538 : : */
2114 pg@bowt.ie 1539 : 20059 : _bt_upgradelockbufcleanup(rel, buf);
1540 : :
1541 : : /*
1542 : : * Check whether we need to backtrack to earlier pages. What we are
1543 : : * concerned about is a page split that happened since we started the
1544 : : * vacuum scan. If the split moved tuples on the right half of the
1545 : : * split (i.e. the tuples that sort high) to a block that we already
1546 : : * passed over, then we might have missed the tuples. We need to
1547 : : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1548 : : * or deleting scanblkno page below!)
1549 : : */
7302 tgl@sss.pgh.pa.us 1550 [ + + ]: 20059 : if (vstate->cycleid != 0 &&
1551 [ + + ]: 19945 : opaque->btpo_cycleid == vstate->cycleid &&
1552 [ + - ]: 2 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1553 [ + + ]: 2 : !P_RIGHTMOST(opaque) &&
2194 pg@bowt.ie 1554 [ - + ]: 1 : opaque->btpo_next < scanblkno)
2194 pg@bowt.ie 1555 :UBC 0 : backtrack_to = opaque->btpo_next;
1556 : :
7302 tgl@sss.pgh.pa.us 1557 :CBC 20059 : ndeletable = 0;
2260 pg@bowt.ie 1558 : 20059 : nupdatable = 0;
7302 tgl@sss.pgh.pa.us 1559 [ + + ]: 20059 : minoff = P_FIRSTDATAKEY(opaque);
1560 : 20059 : maxoff = PageGetMaxOffsetNumber(page);
2260 pg@bowt.ie 1561 : 20059 : nhtidsdead = 0;
1562 : 20059 : nhtidslive = 0;
7302 tgl@sss.pgh.pa.us 1563 [ + + ]: 20059 : if (callback)
1564 : : {
1565 : : /* btbulkdelete callback tells us what to delete (or update) */
1566 : 19945 : for (offnum = minoff;
1567 [ + + ]: 3761921 : offnum <= maxoff;
1568 : 3741976 : offnum = OffsetNumberNext(offnum))
1569 : : {
1570 : : IndexTuple itup;
1571 : :
1572 : 3741976 : itup = (IndexTuple) PageGetItem(page,
1573 : 3741976 : PageGetItemId(page, offnum));
1574 : :
2260 pg@bowt.ie 1575 [ - + ]: 3741976 : Assert(!BTreeTupleIsPivot(itup));
1576 [ + + ]: 3741976 : if (!BTreeTupleIsPosting(itup))
1577 : : {
1578 : : /* Regular tuple, standard table TID representation */
1579 [ + + ]: 3579340 : if (callback(&itup->t_tid, callback_state))
1580 : : {
1581 : 1324116 : deletable[ndeletable++] = offnum;
1582 : 1324116 : nhtidsdead++;
1583 : : }
1584 : : else
1585 : 2255224 : nhtidslive++;
1586 : : }
1587 : : else
1588 : : {
1589 : : BTVacuumPosting vacposting;
1590 : : int nremaining;
1591 : :
1592 : : /* Posting list tuple */
1593 : 162636 : vacposting = btreevacuumposting(vstate, itup, offnum,
1594 : : &nremaining);
1595 [ + + ]: 162636 : if (vacposting == NULL)
1596 : : {
1597 : : /*
1598 : : * All table TIDs from the posting tuple remain, so no
1599 : : * delete or update required
1600 : : */
1601 [ - + ]: 50598 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1602 : : }
1603 [ + + ]: 112038 : else if (nremaining > 0)
1604 : : {
1606 : : /*
1607 : : * Store metadata about posting list tuple in
1608 : : * updatable array for entire page. Existing tuple
1609 : : * will be updated during the later call to
1610 : : * _bt_delitems_vacuum().
1611 : : */
1612 [ - + ]: 62003 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1613 : 62003 : updatable[nupdatable++] = vacposting;
1614 : 62003 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1615 : : }
1616 : : else
1617 : : {
1618 : : /*
1619 : : * All table TIDs from the posting list must be
1620 : : * deleted. We'll delete the index tuple completely
1621 : : * (no update required).
1622 : : */
1623 [ - + ]: 50035 : Assert(nremaining == 0);
1624 : 50035 : deletable[ndeletable++] = offnum;
1625 : 50035 : nhtidsdead += BTreeTupleGetNPosting(itup);
1626 : 50035 : pfree(vacposting);
1627 : : }
1628 : :
1629 : 162636 : nhtidslive += nremaining;
1630 : : }
1631 : : }
1632 : : }
1633 : :
1634 : : /*
1635 : : * Apply any needed deletes or updates. We issue just one
1636 : : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1637 : : */
1638 [ + + + + ]: 20059 : if (ndeletable > 0 || nupdatable > 0)
1639 : : {
2194 1640 [ - + ]: 11042 : Assert(nhtidsdead >= ndeletable + nupdatable);
2260 1641 : 11042 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1642 : : nupdatable);
1643 : :
1644 : 11042 : stats->tuples_removed += nhtidsdead;
1645 : : /* must recompute maxoff */
7302 tgl@sss.pgh.pa.us 1646 : 11042 : maxoff = PageGetMaxOffsetNumber(page);
1647 : :
1648 : : /* can't leak memory here */
2260 pg@bowt.ie 1649 [ + + ]: 73045 : for (int i = 0; i < nupdatable; i++)
1650 : 62003 : pfree(updatable[i]);
1651 : : }
1652 : : else
1653 : : {
1654 : : /*
1655 : : * If the leaf page has been split during this vacuum cycle, it
1656 : : * seems worth expending a write to clear btpo_cycleid even if we
1657 : : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1658 : : * takes care of this.) This ensures we won't process the page
1659 : : * again.
1660 : : *
1661 : : * We treat this like a hint-bit update because there's no need to
1662 : : * WAL-log it.
1663 : : */
1664 [ - + ]: 9017 : Assert(nhtidsdead == 0);
7302 tgl@sss.pgh.pa.us 1665 [ + + ]: 9017 : if (vstate->cycleid != 0 &&
1666 [ + + ]: 8903 : opaque->btpo_cycleid == vstate->cycleid)
1667 : : {
1668 : 2 : opaque->btpo_cycleid = 0;
4705 jdavis@postgresql.or 1669 : 2 : MarkBufferDirtyHint(buf, true);
1670 : : }
1671 : : }
1672 : :
1673 : : /*
1674 : : * If the leaf page is now empty, try to delete it; else count the
1675 : : * live tuples (live table TIDs in posting lists are counted as
1676 : : * separate live tuples). We don't delete when backtracking, though,
1677 : : * since that would require teaching _bt_pagedel() about backtracking
1678 : : * (doesn't seem worth adding more complexity to deal with that).
1679 : : *
1680 : : * We don't count the number of live TIDs during cleanup-only calls to
1681 : : * btvacuumscan (i.e. when callback is not set). We count the number
1682 : : * of index tuples directly instead. This avoids the expense of
1683 : : * directly examining all of the tuples on each page. VACUUM will
1684 : : * treat num_index_tuples as an estimate in cleanup-only case, so it
1685 : : * doesn't matter that this underestimates num_index_tuples
1686 : : * significantly in some cases.
1687 : : */
7302 tgl@sss.pgh.pa.us 1688 [ + + ]: 20059 : if (minoff > maxoff)
2194 pg@bowt.ie 1689 : 3784 : attempt_pagedel = (blkno == scanblkno);
2008 1690 [ + + ]: 16275 : else if (callback)
2260 1691 : 16165 : stats->num_index_tuples += nhtidslive;
1692 : : else
2008 1693 : 110 : stats->num_index_tuples += maxoff - minoff + 1;
1694 : :
2194 1695 [ + + - + ]: 20059 : Assert(!attempt_pagedel || nhtidslive == 0);
1696 : : }
1697 : :
1698 [ + + ]: 22154 : if (attempt_pagedel)
1699 : : {
1700 : : MemoryContext oldcontext;
1701 : :
1702 : : /* Run pagedel in a temp context to avoid memory leakage */
7302 tgl@sss.pgh.pa.us 1703 : 3785 : MemoryContextReset(vstate->pagedelcontext);
1704 : 3785 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1705 : :
1706 : : /*
1707 : : * _bt_pagedel maintains the bulk delete stats on our behalf;
1708 : : * pages_newly_deleted and pages_deleted are likely to be incremented
1709 : : * during call
1710 : : */
2194 pg@bowt.ie 1711 [ - + ]: 3785 : Assert(blkno == scanblkno);
1895 1712 : 3785 : _bt_pagedel(rel, buf, vstate);
1713 : :
7302 tgl@sss.pgh.pa.us 1714 : 3784 : MemoryContextSwitchTo(oldcontext);
 1715 : : /* pagedel already released the buffer, so we must not release it again */
1716 : : }
1717 : : else
1718 : 18369 : _bt_relbuf(rel, buf);
1719 : :
2194 pg@bowt.ie 1720 [ - + ]: 22153 : if (backtrack_to != P_NONE)
1721 : : {
2194 pg@bowt.ie 1722 :UBC 0 : blkno = backtrack_to;
1723 : :
1724 : : /* check for vacuum delay while not holding any buffer lock */
410 melanieplageman@gmai 1725 : 0 : vacuum_delay_point(false);
1726 : :
1727 : : /*
1728 : : * We can't use _bt_getbuf() here because it always applies
1729 : : * _bt_checkpage(), which will barf on an all-zero page. We want to
1730 : : * recycle all-zero pages, not fail. Also, we want to use a
1731 : : * nondefault buffer access strategy.
1732 : : */
1733 : 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1734 : : info->strategy);
2194 pg@bowt.ie 1735 : 0 : goto backtrack;
1736 : : }
1737 : :
410 melanieplageman@gmai 1738 :CBC 22153 : return scanblkno;
1739 : : }
1740 : :
1741 : : /*
1742 : : * btreevacuumposting --- determine TIDs still needed in posting list
1743 : : *
1744 : : * Returns metadata describing how to build replacement tuple without the TIDs
1745 : : * that VACUUM needs to delete. Returned value is NULL in the common case
1746 : : * where no changes are needed to caller's posting list tuple (we avoid
1747 : : * allocating memory here as an optimization).
1748 : : *
1749 : : * The number of TIDs that should remain in the posting list tuple is set for
1750 : : * caller in *nremaining.
1751 : : */
1752 : : static BTVacuumPosting
2260 pg@bowt.ie 1753 : 162636 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1754 : : OffsetNumber updatedoffset, int *nremaining)
1755 : : {
1756 : 162636 : int live = 0;
1757 : 162636 : int nitem = BTreeTupleGetNPosting(posting);
1758 : 162636 : ItemPointer items = BTreeTupleGetPosting(posting);
1759 : 162636 : BTVacuumPosting vacposting = NULL;
1760 : :
1761 [ + + ]: 839833 : for (int i = 0; i < nitem; i++)
1762 : : {
1763 [ + + ]: 677197 : if (!vstate->callback(items + i, vstate->callback_state))
1764 : : {
1765 : : /* Live table TID */
1766 : 357537 : live++;
1767 : : }
1768 [ + + ]: 319660 : else if (vacposting == NULL)
1769 : : {
1770 : : /*
1771 : : * First dead table TID encountered.
1772 : : *
1773 : : * It's now clear that we need to delete one or more dead table
1774 : : * TIDs, so start maintaining metadata describing how to update
1775 : : * existing posting list tuple.
1776 : : */
1777 : 112038 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1778 : : nitem * sizeof(uint16));
1779 : :
1780 : 112038 : vacposting->itup = posting;
1781 : 112038 : vacposting->updatedoffset = updatedoffset;
1782 : 112038 : vacposting->ndeletedtids = 0;
1783 : 112038 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1784 : : }
1785 : : else
1786 : : {
1787 : : /* Second or subsequent dead table TID */
1788 : 207622 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1789 : : }
1790 : : }
1791 : :
1792 : 162636 : *nremaining = live;
1793 : 162636 : return vacposting;
1794 : : }
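/*
 * Worked example: given a posting list tuple with 4 TIDs where the callback
 * reports the TIDs at positions 1 and 3 dead, the loop above palloc's a
 * BTVacuumPosting on the first dead TID and fills it in, leaving
 * ndeletedtids = 2, deletetids[] = {1, 3}, and *nremaining = 2.
 * _bt_delitems_vacuum later uses this metadata to construct the replacement
 * posting list tuple.  When every TID is live we return NULL without
 * allocating anything.
 */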
1795 : :
1796 : : /*
1797 : : * btcanreturn() -- Check whether btree indexes support index-only scans.
1798 : : *
1799 : : * btrees always do, so this is trivial.
1800 : : */
1801 : : bool
3761 tgl@sss.pgh.pa.us 1802 : 865177 : btcanreturn(Relation index, int attno)
1803 : : {
1804 : 865177 : return true;
1805 : : }
1806 : :
1807 : : /*
1808 : : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1809 : : */
1810 : : int
602 peter@eisentraut.org 1811 : 562897 : btgettreeheight(Relation rel)
1812 : : {
1813 : 562897 : return _bt_getrootheight(rel);
1814 : : }
1815 : :
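/*
 * bttranslatestrategy() -- Map a btree strategy number to a CompareType.
 *
 * The opfamily argument is not consulted: btree strategy numbers have fixed
 * meanings, so no per-opfamily lookup is needed.
 */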
1816 : : CompareType
438 peter@eisentraut.org 1817 :UBC 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1818 : : {
457 1819 [ # # # # : 0 : switch (strategy)
# # ]
1820 : : {
1821 : 0 : case BTLessStrategyNumber:
1822 : 0 : return COMPARE_LT;
1823 : 0 : case BTLessEqualStrategyNumber:
1824 : 0 : return COMPARE_LE;
1825 : 0 : case BTEqualStrategyNumber:
1826 : 0 : return COMPARE_EQ;
1827 : 0 : case BTGreaterEqualStrategyNumber:
1828 : 0 : return COMPARE_GE;
1829 : 0 : case BTGreaterStrategyNumber:
1830 : 0 : return COMPARE_GT;
1831 : 0 : default:
1832 : 0 : return COMPARE_INVALID;
1833 : : }
1834 : : }
1835 : :
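/*
 * bttranslatecmptype() -- Map a CompareType to a btree strategy number.
 *
 * This is the inverse of bttranslatestrategy().  Comparison types that
 * btree has no strategy for (e.g. COMPARE_NE) map to InvalidStrategy.
 */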
1836 : : StrategyNumber
438 1837 : 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1838 : : {
457 1839 [ # # # # : 0 : switch (cmptype)
# # ]
1840 : : {
1841 : 0 : case COMPARE_LT:
1842 : 0 : return BTLessStrategyNumber;
1843 : 0 : case COMPARE_LE:
1844 : 0 : return BTLessEqualStrategyNumber;
1845 : 0 : case COMPARE_EQ:
1846 : 0 : return BTEqualStrategyNumber;
1847 : 0 : case COMPARE_GE:
1848 : 0 : return BTGreaterEqualStrategyNumber;
1849 : 0 : case COMPARE_GT:
1850 : 0 : return BTGreaterStrategyNumber;
1851 : 0 : default:
1852 : 0 : return InvalidStrategy;
1853 : : }
1854 : : }